In [1]:
import sys
import pandas as pd
import numpy as np
import sklearn

In [2]:
import matplotlib.pyplot as plt
from matplotlib import style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so prefer the new name when the
# installed version provides it and fall back to the legacy one otherwise.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')

In [3]:
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.base import clone
from sklearn.preprocessing import scale

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [4]:
import warnings
# Installs a blanket 'ignore' filter: every warning category is silenced for
# all cells executed after this one.
# NOTE(review): this also hides deprecation/future warnings from pandas and
# matplotlib — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

import IPython
from IPython import display

In [9]:
# Forward slashes resolve on Windows as well as on POSIX systems; the previous
# backslash-separated literals only pointed into the data directory on Windows.
raw_train_data = pd.read_csv('data/train.csv')
raw_test_data = pd.read_csv('data/test.csv')

In [None]:
def _fill_missing_with_class_mean(frame, column):
    """Fill missing ``column`` values in place with the truncated mean of the row's ``Pclass``."""
    class_means = frame.groupby('Pclass')[column].mean()
    for pclass, class_mean in class_means.items():
        # A class whose values are all missing has a NaN mean; int(NaN) would
        # raise, so leave those rows untouched instead.
        if pd.isna(class_mean):
            continue
        needs_fill = frame[column].isnull() & (frame['Pclass'] == pclass)
        # int() truncates the mean to a whole number, as the fill value always did.
        frame.loc[needs_fill, column] = int(class_mean)


def ownprocess(original_DF):
    """Return a cleaned copy of a passenger frame; the input is not modified.

    Steps applied to the copy:

    * ``Age``: missing values are replaced by the truncated mean age of the
      passenger's ``Pclass``; the column is then cast to ``int``.
    * ``Embarked``: missing values are replaced by the most frequent value
      (left as-is when the column has no known value at all).
    * ``Fare``: missing values are replaced by the truncated mean fare of the
      passenger's ``Pclass``.
    * ``FamilySize``: new column, ``SibSp + Parch``.
    * ``Cabin``: replaced by a 0/1 indicator of whether a cabin was recorded.

    Class means are computed for whichever ``Pclass`` values occur in the
    frame, so a frame that lacks one of the classes no longer fails.

    Parameters
    ----------
    original_DF : pandas.DataFrame
        Must provide the columns ``Pclass``, ``Age``, ``Embarked``, ``Fare``,
        ``SibSp``, ``Parch`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        The processed copy.

    Raises
    ------
    ValueError
        If an ``Age`` is still missing after imputation (its class has no
        known ages), because the column cannot then be cast to ``int``.
    """
    frame = original_DF.copy()

    _fill_missing_with_class_mean(frame, 'Age')
    frame['Age'] = frame['Age'].astype(int)

    # Assign the filled column back: `frame[col].fillna(..., inplace=True)`
    # acts on a column selection and does not update the frame when pandas
    # copy-on-write is active.
    embarked_mode = frame['Embarked'].mode()
    if not embarked_mode.empty:
        frame['Embarked'] = frame['Embarked'].fillna(embarked_mode.iloc[0])

    _fill_missing_with_class_mean(frame, 'Fare')

    frame['FamilySize'] = frame['SibSp'] + frame['Parch']

    frame['Cabin'] = frame['Cabin'].notnull().astype(int)

    return frame

In [86]:
def process(original_DF, min_title_count=10):
    """Return a cleaned, feature-enriched copy of a passenger frame.

    The input frame is not modified. Steps applied to the copy:

    * COMPLETE — fill missing ``Age`` and ``Fare`` with the column median and
      missing ``Embarked`` with the most frequent value; add ``FamilySize``
      (``SibSp + Parch``) and ``isAlone`` (1 when ``FamilySize`` is 0, else 0).
    * CREATE — add ``FareBin`` and ``AgeBin`` (quartile bins from
      ``pd.qcut``) and ``Title`` (the text between the first ``', '`` and the
      following ``'.'`` in ``Name``). Titles that occur fewer than
      ``min_title_count`` times in this frame are replaced by ``'Misc'``.
    * Drop ``PassengerId``, ``Cabin`` and ``Ticket``.

    NOTE(review): medians, quartile edges and title frequencies are all
    computed from the frame passed in, so calling this separately on two
    frames yields frame-specific fill values, bins and rare-title sets.

    Parameters
    ----------
    original_DF : pandas.DataFrame
        Must provide ``Age``, ``Embarked``, ``Fare``, ``SibSp``, ``Parch``,
        ``Name``, ``PassengerId``, ``Cabin`` and ``Ticket``.
    min_title_count : int, default 10
        Minimum number of occurrences a title needs to be kept as-is.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    frame = original_DF.copy()

    # COMPLETE
    # Assign filled columns back: `frame[col].fillna(..., inplace=True)` acts
    # on a column selection and does not update the frame when pandas
    # copy-on-write is active.
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())
    embarked_mode = frame['Embarked'].mode()
    if not embarked_mode.empty:
        frame['Embarked'] = frame['Embarked'].fillna(embarked_mode.iloc[0])
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].median())
    frame['FamilySize'] = frame['SibSp'] + frame['Parch']
    frame['isAlone'] = (frame['FamilySize'] == 0).astype(int)

    # CREATE
    frame['FareBin'] = pd.qcut(frame['Fare'], 4)
    frame['AgeBin'] = pd.qcut(frame['Age'], 4)

    titles = frame['Name'].str.split(', ').str[1].str.split('.').str[0]
    title_counts = titles.value_counts()
    rare_titles = title_counts[title_counts < min_title_count].index
    # The relabelled series must be stored; previously the result of the
    # relabelling was computed and then thrown away, so no title ever
    # became 'Misc'.
    frame['Title'] = titles.mask(titles.isin(rare_titles), 'Misc')

    return frame.drop(columns=['PassengerId', 'Cabin', 'Ticket'])

In [87]:
# Run the same preprocessing over each raw split; each call works on its own
# copy, so the raw frames stay untouched.
train_data = process(original_DF=raw_train_data)
test_data = process(original_DF=raw_test_data)

In [88]:
# Correlate the numeric columns only. Since pandas 2.0 `DataFrame.corr()` no
# longer drops non-numeric columns by default and raises on the string columns
# (Name, Sex, ...), so select the numeric ones explicitly.
train_data.select_dtypes(include='number').corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,isAlone
Survived,1.0,-0.338481,-0.06491,-0.035322,0.081629,0.257307,0.016639,-0.203367
Pclass,-0.338481,1.0,-0.339898,0.083081,0.018443,-0.5495,0.065997,0.135207
Age,-0.06491,-0.339898,1.0,-0.233296,-0.172482,0.096688,-0.245619,0.171647
SibSp,-0.035322,0.083081,-0.233296,1.0,0.414838,0.159651,0.890712,-0.584471
Parch,0.081629,0.018443,-0.172482,0.414838,1.0,0.216225,0.783111,-0.583398
Fare,0.257307,-0.5495,0.096688,0.159651,0.216225,1.0,0.217138,-0.271832
FamilySize,0.016639,0.065997,-0.245619,0.890712,0.783111,0.217138,1.0,-0.690922
isAlone,-0.203367,0.135207,0.171647,-0.584471,-0.583398,-0.271832,-0.690922,1.0


In [85]:
# Print column names, non-null counts and dtypes of the processed training frame.
# NOTE(review): the saved output below carries execution count In [85] and
# lists no FamilySize column, so it predates the current process() definition
# (In [86]); re-run this cell to refresh it.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    int64   
 1   Pclass    891 non-null    int64   
 2   Name      891 non-null    object  
 3   Sex       891 non-null    object  
 4   Age       891 non-null    float64 
 5   SibSp     891 non-null    int64   
 6   Parch     891 non-null    int64   
 7   Fare      891 non-null    float64 
 8   Embarked  891 non-null    object  
 9   isAlone   891 non-null    int32   
 10  FareBin   891 non-null    category
 11  AgeBin    891 non-null    category
 12  Title     891 non-null    object  
dtypes: category(2), float64(2), int32(1), int64(4), object(4)
memory usage: 75.4+ KB


In [89]:
# Survival outcome (rows) counted per family size (columns).
pd.crosstab(index=train_data['Survived'], columns=train_data['FamilySize'])

FamilySize,0,1,2,3,4,5,6,7,10
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,374,72,43,8,12,19,8,6,7
1,163,89,59,21,3,3,4,0,0


In [91]:
# Survival outcome (rows) counted per SibSp value (columns).
pd.crosstab(index=train_data['Survived'], columns=train_data['SibSp'])

SibSp,0,1,2,3,4,5,8
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,398,97,15,12,15,5,7
1,210,112,13,4,3,0,0


In [92]:
# Survival outcome (rows) counted per Parch value (columns).
pd.crosstab(index=train_data['Survived'], columns=train_data['Parch'])

Parch,0,1,2,3,4,5,6
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,445,53,40,2,4,4,1
1,233,65,40,3,0,1,0
