In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
import string
import warnings
warnings.filterwarnings('ignore')

SEED = 42

In [2]:
def concat_df(train,test):
    """Stack ``train`` on top of ``test`` and return the combined frame.

    Columns are sorted alphabetically (``sort=True``) and the result gets a
    fresh 0..n-1 index, so rows of ``train`` come first, followed by ``test``.
    """
    stacked = pd.concat([train, test], sort=True)
    return stacked.reset_index(drop=True)

def divide_df(all_data, n_train=891):
    """Split a combined frame back into its training and test parts.

    Parameters
    ----------
    all_data : DataFrame
        Combined frame whose index labels run 0..n-1 (as produced by
        ``concat_df``) and which contains a ``Survived`` column.
    n_train : int, default 891
        Number of leading rows that belong to the training set. The default
        keeps the original hard-coded 0..890 / 891.. boundary.

    Returns
    -------
    (DataFrame, DataFrame)
        The training rows (labels ``0 .. n_train - 1``) and the remaining
        rows with the ``Survived`` column removed.
    """
    # .loc slicing is label-based and inclusive on both ends, hence the -1.
    train_part = all_data.loc[:n_train - 1]
    test_part = all_data.loc[n_train:].drop(['Survived'], axis=1)
    return train_part, test_part

In [3]:
# Read the training and test CSVs from ./data, build the combined frame with
# concat_df, and keep the two original frames in a list for per-set reporting.
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
df_all = concat_df(df_train,df_test)
dfs = [df_train,df_test]

# Tag each frame with a human-readable label; the missing-value report further
# down prints `df.name` for every frame in `dfs`.
# NOTE(review): `name` is an ad-hoc attribute set on the DataFrame objects;
# presumably it does not carry over to frames derived from these - confirm
# before relying on it elsewhere.
df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set' 

In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_train.info()
# Seed the sample so that re-running the notebook shows the same three rows.
df_train.sample(3, random_state=SEED)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
386,387,0,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S
336,337,0,1,"Pears, Mr. Thomas Clinton",male,29.0,1,0,113776,66.6,C2,S
674,675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0.0,,S


In [5]:
def display_missing(df):
    """Print, for every column of ``df``, how many of its values are null.

    One line per column is written in the form
    ``<column> column missing value <count>``, followed by a blank separator.
    """
    missing_per_column = df.isnull().sum()
    for col, n_missing in missing_per_column.items():
        print('{} column missing value {}'.format(col, n_missing))
    print('\n')
    
# Print the label of each set followed by its per-column missing-value counts.
for df in dfs:
    print(df.name)
    display_missing(df)

Training Set
PassengerId column missing value 0
Survived column missing value 0
Pclass column missing value 0
Name column missing value 0
Sex column missing value 0
Age column missing value 177
SibSp column missing value 0
Parch column missing value 0
Ticket column missing value 0
Fare column missing value 0
Cabin column missing value 687
Embarked column missing value 2


Test Set
PassengerId column missing value 0
Pclass column missing value 0
Name column missing value 0
Sex column missing value 0
Age column missing value 86
SibSp column missing value 0
Parch column missing value 0
Ticket column missing value 0
Fare column missing value 1
Cabin column missing value 327
Embarked column missing value 0




In [6]:
# Absolute pairwise correlations between the numeric columns, flattened to one
# row per (Feature 1, Feature 2) pair and sorted from strongest to weakest.
# The numeric columns are selected explicitly: df_all also holds text columns
# (Name, Sex, Ticket, ...), and newer pandas versions raise on corr() instead
# of silently skipping them.
df_all_corr = (
    df_all.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
    .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation Coefficient'})
)
# Show only the pairs that involve Age.
df_all_corr[df_all_corr['Feature 1'] == 'Age']

Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
6,Age,Age,1.0
9,Age,Pclass,0.408106
17,Age,SibSp,0.243699
22,Age,Fare,0.17874
25,Age,Parch,0.150917
29,Age,Survived,0.077221
41,Age,PassengerId,0.028814


In [8]:
# Median Age for every (Sex, Pclass) group. 'Age' is selected before
# aggregating so the median is computed for that one column only; taking the
# median of every column first fails on the text columns in newer pandas.
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass'])['Age'].median()
age_by_pclass_sex

Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
        3         25.0
Name: Age, dtype: float64

In [11]:
# Fill each missing Age with the median Age of the passenger's (Sex, Pclass)
# group. transform() returns the group medians aligned to df_all's own index,
# so the result can be assigned back directly; groupby.apply may return a
# group-keyed index instead, which does not align with df_all.
df_all['Age'] = df_all['Age'].fillna(
    df_all.groupby(['Sex', 'Pclass'])['Age'].transform('median')
)

In [16]:
# Show the rows that have no Embarked value.
# NOTE(review): the saved output is empty and this cell's execution count (16)
# is higher than that of the fill cell below (15), so it was presumably run
# after the fill - run it before the fill to see the affected rows.
df_all.loc[df_all['Embarked'].isna()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket


In [15]:
# Replace every missing Embarked value with 'S'.
# NOTE(review): the reason for choosing 'S' is not shown in this notebook -
# TODO document it.
df_all.loc[df_all['Embarked'].isna(), 'Embarked'] = 'S'

In [17]:
# Show the rows that have no Fare value.
df_all.loc[df_all['Fare'].isna()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
1043,60.5,,S,,"Storey, Mr. Thomas",0,1044,3,male,0,,3701


In [24]:
# Median fare of the (Pclass=3, Parch=0, SibSp=0) group, which is the group of
# the one passenger whose Fare is missing; use it to fill that gap.
med_fare = (
    df_all.groupby(['Pclass', 'Parch', 'SibSp'])['Fare']
    .median()
    .loc[(3, 0, 0)]
)
df_all['Fare'] = df_all['Fare'].fillna(med_fare)

In [25]:
# Deck is the first character of the Cabin string; passengers without a Cabin
# value get the placeholder 'M'. The vectorised .str accessor replaces the
# per-row Python lambda and yields NaN (then 'M') for a missing or empty
# cabin instead of raising.
df_all['Deck'] = df_all['Cabin'].str[0].fillna('M')

In [31]:
# Number of passengers per (Deck, Pclass) pair, as a single 'Count' column.
# Only 'Name' is counted: listing every other column in a drop() call breaks
# (or leaves stray columns behind) as soon as df_all gains or loses a column.
df_all_deck = (
    df_all.groupby(['Deck', 'Pclass'])[['Name']]
    .count()
    .rename(columns={'Name': 'Count'})
)

In [34]:
# Flip the table so the (Deck, Pclass) pairs become the columns.
# NOTE(review): this rebinds the same name to its own transpose, so running
# the cell a second time flips the table back.
df_all_deck = df_all_deck.T

In [35]:
def get_pclass_dist(df):
    """Collect, per deck, the passenger count for each of Pclass 1, 2 and 3.

    ``df`` is expected to have two-level columns whose first level holds the
    deck letters (``df.columns.levels[0]``) and whose second level holds the
    passenger class. Deck/class combinations that are absent are recorded as 0.

    NOTE(review): as visible here the function stops after initialising
    ``deck_percentages`` and returns nothing; the remainder of the body
    appears to be missing from this export.
    """
    # One inner dict per known deck letter; filled as {pclass: count} below.
    deck_counts =  {'A': {}, 'B': {}, 'C': {},'D': {},'E':{},'F':{},'G':{},'M':{},'T':{}}
    decks = df.columns.levels[0]
    
    for deck in decks:
        for pclass in range(1,4):
            try:
                # NOTE(review): the trailing [0] takes the first entry of the
                # selected column. If the row labels are not integers this
                # relies on positional fallback; `.iloc[0]` would be explicit.
                # Where that fallback is unavailable the lookup would
                # presumably raise KeyError and be recorded as 0 by the
                # handler below - confirm.
                count=df[deck][pclass][0]
                deck_counts[deck][pclass] = count
            except KeyError:
                # This deck has no passengers of this class.
                deck_counts[deck][pclass] = 0
    # Rows are the passenger classes, columns are the decks.
    df_decks = pd.DataFrame(deck_counts)
    deck_percentages = {}
    

Deck,A,B,C,D,D,E,E,E,F,F,G,M,M,M,T
Pclass,1,1,1,1,2,1,2,3,2,3,3,1,2,3,1
Count,22,65,94,40,6,34,4,3,13,8,5,67,254,693,1
