In [3]:
# --- Standard library ---
import string
import warnings

# --- Third-party: numerics, plotting, modelling ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# --- Notebook-wide configuration (kept after every import) ---
# Seaborn's dark-grid look for all figures drawn below.
sns.set(style="darkgrid")
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from pandas/sklearn; consider narrowing it.
warnings.filterwarnings('ignore')

# Single seed shared by every randomised step in the notebook.
SEED = 42

In [4]:
def concat_df(train, test):
    """Stack ``train`` on top of ``test`` and return the combined frame.

    Columns are aligned by name (``sort=True`` orders them when the two
    frames do not share the same columns), and the original row labels are
    discarded in favour of a fresh 0..n-1 index.
    """
    frames = [train, test]
    stacked = pd.concat(frames, sort=True)
    # drop=True: throw the old labels away rather than keeping them as a column.
    return stacked.reset_index(drop=True)

def divide_df(all_data, n_train=891):
    """Split a combined frame back into its training and test parts.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame whose first ``n_train`` rows are the training rows and whose
        remaining rows are the test rows (the layout ``concat_df`` produces).
        Must contain a ``'Survived'`` column.
    n_train : int, default 891
        Number of leading rows that belong to the training set. The default
        reproduces the previous hard-coded split at rows 0-890 / 891+.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` keeps every column; ``test`` has ``'Survived'`` removed.

    Raises
    ------
    KeyError
        If ``all_data`` has no ``'Survived'`` column.
    """
    # Positional slicing: identical to the old label slice ``.loc[:890]`` /
    # ``.loc[891:]`` on a 0..n-1 index, but does not depend on the labels.
    train_part = all_data.iloc[:n_train]
    test_part = all_data.iloc[n_train:].drop(['Survived'], axis=1)
    return train_part, test_part

In [25]:
# Training split, tagged with the label printed in per-frame reports.
df_train = pd.read_csv('./data/train.csv')
df_train.name = 'Training Set'

# Test split, tagged the same way.
df_test = pd.read_csv('./data/test.csv')
df_test.name = 'Test Set'

# Both splits stacked into one frame (train rows first, then test rows).
df_all = concat_df(df_train, df_test)
df_all.name = 'All Set'

# The two original splits, for cells that loop over them.
dfs = [df_train, df_test]

In [6]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df_train.info()
# Seeded with the notebook-wide SEED so the three preview rows are the same
# on every run.
df_train.sample(3, random_state=SEED)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
214,215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q
377,378,0,1,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C
374,375,0,3,"Palsson, Miss. Stina Viola",female,3.0,3,1,349909,21.075,,S


In [10]:
def display_missing(df):
    """Print one line per column of ``df`` giving its count of null entries.

    Output goes to stdout; nothing is returned. A blank gap is printed after
    the last column so consecutive reports are visually separated.
    """
    line_template = '{} column missing value {}'
    for column_name in df.columns.tolist():
        null_count = df[column_name].isnull().sum()
        print(line_template.format(column_name, null_count))
    # print('\n') emits the newline itself plus print's own line ending,
    # i.e. two line breaks in total.
    print('\n')
    
# Null-count report for each split, headed by the split's label.
# Note: the loop variable rebinds the module-level name `df`.
for df in dfs:
    print(df.name)
    display_missing(df)

Training Set
PassengerId column missing value 0
Survived column missing value 0
Pclass column missing value 0
Name column missing value 0
Sex column missing value 0
Age column missing value 177
SibSp column missing value 0
Parch column missing value 0
Ticket column missing value 0
Fare column missing value 0
Cabin column missing value 687
Embarked column missing value 2


Test Set
PassengerId column missing value 0
Pclass column missing value 0
Name column missing value 0
Sex column missing value 0
Age column missing value 86
SibSp column missing value 0
Parch column missing value 0
Ticket column missing value 0
Fare column missing value 1
Cabin column missing value 327
Embarked column missing value 0




In [27]:
# Long-format table of absolute pairwise correlations, strongest first.
# Numeric columns are selected explicitly: older pandas silently skipped the
# text columns (Name, Sex, Ticket, ...) inside corr(), while pandas >= 2.0
# raises on them. select_dtypes works on both.
df_all_corr = (
    df_all.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
    # After reset_index the two index levels and the unnamed value column
    # come out as 'level_0', 'level_1' and 0; give them readable names.
    .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation Coefficient'})
)
# Show only the pairs whose first feature is Age.
df_all_corr[df_all_corr['Feature 1'] == 'Age']

Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
6,Age,Age,1.0
9,Age,Pclass,0.408106
17,Age,SibSp,0.243699
22,Age,Fare,0.17874
25,Age,Parch,0.150917
29,Age,Survived,0.077221
41,Age,PassengerId,0.028814


Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
Age,1.0,0.17874,-0.150917,0.028814,-0.408106,-0.243699,-0.077221
Fare,0.17874,1.0,0.221539,0.031428,-0.558629,0.160238,0.257307
Parch,-0.150917,0.221539,1.0,0.008942,0.018322,0.373587,0.081629
PassengerId,0.028814,0.031428,0.008942,1.0,-0.038354,-0.055224,-0.005007
Pclass,-0.408106,-0.558629,0.018322,-0.038354,1.0,0.060832,-0.338481
SibSp,-0.243699,0.160238,0.373587,-0.055224,0.060832,1.0,-0.035322
Survived,-0.077221,0.257307,0.081629,-0.005007,-0.338481,-0.035322,1.0
