In [1]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.optimizers import *
from tensorflow.keras.models import load_model
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import zipfile
import csv
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE

In [2]:
!wget https://dl.dropboxusercontent.com/s/47n4lv9wpyrm61i/titanic_train.csv?dl=0 -O /content/train.csv
!wget https://dl.dropboxusercontent.com/s/z53jggpeiyaz62n/titanic_test.csv?dl=0 -O /content/test.csv

--2020-08-16 04:00:32--  https://dl.dropboxusercontent.com/s/47n4lv9wpyrm61i/titanic_train.csv?dl=0
Resolving dl.dropboxusercontent.com (dl.dropboxusercontent.com)... 162.125.3.15, 2620:100:6018:15::a27d:30f
Connecting to dl.dropboxusercontent.com (dl.dropboxusercontent.com)|162.125.3.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61194 (60K) [text/csv]
Saving to: ‘/content/train.csv’


2020-08-16 04:00:32 (3.06 MB/s) - ‘/content/train.csv’ saved [61194/61194]

--2020-08-16 04:00:33--  https://dl.dropboxusercontent.com/s/z53jggpeiyaz62n/titanic_test.csv?dl=0
Resolving dl.dropboxusercontent.com (dl.dropboxusercontent.com)... 162.125.3.15, 2620:100:6018:15::a27d:30f
Connecting to dl.dropboxusercontent.com (dl.dropboxusercontent.com)|162.125.3.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28629 (28K) [text/csv]
Saving to: ‘/content/test.csv’


2020-08-16 04:00:34 (2.66 MB/s) - ‘/content/test.csv’ saved [28629/28629]



In [3]:
df = pd.read_csv('train.csv')
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
#Get varience of the columns. You can remove features with low varience (as these features wont have much importance for learning)
df.var(axis=0)

PassengerId    66231.000000
Survived           0.236772
Pclass             0.699015
Age              211.019125
SibSp              1.216043
Parch              0.649728
Fare            2469.436846
dtype: float64

In [18]:
df = pd.get_dummies(df, columns=['Embarked','Sex'])
#Only get the numerical values
df = df.select_dtypes(exclude=object)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,0,3,22.0,1,0,7.25,0,0,1,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,0,1,0
2,3,1,3,26.0,0,0,7.925,0,0,1,1,0
3,4,1,1,35.0,1,0,53.1,0,0,1,1,0
4,5,0,3,35.0,0,0,8.05,0,0,1,0,1


In [19]:
target= 'Survived'
tmp = df.drop(target,axis=1)
features = df.columns

In [20]:
df['Age'].fillna(df['Age'].mean(),inplace=True)

In [21]:
x = df[features]
y = df[target]

##SelectKBest to get chi-squared scores##

In [23]:
#Only works with numerical values
selector = SelectKBest(chi2,k='all')
selector.fit(x,y)
#The larger the values, the stronger the relationship between the feature and the target
selector.scores_

array([3.31293407e+00, 5.49000000e+02, 3.08736994e+01, 2.46879258e+01,
       2.58186538e+00, 1.00974991e+01, 4.51831909e+03, 2.04644013e+01,
       1.08467891e-02, 5.98483982e+00, 1.70348127e+02, 9.27024470e+01])

##RFE to select the n most important features##

In [24]:
rf = RandomForestClassifier(n_estimators=100)
#You want to select the top 6 features only
recursive_selector = RFE(estimator=rf,n_features_to_select=6)
recursive_selector.fit(x,y)
print('Most important features are:')
for x in features[recursive_selector.support_]:
  print(x)

Most important features are:
Survived
Pclass
Age
Fare
Sex_female
Sex_male


##Feature engineering##

In [25]:
#See the correlation between the features
df.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658,-0.001205,-0.033606,0.022148,-0.042939,0.042939
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307,0.16824,0.00365,-0.15566,0.543351,-0.543351
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495,-0.243292,0.221009,0.08172,-0.1319,0.1319
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566,0.032024,-0.013855,-0.027121,-0.084153,0.084153
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651,-0.059528,-0.026354,0.070941,0.114631,-0.114631
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225,-0.011069,-0.081228,0.063036,0.245489,-0.245489
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0,0.269335,-0.117216,-0.166603,0.182333,-0.182333
Embarked_C,-0.001205,0.16824,-0.243292,0.032024,-0.059528,-0.011069,0.269335,1.0,-0.148258,-0.778359,0.082853,-0.082853
Embarked_Q,-0.033606,0.00365,0.221009,-0.013855,-0.026354,-0.081228,-0.117216,-0.148258,1.0,-0.496624,0.074115,-0.074115
Embarked_S,0.022148,-0.15566,0.08172,-0.027121,0.070941,0.063036,-0.166603,-0.778359,-0.496624,1.0,-0.125722,0.125722
