In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
print('Train set : ', df_train.shape,' Test set : ',df_test.shape)

Train set :  (891, 12)  Test set :  (418, 11)


In [4]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df_tr = df_train.drop(['PassengerId'],axis=1)
df_tst = df_test.drop(['PassengerId'],axis=1)

In [6]:
numerical_col = df_tr.describe().columns
numerical_col

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

##### Numerical Features: Age (Continuous), Fare (Continuous), SibSp (Discrete), Parch (Discrete)
##### Categorical Features: Survived, Sex, Embarked, Pclass
##### Alphanumeric Features: Ticket, Cabin

In [7]:
def missing_data(df):
    total_miss = pd.isnull(df).sum().sort_values(ascending=False)
    percent_miss = ((pd.isnull(df).sum() / pd.isnull(df).count())*100).sort_values(ascending=False)
    missing_values =  pd.concat([total_miss,percent_miss],axis = 1, keys = ['Total', 'Percent'])
    return missing_values

In [8]:
print('Train set :')
missing_data(df_tr)

Train set :


Unnamed: 0,Total,Percent
Cabin,687,77.104377
Age,177,19.86532
Embarked,2,0.224467
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


In [9]:
print('Test set :')
missing_data(df_tst)

Test set :


Unnamed: 0,Total,Percent
Cabin,327,78.229665
Age,86,20.574163
Fare,1,0.239234
Embarked,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


###### CABIN column needs to be dropped right away because 77% of data is missing.

In [10]:
df_tr.drop(['Cabin'],axis=1,inplace=True)
df_tst.drop(['Cabin'],axis=1,inplace=True)

In [11]:
df_tr.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


##### Before we fill the missing values in Age column, we see that we have suffixes in Name column with Mr., Mrs., Doc. etc... We could use that to interpret the age of a person.

In [12]:
df_tr['Embarked'].fillna(df_tr['Embarked'].mode().values[0],inplace=True)
df_tst['Fare'].fillna(df_tst['Fare'].mean(),inplace=True)

In [13]:
def title(df):
    df['Title'] = [i.split(",")[1].split(".")[0].strip() for i in df["Name"]]
    columns = df['Title'].unique()
    df['Title'].replace(['Ms','Mlle','Mme'],'Miss',inplace=True)
    df['Title'].replace(['Lady','the Countess','Dona'],'Mrs',inplace=True)
    df['Title'].replace(['Don','Jonkheer','Rev','Sir'],'Unknown',inplace=True)
    df['Title'].replace(['Capt','Col','Dr','Major'],'Mr',inplace=True)
    tile_cols = pd.crosstab(df['Title'],df['Sex'])
    return df.head()

In [14]:
# Ms, Miss, Mlle, Mme ------- Miss
# mrs, lady, countess ------- Mrs
# mr, capt, col, major, dr -- Mr
# don, jonkheer, rev, sir --- Unknown

In [15]:
title(df_tr)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr


In [16]:
title(df_tst)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,Mr
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,Mrs
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,Mr
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,Mr
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,Mrs


#### Now that we have the title column, we can drop name column.
#### ALso, the ticket column doesn't provide any useful information, Let's just drop this as well.

In [17]:
def drop_col(df):
    df.drop(['Name','Ticket'],axis=1,inplace=True)

In [18]:
drop_col(df_tr)

In [19]:
drop_col(df_tst)

In [20]:
df_tr.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,22.0,1,0,7.25,S,Mr
1,1,1,female,38.0,1,0,71.2833,C,Mrs
2,1,3,female,26.0,0,0,7.925,S,Miss
3,1,1,female,35.0,1,0,53.1,S,Mrs
4,0,3,male,35.0,0,0,8.05,S,Mr


##### Now, let's compute the missing AGE values using Titles

In [21]:
def age_imputation(df):
    for i in ['Master','Mrs','Mr','Unknown','Miss']:
        df[(df['Title']== i) & df['Age'].isna()] = df[(df['Title']== i) 
                                            & df['Age'].isna()].fillna((df['Age'][(df['Title']== i)].mean()))
    

In [22]:
#df_tr[(df_tr['Title']=='Master') & df_tr['Age'].isna()] = df_tr[(df_tr['Title']=='Master') & df_tr['Age'].isna()].fillna((df_tr['Age'][(df_tr['Title']=='Master')].mean()))
#df_tr[(df_tr['Title']=='Mrs') & df_tr['Age'].isna()] = df_tr[(df_tr['Title']=='Mrs') & df_tr['Age'].isna()].fillna((df_tr['Age'][(df_tr['Title']=='Mrs')].mean()))
#df_tr[(df_tr['Title']=='Mr') & df_tr['Age'].isna()] = df_tr[(df_tr['Title']=='Mr') & df_tr['Age'].isna()].fillna((df_tr['Age'][(df_tr['Title']=='Mr')].mean()))
#df_tr[(df_tr['Title']=='Miss') & df_tr['Age'].isna()] = df_tr[(df_tr['Title']=='Miss') & df_tr['Age'].isna()].fillna((df_tr['Age'][(df_tr['Title']=='Miss')].mean()))
#df_tr[(df_tr['Title']=='Unknown') & df_tr['Age'].isna()] = df_tr[(df_tr['Title']=='Unknown') & df_tr['Age'].isna()].fillna((df_tr['Age'][(df_tr['Title']=='Unknown')].mean()))

In [23]:
age_imputation(df_tr)
age_imputation(df_tst)

In [24]:
def age_null(df):
    print( df.isnull().sum().max() )

In [25]:
age_null(df_tr)

0


In [26]:
age_null(df_tst)

0


In [27]:
sex_mapping = {"male": 0, "female": 1}
df_tr['Sex'] = df_tr['Sex'].map(sex_mapping)
df_tst['Sex'] = df_tst['Sex'].map(sex_mapping)

In [28]:
embarked_mapping = {"S": 1, "C": 2, "Q": 3}
df_tr['Embarked'] = df_tr['Embarked'].map(embarked_mapping)
df_tst['Embarked'] = df_tst['Embarked'].map(embarked_mapping)

In [29]:
title_mapping = {"Mr":1,"Mrs":2,"Master":3,"Miss":4,"Unknown":5}
df_tr['Title'] = df_tr['Title'].map(title_mapping)
df_tst['Title'] = df_tst['Title'].map(title_mapping)

In [30]:
df_tr.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,1,1
1,1,1,1,38.0,1,0,71.2833,2,2
2,1,3,1,26.0,0,0,7.925,1,4
3,1,1,1,35.0,1,0,53.1,1,2
4,0,3,0,35.0,0,0,8.05,1,1


In [31]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

In [32]:
df_trn_scaled = df_tr.copy()

In [33]:
df_trn_scaled[['Age','Fare']] = scale.fit_transform(df_trn_scaled[['Age','Fare']])

In [34]:
df_tst[['Age','Fare']] = scale.fit_transform(df_tst[['Age','Fare']])

In [35]:
from sklearn.model_selection import train_test_split

X = df_trn_scaled.drop(['Survived'],axis=1)
y = df_trn_scaled['Survived']

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.22, random_state = 0)

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

### Decision Trees ( ID3 and CART )
###### ID3 uses Entropy , CART uses the Gini Impurity.

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [38]:
id3 = DecisionTreeClassifier(criterion='entropy',random_state=0)
id3.fit(X_train, y_train)
y_pred = id3.predict(X_val)
print('ID3 : ',cross_val_score(id3,X_train,y_train,cv=5), ' Mean : ',cross_val_score(id3,X_train,y_train,cv=5).mean())

ID3 :  [0.77697842 0.76978417 0.76258993 0.74100719 0.8115942 ]  Mean :  0.7723907830257534


In [39]:
cart = DecisionTreeClassifier(criterion='gini',random_state=0)
cart.fit(X_train, y_train)
y_pred = cart.predict(X_val)
print('CART : ',cross_val_score(cart,X_train,y_train,cv=5), ' Mean : ',cross_val_score(cart,X_train,y_train,cv=5).mean())

CART :  [0.74820144 0.76978417 0.79136691 0.73381295 0.81884058]  Mean :  0.7724012094672088


###### Hyperparameter Tuning

In [40]:
max_depth=[3,4,5,6,7,8]
min_samples_split=[int(x) for x in np.linspace(start=5,stop=15,num=5)]
max_features=['auto','sqrt','log2']

params = {
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'max_features' : max_features
}

In [41]:
clf = RandomizedSearchCV(estimator=id3,param_distributions=params,n_iter=10,n_jobs=-1,random_state=0,cv=5)
clf.fit(X_train,y_train)
clf.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features='log2',
                       min_samples_split=12, random_state=0)

In [42]:
id3_hyp = DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features='log2',
                       min_samples_split=12, random_state=0).fit(X_train,y_train)
y_pred = id3_hyp.predict(X_val)
print('ID3 Hyp: ',cross_val_score(id3_hyp,X_train,y_train,cv=5), ' Mean : ',cross_val_score(id3_hyp,X_train,y_train,cv=5).mean())

ID3 Hyp:  [0.81294964 0.83453237 0.82014388 0.79856115 0.89855072]  Mean :  0.8329475549994786


In [43]:
clf = RandomizedSearchCV(estimator=cart,param_distributions=params,n_iter=10,n_jobs=-1,random_state=0,cv=5)
clf.fit(X_train,y_train)
clf.best_estimator_

DecisionTreeClassifier(max_depth=6, max_features='sqrt', min_samples_split=12,
                       random_state=0)

In [44]:
cart_hyp = DecisionTreeClassifier(criterion='gini',max_depth=6, max_features='sqrt', min_samples_split=12,
                       random_state=0).fit(X_train,y_train)
y_pred = cart_hyp.predict(X_val)
print('CART Hyp: ',cross_val_score(cart_hyp,X_train,y_train,cv=5), ' Mean : ',cross_val_score(cart_hyp,X_train,y_train,cv=5).mean())

CART Hyp:  [0.8057554  0.84892086 0.82733813 0.77697842 0.86231884]  Mean :  0.8242623292670211


### Random Forest Classifier

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
rfc = RandomForestClassifier(random_state=0).fit(X_train,y_train)
y_pred = rfc.predict(X_val)
print('Random Forest : ',cross_val_score(rfc,X_train,y_train,cv=5), ' Mean : ',cross_val_score(rfc,X_train,y_train,cv=5).mean())

Random Forest :  [0.76258993 0.84172662 0.79856115 0.78417266 0.83333333]  Mean :  0.8040767386091128


###### Hyperparameter Tuning

In [47]:
max_depth=[3,4,5,6,7,8]
min_samples_split=[int(x) for x in np.linspace(start=5,stop=15,num=5)]
max_features=['auto','sqrt','log2']
oob_score = ['True','False']
n_estimators = [int(x) for x in np.linspace(start=100,stop=1200,num=12)]


params = {
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'max_features' : max_features,
    'oob_score' : oob_score,
    'n_estimators' : n_estimators
}

In [48]:
clf = RandomizedSearchCV(estimator=rfc,param_distributions=params,n_iter=10,n_jobs=-1,random_state=0,cv=5)
clf.fit(X_train,y_train)
clf.best_estimator_

RandomForestClassifier(max_depth=4, max_features='log2', min_samples_split=15,
                       n_estimators=500, oob_score='False', random_state=0)

In [49]:
rfc_hyp = RandomForestClassifier(max_depth=4, max_features='log2', min_samples_split=15,
                       n_estimators=500, oob_score='False', random_state=0).fit(X_train,y_train)
y_pred = rfc_hyp.predict(X_val)
print('RF Hyp : ',cross_val_score(rfc_hyp,X_train,y_train,cv=5), ' Mean : ',cross_val_score(rfc_hyp,X_train,y_train,cv=5).mean())

RF Hyp :  [0.81294964 0.84172662 0.81294964 0.81294964 0.89130435]  Mean :  0.8343759774788865


### XGBoost

In [50]:
import xgboost as xgb

In [51]:
xgb_cl = xgb.XGBClassifier(random_state=0).fit(X_train,y_train)
y_pred = xgb_cl.predict(X_val)
print('XGBoost : ',cross_val_score(xgb_cl,X_train,y_train,cv=5), ' Mean : ',cross_val_score(xgb_cl,X_train,y_train,cv=5).mean())

XGBoost :  [0.76258993 0.85611511 0.78417266 0.76978417 0.80434783]  Mean :  0.7954019393181108


###### Hyperparameter Tuninng

In [52]:
n_estimators = [int(x) for x in np.linspace(start=100,stop=1200,num=12)]
max_depth=[3,4,5,6,7,8]
learning_rate = [0.05,0.1,0.15,0.2,0.25,0.3]
colsample_bytree = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]
gamma = [0.01,0.03,0.05,0.07,0.1]
reg_lambda = [0.01,0.03,0.05,0.07,0.1]

params = {
    'max_depth' : max_depth,
    'n_estimators' : n_estimators,
    'learning_rate' : learning_rate,
    'colsample_bytree' : colsample_bytree,
    'gamma' : gamma,
    'reg_lambda' : reg_lambda
}

In [53]:
clf = RandomizedSearchCV(estimator=xgb_cl,param_distributions=params,n_iter=10,n_jobs=-1,random_state=0,cv=5)
clf.fit(X_train,y_train)
clf.best_params_

{'reg_lambda': 0.05,
 'n_estimators': 200,
 'max_depth': 3,
 'learning_rate': 0.1,
 'gamma': 0.01,
 'colsample_bytree': 0.5}

In [54]:
xgb_hyp = xgb.XGBClassifier(n_estimators=200,reg_lambda=0.05,max_depth=3,learning_rate=0.1,gamma=0.01,colsample_bytree=0.5)
xgb_hyp.fit(X_train,y_train)
y_pred = xgb_hyp.predict(X_val)
print('XGBoost Hyp: ',cross_val_score(xgb_hyp,X_train,y_train,cv=5), ' Mean : ',cross_val_score(xgb_hyp,X_train,y_train,cv=5).mean())

XGBoost Hyp:  [0.78417266 0.8705036  0.8057554  0.8057554  0.85507246]  Mean :  0.8242519028255655


### GBM

In [55]:
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
gbm = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
y_pred = gbm.predict(X_val)
print('GBM : ',cross_val_score(gbm,X_train,y_train,cv=5), ' Mean : ',cross_val_score(gbm,X_train,y_train,cv=5).mean())

GBM :  [0.79856115 0.85611511 0.81294964 0.8057554  0.84782609]  Mean :  0.82424147638411


### Support Vector Machines

In [57]:
from sklearn.svm import SVC

In [58]:
svc = SVC(random_state=0).fit(X_train, y_train)
y_pred = svc.predict(X_val)
print('SVC : ',cross_val_score(svc,X_train,y_train,cv=5), ' Mean : ',cross_val_score(svc,X_train,y_train,cv=5).mean())

SVC :  [0.8057554  0.84892086 0.82014388 0.81294964 0.87681159]  Mean :  0.8329162756751121


###### Hyperparameter Tuning

In [59]:
C = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
kernel = ['linear','rbf','poly']
degree = [3,4,5,6]
gamma = ['scale','auto']

params = {
    'C' : C,
    'kernel' : kernel,
    'degree' : degree,
    'gamma' : gamma
}

In [60]:
clf = RandomizedSearchCV(estimator=svc,param_distributions=params,n_iter=10,n_jobs=-1,random_state=0,cv=5)
clf.fit(X_train,y_train)
clf.best_params_

{'kernel': 'rbf', 'gamma': 'auto', 'degree': 3, 'C': 1.0}

In [61]:
svc_hyp = SVC(kernel='rbf',gamma='auto',C=1,random_state=0).fit(X_train, y_train)
y_pred = svc_hyp.predict(X_val)
print('SVC Hyp : ',cross_val_score(svc_hyp,X_train,y_train,cv=5), ' Mean : ',cross_val_score(svc_hyp,X_train,y_train,cv=5).mean())

SVC Hyp :  [0.79856115 0.86330935 0.82014388 0.81294964 0.87681159]  Mean :  0.8343551245959754


## SUBMISSIONS

In [62]:
ids = df_test['PassengerId']

# ID3 predictions
predictions_id3 = id3.predict(df_tst)
output_id3 = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_id3 })
output_id3.to_csv('submission_id3.csv', index=False)

# ID3 HYP predictions
predictions_id3_hyp = id3_hyp.predict(df_tst)
output_id3_hyp = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_id3_hyp })
output_id3_hyp.to_csv('submission_id3_hyp.csv', index=False)

# CART predictions
predictions_cart = cart.predict(df_tst)
output_cart = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_cart })
output_cart.to_csv('submission_cart.csv', index=False)

# CART HYP predictions
predictions_cart_hyp = cart_hyp.predict(df_tst)
output_cart_hyp = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_cart_hyp })
output_cart_hyp.to_csv('submission_cart_hyp.csv', index=False)

# Random Forest predictions
predictions_rfc = rfc.predict(df_tst)
output_rfc = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_rfc })
output_rfc.to_csv('submission_rfc.csv', index=False)

# Random Forest HYP predictions
predictions_rfc_hyp = rfc_hyp.predict(df_tst)
output_rfc_hyp = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_rfc_hyp })
output_rfc_hyp.to_csv('submission_rfc_hyp.csv', index=False)

# XGBoost predictions
predictions_xgb = xgb_cl.predict(df_tst)
output_xgb = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_xgb })
output_xgb.to_csv('submission_xgb.csv', index=False)

# XGBoost HYP predictions
predictions_xgb_hyp = xgb_hyp.predict(df_tst)
output_xgb_hyp = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_xgb_hyp })
output_xgb_hyp.to_csv('submission_xgb_hyp.csv', index=False)

# GBM predictions
predictions_gbm = gbm.predict(df_tst)
output_gbm = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_gbm })
output_gbm.to_csv('submission_gbm.csv', index=False)

# SVC predictions
predictions_svc = svc.predict(df_tst)
output_svc = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_svc })
output_svc.to_csv('submission_svc.csv', index=False)

# SVC HYP predictions
predictions_svc_hyp = svc_hyp.predict(df_tst)
output_svc_hyp = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions_svc_hyp })
output_svc_hyp.to_csv('submission_svc_hyp.csv', index=False)

