In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, PolynomialFeatures, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [23]:
df_train = pd.read_csv('train.csv',index_col="PassengerId")
df_test = pd.read_csv('test.csv',index_col="PassengerId")

df_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [24]:
def fill_nans(df):
    df['Age'] = df.groupby(['Pclass','Sex'])['Age'].apply(lambda x: x.fillna(x.median()))
    df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)
    df['Fare'] = df.groupby(['Pclass','Sex'])['Fare'].apply(lambda x: x.fillna(x.median()))
    return df

df_train = fill_nans(df_train)
df_test = fill_nans(df_test)

In [25]:
def log_transf(X,log_vars):
    for var in log_vars:
        X['log_'+var]=np.log(X[var]+1)
        X.drop(var,axis=1,inplace=True)
    return X

def build_X(df_train,df_test):
    df = pd.concat([df_train,df_test],join='inner',axis=0)
    oe = OrdinalEncoder()
    X_cab = df['Cabin'].str[0].fillna(df['Pclass'].map({1:"C",2:"E",3:"F"})).replace({"T":"A"})
    X_cab = pd.DataFrame(oe.fit_transform(np.array(X_cab).reshape(-1,1)),index=df.index,columns=['Cabin_enc'])
    ohe_vars = ['Sex','Embarked']
    ohe = OneHotEncoder(sparse=False,drop='first')
    ohe.fit(np.array(df[ohe_vars]))
    X_ohe = pd.DataFrame(ohe.transform(np.array(df[ohe_vars])),index=df.index,columns=['Sex_enc','Embarked_1','Embarked_2'])
    X_oth = df[['Pclass','Age','Fare','SibSp','Parch']]
    log_vars = ['Fare']
    X_oth = log_transf(X_oth,log_vars)
    X = pd.concat([X_ohe,X_oth,X_cab],axis=1)
    X['Family_Size'] = df['SibSp'] + df['Parch'] + 1
    family_map = {1:1, 2:2, 3:2, 4:2, 5:3, 6:3, 7:4, 8:4, 11:4}
    X['Family_Size_Grouped'] = X['Family_Size'].map(family_map)
    X["Ticket_freq"]=df.groupby("Ticket")["Ticket"].transform('count')
    scale = StandardScaler()
    #scale.fit(X.loc[df_train.index])
    X = pd.DataFrame(scale.fit_transform(X),index=X.index,columns=X.columns)
    X1 = X.loc[df_train.index]
    X2 = X.loc[df_test.index]
    return X1,X2

def powerset(s):
    x = len(s)
    masks = [1 << i for i in range(x)]
    for i in range(1 << x):
        yield [ss for mask, ss in zip(masks, s) if i & mask]

In [26]:

X,X_submit = build_X(df_train,df_test)
y = df_train['Survived']
X.describe()

Unnamed: 0,Sex_enc,Embarked_1,Embarked_2,Pclass,Age,SibSp,Parch,log_Fare,Cabin_enc,Family_Size,Family_Size_Grouped,Ticket_freq
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.007485,-0.025859,0.055104,0.01643,-0.004482,0.023197,-0.003968,-0.017103,0.014198,0.013089,0.013082,0.011021
std,0.998279,0.963537,0.974678,0.998275,1.007113,1.059047,0.931611,1.0009,0.999298,1.019219,1.020435,1.009841
min,-1.344995,-0.32204,-1.526692,-1.546098,-2.176431,-0.479087,-0.445,-3.076716,-3.009771,-0.558346,-0.699804,-0.619174
25%,-1.344995,-0.32204,-1.526692,-0.352091,-0.580725,-0.479087,-0.445,-0.817605,-0.039147,-0.558346,-0.699804,-0.619174
50%,0.743497,-0.32204,0.655011,0.841916,-0.240085,-0.479087,-0.445,-0.248842,0.703509,-0.558346,-0.699804,-0.619174
75%,0.743497,-0.32204,0.655011,0.841916,0.516891,0.481288,-0.445,0.502937,0.703509,0.073352,0.740516,0.504957
max,0.743497,3.105202,0.655011,0.841916,3.847587,7.203909,6.489576,3.369337,1.446165,5.758637,3.621157,5.00148


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y,random_state=42)

In [28]:
lr = LogisticRegression(penalty='elasticnet',solver='saga')

param_grid = {
        'C':np.geomspace(0.01,3,15),
        'l1_ratio':np.linspace(0,0.5,11)
}

gs = GridSearchCV(lr,param_grid,cv=8,scoring='accuracy',n_jobs=-1)
gs.fit(X_train,y_train)

lr=gs.best_estimator_

print(lr.score(X_train,y_train))
print(lr.score(X_test,y_test))
print(gs.best_params_)

0.8170144462279294
0.8059701492537313
{'C': 1.3281489698569586, 'l1_ratio': 0.05}


In [29]:
svm_1 = SVC(kernel='poly',cache_size=7000,max_iter=2000)

param_grid = {
        'C':np.linspace(30,50,20),
        'degree':[2,3,4],
        'gamma':np.linspace(0.001,0.1,10)
}

gs = GridSearchCV(svm_1,param_grid,cv=8,scoring='accuracy',verbose=1,n_jobs=-1)
gs.fit(X_train,y_train)

svm_1=gs.best_estimator_


print(svm_1.score(X_train,y_train))
print(svm_1.score(X_test,y_test))
print(gs.best_params_)

Fitting 8 folds for each of 600 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 2800 tasks      | elapsed:    4.2s


0.8683788121990369
0.7910447761194029
{'C': 36.31578947368421, 'degree': 3, 'gamma': 0.045000000000000005}


[Parallel(n_jobs=-1)]: Done 4800 out of 4800 | elapsed:    7.2s finished


In [30]:
svm_2 = SVC(kernel='rbf',cache_size=7000,max_iter=3000)

param_grid = {
        'C':np.linspace(7,20,15),
        'gamma':np.geomspace(0.01,0.4,15)+np.linspace(0,0.1,15)
}

gs = GridSearchCV(svm_2,param_grid,cv=8,scoring='accuracy',verbose=2,n_jobs=-1)
gs.fit(X_train,y_train)

svm_2=gs.best_estimator_


print(svm_2.score(X_train,y_train))
print(svm_2.score(X_test,y_test))
print(gs.best_params_)

Fitting 8 folds for each of 225 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    1.2s


0.85553772070626
0.8134328358208955
{'C': 8.857142857142858, 'gamma': 0.03122385408667882}


[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:    3.7s finished


In [31]:
rf = RandomForestClassifier(n_estimators=2000,oob_score=True)

param_grid = {
        'max_depth':np.ceil(np.linspace(2,10,5)),
        'min_samples_leaf':(np.linspace(1,20,5)).astype(int),
        'max_features':np.linspace(1,8,5).astype(int)
}

gs = GridSearchCV(rf,param_grid,cv=8,scoring='accuracy',verbose=3,n_jobs=-1)
gs.fit(X_train,y_train)

rf=gs.best_estimator_


print(rf.score(X_train,y_train))
print(rf.score(X_test,y_test))
print(gs.best_params_)

Fitting 8 folds for each of 125 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  7.9min finished


0.9309791332263242
0.8171641791044776
{'max_depth': 8.0, 'max_features': 4, 'min_samples_leaf': 1}


In [33]:
gb = GradientBoostingClassifier(n_estimators=2000)

param_grid = {
        'max_depth':np.ceil(np.linspace(2,6,3)),
        'min_samples_leaf':(np.linspace(4,30,5)).astype(int),
        'max_features':np.linspace(1,6,4).astype(int),
        'learning_rate':np.geomspace(0.01,0.1,4)+np.linspace(0,0.1,4)
    
}

gs = GridSearchCV(gb,param_grid,cv=7,scoring='accuracy',verbose=3,n_jobs=-1)
gs.fit(X_train,y_train)

gb=gs.best_estimator_


print(gb.score(X_train,y_train))
print(gb.score(X_test,y_test))
print(gs.best_params_)

Fitting 7 folds for each of 240 candidates, totalling 1680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1536 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1680 out of 1680 | elapsed:  5.2min finished


0.869983948635634
0.8097014925373134
{'learning_rate': 0.01, 'max_depth': 4.0, 'max_features': 1, 'min_samples_leaf': 30}


In [18]:
classifiers = [('lr', lr), ('svm_1', svm_1), ('svm_2', svm_2), ('rf', rf), ('gb', gb)]
#classifiers = [('lr', lr), ('svm_2', svm_2), ('rf', rf)]
ps = list(powerset(classifiers))
ps.remove([])

for cl in classifiers:
    print(classification_report(y_test,cl[1].predict(X_test)))
    print(confusion_matrix(y_test,cl[1].predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       165
           1       0.76      0.73      0.74       103

    accuracy                           0.81       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.81      0.81       268

[[141  24]
 [ 28  75]]
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       165
           1       0.78      0.64      0.70       103

    accuracy                           0.79       268
   macro avg       0.79      0.76      0.77       268
weighted avg       0.79      0.79      0.79       268

[[146  19]
 [ 37  66]]
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       165
           1       0.80      0.68      0.74       103

    accuracy                           0.81       268
   macro avg       0.81      0.79      0.80       268
weighted avg       0.81     

In [19]:
counter=1
for vc in ps:
    names, models = zip(*vc)
    print(names)
    vc = VotingClassifier(vc)
    vc.fit(X_train,y_train)
    print(vc.score(X_train,y_train))
    print(vc.score(X_test,y_test))
    y_pred=vc.predict(X_submit)
    df_submit = pd.DataFrame(y_pred,index=df_test.index,columns=['Survived'])
    df_submit.to_csv('Submit Files/VC_'+str(counter)+'.csv')
    counter+=1


('lr',)
0.8170144462279294
0.8059701492537313
('svm_1',)
0.8683788121990369
0.7910447761194029
('lr', 'svm_1')
0.8443017656500803
0.7947761194029851
('svm_2',)
0.85553772070626
0.8134328358208955
('lr', 'svm_2')
0.8378812199036918
0.8059701492537313
('svm_1', 'svm_2')
0.8603531300160514
0.8059701492537313
('lr', 'svm_1', 'svm_2')
0.8571428571428571
0.8097014925373134
('rf',)
0.9518459069020867
0.8059701492537313
('lr', 'rf')
0.8892455858747994
0.7985074626865671
('svm_1', 'rf')
0.9020866773675762
0.8059701492537313
('lr', 'svm_1', 'rf')
0.8892455858747994
0.8097014925373134
('svm_2', 'rf')
0.9004815409309791
0.8022388059701493
('lr', 'svm_2', 'rf')
0.8747993579454254
0.8171641791044776
('svm_1', 'svm_2', 'rf')
0.8747993579454254
0.8059701492537313
('lr', 'svm_1', 'svm_2', 'rf')
0.8715890850722311
0.8059701492537313
('gb',)
0.8828250401284109
0.8134328358208955
('lr', 'gb')
0.8587479935794543
0.8097014925373134
('svm_1', 'gb')
0.869983948635634
0.8022388059701493
('lr', 'svm_1', 'gb')
0

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,24.0,0,0,A.5. 3236,8.0500,,S
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,3,"Ware, Mr. Frederick",male,24.0,0,0,359309,8.0500,,S
