In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score,StratifiedKFold

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (GradientBoostingClassifier,RandomForestClassifier, VotingClassifier,
                             StackingClassifier, AdaBoostClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [11]:
df = pd.read_csv("train_processed.csv")
y = df["Survived"]
X = df.iloc[:,2:]
#X = X.drop(["Ticket_number"], axis =1)
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 100)

In [3]:
X.columns

Index(['Sex', 'Age', 'Ticket_category', 'Is_with_familly', 'Fare_per_person',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'SibSp_0', 'SibSp_1', 'SibSp_2',
       'Parch_0', 'Parch_1', 'Parch_2', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

# Modelisation

In [13]:
models = [SVC(), KNeighborsClassifier(),LogisticRegression(),GradientBoostingClassifier(),RandomForestClassifier(),
          DecisionTreeClassifier(), GaussianNB()]
names = ["svc","knn","rl","gb","rf","dt","gnb"]
param_grid_svc = {'C':[0.5,1,5,10],"kernel" :["linear", "poly", "rbf", "sigmoid"],}
param_grid_knn = {'n_neighbors':[5,15,25],"metric" :["minkowski","euclidean","manhattan","chebyshev"]}
param_grid_rl = { 'C':[0.05,0.07,0.1], 'max_iter' : [300,200,100], "solver" : ["newton-cg", "lbfgs", "sag", "saga"]}
param_grid_gb = {"n_estimators":[100,200,300], "learning_rate":[0.10,0.50,1], "max_depth": [1,2,3]}
param_grid_rf = {'max_features': ["sqrt", "log2", None], "criterion": ["gini", "entropy"],"max_depth" : [3,2,1, None]}
param_grid_dt = {'max_features': ["sqrt", "log2", None], "criterion": ["gini", "entropy"],"max_depth" : [3,2,1, None]}
param_grid_gnb = {}
param_grid = [param_grid_svc,param_grid_knn,param_grid_rl,param_grid_gb,param_grid_rf,param_grid_dt,param_grid_gnb]

gridcvs = {}
for pgrid, clf, name in zip(param_grid,models,names):
    gcv = GridSearchCV(clf, pgrid, cv=5, n_jobs= -1, refit=True).fit(X_train,y_train)
    gridcvs[name] = gcv
    print(name)
    print("best parameters :",gridcvs[name].best_params_)
    print("TRAIN accuracy :", gridcvs[name].best_score_)
    print("TEST accuracy :", gridcvs[name].score(X_test,y_test),"\n")

svc
best parameters : {'C': 0.5, 'kernel': 'rbf'}
TRAIN accuracy : 0.8062050625430908
TEST accuracy : 0.7988826815642458 

knn
best parameters : {'metric': 'minkowski', 'n_neighbors': 15}
TRAIN accuracy : 0.7765882005318625
TEST accuracy : 0.8044692737430168 



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


rl
best parameters : {'C': 0.07, 'max_iter': 300, 'solver': 'newton-cg'}
TRAIN accuracy : 0.8047473653107456
TEST accuracy : 0.7821229050279329 

gb
best parameters : {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
TRAIN accuracy : 0.8243967300305328
TEST accuracy : 0.8268156424581006 

rf
best parameters : {'criterion': 'entropy', 'max_depth': None, 'max_features': None}
TRAIN accuracy : 0.8272727272727274
TEST accuracy : 0.7932960893854749 

dt
best parameters : {'criterion': 'gini', 'max_depth': 3, 'max_features': None}
TRAIN accuracy : 0.806155816014971
TEST accuracy : 0.7877094972067039 

gnb
best parameters : {}
TRAIN accuracy : 0.7443809711415346
TEST accuracy : 0.7318435754189944 



In [4]:
svc = SVC()
param_grid_svc = {'C':[1,5,10],"kernel" :["linear", "poly", "rbf", "sigmoid"],}
grid_svc = GridSearchCV(estimator=svc, param_grid= param_grid_svc,cv=5, n_jobs = -1, refit=True).fit(X_train,y_train)
print(grid_svc.best_params_)
print(grid_svc.best_score_) 
print("svc test score :",grid_svc.score(X_test,y_test))

{'C': 1, 'kernel': 'poly'}
0.8047473653107456
svc test score : 0.8100558659217877


In [5]:
knn =  KNeighborsClassifier()
param_grid_knn = {'n_neighbors':[5,10,15,20,25,30],"metric" :["minkowski","euclidean","manhattan","chebyshev"]}
grid_knn = GridSearchCV(estimator=knn, param_grid= param_grid_knn,cv=5, n_jobs = -1, refit=True).fit(X_train,y_train)
print(grid_knn.best_params_)
print(grid_knn.best_score_) 
print("knn test  score :",grid_knn.score(X_test,y_test))

{'metric': 'minkowski', 'n_neighbors': 15}
0.7765882005318625
knn test  score : 0.8044692737430168


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [6]:
rl = LogisticRegression()
param_grid_rl = { 'C':[0.05,0.07,0.1], 'max_iter' : [300,200,100], "solver" : ["newton-cg", "lbfgs", "sag", "saga"]}
grid_rl = GridSearchCV(estimator=rl, param_grid= param_grid_rl,cv=5, n_jobs = -1, refit=True).fit(X_train,y_train)
print(grid_rl.best_params_)
print(grid_rl.best_score_) 
print("rl test score :",grid_rl.score(X_test,y_test))

{'C': 0.07, 'max_iter': 300, 'solver': 'newton-cg'}
0.8047473653107456
rl test score : 0.7821229050279329


In [7]:
gb = GradientBoostingClassifier()
param_grid_gb = {"n_estimators":[100,200,300], "learning_rate":[0.25,0.30,0.40], "max_depth": [1,2]}
grid_gb = GridSearchCV(estimator=gb, param_grid= param_grid_gb,cv=5, n_jobs = -1, refit=True).fit(X_train,y_train)
print(grid_gb.best_params_)
print(grid_gb.best_score_) 
print("gb test score :",grid_gb.score(X_test,y_test))

{'learning_rate': 0.25, 'max_depth': 2, 'n_estimators': 300}
0.8272530286614794
gb test score : 0.8435754189944135


In [8]:
rf = RandomForestClassifier()
param_grid_rf = {'max_features': ["sqrt", "log2", None], "criterion": ["gini", "entropy"],"max_depth" : [2,1, None]}
grid_rf = GridSearchCV(estimator=rf, param_grid= param_grid_rf,cv=5, n_jobs = -1, refit=True).fit(X_train,y_train)
print(grid_rf.best_params_)
print(grid_rf.best_score_) 
print("rf test score :",grid_rf.score(X_test,y_test))

{'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt'}
0.8328868314783808
rf test score : 0.8156424581005587


In [9]:
dt=DecisionTreeClassifier()
param_grid_dt = {'max_features': ["sqrt", "log2", None], "criterion": ["gini", "entropy"],"max_depth" : [2,1, None]}
grid_dt = GridSearchCV(estimator=rf, param_grid= param_grid_rf,cv=5, n_jobs = -1, refit=True).fit(X_train,y_train)
print(grid_dt.best_params_)
print(grid_dt.best_score_) 
print("dt test score :",grid_dt.score(X_test,y_test))

{'criterion': 'entropy', 'max_depth': None, 'max_features': None}
0.8357135821924555
dt test score : 0.7988826815642458


In [10]:
feats={}
dt2=DecisionTreeClassifier(criterion= 'gini', max_depth= None, max_features= None)
dt2.fit(X_train,y_train)
for feature, importance in zip(X_train.columns, dt2.feature_importances_):
    feats[feature] = importance 
    
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Importance'})
importances.sort_values(by='Importance', ascending=False).head(50)


Unnamed: 0,Importance
Sex,0.289604
Ticket_number,0.219401
Age,0.162077
Fare_per_person,0.13098
Pclass_3,0.076498
Ticket_category,0.026129
SibSp_2,0.02417
Is_with_familly,0.01709
Parch_2,0.01611
Embarked_Q,0.011563


In [11]:
vclf_all = VotingClassifier(estimators=[('gb', gb), ('rf', rf), ('svc', svc), ('lr', rl), ("knn",knn),("dt",dt)], voting='hard').fit(X_train,y_train)
vclf_best = VotingClassifier(estimators=[('gb', gb), ('rf', rf),("dt",dt)], voting='hard').fit(X_train,y_train)
print("vclf_all test score :", vclf_all.score(X_test,y_test))
print("vclf_best test score :", vclf_best.score(X_test,y_test))

vclf_all test score : 0.8212290502793296
vclf_best test score : 0.8379888268156425


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [12]:
sclf = StackingClassifier(estimators=[('gb', gb), ('rf', rf), ('svc', svc), ('lr', rl), ("knn",knn),("dt",dt)], final_estimator = gb).fit(X_train,y_train)
print("sclf test score :", sclf.score(X_test,y_test))

sclf test score : 0.8156424581005587


In [13]:
abc = AdaBoostClassifier(base_estimator=gb,n_estimators=400)
abc.fit(X_train,y_train)
print("abc test score :",abc.score(X_test,y_test))

abc test score : 0.7932960893854749


# Deep Learning

In [14]:
#une fonction permettant de réuidre le learning rate en fonction de l'évolution de la "val_accuracy"
from tensorflow.keras.callbacks import ReduceLROnPlateau
reducelr = ReduceLROnPlateau(monitor = 'val_accuracy',
                        min_delta = 0.001,
                        patience = 10,
                        factor = 0.5, 
                        cooldown = 5,
                        verbose = 1)

#une fonction permettant de stoper l'entrainement si l'évolution de la "val_accuracy" ne depasse plus un certain seuil
from tensorflow.keras.callbacks import EarlyStopping 
earlystop = EarlyStopping(monitor = 'val_accuracy',
                    min_delta = 0.0001,
                    patience = 50,
                    verbose = 1,
                    restore_best_weights = True)

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense ,  Dropout
from tensorflow.keras.models import Model

#creation du model avec 2 couche Dense
model = Sequential()
model.add(Input(shape = X_train.shape[1]))
model.add(Dense(512, activation="relu"))
model.add(Dense(256, activation="relu"))
model.add(Dense(128, activation="tanh"))
model.add(Dense(64, activation="tanh"))
model.add(Dense(32, activation="tanh"))
model.add(Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               9728      
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 184,321
Trainable params: 184,321
Non-trai

In [16]:
#entrainement du modèle
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

model_history = model.fit (X_train, y_train, epochs = 200, batch_size = 50, validation_split = 0.2, callbacks= [reducelr,earlystop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
 1/12 [=>............................] - ETA: 0s - loss: 0.2487 - accuracy: 0.9000
Epoch 13: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
 1/12 [=>............................] - ETA: 0s - loss: 0.2798 - accuracy: 0.8600
Epoch 27: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
 1/12 [=>............................] - ETA: 0s - loss: 0.1649 - accuracy: 0.9400
Epoch 41: ReduceLROnPlateau reducing learning rate to 0.00012500000593718

Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
 1/12 [=>............................] - ETA: 0s - loss: 0.1861 - accuracy: 0.9400Restoring model weights from the end of the best epoch: 3.
Epoch 53: early stopping
