<a href="https://colab.research.google.com/github/AryanMethil/Titanic-Kaggle-/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import joblib
import pandas as pd
from sklearn import metrics
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
TRAINING_PATH='/content/drive/My Drive/Titanic/input/train_folds.csv'
TESTING_PATH='/content/drive/My Drive/Titanic/input/test.csv'
MODEL_PATH='/content/drive/My Drive/Titanic/models/'
SUBMISSION_FILES_PATH='/content/drive/My Drive/Titanic/Submissions/'

# Data Exploration

1.   Null Values
2.   Number of unique values



In [5]:
df=pd.read_csv(TRAINING_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,kfolds
0,0,240,0,2,"Hunt, Mr. George Henry",male,33.0,0,0,SCO/W 1585,12.275,,S,0
1,1,561,0,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.75,,Q,0
2,2,322,0,3,"Danoff, Mr. Yoto",male,27.0,0,0,349219,7.8958,,S,0
3,3,269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58.0,0,1,PC 17582,153.4625,C125,S,0
4,4,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S,0


In [6]:
df.describe()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,kfolds
count,891.0,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,445.0,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,1.997755
std,257.353842,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,1.4158
min,0.0,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,222.5,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,1.0
50%,445.0,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,2.0
75%,667.5,668.5,1.0,3.0,38.0,1.0,0.0,31.0,3.0
max,890.0,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,4.0


In [7]:
# Count the number of null values in each column
df.isna().sum()

Unnamed: 0       0
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
kfolds           0
dtype: int64

In [8]:
# Total number of rows
len(df)

891

In [9]:
# Total number of unique values in each column
df.nunique()

Unnamed: 0     891
PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
kfolds           5
dtype: int64

In [10]:
# Check for class imbalance 
df['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

#### Drop Columns based on the number of unique values and number of null values (both should be sufficiently large to drop) 

In [11]:
# Unnamed:0 , Name and PassengerId have all different values so no value can determine the result
# Ticket also has lots of different values
# Cabin has lots of null values
df=df.drop(['Cabin','Name','PassengerId','Unnamed: 0','Ticket'],axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,kfolds
0,0,2,male,33.0,0,0,12.275,S,0
1,0,3,male,,0,0,7.75,Q,0
2,0,3,male,27.0,0,0,7.8958,S,0
3,1,1,female,58.0,0,1,153.4625,S,0
4,1,2,male,34.0,0,0,13.0,S,0


#### Imputer to fill in missing values

1.   KNN Imputer
2.   Iterative Imputer



In [12]:
# Use KNN Imputer to fill missing values

from sklearn.impute import KNNImputer
imputer=KNNImputer(n_neighbors=3)

df_knn_imputed=df
df_knn_imputed['Age']=imputer.fit_transform(df_knn_imputed['Age'].values.reshape(-1,1))
df_knn_imputed.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,kfolds
0,0,2,male,33.0,0,0,12.275,S,0
1,0,3,male,29.699118,0,0,7.75,Q,0
2,0,3,male,27.0,0,0,7.8958,S,0
3,1,1,female,58.0,0,1,153.4625,S,0
4,1,2,male,34.0,0,0,13.0,S,0


In [13]:
# Use Iterative Imputer to fill missing values

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
imputer=IterativeImputer(random_state=42)

df_iter_imputed=df
df_iter_imputed['Age']=imputer.fit_transform(df_iter_imputed['Age'].values.reshape(-1,1))
df_iter_imputed.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,kfolds
0,0,2,male,33.0,0,0,12.275,S,0
1,0,3,male,29.699118,0,0,7.75,Q,0
2,0,3,male,27.0,0,0,7.8958,S,0
3,1,1,female,58.0,0,1,153.4625,S,0
4,1,2,male,34.0,0,0,13.0,S,0


#### One hot encode categorical features

In [14]:
# One hot encode the categorical columns - Sex and Embarked

df=pd.get_dummies(data=df_iter_imputed,columns=['Sex','Embarked'])
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,kfolds,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,2,33.0,0,0,12.275,0,0,1,0,0,1
1,0,3,29.699118,0,0,7.75,0,0,1,0,1,0
2,0,3,27.0,0,0,7.8958,0,0,1,0,0,1
3,1,1,58.0,0,1,153.4625,0,1,0,0,0,1
4,1,2,34.0,0,0,13.0,0,0,1,0,0,1


#### Move the Survived and kfolds column to the end

In [15]:
# Move the target and kfolds column to the last

df=df[[column for column in df if column not in['Survived','kfolds']]+['Survived','kfolds']]
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived,kfolds
0,2,33.0,0,0,12.275,0,1,0,0,1,0,0
1,3,29.699118,0,0,7.75,0,1,0,1,0,0,0
2,3,27.0,0,0,7.8958,0,1,0,0,1,0,0
3,1,58.0,0,1,153.4625,1,0,0,0,1,1,0
4,2,34.0,0,0,13.0,0,1,0,0,1,1,0


#### MinMax Scaler

In [16]:
# Scale the columns using MinMaxScaler except for the target and kfolds column

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
df_2=pd.DataFrame(scaler.fit_transform(df),index=df.index,columns=df.columns)
df_2['kfolds']=df['kfolds']
df_2['Survived']=df['Survived']
df=df_2
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived,kfolds
0,0.5,0.409399,0.0,0.0,0.023959,0.0,1.0,0.0,0.0,1.0,0,0
1,1.0,0.367921,0.0,0.0,0.015127,0.0,1.0,0.0,1.0,0.0,0,0
2,1.0,0.334004,0.0,0.0,0.015412,0.0,1.0,0.0,0.0,1.0,0,0
3,0.0,0.723549,0.0,0.166667,0.299539,1.0,0.0,0.0,0.0,1.0,1,0
4,0.5,0.421965,0.0,0.0,0.025374,0.0,1.0,0.0,0.0,1.0,1,0


# Feature Selection

1.   Greedy Feature Selection
2.   Recursive Feature Selection



In [17]:
def run(fold,df,models,target_name,print_details=False):
  
  # print(df.head())
  # Training and validation sets
  df_train=df[df['kfolds']!=fold].reset_index(drop=True)
  df_valid=df[df['kfolds']==fold].reset_index(drop=True)


  # x and y of training dataset
  x_train=df_train.drop(target_name,axis=1).values
  y_train=df_train[target_name].values

  # x and y of validation dataset
  x_valid=df_valid.drop(target_name,axis=1).values
  y_valid=df_valid[target_name].values

  # accuracy => will store accuracies of the models  (same for confusion_matrices)
  accuracy=[]
  confusion_matrices=[]
  classification_report=[]

  for model_name,model_constructor in list(models.items()):
    clf=model_constructor
    clf.fit(x_train,y_train)

    # preds_train, preds_valid => predictions when training and validation x are fed into the trained model
    preds_train=clf.predict(x_train)
    preds_valid=clf.predict(x_valid)

    acc_train=metrics.accuracy_score(y_train,preds_train)
    acc_valid=metrics.accuracy_score(y_valid,preds_valid)
    conf_matrix=metrics.confusion_matrix(y_valid,preds_valid)
    class_report=metrics.classification_report(y_valid,preds_valid)

    accuracy.append(acc_valid)
    confusion_matrices.append(conf_matrix)
    classification_report.append(class_report)

    if(print_details==True):
      print(f'Model => {model_name} => Fold = {fold} => Training Accuracy = {acc_train} => Validation Accuracy = {acc_valid}')

  if(print_details==True):
    print('\n--------------------------------------------------------------------------------------------\n')
    
  return accuracy,confusion_matrices,classification_report

In [18]:
def greedy_feature_selection(fold,df,models,target_name):

  # target_index => stores the index of the target variable in the dataset
  # kfolds_index => stores the index of kfolds column in the dataset

  target_index=df.columns.get_loc(target_name)
  kfolds_index=df.columns.get_loc('kfolds')

  # good_features => stores the indices of all the optimal features
  # best_scores => keeps track of the best scores 
  good_features=[]
  best_scores=[]

  # df has X and y and a kfolds column. 
  # no of features (no of columns in X) => total columns in df - 1 (there's 1 y) - 1 (there's 1 kfolds)
  num_features=df.shape[1]-2

  while True:

    # this_feature => the feature added to the already selected features to measure the effect of the former on the model
    # best_score => keeps track of the best score achieved while selecting features 1 at a time and checking its effect on the model
    this_feature=None
    best_score=0


    for feature in range(num_features):

      # if the feature is already in the good_features list, ignore and move ahead
      if feature in good_features:
        continue
      
      # add the currently selected feature to the already discovered good features
      selected_features=good_features+[feature]

      # all the selected features + target and kfolds column
      df_train=df.iloc[:, selected_features + [target_index,kfolds_index]]

      # fit the selected dataset to a model 
      accuracy,confusion_matrices,classification_report=run(fold,df_train,models,target_name=target_name)

      # if any improvement is observed over the previous set of features
      if(accuracy[0]>best_score):
        this_feature=feature
        best_score=accuracy[0]
      
    if(this_feature!=None):
      good_features.append(this_feature)
      best_scores.append(best_score)
    
    if(len(best_scores)>2):
      if(best_scores[-1]<best_scores[-2]):
        break
    
  return best_scores[:-1] , df.iloc[:, good_features[:-1] + [target_index,kfolds_index]]

In [19]:
from sklearn.feature_selection import RFE

def recursive_feature_selection(df,models,n_features_to_select,target_name):
  X=df.drop(labels=[target_name,'kfolds'],axis=1).values
  y=df[target_name]
  kfolds=df.kfolds.values

  model_name,model_constructor=list(models.items())[0]

  rfe=RFE(
      estimator=model_constructor,
      n_features_to_select=n_features_to_select
  )

  try:
    rfe.fit(X,y)
  except RuntimeError:
    print(f"{model_name} does not support feature importance... Returning original dataframe\n")
    return df
  else:
    X_transformed = rfe.transform(X)
    df_optimal=pd.DataFrame(data=[X,y,kfolds])
    return df_optimal

# Finding the optimal features for the different models

Models :

1.   XGB Classifier
2.   Gaussian Naive Bayes
3.   SVM Classifier
4.   Random Forest Classifier



In [20]:
print('Greedy Feature Selection : ')
print('\n')
models={'XGB': XGBClassifier()}
best_scores,df_optimal_XGB=greedy_feature_selection(fold=4,df=df,models=models,target_name='Survived')
print(df_optimal_XGB.head())

print('\n')
print("Recursive Feature Selection : ")
print('\n')
df_recursive_optimal_XGB=recursive_feature_selection(df=df,models=models,n_features_to_select=5,target_name='Survived')
print(df_recursive_optimal_XGB.head())

Greedy Feature Selection : 




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


   Sex_female  Pclass      Fare  Sex_male  Embarked_Q  Survived  kfolds
0         0.0     0.5  0.023959       1.0         0.0         0       0
1         0.0     1.0  0.015127       1.0         1.0         0       0
2         0.0     1.0  0.015412       1.0         0.0         0       0
3         1.0     0.0  0.299539       0.0         0.0         1       0
4         0.0     0.5  0.025374       1.0         0.0         1       0


Recursive Feature Selection : 


                                                   0
0  [[0.5, 0.4093993465694898, 0.0, 0.0, 0.0239592...
1  0      0
1      0
2      0
3      1
4      1
 ...
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...


In [21]:
models={'Naive Bayes' : GaussianNB()}
best_scores,df_optimal_NB=greedy_feature_selection(fold=4,df=df,models=models,target_name='Survived')
print(df_optimal_NB.head())

print('\n')
df_recursive_optimal_NB=recursive_feature_selection(df=df,models=models,n_features_to_select=5,target_name='Survived')
print(df_recursive_optimal_NB.head())

  _warn_prf(average, modifier, msg_start, len(result))


   Sex_female  Pclass     Parch      Fare  Embarked_Q  Survived  kfolds
0         0.0     0.5  0.000000  0.023959         0.0         0       0
1         0.0     1.0  0.000000  0.015127         1.0         0       0
2         0.0     1.0  0.000000  0.015412         0.0         0       0
3         1.0     0.0  0.166667  0.299539         0.0         1       0
4         0.0     0.5  0.000000  0.025374         0.0         1       0


Naive Bayes does not support feature importance... Returning original dataframe

   Pclass       Age  SibSp     Parch  ...  Embarked_Q  Embarked_S  Survived  kfolds
0     0.5  0.409399    0.0  0.000000  ...         0.0         1.0         0       0
1     1.0  0.367921    0.0  0.000000  ...         1.0         0.0         0       0
2     1.0  0.334004    0.0  0.000000  ...         0.0         1.0         0       0
3     0.0  0.723549    0.0  0.166667  ...         0.0         1.0         1       0
4     0.5  0.421965    0.0  0.000000  ...         0.0         1.0

In [22]:
models={'SVM' : SVC()}
best_scores,df_optimal_SVM=greedy_feature_selection(fold=4,df=df,models=models,target_name='Survived')
print(df_optimal_SVM.head())

print('\n')
df_recursive_optimal_SVM=recursive_feature_selection(df=df,models=models,n_features_to_select=5,target_name='Survived')
print(df_recursive_optimal_SVM.head())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


   Sex_female  Pclass  Embarked_Q  SibSp       Age      Fare  Survived  kfolds
0         0.0     0.5         0.0    0.0  0.409399  0.023959         0       0
1         0.0     1.0         1.0    0.0  0.367921  0.015127         0       0
2         0.0     1.0         0.0    0.0  0.334004  0.015412         0       0
3         1.0     0.0         0.0    0.0  0.723549  0.299539         1       0
4         0.0     0.5         0.0    0.0  0.421965  0.025374         1       0


SVM does not support feature importance... Returning original dataframe

   Pclass       Age  SibSp     Parch  ...  Embarked_Q  Embarked_S  Survived  kfolds
0     0.5  0.409399    0.0  0.000000  ...         0.0         1.0         0       0
1     1.0  0.367921    0.0  0.000000  ...         1.0         0.0         0       0
2     1.0  0.334004    0.0  0.000000  ...         0.0         1.0         0       0
3     0.0  0.723549    0.0  0.166667  ...         0.0         1.0         1       0
4     0.5  0.421965    0.0  0.0

In [23]:
models={'RFC' : RandomForestClassifier()}
best_scores,df_optimal_RFC=greedy_feature_selection(fold=4,df=df,models=models,target_name='Survived')
print(df_optimal_RFC.head())

print('\n')
df_recursive_optimal_RFC=recursive_feature_selection(df=df,models=models,n_features_to_select=5,target_name='Survived')
print(df_recursive_optimal_RFC.head())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


   Sex_female  Pclass  SibSp      Fare  Survived  kfolds
0         0.0     0.5    0.0  0.023959         0       0
1         0.0     1.0    0.0  0.015127         0       0
2         0.0     1.0    0.0  0.015412         0       0
3         1.0     0.0    0.0  0.299539         1       0
4         0.0     0.5    0.0  0.025374         1       0


                                                   0
0  [[0.5, 0.4093993465694898, 0.0, 0.0, 0.0239592...
1  0      0
1      0
2      0
3      1
4      1
 ...
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...


# **Warning : These cells take hours to run! **

---


# Hyperparameter tuning

Models : 

1.   XGB Classifier
2.   SVM Classifier
3.   Random Forest Classifier




In [24]:
from sklearn import model_selection
from sklearn import metrics

def hyperparameter_tune_and_run(df,num_folds,models,target_name,param_grid,evaluation_metric,print_details=False):
  X=df.drop(labels=[target_name,'kfolds'],axis=1).values
  y=df[target_name]

  model_name,model_constructor=list(models.items())[0]

  model = model_selection.GridSearchCV(
      estimator = model_constructor,
      param_grid = param_grid,
      scoring = evaluation_metric,
      verbose = 10,
      cv = num_folds,
      n_jobs=-1
  )

  model.fit(X,y)

  if(print_details==True):
    print(f"Best score : {model.best_score_}")

    print("Best parameters : ")
    best_parameters=model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
      print(f"\t{param_name}: {best_parameters[param_name]}")
  
  return model

In [None]:
models={'XGB Classifier': XGBClassifier()}
param_grid = {
    "learning_rate":[0.01,0.015,0.025,0.05,0.1],
    "gamma":[0.05,0.1,0.3,0.5,0.7,0.9,1.0],
    "max_depth":[3,5,7,9,12,15,17,25],
    "min_child_weight":[1,3,5,7],
    "subsample":[0.6,0.7,0.8,0.9,1.0],
    "colsample_bytree":[0.6,0.7,0.8,0.9,1.0],
    # "reg_lambda":[0.01,0.03,0.05,0.07,0.09,0.1,1.0],
    # "reg_alpha":[0.01,0.03,0.05,0.07,0.09,0.1,1.0]
}
model = hyperparameter_tune_and_run(df=df_optimal_XGB,num_folds=5,models=models,target_name='Survived',param_grid=param_grid,evaluation_metric="accuracy",print_details=True)

Fitting 5 folds for each of 28000 candidates, totalling 140000 fits
[CV] colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6, score=0.804, total=   0.0s
[CV] colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6, score=0.798, total=   0.0s
[CV] colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6, score=0.798, total=   0.0s
[CV] colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsam

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s remaining:    0.0s


[CV]  colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7, score=0.798, total=   0.0s
[CV] colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7 
[CV]  colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7, score=0.798, total=   0.0s
[CV] colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7 
[CV]  colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7, score=0.781, total=   0.0s
[CV] colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7 
[CV]  colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7, score=0.792, total=   0.0s
[CV] colsample_bytree=0.6, gamma=0.05, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.8 
[CV]  colsample_bytree=0.6, gamm

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s remaining:    0.0s


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV]  colsample_bytree=1.0, gamma=1.0, learning_rate=0.015, max_depth=25, min_child_weight=1, subsample=0.6, score=0.827, total=   0.1s
[CV] colsample_bytree=1.0, gamma=1.0, learning_rate=0.015, max_depth=25, min_child_weight=1, subsample=0.6 
[CV]  colsample_bytree=1.0, gamma=1.0, learning_rate=0.015, max_depth=25, min_child_weight=1, subsample=0.6, score=0.792, total=   0.1s
[CV] colsample_bytree=1.0, gamma=1.0, learning_rate=0.015, max_depth=25, min_child_weight=1, subsample=0.6 
[CV]  colsample_bytree=1.0, gamma=1.0, learning_rate=0.015, max_depth=25, min_child_weight=1, subsample=0.6, score=0.809, total=   0.1s
[CV] colsample_bytree=1.0, gamma=1.0, learning_rate=0.015, max_depth=25, min_child_weight=1, subsample=0.6 
[CV]  colsample_bytree=1.0, gamma=1.0, learning_rate=0.015, max_depth=25, min_child_weight=1, subsample=0.6, score=0.815, total=   0.1s
[CV] colsample_bytree=1.0, gamma=1.0, learning_rate=0.015, max_dept

[Parallel(n_jobs=1)]: Done 140000 out of 140000 | elapsed: 120.1min finished


Best score : 0.8271483271608812
Best parameters : 
	colsample_bytree: 0.8
	gamma: 0.3
	learning_rate: 0.1
	max_depth: 5
	min_child_weight: 5
	subsample: 1.0


In [None]:
models={'SVM Classifier': SVC()}
param_grid = {
    "C":[0.001,0.01,0.1,1,10,100,1000],
    "gamma":['auto'],
    "class_weight":['balanced']
}
SVM_model = hyperparameter_tune_and_run(df=df_optimal_SVM,num_folds=5,models=models,target_name='Survived',param_grid=param_grid,evaluation_metric="accuracy",print_details=True)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] C=0.001, class_weight=balanced, gamma=auto ......................
[CV]  C=0.001, class_weight=balanced, gamma=auto, score=0.385, total=   0.0s
[CV] C=0.001, class_weight=balanced, gamma=auto ......................
[CV]  C=0.001, class_weight=balanced, gamma=auto, score=0.382, total=   0.0s
[CV] C=0.001, class_weight=balanced, gamma=auto ......................
[CV]  C=0.001, class_weight=balanced, gamma=auto, score=0.382, total=   0.0s
[CV] C=0.001, class_weight=balanced, gamma=auto ......................
[CV]  C=0.001, class_weight=balanced, gamma=auto, score=0.382, total=   0.0s
[CV] C=0.001, class_weight=balanced, gamma=auto ......................
[CV]  C=0.001, class_weight=balanced, gamma=auto, score=0.388, total=   0.0s
[CV] C=0.01, class_weight=balanced, gamma=auto .......................
[CV]  C=0.01, class_weight=balanced, gamma=auto, score=0.385, total=   0.0s
[CV] C=0.01, class_weight=balanced, gamma=auto ......

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s remaining:    0.0s


[CV]  C=0.01, class_weight=balanced, gamma=auto, score=0.382, total=   0.0s
[CV] C=0.01, class_weight=balanced, gamma=auto .......................
[CV]  C=0.01, class_weight=balanced, gamma=auto, score=0.382, total=   0.0s
[CV] C=0.01, class_weight=balanced, gamma=auto .......................
[CV]  C=0.01, class_weight=balanced, gamma=auto, score=0.388, total=   0.0s
[CV] C=0.1, class_weight=balanced, gamma=auto ........................
[CV]  C=0.1, class_weight=balanced, gamma=auto, score=0.799, total=   0.0s
[CV] C=0.1, class_weight=balanced, gamma=auto ........................
[CV]  C=0.1, class_weight=balanced, gamma=auto, score=0.775, total=   0.0s
[CV] C=0.1, class_weight=balanced, gamma=auto ........................
[CV]  C=0.1, class_weight=balanced, gamma=auto, score=0.781, total=   0.0s
[CV] C=0.1, class_weight=balanced, gamma=auto ........................
[CV]  C=0.1, class_weight=balanced, gamma=auto, score=0.781, total=   0.0s
[CV] C=0.1, class_weight=balanced, gamma=auto 

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.2s remaining:    0.0s


[CV]  C=1, class_weight=balanced, gamma=auto, score=0.781, total=   0.0s
[CV] C=1, class_weight=balanced, gamma=auto ..........................
[CV]  C=1, class_weight=balanced, gamma=auto, score=0.798, total=   0.0s
[CV] C=10, class_weight=balanced, gamma=auto .........................
[CV]  C=10, class_weight=balanced, gamma=auto, score=0.799, total=   0.0s
[CV] C=10, class_weight=balanced, gamma=auto .........................
[CV]  C=10, class_weight=balanced, gamma=auto, score=0.770, total=   0.0s
[CV] C=10, class_weight=balanced, gamma=auto .........................
[CV]  C=10, class_weight=balanced, gamma=auto, score=0.781, total=   0.0s
[CV] C=10, class_weight=balanced, gamma=auto .........................
[CV]  C=10, class_weight=balanced, gamma=auto, score=0.781, total=   0.0s
[CV] C=10, class_weight=balanced, gamma=auto .........................
[CV]  C=10, class_weight=balanced, gamma=auto, score=0.803, total=   0.0s
[CV] C=100, class_weight=balanced, gamma=auto ............

[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:    0.8s finished


In [None]:
models={'Random Forest': RandomForestClassifier()}
param_grid = {
    "n_estimators":[120,300,500,800,1200],
    "max_depth":[5,8,15,25,30,None],
    "min_samples_split":[1,2,5,10,15,100],
    "min_samples_leaf":[1,2,5,10],
    "max_features":["log2","sqrt",None]
}
Random_Forest_model = hyperparameter_tune_and_run(df=df_optimal_RFC,num_folds=5,models=models,target_name='Survived',param_grid=param_grid,evaluation_metric="accuracy",print_details=True)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   52.4s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  1

Best score : 0.8260435628648548
Best parameters : 
	max_depth: 30
	max_features: None
	min_samples_leaf: 1
	min_samples_split: 15
	n_estimators: 300


# Run the models

In [25]:
XGB_model=XGBClassifier(max_depth=4,learning_rate=0.1,colsample_bytree=0.8,gamma=0.3,min_child_weight=5,subsample=1.0)
SVM_model=SVC(C=1000,class_weight='balanced',gamma='auto')
RFC_model=RandomForestClassifier(max_depth=30,max_features=None,min_samples_leaf=1,min_samples_split=15,n_estimators=300)

models={
    'XGB Classifier' : XGB_model,
    'SVM Classifier' : SVM_model,
    'Random Forest Classifier' : RFC_model
    }

accuracies,confusion_matrices,classification_reports=[],[],[]
for f in range(5):
  accuracy,confusion_matrix,classification_report=run(f,df_optimal_XGB,models=models,target_name='Survived',print_details=True)
  accuracies.append(accuracy)
  confusion_matrices.append(confusion_matrix)
  classification_reports.append(classification_report)

Model => XGB Classifier => Fold = 0 => Training Accuracy = 0.8300561797752809 => Validation Accuracy = 0.8268156424581006
Model => SVM Classifier => Fold = 0 => Training Accuracy = 0.8117977528089888 => Validation Accuracy = 0.5698324022346368
Model => Random Forest Classifier => Fold = 0 => Training Accuracy = 0.8735955056179775 => Validation Accuracy = 0.8491620111731844

--------------------------------------------------------------------------------------------

Model => XGB Classifier => Fold = 1 => Training Accuracy = 0.8541374474053296 => Validation Accuracy = 0.797752808988764
Model => SVM Classifier => Fold = 1 => Training Accuracy = 0.8106591865357644 => Validation Accuracy = 0.7752808988764045
Model => Random Forest Classifier => Fold = 1 => Training Accuracy = 0.8821879382889201 => Validation Accuracy = 0.8033707865168539

--------------------------------------------------------------------------------------------

Model => XGB Classifier => Fold = 2 => Training Accuracy = 

# Prediction

In [58]:
df_test = pd.read_csv(TESTING_PATH)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [59]:
df_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


#### Creating the dataframe to submit and inserting the Passenger ID column



In [60]:
df_submit=pd.DataFrame()
df_submit['PassengerId']=df_test['PassengerId']
df_submit.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [61]:
def get_preprocessed_test_data(df,df_test):

  df_test=pd.get_dummies(data=df_test,columns=['Sex','Embarked'])

  optimal_data_cols = df.columns
  optimal_data_cols = list(optimal_data_cols[:-2])
  df_test=df_test.loc[:,optimal_data_cols]

  null_columns = [k for k,v in dict(df_test.isna().sum()).items() if v!=0]

  for null_column in null_columns:
    df_test[null_column]=imputer.transform(df_test[null_column].values.reshape(-1,1))

  df_test_2=pd.DataFrame(scaler.fit_transform(df_test),index=df_test.index,columns=df_test.columns)
  df_test=df_test_2

  return df_test

In [62]:
df_test = get_preprocessed_test_data(df,df_test)
df_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.452723,0.0,0.0,0.015282,0.0,1.0,0.0,1.0,0.0
1,1.0,0.617566,0.125,0.0,0.013663,1.0,0.0,0.0,0.0,1.0
2,0.5,0.815377,0.0,0.0,0.018909,0.0,1.0,0.0,1.0,0.0
3,1.0,0.353818,0.0,0.0,0.016908,0.0,1.0,0.0,0.0,1.0
4,1.0,0.287881,0.125,0.111111,0.023984,1.0,0.0,0.0,0.0,1.0


In [63]:
df_test.isna().sum()

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [37]:
x_train=df.drop(['Survived','kfolds'],axis=1)
y_train=df['Survived']

In [None]:
models={'Random Forest': RandomForestClassifier()}
param_grid = {
    "n_estimators":[120,300,500,800,1200],
    "max_depth":[5,8,15,25,30,None],
    "min_samples_split":[1,2,5,10,15,100],
    "min_samples_leaf":[1,2,5,10],
    "max_features":["log2","sqrt",None]
}
Random_Forest_model = hyperparameter_tune_and_run(df=df,num_folds=5,models=models,target_name='Survived',param_grid=param_grid,evaluation_metric="accuracy",print_details=True)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   44.4s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  1

Best score : 0.8372481325717155
Best parameters : 
	max_depth: None
	max_features: None
	min_samples_leaf: 2
	min_samples_split: 10
	n_estimators: 120


In [38]:
x_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.5,0.409399,0.0,0.0,0.023959,0.0,1.0,0.0,0.0,1.0
1,1.0,0.367921,0.0,0.0,0.015127,0.0,1.0,0.0,1.0,0.0
2,1.0,0.334004,0.0,0.0,0.015412,0.0,1.0,0.0,0.0,1.0
3,0.0,0.723549,0.0,0.166667,0.299539,1.0,0.0,0.0,0.0,1.0
4,0.5,0.421965,0.0,0.0,0.025374,0.0,1.0,0.0,0.0,1.0


In [39]:
len(x_train)

891

In [None]:
final_model = RandomForestClassifier(max_depth=None,max_features=None,min_samples_leaf=2,min_samples_split=10,n_estimators=100)
final_model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
df_submit['Survived']=final_model.predict(df_test)

In [None]:
df_submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [40]:
len(df_submit)

418

In [None]:
df_submit.to_csv('RFC_Submission_Allfeats_hyperparams_tuned_100.csv',index=False)

In [25]:
df_1 = pd.read_csv('/content/drive/My Drive/Titanic/Submissions/RFC_Submission_2.csv')
df_2 = pd.read_csv('/content/drive/My Drive/Titanic/Submissions/RFC_Submission_Allfeats_hyperparams_tuned.csv')
df_3 = pd.read_csv('/content/drive/My Drive/Titanic/Submissions/XGB_Submission.csv')

In [26]:
df_mode = pd.DataFrame()
df_mode['Survived_1'],df_mode['Survived_2'],df_mode['Survived_3'] = df_1['Survived'],df_2['Survived'],df_3['Survived']
df_mode.head()

Unnamed: 0,Survived_1,Survived_2,Survived_3
0,0,0,0
1,1,0,1
2,0,0,0
3,0,0,0
4,1,1,1


In [27]:
df_mode.mode(axis=1)

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


In [46]:
df_submit['Survived'] = df_mode.mode(axis=1)[0]

In [47]:
df_submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [49]:
df_submit.to_csv('Bagged_model.csv',index=False)