In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Titanic - Machine Learning from Disaster
https://www.kaggle.com/c/titanic/overview

## 1. Problem definition
The sinking of the Titanic is one of the most infamous shipwrecks in history.

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

The competition is simple: use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.


## 2. Data
The data has been split into two groups:

1. training set (train.csv)
2. test set (test.csv)

* **survival** : Survival - 0 = No, 1 = Yes
* **pclass**: Ticket class - 1 = 1st, 2 = 2nd, 3 = 3rd
* **sex**: Sex
* **Age**: Age in years
* **sibsp**: # of siblings / spouses aboard the Titanic
* **parch**: # of parents / children aboard the Titanic
* **ticket**: Ticket number
* **fare**: Passenger fare
* **cabin**: Cabin number
* **embarked**: Port of Embarkation - C = Cherbourg, Q = Queenstown, S = Southampton

### Variable Notes
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

sibsp: The dataset defines family relations in this way...  
Sibling = brother, sister, stepbrother, stepsister  
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...  
Parent = mother, father  
Child = daughter, son, stepdaughter, stepson  
Some children travelled only with a nanny, therefore parch=0 for them.


## 3. Evaluation
Your score is the percentage of passengers you correctly predict. This is known as accuracy.

## 4. Submission
Submission File Format  
You should submit a csv file with exactly 418 entries plus a header row. Your submission will show an error if you have extra columns (beyond PassengerId and Survived) or rows.

The file should have exactly 2 columns:

* PassengerId (sorted in any order)
* Survived (contains your binary predictions: 1 for survived, 0 for deceased)  
PassengerId,Survived  
892,0  
893,1  
894,0  
Etc.

## 5. Feature importance
Determine which features are important and repeat the process

In [380]:
# basic
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
import missingno as msno

%matplotlib inline

# modelling
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# xgboost
from xgboost import XGBClassifier

#catboost
from catboost import CatBoostClassifier

# lightgbm
import lightgbm as lgb

# hyperparameter optimization
import optuna

# metrics
from sklearn.metrics import log_loss
from sklearn.metrics import plot_roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [3]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Load Data

In [4]:
df = pd.read_csv('/kaggle/input/titanic/train.csv')

In [5]:
df.head()

In [6]:
df.dtypes

In [7]:
print(f'# recorded entries = {len(df)}')

# EDA

In [8]:
dfe = df.copy()

In [9]:
msno.matrix(dfe)
plt.show()

In [10]:
dfe.isna().sum()

**Since almost 80% of the recorded entries are missing the Cabin number, we are going to drop this column: there is no sensible way to fill it in.**

In [11]:
dfe.drop(columns=['Cabin'], inplace=True)

In [12]:
dfe['Ticket'].unique()[:50]

In [13]:
dfe['Name'].unique()[:20]

**Drop Ticket and Name columns, they are not informative enough**

In [14]:
dfe.drop(columns=['Ticket', 'Name'], inplace=True)

**The PassengerId column does not provide any useful information - it is just a sequential row identifier that mirrors the integer index => drop it**

In [15]:
dfe.drop(columns=['PassengerId'], inplace=True)

## One Variable

In [16]:
dfe.describe()

In [17]:
dfe.head(1)

In [18]:
dfe['Age'].hist()

In [19]:
age_bins = [0,15,25,35,45,80]

In [20]:
fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(nrows=2, ncols=2 ,figsize=(22,10))

dfe['Survived'].value_counts().sort_index().plot(kind='bar', ax = ax0)
ax0.set_title('Survived')
ax0.set_ylabel('Frequency')
ax0.set_xticklabels(['Dead (0)', 'Alive (1)'], rotation = 0)

dfe['Pclass'].value_counts().sort_index().plot(kind='bar', ax = ax1)
ax1.set_title('Class')
ax1.set_ylabel('Frequency')
ax1.set_xticklabels(['Upper (1)', 'Middle (2)', 'Lower(3)'], rotation = 0)

dfe['Sex'].value_counts().sort_index().plot(kind='bar', ax = ax2)
ax2.set_title('Sex')
ax2.set_ylabel('Frequency')
ax2.set_xticklabels(['F', 'M'], rotation = 0)

dfe['Age'].value_counts(bins=age_bins).sort_index().plot(kind='bar', ax = ax3)
ax3.set_title('Age group')
ax3.set_ylabel('Frequency')

plt.xticks(rotation='horizontal')
plt.show()

In [21]:
# Distribution of the 'Fare' column over all rows of dfe.
fig, ax = plt.subplots(figsize=(8, 4))

# Draw on the axes created above explicitly rather than relying on pyplot's
# implicit "current axes".
dfe['Fare'].hist(bins=20, ax=ax)
ax.set_title('Fare Histogram')
ax.set_ylabel('Frequency')
ax.set_xlabel('Cost');

In [22]:
fig, ax = plt.subplots(figsize=(8,4))

dfe['Embarked'].value_counts().sort_index().plot(kind='bar', ax = ax)
ax.set_title('Embark Location')
ax.set_ylabel('Frequency')
ax.set_xticklabels(['Cherbourg (C)', 'Queenstown (Q)', 'Southampton (S)'], rotation = 0);

In [23]:
fig, (ax0, ax1) = plt.subplots(ncols=2, nrows=1, figsize=(18,4))

dfe['SibSp'].value_counts().sort_index().plot(kind='bar', ax = ax0)
ax0.set_title('Siblings/Spouse')
ax0.set_ylabel('Frequency')
#ax.set_xticklabels(['Cherbourg (C)', 'Queenstown (Q)', 'Southampton (S)'], rotation = 0);

dfe['Parch'].value_counts().sort_index().plot(kind='bar', ax = ax1)
ax1.set_title('Parents/Child')
ax1.set_ylabel('Frequency');
#ax.set_xticklabels(['Cherbourg (C)', 'Queenstown (Q)', 'Southampton (S)'], rotation = 0);

## Two Variables

In [24]:
dfe.head(1)

In [25]:
dfe['Age_binned'] = pd.cut(dfe['Age'], bins=age_bins)

In [26]:
fig, ((ax0, ax1), (ax2, ax3), (ax4, ax5)) = plt.subplots(ncols=2, nrows=3, figsize=(16, 20))

pd.crosstab(dfe['Survived'], dfe['Pclass']).T.plot(kind='bar', ax=ax0);
ax0.set_xticklabels(['Upper (1)', 'Middle (2)', 'Lower(3)'], rotation = 0);

pd.crosstab(dfe['Survived'], dfe['Sex']).T.plot(kind='bar', ax=ax2);
ax2.set_xticklabels(['F', 'M'], rotation = 0);

pd.crosstab(dfe['Survived'], dfe['Age_binned']).T.plot(kind='bar', ax=ax4);
ax4.set_xlabel('Age bin')
ax4.set_xticklabels(np.arange(len(age_bins)-1), rotation = 0);

pd.crosstab(dfe['Survived'], dfe['Embarked']).T.plot(kind='bar', ax=ax1);
ax1.set_title('Embark Location')
ax1.set_ylabel('Frequency')
ax1.set_xticklabels(['Cherbourg (C)', 'Queenstown (Q)', 'Southampton (S)'], rotation = 0);

pd.crosstab(dfe['Survived'], dfe['SibSp']).T.plot(kind='bar', ax=ax3);
ax3.set_title('Siblings/Spouse')
ax3.set_ylabel('Frequency')
#ax3.set_xticklabels(['Cherbourg (C)', 'Queenstown (Q)', 'Southampton (S)'], rotation = 0);

pd.crosstab(dfe['Survived'], dfe['Parch']).T.plot(kind='bar', ax=ax5);
ax5.set_title('Parents/Children')
ax5.set_ylabel('Frequency');
#ax5.set_xticklabels(['Cherbourg (C)', 'Queenstown (Q)', 'Southampton (S)'], rotation = 0);

# Data Augmenting

In [27]:
dfe_aug = dfe.copy()
dfe_title = df.copy()

## Titles 

In [28]:
def replace_titles(x):
    """Collapse rare honorifics into the common titles 'Mr', 'Mrs' and 'Miss'.

    Parameters
    ----------
    x : mapping (e.g. one DataFrame row)
        Must provide ``x['Title']``. ``x['Sex']`` is read only when the title
        is ``'Dr'``.

    Returns
    -------
    The replacement title for the honorifics listed below; any other title is
    returned unchanged.
    """
    title = x['Title']
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir'):
        return 'Mr'
    if title in ('Countess', 'Mme', 'the Countess'):
        return 'Mrs'
    if title in ('Mlle', 'Ms', 'Lady'):
        return 'Miss'
    if title == 'Dr':
        # Compare case-insensitively: the previous check against the literal
        # 'Male' never matches a lower-case 'male', which sent every doctor
        # to 'Mrs'. (The CSV presumably stores 'male'/'female' - confirm with
        # df['Sex'].unique().)
        return 'Mr' if str(x['Sex']).strip().lower() == 'male' else 'Mrs'
    return title

In [29]:
dfe_title['Title'] = dfe_title['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

In [30]:
dfe_title['Title'] = dfe_title.apply(replace_titles, axis=1)

In [31]:
dfe_title['Title'] .value_counts()

## Make Parch and Sibsp binary

In [32]:
# Binary flags: 1 when the passenger has at least one relative of that kind
# aboard (count > 0), otherwise 0. A vectorised comparison replaces the
# row-wise apply and yields the same 0/1 integer columns.
dfe_aug['Parch_bin'] = (dfe_aug['Parch'] > 0).astype(int)
dfe_aug['SibSp_bin'] = (dfe_aug['SibSp'] > 0).astype(int)

**We can replace the existing SibSp and Parch with their binary counterparts since they are more insightful (more info below)**

## Add Cabin type

The first letter in Cabin shows the deck - 'first class had the top decks (A-E), second class (D-F), and third class (E-G)'

In [33]:
# Keep only the first character of the cabin code (the deck letter).
# Missing cabins are filled with the placeholder 'U' first, so they end up as 'U'.
dfe_aug['Cabin'] = df['Cabin'].fillna('U').str[0]
dfe_aug.head()

## Add family size

In [34]:
# Family size includes the passenger themself (hence the +1), the same
# definition NumericalTransformer uses in the pre-processing section.
dfe_aug['FamilySize'] = dfe_aug['SibSp'] + dfe_aug['Parch'] + 1

# A passenger is alone only when nobody else from the family is aboard.
# Without the +1 the previous "> 1" test also flagged passengers travelling
# with exactly one relative as alone.
dfe_aug['IsAlone'] = (dfe_aug['FamilySize'] == 1).astype(int)

## Correlation

In [35]:
dfe_aug.corr()['Survived'][1:].plot.barh(figsize=(10,4))
print(dfe_aug.corr()['Survived'][1:])

In [36]:
# Pairwise correlation matrix of the augmented frame.
corr = dfe_aug.corr()

# Mask the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(12, 10))
# The unused diverging palette was removed; the heatmap has always been drawn
# with the 'coolwarm' colormap.
sns.heatmap(corr, annot=True, mask=mask, cmap='coolwarm', ax=ax);

### Correlation of Binned Variables

In [37]:
def corr_bin(val, feat):
    """Print the correlation of `feat` with 'Survived' after quantile binning.

    Works on a copy of the global `dfe_aug`, so that frame is left untouched.

    val  : forwarded to pd.qcut as the quantile specification
    feat : name of the `dfe_aug` column to discretise
    """
    binned = dfe_aug.copy()
    # labels=False -> integer bin codes instead of Interval labels
    binned[feat] = pd.qcut(binned[feat], val, labels=False)
    survived_corr = binned.corr()["Survived"]
    print(f'"{feat}" correlation with {val} bins: {survived_corr[feat]}')

In [38]:
def corr_bin2(val, feat):
    """Print the correlation of `feat` with 'Survived' after binning with pd.cut.

    Works on a copy of the global `dfe_aug`, so that frame is left untouched.

    val  : forwarded to pd.cut as the bin specification
    feat : name of the `dfe_aug` column to discretise
    """
    binned = dfe_aug.copy()
    # labels=False -> integer bin codes instead of Interval labels
    binned[feat] = pd.cut(binned[feat], val, labels=False)
    survived_corr = binned.corr()["Survived"]
    print(f'"{feat}" correlation with {val} bins: {survived_corr[feat]}')

In [39]:
for i in range(3,11):
    corr_bin(i, "Fare")   

In [40]:
for i in range(3,11):
    corr_bin2(i, "Age")   

#### Conclusion: Binary SibSp and Parch are more valuable than their original numerical versions

# Pre-process Data

In [41]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [42]:
def replace_titles(x):
    """Collapse rare honorifics into the common titles 'Mr', 'Mrs' and 'Miss'.

    This is the definition OHTransformer applies row by row inside the
    pre-processing pipeline.

    Parameters
    ----------
    x : mapping (e.g. one DataFrame row)
        Must provide ``x['Title']``. ``x['Sex']`` is read only when the title
        is ``'Dr'``.

    Returns
    -------
    The replacement title for the honorifics listed below; any other title is
    returned unchanged.
    """
    title = x['Title']
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir'):
        return 'Mr'
    if title in ('Countess', 'Mme', 'the Countess'):
        return 'Mrs'
    if title in ('Mlle', 'Ms', 'Lady'):
        return 'Miss'
    if title == 'Dr':
        # Compare case-insensitively: the previous check against the literal
        # 'Male' never matches a lower-case 'male', which sent every doctor
        # to 'Mrs'. (The CSV presumably stores 'male'/'female' - confirm with
        # df['Sex'].unique().)
        return 'Mr' if str(x['Sex']).strip().lower() == 'male' else 'Mrs'
    return title

In [43]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [44]:
# 80/20 hold-out split. The fixed seed makes the split - and every score later
# reported on X_val - reproducible across kernel restarts; stratify=y keeps the
# class ratio of 'Survived' the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [312]:
# Column groups routed to the three branches of the ColumnTransformer (pipeline_pp).

# Numeric branch (NumericalTransformer). The order matters: transform() relabels
# the imputed array with exactly this list. 'Pclass', 'SibSp' and 'Parch' are only
# used to derive AgeClass / FamilySize / IsAlone and are dropped afterwards.
num_cols     = ['Age', 'Fare', 'Pclass','SibSp', 'Parch']
# Ordinal branch (OrdinalEncoder). 'Pclass' is listed here as well, so it still
# reaches the model even though the numeric branch drops it.
cat_ord_cols = ['Pclass', 'Sex']
# One-hot branch (OHTransformer -> OneHotEncoder -> PCA). 'Name' is turned into
# 'Title'; 'Sex' is only read by replace_titles for the 'Dr' rule and then dropped.
cat_ohe_cols = ['Embarked', 'Cabin', 'Name', 'Sex'] 

In [313]:
from sklearn.base import BaseEstimator, TransformerMixin


class NumericalTransformer(BaseEstimator, TransformerMixin):
    """KNN-impute the numeric columns and derive age-class / family features.

    Required input columns: 'Age', 'Pclass', 'SibSp', 'Parch'. Any further
    numeric column (e.g. 'Fare') is imputed and passed through.

    Output: a DataFrame holding the imputed input columns without 'SibSp',
    'Parch' and 'Pclass', followed by 'AgeClass', 'FamilySize' and 'IsAlone'.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Fit a 5-neighbour KNN imputer on X and remember X's column labels."""
        # Remember the labels seen at fit time so transform() no longer depends
        # on the global `num_cols` matching the incoming frame. The global is
        # only a fallback for inputs without column labels (plain arrays).
        fitted_columns = getattr(X, 'columns', None)
        self.columns_ = list(fitted_columns) if fitted_columns is not None else list(num_cols)

        knn_imp = KNNImputer(n_neighbors=5)
        knn_imp.fit(X)
        self.knn_imp = knn_imp
        return self

    def transform(self, X, y=None):
        """Return the imputed frame with the engineered features added."""
        Xmod = pd.DataFrame(data=self.knn_imp.transform(X), columns=self.columns_)

        # Interaction between age and ticket class.
        Xmod['AgeClass'] = Xmod['Age'] * Xmod['Pclass']

        # Family size counts the passenger themself, hence the +1.
        Xmod['FamilySize'] = Xmod['SibSp'] + Xmod['Parch'] + 1
        # 1 unless the family size exceeds one.
        Xmod['IsAlone'] = (~(Xmod['FamilySize'] > 1)).astype(int)

        # The former 'AgeBin' column was computed and dropped again in the same
        # call, so it never reached the output; it is simply no longer created.
        return Xmod.drop(columns=['SibSp', 'Parch', 'Pclass'])

class OHTransformer(BaseEstimator, TransformerMixin):
    """Prepare the categorical columns for one-hot encoding.

    Required input columns: 'Name', 'Sex', 'Cabin', 'Embarked'.

    Output: a copy of the input in which
      * 'Title' is extracted from 'Name' and normalised with replace_titles,
      * 'Cabin' is reduced to its first character ('U' where it was missing),
      * missing 'Embarked' values are filled with 'S',
      * 'Name' and 'Sex' are removed.

    The transformer is stateless: fit() learns nothing.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op; present only to satisfy the estimator interface."""
        return self

    def transform(self, X, y=None):
        """Return the cleaned categorical frame described in the class docstring."""
        Xmod = X.copy()

        # Text between ", " and the next "." - assumes names are formatted like
        # "Surname, Title. Given names".
        Xmod['Title'] = Xmod['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
        Xmod['Title'] = Xmod.apply(replace_titles, axis=1)

        # Assign the filled columns back instead of calling
        # Series.fillna(inplace=True) on a column selection: that chained
        # in-place form is deprecated and has no effect under pandas
        # Copy-on-Write.
        Xmod['Cabin'] = Xmod['Cabin'].fillna('U').str[0]
        Xmod['Embarked'] = Xmod['Embarked'].fillna('S')

        return Xmod.drop(columns=['Name', 'Sex'])

In [314]:
# Numeric branch: KNN imputation + engineered features (see NumericalTransformer).
num_pipeline = Pipeline([
    ('numtr', NumericalTransformer())
])

# Ordinal branch: integer-encode the categories as they are.
cat_pipeline_ordinal = Pipeline([
    ('ord', OrdinalEncoder()) 
])


# One-hot branch: clean the columns, one-hot encode them (categories unseen at
# fit time are ignored), then compress the dense indicator matrix with PCA.
# The number of components is tuned later via 'pp__cat_ohe__pca__n_components'.
cat_pipeline_ohe = Pipeline([
    ('ohtr', OHTransformer()),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ('pca', PCA())
])


# Full pre-processor: each branch receives only the columns listed for it.
# NOTE(review): this single pipeline_pp object is passed as the 'pp' step of
# every model pipeline defined below, so set_params()/fit() called through one
# of them acts on the same instance - consider sklearn.base.clone(pipeline_pp)
# per model if independent pre-processors are wanted.
pipeline_pp = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat_ord', cat_pipeline_ordinal, cat_ord_cols),
    ('cat_ohe', cat_pipeline_ohe, cat_ohe_cols)
])

In [315]:
Xtransf = pipeline_pp.fit_transform(X)

In [None]:
all_columns = ['Age', 'Fare', 'AgeClass', 'Pclass', 'Sex', 'FamilySize', 'IsAlone', 'Title1', 'Title2', 'Title3', 'Title4',
              'Cabin1', 'Cabin2', 'Cabin3', 'Cabin4', 'Cabin5', 'Cabin6', 'Cabin7', 'Cabin8', 'Cabin9', 'Emb1', 'Emb2', 'Emb3']

# Model

## SVM

In [358]:
svm_pl = Pipeline([
    ('pp', pipeline_pp),
    ('clf', SVC())
])

In [359]:
svm_params = {'pp__cat_ohe__pca__n_components': list(range(1, 8)),
                'clf__C': [0.1, 0.2, 0.5, 1, 2, 5, 10, 20], 
              'clf__kernel': ['rbf', 'poly', 'sigmoid']}

In [360]:
search = RandomizedSearchCV(svm_pl,
                       param_distributions=svm_params,
                       cv = 5,
                       n_iter = 100,
                       verbose=1,
                       n_jobs=-1)
search.fit(X, y);

# cross_val_score(svm_pl, X, y, cv=5, scoring='accuracy').mean()

In [361]:
search.best_score_

In [363]:
svm_best = search.best_estimator_

## RF - Tuning

In [316]:
rf_pl = Pipeline([
    ('pp', pipeline_pp),
    ('clf', RandomForestClassifier(n_jobs=-1))
])

In [317]:
#bin_fare=True, bin_age=True, nbins_fare=5, nbins_age=4,

param_dist = {'pp__cat_ohe__pca__n_components': list(range(1, 7)),
             'clf__max_features': ['log2'],
             'clf__n_estimators':[55, 60],
             'clf__max_depth': list(range(9, 12)),
             'clf__min_samples_leaf': list(range(2,10)),
             'clf__min_samples_split': list(range(17, 30))}

In [318]:
search = RandomizedSearchCV(rf_pl,
                       param_distributions=param_dist,
                       cv = 5,
                       n_iter = 300,
                       verbose=1,
                       n_jobs=-1)
search.fit(X, y);

In [319]:
search.best_score_

In [320]:
search.best_params_

In [321]:
# Good one= 
# Best Score = 0.8327851358985626
# 0.8609550561797753
# 0.8659217877094972
good3 = {'pp__cat_ohe__pca__n_components': 5,
 'clf__n_estimators': 60,
 'clf__min_samples_split': 18,
 'clf__min_samples_leaf': 4,
 'clf__max_features': 'log2',
 'clf__max_depth': 10}

good_mine = {'clf__max_depth': 10,
 'clf__max_features': 'log2',
 'clf__min_samples_leaf': 7,
 'clf__min_samples_split': 30,
 'clf__n_estimators': 50,
 'pp__cat_ohe__pca__n_components': 5}


rf_pl.set_params(**good_mine)
rf_best = rf_pl

In [322]:
rf_best = search.best_estimator_

In [323]:
cross_val_score(rf_best, X, y, cv=5, scoring='accuracy').mean()

In [328]:
rf_best.fit(X_train, y_train)
print(rf_best.score(X_train, y_train))
print(rf_best.score(X_val, y_val))

In [325]:
# Labels for the feature-importance plot. They must follow the column order
# produced by pipeline_pp, whose branches are concatenated in this order:
#   num     -> NumericalTransformer output: Age, Fare, AgeClass, FamilySize, IsAlone
#   cat_ord -> OrdinalEncoder on cat_ord_cols: Pclass, Sex
#   cat_ohe -> PCA components of the one-hot block (there is no single 'Title' column)
# The previous list had the right length but the wrong order, so importances
# were attributed to the wrong features.
n_pca_components = search.best_params_['pp__cat_ohe__pca__n_components']
cols_name = (
    ['Age', 'Fare', 'AgeClass', 'FamilySize', 'IsAlone']
    + ['Pclass', 'Sex']
    + [f'OH{i}' for i in range(n_pca_components)]
)
cols_name

In [326]:
plt.barh(cols_name, rf_best[1].feature_importances_)

### RF Optuna

In [68]:
rf_pl = Pipeline([
    ('pp', pipeline_pp),
    ('clf', RandomForestClassifier(n_jobs=-1))
])

In [69]:
def objective(trial, X, y):
    """Optuna objective for the random-forest pipeline `rf_pl`.

    Samples one hyper-parameter configuration from `trial`, applies it to the
    global `rf_pl` (in place, via set_params) and returns the mean accuracy of
    a 2-fold cross-validation on (X, y).
    """
    # Parameters are sampled in a fixed order, one suggest call per line.
    n_components = trial.suggest_int('pp__cat_ohe__pca__n_components', 3, 8)
    n_estimators = trial.suggest_int("clf__n_estimators", 50, 70)
    max_depth = trial.suggest_int("clf__max_depth", 8, 13)
    min_samples_leaf = trial.suggest_int("clf__min_samples_leaf", 2, 12)
    min_samples_split = trial.suggest_int("clf__min_samples_split", 12, 40)
    max_features = trial.suggest_categorical("clf__max_features", ['log2', 'sqrt'])

    candidate = rf_pl.set_params(
        pp__cat_ohe__pca__n_components=n_components,
        clf__n_estimators=n_estimators,
        clf__max_depth=max_depth,
        clf__min_samples_leaf=min_samples_leaf,
        clf__min_samples_split=min_samples_split,
        clf__max_features=max_features,
    )

    fold_scores = cross_val_score(candidate, X, y, cv=2, scoring='accuracy')
    return np.mean(fold_scores)

In [70]:
study = optuna.create_study(direction="maximize", study_name="RF")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=150)

In [76]:
study.best_params

In [77]:
study.best_value

In [78]:
rf_pl.set_params(**study.best_params)
rf_optuna = rf_pl

In [79]:
cross_val_score(rf_optuna, X, y, cv=5, scoring='accuracy').mean()

In [83]:
rf_optuna.fit(X_train, y_train)
print(rf_optuna.score(X_train, y_train))
print(rf_optuna.score(X_val, y_val))

## LGBM

In [364]:
lgb_pl = Pipeline([
    ('pp', pipeline_pp),
    ('clf', lgb.LGBMClassifier(verbose=-100, n_jobs=-1))
])

In [365]:
param_dist= {'pp__cat_ohe__pca__n_components':list(range(1, 10)),
            'clf__colsample_bytree':[0.6, 0.7, 0.8, 0.9, 1],
            'clf__subsample': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4],
            'clf__n_estimators':[70, 75, 80, 85, 90, 100, 120],
            'clf__learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
            'clf__max_depth':  list(range(2,10)),
            'clf__min_child_samples':list(range(20, 70)),
            'clf__num_leaves': list(range(20, 50)),
            'clf__reg_alpha': [0.01, 0.1, 1, 2, 5, 10],
            'clf__reg_lambda': [0.01, 0.1, 1, 2, 5],}

In [366]:
search = RandomizedSearchCV(lgb_pl,
                            param_distributions=param_dist,
                            cv = 5,
                            n_iter = 300,
                            #return_train_score = True,
                            verbose=1,
n_jobs=-1)
search.fit(X, y);

In [367]:
search.best_params_

In [368]:
search.best_score_

In [369]:
lgb_best = search.best_estimator_

In [371]:
lgb_best.fit(X_train, y_train)
print(lgb_best.score(X_train, y_train))
print(lgb_best.score(X_val, y_val))

## CatBoost

In [None]:
cb_pl = Pipeline([
    ('pp', pipeline_pp),
    ('clf', CatBoostClassifier())
])

In [None]:
param_dist = {'pp__cat_ohe__pca__n_components': list(range(1, 10)),
             'clf__iterations':[50, 60],
             'clf__depth': list(range(1, 7)),
             'clf__l2_leaf_reg': [ 20, 50, 100, 125, 250, 500],
             'clf__learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8],
             'clf__verbose':[0]}

In [None]:
search = RandomizedSearchCV(cb_pl,
                      param_distributions=param_dist,
                       cv = 5,
                        n_iter = 100,
                       #return_train_score = True,
                       verbose=1,
                       n_jobs=-1)
search.fit(X, y);

In [None]:
search.best_score_

In [None]:
search.best_params_

In [None]:
cb_best = search.best_estimator_

In [None]:
cb_best.fit(X_train, y_train)
print(cb_best.score(X_train, y_train))
print(cb_best.score(X_val, y_val))

### CatBoost Optuna

In [None]:
cb_pl = Pipeline([
    ('pp', pipeline_pp),
    ('clf', CatBoostClassifier())
])

In [None]:
def objective(trial, X, y):
    """Optuna objective for the CatBoost pipeline `cb_pl`.

    Samples one hyper-parameter configuration from `trial`, applies it to the
    global `cb_pl` (in place, via set_params) and returns the mean accuracy of
    a 5-fold cross-validation on (X, y).
    """
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        'pp__cat_ohe__pca__n_components': trial.suggest_int('pp__cat_ohe__pca__n_components', 1, 15),
        # Register the trial parameter under the same key it is applied with.
        # It used to be recorded as "clf__n_estimators", so study.best_params
        # carried a different key than the one set here. Feeding it back into a
        # pipeline that already has `iterations` set would then specify the
        # tree count twice (CatBoost treats the two names as aliases -
        # presumably rejecting the combination; confirm against the docs).
        "clf__iterations": trial.suggest_int("clf__iterations", 30, 150),
        "clf__depth": trial.suggest_int("clf__depth", 2, 10),
        'clf__learning_rate': trial.suggest_loguniform('clf__learning_rate', 0.01, 0.3),
        'clf__random_strength': trial.suggest_int('clf__random_strength', 0, 100),
        'clf__bagging_temperature': trial.suggest_loguniform('clf__bagging_temperature', 0.01, 100.00),
        # Single-choice categorical: keeps verbose=0 in study.best_params.
        'clf__verbose': trial.suggest_categorical('clf__verbose', [0])
    }

    cv_scores = cross_val_score(cb_pl.set_params(**param_grid), X, y, cv=5, scoring='accuracy')

    return np.mean(cv_scores)

In [None]:
# Optuna study that maximises the mean CV accuracy returned by objective().
# The study is labelled after the model actually being tuned in this section
# (the name was a copy-paste leftover from the random-forest study).
study = optuna.create_study(direction="maximize", study_name="CatBoost")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=50)

In [None]:
study.best_params

In [None]:
cb_optuna = Pipeline([
    ('pp', pipeline_pp),
    ('clf', CatBoostClassifier())
])
best_params = {'pp__cat_ohe__pca__n_components': 7,
 'clf__n_estimators': 100,
 'clf__depth': 7,
 'clf__learning_rate': 0.18811123393058113,
 'clf__random_strength': 31,
 'clf__bagging_temperature': 9.64844614645104,
 'clf__verbose': 0}
cb_optuna.set_params(**best_params);

In [None]:
cross_val_score(cb_optuna, X, y, cv=5, scoring='accuracy').mean()

In [None]:
cb_optuna.fit(X_train, y_train)
print(cb_optuna.score(X_train, y_train))
print(cb_optuna.score(X_val, y_val))

## XGBoost

In [None]:
xgb_pl = Pipeline([
    ('pp', pipeline_pp),
    ('clf', XGBClassifier(n_jobs=-1, use_label_encoder=False, eval_metric='error') )
])

In [None]:
param_dist = {'pp__cat_ohe__pca__n_components': list(range(1, 10)),
             'clf__n_estimators':[40, 60, 80, 100, 120, 200, 300],
             'clf__max_depth': list(range(1, 12)),
             'clf__subsample': [ 0.2, 0.4, 0.6, 0.8, 1],
             'clf__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8],
             'clf__colsample_bytree':[0.2, 0.4, 0.6, 0.8, 1],
             'clf__gamma':[0.1, 1, 2, 5, 10, 20, 50, 100]}

In [None]:
def objective(trial, X, y):
    """Optuna objective for the XGBoost pipeline `xgb_pl`.

    Samples one hyper-parameter configuration from `trial`, applies it to the
    global `xgb_pl` (in place, via set_params) and returns the mean accuracy of
    a 5-fold cross-validation on (X, y).
    """
    # Parameters are sampled in a fixed order, one suggest call per line.
    n_components = trial.suggest_int('pp__cat_ohe__pca__n_components', 1, 10)
    n_estimators = trial.suggest_int('clf__n_estimators', 40, 65)
    max_depth = trial.suggest_int('clf__max_depth', 1, 8)
    # NOTE(review): the upper bound of 5 looks unusually large for a boosting
    # learning rate - confirm it is intended.
    learning_rate = trial.suggest_loguniform('clf__learning_rate', 0.01, 5)
    colsample_bytree = trial.suggest_loguniform('clf__colsample_bytree', 0.1, 0.8)
    subsample = trial.suggest_loguniform('clf__subsample', 0.2, 1)
    gamma = trial.suggest_loguniform('clf__gamma', 0.01, 100)

    candidate = xgb_pl.set_params(
        pp__cat_ohe__pca__n_components=n_components,
        clf__n_estimators=n_estimators,
        clf__max_depth=max_depth,
        clf__learning_rate=learning_rate,
        clf__colsample_bytree=colsample_bytree,
        clf__subsample=subsample,
        clf__gamma=gamma,
    )

    fold_scores = cross_val_score(candidate, X, y, cv=5, scoring='accuracy')
    return np.mean(fold_scores)

In [None]:
study = optuna.create_study(direction="maximize", study_name="XGB")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=100)

In [None]:
xgb_best = Pipeline([
    ('pp', pipeline_pp),
    ('clf', XGBClassifier(n_jobs=-1, use_label_encoder=False, eval_metric='error') )
])
best_params = study.best_params
xgb_best.set_params(**best_params);

In [None]:
best_params

In [None]:
cross_val_score(xgb_best, X, y, cv=5, scoring='accuracy').mean()

In [None]:
xgb_best.fit(X_train, y_train)
print(xgb_best.score(X_train, y_train))
print(xgb_best.score(X_val, y_val))

## KNN

In [372]:
knn_pl = Pipeline([
    ('pp', pipeline_pp),
    ('clf', KNeighborsClassifier() )]
)

In [373]:
param_dist = {'pp__cat_ohe__pca__n_components': list(range(1, 10)),
             'clf__n_neighbors':list(range(2, 15))}

In [374]:
search = RandomizedSearchCV(knn_pl,
                      param_distributions=param_dist,
                       cv = 5,
                        n_iter = 100,
                       #return_train_score = True,
                       verbose=1,
                       n_jobs=-1)
search.fit(X, y);

In [375]:
search.best_score_

In [376]:
search.best_params_

In [377]:
knn_best = search.best_estimator_

In [378]:
knn_best.fit(X_train, y_train)
print(knn_best.score(X_train, y_train))
print(knn_best.score(X_val, y_val))

## Stacking

In [384]:
# Stacked ensemble: the four tuned pipelines act as base learners and an
# L2-regularised logistic regression (liblinear solver, C=0.2) combines their outputs.
stacking_clf = StackingClassifier(
    estimators=[
        ('svm', svm_best),
        ('rf', rf_best),
        ('lgb', lgb_best),
        ('knn', knn_best),
    ],
    final_estimator=LogisticRegression(penalty="l2", solver="liblinear", C=0.2),
    n_jobs=-1,
)

In [385]:
cross_val_score(stacking_clf, X, y, cv=5, scoring='accuracy').mean()

# Test Data

In [183]:
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [391]:
stacking_clf.fit(X, y);
stacking_clf.score(X,y)

In [392]:
y_pred = stacking_clf.predict(test)

In [393]:
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':y_pred})

In [395]:
submission.to_csv('submission_stack.csv',index=False)