# References

* https://habr.com/ru/post/274171/
* https://neurohive.io/ru/osnovy-data-science/razbor-resheniya-zadachi-titanik-na-kaggle-dlya-nachinajushhih/

# Packages

In [53]:
import numpy as np # linear algebra
import pandas as pd # data processing

import matplotlib.pyplot as plt # graphs
import seaborn as sns # graphs

from sklearn.model_selection import GridSearchCV # CV
from sklearn.linear_model import LogisticRegression # Model
from sklearn.preprocessing import StandardScaler, OneHotEncoder # Preprocess feautures
from sklearn.pipeline import Pipeline, make_pipeline # Pipelines
from sklearn.impute import SimpleImputer # Imputer
from sklearn.compose import ColumnTransformer # For transformation
from sklearn.metrics import classification_report,roc_curve, roc_auc_score # Report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

from sklearn import set_config
from sklearn.model_selection import train_test_split
import re

set_config(display='diagram')
from xgboost import XGBClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.warn(label_encoder_deprecation_msg, UserWarning)

NameError: name 'label_encoder_deprecation_msg' is not defined

# Load the data

In [3]:
Train_full = pd.read_csv('titanic/train.csv',index_col='PassengerId',
                    dtype={'Pclass': 'category','Sex': 'category'})
X_valid_full = pd.read_csv('titanic/test.csv', index_col='PassengerId',
                  dtype={'Pclass': 'category','Sex': 'category'})

In [4]:
# Remove rows with missing target, separate target from predictors
Train_full.dropna(axis=0, subset=['Survived'], inplace=True)
y = Train_full.Survived
Train_full.drop(['Survived'], axis=1, inplace=True)

In [5]:
# Break off test set from training data
X_train_full, X_test_full, y_train, y_test = train_test_split(Train_full, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=1902)

# Prepare the data

In [6]:
X_train_full.dtypes.value_counts() # Look on dtypes of raw features

object      4
float64     2
int64       2
category    1
category    1
dtype: int64

In [7]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype not in ('int64','float64')]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ('int64','float64')]

In [8]:
# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


# Models

## Random Forest

In [9]:
# Define model
model = RandomForestClassifier(n_estimators=100, random_state=1902)

# Bundle preprocessing and modeling code in a pipeline
clf_RF = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
                        ]
                 )
# Preprocessing of training data, fit model 
clf_RF.fit(X_train, y_train)

In [10]:
# Base Results
y_test_preds = clf_RF.predict(X_test)
y_train_preds = clf_RF.predict(X_train)

print('Accuracy (train):', accuracy_score(y_train, y_train_preds))
print('Accuracy (test):', accuracy_score(y_test, y_test_preds))

Accuracy (train): 0.9789325842696629
Accuracy (test): 0.8100558659217877


## XGBoost

In [14]:
# Define model
model = XGBClassifier(n_estimators=100, random_state=1902)

# Bundle preprocessing and modeling code in a pipeline
clf_XGB = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
                        ]
                 )
# Preprocessing of training data, fit model 
clf_XGB.fit(X_train, y_train)





In [15]:
# Base Results
y_test_preds = clf_XGB.predict(X_test)
y_train_preds = clf_XGB.predict(X_train)

print('Accuracy (train):', accuracy_score(y_train, y_train_preds))
print('Accuracy (test):', accuracy_score(y_test, y_test_preds))

Accuracy (train): 0.9662921348314607
Accuracy (test): 0.8044692737430168


In [16]:
gs_params = {}

In [17]:
gs_params[1] = {'model__n_estimators':[10,25,50,100]}
gs_params[2] = {'model__learning_rate':[0.01, 0.05, 0.2, 0.3]}
gs_params[3] = {'model__max_depth':[1, 2, 4, 8]}
gs_params[4] = {'model__subsample':[0.3, 0.5, 0.7, 0.9]}
gs_params[5] = {'model__gamma':[0.05, 0.1, 0.5, 1]}

### Number of estimates

In [18]:
gs_1 = GridSearchCV(
    estimator=clf_XGB,
    param_grid=gs_params[1], 
    cv=5, 
    n_jobs=-1, 
    scoring='accuracy',
    verbose=2
)

In [19]:
gs_1.fit(X_train.append(X_test),y_train.append(y_test))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  gs_1.fit(X_train.append(X_test),y_train.append(y_test))
  gs_1.fit(X_train.append(X_test),y_train.append(y_test))
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




In [20]:
print(gs_1.best_score_)
print(gs_1.best_params_)

0.8159625886636117
{'model__n_estimators': 10}


### Learning rate

In [21]:
gs_2 = GridSearchCV(
    estimator=clf_XGB,
    param_grid=gs_params[2], 
    cv=5, 
    n_jobs=-1, 
    scoring='accuracy',
    verbose=2
)

In [22]:
gs_2.fit(X_train.append(X_test),y_train.append(y_test))

  gs_2.fit(X_train.append(X_test),y_train.append(y_test))
  gs_2.fit(X_train.append(X_test),y_train.append(y_test))


Fitting 5 folds for each of 4 candidates, totalling 20 fits






In [23]:
print(gs_2.best_score_)
print(gs_2.best_params_)

0.8125855250768941
{'model__learning_rate': 0.05}


### Max depth

In [24]:
gs_3 = GridSearchCV(
    estimator=clf_XGB,
    param_grid=gs_params[3], 
    cv=5, 
    n_jobs=-1, 
    scoring='accuracy',
    verbose=2
)

In [25]:
gs_3.fit(X_train.append(X_test),y_train.append(y_test))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  gs_3.fit(X_train.append(X_test),y_train.append(y_test))
  gs_3.fit(X_train.append(X_test),y_train.append(y_test))




In [26]:
print(gs_3.best_score_)
print(gs_3.best_params_)

0.8137091205825122
{'model__max_depth': 2}


### Subsample

In [27]:
gs_4 = GridSearchCV(
    estimator=clf_XGB,
    param_grid=gs_params[4], 
    cv=5, 
    n_jobs=-1, 
    scoring='accuracy',
    verbose=2
)

In [28]:
gs_4.fit(X_train.append(X_test),y_train.append(y_test))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  gs_4.fit(X_train.append(X_test),y_train.append(y_test))
  gs_4.fit(X_train.append(X_test),y_train.append(y_test))




In [29]:
print(gs_4.best_score_)
print(gs_4.best_params_)

0.8092147385600402
{'model__subsample': 0.3}


### Gamma

In [30]:
gs_5 = GridSearchCV(
    estimator=clf_XGB,
    param_grid=gs_params[5], 
    cv=5, 
    n_jobs=-1, 
    scoring='accuracy',
    verbose=2
)

In [31]:
gs_5.fit(X_train.append(X_test),y_train.append(y_test))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  gs_5.fit(X_train.append(X_test),y_train.append(y_test))
  gs_5.fit(X_train.append(X_test),y_train.append(y_test))




In [32]:
print(gs_5.best_score_)
print(gs_5.best_params_)

0.822666499278137
{'model__gamma': 0.5}


### More over suboptimal

In [33]:
print(gs_1.best_params_)
print(gs_2.best_params_)
print(gs_3.best_params_)
print(gs_4.best_params_)
print(gs_5.best_params_)

{'model__n_estimators': 10}
{'model__learning_rate': 0.05}
{'model__max_depth': 2}
{'model__subsample': 0.3}
{'model__gamma': 0.5}


In [34]:
# gs_params[6] = {'model__n_estimators': [5,10,15,20]
#                 ,'model__learning_rate': [0.03,0.04,0.05,0.06,0.07]
#                 ,'model__max_depth': [2,3,4,5]
#                 ,'model__subsample': [0.2,0.25,0.3,0.35,0.4]
#                 ,'model__gamma': [0.4,0.45,0.5,0.55,0.6]}

In [55]:
# gs_6 = GridSearchCV(
#     estimator=clf_XGB,
#     param_grid=gs_params[6],
#     cv=5,
#     n_jobs=-1,
#     scoring='accuracy',
#     verbose=-1
# )

In [57]:
# gs_6.fit(X_train.append(X_test),y_train.append(y_test))

In [37]:
# print(gs_6.best_score_)
# print(gs_6.best_params_)

0.8148452702278576
{'model__gamma': 0.4, 'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 20, 'model__subsample': 0.4}


In [39]:
best_params = {'model__gamma': 0.4, 'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 20, 'model__subsample': 0.4}

In [58]:
clf_XGB.set_params(**best_params)

In [59]:
# Preprocessing of training data, fit model
clf_XGB.fit(X_train, y_train)





# Submission

In [62]:
predictions = clf_XGB.predict(X_valid)

output = pd.DataFrame({'PassengerId': X_valid.index, 'Survived': predictions})
output.to_csv('submission.csv', index=False)