# Titanic - Machine Learning from Disaster. CatBoost

<h1>Содержание<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Импорт-модулей-и-константы" data-toc-modified-id="Импорт-модулей-и-константы-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Импорт модулей и константы</a></span></li><li><span><a href="#Настройка-логирования" data-toc-modified-id="Настройка-логирования-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Настройка логирования</a></span></li><li><span><a href="#Загрузка-данных" data-toc-modified-id="Загрузка-данных-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Загрузка данных</a></span></li><li><span><a href="#Базовая-модель" data-toc-modified-id="Базовая-модель-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Базовая модель</a></span></li><li><span><a href="#Новые-признаки" data-toc-modified-id="Новые-признаки-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Новые признаки</a></span></li><li><span><a href="#Добавление-кластеризации" data-toc-modified-id="Добавление-кластеризации-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Добавление кластеризации</a></span></li><li><span><a href="#Логистическая-регрессия-как-признак" data-toc-modified-id="Логистическая-регрессия-как-признак-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Логистическая регрессия как признак</a></span></li></ul></div>

## Импорт модулей и константы

In [1]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from category_encoders.m_estimate import MEstimateEncoder
from joblib import load
import numpy as np
import optuna
# from optuna.integration import CatBoostPruningCallback
import pandas as pd
import shap
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

from useful_funcs import make_notifier


# Global experiment settings shared by the cells below.
SEED = 42  # passed as random_seed to the CatBoost models
N_JOBS = -1  # n_jobs value used for DBSCAN and RandomForestClassifier in ClusterAdder
CV=5  # number of folds (StratifiedKFold n_splits and cross_val_score cv)
N_TRIALS = 1000  # not referenced in the visible cells; presumably the Optuna trial budget — TODO confirm


# Ask scikit-learn transformers to return pandas DataFrames from transform().
sklearn.set_config(transform_output="pandas")
# Render matplotlib figures inside the notebook output.
%matplotlib inline

## Настройка логирования

In [2]:
# make_notifier comes from the project-local useful_funcs module and its behaviour is not
# visible here; presumably it returns a callable that delivers a status message — TODO confirm.
set_notification = make_notifier()

In [3]:
# Send the first progress message; the adjacent string literals are concatenated by Python
# into a single multi-line message.
set_notification(
    "Titanic - Machine Learning from Disaster. CatBoost"
    "\n\nМодули импортированы."
    "\nЛогирование настроено."
)

## Загрузка данных
Загрузим тренировочный и тестовый наборы данных.

In [4]:
# Each .joblib file is unpacked into a (features, target) pair.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
X_train, y_train = load("train_set.joblib")
X_test, y_test = load("test_set.joblib")

In [5]:
# Print column dtypes and non-null counts of the training features.
X_train.info()

# Show the first rows of the training features as the cell output.
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 692 to 507
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Pclass       712 non-null    int64  
 2   Name         712 non-null    object 
 3   Sex          712 non-null    object 
 4   Age          575 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Ticket       712 non-null    object 
 8   Fare         712 non-null    float64
 9   Cabin        160 non-null    object 
 10  Embarked     710 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
692,693,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S
481,482,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,S
527,528,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S
855,856,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,,S
801,802,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.25,,S


## Базовая модель

In [6]:
# CatBoost Pool for the baseline model: every missing value is replaced by the sentinel -999,
# the listed columns are declared categorical and Name is declared a text feature.
# NOTE(review): PassengerId is declared categorical although the head() output above suggests
# it is a per-row identifier — confirm that this is intended.
train_pool = Pool(
    data=X_train.fillna(-999),
    label=y_train, 
    cat_features=['PassengerId', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
    text_features=['Name']
)

In [7]:
# Parameters passed to catboost.cv below: Logloss is the optimised loss, Accuracy and AUC are
# additionally tracked, and the seed is fixed to SEED.
model_params = dict(
    loss_function = metrics.Logloss(),
    custom_loss=[metrics.Accuracy(), metrics.AUC()],
    random_seed=SEED,
)

In [8]:
# Shuffled stratified folds. random_state pins the shuffle so that fold membership — and
# therefore every CV score computed on these folds — is identical after a kernel restart
# and comparable between the experiments that reuse this splitter.
skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=SEED)

In [9]:
# CatBoost cross-validation of the baseline on explicit fold indices generated by skf
# (instead of letting catboost.cv build its own folds); plot=True draws the metric widget.
cv_data = cv(
    pool=train_pool, 
    params=model_params, 
    logging_level='Silent', 
    plot=True,
    folds=skf.split(X_train.fillna(-999), y_train),
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [10]:
# Position of the cv_data row (presumably one row per boosting iteration) with the highest
# mean test accuracy.
# NOTE(review): the row is selected on the same folds the score is reported on, which
# presumably makes this figure optimistic — compare with the cross_val_score result below.
best_id = np.argmax(cv_data['test-Accuracy-mean'])

print(f"Best mean accuracy: {np.round(cv_data['test-Accuracy-mean'][best_id], 3)}")
print(f"Accuracy std: {np.round(cv_data['test-Accuracy-std'][best_id], 4)}")

Best mean accuracy: 0.836
Accuracy std: 0.0226


In [13]:
# The same baseline expressed as a scikit-learn compatible estimator so that it can be scored
# with cross_val_score.
model = CatBoostClassifier(
    loss_function = metrics.Logloss(),
    cat_features=['PassengerId', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
    text_features=['Name'],
    random_seed=SEED,
    verbose=False
)

# The folds here come from cv=CV, not from skf, so they are not the folds used by the
# catboost.cv run above.
scores = cross_val_score(
    estimator=model,
    X=X_train.fillna(-999),
    y=y_train,
    scoring='accuracy',
    cv=CV,
    n_jobs=1,
)

# Per-fold accuracies followed by their mean and standard deviation.
print(f'Scores: {scores}')
print(f'mean accuracy: {scores.mean()}')
print(f'accuracy std: {scores.std()}')

Scores: [0.81818182 0.78321678 0.85915493 0.85211268 0.80985915]
mean accuracy: 0.8245050723923963
accuracy std: 0.028007147831220762


In [12]:
# Show the raw training features again before the feature-engineering section.
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
692,693,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S
481,482,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,S
527,528,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S
855,856,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,,S
801,802,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.25,,S


## Новые признаки

In [18]:
def add_title(X, y=None, group_rare=False):
    """Return X with an extra ``Title`` column parsed from ``Name``.

    The title is the text between the first ", " and the following ". " in
    the name, lower-cased. Names that do not match yield a missing value.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a string ``Name`` column. It is not modified.
    y : ignored
        Accepted so the function has a transformer-style signature.
    group_rare : bool, default False
        When True, every title other than "mr", "miss", "mrs" and "master"
        (including missing ones) is replaced by "aristocratic".

    Returns
    -------
    pandas.DataFrame
        ``X`` with the ``Title`` column appended on the right.
    """
    extracted = X.Name.str.extract(pat=r"\b,\s(.+?)\.\s[\b(]?")
    title = extracted[0].str.lower().rename("Title")

    if group_rare:
        frequent = ["mr", "miss", "mrs", "master"]
        is_frequent = title.isin(frequent)
        title = title.where(is_frequent, "aristocratic")

    return pd.concat([X, title], axis="columns")


def add_family(X, y=None):
    """Return X with a ``Family`` column equal to ``Parch + SibSp``.

    ``X`` is left untouched; ``y`` is accepted only for a transformer-style
    signature and is ignored.
    """
    relatives = (X.Parch + X.SibSp).rename("Family")

    return pd.concat([X, relatives], axis="columns")


def combine_fare_age(X, y=None):
    """Return a copy of X with ``FareAgeComb = Fare / Age`` appended.

    ``y`` is ignored; it exists only for a transformer-style signature.
    """
    fare_per_age = X["Fare"].div(X["Age"])

    return X.assign(FareAgeComb=fare_per_age)


def combine_fare_pclass(X, y=None):
    """Return a copy of X with ``FarePclassComb = Fare / Pclass`` appended.

    ``y`` is ignored; it exists only for a transformer-style signature.
    """
    fare_per_class = X["Fare"].div(X["Pclass"])

    return X.assign(FarePclassComb=fare_per_class)


def combine_sibsp_family(X, y=None):
    """Return a copy of X with ``SibSpFamilyComb = SibSp / (Family + 1)`` appended.

    Requires the ``Family`` column to be present already. ``y`` is ignored.
    """
    group_size = X["Family"].add(1)
    sibsp_share = X["SibSp"].div(group_size)

    return X.assign(SibSpFamilyComb=sibsp_share)


def combine_parch_family(X, y=None):
    """Return a copy of X with ``ParchFamilyComb = Parch / (Family + 1)`` appended.

    Requires the ``Family`` column to be present already. ``y`` is ignored.
    """
    group_size = X["Family"].add(1)
    parch_share = X["Parch"].div(group_size)

    return X.assign(ParchFamilyComb=parch_share)


def get_interaction(X, y=None):
    """Return a copy of X extended with pairwise products of its non-object columns.

    For every unordered pair of columns whose dtype is not ``object`` a new
    column named ``"<left>_<right>"`` (in original column order) is appended,
    holding the element-wise product of the two. ``y`` is ignored.
    """
    res = X.copy()
    numeric_cols = list(X.select_dtypes(exclude='object').columns)

    for position, left in enumerate(numeric_cols):
        # Pair each column only with the ones after it, so every pair appears once.
        for right in numeric_cols[position + 1:]:
            res[f'{left}_{right}'] = res[left] * res[right]

    return res

In [19]:
def prep_data(X, y=None, fill_vall=-999):
    """Apply the full hand-crafted feature engineering chain to raw passenger data.

    Steps, in order: drop ``PassengerId``, replace missing values with
    ``fill_vall``, add the grouped ``Title``, add ``Family``, add the four
    ratio features and finally the pairwise products of the non-object
    columns. Missing values are filled before the ratios are computed, so the
    ratios are calculated on the fill value where data was missing.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw features; must contain the columns used by the individual steps.
    y : ignored
        Accepted so the function can be wrapped in a FunctionTransformer.
    fill_vall : scalar, default -999
        Value substituted for every missing entry.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns appended.
    """
    data = X.drop(columns='PassengerId').fillna(fill_vall)
    data = add_title(data, group_rare=True)

    later_steps = (
        add_family,
        combine_fare_age,
        combine_fare_pclass,
        combine_sibsp_family,
        combine_parch_family,
        get_interaction,
    )
    for step in later_steps:
        data = step(data)

    return data

## Добавление кластеризации

In [20]:
class ClusterAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends a ``cluster`` column with a predicted cluster label.

    ``fit`` labels the training rows with DBSCAN (``fit_predict``) and then
    trains a RandomForestClassifier to reproduce those labels from the same
    features. ``transform`` uses that classifier to label whatever rows it is
    given, so unseen rows also receive a cluster label.

    Parameters
    ----------
    classifier_params : dict or None, default None
        Keyword arguments for RandomForestClassifier. ``None`` stands for
        ``dict(n_estimators=100, max_depth=10)``. ``random_state`` defaults
        to the notebook-level ``SEED`` and ``n_jobs`` to ``N_JOBS`` unless
        given here.
    dbscans_params : dict or None, default None
        Keyword arguments for DBSCAN. ``None`` stands for no extra arguments;
        ``n_jobs`` defaults to ``N_JOBS`` unless given here.

    Attributes
    ----------
    model : RandomForestClassifier
        The classifier fitted on the DBSCAN labels; set by ``fit``.
    """

    def __init__(self, classifier_params=None, dbscans_params=None):
        # Parameters are stored untouched and resolved in fit(): a dict used as a default
        # argument would be one shared object mutated across all instances.
        self.dbscans_params = dbscans_params
        self.classifier_params = classifier_params

    def fit(self, X, y=None):
        """Cluster X with DBSCAN and fit the forest that imitates the clustering.

        ``y`` is ignored. Returns ``self``.
        """
        dbscan_kwargs = {"n_jobs": N_JOBS}
        if self.dbscans_params is not None:
            dbscan_kwargs.update(self.dbscans_params)

        forest_kwargs = {"n_jobs": N_JOBS, "random_state": SEED}
        if self.classifier_params is None:
            forest_kwargs.update(n_estimators=100, max_depth=10)
        else:
            forest_kwargs.update(self.classifier_params)

        clusters_y = DBSCAN(**dbscan_kwargs).fit_predict(X)

        # Seeded forest: the same input now always yields the same `cluster` column.
        self.model = RandomForestClassifier(**forest_kwargs).fit(X, clusters_y)

        return self

    def transform(self, X, y=None):
        """Return a copy of X with the predicted label stored in a ``cluster`` column."""
        res = X.copy()
        res['cluster'] = self.model.predict(X)

        return res

In [21]:
# Columns that CatBoost receives as categorical features ('Title' is created by prep_data).
cat_cols = ['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']

# Feature preparation:
#   initial     - prep_data: drop PassengerId, fill missing values, add engineered columns;
#   col_spliter - 'pass' forwards Name and the raw categorical columns unchanged, while
#                 'preps' encodes the categorical columns, scales every column except Name,
#                 appends the `cluster` label and then drops the encoded categorical columns,
#                 so each categorical column reaches the output only once (from 'pass').
data_preps = Pipeline([
    ('initial', FunctionTransformer(prep_data, kw_args=dict(fill_vall=-999))),
    ('col_spliter',
     ColumnTransformer(
         [('pass', 'passthrough', ['Name'] + cat_cols),
          ('preps',
           Pipeline([
               ('encoder', MEstimateEncoder(cols=cat_cols)),
               ('scaler', StandardScaler()),
               ('cluster', ClusterAdder()),
               ('drop', FunctionTransformer(lambda x: x.drop(columns=cat_cols)))
           ]),
           # Every prepared column except the raw text; computed once from X_train when the
           # pipeline object is built.
           [col for col in prep_data(X_train).columns if col != 'Name']
          )
         ],
         remainder='passthrough',
         verbose_feature_names_out=False
     )
    ),
])

# NOTE(review): the preparation is fitted once on the whole training set together with
# y_train and only afterwards split into folds, so the `cluster` column may carry target
# information from the validation folds (presumably via the target-based encoder). Treat
# the catboost.cv score below as optimistic; the pipeline-based cross_val_score further
# down passes the raw X_train to the full pipeline instead.
prepared_data = data_preps.fit_transform(X_train, y_train)

train_pool = Pool(
    data=prepared_data,
    label=y_train,
    cat_features=cat_cols + ['cluster'],
    text_features=['Name']
)

# The CatBoostClassifier that used to be constructed here was never used (cv takes its
# parameters from model_params), so it and the commented-out alternative cv call were removed.
cv_data = cv(
    pool=train_pool,
    params=model_params,
    logging_level='Silent',
    plot=True,
    folds=skf.split(prepared_data, y_train),
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [22]:
# Position of the cv_data row (presumably one row per boosting iteration) with the highest
# mean test accuracy for the run with the cluster feature.
# NOTE(review): the row is selected on the same folds the score is reported on, which
# presumably makes this figure optimistic.
best_id = np.argmax(cv_data['test-Accuracy-mean'])

print(f"Best mean accuracy: {np.round(cv_data['test-Accuracy-mean'][best_id], 3)}")
print(f"Accuracy std: {np.round(cv_data['test-Accuracy-std'][best_id], 4)}")

Best mean accuracy: 0.888
Accuracy std: 0.0175


In [23]:
# Final estimator of the pipeline: CatBoost with the categorical columns plus the generated
# `cluster` column declared categorical and Name declared a text feature.
classifier = CatBoostClassifier(
    iterations=1000,
    cat_features=cat_cols + ['cluster'],
    text_features=['Name'],
    loss_function = metrics.Logloss(),
    random_seed=SEED,
    verbose=False,
)

# Same preparation steps as data_preps above, with the classifier appended as the last step,
# so preparation and model are handled as one estimator.
model = Pipeline([
    ('initial', FunctionTransformer(prep_data, kw_args=dict(fill_vall=-999))),
    ('col_spliter', 
     ColumnTransformer(
         [('pass', 'passthrough', ['Name'] + cat_cols),
          ('preps', 
           Pipeline([
               ('encoder',MEstimateEncoder(cols=cat_cols)),
               ('scaler', StandardScaler()),
               ('cluster', ClusterAdder()),
               ('drop', FunctionTransformer(lambda x: x.drop(columns=cat_cols)))
           ]),
           # every prepared column except the raw text; computed once from X_train
           [col for col in prep_data(X_train).columns if col != 'Name']
          )
         ], 
         remainder='passthrough',
         verbose_feature_names_out=False
     )
    ),
    ('classifier', classifier)
])

# Raw X_train goes in, so cross_val_score evaluates preparation and classifier together;
# the folds come from cv=CV rather than from skf.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=CV,
    n_jobs=1,
)

# Per-fold accuracies followed by their mean and standard deviation.
print(f'Scores: {scores}')
print(f'mean accuracy: {scores.mean()}')
print(f'accuracy std: {scores.std()}')

Scores: [0.8041958  0.81118881 0.83802817 0.87323944 0.8028169 ]
mean accuracy: 0.8258938244853737
accuracy std: 0.02686423341453882


## Логистическая регрессия как признак

In [None]:
# class ProbAdder(BaseEstimator, TransformerMixin):
#     def __init__(self, frac=0.05, C=0.1, penalty='l1'):
#         self.frac = frac
#         self.C = C
#         self.penalty = penalty
        
    
#     def fit(self, X, y=None):
        
#         selected_X = X.sample(frac=self.frac)
#         selected_y = y[selected_X.index]
        
#         self.model = (
#             LogisticRegression(
#                 penalty=self.penalty, 
#                 C=self.C, 
#                 class_weight='balanced', 
#                 solver='liblinear'
#             )
#             .fit(selected_X, selected_y)
#         )
        
#         return self
    
    
#     def transform(self, X, y=None):
#         res=X.copy()
#         res['prob'] = self.model.predict_proba(X)[:, 1]
        
#         return res

In [None]:
# data_preps = Pipeline([
#     ('initial', FunctionTransformer(prep_data, kw_args=dict(fill_vall=-999))),
#     ('col_spliter', 
#      ColumnTransformer(
#          [('pass', 'passthrough', ['Name'] + cat_cols),
#           ('preps', 
#            Pipeline([
#                ('encoder',MEstimateEncoder(cols=cat_cols)),
#                ('scaler', StandardScaler()),
#                ('cluster', ClusterAdder()),
#                ('prob', ProbAdder()),
#                ('drop', FunctionTransformer(lambda x: x.drop(columns=cat_cols)))
#            ]),
#            [col for col in prep_data(X_train).columns if col != 'Name']
#           )
#          ], 
#          remainder='passthrough',
#          verbose_feature_names_out=False
#      )
#     ),
# ])

# prepared_data = data_preps.fit_transform(X_train, y_train)

# train_pool = Pool(
#     data=prepared_data, 
#     label=y_train, 
#     cat_features=cat_cols + ['cluster'],
#     text_features=['Name']
# )

# model = CatBoostClassifier(
#     loss_function = metrics.Logloss(),
#     custom_loss=[metrics.Accuracy(), metrics.AUC()],
#     random_seed=SEED,
#     verbose=False,
# )

# cv_data = cv(
#     pool=train_pool, 
#     params=model.get_params(), 
#     logging_level='Silent', 
#     stratified=True, 
#     plot=True,
#     fold_count=CV,
# )

In [None]:
# best_id = np.argmax(cv_data['test-Accuracy-mean'])

# print(f"Best mean accuracy: {np.round(cv_data['test-Accuracy-mean'][best_id], 3)}")
# print(f"Accuracy std: {np.round(cv_data['test-Accuracy-std'][best_id], 4)}")

In [None]:
# classifier = CatBoostClassifier(
#     iterations=1000,
#     cat_features=cat_cols + ['cluster'],
#     text_features=['Name'],
#     loss_function = metrics.Logloss(),
#     random_seed=SEED,
#     verbose=False,
# )


# final_pipeline = Pipeline([
#     ('initial', FunctionTransformer(prep_data, kw_args=dict(fill_vall=-999))),
#     ('col_spliter', 
#      ColumnTransformer(
#          [('pass', 'passthrough', ['Name'] + cat_cols),
#           ('preps', 
#            Pipeline([
#                ('encoder',MEstimateEncoder(cols=cat_cols)),
#                ('scaler', StandardScaler()),
#                ('cluster', ClusterAdder()),
#                ('prob', ProbAdder(frac=1)),
#                ('drop', FunctionTransformer(lambda x: x.drop(columns=cat_cols)))
#            ]),
#            [col for col in prep_data(X_train).columns if col != 'Name']
#           )
#          ], 
#          remainder='passthrough',
#          verbose_feature_names_out=False
#      )
#     ),
#     ('classifier', classifier)
# ])

In [None]:
# scores = cross_val_score(
#     estimator=final_pipeline,
#     X=X_train,
#     y=y_train,
#     scoring='accuracy',
#     cv=CV,
#     n_jobs=-1,
# )

# print(f'Scores: {scores}')
# print(f'mean accuracy: {scores.mean()}')
# print(f'accuracy std: {scores.std()}')

In [None]:
# The only other definition of `final_pipeline` sits inside the commented-out ProbAdder
# section above, so on a fresh kernel this cell used to fail with NameError. Bind the name
# to the pipeline evaluated in the clustering experiment (`model`), which has the same
# layout (preparation steps followed by a 'classifier' step).
# NOTE(review): if the ProbAdder section is re-enabled, drop this assignment so that the
# pipeline defined there is the one being fitted.
final_pipeline = model

# Fit preparation and classifier on the full training set.
final_pipeline.fit(X_train, y_train)

In [None]:
# Show every feature with its CatBoost importance, most important first. Requires
# final_pipeline to be fitted. Feature names are taken from the output of all pipeline
# steps except the classifier; the display limits are lifted only inside this block.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(
        pd
        .DataFrame(
            final_pipeline['classifier'].get_feature_importance(),
            index=final_pipeline[:-1].transform(X_train).columns,
            columns=['feature_importance']
        )
        .reset_index()
        .sort_values(by='feature_importance', ascending=False)
    )

In [None]:
def objective(trial):
    