# Titanic - Machine Learning from Disaster. Ensemble

<h1>Содержание<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Загрузка-данных" data-toc-modified-id="Загрузка-данных-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Загрузка данных</a></span></li><li><span><a href="#Подготовка-к-обучению-моделей" data-toc-modified-id="Подготовка-к-обучению-моделей-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Подготовка к обучению моделей</a></span><ul class="toc-item"><li><span><a href="#Категориальные-признаки" data-toc-modified-id="Категориальные-признаки-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Категориальные признаки</a></span></li><li><span><a href="#Функция-для-вывода-оценки-модели" data-toc-modified-id="Функция-для-вывода-оценки-модели-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Функция для вывода оценки модели</a></span></li><li><span><a href="#Трансформер-для-получения-титула-из-имени" data-toc-modified-id="Трансформер-для-получения-титула-из-имени-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Трансформер для получения титула из имени</a></span></li><li><span><a href="#Трансформер-для-удаления-признаков" data-toc-modified-id="Трансформер-для-удаления-признаков-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Трансформер для удаления признаков</a></span></li></ul></li><li><span><a href="#Базовые-модели" data-toc-modified-id="Базовые-модели-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Базовые модели</a></span><ul class="toc-item"><li><span><a href="#CatBoost" data-toc-modified-id="CatBoost-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>CatBoost</a></span></li><li><span><a href="#Градиентный-бустинг-с-LightGBM" data-toc-modified-id="Градиентный-бустинг-с-LightGBM-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Градиентный бустинг с LightGBM</a></span></li><li><span><a href="#Случайный-лес-с-LightGBM" data-toc-modified-id="Случайный-лес-с-LightGBM-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Случайный лес с LightGBM</a></span></li></ul></li><li><span><a 
href="#Стекинг" data-toc-modified-id="Стекинг-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Стекинг</a></span></li></ul></div>

# Импорт модулей и константы

In [1]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool, metrics, cv
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder
from lightgbm import LGBMClassifier, LGBMRegressor
from joblib import load
import numpy as np
import pandas as pd

import sklearn
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, PolynomialFeatures

# Random seed; passed as `random_seed` to the standalone CatBoostClassifier.
SEED = 42
# Value handed to every `n_jobs` argument that is parallelised
# (LightGBM models and cross_val_predict); -1 asks for all available cores.
N_JOBS = -1
# `cv` argument used for every cross_val_score call in this notebook.
CV = 5
# NOTE(review): not referenced in the visible cells; presumably a
# hyper-parameter search budget — TODO confirm or remove.
N_TRIALS = 1000

# Make scikit-learn transformers return pandas DataFrames, so column names
# survive each pipeline step (the encoders and CatBoost select columns by name).
sklearn.set_config(transform_output="pandas")
%matplotlib inline

## Загрузка данных

In [2]:
# NOTE(review): joblib.load unpickles arbitrary objects — only load files that
# come from a trusted source.
# Each file is unpacked into a (features, target) pair.
X_train, y_train = load("train_set.joblib")
X_test, y_test = load("test_set.joblib")

## Подготовка к обучению моделей

### Категориальные признаки

In [3]:
# Columns treated as categorical: given to CatBoost as `cat_features` and to
# MEstimateEncoder as `cols`. 'Title' does not exist in the raw data; it is
# added inside each pipeline by the title transformer.
cat_features = ['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']
# NOTE(review): presumably the positional counterpart of `cat_features`; it is
# not referenced in the visible cells — TODO confirm the positions or remove.
cat_indexes = [0, 1, 5, 7, 8, 9]

### Функция для вывода оценки модели

In [4]:
def plot_accuracy_score(scores):
    """Print per-fold accuracy scores together with their mean and std.

    Despite the name, nothing is plotted: three lines are written to stdout.

    Parameters
    ----------
    scores : array-like of float
        Per-fold accuracy values, e.g. the result of ``cross_val_score``.
        NumPy arrays as well as plain lists/tuples are accepted.
    """
    # Coerce once so that .mean()/.std() also work for plain Python sequences.
    scores = np.asarray(scores)

    print(f'Scores: {np.round(scores, 4)}')
    print(f'mean accuracy: {np.round(scores.mean(), 3)}')
    print(f'accuracy std: {np.round(scores.std(), 4)}')

### Трансформер для получения титула из имени

In [5]:
def add_title(X, y=None, group_rare=True):
    """Return a copy of ``X`` with a lower-cased ``Title`` column parsed from ``Name``.

    The title is the text between the first ", " and the following ". " in
    ``X["Name"]`` (e.g. "Braund, Mr. Owen Harris" -> "mr").

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a string column ``Name``.
    y : ignored
        Present only so the function fits the transformer call convention.
    group_rare : bool, default=True
        When True, every title other than "mr", "miss", "mrs" and "master"
        (including names where no title could be extracted) is replaced by
        "aristocratic".

    Returns
    -------
    pandas.DataFrame
        ``X`` plus one ``Title`` column. An already existing ``Title`` column
        is replaced, so applying the function twice does not produce duplicate
        column names.
    """
    extracted = X["Name"].str.extract(pat=r"\b,\s(.+?)\.\s[\b(]?")
    title = pd.Series(extracted[0], name="Title").str.lower()

    if group_rare:
        common_titles = ["mr", "miss", "mrs", "master"]
        # Non-matching rows are NaN here; isin() is False for them, so they
        # fall into the catch-all bucket as well.
        title = title.where(title.isin(common_titles), "aristocratic")

    # Drop a stale Title column first: concat would otherwise yield two
    # columns with the same name and break the downstream encoders.
    base = X.drop(columns="Title", errors="ignore")

    return pd.concat([base, title], axis="columns")

# Stateless pipeline step that wraps add_title with its default arguments.
title_adder = FunctionTransformer(func=add_title)

### Трансформер для удаления признаков

In [6]:
# Stateless pipeline step that discards the identifier and the raw name;
# it must run after the title step, which still reads `Name`.
name_remover = FunctionTransformer(
    func=lambda frame: frame.drop(columns=['PassengerId', 'Name']),
)

## Базовые модели

### CatBoost

In [7]:
# CatBoost receives the categorical columns by name, so this pipeline has no
# encoder step.
classifier = CatBoostClassifier(
    verbose=False,
    random_seed=SEED,
    cat_features=cat_features,
    loss_function=metrics.Logloss(),
)

catboost_pipeline = Pipeline(steps=[
    ('title', title_adder),
    ('drop', name_remover),
    ('imputer', SimpleImputer(fill_value=-999, strategy='constant')),
    ('classifier', classifier),
])

# Fold-wise accuracy, evaluated sequentially (n_jobs=1).
scores = cross_val_score(
    catboost_pipeline,
    X_train,
    y_train,
    cv=CV,
    scoring='accuracy',
    n_jobs=1,
)

plot_accuracy_score(scores)

Scores: [0.8182 0.7832 0.838  0.8451 0.8028]
mean accuracy: 0.817
accuracy std: 0.0227


### Градиентный бустинг с LightGBM

In [8]:
# Gradient-boosted trees; categorical columns are encoded with
# MEstimateEncoder before they reach LightGBM.
classifier = LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=250,
    learning_rate=0.1,
    class_weight='balanced',
    verbose=-100,
    n_jobs=N_JOBS,
)

lgbm_pipeline = Pipeline(steps=[
    ('title', title_adder),
    ('drop', name_remover),
    ('encoder', MEstimateEncoder(cols=cat_features)),
    ('imputer', SimpleImputer(fill_value=-999, strategy='constant')),
    ('classifier', classifier),
])

# Fold-wise accuracy, evaluated sequentially (n_jobs=1).
scores = cross_val_score(
    lgbm_pipeline,
    X_train,
    y_train,
    cv=CV,
    scoring='accuracy',
    n_jobs=1,
)

plot_accuracy_score(scores)

Scores: [0.7692 0.8112 0.8451 0.8028 0.7817]
mean accuracy: 0.802
accuracy std: 0.0262


### Случайный лес с LightGBM

In [9]:
# LightGBM in random-forest mode: every tree is grown on half of the rows
# and half of the features, re-sampled at each iteration.
classifier = LGBMClassifier(
    boosting_type='rf',
    bagging_freq=1,
    bagging_fraction=0.5,
    feature_fraction=0.5,
    class_weight='balanced',
    verbose=-100,
    n_jobs=N_JOBS,
)

rf_pipeline = Pipeline(steps=[
    ('title', title_adder),
    ('drop', name_remover),
    ('encoder', MEstimateEncoder(cols=cat_features)),
    ('imputer', SimpleImputer(fill_value=-999, strategy='constant')),
    ('classifier', classifier),
])

# Fold-wise accuracy, evaluated sequentially (n_jobs=1).
scores = cross_val_score(
    rf_pipeline,
    X_train,
    y_train,
    cv=CV,
    scoring='accuracy',
    n_jobs=1,
)

plot_accuracy_score(scores)

Scores: [0.7902 0.8182 0.8732 0.8169 0.7817]
mean accuracy: 0.816
accuracy std: 0.032


## Стекинг

In [10]:
class Blender(BaseEstimator, ClassifierMixin):
    """Stacking classifier: five base regressors feed a logistic-regression meta-model.

    Base level (each model is fitted as a *regressor* on the class label):
    a CatBoost regressor, two LightGBM regressors (``gbdt`` and ``rf`` boosting
    with the default parameters) and two Ridge regressions. Their out-of-fold
    predictions become the meta-features, which are expanded with pairwise
    interaction terms, standardised and passed to ``LogisticRegression``.

    Parameters
    ----------
    catboost_params : dict or None, default=None
        Keyword arguments for ``CatBoostRegressor``. ``None`` selects
        ``cat_features=cat_features, verbose=False``.
    lgbm_params : dict or None, default=None
        Keyword arguments for the first ``LGBMRegressor``. ``None`` selects a
        250-tree ``gbdt`` model with ``learning_rate=0.1``.
    rf_params : dict or None, default=None
        Keyword arguments for the second ``LGBMRegressor``. ``None`` selects
        ``boosting_type='rf'`` with 0.5 bagging and feature fractions.
    meta_model_params : dict or None, default=None
        Keyword arguments for the ``LogisticRegression`` meta-model. ``None``
        selects ``class_weight='balanced'``.
    lin_reg_params : dict or None, default=None
        Keyword arguments for the first ``Ridge`` model (``None`` means library
        defaults). The second Ridge model always uses ``alpha=0.1``.
    cv : default=5
        Passed unchanged as ``cv`` to ``cross_val_predict`` when the
        out-of-fold meta-features are built.

    Attributes
    ----------
    catboost_pipeline, lgbm_pipeline, rf_pipeline, lin_reg_pipe, lin_reg_1_pipe : Pipeline
        Base pipelines, fitted on the full training data at the end of ``fit``.
    metamodel : Pipeline
        Fitted meta-model (interaction features -> scaler -> logistic regression).
    classes_ : ndarray
        Class labels seen by the meta-model.

    Notes
    -----
    Constructor arguments are stored untouched and the defaults are resolved in
    ``fit``, as the scikit-learn estimator contract requires; this keeps
    ``get_params`` / ``set_params`` / ``clone`` round-trips safe (for example,
    ``set_params(lgbm_params=None)`` followed by ``fit`` works).
    """

    def __init__(
        self,
        catboost_params=None,
        lgbm_params=None,
        rf_params=None,
        meta_model_params=None,
        lin_reg_params=None,
        cv=5,
    ):
        self.cv = cv
        self.catboost_params = catboost_params
        self.lgbm_params = lgbm_params
        self.rf_params = rf_params
        self.meta_model_params = meta_model_params
        self.lin_reg_params = lin_reg_params

    @staticmethod
    def _params_or(params, default):
        """Return ``params`` unless it is None, in which case return ``default``."""
        return default if params is None else params

    @staticmethod
    def _make_linear_pipeline(regressor):
        """Build the preprocessing + regressor pipeline shared by both Ridge models."""
        # Ticket and Cabin are left out: only these columns reach the one-hot encoder.
        columns = [
            'PassengerId', 'Pclass', 'Name', 'Sex', 'Age',
            'SibSp', 'Parch', 'Fare', 'Embarked',
        ]
        return Pipeline([
            ('col_selector', FunctionTransformer(lambda x: x[columns])),
            ('title', title_adder),
            ('drop', name_remover),
            ('encoder', OneHotEncoder(cols=['Sex', 'Title', 'Pclass', 'Embarked'])),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('classifier', regressor),
        ])

    def _base_pipelines(self):
        """Map meta-feature column names to base pipelines, in column order."""
        return {
            'catboost': self.catboost_pipeline,
            'lgbm': self.lgbm_pipeline,
            'rf': self.rf_pipeline,
            'lin_reg': self.lin_reg_pipe,
            'lin_reg_1': self.lin_reg_1_pipe,
        }

    def get_metafeatures(self, X, y=None):
        """Return one column of base-model predictions per base pipeline.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw feature frame accepted by the base pipelines.
        y : array-like or None, default=None
            When given, the columns are out-of-fold predictions obtained with
            ``cross_val_predict`` (training mode). When ``None``, the already
            fitted base pipelines predict directly (inference mode).

        Returns
        -------
        pandas.DataFrame
            Columns ``catboost``, ``lgbm``, ``rf``, ``lin_reg``, ``lin_reg_1``.
        """
        meta_features = pd.DataFrame()

        for name, pipeline in self._base_pipelines().items():
            if y is None:
                meta_features[name] = pipeline.predict(X)
            else:
                meta_features[name] = cross_val_predict(
                    X=X,
                    y=y,
                    estimator=pipeline,
                    cv=self.cv,
                    method='predict',
                    n_jobs=N_JOBS,
                )

        return meta_features

    def fit(self, X, y=None):
        """Fit the meta-model on out-of-fold predictions, then refit the base models.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw training features.
        y : array-like
            Class labels. Required; the ``None`` default exists only for
            signature compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            # Without targets get_metafeatures would take the inference branch
            # and call predict() on pipelines that have not been fitted yet.
            raise ValueError("Blender.fit requires target values `y`.")

        catboost_params = self._params_or(
            self.catboost_params,
            dict(cat_features=cat_features, verbose=False),
        )
        lgbm_params = self._params_or(
            self.lgbm_params,
            dict(
                learning_rate=0.1,
                n_estimators=250,
                boosting_type='gbdt',
                n_jobs=N_JOBS,
                verbose=-100,
            ),
        )
        rf_params = self._params_or(
            self.rf_params,
            dict(
                boosting_type='rf',
                n_jobs=N_JOBS,
                verbose=-100,
                bagging_fraction=0.5,
                feature_fraction=0.5,
                bagging_freq=1,
            ),
        )
        lin_reg_params = self._params_or(self.lin_reg_params, dict())
        meta_model_params = self._params_or(
            self.meta_model_params,
            dict(class_weight='balanced'),
        )

        self.catboost_pipeline = Pipeline([
            ('title', title_adder),
            ('drop', name_remover),
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
            ('regressor', CatBoostRegressor(**catboost_params)),
        ])

        self.lgbm_pipeline = Pipeline([
            ('title', title_adder),
            ('drop', name_remover),
            ('encoder', MEstimateEncoder(cols=cat_features)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
            ('classifier', LGBMRegressor(**lgbm_params)),
        ])

        self.rf_pipeline = Pipeline([
            ('title', title_adder),
            ('drop', name_remover),
            ('encoder', MEstimateEncoder(cols=cat_features)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
            ('classifier', LGBMRegressor(**rf_params)),
        ])

        self.lin_reg_pipe = self._make_linear_pipeline(Ridge(**lin_reg_params))
        self.lin_reg_1_pipe = self._make_linear_pipeline(Ridge(alpha=0.1))

        # Out-of-fold predictions: the meta-model never sees a prediction made
        # by a base model that was trained on the same row.
        meta_features = self.get_metafeatures(X, y)

        self.metamodel = Pipeline([
            ('interaction', PolynomialFeatures(interaction_only=True)),
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression(**meta_model_params)),
        ])
        self.metamodel.fit(meta_features, y)
        self.classes_ = self.metamodel[-1].classes_

        # Refit every base model on all training rows for use in predict().
        for pipeline in self._base_pipelines().values():
            pipeline.fit(X, y)

        return self

    def predict(self, X):
        """Predict class labels for ``X`` with the fitted two-level ensemble."""
        meta_features = self.get_metafeatures(X)
        preds = self.metamodel.predict(meta_features)

        return preds

In [11]:
# Settings for the logistic-regression meta-model of the blender.
meta_model_params = {'class_weight': 'balanced', 'solver': 'liblinear', 'C': 10}

# Outer cross-validation of the whole stacking ensemble; the blender runs its
# own inner cross-validation (cv=5) to build the meta-features.
scores = cross_val_score(
    Blender(meta_model_params=meta_model_params, cv=5),
    X_train,
    y_train,
    cv=CV,
    scoring='accuracy',
    n_jobs=1,
)

plot_accuracy_score(scores)

Scores: [0.8042 0.8322 0.831  0.8451 0.8099]
mean accuracy: 0.824
accuracy std: 0.0152


In [12]:
# Hold-out accuracy of the standalone CatBoost pipeline, refitted on the
# full training split.
catboost_pipeline.fit(X_train, y_train)
accuracy_score(y_test, catboost_pipeline.predict(X_test))

0.8156424581005587

In [13]:
# Fit the stacking ensemble on the full training split.
stacking = Blender(
    meta_model_params=meta_model_params,
    cv=5,
).fit(X_train, y_train)

In [14]:
# Hold-out accuracy of the stacking ensemble.
accuracy_score(y_true=y_test, y_pred=stacking.predict(X_test))

0.8156424581005587

In [15]:
# Coefficients the meta-model assigns to the bias term, the five base
# predictions and their pairwise interactions.
stacking.metamodel.named_steps['classifier'].coef_

array([[ 0.        ,  1.18633116,  0.42008944,  0.32028932, -0.14012998,
         0.0456106 , -1.66828794,  1.04400154, -0.30276032, -0.19804328,
        -0.02762313,  0.21149573,  0.28529655,  0.18755451,  0.2726067 ,
         0.62294017]])