# Импорт библиотек

In [90]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score

from matplotlib import pyplot as plt
from mlens.ensemble import SuperLearner

In [91]:
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Загрузка данных

In [53]:
data = pd.read_csv('data/titanic_train.csv')

In [54]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Подготовка данных

In [55]:
data.Age = data.Age.fillna(data.Age.mean()) # заменяем пустые значения на среднее значение по признаку
data.Ticket = data.Ticket.fillna("_MISSING_") # заменяем пустые значения на _MISSING_ 
data.Fare = data.Fare.fillna(-999)
data.Embarked = data.Embarked.fillna("_MISSING_")
data = data.drop(["Cabin"], axis=1)

In [56]:
for column in ["Name", "Sex", "Ticket", "Embarked"]:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column]) # производим кодирование категориальных признаков

# Разбиение выборки на тренировочную и валидационную

In [57]:
X = data.drop(["Survived"], axis=1)
y = data.Survived

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) 

# Обучение моделей, классификация и оцека результата

In [59]:
nearest_neighbors = KNeighborsClassifier()
naive_bayes = GaussianNB()
decision_tree = DecisionTreeClassifier()
logistic_regression = LogisticRegression()

gradient_boosting_classifier = GradientBoostingClassifier()
bagging_classifier = BaggingClassifier()
random_forest_classifier = RandomForestClassifier()
voting_classifier = VotingClassifier(estimators=[("naive_bayes", GaussianNB()),
                                                 ("decision_tree", DecisionTreeClassifier()),
                                                 ("logistic_regression", LogisticRegression()),
                                                 ("nearest_neighbors", KNeighborsClassifier())], voting="hard")

In [65]:
for model in [nearest_neighbors, naive_bayes, decision_tree, logistic_regression,
              gradient_boosting_classifier, bagging_classifier, random_forest_classifier, voting_classifier]:
    model.fit(X_train, y_train)

###### Из таблицы, представленной ниже можно увидеть, что модели, основанные на композициях алгоритмов показывают лучшие результаты относительно простейших моделей. 
###### Стоит так же отметить, что результат работы "продвинутых" моделей наиболее сильно зависит от параметров, с которыми инициализировалась модель, в связи с этим при выполнении лабораторной стоит попробовать подобрать для них параметры (по примеру из лаб №7) 

In [66]:
metrics_list = []
for model, model_name in [(nearest_neighbors, "nearest_neighbors"),
                          (naive_bayes, "naive_bayes"),
                          (decision_tree, "decision_tree"),
                          (logistic_regression, "logistic_regression"),
                          
                          (gradient_boosting_classifier, "gradient_boosting_classifier"),
                          (bagging_classifier, "bagging_classifier"),
                          (random_forest_classifier, "random_forest_classifier"),
                          (voting_classifier, "voting_classifier")]:
    
    prediction = model.predict(X_test)
    accuracy = accuracy_score(prediction, y_test)
    precision = precision_score(prediction, y_test)
    f1 = f1_score(prediction, y_test)
    
    metrics_list.append((model_name, accuracy, precision, f1))
    
pd.DataFrame(columns=["model_name", "accuracy_score", "precision_score", "f1_score"], data=metrics_list)

Unnamed: 0,model_name,accuracy_score,precision_score,f1_score
0,nearest_neighbors,0.7,0.44,0.54
1,naive_bayes,0.79,0.78,0.75
2,decision_tree,0.76,0.74,0.71
3,logistic_regression,0.78,0.67,0.71
4,gradient_boosting_classifier,0.82,0.7,0.76
5,bagging_classifier,0.81,0.71,0.75
6,random_forest_classifier,0.84,0.74,0.79
7,voting_classifier,0.8,0.64,0.72


# Блендинг и стекинг
###### Блендинг представяет из себя простейшее усреднение ответов различных моделей
###### Стекинг в свою очередь является более продвинутым методом ансамблирования алгоритмов. Основная идея стекинга заключается в использовании базовых классификаторов для получения предсказаний (метапризнаков) и использовании их как признаков для некоторого ”обобщающего” алгоритма (метаалгоритма).

In [145]:
model_predictions = []
for model in [gradient_boosting_classifier, bagging_classifier, random_forest_classifier, voting_classifier]:
    model_predictions.append(model.predict(X_test))

blended_prediction = []
for index in range(len(X_test)):
    element_predictions = [model_prediction[index] for model_prediction in model_predictions]
    blended_prediction.append(max(set(element_predictions), key = list(element_predictions).count))
    
print(f"blended_model accuracy_score = {accuracy_score(blended_predictions, y_test)}")
print(f"blended_model precision_score = {precision_score(blended_predictions, y_test)}")
print(f"blended_model f1_score = {f1_score(blended_predictions, y_test)}")

blended_model accuracy_score = 0.8295964125560538
blended_model precision_score = 0.6741573033707865
blended_model f1_score = 0.759493670886076


In [147]:
stacking_model = SuperLearner()
stacking_model.add([gradient_boosting_classifier, voting_classifier, bagging_classifier, random_forest_classifier])
stacking_model.add_meta(logistic_regression)
stacking_model.fit(X_train, y_train)

prediction = stacking_model.predict(X_test)
print(f"stacking_model accuracy_score = {accuracy_score(prediction, y_test)}")
print(f"stacking_model precision_score = {precision_score(prediction, y_test)}")
print(f"stacking_model f1_score = {f1_score(prediction, y_test)}")

stacking_model accuracy_score = 0.8161434977578476
stacking_model precision_score = 0.6966292134831461
stacking_model f1_score = 0.7515151515151515
