# mini hw3 - ensembles

## Подготовка данных

Загрузите и предобработайте данные (по своему усмотрению) из hw1

In [216]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, accuracy_score

from sklearn.svm import SVC, SVR
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load the hw1 table (features plus the G3 column used as the target later on)
# from a CSV expected in the current working directory.
data = pd.read_csv('train_features_with_answers.csv')

In [3]:
# Distinct values per column: a quick way to spot invalid codes and missing entries.
unique_vals = {col: data[col].unique() for col in data.columns}
for col, values in unique_vals.items():
    print(f"{col}:", values)

school: ['MS' 'GP']
sex: ['M' 'F' 'D' 'C' 'B' 'A']
age: [ nan  15.  17.  20.  18.  16.  19. 161. 181. 151. 116.  21.  22.  -1.
   1.   5.   8.]
address: ['U' 'R' nan]
famsize: ['LE3' 'GT3']
Pstatus: ['T' 'A']
Medu: [1 3 4 2 0]
Fedu: [3 4 1 2 0]
Mjob: ['at_home' 'teacher' 'other' 'services' 'health']
Fjob: ['services' 'other' 'at_home' 'teacher' 'health']
reason: ['course' 'home' 'reputation' 'other']
guardian: ['mother' 'father' 'other']
traveltime: [1 3 2 4]
studytime: [1 2 3 4]
failures: [0 1 2 3]
schoolsup: ['no' 'yes']
famsup: ['no' 'yes']
paid: ['no' 'yes']
activities: ['no' 'yes']
nursery: ['yes' 'no']
higher: ['no' 'yes']
internet: ['yes' 'no']
romantic: ['yes' 'no']
famrel: [4 3 5 1 2]
freetime: [3 5 2 1 4]
goout: [3 4 1 5 2]
Dalc: [2 3 1 5 4]
Walc: [3 5 1 2 4]
health: [3 5 2 1 4]
absences: [ 0 16  4  8  2  1  9  6  5 11  3 10 12 14 18 15 24 22 32 21 13  7]
G3: [ 9 12 13 11 14 16 10 17  8 15  0  6  7 19 18  1  5]


In [4]:
# Ages outside the inclusive 15..22 range are treated as missing.
age_is_valid = data["age"].between(15, 22)
data.loc[~age_is_valid, "age"] = np.nan

# Only "M" and "F" are accepted sex codes; every other value becomes missing.
sex_is_valid = data["sex"].isin(["M", "F"])
data.loc[~sex_is_valid, "sex"] = np.nan

## Адаптивное заполнение пропусков

### Categorical features encoding

In [5]:
# Every non-numeric column holds string categories that must be integer-encoded.
# select_dtypes is the public replacement for the private _get_numeric_data().
cat_columns = data.select_dtypes(exclude=["number", "bool"]).columns.tolist()

for col in cat_columns:
    # factorize() already encodes missing values as -1 by default. The old
    # `na_sentinel` keyword was removed in pandas 2.0 and raises TypeError there.
    # sort=True assigns codes in sorted category order.
    data[col] = pd.factorize(data[col], sort=True)[0]

# Turn the -1 sentinel back into NaN for the columns that are imputed later.
# Series.where returns a float column, which avoids writing NaN into an
# integer column in place (silent upcasting is deprecated in recent pandas).
for col in ["sex", "address"]:
    data[col] = data[col].where(data[col] != -1)

### Разбиение данных

In [6]:
# Fully observed rows are the training pool for the imputation models.
data_without_na = data.dropna()
# Features: everything except the three columns to impute and the G3 column.
X = data_without_na.drop(columns=["sex", "age", "address", "G3"])

# One target Series per column that will be imputed.
y = {col: data_without_na[col] for col in ("sex", "age", "address")}

# Per-target containers; they are filled by the split in the next cell.
X_train_d, y_train_d = {}, {}
X_test_d, y_test_d = {}, {}

In [7]:
# Split once per target; the fixed random_state keeps the split repeatable.
for col in ("sex", "age", "address"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y[col], test_size=0.33, random_state=42
    )
    X_train_d[col], X_test_d[col] = X_train, X_test
    y_train_d[col], y_test_d[col] = y_train, y_test

In [8]:
# stage name -> [features per target, labels per target]
data_prep = {
    "train": [X_train_d, y_train_d],
    "test": [X_test_d, y_test_d],
}

In [9]:
target_columns = ["sex", "age", "address"]
train_features, train_labels = data_prep["train"]
test_features, test_labels = data_prep["test"]

# Fit one linear SVC per target column. Note that "age" is handled as a
# class label here, not as a continuous value.
classifiers = {}
for col in target_columns:
    cls = SVC(kernel='linear', class_weight="balanced")
    cls.fit(train_features[col], train_labels[col])
    classifiers[col] = cls

# Report hold-out accuracy for every imputation model.
for col in target_columns:
    prediction = classifiers[col].predict(test_features[col])
    score = accuracy_score(test_labels[col], prediction)
    print(col.upper(), round(score, 3))

SEX 0.627
AGE 0.209
ADDRESS 0.672


### Заполнение пропусков

In [10]:
# Rows with at least one missing value, reduced to the imputation features.
data_nans = data[data.isna().any(axis=1)]
X_nans = data_nans.drop(columns=["sex", "age", "address", "G3"])

# Feature rows for each target, restricted to rows where that target is missing.
X_nans_sex = X_nans.loc[data_nans["sex"].isna()]
X_nans_age = X_nans.loc[data_nans["age"].isna()]
X_nans_add = X_nans.loc[data_nans["address"].isna()]

# Predicted fill values, in the row order of the corresponding X_nans_* frame.
predictions = {
    "sex": classifiers["sex"].predict(X_nans_sex),
    "age": classifiers["age"].predict(X_nans_age),
    "address": classifiers["address"].predict(X_nans_add),
}

In [11]:
# Write the predicted values into the missing cells of each column.
# predictions[col] is ordered like the rows where `col` is NaN, so one masked
# assignment replaces the former row-by-row iterrows() loop.
for col in ["sex", "age", "address"]:
    missing = data[col].isna()
    # Guard keeps the cell a no-op when it is re-run after the gaps are filled.
    if missing.any():
        data.loc[missing, col] = predictions[col]
    

## Обоснуйте выбор слабых (базовых) алгоритмов

В данной задаче решил взять модели, которые были использованы в первом домашнем задании. Не брал только наивный Байес, потому что он сильно зависит от корреляции признаков, а это требует отдельной обработки данных для него. В силу желания автоматизировать процесс Байеса я исключил. В первом домашнем задании нужно было сделать модель, выдающую ответы на основе предсказаний базовых. Сейчас будем решать ту же самую задачу, но более умным способом.

По поводу метрик. На итоговые предсказания я смотрю через `MSE` и `Accuracy`. Уже во время выполнения задания я осознал, почему мне нужно смотреть именно на обе эти метрики. Здесь очень явно прослеживается Bias-Variance trade-off. `MSE` даёт мне понять, какая у меня дисперсия ошибок, а `Accuracy` показывает, насколько точно я попадаю в нужный класс.

## Постройте решение на основе подхода Blending

In [12]:
def get_classifier_models():
    """Create fresh, unfitted base classifiers and the blending meta-classifier.

    Returns:
        tuple: ``(models, meta_model)``. ``models`` maps the short names
        "knn", "svm" and "lr" to new estimator instances; ``meta_model`` is
        a new default ``SVC``.
    """
    base_models = dict(
        knn=KNeighborsClassifier(),
        # probability=True so the SVC can provide predict_proba for blending.
        svm=SVC(probability=True),
        lr=LogisticRegression(),
    )
    return base_models, SVC()

In [73]:
def get_classifier_grids():
    """Build the hyper-parameter grids for the classification blend.

    Returns:
        tuple: ``(models_grid, meta_model_grid)``.
        ``models_grid`` maps each base model name ("knn", "svm", "lr") to a
        value accepted as ``param_grid`` by ``GridSearchCV``.
        ``meta_model_grid`` is the grid for the SVC meta-model.

    Notes:
        The "lr" entry is a list of two grids because the lbfgs solver only
        supports the l2 penalty. A single combined grid would contain
        combinations that fail during fitting.
    """
    lr_common = {
        "C": [1, 2, 3],
        "max_iter": [100, 200],
    }

    models_grid = {
        "knn": {
            "n_neighbors": np.arange(1, 15)
        },
        "svm": {
            "class_weight": ["balanced"],
            "kernel": ["linear", "poly", "rbf"],
            "C": np.linspace(0.001, 10, 20)
        },
        "lr": [
            # saga supports both l1 and l2.
            {"solver": ["saga"], "penalty": ["l1", "l2"], **lr_common},
            # 'lbfgs' (the original had the typo 'lgbfs') supports l2 only.
            {"solver": ["lbfgs"], "penalty": ["l2"], **lr_common},
        ]
    }

    meta_model_grid = {
        # None means equal class weights; "uniform" is not a valid SVC value.
        "class_weight": ["balanced", None],
        "kernel": ["linear", "poly", "rbf"],
        "C": np.linspace(0.001, 10, 20)
    }
    return models_grid, meta_model_grid

In [66]:
def get_regressor_models():
    """Create fresh, unfitted base regressors and the blending meta-regressor.

    Returns:
        tuple: ``(models, meta_model)``. ``models`` maps the short names
        "knn", "svr" and "lr" to new estimator instances; ``meta_model`` is
        a new default ``SVR``.
    """
    base_models = dict(
        knn=KNeighborsRegressor(),
        svr=SVR(),
        lr=LinearRegression(),
    )
    return base_models, SVR()

In [74]:
def get_regressor_grids():
    """Build the hyper-parameter grids for the regression blend.

    Returns:
        tuple: ``(models_grid, meta_model_grid)``. ``models_grid`` maps each
        base model name ("knn", "svr", "lr") to its parameter grid;
        ``meta_model_grid`` is the grid for the SVR meta-model.
    """
    def svr_grid():
        # Fresh lists/arrays on every call so the two SVR grids share no state.
        return {
            "kernel": ["linear", "poly", "rbf"],
            "C": np.linspace(0.001, 10, 20),
        }

    models_grid = {}
    models_grid["knn"] = {"n_neighbors": np.arange(1, 25)}
    models_grid["svr"] = svr_grid()
    # LinearRegression has nothing to tune here; a one-point grid keeps the
    # same GridSearchCV code path as for the other models.
    models_grid["lr"] = {"fit_intercept": [True]}

    return models_grid, svr_grid()

In [16]:
def fit_predict_CV(X, y, model_type, params_grid, verbose=False, cv=None, scoring=None):
    """Grid-search an estimator and return the best one, refitted on all of ``X``.

    Despite the name, this function makes no predictions; it only fits.

    Args:
        X: Training features.
        y: Training targets.
        model_type: Unfitted estimator instance to tune.
        params_grid: Parameter grid passed to ``GridSearchCV``.
        verbose: When true, print the best parameters and the best CV score.
        cv: Cross-validation splitter. Defaults to ``KFold()``, the splitter
            that was previously hard-coded.
        scoring: Optional scoring passed to ``GridSearchCV``. ``None`` keeps
            the estimator's default scorer.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    if cv is None:
        cv = KFold()

    model_cv = GridSearchCV(model_type, params_grid, cv=cv, scoring=scoring, refit=True)
    model_cv.fit(X, y)

    if verbose:
        print("Best hyperparameters: ", model_cv.best_params_)
        print("Best score: ", model_cv.best_score_)

    return model_cv.best_estimator_

In [17]:
def train_base_models(data, models, grid, verbose=False):
    """Tune and fit every base model on the same data.

    Args:
        data: Mapping with keys "X" (features) and "y" (targets).
        models: Mapping of model name to unfitted estimator.
        grid: Mapping of model name to its parameter grid.
        verbose: When true, print the model name and the search summary.

    Returns:
        dict: A new mapping of model name to tuned, fitted estimator, in the
        same key order as ``models``. The ``models`` argument is not modified
        (it used to be overwritten in place).
    """
    fitted_models = {}
    for model_name, model in tqdm(models.items()):
        if verbose:
            print(f"Model: {model_name}")
        fitted_models[model_name] = fit_predict_CV(
            data["X"], data["y"], model, grid[model_name], verbose=verbose
        )

    return fitted_models

In [18]:
def predict_base_models(X, models, models_type=None):
    """Stack the base models' outputs column-wise into meta-features.

    Args:
        X: Feature matrix passed to every base model.
        models: Mapping of model name to fitted estimator. Column order
            follows the mapping's iteration order.
        models_type: "cls" to stack ``predict_proba`` outputs, or "reg" to
            stack ``predict`` outputs as one column per model.

    Returns:
        numpy.ndarray with one row per sample, or ``None`` when ``models``
        is empty.

    Raises:
        ValueError: If ``models_type`` is ``None`` or not "cls" / "reg".
            Unknown values previously returned ``None`` silently.
    """
    if models_type is None:
        raise ValueError("models_type must be non None")
    if models_type not in ("cls", "reg"):
        raise ValueError(f"models_type must be 'cls' or 'reg', got {models_type!r}")

    blocks = []
    for model in models.values():
        if models_type == "cls":
            blocks.append(model.predict_proba(X))
        else:
            # Add a trailing axis so each regressor contributes one column.
            blocks.append(model.predict(X)[:, None])

    if not blocks:
        return None
    # A single stack instead of re-allocating the matrix for every model.
    return np.hstack(blocks)

In [19]:
def train_meta_model(data, models, meta_model, grid, models_type=None, verbose=False):
    """Tune and fit the meta-model on the base models' outputs.

    Args:
        data: Mapping with keys "X" (features) and "y" (targets).
        models: Mapping of fitted base models.
        meta_model: Unfitted meta-estimator.
        grid: Parameter grid for the meta-estimator.
        models_type: Passed through to ``predict_base_models``.
        verbose: Passed through to ``fit_predict_CV``.

    Returns:
        The tuned, fitted meta-model.
    """
    meta_features = predict_base_models(data["X"], models, models_type)
    fitted_meta_model = fit_predict_CV(
        meta_features, data["y"], meta_model, grid, verbose=verbose
    )
    return fitted_meta_model

In [20]:
def predict_blending(data, models, meta_model, models_type=None):
    """Predict with the blend: base-model outputs fed into the meta-model.

    Args:
        data: Mapping whose "X" entry holds the features to predict on.
        models: Mapping of fitted base models.
        meta_model: Fitted meta-estimator.
        models_type: Passed through to ``predict_base_models``.

    Returns:
        The meta-model's predictions.
    """
    meta_features = predict_base_models(data["X"], models, models_type)
    return meta_model.predict(meta_features)

In [55]:
# Target is G3; all remaining columns (including the imputed ones) are features.
X = data.drop(columns=["G3"])
y = data["G3"]

# First 80/20 split holds out x_val (used to fit the meta-model).
# Second 80/20 split carves the test part out of the remaining rows.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, shuffle=True, random_state=42
)

# The scaler is fitted on the training part only and reused for the other parts.
scaler = StandardScaler()
scaler.fit(x_train)

data_train = dict(X=scaler.transform(x_train), y=y_train)
data_val = dict(X=scaler.transform(x_val), y=y_val)
data_test = dict(X=scaler.transform(x_test), y=y_test)

### Модели классификации

In [68]:
# Fresh (unfitted) classifiers and their search grids for the blending run.
models, meta_model = get_classifier_models()
grid, meta_model_grid = get_classifier_grids()

In [69]:
# Base models are tuned on the training part ...
best_models = train_base_models(data_train, models, grid, verbose=True)
# ... and the meta-model is tuned on their outputs for the held-out part.
trained_meta_model = train_meta_model(
    data=data_val,
    models=best_models,
    meta_model=meta_model,
    grid=meta_model_grid,
    models_type="cls",
)

  0%|          | 0/3 [00:00<?, ?it/s]

Model: knn
Best hyperparameters:  {'n_neighbors': 9}
Best score:  0.18275862068965515
Model: svm
Best hyperparameters:  {'C': 3.031, 'class_weight': 'balanced', 'kernel': 'rbf'}
Best score:  0.16551724137931031
Model: lr
Best hyperparameters:  {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Best score:  0.12068965517241378


In [70]:
# Blend predictions on the test part, rounded to whole grades.
raw_predictions = predict_blending(data_test, best_models, trained_meta_model, models_type="cls")
predictions = np.round(raw_predictions)

y_true = data_test["y"]
accuracy = accuracy_score(y_true, predictions)
mse = mean_squared_error(y_true, predictions)

print(f"Accuracy: {accuracy:.3}")
print(f"MSE:      {mse:.5}")

Accuracy: 0.0685
MSE:      17.178


Лучшие `результаты`, которые удалось получить 
- Accuracy: 0.219
- MSE:      14.877

### Модели регрессии

In [75]:
# Fresh (unfitted) regressors and their search grids for the blending run.
models, meta_model = get_regressor_models()
grid, meta_model_grid = get_regressor_grids()

In [76]:
# Base models are tuned on the training part ...
best_models = train_base_models(data_train, models, grid, verbose=True)
# ... and the meta-model is tuned on their outputs for the held-out part.
trained_meta_model = train_meta_model(
    data=data_val,
    models=best_models,
    meta_model=meta_model,
    grid=meta_model_grid,
    models_type="reg",
)

  0%|          | 0/3 [00:00<?, ?it/s]

Model: knn
Best hyperparameters:  {'n_neighbors': 22}
Best score:  0.14643644936134298
Model: svr
Best hyperparameters:  {'C': 2.1060526315789474, 'kernel': 'rbf'}
Best score:  0.2200681277303219
Model: lr
Best hyperparameters:  {'fit_intercept': True}
Best score:  0.1365085884316724


In [77]:
# Continuous blend output is rounded to whole grades so accuracy is meaningful.
raw_predictions = predict_blending(data_test, best_models, trained_meta_model, models_type="reg")
predictions = np.round(raw_predictions)

y_true = data_test["y"]
accuracy = accuracy_score(y_true, predictions)
mse = mean_squared_error(y_true, predictions)

print(f"Accuracy: {accuracy:.3}")
print(f"MSE:      {mse:.5}")

Accuracy: 0.219
MSE:      8.0822


## Постройте решение на основе подхода Stacking

Правила:
- Реализуйте пайплайн обучения и предсказания (например, sklearn.pipeline или класс)
- Проведите оптимизацию пайплайна
- Оцените вклад каждого базового алгоритма в итоговое предсказание

In [247]:
models = get_classifier_models()[0]

# Each base model gets its own scaler, so scaling is fitted inside every CV fold.
estimators = [
    (model_name, make_pipeline(StandardScaler(), model))
    for model_name, model in models.items()
]

# shuffle=True without a seed gave different folds (and metrics) on every run;
# a fixed random_state makes the stacking result reproducible.
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=folds
)

In [248]:
# Single 80/20 split for stacking; this overwrites the x_train/x_test/y_train/y_test
# names used by the blending section above.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)

In [249]:
# fit() returns the estimator itself, so the fitted stack stays bound to `clf`.
clf = clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

rounded_predictions = np.round(predictions)
accuracy = accuracy_score(y_test, rounded_predictions)
mse = mean_squared_error(y_test, rounded_predictions)

print(f"Accuracy: {accuracy:.3}")
print(f"MSE:      {mse:.5}")

Accuracy: 0.231
MSE:      7.2198


### Оценка вклада моделей

In [191]:
# Leave-one-model-out ablation: refit the stack without each base model in turn
# and record (accuracy, MSE) on the same hold-out set.

# The hold-out split does not depend on the excluded model, so it is computed
# once instead of on every iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)

metrics = dict()
for exclude_model_name in models.keys():
    estimators = [
        (model_name, make_pipeline(StandardScaler(), model))
        for model_name, model in models.items()
        if model_name != exclude_model_name
    ]

    # Seeded so every ablation run uses identical folds; otherwise differences
    # between runs would mix model contribution with fold randomness.
    # NOTE(review): the full stack above is built with n_splits=3 while this
    # ablation uses 5 - confirm whether the difference is intended.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        cv=folds
    )
    clf = clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)

    accuracy = accuracy_score(y_test, np.round(predictions))
    mse = mean_squared_error(y_test, np.round(predictions))
    metrics[exclude_model_name] = (accuracy, mse)

In [192]:
# One report per excluded base model: (accuracy, MSE) of the stack without it.
for model_name in models:
    accuracy_without, mse_without = metrics[model_name]
    print("Metrics without", model_name)
    print(f"Accuracy: {accuracy_without:.3}")
    print(f"MSE:      {mse_without:.5}")
    print()

Metrics without knn
Accuracy: 0.209
MSE:      8.033

Metrics without svm
Accuracy: 0.231
MSE:      7.1758

Metrics without lr
Accuracy: 0.176
MSE:      7.3736

