# Исследование моделей

## Библиотеки

In [28]:
import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

## Загрузка данных, предобработка

In [25]:
# Path to the data file: <parent of the current working directory>/data/dummy.csv.
data_path = os.path.join(
    os.path.split(os.getcwd())[0],  # head of the split == parent directory of the cwd
    'data',
    'dummy.csv',
)

In [26]:
# Load the dataset from `data_path` into a DataFrame.
# NOTE(review): the preview printed below contains an 'Unnamed: 0' column, which looks
# like a saved row index — consider `index_col=0` if that column is not a feature.
data = pd.read_csv(data_path)

In [30]:
# Sanity check: display the first three rows of the loaded frame.
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,C,TST,date
0,0.808231,0.673377,0.770908,0.571957,0.921815,0.457309,0.305348,0.534712,0.621063,0.177041,0.895601,0.877007,0.388681,0.001421,0.673024,0.822874,0.459102,0.071372,0.734894,0.232367,0.356942,0.472879,0.909957,0.639476,0.969845,0.164019,0.176951,0.16729,0.610445,0.774414,0.742411,0.270831,0.355087,0.959222,0.61404,0.409367,0.187739,0.066762,0.671984,0.014352,0.390243,0.730427,0.439314,0.596509,1,0.948128,0.242574,1979-01-01
1,0.045224,0.860874,0.249191,0.735388,0.10838,0.740083,0.928146,0.937495,0.825814,0.551267,0.938241,0.610978,0.159465,0.764799,0.109442,0.249858,0.874143,0.148369,0.002395,0.707496,0.332426,0.927871,0.015264,0.926864,0.899994,0.988074,0.059827,0.790916,0.187519,0.596276,0.096944,0.244653,0.061271,0.054832,0.775601,0.85379,0.718961,0.978334,0.333916,0.978914,0.071489,0.113931,0.183773,0.326431,0,0.246897,0.73095,1979-01-02
2,0.068763,0.512598,0.620058,0.393764,0.717195,0.623042,0.542581,0.33128,0.730439,0.834504,0.615643,0.3409,0.462791,0.480665,0.837764,0.997498,0.368087,0.472072,0.927994,0.352455,0.476502,0.375421,0.114852,0.869956,0.94518,0.087041,0.330904,0.592855,0.658857,0.592798,0.929559,0.044761,0.804134,0.981568,0.513337,0.584267,0.527923,0.693056,0.511603,0.838649,0.851771,0.073625,0.927801,0.842541,1,0.589659,0.048698,1979-01-03


## EDA и очистка данных

### Типы данных и пропуски

In [32]:
# Типы данных.
# data.dtypes

In [11]:
# Convert the `date` column to pandas datetime values (overwrites the column in place).
data.date = pd.to_datetime(data.date)

In [17]:
# Encode every date as its integer ordinal (consecutive days map to consecutive integers).
data.date.apply(lambda x: x.toordinal())

0      722450
1      722451
2      722452
3      722453
4      722454
        ...  
995    723445
996    723446
997    723447
998    723448
999    723449
Name: date, Length: 1000, dtype: int64

In [14]:
# Alternative encoding: whole days elapsed since the earliest date in the column.
(data.date - data.date.min()).dt.components['days']

0        0
1        1
2        2
3        3
4        4
      ... 
995    995
996    996
997    997
998    998
999    999
Name: days, Length: 1000, dtype: int64

## Модели

### Классические

In [None]:
# Линейная регрессия с L2-регуляризацией.
from sklearn.linear_model import Ridge
# Случайный лес.
from sklearn.ensemble import RandomForestRegressor
# Бустинги.
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Мультитаргет-регрессия.
from sklearn.multioutput import MultiOutputRegressor

In [None]:
# Multi-target regression example: MultiOutputRegressor wraps a single Ridge estimator.
ridge = Ridge()
# `fit` returns the fitted estimator itself, so construction and fitting can be chained.
model = MultiOutputRegressor(estimator=ridge).fit(X_train, y_train)

# Score measured on the training data.
score = model.score(X_train, y_train)
print("Training score:", score)

# Predictions for the hold-out features.
y_pred = model.predict(X_test)

## Мета-алгоритмы

In [None]:
# Voting ensemble.
from sklearn.ensemble import VotingRegressor

# Example: combine a CatBoost and an XGBoost regressor in one voting ensemble.
voting_algorithm = VotingRegressor(
    estimators=[
        ('CB', CatBoostRegressor()),
        ('XGB', XGBRegressor()),
    ]
)

# Fit on the training split, then predict the hold-out features.
voting_algorithm.fit(X_train, y_train)
y_pred = voting_algorithm.predict(X_test)

In [None]:
# Bagging.
from sklearn.ensemble import BaggingRegressor

# Example: ten Ridge models bagged together.
# `estimator` is the current name of the parameter formerly called `base_estimator`
# (renamed in scikit-learn 1.2; the old keyword is rejected by newer releases).
# The ensemble is fitted on the training split, like the other examples in this notebook,
# so that the predictions for `X_test` below come from a model that has not seen that data.
bagging = BaggingRegressor(estimator=Ridge(), n_estimators=10, random_state=0).fit(X_train, y_train)

y_pred = bagging.predict(X_test)

In [None]:
# Stacking.
from sklearn.ensemble import StackingRegressor

# Example: CatBoost and XGBoost as base learners, a small random forest as the final estimator.
estimators = [('CB', CatBoostRegressor()), ('XGB', XGBRegressor())]
reg = StackingRegressor(
    final_estimator=RandomForestRegressor(random_state=42, n_estimators=10),
    estimators=estimators,
)

# Fit on the training split; the last expression shows the score on the test split.
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

### Нейросеть

In [None]:
from keras.models import Sequential
from keras.layers import Dense

## Метрики

In [None]:
def evraz_metric(answers: pd.DataFrame, user_csv: pd.DataFrame):
    """
    Model quality metric proposed by the EVRAZ organisers.

    A prediction is a hit when its absolute error is strictly below the tolerance:
    0.02 for column 'C' and 20 for column 'TST'.

    :param answers: pd.DataFrame, dataset with the actual target values ('C' and 'TST' columns).
    :param user_csv: pd.DataFrame, dataset with the predicted target values ('C' and 'TST' columns).
    :return: total number of hits over both targets divided by twice the number of rows in answers['C'].
    """
    def _hits(column, tolerance):
        # 1 where the absolute error is strictly inside the tolerance, 0 elsewhere.
        abs_error = np.abs(np.array(answers[column]) - np.array(user_csv[column]))
        return (abs_error < tolerance).astype(np.int64)

    # Carbon content in the metal.
    carbon_hits = _hits('C', 0.02)
    # Metal temperature.
    temperature_hits = _hits('TST', 20)

    n_rows = np.size(answers['C'])

    return (carbon_hits.sum() + temperature_hits.sum()) / 2 / n_rows


def median_absolute_percentage_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Median absolute percentage error (MdAPE), returned as a fraction (0.1 means 10%).

    :param y_true: array-like, actual values of the target variable.
    :param y_pred: array-like, predicted values of the target variable.
    :return: float, median of |y_pred - y_true| / max(|y_true|, eps).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Divide by the magnitude of the target: dividing by the signed value would turn the
    # "absolute" error negative for negative targets. The machine-epsilon floor keeps the
    # ratio finite when a target is exactly zero.
    denominator = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
    return float(np.median(np.abs(y_pred - y_true) / denominator))


from typing import Dict  # needed by the return annotation, which is evaluated when the def runs


def metrics_stat(y_true: np.array, y_pred: np.array) -> Dict[str, float]:
    """
    Compute the main regression metrics.

    :param y_true: np.array, actual values of the target variable.
    :param y_pred: np.array, predicted values of the target variable.
    :return: dict mapping the metric names 'mape', 'mdape', 'rmse' and 'r2' to their values.
    """
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    # RMSE = square root of the MSE of each output, averaged over the outputs.
    # Computed by hand because the `squared=False` shortcut of `mean_squared_error`
    # is deprecated and rejected by recent scikit-learn releases.
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    rmse = float(np.mean(np.sqrt(per_output_mse)))
    r2 = r2_score(y_true, y_pred)
    return {'mape': mape, 'mdape': mdape, 'rmse': rmse, 'r2': r2}

## Пайплайн (классические модели)

In [None]:
from sklearn.metrics import make_scorer

# Short module-level name for the MAPE metric; it is used here and when scoring the tuned pipeline.
mape = mean_absolute_percentage_error

# greater_is_better=False marks MAPE as a loss, so the model search prefers lower values.
scorer = make_scorer(mape, greater_is_better=False)

# Integer-encode 'building_type'; the encoder is fitted on the training part only.
lb = LabelEncoder().fit(x_train['building_type'].values)

# NOTE(review): `transform` will fail on categories of x_test that were absent from x_train — confirm
# that both splits share the same set of building types.
x_train['building_type'] = lb.transform(x_train['building_type'].values)
x_test['building_type'] = lb.transform(x_test['building_type'].values)


def _build_pipeline(estimator):
    """Return a Pipeline: scale `real_features`, one-hot encode `cat_features`, then apply `estimator`."""
    column_transformer = ColumnTransformer(  # OHE for categorical, scaler for real-valued features
        transformers=[
            ('real', StandardScaler(), real_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ],
        n_jobs=4,
    )
    return Pipeline(
        steps=[
            ('column_transformer', column_transformer),
            ('model', estimator),
        ]
    )


def hyperopt(estimator, params):
    """
    Tune `estimator` inside the preprocessing pipeline with a grid search and score the result.

    Reads the module-level `x_train`, `y_train`, `x_test`, `y_test`, `real_features`,
    `cat_features` and `scorer`.

    :param estimator: model placed as the 'model' step of the pipeline.
    :param params: parameter grid for GridSearchCV; keys address pipeline steps (e.g. 'model__max_depth').
    :return: tuple (MAPE on the training data, MAPE on the test data, best parameters).
    """
    grid = GridSearchCV(
        estimator=_build_pipeline(estimator),
        param_grid=params,
        scoring=scorer,
        cv=3,
        verbose=2,
    )
    grid.fit(x_train, y_train)

    best_params = grid.best_params_
    # With the default refit=True the search has already refitted the pipeline with the best
    # parameters on all of x_train, so building and fitting a second pipeline is unnecessary.
    best_pipeline = grid.best_estimator_

    score_train = mean_absolute_percentage_error(y_train, best_pipeline.predict(x_train))
    score_test = mean_absolute_percentage_error(y_test, best_pipeline.predict(x_test))

    return score_train, score_test, best_params

In [None]:
# The estimator tuned below is not imported anywhere else in the notebook.
from sklearn.ensemble import GradientBoostingRegressor

# Search space; the 'model__' prefix routes every parameter to the 'model' step of the pipeline.
param_grid = {
    'model__n_estimators': [10],
    'model__learning_rate': [0.01, 0.1, 0.3, 0.5],
    'model__min_samples_split': [2, 12, 500],
    'model__max_depth': [5, 9, 12],
    'model__subsample': [0.5, 0.8, 1],
}

hyperopt(GradientBoostingRegressor(), param_grid)

## Sample 2

In [None]:
# Names used by this cell that are not imported anywhere else in the notebook.
from sklearn import set_config
from sklearn.linear_model import LogisticRegression

# Data transformers.
numeric_transformer = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='median')),  # an imputer could be added here to fill missing values
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Data preprocessing: numeric columns are scaled, categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)])

# Main pipeline: preprocessing followed by the classifier.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))])

# Display the pipeline as a diagram.
set_config(display='diagram')
pipe

In [None]:
# RobustScaler is one of the candidates below and is not imported anywhere else in the notebook.
from sklearn.preprocessing import RobustScaler

# Search space over the preprocessing (which scaler to use) and the model (regularisation C).
parameters = {
    'preprocessor__num__scaler': [StandardScaler(), RobustScaler()],
    'classifier__C': [100, 10, 1, 0.1, 0.01, 0.001]
}

In [None]:
# The splitter used below is not imported anywhere else in the notebook.
from sklearn.model_selection import StratifiedShuffleSplit

# Parameter search: features are the numeric plus categorical columns, target is `target_col`.
X_train, y_train = data[num_cols + cat_cols], data[target_col]
# Five stratified random splits, 30% of the rows held out in each; seeded for reproducibility.
cross_val = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

grid = GridSearchCV(pipe, parameters, cv=cross_val, scoring='roc_auc').fit(X_train, y_train)

In [None]:
# Report the best parameter combination found by the grid search and its score.
print("Лучшие параметры:", grid.best_params_)
print("Лучший score:", grid.best_score_)

In [None]:
# CatBoostClassifier is not imported anywhere else in the notebook.
from catboost import CatBoostClassifier

# Model.
model = CatBoostClassifier(iterations=100,  # number of trees lowered to 100 to speed up the computation
                           cat_features=cat_cols,  # categorical features
                           eval_metric='AUC:hints=skip_train~false',  # metric
                           verbose=False)  # logging output

# Parameter grid for the search. It has its own name so that it does not overwrite `grid`,
# the fitted GridSearchCV object whose results are printed by an earlier cell.
catboost_grid = {'learning_rate': [0.01, 0.03, 0.05, 0.1],
                 'depth': [4, 6, 10],
                 'l2_leaf_reg': [1, 3, 5, 7, 9]}

# Search for the best parameters.
grid_search_result = model.grid_search(catboost_grid,
                                       X=X_train,
                                       y=y_train,
                                       cv=5,
                                       refit=True,
                                       shuffle=True,
                                       stratified=True,
                                       verbose=False,
                                       plot=True)

In [None]:
# CatBoost classifier rebuilt with the best parameters returned by the grid search
# (`grid_search_result['params']` is unpacked into the constructor).
best_boost = CatBoostClassifier(iterations=100,
                                cat_features=cat_cols,
                                custom_metric='AUC:hints=skip_train~false',
                                verbose=False,
                                **grid_search_result['params'])

# Fit on the same training data that was used for the search.
best_boost.fit(X_train, y_train)

In [None]:
# Cross-validation results of the grid search, shown as a table.
pd.DataFrame(grid_search_result['cv_results'])

In [None]:
# Best parameters found by the CatBoost grid search.
print("Лучшие параметры:", grid_search_result['params'])

# Take the score from the cross-validation table instead of a number pasted in by hand,
# which goes stale as soon as the search is re-run.
# NOTE(review): assumes the table has a 'test-AUC...-mean' column — confirm against the
# `cv_results` table displayed above.
cv_results = grid_search_result['cv_results']
auc_mean_key = next(
    (key for key in cv_results if key.startswith('test-AUC') and key.endswith('-mean')),
    None,
)
if auc_mean_key is None:
    raise KeyError("no 'test-AUC...-mean' column in grid_search_result['cv_results']")
print("Лучший score:", max(cv_results[auc_mean_key]))

## Пайплайн (нейросеть)

In [None]:
def create_data(n):
    """
    Generate a synthetic regression dataset with three features and two targets.

    Every feature is a linear trend (i / 100 for row i) plus uniform noise and a constant
    offset; every target is a signed combination of the features plus uniform noise.
    Randomness comes from the global NumPy generator, so `np.random.seed` controls it.

    :param n: int, number of samples.
    :return: tuple (X, Y) with X of shape (n, 3) and Y of shape (n, 2).
    """
    trend = np.arange(n) / 100

    def _noise(low, high):
        # One uniform draw per sample; the calls below keep the order x1, x2, x3, y1, y2.
        return np.random.uniform(low, high, size=n)

    x1 = trend + _noise(-1, 3)
    x2 = trend + _noise(-3, 5) + 2
    x3 = trend + _noise(-6, 5) - 3

    y1 = x1 - x2 + x3 + _noise(-2, 2)
    y2 = x1 + x2 - x3 + 5 + _noise(-1, 3)

    # column_stack keeps both outputs two-dimensional, including for n == 0.
    X = np.column_stack((x1, x2, x3))
    Y = np.column_stack((y1, y2))
    return X, Y

# Generate 450 synthetic samples.
X, Y = create_data(n=450)

# Plot the target columns against the sample index.
plt.plot(Y)
plt.show()

print("X:", X.shape, "Y:", Y.shape)
# Network input / output widths are taken from the data.
in_dim = X.shape[1]
out_dim = Y.shape[1]

# Hold out 15% of the samples for testing (no random_state, so the split differs between runs).
xtrain, xtest, ytrain, ytest=train_test_split(X, Y, test_size=0.15)
# NOTE(review): the label "ytrian" is a typo for "ytrain"; it is left untouched because it is program output.
print("xtrain:", xtrain.shape, "ytrian:", ytrain.shape)

# Fully connected network: 100 -> 32 -> out_dim units, ReLU on the hidden layers,
# no activation on the output layer, trained with MSE loss and the Adam optimizer.
model = Sequential()
model.add(Dense(100, input_dim=in_dim, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(out_dim))
model.compile(loss="mse", optimizer="adam")
model.summary()
 
model.fit(xtrain, ytrain, epochs=100, batch_size=12, verbose=0)
 
# Evaluate each output column separately on the held-out samples.
ypred = model.predict(xtest)
print("y1 MSE:%.4f" % mean_squared_error(ytest[:,0], ypred[:,0]))
print("y2 MSE:%.4f" % mean_squared_error(ytest[:,1], ypred[:,1]))

# Actual values as points, predictions as lines, over the test-sample index.
x_ax = range(len(xtest))
plt.scatter(x_ax, ytest[:,0],  s=6, label="y1-test")
plt.plot(x_ax, ypred[:,0], label="y1-pred")
plt.scatter(x_ax, ytest[:,1],  s=6, label="y2-test")
plt.plot(x_ax, ypred[:,1], label="y2-pred")
plt.legend()
plt.show()

In [None]:
# mlp for multi-output regression
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
 
# get the dataset
def get_dataset():
    """Return (X, y) from make_regression: 1000 samples, 10 features (5 informative), 3 targets, fixed seed."""
    return make_regression(
        n_samples=1000,
        n_features=10,
        n_informative=5,
        n_targets=3,
        random_state=2,
    )
 
# get the model
def get_model(n_inputs, n_outputs):
    """
    Build and compile a small fully connected network for multi-output regression.

    :param n_inputs: number of input features.
    :param n_outputs: number of regression targets (units of the output layer).
    :return: compiled Sequential model (MAE loss, Adam optimizer).
    """
    hidden = Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu')
    output = Dense(n_outputs)
    network = Sequential([hidden, output])
    network.compile(loss='mae', optimizer='adam')
    return network
 
# evaluate a model using repeated k-fold cross-validation
def evaluate_model(X, y):
    """
    Evaluate the network from `get_model` with repeated k-fold cross-validation.

    A fresh model is trained for 100 epochs on every fold (10 splits, 3 repeats); its loss on
    the held-out part is printed and collected.

    :param X: 2-D feature array.
    :param y: 2-D target array.
    :return: list with one loss value per fold.
    """
    n_inputs = X.shape[1]
    n_outputs = y.shape[1]
    # Evaluation procedure.
    splitter = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    fold_scores = []
    for train_idx, test_idx in splitter.split(X):
        # A new, untrained network for every fold.
        net = get_model(n_inputs, n_outputs)
        net.fit(X[train_idx], y[train_idx], verbose=0, epochs=100)
        # Loss on the held-out part of the fold.
        fold_mae = net.evaluate(X[test_idx], y[test_idx], verbose=0)
        print('>%.3f' % fold_mae)
        fold_scores.append(fold_mae)
    return fold_scores
 
# Build the synthetic regression dataset.
X, y = get_dataset()
# Cross-validate the network; `results` holds one loss value per fold.
results = evaluate_model(X, y)
# Summarize performance: mean and standard deviation of the fold losses.
print('MAE: %.3f (%.3f)' % (mean(results), std(results)))
