In [2]:
import pandas as pd
import numpy as np

In [9]:
train = pd.read_csv('out_data/result.csv')

test = train.sample(4)

test['type'] = 'test'
train['type'] = 'train'

df = pd.concat([train, test], ignore_index=True)

In [10]:
# В этой ячейке можно добавлять новые фичи и всячески предобрабатывать объединённый датасет.

In [11]:
# Разбиваем данные на train и test
train_df = df[df['type'] == 'train'].copy().dropna().reset_index(drop=True)
test_df = df[df['type'] == 'test'].copy().reset_index(drop=True)
train_df = train_df.sort_values(['id'])
test_df = test_df.sort_values(['id'])

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

from typing import Tuple

def metric_mape(y_valid: list, val_pred: list) -> float:
    """
    Calculate MAPE metric

    :param y_valid: list, real values
    :param val_pred: list, predicted values
    :return: float, MAPE metric
    """
    y_valid = np.array(y_valid).reshape(-1)
    val_pred = np.array(val_pred)
    return np.sum(np.abs(y_valid - val_pred)) / np.sum(y_valid) * 100


def calculate_metrics(df: pd.DataFrame,
                      forecast_name='forecast') -> Tuple[float, float, float,
                                                    float, float, float]:
    """
    Calculate metrics

    :param df: pd.DataFrame, result dataframe
    :return:
    """
    y_true = df.alpha
    y_pr = df[forecast_name]

    mae = mean_absolute_error(y_true, y_pr)
    mse = mean_squared_error(y_true, y_pr)

    mape = metric_mape(y_true, y_pr)

    print(f"MAE: {mae:.2f}\n"
          f"MSE: {mse:.2f}\n"
          f"MAPE: {mape:.2f} %\n")

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# числовые фичи
num_features = ['b', 'd', 'dd', 'sigma_m', 'sigma_w']
# категориальные фичи
cat_features = []

# таргеты
alpha_train = train_df.alpha
alpha_test = test_df.alpha
beta_train = train_df.beta
beta_test = test_df.beta
gamma_train = train_df.gamma
gamma_test = test_df.gamma

# категориальные фичи
X_train_cat = train_df[cat_features].values
X_test_cat = test_df[cat_features].values

# скалируем числовые фичи
X_train_num = train_df[num_features].copy()
X_test_num = test_df[num_features].copy()
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)

# итоговые датасеты для обучения
X_train = np.hstack((X_train_cat, X_train_num))
X_test = np.hstack((X_test_cat, X_test_num))

In [15]:
# from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression

In [16]:
# Обучаем линейную регрессию
model = LinearRegression()
model.fit(X_train, alpha_train)
forecast = model.predict(X_test)
test_df['prediction'] = forecast

test_df['prediction'] = test_df['prediction'].apply(lambda x: 0 if x < 0 else x)

In [17]:
test_df[['id', 'prediction']].sort_values(['id']).to_csv('prediction.csv', index=False)

In [18]:
calculate_metrics(test_df, forecast_name='prediction')

MAE: 0.12
MSE: 0.02
MAPE: 4.37 %

