In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the processed data mart, discard rows with missing values and drop the
# "Unnamed: 0" column (presumably an index column written by an earlier
# to_csv call -- TODO confirm).
df = (
    pd.read_csv('./data/processed/mart.csv')
    .dropna()
    .drop(columns=["Unnamed: 0"])
)

# Column to predict; every other remaining column is used as a feature.
target = "raw_mix.lab.measure.sito_009"
features = df.columns[~df.columns.isin([target])]

X = df.loc[:, features]
y = df.loc[:, target]

In [3]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit, TimeSeriesSplit, KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error


In [55]:
def train_and_evaluate_model(splitter, model, X=None, y=None):
    """Cross-validate ``model`` and return fold-averaged error metrics.

    For every ``(train_index, test_index)`` pair yielded by
    ``splitter.split(X)`` the features are standardised with a
    ``StandardScaler`` fitted on the training rows only, ``model`` is fitted on
    the training rows and its predictions on the test rows are scored.

    Parameters
    ----------
    splitter : object
        Anything with a ``split(X)`` method yielding positional
        ``(train_index, test_index)`` pairs.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same object
        is fitted again on every fold.
    X : pandas.DataFrame, optional
        Feature matrix. Defaults to the notebook-level ``X``.
    y : pandas.Series, optional
        Target values, row-aligned with ``X``. Defaults to the notebook-level
        ``y``.

    Returns
    -------
    dict
        Mean over folds of the following metrics:

        ``mae``
            Mean absolute error of the model on the test fold.
        ``baseline_mae``
            MAE of a constant prediction equal to the mean of the test fold.
        ``baseline_shifted_mae``
            MAE between each test target and the next one in the test fold.
        ``same_direction``
            Share of consecutive test rows where the predicted change and the
            actual change have the same (non-zero) sign.

        The last two metrics need at least two test rows; folds with fewer
        rows are skipped for them. A metric with no contributing fold is
        reported as ``nan``.

    Notes
    -----
    NOTE(review): ``baseline_mae`` uses the mean of the *test* fold, which a
    real model could not know in advance -- confirm this is intended rather
    than the training mean.

    NOTE(review): ``baseline_shifted_mae`` and ``same_direction`` compare
    consecutive rows in the order given by ``test_index``; with a shuffling
    splitter those rows are presumably not adjacent in the original data, so
    the two metrics are hard to interpret there -- confirm.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if X is None:
        X = globals()["X"]
    if y is None:
        y = globals()["y"]

    scores = {'mae': [], 'baseline_mae': [], 'baseline_shifted_mae': [], 'same_direction': []}

    for train_index, test_index in splitter.split(X):
        # The scaler only ever sees training rows, so no test statistics leak in.
        scaler = StandardScaler().fit(X.iloc[train_index])
        X_train = scaler.transform(X.iloc[train_index])
        X_test = scaler.transform(X.iloc[test_index])

        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        scores['mae'].append(mean_absolute_error(y_test, y_pred))

        # Build the constant baseline as an explicit float array: np.full_like
        # would inherit the target dtype and truncate the mean for integer targets.
        baseline_pred = np.full(len(y_test), y_test.mean(), dtype=float)
        scores['baseline_mae'].append(mean_absolute_error(y_test, baseline_pred))

        # The remaining metrics compare consecutive rows and need at least two of them.
        if len(y_test) < 2:
            continue

        # Work on plain arrays so slicing is positional regardless of the index.
        y_true = y_test.to_numpy()

        scores['baseline_shifted_mae'].append(mean_absolute_error(y_true[:-1], y_true[1:]))

        y_test_diff = y_true[1:] - y_true[:-1]
        y_pred_diff = y_pred[1:] - y_pred[:-1]

        # A zero change on either side gives a zero product and counts as a miss.
        scores['same_direction'].append(np.mean((y_test_diff * y_pred_diff) > 0))

    return {
        key: (sum(values) / len(values) if values else float('nan'))
        for key, values in scores.items()
    }

In [49]:
# Experiment: Lasso (alpha=0.5) scored on a single random 80/20 hold-out split.
# NOTE(review): the test rows are shuffled here, so the order-based metrics
# (baseline_shifted_mae, same_direction) are presumably not meaningful -- confirm.
model = Lasso(alpha=0.5)
splitter = ShuffleSplit(test_size=0.2, n_splits=1, random_state=42)

train_and_evaluate_model(splitter=splitter, model=model)

{'mae': 1.1716765489387297,
 'baseline_mae': 1.174485596707819,
 'baseline_shifted_mae': 1.5130841121495329,
 'same_direction': 0.0}

In [50]:
# Experiment: Lasso (alpha=0.5) scored with a 5-split TimeSeriesSplit.
model = Lasso(alpha=0.5)
splitter = TimeSeriesSplit(n_splits=5)

train_and_evaluate_model(splitter=splitter, model=model)

{'mae': 1.300039337939119,
 'baseline_mae': 1.1858704709001389,
 'baseline_shifted_mae': 1.2613636363636365,
 'same_direction': 0.0}

In [51]:
# Experiment: Lasso (alpha=0.5) scored with 5-fold KFold (default settings).
model = Lasso(alpha=0.5)
splitter = KFold(n_splits=5)

train_and_evaluate_model(splitter=splitter, model=model)

{'mae': 1.2476340682194,
 'baseline_mae': 1.1901386580985804,
 'baseline_shifted_mae': 1.2358755069652616,
 'same_direction': 0.0}

In [52]:
# Experiment: RandomForestRegressor (seeded) scored on a single random 80/20
# hold-out split.
# NOTE(review): the test rows are shuffled here, so the order-based metrics
# (baseline_shifted_mae, same_direction) are presumably not meaningful -- confirm.
model = RandomForestRegressor(random_state=42)
splitter = ShuffleSplit(test_size=0.2, n_splits=1, random_state=42)

train_and_evaluate_model(splitter=splitter, model=model)

{'mae': 1.073101851851851,
 'baseline_mae': 1.174485596707819,
 'baseline_shifted_mae': 1.5130841121495329,
 'same_direction': 0.5981308411214953}

In [53]:
# Experiment: RandomForestRegressor (seeded) scored with a 5-split TimeSeriesSplit.
model = RandomForestRegressor(random_state=42)
splitter = TimeSeriesSplit(n_splits=5)

train_and_evaluate_model(splitter=splitter, model=model)

{'mae': 1.2739483146067418,
 'baseline_mae': 1.1858704709001389,
 'baseline_shifted_mae': 1.2613636363636365,
 'same_direction': 0.5363636363636364}

In [54]:
# Experiment: RandomForestRegressor (seeded) scored with 5-fold KFold
# (default settings).
model = RandomForestRegressor(random_state=42)
splitter = KFold(n_splits=5)

train_and_evaluate_model(splitter=splitter, model=model)

{'mae': 1.2191010730356526,
 'baseline_mae': 1.1901386580985804,
 'baseline_shifted_mae': 1.2358755069652616,
 'same_direction': 0.4813260447892788}

Выводы: 

- Наилучшие метрики модели показывают при простом случайном разделении (а-ля train_test_split), однако при использовании KFold MAE моделей приближается к MAE baseline-модели и даже немного уступает ему (1.22–1.25 против 1.19).

- RandomForest показал лучшие результаты, чем линейная регрессия с L1-регуляризацией (Lasso).