In [17]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [18]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, TargetEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

In [19]:
# Подготовка данных
path = "/content/drive/MyDrive/ML/StickerBot/Stickers.csv"
df = pd.read_csv(path).dropna()
df['date'] = pd.to_datetime(df['date'])
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
df = df.sort_values(by='date')

In [20]:
# Определение целевых переменных
target_columns = ['price_1', 'price_2', 'price_3', 'price_4', 'price_5', 'target_price']
X = df.drop(target_columns, axis=1)
y = df[target_columns]

In [21]:
# Кодирование team_name
encoder = TargetEncoder()
X['team_name_encoded'] = encoder.fit_transform(X[['team_name']], y['target_price'].values)
X = X.drop('team_name', axis=1)

In [22]:
# Преобразование даты
X['month'] = df['date'].dt.month
X['year'] = df['date'].dt.year
X['day'] = df['date'].dt.day
X = X.drop('date', axis=1)

In [23]:
# Шкалирование
scaler = StandardScaler()
numeric_cols = X.select_dtypes(include=['number']).columns
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [24]:
# Разделение данных
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [25]:
# Преобразование типов
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

In [26]:
models = {
    'Linear Regression': MultiOutputRegressor(LinearRegression()),
    'Ridge': MultiOutputRegressor(Ridge(alpha=1.0)),
    'Lasso': MultiOutputRegressor(Lasso(alpha=0.1)),
    'Random Forest': RandomForestRegressor(n_estimators=1000, max_depth=8, random_state=42),
    'XGBoost': MultiOutputRegressor(XGBRegressor(n_estimators=1000, max_depth=8, learning_rate=0.1, random_state=42)),
    'LightGBM' : MultiOutputRegressor(LGBMRegressor(
    n_estimators=500,
      max_depth=7,
      learning_rate=0.1,
      num_leaves=31,
      min_data_in_leaf=20,
      feature_fraction=0.8,
      bagging_fraction=0.8,
      bagging_freq=5,
      verbose=-1
    ))
}

In [27]:
def evaluate_multioutput(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    metrics_dict = {
        'RMSE': [],
        'MAE': [],
        'R2': []
    }

    for i in range(y_test.shape[1]):
        rmse = np.sqrt(mean_squared_error(y_test.iloc[:, i], preds[:, i]))
        mae = mean_absolute_error(y_test.iloc[:, i], preds[:, i])
        r2 = r2_score(y_test.iloc[:, i], preds[:, i])

        metrics_dict['RMSE'].append(rmse)
        metrics_dict['MAE'].append(mae)
        metrics_dict['R2'].append(r2)

    return {
        'Avg_RMSE': np.mean(metrics_dict['RMSE']),
        'Avg_MAE': np.mean(metrics_dict['MAE']),
        'Avg_R2': np.mean(metrics_dict['R2']),
        'Metrics_by_month': metrics_dict
    }

In [28]:
# Обучение и оценка моделей
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    results[name] = evaluate_multioutput(model, X_train, y_train, X_test, y_test)

Training Linear Regression...
Training Ridge...
Training Lasso...
Training Random Forest...
Training XGBoost...
Training LightGBM...


In [29]:
# Вывод результатов
results_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Avg_RMSE': [v['Avg_RMSE'] for v in results.values()],
    'Avg_MAE': [v['Avg_MAE'] for v in results.values()],
    'Avg_R2': [v['Avg_R2'] for v in results.values()]
}).set_index('Model')

print("Средние метрики по всем горизонтам:")
print(results_df.sort_values(by='Avg_RMSE'))

Средние метрики по всем горизонтам:
                     Avg_RMSE     Avg_MAE    Avg_R2
Model                                              
Random Forest      104.649331   49.334356  0.850367
Lasso              105.251304   69.014927  0.862059
Linear Regression  105.279325   69.039456  0.861897
Ridge              105.513997   68.437304  0.862729
XGBoost            114.225314   52.729616  0.852015
LightGBM           216.108364  140.661229  0.443794


In [30]:
# Детальные метрики по месяцам для лучшей модели
best_model_name = results_df['Avg_RMSE'].idxmin()
print(f"\nДетальные метрики для {best_model_name}:")
for i, month in enumerate(['1', '2', '3', '4', '5', '6']):
    print(f"Месяц {month}:")
    print(f"RMSE: {results[best_model_name]['Metrics_by_month']['RMSE'][i]:.2f}")
    print(f"MAE: {results[best_model_name]['Metrics_by_month']['MAE'][i]:.2f}")
    print(f"R2: {results[best_model_name]['Metrics_by_month']['R2'][i]:.2f}\n")


Детальные метрики для Random Forest:
Месяц 1:
RMSE: 86.39
MAE: 42.12
R2: 0.95

Месяц 2:
RMSE: 66.87
MAE: 28.64
R2: 0.96

Месяц 3:
RMSE: 103.21
MAE: 49.09
R2: 0.88

Месяц 4:
RMSE: 155.44
MAE: 72.40
R2: 0.76

Месяц 5:
RMSE: 126.86
MAE: 58.97
R2: 0.70

Месяц 6:
RMSE: 89.13
MAE: 44.79
R2: 0.85

