In [51]:
import pandas as pd

import os

from typing import Callable

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score

import optuna

# Регрессия

In [52]:
# CSV file names (relative paths) that the notebook loads with pd.read_csv.
# The RAW_* pair feeds the baseline run; the PROCESSED_* pair feeds the
# run that measures the effect of preprocessing.
RAW_FEATS = 'regression_raw_feats.csv'
RAW_TARGET = 'regression_raw_target.csv'
PROCESSED_FEATS = 'regression_processed_feats.csv'
PROCESSED_TARGET = 'regression_processed_target.csv'

## Готовая модель

In [53]:
# Shared experiment settings reused by every run in this section.
# Estimator class; a fresh instance is created for each experiment.
model_class = KNeighborsRegressor
# Evaluation metric, called as metric(y_true, y_pred).
metric = mean_squared_error
# Seed passed to train_test_split and to the Optuna sampler.
random_state = 42
# Passed to train_test_split as test_size (a float is the held-out proportion).
test_size = 0.2

### Бейзлайн

In [54]:
# Load the unprocessed features and target used for the baseline run.
df_feats, df_target = (
    pd.read_csv(csv_path) for csv_path in (RAW_FEATS, RAW_TARGET)
)

In [55]:
# Hold out a test split; the shared seed keeps the partition fixed between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_feats,
    df_target,
    test_size=test_size,
    random_state=random_state,
)

In [56]:
# Baseline: estimator with default hyperparameters, fitted on the training
# split of the raw data.
model = model_class()

model.fit(X_train, y_train)

In [57]:
# Predict on the held-out test split with the baseline model.
y_pred = model.predict(X_test)

In [58]:
# Test-set score of the raw-data baseline, rounded to 4 decimals for display.
raw_test_score = round(metric(y_test, y_pred), 4)
print(f'Model score for metric "{metric.__name__}": {raw_test_score}')

Model score for metric "mean_squared_error": 277.7168


### Улучшенный бейзлайн

#### Улучшение за счёт препроцессинга

In [59]:
# Load the preprocessed features and target; these replace the raw frames
# held in df_feats / df_target.
df_feats, df_target = (
    pd.read_csv(csv_path) for csv_path in (PROCESSED_FEATS, PROCESSED_TARGET)
)

In [60]:
# Split the preprocessed data with the same test_size and seed as the baseline.
X_train, X_test, y_train, y_test = train_test_split(
    df_feats,
    df_target,
    test_size=test_size,
    random_state=random_state,
)

In [61]:
# Same default-parameter estimator as the baseline, now fitted on the
# training split of the preprocessed data.
model = model_class()

model.fit(X_train, y_train)

In [62]:
# Predict on the held-out test split of the preprocessed data.
y_pred = model.predict(X_test)

In [63]:
# Test-set score after preprocessing, rounded to 4 decimals for display.
processed_test_score = round(metric(y_test, y_pred), 4)
print(f'Model score for metric "{metric.__name__}": {processed_test_score}')

Model score for metric "mean_squared_error": 18.0143


#### Дополнительное улучшение за счёт подбора параметров

In [64]:
# Settings for the hyperparameter search.
# Number of Optuna trials passed to study.optimize.
n_trials = 10
# Number of cross-validation folds for the hyperparameter search.
cv = 5

In [65]:
def objective(trial):
    """
    Optuna objective: mean cross-validated metric for one sampled configuration.

    Samples ``n_neighbors`` (integer in 3..50, log scale), ``weights``
    ('uniform' or 'distance') and ``p`` (1 or 2) from ``trial``, builds
    ``model_class`` with them and scores it on ``X_train`` / ``y_train``
    with ``cross_val_score``.

    The number of folds is taken from the notebook-level ``cv`` setting
    instead of a hard-coded literal, so editing the config cell actually
    changes the search.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold ``metric`` values. ``make_scorer`` is called
        with its defaults, so the metric is not negated; the study that
        uses this objective must therefore minimize it.
    """
    knn_params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 3, 50, log=True),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'p': trial.suggest_int('p', 1, 2),
    }
    trial_model = model_class(**knn_params)

    fold_scores = cross_val_score(
        trial_model,
        X_train,
        y_train,
        cv=cv,  # honour the configured fold count
        scoring=make_scorer(metric),
        n_jobs=-1,
    )
    return fold_scores.mean()

# Minimize the objective (it returns the raw metric value); the TPE sampler
# is seeded with the shared random_state.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
study.optimize(objective, n_trials=n_trials)

print(f"Лучшие параметры для модели: {study.best_params}")
print(f"Лучшая метрика ({metric.__name__}): {study.best_value:.4f}")

# Refit on the whole training split using the best parameters found.
best_model = model_class(**study.best_params)
best_model.fit(X_train, y_train)

[I 2025-12-04 21:31:39,481] A new study created in memory with name: no-name-5c430907-d99a-4011-b2db-1b37577e45c0
[I 2025-12-04 21:31:39,535] Trial 0 finished with value: 22.754202989919886 and parameters: {'n_neighbors': 8, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 22.754202989919886.
[I 2025-12-04 21:31:39,581] Trial 1 finished with value: 23.58540550688309 and parameters: {'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 22.754202989919886.
[I 2025-12-04 21:31:39,628] Trial 2 finished with value: 22.146108731807566 and parameters: {'n_neighbors': 15, 'weights': 'uniform', 'p': 2}. Best is trial 2 with value: 22.146108731807566.
[I 2025-12-04 21:31:39,691] Trial 3 finished with value: 22.18979992222166 and parameters: {'n_neighbors': 31, 'weights': 'uniform', 'p': 1}. Best is trial 2 with value: 22.146108731807566.
[I 2025-12-04 21:31:39,754] Trial 4 finished with value: 22.09942983186393 and parameters: {'n_neighbors': 6, 'weights': 'unif

Лучшие параметры для модели: {'n_neighbors': 16, 'weights': 'distance', 'p': 1}
Лучшая метрика (mean_squared_error): 20.5815


In [66]:
# Predict on the held-out test split with the tuned model.
y_pred = best_model.predict(X_test)

In [67]:
# Test-set score of the tuned model, rounded to 4 decimals for display.
tuned_test_score = round(metric(y_test, y_pred), 4)
print(f'Model score for metric "{metric.__name__}": {tuned_test_score}')

Model score for metric "mean_squared_error": 15.6002


## Своя модель