# Лабораторная работа 1: Линейная регрессия
Предсказание RiskScore

## Подготовка

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from eda import perform_eda
from feature_engineering import improve_preprocessing
from normalization import apply_normalization
from visualization import compare_normalizations, plot_convergence
from linear_regression import LinearRegressionCustom, compare_models, tune_ridge_alpha
from cross_validation import evaluate_cross_validation
from metrics import compare_metrics_with_sklearn


ImportError: cannot import name 'tune_ridge_alpha' from 'linear_regression' (/home/gaalex/MAI/5sem/ML/Lab1/linear_regression.py)

## 1. Загрузка данных

In [None]:
# Read the train and test tables from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Report both frame shapes, one per line.
print(
    f'Train: {train_df.shape}',
    f'Test: {test_df.shape}',
    sep='\n',
)


## 2. EDA

In [None]:
# Run the project's EDA helper on the raw training frame; it returns a pair
# that is unpacked into `correlations` and `train_clean`.
# NOTE(review): save_plots=True presumably writes figures to disk - confirm in eda.py.
correlations, train_clean = perform_eda(
    train_df,
    save_plots=True,
)


## 3. Предобработка и нормализация

In [None]:
def _make_baseline_model():
    """Build the fixed-alpha analytical model used to compare normalizations."""
    return LinearRegressionCustom(method='analytical', alpha=1.0)


# Feature engineering over both frames; yields the training matrix, the
# target, the test matrix and the processed training frame.
X, y, X_test, train_processed = improve_preprocessing(train_df, test_df)

# 2nd and 98th percentiles of the target, passed on later as `clip_bounds`.
clip_bounds = np.quantile(y, [0.02, 0.98])

# Build both normalized variants of X together with their fitted normalizers.
X_zscore, zscore_norm = apply_normalization(X, method='z-score')
X_minmax, minmax_norm = apply_normalization(X, method='min-max')

normalizers_by_label = {'Z-Score': zscore_norm, 'Min-Max': minmax_norm}
compare_normalizations(X, y, _make_baseline_model, normalizers_by_label)

# The z-score variant is the one used by every later cell.
X_normalized = X_zscore


## 4. Подбор гиперпараметров

In [None]:
# Pick the regularization strength that later cells pass as `alpha`.
# NOTE(review): the recorded run of the import cell failed because
# `linear_regression` does not export `tune_ridge_alpha`; this call needs that
# helper to exist.
best_alpha = tune_ridge_alpha(
    X_normalized,
    y,
    clip_bounds=clip_bounds,
)


## 5. Обучение моделей

In [None]:
def _has_loss_history(model):
    """Return True when `model` exposes a non-empty `loss_history`."""
    return len(getattr(model, 'loss_history', [])) > 0


# Train the candidate models with the tuned alpha and the target clip bounds.
results, best_model = compare_models(
    X_normalized,
    y,
    alpha=best_alpha,
    clip_bounds=clip_bounds,
)

# Each result is a 4-tuple whose first item is the model name and whose last
# item is the fitted model; keep only models that recorded a loss history.
models_dict = {
    name: model
    for name, _, _, model in results
    if _has_loss_history(model)
}

# Skip the convergence plot when no model has a history to draw.
if models_dict:
    plot_convergence(models_dict, 'convergence.png')


## 6. Кросс-валидация и метрики

In [None]:
from sklearn.model_selection import train_test_split


def _make_tuned_model():
    """Build an unfitted analytical model with the tuned alpha and clip bounds."""
    return LinearRegressionCustom(
        method='analytical',
        alpha=best_alpha,
        clip_bounds=clip_bounds,
    )


# Cross-validate on the normalized training data using fresh model instances.
# NOTE(review): X_normalized was scaled on the full training set before any
# splitting, so held-out folds may not be fully independent - confirm whether
# that is acceptable here.
cv_results = evaluate_cross_validation(
    X_normalized,
    y,
    _make_tuned_model,
    k=5,
    loo_samples=200,
)

# Hold out 20% of the rows (fixed seed) to compare the custom metrics with
# their scikit-learn counterparts.
X_train_val, X_val_test, y_train_val, y_val_test = train_test_split(
    X_normalized,
    y,
    test_size=0.2,
    random_state=42,
)

temp_model = _make_tuned_model()
temp_model.fit(X_train_val, y_train_val)
y_pred_val = temp_model.predict(X_val_test)

metrics_comparison = compare_metrics_with_sklearn(y_val_test, y_pred_val)


## 7. Submission

In [None]:
# Fit the final model on all training rows and write the submission file,
# unless preprocessing produced no test matrix.
if X_test is None:
    print('Нет тестовых данных')
else:
    final_model = LinearRegressionCustom(
        method='analytical',
        alpha=best_alpha,
        clip_bounds=clip_bounds,
    )
    final_model.fit(X_normalized, y)

    # Apply the z-score normalizer built for the training data, matching the
    # z-score variant chosen as X_normalized.
    X_test_normalized = zscore_norm.transform(X_test)
    predictions = final_model.predict(X_test_normalized)

    # NOTE(review): IDs are positional (0..n-1); confirm this matches the
    # identifiers expected for the test rows.
    submission = pd.DataFrame(
        {
            'ID': range(len(predictions)),
            'RiskScore': predictions,
        }
    )
    submission.to_csv('submission.csv', index=False)
    print(f'Submission сохранен, строк: {len(submission)}')

# Always report the k-fold summary from the cross-validation cell.
print(f"K-fold mse: {cv_results['kfold']['mean']:.4f} ± {cv_results['kfold']['std']:.4f}")
