In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load the prepared regression dataset (path is relative to the notebook).
data = pd.read_csv('../data/prepared_data_for_regression.csv')

# Discard the 'AQI_Bucket' column when the file carries it.
if 'AQI_Bucket' in data.columns:
    data = data.drop(columns='AQI_Bucket')

# 'AQI' is the regression target; every other column is a feature.
y = data['AQI']
X = data.drop(columns='AQI')

# 80/20 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Columns prefixed with 'City_' are left out of the scaling step below
# (presumably one-hot city indicators - confirm against the data preparation).
categorical_features = [name for name in X.columns if name.startswith('City_')]
numeric_features = [name for name in X.columns if not name.startswith('City_')]

# Standardise the non-'City_' columns; the scaler is fitted on the training
# part only and then applied unchanged to the test part.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

# Fit ordinary least squares on the scaled training features.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both parts of the split.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Quality metrics: R^2 on train and test, RMSE and MAE on test.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae = mean_absolute_error(y_test, y_test_pred)

# Print the metrics as a small fixed-width table.
banner = "=" * 50
rule = "-" * 30
print("\n" + banner)
print("МЕТРИКИ МОДЕЛИ (линейная регрессия)")
print(banner)
print(f"{'Метрика':<15} | {'Значение':>8}")
print(rule)
for metric_label, metric_value, metric_fmt in (
    ('R² (train)', train_r2, '>8.4f'),
    ('R² (test)', test_r2, '>8.4f'),
    ('RMSE (test)', test_rmse, '>8.2f'),
    ('MAE (test)', test_mae, '>8.2f'),
):
    print(f"{metric_label:<15} | {metric_value:{metric_fmt}}")
print(rule)
print(f"Модель объясняет {test_r2*100:.1f}% дисперсии AQI на тестовых данных")

# Two stacked panels: predicted-vs-actual on top, residuals below.
plt.style.use('seaborn-v0_8-whitegrid')
fig, axes = plt.subplots(nrows=2, figsize=(16, 12))
fig.suptitle('Анализ линейной регрессии для предсказания AQI', fontsize=16, fontweight='bold')

# Marker styling shared by both panels.
train_style = dict(alpha=0.5, s=12, label='Train', color='#1f77b4')
test_style = dict(alpha=0.7, s=15, label='Test', color='#ff7f0e')

# Panel 1: predicted vs actual for train and test, plus the y = x reference line.
ax = axes[0]
ax.scatter(y_train, y_train_pred, **train_style)
ax.scatter(y_test, y_test_pred, **test_style)
aqi_bounds = [y.min(), y.max()]
ax.plot(aqi_bounds, aqi_bounds, 'k--', lw=2, label='Идеальное предсказание')
ax.set(
    xlabel='Реальный AQI',
    ylabel='Предсказанный AQI',
    title='Предсказание vs Реальность',
)
ax.legend()
ax.grid(True)

# Residuals are defined as y_true - y_pred.
residuals_train = y_train - y_train_pred
residuals_test = y_test - y_test_pred

# Panel 2: residuals against the predicted value, with a zero reference line.
ax = axes[1]
ax.scatter(y_train_pred, residuals_train, **train_style)
ax.scatter(y_test_pred, residuals_test, **test_style)
ax.axhline(0, color='red', linestyle='--', lw=1.5)
ax.set(
    xlabel='Предсказанный AQI',
    ylabel='Остаток (y_true - y_pred)',
    title='Остатки',
)
ax.legend()
ax.grid(True)

plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the prepared regression dataset (path is relative to the notebook).
data = pd.read_csv('../data/prepared_data_for_regression.csv')

# Drop the 'AQI_Bucket' column if the file contains it.
if 'AQI_Bucket' in data.columns:
    data = data.drop(columns=['AQI_Bucket'])

# 'AQI' is the regression target; every other column is a feature.
X = data.drop(columns=['AQI'])
y = data['AQI']

# 80/20 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Columns prefixed with 'City_' are left out of scaling
# (presumably one-hot city indicators - confirm against the data preparation).
numeric_features = [col for col in X.columns if not col.startswith('City_')]

# Standardise the remaining columns; the scaler is fitted on the training part
# only and then applied unchanged to the test part.
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

# Fit LASSO, choosing alpha by 5-fold cross-validation over a log-spaced grid.
lasso = LassoCV(
    alphas=np.logspace(-4, 2, 100),  # from 0.0001 to 100
    cv=5,
    random_state=42,
    max_iter=10000
)
lasso.fit(X_train_scaled, y_train)

# Alpha selected by cross-validation.
best_alpha = lasso.alpha_
print(f"Выбран alpha = {best_alpha:.5f}")

# Feature counts are computed once, right after fitting, because both the CV
# plot annotation and the printed summary further down need them. Previously
# `total_features` was only defined after the plot that used it, which raised
# NameError on a fresh kernel.
total_features = len(lasso.coef_)
n_nonzero = np.sum(lasso.coef_ != 0)
n_nonzero_best = n_nonzero  # same quantity; the name is kept for the annotation below

# ---------------------------------------------------
# Visualisation of the alpha search (CV errors)
# ---------------------------------------------------
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(1, 1, figsize=(9, 5))

# Alpha grid and per-fold MSE recorded by LassoCV.
alphas = lasso.alphas_
mse_path = lasso.mse_path_  # shape: (n_alphas, n_folds)

# Mean and standard deviation of the MSE across folds.
mean_mse = mse_path.mean(axis=1)
std_mse = mse_path.std(axis=1)

# CV curve with a +/- 1 std band.
ax.plot(alphas, mean_mse, 'o-', color='#1f77b4', label='Средний CV MSE', markersize=4)
ax.fill_between(alphas, mean_mse - std_mse, mean_mse + std_mse,
                alpha=0.2, color='#1f77b4', label='±1 std')

# Mark the selected alpha.
ax.axvline(best_alpha, color='red', linestyle='--',
           linewidth=2, label=f'Выбранный α = {best_alpha:.5f}')

# Axis settings.
ax.set_xscale('log')
ax.set_xlabel('α (логарифмическая шкала)')
ax.set_ylabel('CV MSE (5-fold)')
ax.set_title('Подбор гиперпараметра α в LASSO через кросс-валидацию')
# The artists above carry labels, so a legend is needed to show them; it is
# placed away from the annotation box in the top-left corner.
ax.legend(loc='lower right')

# Annotation: how many coefficients stay non-zero at the selected alpha.
# The text is anchored by its top edge so the multi-line box grows downward
# into the axes, and alpha uses the same 5-decimal format as elsewhere
# (3 decimals would show 0.000 for small values of the grid).
ax.text(0.02, 0.95, f'При α = {best_alpha:.5f}:\n{n_nonzero_best} из {total_features} признаков\nостались ненулевыми',
        transform=ax.transAxes, fontsize=10, va='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

plt.tight_layout()
plt.show()


# Predictions for both parts of the split.
y_train_pred = lasso.predict(X_train_scaled)
y_test_pred = lasso.predict(X_test_scaled)

# Quality metrics: R^2 on train and test, RMSE and MAE on test.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae = mean_absolute_error(y_test, y_test_pred)

print("\n" + "="*50)
print("МЕТРИКИ МОДЕЛИ (LASSO-регрессия)")
print("="*50)
print(f"{'Метрика':<15} | {'Значение':>8}")
print("-"*30)
print(f"{'R² (train)':<15} | {train_r2:>8.4f}")
print(f"{'R² (test)':<15} | {test_r2:>8.4f}")
print(f"{'RMSE (test)':<15} | {test_rmse:>8.2f}")
print(f"{'MAE (test)':<15} | {test_mae:>8.2f}")
print("-"*30)
print(f"Модель объясняет {test_r2*100:.1f}% дисперсии AQI на тестовых данных")

# Number of non-zero coefficients (computed right after fitting).
print(f"\nLASSO отобрал {n_nonzero} из {total_features} признаков (остальные обнулены)")

# ---------------------------------------------------
# Diagnostic plots
# ---------------------------------------------------
plt.style.use('seaborn-v0_8-whitegrid')
fig, axes = plt.subplots(2, 1, figsize=(10, 10))
fig.suptitle(f'Анализ LASSO-регрессии (α = {best_alpha:.5f}) для предсказания AQI',
             fontsize=14, fontweight='bold')

# 1. Predicted vs actual, with the y = x reference line.
ax = axes[0]
ax.scatter(y_train, y_train_pred, alpha=0.5, s=15, label='Train', color='#2ca02c')
ax.scatter(y_test, y_test_pred, alpha=0.7, s=20, label='Test', color='#d62728', edgecolors='k', linewidth=0.3)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2, label='Идеальное предсказание')
ax.set_xlabel('Реальный AQI')
ax.set_ylabel('Предсказанный AQI')
ax.set_title('Предсказание vs Реальность')
ax.legend()
ax.grid(True)

# 2. Residuals (y_true - y_pred) against the predicted value.
ax = axes[1]
residuals_train = y_train - y_train_pred
residuals_test = y_test - y_test_pred

ax.scatter(y_train_pred, residuals_train, alpha=0.5, s=15, label='Train', color='#2ca02c')
ax.scatter(y_test_pred, residuals_test, alpha=0.7, s=20, label='Test', color='#d62728', edgecolors='k', linewidth=0.3)
ax.axhline(0, color='red', linestyle='--', lw=1.5, label='Нулевая ошибка')
ax.set_xlabel('Предсказанный AQI')
ax.set_ylabel('Остаток (y_true - y_pred)')
ax.set_title('Остатки (Residuals)')
ax.legend()
ax.grid(True)

# Leave the top 5% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the prepared regression dataset (path is relative to the notebook).
data = pd.read_csv('../data/prepared_data_for_regression.csv')

# Drop the 'AQI_Bucket' column if the file contains it.
if 'AQI_Bucket' in data.columns:
    data = data.drop(columns=['AQI_Bucket'])

# 'AQI' is the regression target; every other column is a feature.
X = data.drop(columns=['AQI'])
y = data['AQI']

# 80/20 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Columns prefixed with 'City_' are left out of scaling
# (presumably one-hot city indicators - confirm against the data preparation).
numeric_features = [col for col in X.columns if not col.startswith('City_')]

# Standardise the remaining columns; the scaler is fitted on the training part
# only and then applied unchanged to the test part.
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

# Regularisation strength, defined once so the model, the printed header and
# the figure title cannot drift apart.
RIDGE_ALPHA = 1.0

# Fit Ridge regression with a fixed alpha (no cross-validated search here).
# alpha must be non-negative; alpha = 0 reduces to ordinary least squares.
# random_state only matters for the stochastic 'sag'/'saga' solvers.
ridge_model = Ridge(alpha=RIDGE_ALPHA, random_state=42)
ridge_model.fit(X_train_scaled, y_train)

# Predictions for both parts of the split.
y_train_pred_ridge = ridge_model.predict(X_train_scaled)
y_test_pred_ridge = ridge_model.predict(X_test_scaled)

# Quality metrics: R^2 on train and test, RMSE and MAE on test.
train_r2_ridge = r2_score(y_train, y_train_pred_ridge)
test_r2_ridge = r2_score(y_test, y_test_pred_ridge)
test_rmse_ridge = np.sqrt(mean_squared_error(y_test, y_test_pred_ridge))
test_mae_ridge = mean_absolute_error(y_test, y_test_pred_ridge)

print("\n" + "="*50)
print(f"МЕТРИКИ МОДЕЛИ (Ridge-регрессия, alpha={RIDGE_ALPHA})")
print("="*50)
print(f"{'Метрика':<15} | {'Значение':>8}")
print("-"*30)
print(f"{'R² (train)':<15} | {train_r2_ridge:>8.4f}")
print(f"{'R² (test)':<15} | {test_r2_ridge:>8.4f}")
print(f"{'RMSE (test)':<15} | {test_rmse_ridge:>8.2f}")
print(f"{'MAE (test)':<15} | {test_mae_ridge:>8.2f}")
print("-"*30)
print(f"Модель объясняет {test_r2_ridge*100:.1f}% дисперсии AQI на тестовых данных")

# Two stacked panels: predicted-vs-actual on top, residuals below.
plt.style.use('seaborn-v0_8-whitegrid')
fig, axes = plt.subplots(2, figsize=(16, 12))
fig.suptitle(f'Анализ Ridge-регрессии для предсказания AQI (alpha={RIDGE_ALPHA})', fontsize=16, fontweight='bold')

# 1. Predicted vs actual, with the y = x reference line.
ax = axes[0]
ax.scatter(y_train, y_train_pred_ridge, alpha=0.5, s=12, label='Train', color='#2ca02c')
ax.scatter(y_test, y_test_pred_ridge, alpha=0.7, s=15, label='Test', color='#d62728')
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2, label='Идеальное предсказание')
ax.set_xlabel('Реальный AQI')
ax.set_ylabel('Предсказанный AQI (Ridge)')
ax.set_title('Ridge: Предсказание vs Реальность')
ax.legend()
ax.grid(True)

# 2. Residuals (y_true - y_pred) against the predicted value.
residuals_train_ridge = y_train - y_train_pred_ridge
residuals_test_ridge = y_test - y_test_pred_ridge

ax = axes[1]
ax.scatter(y_train_pred_ridge, residuals_train_ridge, alpha=0.5, s=12, label='Train', color='#2ca02c')
ax.scatter(y_test_pred_ridge, residuals_test_ridge, alpha=0.7, s=15, label='Test', color='#d62728')
ax.axhline(0, color='red', linestyle='--', lw=1.5)
ax.set_xlabel('Предсказанный AQI (Ridge)')
ax.set_ylabel('Остаток (y_true - y_pred)')
ax.set_title('Ridge: Остатки')
ax.legend()
ax.grid(True)

plt.tight_layout()
plt.show()