In [1]:

# ============================================================================
# MLP 
# ============================================================================

import pandas as pd
import numpy as np
import pickle
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import time

print("="*80)
print("MLP")
print("="*80)

# ============================================================================
# [1] DATA LOADING
# ============================================================================

print("\n[1] ЗАГРУЗКА ДАННЫХ...")


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserializing,
    so this must only be pointed at artifacts produced by trusted steps.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Feature-engineering tables (the numeric columns used later are read from these).
train_df = pd.read_csv('train_features_full.csv')
test_df = pd.read_csv('test_features_full.csv')

print(f" Train: {len(train_df):,} × {train_df.shape[1]}")
print(f" Test: {len(test_df):,} × {test_df.shape[1]}")

# Embedding artifacts written by the preprocessing step.
print("\n Загрузка embeddings...")
train_embeddings_pkl = _load_pickle('train_embeddings.pkl')
test_embeddings_pkl = _load_pickle('test_embeddings.pkl')

# Only the per-row book embeddings are used here; in the recorded run they
# have shapes (874496, 768) for train and (107260, 768) for test.
train_book_emb = train_embeddings_pkl['book_emb']
test_book_emb = test_embeddings_pkl['book_emb']

print(f" Train book embeddings: {train_book_emb.shape}")
print(f" Test book embeddings: {test_book_emb.shape}")

# Reference score every model below is compared against.
baseline_artifacts = _load_pickle('baseline_artifacts.pkl')
baseline_rmse = baseline_artifacts['best_baseline_rmse']
print(f" Baseline RMSE: {baseline_rmse:.4f}")

# ============================================================================
# [2] FEATURE CONSTRUCTION
# ============================================================================

print("\n[2] СОЗДАНИЕ ПРИЗНАКОВ...")

# 17 numeric feature columns taken from the feature-engineering tables.
numeric_features = [
    # Interaction (6)
    'tag_overlap_count', 'tag_overlap_ratio', 'tag_jaccard',
    'history_similarity', 'embedding_cosine_sim', 'embedding_euclidean_dist',
    # User (4)
    'avg_user_rating', 'ratings_count', 'tag_vocab_size', 'activity_score',
    # Book (3)
    'book_avg_rating', 'book_ratings_count', 'book_popularity',
    # Preprocessing (4)
    'language_code_encoded', 'year_normalized', 'publication_era', 'average_rating'
]

# Ordinal-encode the user segment. Values that are missing or absent from the
# mapping fall back to 1 (the code used for 'inactive').
segment_mapping = {'new': 0, 'inactive': 1, 'active': 2, 'very_active': 3}
train_df['segment_encoded'] = train_df['segment'].map(segment_mapping).fillna(1)
test_df['segment_encoded'] = test_df['segment'].map(segment_mapping).fillna(1)

# Base set: the numeric columns plus the encoded segment (17 + 1 = 18).
base_features = numeric_features + ['segment_encoded']

# Take the embedding width from the loaded array instead of hard-coding 768,
# so a different embedding model does not require editing this cell.
n_emb_dims = train_book_emb.shape[1]
if test_book_emb.shape[1] != n_emb_dims:
    raise ValueError(
        f"Embedding width mismatch: train has {n_emb_dims} columns, "
        f"test has {test_book_emb.shape[1]}"
    )

# Embeddings as DataFrames aligned row-by-row with the feature tables
# (pandas raises if the row counts differ from the table's index length).
emb_columns = [f'book_emb_{i}' for i in range(n_emb_dims)]
train_emb_df = pd.DataFrame(train_book_emb, columns=emb_columns, index=train_df.index)
test_emb_df = pd.DataFrame(test_book_emb, columns=emb_columns, index=test_df.index)

# Full set: base + embedding columns (18 + 768 = 786 in the recorded run).
all_features = base_features + emb_columns

print(f" Базовых признаков: {len(base_features)}")
print(f" Embedding признаков: {len(emb_columns)}")
print(f" ВСЕГО: {len(all_features)}")

# Stack base features and embeddings positionally into one design matrix.
# np.hstack yields the same array as pd.concat(..., axis=1).values but skips
# materialising an intermediate wide DataFrame, which at this size is a
# multi-gigabyte temporary.
X_train_full = np.hstack([
    train_df[base_features].to_numpy(),
    train_emb_df.to_numpy()
])

X_test_full = np.hstack([
    test_df[base_features].to_numpy(),
    test_emb_df.to_numpy()
])

y_train_full = train_df['rating'].values
y_test = test_df['rating'].values

print(f"\n X_train: {X_train_full.shape}")
print(f" X_test: {X_test_full.shape}")

# ============================================================================
# [3] MLP TRAINING
# ============================================================================

print("\n[3] ОБУЧЕНИЕ MLP...")

# Hold out 15% of the training rows as an external validation set,
# stratified on the rating values so both parts share the same distribution.
X_train_nn, X_val_nn, y_train_nn, y_val_nn = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.15,
    stratify=y_train_full,
    random_state=42,
)

print(f" NN train: {X_train_nn.shape}")
print(f" NN val: {X_val_nn.shape}")

# Standardisation statistics come from the NN-train part only; the same
# fitted scaler is then applied to all three matrices.
scaler_nn = StandardScaler().fit(X_train_nn)
X_train_nn_scaled, X_val_nn_scaled, X_test_nn_scaled = (
    scaler_nn.transform(part) for part in (X_train_nn, X_val_nn, X_test_full)
)

print(" Нормализация завершена")

print("\n Архитектура: Input(786) → Dense(128) → Dense(64) → Output(1)")

# Estimator configuration. With early_stopping=True scikit-learn sets aside
# validation_fraction of the rows passed to fit() for its own validation
# score; the X_val_nn split above is not used for stopping.
mlp_params = dict(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    learning_rate_init=1e-3,
    batch_size=256,
    max_iter=30,
    random_state=42,
    verbose=True,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
)

start_time = time.time()

mlp = MLPRegressor(**mlp_params)
mlp.fit(X_train_nn_scaled, y_train_nn)

# Wall-clock duration of construction + fit, in seconds.
training_time = time.time() - start_time
print(f"\n Обучение завершено за {training_time/60:.1f} минут")

# ============================================================================
# [4] EVALUATION
# ============================================================================

print("\n[4] ОЦЕНКА...")

# External validation split. Metrics are computed on predictions clipped to
# [1, 5]; the raw predictions are kept in mlp_pred_val.
mlp_pred_val = mlp.predict(X_val_nn_scaled)
val_pred_clipped = np.clip(mlp_pred_val, 1, 5)
mlp_rmse_val = np.sqrt(mean_squared_error(y_val_nn, val_pred_clipped))
mlp_mae_val = mean_absolute_error(y_val_nn, val_pred_clipped)

print(f"\n Валидация:")
print(f"   RMSE: {mlp_rmse_val:.4f}")
print(f"   MAE: {mlp_mae_val:.4f}")

# Test set, same clipping. Improvement is the relative RMSE reduction versus
# the loaded baseline, in percent (positive means better than baseline).
mlp_pred_test = mlp.predict(X_test_nn_scaled)
test_pred_clipped = np.clip(mlp_pred_test, 1, 5)
mlp_rmse_test = np.sqrt(mean_squared_error(y_test, test_pred_clipped))
mlp_mae_test = mean_absolute_error(y_test, test_pred_clipped)
mlp_improvement = (baseline_rmse - mlp_rmse_test) / baseline_rmse * 100

print(f"\n Test:")
print(f"   RMSE: {mlp_rmse_test:.4f}")
print(f"   MAE: {mlp_mae_test:.4f}")
print(f"   Улучшение vs baseline: {mlp_improvement:+.2f}%")

# Per-bucket RMSE: rows whose true rating is <= 3 versus rows where it is >= 4.
low_mask = y_test <= 3
high_mask = y_test >= 4

mlp_rmse_low = np.sqrt(mean_squared_error(y_test[low_mask], test_pred_clipped[low_mask]))
mlp_rmse_high = np.sqrt(mean_squared_error(y_test[high_mask], test_pred_clipped[high_mask]))

print(f"   RMSE (<=3): {mlp_rmse_low:.4f}")
print(f"   RMSE (>=4): {mlp_rmse_high:.4f}")

# ============================================================================
# [5] COMPARISON WITH OTHER MODELS
# ============================================================================

print("\n[5] СРАВНЕНИЕ...")

# Previously saved test-set predictions of the other models. The .npy file
# wraps a pickled dict, hence allow_pickle=True and .item().
predictions_dict = np.load('ml_predictions_full.npy', allow_pickle=True).item()

catboost_pred = predictions_dict['catboost']
lgb_pred = predictions_dict['lightgbm']
knn_pred = predictions_dict['knn_features']

# (table label, raw test predictions) for each reference model, in table order.
reference_models = [
    ('CatBoost (786 features)', catboost_pred),
    ('LightGBM (786 features)', lgb_pred),
    ('KNN (786 features)', knn_pred),
]

# Baseline first, reference models next, MLP last. Every RMSE is computed on
# predictions clipped to [1, 5], matching the MLP evaluation.
comparison = {
    'Model': [
        '[BASELINE]',
        *(label for label, _ in reference_models),
        'MLP (786 features)',
    ],
    'RMSE': [
        baseline_rmse,
        *(
            np.sqrt(mean_squared_error(y_test, np.clip(pred, 1, 5)))
            for _, pred in reference_models
        ),
        mlp_rmse_test,
    ],
}

df_comp = pd.DataFrame(comparison)
df_comp['Improvement'] = (baseline_rmse - df_comp['RMSE']) / baseline_rmse * 100

rule = "="*70
print("\n" + rule)
print(f"{'Model':<30} {'RMSE':<10} {'Improvement':<15}")
print(rule)

# The baseline row shows '-' instead of an improvement and is followed by a
# separator line; every other row shows its signed percentage.
for model_name, rmse, gain in zip(df_comp['Model'], df_comp['RMSE'], df_comp['Improvement']):
    if 'BASELINE' not in model_name:
        print(f"{model_name:<30} {rmse:<10.4f} {gain:+.2f}%")
        continue
    print(f"{model_name:<30} {rmse:<10.4f} {'-':<15}")
    print("-"*70)

print(rule)

# MLP rank = 1 + number of reference models (table rows 1..3) with a strictly
# lower RMSE than the MLP.
better_models = (df_comp['RMSE'].iloc[1:4] < mlp_rmse_test).sum()
mlp_rank = better_models + 1
print(f"\n MLP: {mlp_rank} место из 4 моделей")

# ============================================================================
# [6] STACKING
# ============================================================================

print("\n[6] STACKING С MLP...")

from sklearn.linear_model import Ridge

# One column per base model, holding its raw (unclipped) test-set predictions.
X_meta = np.column_stack([catboost_pred, lgb_pred, knn_pred, mlp_pred_test])

# NOTE(review): the meta-model is fit on labels taken from the test set and
# scored on the other 50% of that same set, while baseline_rmse is loaded from
# baseline_artifacts (presumably computed on the full test set - confirm).
# Treat the stacking figures as indicative; a cleaner setup would fit the
# meta-model on out-of-fold / validation predictions of every base model.
X_meta_train, X_meta_val, y_meta_train, y_meta_val = train_test_split(
    X_meta,
    y_test,
    test_size=0.5,
    random_state=42,
)

meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta_train, y_meta_train)

# Score the held-out half; as elsewhere, metrics use predictions clipped to [1, 5].
stacking_pred = meta_model.predict(X_meta_val)
stacking_pred_clipped = np.clip(stacking_pred, 1, 5)

stacking_rmse = np.sqrt(mean_squared_error(y_meta_val, stacking_pred_clipped))
stacking_mae = mean_absolute_error(y_meta_val, stacking_pred_clipped)
stacking_improvement = (baseline_rmse - stacking_rmse) / baseline_rmse * 100

# Coefficients are reported in the same order as the columns of X_meta.
print(f"\n Коэффициенты Ridge:")
for base_label, weight in zip(('CatBoost', 'LightGBM', 'KNN', 'MLP'), meta_model.coef_):
    print(f"   {base_label}: {weight:+.4f}")

print(f"\n Stacking:")
print(f"   RMSE: {stacking_rmse:.4f}")
print(f"   MAE: {stacking_mae:.4f}")
print(f"   Улучшение: {stacking_improvement:+.2f}%")

# ============================================================================
# [7] SAVING
# ============================================================================

print("\n[7] СОХРАНЕНИЕ...")

# The fitted network is stored together with the scaler its inputs require.
mlp_bundle = {'model': mlp, 'scaler': scaler_nn}
with open('mlp_model_786.pkl', 'wb') as fh:
    pickle.dump(mlp_bundle, fh)

# Raw (unclipped) predictions plus the test targets and the test RMSE.
# np.save stores the dict as a pickled object array, so reading it back
# needs np.load(..., allow_pickle=True).item().
mlp_outputs = {
    'test': mlp_pred_test,
    'validation': mlp_pred_val,
    'y_test': y_test,
    'rmse': mlp_rmse_test
}
np.save('mlp_predictions_786.npy', mlp_outputs)

# Ridge meta-model together with the RMSE it reached on its held-out half.
stacking_bundle = {'model': meta_model, 'rmse': stacking_rmse}
with open('stacking_with_mlp_786.pkl', 'wb') as fh:
    pickle.dump(stacking_bundle, fh)

for saved_name in ('mlp_model_786.pkl', 'mlp_predictions_786.npy', 'stacking_with_mlp_786.pkl'):
    print(f" Сохранено: {saved_name}")

# Closing summary of the two headline results.
banner = "="*80
print("\n" + banner)
print("ГОТОВО!")
print(banner)
print(f"\n MLP (786 признаков): {mlp_rmse_test:.4f} ({mlp_improvement:+.2f}%)")
print(f" Stacking: {stacking_rmse:.4f} ({stacking_improvement:+.2f}%)")
print(banner)


MLP

[1] ЗАГРУЗКА ДАННЫХ...
 Train: 874,496 × 21
 Test: 107,260 × 21

 Загрузка embeddings...
 Train book embeddings: (874496, 768)
 Test book embeddings: (107260, 768)
 Baseline RMSE: 0.8104

[2] СОЗДАНИЕ ПРИЗНАКОВ...
 Базовых признаков: 18
 Embedding признаков: 768
 ВСЕГО: 786

 X_train: (874496, 786)
 X_test: (107260, 786)

[3] ОБУЧЕНИЕ MLP...
 NN train: (743321, 786)
 NN val: (131175, 786)
 Нормализация завершена

 Архитектура: Input(786) → Dense(128) → Dense(64) → Output(1)
Iteration 1, loss = 0.33763621
Validation score: 0.374129
Iteration 2, loss = 0.29842484
Validation score: 0.381988
Iteration 3, loss = 0.29125489
Validation score: 0.401923
Iteration 4, loss = 0.28682646
Validation score: 0.405178
Iteration 5, loss = 0.28406278
Validation score: 0.405419
Iteration 6, loss = 0.28163529
Validation score: 0.407705
Iteration 7, loss = 0.27911023
Validation score: 0.411542
Iteration 8, loss = 0.27705440
Validation score: 0.414508
Iteration 9, loss = 0.27529693
Validation score: 0.4