In [None]:
# nto25_notebook.ipynb

# %% [markdown]
# # NTO25 ML Competition Baseline
# Full pipeline for the machine learning competition

# %%
# Install dependencies (uncomment if needed)
# !pip install lightgbm pandas numpy scikit-learn tqdm joblib pyarrow

# %%
import sys
import time
from pathlib import Path
from typing import Any

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm

# %%
# =============================================================================
# CONSTANTS
# =============================================================================

# --- FILES ---
# Absolute paths carried over from the original code.
TRAIN_FILENAME = "/home/evstigneva/nto252/mix/train.csv"
TEST_FILENAME = "/home/evstigneva/nto252/test.csv" 
USER_DATA_FILENAME = "/home/evstigneva/nto252/mix/users.csv"
BOOK_DATA_FILENAME = "/home/evstigneva/nto252/mix/books.csv"
BOOK_GENRES_FILENAME = "/home/evstigneva/nto252/mix/book_genres.csv"
GENRES_FILENAME = "/home/evstigneva/nto252/mix/genres.csv"
BOOK_DESCRIPTIONS_FILENAME = "/home/evstigneva/nto252/mix/book_descriptions.csv"
# NOTE: this is an absolute path; joining an absolute path onto a directory
# with the pathlib `/` operator yields the absolute path unchanged.
SUBMISSION_FILENAME = "/home/evstigneva/nto252/submission30.11.25_7.csv"
# Bare file names (no directory component).
TFIDF_VECTORIZER_FILENAME = "tfidf_vectorizer.pkl"
PROCESSED_DATA_FILENAME = "processed_features.parquet"

# --- COLUMN NAMES ---
COL_USER_ID = "user_id"
COL_BOOK_ID = "book_id"
COL_TARGET = "rating"
COL_SOURCE = "source"
COL_PREDICTION = "rating_predict"
COL_HAS_READ = "has_read"
COL_TIMESTAMP = "timestamp"

# Engineered feature columns (new)
F_USER_MEAN_RATING = "user_mean_rating"
F_USER_RATINGS_COUNT = "user_ratings_count"
F_BOOK_MEAN_RATING = "book_mean_rating"
F_BOOK_RATINGS_COUNT = "book_ratings_count"
F_AUTHOR_MEAN_RATING = "author_mean_rating"
F_BOOK_GENRES_COUNT = "book_genres_count"

# Metadata columns coming from the raw data
COL_GENDER = "gender"
COL_AGE = "age"
COL_AUTHOR_ID = "author_id"
COL_PUBLICATION_YEAR = "publication_year"
COL_LANGUAGE = "language"
COL_PUBLISHER = "publisher"
COL_AVG_RATING = "avg_rating"
COL_GENRE_ID = "genre_id"
COL_DESCRIPTION = "description"

# --- VALUES ---
# Markers written into the COL_SOURCE column to tell train rows from test rows.
VAL_SOURCE_TRAIN = "train"
VAL_SOURCE_TEST = "test"

# --- MAGIC NUMBERS ---
# Sentinels for missing categorical / numeric values.
MISSING_CAT_VALUE = "-1"
MISSING_NUM_VALUE = -1
# Lower and upper bounds of the rating range.
PREDICTION_MIN_VALUE = 0
PREDICTION_MAX_VALUE = 10

# %%
# =============================================================================
# CONFIGURATION
# =============================================================================

# Determine the root directory
ROOT_DIR = Path("/home/evstigneva/nto25/baseline").resolve()
DATA_DIR = ROOT_DIR / "data"
RAW_DATA_DIR = Path("/home/evstigneva/Zagr")  # Directory with the source data
INTERIM_DATA_DIR = DATA_DIR / "interim" 
PROCESSED_DATA_DIR = DATA_DIR / "processed"
OUTPUT_DIR = ROOT_DIR / "output"
MODEL_DIR = OUTPUT_DIR / "models"
SUBMISSION_DIR = OUTPUT_DIR / "submissions"

# Create the required directories at import time.
# NOTE: INTERIM_DATA_DIR is not in this list, so it is not created here;
# OUTPUT_DIR is created implicitly because parents=True.
for dir_path in [DATA_DIR, PROCESSED_DATA_DIR, MODEL_DIR, SUBMISSION_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)

# Print a summary of the directories in use.
print("‚úÖ –î–∏—Ä–µ–∫—Ç–æ—Ä–∏–∏ —Å–æ–∑–¥–∞–Ω—ã:")
print(f"   - –î–∞–Ω–Ω—ã–µ: {DATA_DIR}")
print(f"   - –ú–æ–¥–µ–ª–∏: {MODEL_DIR}")
print(f"   - –†–µ–∑—É–ª—å—Ç–∞—Ç—ã: {SUBMISSION_DIR}")
print(f"   - –ò—Å—Ö–æ–¥–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ: {RAW_DATA_DIR}")

# –ü—Ä–æ–≤–µ—Ä—è–µ–º –Ω–∞–ª–∏—á–∏–µ —Ñ–∞–π–ª–æ–≤ –¥–∞–Ω–Ω—ã—Ö
print("\nüîç –ü—Ä–æ–≤–µ—Ä–∫–∞ —Ñ–∞–π–ª–æ–≤ –¥–∞–Ω–Ω—ã—Ö:")
data_files = [
    TRAIN_FILENAME,
    TEST_FILENAME,
    USER_DATA_FILENAME,
    BOOK_DATA_FILENAME,
    BOOK_GENRES_FILENAME,
    GENRES_FILENAME,
    BOOK_DESCRIPTIONS_FILENAME,
]

# Report, for every expected input file, whether it is present on disk.
for file_path in data_files:
    if not Path(file_path).exists():
        print(f"   ‚ùå {Path(file_path).name} - –ù–ï –ù–ê–ô–î–ï–ù")
        continue
    print(f"   ‚úÖ {Path(file_path).name} - –Ω–∞–π–¥–µ–Ω")

# --- PARAMETERS ---
RANDOM_STATE = 42
TARGET = COL_TARGET

# --- TEMPORAL SPLIT CONFIGURATION ---
# Fraction of the time-ordered training rows used to pick the split date.
TEMPORAL_SPLIT_RATIO = 0.8

# --- TRAINING CONFIGURATION ---
EARLY_STOPPING_ROUNDS = 50
MODEL_FILENAME = "lgb_model.txt"

# --- TF-IDF PARAMETERS ---
TFIDF_MAX_FEATURES = 500
TFIDF_MIN_DF = 2
TFIDF_MAX_DF = 0.95
TFIDF_NGRAM_RANGE = (1, 2)

# --- FEATURES ---
# Columns treated as categorical features.
CAT_FEATURES = [
    COL_USER_ID,
    COL_BOOK_ID,
    COL_GENDER,
    COL_AGE,
    COL_AUTHOR_ID,
    COL_PUBLICATION_YEAR,
    COL_LANGUAGE,
    COL_PUBLISHER,
]

# --- MODEL PARAMETERS ---
# Keyword arguments for the LightGBM regressor.
LGB_PARAMS = {
    "objective": "rmse",
    "metric": "rmse",
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "num_leaves": 31,
    "verbose": -1,
    "n_jobs": -1,
    "seed": RANDOM_STATE,
    "boosting_type": "gbdt",
}

# Arguments for the fit() call; "callbacks" is an empty placeholder here.
LGB_FIT_PARAMS = {
    "eval_metric": "rmse",
    "callbacks": [],
}

# %%
# =============================================================================
# DATA PROCESSING FUNCTIONS
# =============================================================================

def load_and_merge_data():
    """Load the raw CSV files and merge them into one combined DataFrame.

    Training interactions are restricted to rows with ``has_read == 1``,
    tagged with a ``source`` column, concatenated with the test rows and then
    left-joined with the user and book metadata.

    Returns:
        tuple: ``(combined_df, book_genres_df, genres_df,
        book_descriptions_df)``. ``combined_df`` holds the train and test rows
        together with user/book metadata; the other three frames are the
        corresponding CSV files as loaded.
    """
    print("–ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...")

    # Explicit dtypes keep memory usage down while reading the CSV files.
    dtype_spec = {
        COL_USER_ID: "int32",
        COL_BOOK_ID: "int32",
        COL_TARGET: "float32",
        COL_GENDER: "category",
        COL_AGE: "float32",
        COL_AUTHOR_ID: "int32",
        COL_PUBLICATION_YEAR: "float32",
        COL_LANGUAGE: "category",
        COL_PUBLISHER: "category",
        COL_AVG_RATING: "float32",
        COL_GENRE_ID: "int16",
    }

    def _dtypes_for(*columns):
        """Return the ``dtype_spec`` entries for the given columns."""
        return {column: dtype_spec[column] for column in columns}

    train_df = pd.read_csv(
        TRAIN_FILENAME,
        dtype=_dtypes_for(COL_USER_ID, COL_BOOK_ID, COL_TARGET),
        parse_dates=[COL_TIMESTAMP],
    )

    # Keep only training rows for books that were actually read (has_read=1).
    initial_count = len(train_df)
    train_df = train_df[train_df[COL_HAS_READ] == 1].copy()
    filtered_count = len(train_df)
    print(f"–û—Ç—Ñ–∏–ª—å—Ç—Ä–æ–≤–∞–Ω–æ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö: {initial_count} -> {filtered_count} —Å—Ç—Ä–æ–∫ (—Ç–æ–ª—å–∫–æ has_read=1)")

    test_df = pd.read_csv(
        TEST_FILENAME,
        dtype=_dtypes_for(COL_USER_ID, COL_BOOK_ID),
    )
    user_data_df = pd.read_csv(
        USER_DATA_FILENAME,
        dtype=_dtypes_for(COL_USER_ID, COL_GENDER, COL_AGE),
    )
    book_data_df = pd.read_csv(
        BOOK_DATA_FILENAME,
        dtype=_dtypes_for(
            COL_BOOK_ID,
            COL_AUTHOR_ID,
            COL_PUBLICATION_YEAR,
            COL_LANGUAGE,
            COL_AVG_RATING,
            COL_PUBLISHER,
        ),
    )
    book_genres_df = pd.read_csv(
        BOOK_GENRES_FILENAME,
        dtype=_dtypes_for(COL_BOOK_ID, COL_GENRE_ID),
    )
    genres_df = pd.read_csv(GENRES_FILENAME)
    book_descriptions_df = pd.read_csv(
        BOOK_DESCRIPTIONS_FILENAME,
        dtype={COL_BOOK_ID: "int32"},
    )

    print("–î–∞–Ω–Ω—ã–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã. –û–±—ä–µ–¥–∏–Ω–µ–Ω–∏–µ –¥–∞—Ç–∞—Å–µ—Ç–æ–≤...")

    # Stack train and test, remembering where every row came from.
    train_df[COL_SOURCE] = VAL_SOURCE_TRAIN
    test_df[COL_SOURCE] = VAL_SOURCE_TEST
    combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)

    # De-duplicate the lookup tables before the left joins: a repeated key on
    # the right-hand side would multiply train/test rows and break the
    # one-row-per-test-pair expectation of the submission.
    user_data_df = user_data_df.drop_duplicates(subset=[COL_USER_ID])
    combined_df = combined_df.merge(user_data_df, on=COL_USER_ID, how="left")

    book_data_df = book_data_df.drop_duplicates(subset=[COL_BOOK_ID])
    combined_df = combined_df.merge(book_data_df, on=COL_BOOK_ID, how="left")

    print(f"–†–∞–∑–º–µ—Ä –æ–±—ä–µ–¥–∏–Ω–µ–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö: {combined_df.shape}")
    return combined_df, book_genres_df, genres_df, book_descriptions_df

# %%
# =============================================================================
# FEATURE FUNCTIONS
# =============================================================================

def add_aggregate_features(df, train_df):
    """Attach user, book and author rating aggregates to ``df``.

    The aggregates are computed from ``train_df`` only and left-joined onto
    ``df``, so rows whose user/book/author does not occur in ``train_df`` get
    NaN in the new columns.

    Args:
        df: Frame to enrich; must contain the user, book and author id columns.
        train_df: Frame the aggregates are computed from; must contain the
            same id columns plus the target column.

    Returns:
        A new DataFrame with the user mean/count, book mean/count and author
        mean rating columns added.
    """
    print("–î–æ–±–∞–≤–ª–µ–Ω–∏–µ –∞–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö —Ñ–∏—á...")

    # observed=True: the key columns may be categorical. Without it pandas
    # emits one group per *category* (including ones absent from train_df)
    # and the default is deprecated, so the result would depend on the
    # installed pandas version.
    user_agg = (
        train_df.groupby(COL_USER_ID, observed=True)[TARGET]
        .agg(["mean", "count"])
        .reset_index()
    )
    user_agg.columns = [
        COL_USER_ID,
        F_USER_MEAN_RATING,
        F_USER_RATINGS_COUNT,
    ]

    book_agg = (
        train_df.groupby(COL_BOOK_ID, observed=True)[TARGET]
        .agg(["mean", "count"])
        .reset_index()
    )
    book_agg.columns = [
        COL_BOOK_ID,
        F_BOOK_MEAN_RATING,
        F_BOOK_RATINGS_COUNT,
    ]

    author_agg = (
        train_df.groupby(COL_AUTHOR_ID, observed=True)[TARGET]
        .agg(["mean"])
        .reset_index()
    )
    author_agg.columns = [COL_AUTHOR_ID, F_AUTHOR_MEAN_RATING]

    # Left joins keep every row of df, in its original order.
    df = df.merge(user_agg, on=COL_USER_ID, how="left")
    df = df.merge(book_agg, on=COL_BOOK_ID, how="left")
    return df.merge(author_agg, on=COL_AUTHOR_ID, how="left")

def add_genre_features(df, book_genres_df):
    """Add the number of genres listed for each book.

    Args:
        df: Frame to enrich; must contain the book id column.
        book_genres_df: Book-to-genre link table (book id, genre id).

    Returns:
        A new DataFrame with the genre-count column left-joined on book id
        (NaN for books that have no rows in ``book_genres_df``).
    """
    print("–î–æ–±–∞–≤–ª–µ–Ω–∏–µ —Ñ–∏—á –∂–∞–Ω—Ä–æ–≤...")
    genres_per_book = book_genres_df.groupby(COL_BOOK_ID)[COL_GENRE_ID].count()
    genre_counts = genres_per_book.rename(F_BOOK_GENRES_COUNT).reset_index()
    return df.merge(genre_counts, on=COL_BOOK_ID, how="left")

def add_text_features(df, train_df, descriptions_df):
    """Add TF-IDF features built from the book descriptions.

    A previously saved vectorizer is reused when one exists in the model
    directory; otherwise a new one is fitted on the descriptions of books
    that occur in ``train_df`` and saved there.

    Args:
        df: Frame to enrich; must contain the book id column.
        train_df: Training rows; only their book ids are used, to select the
            descriptions the vectorizer is fitted on.
        descriptions_df: Table with book id and description columns.

    Returns:
        A new DataFrame (index reset to 0..n-1) consisting of ``df`` plus one
        ``tfidf_<i>`` column per vocabulary term.
    """
    print("–î–æ–±–∞–≤–ª–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–æ–≤—ã—Ö —Ñ–∏—á (TF-IDF)...")

    vectorizer_path = MODEL_DIR / TFIDF_VECTORIZER_FILENAME

    # Descriptions for every book, with missing text replaced by "".
    all_descriptions = descriptions_df[[COL_BOOK_ID, COL_DESCRIPTION]].copy()
    all_descriptions[COL_DESCRIPTION] = all_descriptions[COL_DESCRIPTION].fillna("")

    if vectorizer_path.exists():
        # NOTE: joblib.load unpickles the file; only load vectorizers that
        # this pipeline wrote itself.
        print(f"–ó–∞–≥—Ä—É–∑–∫–∞ —Å—É—â–µ—Å—Ç–≤—É—é—â–µ–≥–æ –≤–µ–∫—Ç–æ—Ä–∞–π–∑–µ—Ä–∞ –∏–∑ {vectorizer_path}")
        vectorizer = joblib.load(vectorizer_path)
    else:
        # The train-only subset is needed for fitting only, so build it here
        # rather than on every call.
        train_books = train_df[COL_BOOK_ID].unique()
        train_descriptions = all_descriptions[all_descriptions[COL_BOOK_ID].isin(train_books)]

        print("–û–±—É—á–µ–Ω–∏–µ TF-IDF –≤–µ–∫—Ç–æ—Ä–∞–π–∑–µ—Ä–∞ –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –æ–ø–∏—Å–∞–Ω–∏—è—Ö...")
        vectorizer = TfidfVectorizer(
            max_features=TFIDF_MAX_FEATURES,
            min_df=TFIDF_MIN_DF,
            max_df=TFIDF_MAX_DF,
            ngram_range=TFIDF_NGRAM_RANGE,
        )
        vectorizer.fit(train_descriptions[COL_DESCRIPTION])
        # Persist the fitted vectorizer so later runs can reuse it.
        joblib.dump(vectorizer, vectorizer_path)
        print(f"–í–µ–∫—Ç–æ—Ä–∞–π–∑–µ—Ä —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ {vectorizer_path}")

    # book_id -> description lookup (for a repeated book id the last row wins).
    description_map = dict(
        zip(all_descriptions[COL_BOOK_ID], all_descriptions[COL_DESCRIPTION], strict=False)
    )

    # One description per row of df, in row order; unknown books get "".
    df_descriptions = df[COL_BOOK_ID].map(description_map).fillna("")

    tfidf_matrix = vectorizer.transform(df_descriptions)

    # Densify the sparse matrix into named columns.
    tfidf_feature_names = [f"tfidf_{i}" for i in range(tfidf_matrix.shape[1])]
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)

    # Both sides get a fresh 0..n-1 index so the concat aligns by position.
    df_with_tfidf = pd.concat([df.reset_index(drop=True), tfidf_df], axis=1)

    print(f"–î–æ–±–∞–≤–ª–µ–Ω–æ {len(tfidf_feature_names)} TF-IDF —Ñ–∏—á.")
    return df_with_tfidf

def handle_missing_values(df, train_df):
    """Fill missing values in ``df`` using a fixed per-column strategy.

    ``df`` is modified in place and also returned.

    Strategy:
        * age -> median of ``df``'s own age column;
        * user/book/author mean rating and ``avg_rating`` -> mean target of
          ``train_df``;
        * rating counts, genre count and ``tfidf_*`` columns -> 0;
        * categorical/object columns in ``CAT_FEATURES`` ->
          ``MISSING_CAT_VALUE``;
        * numeric columns in ``CAT_FEATURES`` -> ``MISSING_NUM_VALUE``.

    Args:
        df: Frame to clean; must contain the age, ``avg_rating`` and genre
            count columns. Aggregate columns are filled only if present.
        train_df: Frame whose target mean is used as the fallback rating.

    Returns:
        The same ``df`` object with missing values filled.
    """
    print("–û–±—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π...")

    # Global mean rating of the training data: the cold-start fallback.
    global_mean = train_df[TARGET].mean()

    df[COL_AGE] = df[COL_AGE].fillna(df[COL_AGE].median())

    # Aggregates are optional: they are absent when features were built
    # without them.
    for col in (F_USER_MEAN_RATING, F_BOOK_MEAN_RATING, F_AUTHOR_MEAN_RATING):
        if col in df.columns:
            df[col] = df[col].fillna(global_mean)
    for col in (F_USER_RATINGS_COUNT, F_BOOK_RATINGS_COUNT):
        if col in df.columns:
            df[col] = df[col].fillna(0)

    df[COL_AVG_RATING] = df[COL_AVG_RATING].fillna(global_mean)
    df[F_BOOK_GENRES_COUNT] = df[F_BOOK_GENRES_COUNT].fillna(0)

    # Books without a description get all-zero TF-IDF features.
    tfidf_cols = [col for col in df.columns if col.startswith("tfidf_")]
    if tfidf_cols:
        df[tfidf_cols] = df[tfidf_cols].fillna(0.0)

    for col in CAT_FEATURES:
        if col not in df.columns:
            continue
        missing = df[col].isna()
        if not missing.any():
            continue
        if df[col].dtype.name in ("category", "object"):
            # Replace missing entries BEFORE converting to str: astype(str)
            # turns NaN into the literal "nan", after which fillna() has
            # nothing left to fill.
            filled = df[col].astype(object).where(~missing, MISSING_CAT_VALUE)
            df[col] = filled.astype(str).astype("category")
        elif pd.api.types.is_numeric_dtype(df[col].dtype):
            df[col] = df[col].fillna(MISSING_NUM_VALUE)

    return df

def create_features(df, book_genres_df, descriptions_df, include_aggregates=False):
    """Run the full feature-engineering pipeline on the combined frame.

    Args:
        df: Combined train+test frame with a ``source`` column.
        book_genres_df: Book-to-genre link table.
        descriptions_df: Book descriptions table.
        include_aggregates: When true, also add the rating aggregates computed
            from the train rows of ``df``.

    Returns:
        The feature frame, with every present ``CAT_FEATURES`` column cast to
        the pandas ``category`` dtype.
    """
    print("–ó–∞–ø—É—Å–∫ –ø–∞–π–ø–ª–∞–π–Ω–∞ –∏–Ω–∂–µ–Ω–µ—Ä–∏–∏ —Ñ–∏—á...")
    is_train_row = df[COL_SOURCE] == VAL_SOURCE_TRAIN
    train_df = df[is_train_row].copy()

    # Aggregates are normally computed later, at training time.
    featured = add_aggregate_features(df, train_df) if include_aggregates else df

    featured = add_genre_features(featured, book_genres_df)
    featured = add_text_features(featured, train_df, descriptions_df)
    featured = handle_missing_values(featured, train_df)

    # LightGBM picks up categorical features from the pandas 'category' dtype.
    present_cat_features = [name for name in CAT_FEATURES if name in featured.columns]
    for name in present_cat_features:
        featured[name] = featured[name].astype("category")

    print("–ò–Ω–∂–µ–Ω–µ—Ä–∏—è —Ñ–∏—á –∑–∞–≤–µ—Ä—à–µ–Ω–∞.")
    return featured

# %%
# =============================================================================
# TEMPORAL SPLIT FUNCTIONS
# =============================================================================

def temporal_split_by_date(df, split_date, timestamp_col=COL_TIMESTAMP):
    """Build train/validation boolean masks around an absolute date.

    Rows with ``timestamp <= split_date`` go to train, rows with
    ``timestamp > split_date`` go to validation. ``df`` is not modified.

    Args:
        df: Frame containing ``timestamp_col``.
        split_date: Cut-off timestamp.
        timestamp_col: Name of the timestamp column.

    Returns:
        tuple: ``(train_mask, val_mask)`` boolean Series indexed like ``df``.

    Raises:
        ValueError: If the column is missing, either side of the split is
            empty, or the validation timestamps are not strictly after the
            train timestamps.
    """
    if timestamp_col not in df.columns:
        raise ValueError(
            f"–ö–æ–ª–æ–Ω–∫–∞ —Å –≤—Ä–µ–º–µ–Ω–Ω—ã–º–∏ –º–µ—Ç–∫–∞–º–∏ '{timestamp_col}' –Ω–µ –Ω–∞–π–¥–µ–Ω–∞ –≤ DataFrame. –î–æ—Å—Ç—É–ø–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏: {df.columns.tolist()}"
        )

    # Work on the timestamp column only; parse it if it is not datetime yet.
    timestamps = df[timestamp_col]
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        timestamps = pd.to_datetime(timestamps)

    train_mask = timestamps <= split_date
    val_mask = timestamps > split_date

    if not train_mask.any():
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω–æ –∑–∞–ø–∏—Å–µ–π —Å –≤—Ä–µ–º–µ–Ω–Ω–æ–π –º–µ—Ç–∫–æ–π <= {split_date}. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ split_date.")

    if not val_mask.any():
        raise ValueError(f"–ù–µ –Ω–∞–π–¥–µ–Ω–æ –∑–∞–ø–∏—Å–µ–π —Å –≤—Ä–µ–º–µ–Ω–Ω–æ–π –º–µ—Ç–∫–æ–π > {split_date}. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ split_date.")

    # Extra safety check: every validation timestamp must come after every
    # train timestamp (both sides are known to be non-empty here).
    max_train_timestamp = timestamps[train_mask].max()
    min_val_timestamp = timestamps[val_mask].min()
    if min_val_timestamp <= max_train_timestamp:
        raise ValueError(
            f"–í–∞–ª–∏–¥–∞—Ü–∏—è –≤—Ä–µ–º–µ–Ω–Ω–æ–≥–æ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏—è –Ω–µ —É–¥–∞–ª–∞—Å—å: –º–∏–Ω–∏–º–∞–ª—å–Ω–∞—è –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—Ä–µ–º–µ–Ω–Ω–∞—è –º–µ—Ç–∫–∞ ({min_val_timestamp}) "
            f"–Ω–µ –±–æ–ª—å—à–µ –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–π —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–π –≤—Ä–µ–º–µ–Ω–Ω–æ–π –º–µ—Ç–∫–∏ ({max_train_timestamp}). "
            "–≠—Ç–æ —É–∫–∞–∑—ã–≤–∞–µ—Ç –Ω–∞ –ø—Ä–æ–±–ª–µ–º—É —Ü–µ–ª–æ—Å—Ç–Ω–æ—Å—Ç–∏ –¥–∞–Ω–Ω—ã—Ö."
        )

    return train_mask, val_mask

def get_split_date_from_ratio(df, ratio, timestamp_col=COL_TIMESTAMP):
    """Return the timestamp located ``ratio`` of the way through the data.

    The non-null timestamps are sorted and the value at position
    ``int(n * ratio)`` is returned. ``df`` is not modified.

    Args:
        df: Frame containing ``timestamp_col``.
        ratio: Fraction of the data meant for training; must satisfy
            ``0 < ratio < 1``.
        timestamp_col: Name of the timestamp column.

    Returns:
        The timestamp to use as the split date.

    Raises:
        ValueError: If ``ratio`` is out of range, the column is missing, or
            the column holds no non-null timestamps.
    """
    if not 0 < ratio < 1:
        raise ValueError(f"–°–æ–æ—Ç–Ω–æ—à–µ–Ω–∏–µ –¥–æ–ª–∂–Ω–æ –±—ã—Ç—å –º–µ–∂–¥—É 0 –∏ 1, –ø–æ–ª—É—á–µ–Ω–æ {ratio}")

    if timestamp_col not in df.columns:
        raise ValueError(f"–ö–æ–ª–æ–Ω–∫–∞ —Å –≤—Ä–µ–º–µ–Ω–Ω—ã–º–∏ –º–µ—Ç–∫–∞–º–∏ '{timestamp_col}' –Ω–µ –Ω–∞–π–¥–µ–Ω–∞ –≤ DataFrame.")

    # Parse the column if needed, without copying the whole frame.
    timestamps = df[timestamp_col]
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        timestamps = pd.to_datetime(timestamps)

    # Missing timestamps sort last and would otherwise shift the threshold
    # (or be returned as the split date), so leave them out.
    sorted_timestamps = timestamps.dropna().sort_values()
    if sorted_timestamps.empty:
        raise ValueError(
            f"No non-null timestamps found in column '{timestamp_col}'; cannot compute a split date."
        )

    # Clamp so floating-point rounding can never index past the end.
    threshold_index = min(int(len(sorted_timestamps) * ratio), len(sorted_timestamps) - 1)

    return sorted_timestamps.iloc[threshold_index]

# %%
# =============================================================================
# MAIN PIPELINE FUNCTIONS
# =============================================================================

def prepare_data():
    """Process the raw data and save the prepared features as parquet.

    Returns:
        The feature DataFrame that was written to the processed-data file.

    Raises:
        FileNotFoundError: If any required input file is missing.
    """
    banner = "=" * 60
    print(banner)
    print("–ü–∞–π–ø–ª–∞–π–Ω –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∏ –¥–∞–Ω–Ω—ã—Ö")
    print(banner)

    # Refuse to start unless every input file is present.
    required_files = (
        TRAIN_FILENAME,
        TEST_FILENAME,
        USER_DATA_FILENAME,
        BOOK_DATA_FILENAME,
        BOOK_GENRES_FILENAME,
        GENRES_FILENAME,
        BOOK_DESCRIPTIONS_FILENAME,
    )
    missing_files = [name for name in required_files if not Path(name).exists()]
    if missing_files:
        raise FileNotFoundError(f"–û—Ç—Å—É—Ç—Å—Ç–≤—É—é—Ç –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã–µ —Ñ–∞–π–ª—ã: {missing_files}")

    # Load and merge the raw data; the genres table is not needed here.
    merged_df, book_genres_df, _, descriptions_df = load_and_merge_data()

    # Feature engineering WITHOUT aggregates.
    featured_df = create_features(
        merged_df,
        book_genres_df,
        descriptions_df,
        include_aggregates=False,
    )

    # Persist as parquet.
    processed_path = PROCESSED_DATA_DIR / PROCESSED_DATA_FILENAME
    print(f"\n–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö –≤ {processed_path}...")
    featured_df.to_parquet(processed_path, index=False, engine="pyarrow", compression="snappy")
    print("–û–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã!")

    # Summary statistics.
    source_column = featured_df[COL_SOURCE]
    train_rows = int((source_column == VAL_SOURCE_TRAIN).sum())
    test_rows = int((source_column == VAL_SOURCE_TEST).sum())
    total_features = len(featured_df.columns)

    print("\n–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∑–∞–≤–µ—Ä—à–µ–Ω–∞!")
    print(f"  - –¢—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã–µ —Å—Ç—Ä–æ–∫–∏: {train_rows:,}")
    print(f"  - –¢–µ—Å—Ç–æ–≤—ã–µ —Å—Ç—Ä–æ–∫–∏: {test_rows:,}")
    print(f"  - –í—Å–µ–≥–æ —Ñ–∏—á: {total_features}")
    print(f"  - –í—ã—Ö–æ–¥–Ω–æ–π —Ñ–∞–π–ª: {processed_path}")

    return featured_df

def train():
    """Train the LightGBM model using a temporal train/validation split.

    Loads the prepared features, splits the train rows by time, computes the
    rating aggregates from the earlier part only, fits the regressor with
    early stopping on the later part and saves the booster to the model
    directory.

    Returns:
        tuple: ``(model, rmse, mae)`` - the fitted regressor and its
        validation RMSE and MAE.

    Raises:
        FileNotFoundError: If the processed-data file does not exist.
        ValueError: If the timestamp column is missing or the temporal split
            is inconsistent.
    """
    processed_path = PROCESSED_DATA_DIR / PROCESSED_DATA_FILENAME
    if not processed_path.exists():
        raise FileNotFoundError(
            f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ –Ω–µ –Ω–∞–π–¥–µ–Ω—ã –≤ {processed_path}. "
            "–ü–æ–∂–∞–ª—É–π—Å—Ç–∞, —Å–Ω–∞—á–∞–ª–∞ –∑–∞–ø—É—Å—Ç–∏—Ç–µ prepare_data()."
        )

    print(f"–ó–∞–≥—Ä—É–∑–∫–∞ –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö –∏–∑ {processed_path}...")
    featured_df = pd.read_parquet(processed_path, engine="pyarrow")
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(featured_df):,} —Å—Ç—Ä–æ–∫ —Å {len(featured_df.columns)} —Ñ–∏—á–∞–º–∏")

    # Only the train rows take part in fitting and validation.
    is_train_row = featured_df[COL_SOURCE] == VAL_SOURCE_TRAIN
    train_set = featured_df[is_train_row].copy()

    if COL_TIMESTAMP not in train_set.columns:
        raise ValueError(
            f"–ö–æ–ª–æ–Ω–∫–∞ —Å –≤—Ä–µ–º–µ–Ω–Ω—ã–º–∏ –º–µ—Ç–∫–∞–º–∏ '{COL_TIMESTAMP}' –Ω–µ –Ω–∞–π–¥–µ–Ω–∞ –≤ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–º –Ω–∞–±–æ—Ä–µ. "
            "–£–±–µ–¥–∏—Ç–µ—Å—å, —á—Ç–æ –¥–∞–Ω–Ω—ã–µ –±—ã–ª–∏ –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω—ã —Å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ–º –≤—Ä–µ–º–µ–Ω–Ω—ã—Ö –º–µ—Ç–æ–∫."
        )

    # Make sure the timestamp column is a datetime column.
    if not pd.api.types.is_datetime64_any_dtype(train_set[COL_TIMESTAMP]):
        train_set[COL_TIMESTAMP] = pd.to_datetime(train_set[COL_TIMESTAMP])

    # Temporal split.
    print(f"\n–í—ã–ø–æ–ª–Ω–µ–Ω–∏–µ –≤—Ä–µ–º–µ–Ω–Ω–æ–≥–æ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏—è —Å —Å–æ–æ—Ç–Ω–æ—à–µ–Ω–∏–µ–º {TEMPORAL_SPLIT_RATIO}...")
    split_date = get_split_date_from_ratio(train_set, TEMPORAL_SPLIT_RATIO, COL_TIMESTAMP)
    print(f"–î–∞—Ç–∞ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏—è: {split_date}")

    train_mask, val_mask = temporal_split_by_date(train_set, split_date, COL_TIMESTAMP)
    train_split = train_set[train_mask].copy()
    val_split = train_set[val_mask].copy()

    print(f"–¢—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–µ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏–µ: {len(train_split):,} —Å—Ç—Ä–æ–∫")
    print(f"–í–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–µ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏–µ: {len(val_split):,} —Å—Ç—Ä–æ–∫")

    # Double-check that validation lies strictly after train in time.
    max_train_timestamp = train_split[COL_TIMESTAMP].max()
    min_val_timestamp = val_split[COL_TIMESTAMP].min()
    print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–∞—è –≤—Ä–µ–º–µ–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {max_train_timestamp}")
    print(f"–ú–∏–Ω–∏–º–∞–ª—å–Ω–∞—è –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—Ä–µ–º–µ–Ω–Ω–∞—è –º–µ—Ç–∫–∞: {min_val_timestamp}")

    if min_val_timestamp <= max_train_timestamp:
        raise ValueError(
            f"–í–∞–ª–∏–¥–∞—Ü–∏—è –≤—Ä–µ–º–µ–Ω–Ω–æ–≥–æ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏—è –Ω–µ —É–¥–∞–ª–∞—Å—å: –º–∏–Ω–∏–º–∞–ª—å–Ω–∞—è –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—Ä–µ–º–µ–Ω–Ω–∞—è –º–µ—Ç–∫–∞ ({min_val_timestamp}) "
            f"–Ω–µ –±–æ–ª—å—à–µ –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–π —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–π –≤—Ä–µ–º–µ–Ω–Ω–æ–π –º–µ—Ç–∫–∏ ({max_train_timestamp})."
        )
    print("‚úÖ –í–∞–ª–∏–¥–∞—Ü–∏—è –≤—Ä–µ–º–µ–Ω–Ω–æ–≥–æ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏—è –ø—Ä–æ–π–¥–µ–Ω–∞: –≤—Å–µ –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω—ã–µ –≤—Ä–µ–º–µ–Ω–Ω—ã–µ –º–µ—Ç–∫–∏ –ø–æ—Å–ª–µ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö")

    # Aggregates come from the train part only (to avoid leaking validation
    # ratings); both parts are enriched with those same aggregates.
    print("\n–í—ã—á–∏—Å–ª–µ–Ω–∏–µ –∞–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö —Ñ–∏—á —Ç–æ–ª—å–∫–æ –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–º —Ä–∞–∑–¥–µ–ª–µ–Ω–∏–∏...")
    splits_with_agg = [
        add_aggregate_features(part.copy(), train_split)
        for part in (train_split, val_split)
    ]

    # Missing values are filled with statistics of the train part.
    print("–û–±—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π...")
    train_split_final, val_split_final = (
        handle_missing_values(part, train_split) for part in splits_with_agg
    )

    # Features = every column except bookkeeping columns and leftover
    # object-dtype columns, which are not model features.
    excluded = {COL_SOURCE, TARGET, COL_PREDICTION, COL_TIMESTAMP}
    object_cols = set(train_split_final.select_dtypes(include=["object"]).columns)
    features = [
        col
        for col in train_split_final.columns
        if col not in excluded and col not in object_cols
    ]

    X_train, y_train = train_split_final[features], train_split_final[TARGET]
    X_val, y_val = val_split_final[features], val_split_final[TARGET]

    print(f"–¢—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã–µ —Ñ–∏—á–∏: {len(features)}")

    # Fit a single model with early stopping on the validation part.
    print("\n–û–±—É—á–µ–Ω–∏–µ LightGBM –º–æ–¥–µ–ª–∏...")
    model = lgb.LGBMRegressor(**LGB_PARAMS)
    early_stopping = lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=LGB_FIT_PARAMS["eval_metric"],
        callbacks=[early_stopping],
    )

    # Evaluate on the validation part.
    val_preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    mae = mean_absolute_error(y_val, val_preds)
    print(f"\n–í–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω—ã–π RMSE: {rmse:.4f}, MAE: {mae:.4f}")

    # Persist the underlying booster.
    model_path = MODEL_DIR / MODEL_FILENAME
    model.booster_.save_model(str(model_path))
    print(f"–ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ {model_path}")

    print("\n–û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ.")

    return model, rmse, mae

def predict():
    """Generate test-set predictions and write the submission file.

    Loads the prepared features and the saved booster, enriches the test rows
    with rating aggregates computed from all train rows, predicts, clips the
    predictions to the valid rating range and writes them to CSV.

    Returns:
        tuple: ``(submission_df, clipped_preds)`` - the submission frame
        (user id, book id, prediction) and the clipped prediction array.

    Raises:
        FileNotFoundError: If the processed data or the model file is missing.
        ValueError: If the test frame cannot supply the features the model
            was trained on.
    """
    processed_path = PROCESSED_DATA_DIR / PROCESSED_DATA_FILENAME
    if not processed_path.exists():
        raise FileNotFoundError(
            f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ –Ω–µ –Ω–∞–π–¥–µ–Ω—ã –≤ {processed_path}. "
            "–ü–æ–∂–∞–ª—É–π—Å—Ç–∞, —Å–Ω–∞—á–∞–ª–∞ –∑–∞–ø—É—Å—Ç–∏—Ç–µ prepare_data()."
        )

    # Fail fast: check for the trained model before doing any feature work.
    model_path = MODEL_DIR / MODEL_FILENAME
    if not model_path.exists():
        raise FileNotFoundError(
            f"–ú–æ–¥–µ–ª—å –Ω–µ –Ω–∞–π–¥–µ–Ω–∞ –≤ {model_path}. "
            "–ü–æ–∂–∞–ª—É–π—Å—Ç–∞, —Å–Ω–∞—á–∞–ª–∞ –∑–∞–ø—É—Å—Ç–∏—Ç–µ train()."
        )

    print(f"–ó–∞–≥—Ä—É–∑–∫–∞ –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö –∏–∑ {processed_path}...")
    featured_df = pd.read_parquet(processed_path, engine="pyarrow")
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(featured_df):,} —Å—Ç—Ä–æ–∫ —Å {len(featured_df.columns)} —Ñ–∏—á–∞–º–∏")

    # Separate the train rows (source of aggregates) from the test rows.
    train_set = featured_df[featured_df[COL_SOURCE] == VAL_SOURCE_TRAIN].copy()
    test_set = featured_df[featured_df[COL_SOURCE] == VAL_SOURCE_TEST].copy()

    print(f"–¢—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã–π –Ω–∞–±–æ—Ä: {len(train_set):,} —Å—Ç—Ä–æ–∫")
    print(f"–¢–µ—Å—Ç–æ–≤—ã–π –Ω–∞–±–æ—Ä: {len(test_set):,} —Å—Ç—Ä–æ–∫")

    # Aggregates for the test rows are computed from ALL train rows.
    print("\n–í—ã—á–∏—Å–ª–µ–Ω–∏–µ –∞–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö —Ñ–∏—á –Ω–∞ –≤—Å–µ—Ö —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö...")
    test_set_with_agg = add_aggregate_features(test_set.copy(), train_set)

    # Missing values are filled with statistics of the train rows.
    print("–û–±—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏—è...")
    test_set_final = handle_missing_values(test_set_with_agg, train_set)

    # Candidate features derived the same way as at training time: drop the
    # bookkeeping columns and any leftover object-dtype columns.
    excluded = {COL_SOURCE, TARGET, COL_PREDICTION, COL_TIMESTAMP}
    object_cols = set(test_set_final.select_dtypes(include=["object"]).columns)
    features = [
        col
        for col in test_set_final.columns
        if col not in excluded and col not in object_cols
    ]

    print(f"\n–ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –∏–∑ {model_path}...")
    model = lgb.Booster(model_file=str(model_path))

    # Prefer the feature list recorded in the model: it guarantees the same
    # columns in the same order as at training time. If the recorded names
    # cannot all be found but the counts agree, keep the derived list (the
    # previous behaviour); otherwise the frame cannot feed this model.
    trained_features = list(model.feature_name())
    available = set(test_set_final.columns)
    if all(name in available for name in trained_features):
        features = trained_features
    elif len(trained_features) != len(features):
        missing = [name for name in trained_features if name not in available]
        raise ValueError(
            f"Test features do not match the trained model: expected {len(trained_features)} "
            f"features, derived {len(features)}; missing from the test frame: {missing}"
        )

    X_test = test_set_final[features]
    print(f"–§–∏—á–∏ –¥–ª—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è: {len(features)}")

    print("–ì–µ–Ω–µ—Ä–∞—Ü–∏—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π...")
    test_preds = model.predict(X_test)

    # Clip predictions to the valid rating range [0, 10].
    clipped_preds = np.clip(test_preds, PREDICTION_MIN_VALUE, PREDICTION_MAX_VALUE)

    # Build the submission; predictions are assigned positionally, which
    # relies on the left joins above preserving the row order of test_set.
    submission_df = test_set[[COL_USER_ID, COL_BOOK_ID]].copy()
    submission_df[COL_PREDICTION] = clipped_preds

    submission_path = SUBMISSION_DIR / SUBMISSION_FILENAME

    submission_df.to_csv(submission_path, index=False)
    print(f"\n–§–∞–π–ª submission —Å–æ–∑–¥–∞–Ω –≤: {submission_path}")
    print(f"–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è: min={clipped_preds.min():.4f}, max={clipped_preds.max():.4f}, mean={clipped_preds.mean():.4f}")

    return submission_df, clipped_preds

def validate():
    """Validate the structure and format of the submission file.

    Reads the test set from ``TEST_FILENAME`` and the submission from
    ``SUBMISSION_DIR / SUBMISSION_FILENAME`` and runs four checks in order:

    1. both files have the same number of rows;
    2. the ``COL_PREDICTION`` column contains no missing values;
    3. the ``(COL_USER_ID, COL_BOOK_ID)`` pairs are equal, row by row and in
       the same order, to those of the test set;
    4. every prediction lies in
       ``[PREDICTION_MIN_VALUE, PREDICTION_MAX_VALUE]``.

    Progress and failure messages are printed; nothing is raised to the
    caller.

    Returns:
        bool: ``True`` when every check passes, ``False`` when a file is
        missing, a check fails, or any other exception occurs.
    """
    print("–í–∞–ª–∏–¥–∞—Ü–∏—è —Ñ–∞–π–ª–∞ submission...")

    key_cols = [COL_USER_ID, COL_BOOK_ID]

    try:
        # Load the test data and the submission file.
        test_df = pd.read_csv(TEST_FILENAME)
        sub_df = pd.read_csv(SUBMISSION_DIR / SUBMISSION_FILENAME)

        # The checks raise AssertionError explicitly instead of using the
        # `assert` statement: `assert` is removed under `python -O`, which
        # would make every check pass silently.

        # 1. Row counts must match.
        if len(sub_df) != len(test_df):
            raise AssertionError(
                f"–ù–µ—Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–µ –¥–ª–∏–Ω—ã submission. –û–∂–∏–¥–∞–ª–æ—Å—å {len(test_df)}, –ø–æ–ª—É—á–µ–Ω–æ {len(sub_df)}."
            )
        print("‚úÖ –ü—Ä–æ–≤–µ—Ä–∫–∞ –¥–ª–∏–Ω—ã –ø—Ä–æ–π–¥–µ–Ω–∞.")

        # 2. No missing values among the predictions.
        if sub_df[COL_PREDICTION].isna().any():
            raise AssertionError(
                f"–ù–∞–π–¥–µ–Ω—ã –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –≤ '{COL_PREDICTION}'."
            )
        print("‚úÖ –ü—Ä–æ–≤–µ—Ä–∫–∞ –Ω–∞ –æ—Ç—Å—É—Ç—Å—Ç–≤–∏–µ –ø—Ä–æ–ø—É—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π –ø—Ä–æ–π–¥–µ–Ω–∞.")

        # 3. The (user_id, book_id) pairs must match the test set.
        #    Index.equals compares element by element, so row order matters.
        #    set_index returns a new frame, so no defensive copy is needed.
        test_keys = test_df.set_index(key_cols).index
        sub_keys = sub_df.set_index(key_cols).index
        if not test_keys.equals(sub_keys):
            raise AssertionError(
                "–ù–∞–±–æ—Ä –ø–∞—Ä (user_id, book_id) –Ω–µ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å —Ç–µ—Å—Ç–æ–≤—ã–º –Ω–∞–±–æ—Ä–æ–º."
            )
        print("‚úÖ –ü—Ä–æ–≤–µ—Ä–∫–∞ —Å–æ–≤–ø–∞–¥–µ–Ω–∏—è –ø–∞—Ä (user_id, book_id) –ø—Ä–æ–π–¥–µ–Ω–∞.")

        # 4. Predictions must lie inside the allowed range (bounds inclusive).
        in_range = sub_df[COL_PREDICTION].between(
            PREDICTION_MIN_VALUE, PREDICTION_MAX_VALUE
        )
        if not in_range.all():
            raise AssertionError(
                f"–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –Ω–µ –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ [{PREDICTION_MIN_VALUE}, {PREDICTION_MAX_VALUE}]."
            )
        # Report the configured bounds rather than a hard-coded "[0, 10]",
        # so the success and failure messages cannot drift apart.
        print(
            f"‚úÖ –ü—Ä–æ–≤–µ—Ä–∫–∞ –¥–∏–∞–ø–∞–∑–æ–Ω–∞ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π [{PREDICTION_MIN_VALUE}, {PREDICTION_MAX_VALUE}] –ø—Ä–æ–π–¥–µ–Ω–∞."
        )

        print("\n–í–∞–ª–∏–¥–∞—Ü–∏—è —É—Å–ø–µ—à–Ω–∞! –§–∞–π–ª submission –∏–º–µ–µ—Ç –∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–π —Ñ–æ—Ä–º–∞—Ç.")
        return True

    except FileNotFoundError as e:
        print(f"–û—à–∏–±–∫–∞: {e}. –£–±–µ–¥–∏—Ç–µ—Å—å, —á—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã–µ —Ñ–∞–π–ª—ã —Å—É—â–µ—Å—Ç–≤—É—é—Ç.")
        return False
    except AssertionError as e:
        print(f"–í–∞–ª–∏–¥–∞—Ü–∏—è –Ω–µ —É–¥–∞–ª–∞—Å—å: {e}")
        return False
    except Exception as e:
        # Deliberate catch-all: this function reports problems and returns
        # False instead of propagating (e.g. a missing column -> KeyError).
        print(f"–ü—Ä–æ–∏–∑–æ—à–ª–∞ –Ω–µ–ø—Ä–µ–¥–≤–∏–¥–µ–Ω–Ω–∞—è –æ—à–∏–±–∫–∞: {e}")
        return False

# %%
# =============================================================================
# RUN THE PIPELINE - PICK THE CELLS YOU NEED TO RUN
# =============================================================================

# %% [markdown]
# ## 🚀 Run the full pipeline
# Execute the cells below in order:

# %%
# 1. DATA PREPARATION
print("üéØ –®–ê–ì 1: –ü–û–î–ì–û–¢–û–í–ö–ê –î–ê–ù–ù–´–•")
featured_df = prepare_data()

# %%
# 2. MODEL TRAINING
print("üéØ –®–ê–ì 2: –û–ë–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ò")
model, rmse, mae = train()

# %%
# 3. PREDICTION
print("üéØ –®–ê–ì 3: –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï")
submission_df, predictions = predict()

# %%
# 4. VALIDATION
print("üéØ –®–ê–ì 4: –í–ê–õ–ò–î–ê–¶–ò–Ø")
validation_success = validate()

# %%
# 5. RESULTS
# This summary reads the names bound by the four cells above (featured_df,
# rmse, mae, predictions, validation_success), so it only works after they
# have all been executed.
print("üéØ –®–ê–ì 5: –†–ï–ó–£–õ–¨–¢–ê–¢–´")
print("\n" + "="*50)
print("üìä –ò–¢–û–ì–û–í–´–ï –†–ï–ó–£–õ–¨–¢–ê–¢–´")
print("="*50)
print(f"‚úÖ –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö: {len(featured_df):,} —Å—Ç—Ä–æ–∫")
print(f"‚úÖ –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏: RMSE = {rmse:.4f}, MAE = {mae:.4f}")
print(f"‚úÖ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è: {len(predictions):,} –ø—Ä–æ–≥–Ω–æ–∑–æ–≤")
print(f"‚úÖ –í–∞–ª–∏–¥–∞—Ü–∏—è: {'–ü–†–û–ô–î–ï–ù–ê' if validation_success else '–ù–ï –ü–†–û–ô–î–ï–ù–ê'}")
print(f"üìÅ –§–∞–π–ª submission: {SUBMISSION_DIR / SUBMISSION_FILENAME}")

# %% [markdown]
# ## 🔧 Running individual components
# If you only need to run specific parts:

# %%
# DATA PREPARATION ONLY
# featured_df = prepare_data()

# %%
# TRAINING ONLY
# model, rmse, mae = train()

# %%
# PREDICTION ONLY
# submission_df, predictions = predict()

# %%
# VALIDATION ONLY
# validate()

# %% [markdown]
# ## 📊 Data analysis (optional)

# %%
# INSPECT THE PREPARED DATA
# Guarded so the cell is a no-op when `featured_df` has not been created yet.
if 'featured_df' in locals():
    print("üìà –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö:")
    print(f"–†–∞–∑–º–µ—Ä: {featured_df.shape}")
    print(f"–ö–æ–ª–æ–Ω–∫–∏: {list(featured_df.columns)}")
    print("\n–ü–µ—Ä–≤—ã–µ 5 —Å—Ç—Ä–æ–∫:")
    # NOTE(review): `display` is not imported in this file; presumably the
    # IPython/Jupyter builtin, so this cell needs a notebook kernel — confirm.
    display(featured_df.head())

# %%
# PREDICTION STATISTICS
# Guarded so the cell is a no-op when `predictions` has not been created yet.
if 'predictions' in locals():
    print("üìä –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π:")
    print(f"Min: {predictions.min():.4f}")
    print(f"Max: {predictions.max():.4f}") 
    print(f"Mean: {predictions.mean():.4f}")
    print(f"Std: {predictions.std():.4f}")
    
    # Histogram of the predictions (matplotlib is imported only here)
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 6))
    plt.hist(predictions, bins=50, alpha=0.7, color='skyblue')
    plt.title('–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥–æ–≤')
    plt.xlabel('–†–µ–π—Ç–∏–Ω–≥')
    plt.ylabel('–ß–∞—Å—Ç–æ—Ç–∞')
    plt.grid(True, alpha=0.3)
    plt.show()

In [5]:
import pandas as pd
import numpy as np

def process_ratings(base_file, update_file, output_file):
    """
    Blend the ratings of a base submission with a second file of predictions.

    Both CSV files must contain the columns ``user_id``, ``book_id`` and
    ``rating_predict``. Rows are matched on ``(user_id, book_id)``; when the
    update file repeats a pair, its last occurrence is used. For every base
    row that has a match:

    * if the base ``rating_predict`` lies in ``[0, 2]`` (inclusive), it is
      replaced by the mean of the base value and the update value;
    * otherwise, including when the base value is missing, it is replaced by
      the update value.

    Base rows without a match keep their value. Afterwards the ratings are
    rounded to the nearest integer (ties to even), clipped to ``[0, 10]``,
    every 9 is turned into 10, and the frame is written to ``output_file``.

    Progress and summary statistics are printed. A failure to read either
    input, a missing required column, or a failure to write the output is
    reported with a printed message and the function returns early.

    Args:
        base_file: path to the first CSV file (the base ratings).
        update_file: path to the second CSV file (the predictions).
        output_file: path the result is written to.

    Returns:
        None
    """
    key_cols = ['user_id', 'book_id']
    rating_col = 'rating_predict'

    # 1. Load the first CSV file.
    print("–ó–∞–≥—Ä—É–∑–∫–∞ –ø–µ—Ä–≤–æ–≥–æ —Ñ–∞–π–ª–∞...")
    try:
        df_base = pd.read_csv(base_file)
    except Exception as e:
        # Best-effort by design: report and stop instead of raising.
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∑–∞–≥—Ä—É–∑–∫–µ –ø–µ—Ä–≤–æ–≥–æ —Ñ–∞–π–ª–∞: {e}")
        return
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ —Å—Ç—Ä–æ–∫ –∏–∑ –ø–µ—Ä–≤–æ–≥–æ —Ñ–∞–π–ª–∞: {len(df_base)}")

    # 2. Load the second CSV file.
    print("–ó–∞–≥—Ä—É–∑–∫–∞ –≤—Ç–æ—Ä–æ–≥–æ —Ñ–∞–π–ª–∞...")
    try:
        df_update = pd.read_csv(update_file)
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∑–∞–≥—Ä—É–∑–∫–µ –≤—Ç–æ—Ä–æ–≥–æ —Ñ–∞–π–ª–∞: {e}")
        return
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ —Å—Ç—Ä–æ–∫ –∏–∑ –≤—Ç–æ—Ä–æ–≥–æ —Ñ–∞–π–ª–∞: {len(df_update)}")

    # Make sure the required columns are present in both files.
    required_base_cols = ['user_id', 'book_id', 'rating_predict']
    required_update_cols = ['user_id', 'book_id', 'rating_predict']

    if not all(col in df_base.columns for col in required_base_cols):
        print(f"–ü–µ—Ä–≤—ã–π —Ñ–∞–π–ª –¥–æ–ª–∂–µ–Ω —Å–æ–¥–µ—Ä–∂–∞—Ç—å —Å—Ç–æ–ª–±—Ü—ã: {required_base_cols}")
        return

    if not all(col in df_update.columns for col in required_update_cols):
        print(f"–í—Ç–æ—Ä–æ–π —Ñ–∞–π–ª –¥–æ–ª–∂–µ–Ω —Å–æ–¥–µ—Ä–∂–∞—Ç—å —Å—Ç–æ–ª–±—Ü—ã: {required_update_cols}")
        return

    # Build the lookup of predictions: one row per (user_id, book_id) pair.
    # Rows with a missing key can never be matched, and for repeated pairs
    # the last row wins.
    print("–°–æ–∑–¥–∞–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π...")
    updates = (
        df_update[key_cols + [rating_col]]
        .dropna(subset=key_cols)
        .drop_duplicates(subset=key_cols, keep='last')
        .rename(columns={rating_col: '_update_rating'})
    )

    # 3-4. Apply the update rules to all rows at once.
    print("–û–±—Ä–∞–±–æ—Ç–∫–∞ —Ä–µ–π—Ç–∏–Ω–≥–æ–≤...")

    # A left merge keeps the base row order and, because the update keys are
    # unique, the base row count. The indicator tells "no match" apart from
    # "matched, but the update value itself is missing".
    merged = df_base[key_cols].merge(
        updates, on=key_cols, how='left', indicator=True
    )
    has_update = (merged['_merge'] == 'both').to_numpy()
    update_rating = merged['_update_rating'].to_numpy(dtype=float)

    base_rating = df_base[rating_col].to_numpy(dtype=float)
    # between() is inclusive on both ends and False for missing values.
    is_low = df_base[rating_col].between(0, 2).to_numpy()

    # 3. Base rating in [0, 2]: average it with the update value.
    averaged = has_update & is_low
    # 4. Any other base rating: take the update value as is. A missing base
    #    rating is filled from the update as well.
    replaced = has_update & ~is_low

    new_rating = np.where(averaged, (base_rating + update_rating) / 2, base_rating)
    new_rating = np.where(replaced, update_rating, new_rating)
    df_base[rating_col] = new_rating

    processed_count = int(averaged.sum())
    updated_count = int(replaced.sum())

    print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ –∑–Ω–∞—á–µ–Ω–∏–π 0-2: {processed_count}")
    print(f"–û–±–Ω–æ–≤–ª–µ–Ω–æ –¥—Ä—É–≥–∏—Ö –∑–Ω–∞—á–µ–Ω–∏–π: {updated_count}")

    # 5. Round the ratings (ties to even). When nothing is missing the column
    #    is stored as integers, so the CSV holds "8" rather than "8.0".
    print("–û–∫—Ä—É–≥–ª–µ–Ω–∏–µ —Ä–µ–π—Ç–∏–Ω–≥–æ–≤...")
    rounded = df_base[rating_col].round()
    if rounded.notna().all():
        rounded = rounded.astype('int64')

    # Keep the ratings inside the allowed range [0, 10].
    print("–ü—Ä–æ–≤–µ—Ä–∫–∞ –¥–∏–∞–ø–∞–∑–æ–Ω–∞ —Ä–µ–π—Ç–∏–Ω–≥–æ–≤...")
    rounded = rounded.clip(lower=0, upper=10)

    # NOTE(review): every 9 is promoted to 10, so a 9 can never appear in the
    # output. Kept as in the original; presumably a deliberate
    # competition-specific adjustment — confirm before reusing elsewhere.
    df_base[rating_col] = rounded.replace(9, 10)

    # 6. Write the result to a new CSV file.
    print(f"–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞ –≤ {output_file}...")
    try:
        df_base.to_csv(output_file, index=False)
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–∏ —Ñ–∞–π–ª–∞: {e}")
        return
    print(f"–§–∞–π–ª —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω. –í—Å–µ–≥–æ —Å—Ç—Ä–æ–∫: {len(df_base)}")

    # Summary statistics. Kept outside the try block so that a problem here
    # is not misreported as a failure to save the file.
    print("\n–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ —Ä–µ–π—Ç–∏–Ω–≥–∞–º:")
    print(f"–ú–∏–Ω–∏–º–∞–ª—å–Ω—ã–π —Ä–µ–π—Ç–∏–Ω–≥: {df_base['rating_predict'].min()}")
    print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω—ã–π —Ä–µ–π—Ç–∏–Ω–≥: {df_base['rating_predict'].max()}")
    print(f"–°—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥: {df_base['rating_predict'].mean():.2f}")
    print(f"–ú–µ–¥–∏–∞–Ω–Ω—ã–π —Ä–µ–π—Ç–∏–Ω–≥: {df_base['rating_predict'].median()}")

    # Preview of the first 5 rows.
    print("\n–ü–µ—Ä–≤—ã–µ 5 —Å—Ç—Ä–æ–∫ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞:")
    print(df_base.head())

# Example usage
if __name__ == "__main__":
    # Set the paths to your files
    base_file = "/home/evstigneva/Zagr/sample_submission.csv"  # First CSV file
    update_file = "/home/evstigneva/Zagr/submission30.11.25_7.csv"  # Second CSV file, with the predictions
    output_file = "/home/evstigneva/nto252/submission2.12.25_1.csv"  # Output file
    
    # Run the processing
    process_ratings(base_file, update_file, output_file)

–ó–∞–≥—Ä—É–∑–∫–∞ –ø–µ—Ä–≤–æ–≥–æ —Ñ–∞–π–ª–∞...
–ó–∞–≥—Ä—É–∂–µ–Ω–æ —Å—Ç—Ä–æ–∫ –∏–∑ –ø–µ—Ä–≤–æ–≥–æ —Ñ–∞–π–ª–∞: 2894
–ó–∞–≥—Ä—É–∑–∫–∞ –≤—Ç–æ—Ä–æ–≥–æ —Ñ–∞–π–ª–∞...
–ó–∞–≥—Ä—É–∂–µ–Ω–æ —Å—Ç—Ä–æ–∫ –∏–∑ –≤—Ç–æ—Ä–æ–≥–æ —Ñ–∞–π–ª–∞: 2894
–°–æ–∑–¥–∞–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π...
–û–±—Ä–∞–±–æ—Ç–∫–∞ —Ä–µ–π—Ç–∏–Ω–≥–æ–≤...
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ –∑–Ω–∞—á–µ–Ω–∏–π 0-2: 0
–û–±–Ω–æ–≤–ª–µ–Ω–æ –¥—Ä—É–≥–∏—Ö –∑–Ω–∞—á–µ–Ω–∏–π: 2894
–û–∫—Ä—É–≥–ª–µ–Ω–∏–µ —Ä–µ–π—Ç–∏–Ω–≥–æ–≤...
–ü—Ä–æ–≤–µ—Ä–∫–∞ –¥–∏–∞–ø–∞–∑–æ–Ω–∞ —Ä–µ–π—Ç–∏–Ω–≥–æ–≤...
–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞ –≤ /home/evstigneva/nto252/submission2.12.25_1.csv...
–§–∞–π–ª —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω. –í—Å–µ–≥–æ —Å—Ç—Ä–æ–∫: 2894

–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ —Ä–µ–π—Ç–∏–Ω–≥–∞–º:
–ú–∏–Ω–∏–º–∞–ª—å–Ω—ã–π —Ä–µ–π—Ç–∏–Ω–≥: 2
–ú–∞–∫—Å–∏–º–∞–ª—å–Ω—ã–π —Ä–µ–π—Ç–∏–Ω–≥: 10
–°—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥: 7.82
–ú–µ–¥–∏–∞–Ω–Ω—ã–π —Ä–µ–π—Ç–∏–Ω–≥: 8.0

–ü–µ—Ä–≤—ã–µ 5 —Å—Ç—Ä–æ–∫ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞:
   user_id  book_id  rating_predict
0      281  24