In [None]:
pip install lightgbm



In [None]:
import os
import sys
import zipfile
import glob
import time
import gc
from pathlib import Path
from typing import Any, List, Dict, Tuple, Optional

import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed=42):
    """Seed the random number generators used by this script.

    Seeds the standard-library ``random`` module and NumPy's global generator,
    and exports ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the file's import block does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    # The variable is only written to os.environ here; the hash seed of the
    # already-running interpreter is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(42)

class Constants:
    """Column names, engineered-feature names and sentinel values."""

    # --- columns of the interaction tables ---
    COL_USER_ID = 'user_id'
    COL_BOOK_ID = 'book_id'
    COL_TARGET = 'rating'
    COL_HAS_READ = 'has_read'
    COL_TIMESTAMP = 'timestamp'
    COL_PREDICTION = 'rating_predict'

    # --- user / book metadata columns ---
    COL_GENDER = 'gender'
    COL_AGE = 'age'
    COL_AUTHOR_ID = 'author_id'
    COL_PUBLICATION_YEAR = 'publication_year'
    COL_LANGUAGE = 'language'
    COL_PUBLISHER = 'publisher'
    COL_AVG_RATING = 'avg_rating'
    COL_DESCRIPTION = 'description'

    # --- genre table column ---
    COL_GENRE_ID = 'genre_id'

    # --- names of engineered aggregate features ---
    F_USER_MEAN_RATING = 'user_mean_rating'
    F_USER_RATINGS_COUNT = 'user_ratings_count'
    F_BOOK_MEAN_RATING = 'book_mean_rating'
    F_BOOK_RATINGS_COUNT = 'book_ratings_count'
    F_AUTHOR_MEAN_RATING = 'author_mean_rating'
    F_BOOK_GENRES_COUNT = 'book_genres_count'

    # --- sentinels for missing values and the prediction range ---
    MISSING_CAT_VALUE = '-1'
    MISSING_NUM_VALUE = -1
    PREDICTION_MIN_VALUE = 0
    PREDICTION_MAX_VALUE = 10


class Config:
    """Run settings: split ratio, TF-IDF options and LightGBM parameters."""

    RANDOM_STATE = 42
    TARGET = Constants.COL_TARGET

    # Share of the timestamp-sorted rows that ends up in the training part.
    TEMPORAL_SPLIT_RATIO = 0.8

    EARLY_STOPPING_ROUNDS = 50
    MODEL_FILENAME = "lgb_model.txt"

    # Arguments forwarded to TfidfVectorizer in add_text_features.
    TFIDF_MAX_FEATURES = 300
    TFIDF_MIN_DF = 2
    TFIDF_MAX_DF = 0.95
    TFIDF_NGRAM_RANGE = (1, 2)

    # Columns cast to pandas 'category' and declared categorical to LightGBM.
    CAT_FEATURES = [
        Constants.COL_USER_ID, Constants.COL_BOOK_ID,
        Constants.COL_GENDER, Constants.COL_AGE,
        Constants.COL_AUTHOR_ID, Constants.COL_PUBLICATION_YEAR,
        Constants.COL_LANGUAGE, Constants.COL_PUBLISHER,
    ]

    # Parameter dictionary handed to lgb.train; key order is kept as written.
    LGB_PARAMS = dict(
        objective="rmse",
        metric="rmse",
        n_estimators=2000,
        learning_rate=0.05,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l1=0.1,
        lambda_l2=0.1,
        num_leaves=31,
        min_child_samples=20,
        verbose=-1,
        n_jobs=-1,
        seed=RANDOM_STATE,
        boosting_type="gbdt",
    )


def find_file(filename):
    """Return the path of the first file named *filename* under the current directory.

    The tree is walked with ``os.walk(".")``; the first directory whose file
    list contains *filename* wins.

    Raises:
        FileNotFoundError: if no directory in the tree contains the file.
    """
    candidates = (
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, names in os.walk(".")
        if filename in names
    )
    # The generator is lazy, so the walk stops at the first hit.
    located = next(candidates, None)
    if located is None:
        raise FileNotFoundError(f"Файл {filename} не найден")
    return located


def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast numeric columns of *df* in place to smaller dtypes.

    Signed-integer columns are narrowed to the smallest of int8/int16/int32
    that holds their min and max (bounds inclusive). Float columns wider than
    32 bits are narrowed to float32 when their range fits. Every other column
    (bool, unsigned int, object, category, datetime, pandas extension dtypes)
    is left untouched, so booleans and unsigned integers are no longer turned
    into float32.

    Args:
        df: Frame to optimise; it is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    signed_targets = (np.int8, np.int16, np.int32)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed ints ('i') and floats ('f') are candidates.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "i":
            for target in signed_targets:
                if np.dtype(target).itemsize >= col_type.itemsize:
                    break  # never widen a column
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type.itemsize > 4:
            # NaN min/max (all-NaN column) fails both comparisons, so such a
            # column keeps its dtype.
            if float32_limits.min <= c_min and c_max <= float32_limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Оптимизация памяти: {start_mem:.1f}MB -> {end_mem:.1f}MB")
    return df


def temporal_split_by_date(df: pd.DataFrame, split_date: pd.Timestamp) -> Tuple[pd.Series, pd.Series]:
    """Build boolean train/validation masks around *split_date*.

    Returns:
        ``(train_mask, val_mask)``: rows with timestamp ``<= split_date`` and
        rows with timestamp ``> split_date``. Both comparisons are evaluated
        independently on the timestamp column.
    """
    stamps = df[Constants.COL_TIMESTAMP]
    return stamps <= split_date, stamps > split_date


def get_split_date_from_ratio(df: pd.DataFrame, ratio: float) -> pd.Timestamp:
    """Return the timestamp found at position ``ratio`` of the sorted timestamps.

    Missing timestamps are ignored so that a NaT can never be selected as the
    split date, and the position is clamped to the valid range so ``ratio=1.0``
    yields the latest timestamp instead of an IndexError.

    Args:
        df: Frame containing the timestamp column.
        ratio: Fraction of the (non-missing) rows that should precede the split.

    Returns:
        The timestamp located at ``int(n * ratio)`` in ascending order.

    Raises:
        ValueError: if the frame has no non-missing timestamps.
    """
    stamps = df[Constants.COL_TIMESTAMP].dropna().sort_values()
    if stamps.empty:
        raise ValueError("В данных нет валидных временных меток для сплита")

    last_position = len(stamps) - 1
    threshold_index = min(max(int(len(stamps) * ratio), 0), last_position)
    return stamps.iloc[threshold_index]


def load_and_merge_data():
    """Load the CSV inputs and build the combined train+test frame.

    If the working directory contains ``*.zip`` files, the first one is
    extracted in place. Each CSV is then located with ``find_file``;
    ``book_genres.csv`` is optional, every other file is required.

    Returns:
        tuple: ``(combined, desc_df, genres_df, train_df, test_df)`` where
        ``combined`` is train (rows with ``has_read == 1``) stacked on test,
        tagged by a ``_source`` column and left-joined with users and books.

    Raises:
        FileNotFoundError: if a required CSV cannot be found.
    """
    print("Загрузка данных...")

    zip_files = glob.glob('*.zip')
    if zip_files:
        print(f"Распаковка архива: {zip_files[0]}")
        with zipfile.ZipFile(zip_files[0], 'r') as zip_ref:
            zip_ref.extractall('.')

    files_to_find = ['train.csv', 'test.csv', 'books.csv', 'users.csv',
                     'book_descriptions.csv', 'book_genres.csv']
    optional_files = {'book_genres.csv'}

    file_paths = {}
    for file in files_to_find:
        try:
            file_paths[file] = find_file(file)
            print(f"Найден: {file}")
        except FileNotFoundError:
            if file not in optional_files:
                raise  # bare raise keeps the original traceback
            print(f"Файл {file} не найден, продолжаем без него")
            file_paths[file] = None

    print("Чтение train.csv...")
    # The file is read as one column (sep=';') and then split on commas; the
    # first resulting row supplies the column names.
    # NOTE(review): presumably a workaround for the file's formatting — confirm.
    train = pd.read_csv(file_paths['train.csv'], sep=';', header=None)
    train_split = train[0].str.split(',', expand=True)
    headers = train_split.iloc[0]
    train_data = train_split.iloc[1:].copy()
    train_data.columns = headers

    for col in ['user_id', 'book_id', 'has_read', 'rating']:
        train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    train_data['timestamp'] = pd.to_datetime(train_data['timestamp'], errors='coerce')

    train_df = train_data[train_data['has_read'] == 1].copy()
    print(f"Train после фильтрации has_read=1: {len(train_df)} строк")

    test_df = pd.read_csv(file_paths['test.csv'], sep=',')
    books_df = pd.read_csv(file_paths['books.csv'], sep=',')
    users_df = pd.read_csv(file_paths['users.csv'], sep=',')
    desc_df = pd.read_csv(file_paths['book_descriptions.csv'], sep=',')

    if file_paths['book_genres.csv']:
        genres_df = pd.read_csv(file_paths['book_genres.csv'], sep=',')
    else:
        genres_df = pd.DataFrame(columns=['book_id', 'genre_id'])

    print(f"Размеры данных - Train: {train_df.shape}, Test: {test_df.shape}, "
          f"Books: {books_df.shape}, Users: {users_df.shape}")

    train_df['_source'] = 'train'
    test_df['_source'] = 'test'

    combined = pd.concat([train_df, test_df], ignore_index=True)

    # Both lookup tables are de-duplicated on their key so the left joins
    # cannot multiply rows; the test part must stay aligned with test_df.
    users_df = users_df.drop_duplicates(subset=['user_id'])
    combined = combined.merge(users_df, on='user_id', how='left')

    books_df = books_df.drop_duplicates(subset=['book_id'])
    combined = combined.merge(books_df, on='book_id', how='left')

    print(f"Объединенный датасет: {combined.shape}")

    return combined, desc_df, genres_df, train_df, test_df


def add_genre_features(df: pd.DataFrame, genres_df: pd.DataFrame) -> pd.DataFrame:
    """Attach genre-derived columns to *df*.

    Adds ``book_genres_count`` (rows per book in *genres_df*, 0 when absent)
    and, when *genres_df* has a ``genre_id`` column, binary ``genre_top_<k>``
    flags for the five most frequent genres. The flag columns (and, for an
    empty *genres_df*, the count column) are written onto the passed frame.

    Returns:
        The input frame itself when *genres_df* is empty, otherwise the frame
        produced by the left merge with the per-book counts.
    """
    print("Добавление жанров...")

    if genres_df.empty:
        print("Нет данных о жанрах")
        df['book_genres_count'] = 0
        return df

    per_book = (
        genres_df.groupby('book_id')
        .size()
        .reset_index(name='book_genres_count')
    )

    if 'genre_id' in genres_df.columns:
        frequent = genres_df['genre_id'].value_counts().head(10).index.tolist()
        for rank, genre in enumerate(frequent[:5], start=1):
            has_genre = genres_df['genre_id'] == genre
            books_in_genre = genres_df.loc[has_genre, 'book_id'].unique()
            df[f'genre_top_{rank}'] = df['book_id'].isin(books_in_genre).astype(int)

    merged = df.merge(per_book, on='book_id', how='left')
    merged['book_genres_count'] = merged['book_genres_count'].fillna(0)
    return merged


def add_text_features(df: pd.DataFrame, desc_df: pd.DataFrame, train_books: list):
    """Append TF-IDF features of book descriptions to *df*.

    The vectorizer is fitted only on descriptions of books listed in
    *train_books* and then applied to the description of every row's book
    (an empty string when the book has no description). *desc_df* is not
    modified.

    Args:
        df: Frame with a ``book_id`` column.
        desc_df: Frame with ``book_id`` and ``description`` columns.
        train_books: Book ids whose descriptions are used for fitting.

    Returns:
        *df* unchanged when no training descriptions exist, otherwise a new
        frame with ``tfidf_<i>`` columns appended.
    """
    print("Добавление TF-IDF...")

    # Work on a filled copy of the column instead of writing back into the
    # caller's desc_df.
    descriptions = desc_df['description'].fillna('')
    train_texts = descriptions[desc_df['book_id'].isin(train_books)]

    if len(train_texts) == 0:
        print("Нет описаний для обучения TF-IDF")
        return df

    tfidf = TfidfVectorizer(
        max_features=Config.TFIDF_MAX_FEATURES,
        min_df=Config.TFIDF_MIN_DF,
        max_df=Config.TFIDF_MAX_DF,
        ngram_range=Config.TFIDF_NGRAM_RANGE,
        stop_words='english'
    )

    tfidf.fit(train_texts)

    # book_id -> description; for repeated book ids the last row wins.
    desc_map = dict(zip(desc_df['book_id'], descriptions))
    df_descriptions = df['book_id'].map(desc_map).fillna('')

    tfidf_matrix = tfidf.transform(df_descriptions)

    tfidf_features = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
        index=df.index
    )

    df = pd.concat([df, tfidf_features], axis=1)
    print(f"Добавлено {tfidf_matrix.shape[1]} TF-IDF фич")

    return df


def add_aggregate_features(df: pd.DataFrame, train_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join rating statistics computed on *train_df* onto *df*.

    Adds per-user and per-book mean/count of ``rating`` and, when *train_df*
    has an ``author_id`` column and yields at least one author group, the
    per-author mean rating.

    Returns:
        A new frame; keys of *df* unseen in *train_df* get NaN statistics.
    """
    print("Добавление агрегатных фич...")

    user_stats = (
        train_df.groupby('user_id')['rating']
        .agg(user_mean_rating='mean', user_ratings_count='count')
        .reset_index()
    )
    book_stats = (
        train_df.groupby('book_id')['rating']
        .agg(book_mean_rating='mean', book_ratings_count='count')
        .reset_index()
    )

    enriched = df.merge(user_stats, on='user_id', how='left')
    enriched = enriched.merge(book_stats, on='book_id', how='left')

    if 'author_id' not in train_df.columns:
        return enriched

    author_stats = (
        train_df.groupby('author_id')['rating']
        .agg(author_mean_rating='mean')
        .reset_index()
    )
    if author_stats.empty:
        return enriched

    return enriched.merge(author_stats, on='author_id', how='left')


def add_interaction_features(df: pd.DataFrame, current_year: int = 2024):
    """Add derived features to *df* in place and return it.

    Each group is only produced when its source columns are present:

    * ``rating_diff`` and ``interaction_count`` from the user/book aggregates;
    * ``book_age``, ``is_old_book``, ``is_recent_book`` from
      ``publication_year`` (coerced to numeric; missing ages get the median);
    * ``age_group`` from ``age`` (coerced to numeric), using right-closed bins
      (0, 18], (18, 25], (25, 35], (35, 50], (50, 100]; values outside the
      bins become NaN;
    * ``log_<col>`` for the rating-count and genre-count columns.

    Args:
        df: Frame to extend; it is modified in place.
        current_year: Reference year for ``book_age``. Defaults to 2024, the
            value that used to be hard-coded.

    Returns:
        The same ``df`` object.
    """
    print("Добавление фич взаимодействия...")

    if 'user_mean_rating' in df.columns and 'book_mean_rating' in df.columns:
        df['rating_diff'] = df['book_mean_rating'] - df['user_mean_rating']

    if 'user_ratings_count' in df.columns and 'book_ratings_count' in df.columns:
        df['interaction_count'] = np.log1p(df['user_ratings_count']) * np.log1p(df['book_ratings_count'])

    if 'publication_year' in df.columns:
        df['publication_year'] = pd.to_numeric(df['publication_year'], errors='coerce')
        df['book_age'] = current_year - df['publication_year']
        df['book_age'] = df['book_age'].fillna(df['book_age'].median())
        df['is_old_book'] = (df['book_age'] > 20).astype(int)
        df['is_recent_book'] = (df['book_age'] <= 5).astype(int)

    if 'age' in df.columns:
        df['age'] = pd.to_numeric(df['age'], errors='coerce')
        df['age_group'] = pd.cut(
            df['age'],
            bins=[0, 18, 25, 35, 50, 100],
            labels=[1, 2, 3, 4, 5]
        ).astype(float)

    for col in ['user_ratings_count', 'book_ratings_count', 'book_genres_count']:
        if col in df.columns:
            df[f'log_{col}'] = np.log1p(df[col])

    return df


def handle_missing_values(df: pd.DataFrame, train_stats: dict):
    """Fill missing values of *df* in place and return it.

    Numeric columns take the value stored in *train_stats* when one exists,
    otherwise the column's own median. String-typed categorical columns get
    ``'unknown'``; other categorical columns get the numeric sentinel and are
    cast to int. ``tfidf_*`` and ``genre_*`` columns are filled with 0.
    """
    print("Заполнение пропусков...")

    numeric_names = (
        'user_mean_rating', 'user_ratings_count',
        'book_mean_rating', 'book_ratings_count',
        'author_mean_rating', 'avg_rating',
        'book_genres_count', 'publication_year', 'age',
    )
    for name in numeric_names:
        if name not in df.columns:
            continue
        # The median is only computed when no training statistic is available.
        replacement = train_stats[name] if name in train_stats else df[name].median()
        df[name] = df[name].fillna(replacement)

    for name in ('gender', 'language', 'publisher', 'author_id'):
        if name not in df.columns:
            continue
        if df[name].dtype == 'object':
            df[name] = df[name].fillna('unknown')
        else:
            df[name] = df[name].fillna(Constants.MISSING_NUM_VALUE).astype(int)

    # Sparse text features and genre flags/columns default to zero.
    zero_filled = [name for name in df.columns if name.startswith(('tfidf_', 'genre_'))]
    for name in zero_filled:
        df[name] = df[name].fillna(0)

    return df


def main_pipeline():
    """Train the LightGBM rating model, validate it and write the submission.

    Steps: load and merge the data, add genre and TF-IDF features, split the
    training rows by time, attach rating aggregates computed on the training
    part, derive interaction features, fill gaps, fit LightGBM with early
    stopping, report validation metrics, predict the test rows and save both
    the submission and the model.

    Returns:
        tuple: ``(score, submission)`` — the validation score
        ``1 - (rmse/10 + mae/10) / 2`` and the submission frame that was
        written to ``submission_final.csv``.
    """
    print("Запуск основного пайплайна...")

    combined, desc_df, genres_df, train_df, test_df = load_and_merge_data()
    train_books = train_df['book_id'].unique()

    print("Подготовка train данных...")
    train_processed = combined[combined['_source'] == 'train'].copy()

    train_processed = add_genre_features(train_processed, genres_df)
    train_processed = add_text_features(train_processed, desc_df, train_books)

    train_processed = reduce_mem_usage(train_processed)

    print(f"Временной сплит (ratio={Config.TEMPORAL_SPLIT_RATIO})...")

    if Constants.COL_TIMESTAMP not in train_processed.columns:
        print("Создание временных меток...")
        train_processed[Constants.COL_TIMESTAMP] = pd.date_range(
            start='2020-01-01',
            periods=len(train_processed),
            freq='D'
        )
    elif not pd.api.types.is_datetime64_any_dtype(train_processed[Constants.COL_TIMESTAMP]):
        train_processed[Constants.COL_TIMESTAMP] = pd.to_datetime(
            train_processed[Constants.COL_TIMESTAMP], errors='coerce'
        )

    split_date = get_split_date_from_ratio(train_processed, Config.TEMPORAL_SPLIT_RATIO)
    train_mask, val_mask = temporal_split_by_date(train_processed, split_date)

    train_split = train_processed[train_mask].copy()
    val_split = train_processed[val_mask].copy()

    print(f"Train split: {len(train_split)} записей")
    print(f"Val split: {len(val_split)} записей")

    # NOTE(review): the training part is aggregated on itself, so every
    # training row's statistics include its own rating.
    train_split = add_aggregate_features(train_split, train_split)
    # Interaction features read the aggregate columns, so they have to be
    # built after the aggregates; built earlier, rating_diff,
    # interaction_count and the log-count features are silently skipped.
    train_split = add_interaction_features(train_split)

    train_stats = {}
    num_cols = ['user_mean_rating', 'book_mean_rating', 'author_mean_rating', 'avg_rating', 'age']
    for col in num_cols:
        if col in train_split.columns:
            train_stats[col] = train_split[col].mean()

    train_split = handle_missing_values(train_split, train_stats)

    val_split = add_aggregate_features(val_split, train_split)
    val_split = add_interaction_features(val_split)
    val_split = handle_missing_values(val_split, train_stats)

    exclude_cols = {'_source', 'timestamp', 'has_read', 'title', 'author_name', 'rating', 'description'}
    # Keep numeric columns only (decided by dtype, not by the first row) and
    # preserve column order so the feature list is identical on every run.
    features = [
        col for col in train_split.columns
        if col not in exclude_cols and pd.api.types.is_numeric_dtype(train_split[col])
    ]
    common_features = [col for col in features if col in val_split.columns]
    cat_features = [col for col in Config.CAT_FEATURES if col in common_features]

    print(f"Используется {len(common_features)} фич")

    # Explicit copies: the categorical casts below must not touch the splits.
    X_train = train_split[common_features].copy()
    y_train = train_split[Constants.COL_TARGET]
    X_val = val_split[common_features].copy()
    y_val = val_split[Constants.COL_TARGET]

    for col in cat_features:
        X_train[col] = X_train[col].astype('category')
        X_val[col] = X_val[col].astype('category')

    print("Обучение LightGBM...")

    train_data = lgb.Dataset(
        X_train, label=y_train,
        categorical_feature=cat_features,
        free_raw_data=False
    )

    val_data = lgb.Dataset(
        X_val, label=y_val,
        categorical_feature=cat_features,
        reference=train_data,
        free_raw_data=False
    )

    callbacks = [
        lgb.early_stopping(stopping_rounds=Config.EARLY_STOPPING_ROUNDS, verbose=True),
        lgb.log_evaluation(period=100),
    ]

    print("Начало обучения...")
    model = lgb.train(
        Config.LGB_PARAMS,
        train_data,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'val'],
        callbacks=callbacks,
    )

    val_preds = model.predict(X_val)
    val_preds = np.clip(val_preds, Constants.PREDICTION_MIN_VALUE, Constants.PREDICTION_MAX_VALUE)

    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    mae = mean_absolute_error(y_val, val_preds)
    score = 1 - ((rmse/10) + (mae/10)) / 2

    print(f"\nРезультаты на валидации:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"SCORE: {score:.5f}")

    print("Предсказание на тесте...")
    test_processed = combined[combined['_source'] == 'test'].copy()

    test_processed = add_genre_features(test_processed, genres_df)
    test_processed = add_text_features(test_processed, desc_df, train_books)
    # Test aggregates use every training row; train_processed holds only
    # rows whose _source is 'train'.
    test_processed = add_aggregate_features(test_processed, train_processed)
    test_processed = add_interaction_features(test_processed)
    test_processed = handle_missing_values(test_processed, train_stats)

    test_features = [f for f in common_features if f in test_processed.columns]
    X_test = test_processed[test_features].copy()

    for col in Config.CAT_FEATURES:
        if col in test_features:
            X_test[col] = X_test[col].astype('category')

    print("Создание предсказаний...")
    test_preds = model.predict(X_test)
    test_preds = np.clip(test_preds, Constants.PREDICTION_MIN_VALUE, Constants.PREDICTION_MAX_VALUE)

    # Heuristic: when the test median falls more than 0.5 below the training
    # target median, move predictions 60% of the way towards it.
    train_median = np.median(y_train)
    current_median = np.median(test_preds)

    if current_median < train_median - 0.5:
        shift = (train_median - current_median) * 0.6
        test_preds = test_preds + shift

    # Final predictions are restricted to [1, 9], narrower than the 0-10 clip.
    test_preds = np.clip(test_preds, 1, 9)

    print(f"Статистика предсказаний:")
    print(f"Медиана: {np.median(test_preds):.2f}")
    print(f"Среднее: {test_preds.mean():.2f}")

    submission = test_df[['user_id', 'book_id']].copy()
    submission['rating_predict'] = test_preds

    submission_path = 'submission_final.csv'
    submission.to_csv(submission_path, index=False)
    # The message now reports the file that was actually written.
    print(f"Сабмит сохранен: {submission_path}")

    model.save_model('lgb_model_final.txt')
    print("Модель сохранена: lgb_model_final.txt")

    return score, submission


# Script entry point: run the pipeline, sanity-check the submission, and fall
# back to a constant-prediction submission if the pipeline raises.
if __name__ == "__main__":
    start_time = time.time()

    try:
        print("Запуск пайплайна...")
        score, submission = main_pipeline()

        print("Финальная проверка...")

        if submission['rating_predict'].min() < 0 or submission['rating_predict'].max() > 10:
            print("Предупреждение: Предсказания выходят за пределы 0-10")
            submission['rating_predict'] = submission['rating_predict'].clip(0, 10)
            submission.to_csv('submission_final_clipped.csv', index=False)
            print("Сохранен clipped вариант: submission_final_clipped.csv")

        print(f"Медиана финальных предсказаний: {submission['rating_predict'].median():.2f}")

        # Both branches of the former `score > 0.61` check printed this line.
        print(f"Ожидаемая точность: ~{score:.4f}")

    except Exception as e:
        # Top-level boundary: report the failure, then try a baseline file.
        print(f"Ошибка: {e}")
        import traceback
        traceback.print_exc()

        print("Создание простого сабмита...")
        try:
            test_path = find_file('test.csv')
            test_df = pd.read_csv(test_path, sep=',')
            submission = test_df[['user_id', 'book_id']].copy()
            submission['rating_predict'] = 5.0
            submission.to_csv('submission_basic.csv', index=False)
            print("Создан простой сабмит: submission_basic.csv")
        except Exception as fallback_error:
            # `except Exception` lets KeyboardInterrupt/SystemExit propagate
            # and the cause of the fallback failure is reported.
            print(f"Не удалось создать простой сабмит: {fallback_error}")

    finally:
        elapsed = time.time() - start_time
        print(f"\nВремя выполнения: {elapsed:.1f} секунд")

Запуск пайплайна...
Запуск основного пайплайна...
Загрузка данных...
Найден: train.csv
Найден: test.csv
Найден: books.csv
Найден: users.csv
Найден: book_descriptions.csv
Найден: book_genres.csv
Чтение train.csv...
Train после фильтрации has_read=1: 156179 строк
Размеры данных - Train: (156179, 5), Test: (2894, 2), Books: (50490, 8), Users: (7277, 3)
Объединенный датасет: (159073, 15)
Подготовка train данных...
Добавление жанров...
Добавление TF-IDF...
Добавлено 300 TF-IDF фич
Добавление фич взаимодействия...
Оптимизация памяти: 388.4MB -> 191.2MB
Временной сплит (ratio=0.8)...
Train split: 124944 записей
Val split: 31235 записей
Добавление агрегатных фич...
Заполнение пропусков...
Добавление агрегатных фич...
Заполнение пропусков...
Используется 325 фич
Обучение LightGBM...
Начало обучения...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[9]	train's rmse: 2.50336	val's rmse: 2.88889

Результаты на валидации:
RMSE: 2.8889
MAE: 2.1177
SCO