# In [None]:
import numpy as np
import pandas as pd
import os
import sys
import gc
import time
import warnings
import pickle
import random
import lightgbm as lgb
# from sklearn.model_selection import train_test_split # Not strictly needed for this implementation
from math import ceil, floor
from datetime import timedelta # Импортируем timedelta
import psutil # Для мониторинга памяти, если нужно

# Silence every warning category for the whole run.
warnings.filterwarnings('ignore')

# --- 0. Configuration and directory setup ---
print("--- Stage 0: Configuration & Setup ---")
dir_ = '/kaggle/input/kazakhstan-ai-respa-take-home' # <--- CHANGE THIS TO YOUR ROOT DIRECTORY
# When running locally, set absolute paths or paths relative to where the script is launched
raw_data_dir = os.path.join(dir_) # Example path
processed_data_dir = os.path.join('/kaggle/working/', 'processed')
model_dir = os.path.join("/kaggle/working/", 'models')
submission_dir = os.path.join('/kaggle/working/', 'submissions')

# Create the output folders (no error if they already exist)
os.makedirs(processed_data_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
os.makedirs(submission_dir, exist_ok=True)
print(f"Raw data expected in: {os.path.abspath(raw_data_dir)}")
print(f"Processed data will be saved to: {os.path.abspath(processed_data_dir)}")
print(f"Models will be saved to: {os.path.abspath(model_dir)}")
print(f"Submissions will be saved to: {os.path.abspath(submission_dir)}")


TARGET = 'num_papers'
P_HORIZON = 8  # Forecast horizon: 8 weeks
SEED = 42
HIST_END_DATE = pd.to_datetime('2025-02-09') # End date of the historical data (NOTE(review): not referenced by the code visible in this file)

# Feature configuration
lag_weeks = [1, 2, 3, 4, 8, 12, 52] # Lags, in weeks
rolling_windows = [4, 8, 12, 26, 52] # Windows for rolling mean/std
lag_for_roll = 1 # Shift applied before computing the rolling windows
validation_weeks = P_HORIZON # Number of weeks used for validation before training the recursive model
recursive_history_weeks = 52 * 2 # Number of weeks of history included in the data for the recursive forecast

# LGBM parameters (starting values, REQUIRE TUNING!)
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'tweedie', # or 'poisson'/'regression_l2'
    'tweedie_variance_power': 1.1, # Tune on validation (1-2)
    'metric': 'rmse',
    'subsample': 0.7,
    'subsample_freq': 1,
    'learning_rate': 0.02, # Tune
    'num_leaves': 2**7 - 1, # Tune
    'min_data_in_leaf': 2**7 - 1, # Tune
    'feature_fraction': 0.7,
    'max_bin': 100, # Usually enough for features with few unique values
    'n_estimators': 3000, # Upper bound; the actual count is governed by early stopping
    'boost_from_average': False,
    'verbose': -1,
    'seed': SEED,
    'num_threads': -1 # Use all available cores
}
early_stopping_rounds = 50 # Patience for early stopping

# --- Helper functions ---
def seed_everything(seed=SEED):
    """Seed the global random generators used by this script.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # NOTE: if torch or tensorflow are ever added to this pipeline, seed them
    # here as well (torch.manual_seed / tf.random.set_seed).

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 that
    holds their observed min and max (bounds are inclusive, so a column whose
    maximum is exactly 127 still fits int8). Float columns are cast to float32
    when their finite range fits, otherwise to float64; float16 is never
    produced because of its precision loss.

    Columns whose dtype is not in the handled list (including int8, which is
    already minimal) and columns whose min/max is NaN (empty or all-NaN) are
    left untouched.

    Args:
        df: DataFrame to shrink; modified in place.
        verbose: When true, print the resulting memory usage and reduction.

    Returns:
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() skip NaN, so this only triggers for empty/all-NaN columns.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if str(col_type)[:3] == 'int':
            lo, hi = int(c_min), int(c_max)
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= lo and hi <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif float32_info.min <= c_min and c_max <= float32_info.max:
            df[col] = df[col].astype(np.float32)
        else:
            # Infinite or out-of-float32-range values keep full width.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


# Helper that computes rolling features (used when recalculating them recursively)
def make_lag_roll(df, target_col, group_col, shift_day, roll_wind):
    """Add lagged rolling mean/std columns of ``target_col``, computed per group.

    For every group of ``group_col`` the target is shifted by ``shift_day``
    rows and a rolling window of ``roll_wind`` rows is applied to the shifted
    values. Both the shift and the window stay inside the group, so the first
    rows of one group never see values from the previous group. (The earlier
    implementation rolled over the ungrouped shifted Series and therefore
    leaked across group boundaries.)

    Rows are assumed to already be ordered in time within each group; the
    function does not sort.

    Args:
        df: Input frame; the two new columns are written into it in place.
        target_col: Name of the column to aggregate.
        group_col: Column (or list of columns) identifying the series.
        shift_day: Number of rows to shift before rolling.
        roll_wind: Rolling window length in rows.

    Returns:
        A two-column DataFrame ``[rolling_mean_<shift>_<window>,
        rolling_std_<shift>_<window>]`` aligned with ``df``. Values are
        float32: float16 cannot represent integers above 2048 exactly and
        overflows to inf above 65504.
    """
    mean_col = f'rolling_mean_{shift_day}_{roll_wind}'
    std_col = f'rolling_std_{shift_day}_{roll_wind}'

    # min_periods=1 keeps the mean defined at the start of a recursion.
    df[mean_col] = df.groupby(group_col)[target_col].transform(
        lambda s: s.shift(shift_day).rolling(roll_wind, min_periods=1).mean()
    ).astype(np.float32)
    # A standard deviation needs at least two observations.
    df[std_col] = df.groupby(group_col)[target_col].transform(
        lambda s: s.shift(shift_day).rolling(roll_wind, min_periods=2).std()
    ).astype(np.float32)
    return df[[mean_col, std_col]]

# --- Stage 1: Preprocessing and feature engineering ---
def run_preprocessing():
    """Stage 1: load the raw history and build the (category, week) feature grid.

    Steps: read ``train.csv`` (and check that ``test-7.csv`` is readable),
    build an approximate calendar from the week numbers, append ``P_HORIZON``
    future weeks with a NaN target for every category, then add calendar,
    lag and lagged rolling mean/std features. The grid is pickled to
    ``processed_data_dir/grid_arxiv_features.pkl``.

    Side effects:
        Publishes ``FIRST_PRED_WEEK`` as a module-level global, because the
        training and prediction stages read it by that name.

    Returns:
        ``(grid_df, model_features, categorical_for_lgbm, categories)`` where
        ``model_features`` excludes identifiers, the target and ``category``,
        and ``categorical_for_lgbm`` names the calendar categoricals plus
        ``'category_encoded'`` (that column is created by the training stage).

    Raises:
        SystemExit: with a non-zero status if the input files cannot be read
            or ``train.csv`` lacks a required column.
    """
    # Without this declaration the assignment below stays function-local and
    # every later stage fails with NameError.
    global FIRST_PRED_WEEK

    print("\n--- Running Stage 1: Preprocessing & Feature Engineering ---")
    overall_start_time = time.time()

    # 1. Load data
    print("Loading data...")
    try:
        # CHANGE THE FILE NAMES TO YOURS!
        history_df = pd.read_csv(os.path.join(raw_data_dir, 'train.csv'))
        history_df.rename(columns={'week': 'absolute_week_id', 'value': 'num_papers'}, inplace=True, errors='ignore')
        # Explicit check instead of assert: asserts vanish under -O and give
        # an empty error message.
        missing_cols = [c for c in ('absolute_week_id', 'category', TARGET) if c not in history_df.columns]
        if missing_cols:
            raise ValueError(f"train.csv is missing required columns: {missing_cols}")

        # The template is not needed for feature building; read only its
        # header so a missing/unreadable file fails here rather than in Stage 5.
        pd.read_csv(os.path.join(raw_data_dir, 'test-7.csv'), nrows=0)
    except FileNotFoundError as e:
        print(f"ERROR: Required input file not found: {e}. Exiting.")
        sys.exit(1)
    except Exception as e:
        print(f"ERROR loading data: {e}. Exiting.")
        sys.exit(1)

    history_df = reduce_mem_usage(history_df)

    # Plain ints: the column may have been downcast to a narrow numpy dtype.
    LAST_HIST_WEEK = int(history_df['absolute_week_id'].max())
    FIRST_PRED_WEEK = LAST_HIST_WEEK + 1
    TOTAL_WEEKS_TO_GENERATE = LAST_HIST_WEEK + P_HORIZON
    print(f"Historical data ends at week {LAST_HIST_WEEK}. Prediction starts week {FIRST_PRED_WEEK}.")

    # 2. Calendar
    print("Generating calendar...")
    min_hist_week = int(history_df['absolute_week_id'].min())
    all_weeks = range(min_hist_week, TOTAL_WEEKS_TO_GENERATE + 1)
    calendar_df = pd.DataFrame({'absolute_week_id': all_weeks})

    # No real dates are available, so every calendar field below is an
    # approximation derived from the running week number.
    calendar_df['year'] = (calendar_df['absolute_week_id'] / 52.1775).astype(int) + 2021
    calendar_df['week_of_year'] = ((calendar_df['absolute_week_id'] - 1) % 52).astype(int) + 1
    calendar_df['month'] = ((calendar_df['week_of_year'] - 1) // (52 / 12)).astype(int) + 1
    calendar_df['quarter'] = ((calendar_df['month'] - 1) // 3).astype(int) + 1

    # Cyclical encodings
    calendar_df['week_sin'] = np.sin(2 * np.pi * calendar_df['week_of_year'] / 52.1775).astype(np.float32)
    calendar_df['week_cos'] = np.cos(2 * np.pi * calendar_df['week_of_year'] / 52.1775).astype(np.float32)
    calendar_df['month_sin'] = np.sin(2 * np.pi * calendar_df['month'] / 12).astype(np.float32)
    calendar_df['month_cos'] = np.cos(2 * np.pi * calendar_df['month'] / 12).astype(np.float32)

    calendar_df = reduce_mem_usage(calendar_df, verbose=False)

    # 3. Base grid: history rows plus one NaN-target row per category and
    #    future week. ``week_id`` is the 1-based forecast step and stays NaN
    #    for history rows.
    print("Creating base grid...")
    categories = history_df['category'].unique()
    future_df = pd.DataFrame(
        [
            [cat, week_pred_rel, FIRST_PRED_WEEK + week_pred_rel - 1, np.nan]
            for week_pred_rel in range(1, P_HORIZON + 1)
            for cat in categories
        ],
        columns=['category', 'week_id', 'absolute_week_id', TARGET],
    )
    future_df = reduce_mem_usage(future_df, verbose=False)

    grid_df = pd.concat(
        [history_df[['category', 'absolute_week_id', TARGET]], future_df],
        ignore_index=True,
    )
    del future_df; gc.collect()

    # 4. Calendar features
    print("Adding calendar features...")
    grid_df = grid_df.merge(calendar_df, on='absolute_week_id', how='left')
    grid_df = reduce_mem_usage(grid_df, verbose=False)

    # 5. Lag features (row-based shift inside each category; assumes one row
    #    per category and week -- TODO confirm against the raw data).
    print("Creating lag features...")
    grid_df = grid_df.sort_values(by=['category', 'absolute_week_id'])
    for lag in lag_weeks:
        # float32, not float16: float16 rounds counts above 2048 and
        # reduce_mem_usage widens the column to float32 afterwards anyway.
        grid_df[f'{TARGET}_lag_{lag}'] = grid_df.groupby('category')[TARGET].shift(lag).astype(np.float32)
    grid_df = reduce_mem_usage(grid_df, verbose=False)

    # 6. Rolling features on the lagged target. The shifted series is grouped
    #    again before rolling so a window never spans two categories.
    print("Creating rolling features...")
    shifted_target = grid_df.groupby('category')[TARGET].shift(lag_for_roll)
    shifted_by_category = shifted_target.groupby(grid_df['category'])
    for window in rolling_windows:
        print(f" Rolling window: {window}")
        mean_min_periods = max(1, ceil(window / 4))
        std_min_periods = max(2, ceil(window / 4))
        grid_df[f'rolling_mean_{lag_for_roll}_{window}'] = shifted_by_category.transform(
            lambda s, w=window, m=mean_min_periods: s.rolling(w, min_periods=m).mean()
        ).astype(np.float32)
        grid_df[f'rolling_std_{lag_for_roll}_{window}'] = shifted_by_category.transform(
            lambda s, w=window, m=std_min_periods: s.rolling(w, min_periods=m).std()
        ).astype(np.float32)
    grid_df = reduce_mem_usage(grid_df)

    # 7. Target encoding is intentionally not computed here: it is derived in
    #    the training stage from the training slice only.
    print("Creating mean encoding features...")

    # 8. Finalise and save
    print("Finalizing features and saving...")
    grid_df = grid_df.sort_values(by=['category', 'absolute_week_id']).reset_index(drop=True)

    # Everything except identifiers, the target and the raw category string
    # is a model feature; 'category' is label-encoded during training.
    cols_to_drop = ['absolute_week_id', 'week_id', TARGET]
    model_features = [col for col in grid_df.columns if col not in cols_to_drop and col != 'category']
    categorical_features = ['year', 'week_of_year', 'month', 'quarter']
    categorical_for_lgbm = categorical_features + ['category_encoded']

    print("Feature List:", model_features)
    print("Categorical Features for Model:", categorical_for_lgbm)

    grid_path = os.path.join(processed_data_dir, 'grid_arxiv_features.pkl')
    grid_df.to_pickle(grid_path)
    print(f"Feature grid saved to {grid_path}. Shape: {grid_df.shape}")
    print(f"Memory usage: {grid_df.memory_usage().sum() / 1024**2:.2f} Mb")
    print(f"--- Stage 1: Completed in {(time.time() - overall_start_time)/60:.2f} min ---")
    gc.collect()
    return grid_df, model_features, categorical_for_lgbm, categories

# --- Stage 2: Model training ---
def _fit_lgb_with_early_stopping(train_df, valid_df, features, categorical, model_path):
    """Fit one LightGBM booster with early stopping, save it and return it."""
    train_set = lgb.Dataset(train_df[features],
                            label=train_df[TARGET],
                            categorical_feature=categorical)
    valid_set = lgb.Dataset(valid_df[features],
                            label=valid_df[TARGET],
                            categorical_feature=categorical,
                            reference=train_set)
    booster = lgb.train(lgb_params,
                        train_set,
                        valid_sets=[valid_set],
                        callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True)],
                        num_boost_round=lgb_params['n_estimators'])
    booster.save_model(model_path)
    return booster


def run_training(grid_df, model_features, categorical_for_lgbm, categories):
    """Stage 2: train one non-recursive and one recursive model per category.

    The last ``validation_weeks`` historical weeks form the validation window.
    The non-recursive model is trained on weeks before that window; the
    recursive model is trained on all historical weeks. Both use the same
    validation window for early stopping. Boosters are saved under
    ``model_dir`` and the base frame for recursive forecasting is pickled to
    ``processed_data_dir/test_recursive_arxiv.pkl``.

    Args:
        grid_df: Feature grid from ``run_preprocessing`` (history plus
            ``P_HORIZON`` future weeks). A ``category_encoded`` column is
            added to it in place.
        model_features: Feature column names present in ``grid_df``.
        categorical_for_lgbm: Candidate categorical feature names.
        categories: Iterable of category names to train for.

    Returns:
        ``(trained_models_recursive, trained_models_nonrecursive,
        features_to_use, categorical_to_use_lgbm)``; the two dicts map
        category name to the trained booster.
    """
    print("\n--- Running Stage 2: Model Training ---")
    overall_start_time = time.time()
    seed_everything(SEED)

    # The grid ends exactly P_HORIZON weeks after the last historical week, so
    # the first forecast week can be derived from the grid itself instead of
    # relying on a name that only exists inside run_preprocessing.
    first_pred_week = int(grid_df['absolute_week_id'].max()) - P_HORIZON + 1
    valid_start_week = first_pred_week - validation_weeks
    print(f"Training up to week {valid_start_week-1}")
    print(f"Validating on weeks {valid_start_week} to {first_pred_week-1}")

    # Save the frame used for recursive forecasting (recent history + future NaN rows).
    print("Preparing data for recursive prediction...")
    start_hist_recursive = first_pred_week - recursive_history_weeks
    weeks_for_recursive_test = range(start_hist_recursive, first_pred_week + P_HORIZON)
    recursive_base_data = grid_df[grid_df['absolute_week_id'].isin(weeks_for_recursive_test)].copy()
    recursive_base_data.loc[recursive_base_data['absolute_week_id'] >= first_pred_week, TARGET] = np.nan
    recursive_test_path = os.path.join(processed_data_dir, 'test_recursive_arxiv.pkl')
    recursive_base_data.to_pickle(recursive_test_path)
    print(f"Data for recursive prediction saved. Shape: {recursive_base_data.shape}")
    del recursive_base_data; gc.collect()

    trained_models_recursive = {}
    trained_models_nonrecursive = {}

    # Label-encode 'category' once for the whole grid.
    grid_df['category_encoded'] = grid_df['category'].astype('category').cat.codes.astype('int16')
    features_to_use = model_features + ['category_encoded']
    categorical_to_use_lgbm = [f for f in categorical_for_lgbm if f in features_to_use]

    # Rows with NaN in the longest lag / widest rolling window are dropped
    # from the TRAINING part only; validation and future rows are kept.
    lag_check_col = f'{TARGET}_lag_{max(lag_weeks)}'
    roll_check_col = f'rolling_mean_{lag_for_roll}_{max(rolling_windows)}'
    check_cols = [col for col in (lag_check_col, roll_check_col) if col in grid_df.columns]

    for category_name in categories:
        # Select by name: cat.codes follow the sorted category order, which
        # need not match the position of the name in ``categories``.
        category_df = grid_df[grid_df['category'] == category_name].copy()
        if category_df.empty:
            print(f"-- Training for Category: {category_name} (Code: n/a) --")
            print("   Skipping non-recursive: Empty train or valid set after NaN removal.")
            print("   Skipping recursive: Empty train or valid set after NaN removal.")
            continue
        category_code = category_df['category_encoded'].iloc[0]
        print(f"-- Training for Category: {category_name} (Code: {category_code}) --")

        week = category_df['absolute_week_id']

        # Per-category target statistics from data strictly before the
        # validation window, to avoid leaking validation targets.
        # NOTE(review): these two columns are not part of features_to_use, so
        # the models currently do not see them.
        encoding_target = category_df.loc[week < valid_start_week, TARGET]
        if len(encoding_target) > 0:
            mean_enc = encoding_target.mean()
            std_enc = encoding_target.std()
            # std is NaN when only one value exists; mean/std are scalars, so
            # Series methods such as fillna cannot be used on them.
            std_enc = 0.0 if pd.isna(std_enc) else std_enc
        else:
            mean_enc = 0.0
            std_enc = 0.0
        category_df['enc_category_mean'] = np.float32(mean_enc)
        category_df['enc_category_std'] = np.float32(std_enc)

        train_mask_nr = week < valid_start_week
        valid_mask = (week >= valid_start_week) & (week < first_pred_week)
        train_mask_r = week < first_pred_week

        valid_df = category_df[valid_mask]
        training_plans = (
            # (label used in messages, file tag, result dict, training rows)
            ('non-recursive', 'nonrecursive', trained_models_nonrecursive, category_df[train_mask_nr]),
            # NOTE(review): the recursive model's training rows include the
            # validation window, so its early stopping is evaluated in-sample.
            ('recursive', 'recursive', trained_models_recursive, category_df[train_mask_r]),
        )
        for label, file_tag, registry, train_rows in training_plans:
            train_df_clean = train_rows.dropna(subset=check_cols)
            if train_df_clean.empty or valid_df.empty:
                print(f"   Skipping {label}: Empty train or valid set after NaN removal.")
                continue
            print(f"   Training {label} model... Train shape: {train_df_clean.shape}, Valid shape: {valid_df.shape}")
            model_path = os.path.join(model_dir, f'lgb_model_{file_tag}_{category_name}.bin')
            booster = _fit_lgb_with_early_stopping(
                train_df_clean, valid_df, features_to_use, categorical_to_use_lgbm, model_path
            )
            registry[category_name] = booster
            print(f"   {label.capitalize()} model saved to {model_path}. Best Iteration: {booster.best_iteration}")

        del category_df, valid_df; gc.collect()

    print(f"\n--- Stage 2: Completed in {(time.time() - overall_start_time)/60:.2f} min ---")
    return trained_models_recursive, trained_models_nonrecursive, features_to_use, categorical_to_use_lgbm


# --- Stage 3: Forecast generation ---
def run_prediction(trained_models_recursive, trained_models_nonrecursive, features_to_use, categories):
    """Stage 3: produce non-recursive and recursive forecasts per category.

    Non-recursive: each category's model predicts all future rows of the
    saved feature grid in one pass, using the features computed in Stage 1.

    Recursive: weeks are predicted one at a time. After each step the
    predictions are written back into the target column, and from step 2
    onward the lag and rolling features of the current week are recomputed
    from that updated target.

    Args:
        trained_models_recursive: Mapping category name -> booster.
        trained_models_nonrecursive: Mapping category name -> booster.
        features_to_use: Feature column names expected by the boosters.
        categories: Iterable of category names to predict.

    Returns:
        ``(recursive_preds_final, nonrecursive_preds_final)``: wide frames
        indexed by category with one ``F<step>`` column per forecast week.
        Either element is ``None`` when no prediction could be produced.
    """
    print("\n--- Starting Stage 3: Prediction Generation ---")
    overall_start_time = time.time()

    grid_df = pd.read_pickle(os.path.join(processed_data_dir, 'grid_arxiv_features.pkl'))
    grid_df['category_encoded'] = grid_df['category'].astype('category').cat.codes.astype('int16')
    grid_df = reduce_mem_usage(grid_df, verbose=False)

    # The grid ends exactly P_HORIZON weeks after the last historical week.
    first_pred_week = int(grid_df['absolute_week_id'].max()) - P_HORIZON + 1

    # 1. Non-recursive forecast
    print(" Generating non-recursive predictions...")
    nonrecursive_preds_list = []
    future_rows = grid_df[grid_df['absolute_week_id'] >= first_pred_week]

    for category_name in categories:
        print(f"   Predicting non-recursive for: {category_name}")
        model_nr = trained_models_nonrecursive.get(category_name)
        if model_nr is None:
            print(f"   Warning: No non-recursive model found for category {category_name}")
            continue
        category_test_features = future_rows[future_rows['category'] == category_name]
        if category_test_features.empty:
            print(f"   Warning: No test features found for non-recursive category {category_name}")
            continue
        predictions = model_nr.predict(category_test_features[features_to_use], num_iteration=model_nr.best_iteration)
        preds_df = category_test_features[['category', 'absolute_week_id', 'week_id']].copy()
        preds_df[TARGET] = predictions.clip(0)
        nonrecursive_preds_list.append(preds_df)

    if nonrecursive_preds_list:
        nonrecursive_preds_df = pd.concat(nonrecursive_preds_list).reset_index(drop=True)
        nonrecursive_preds_final = nonrecursive_preds_df.pivot(index='category', columns='week_id', values=TARGET)
        # Name columns from the actual step values (week_id is float in the
        # grid) instead of assuming every step 1..P_HORIZON is present.
        nonrecursive_preds_final.columns = [f'F{int(step)}' for step in nonrecursive_preds_final.columns]
        print(" Non-recursive predictions generated.")
    else:
        print(" ERROR: No non-recursive predictions were generated.")
        nonrecursive_preds_final = None

    # Only names that are always bound are deleted here.
    del nonrecursive_preds_list, future_rows, grid_df; gc.collect()

    # 2. Recursive forecast
    print("\n Generating recursive predictions...")
    recursive_test_path = os.path.join(processed_data_dir, 'test_recursive_arxiv.pkl')
    base_test = pd.read_pickle(recursive_test_path)
    base_test['category_encoded'] = base_test['category'].astype('category').cat.codes.astype('int16')
    base_test = reduce_mem_usage(base_test, verbose=False)
    # Row-based shift/rolling needs time order inside each category.
    base_test = base_test.sort_values(['category', 'absolute_week_id'])
    # Predictions are float64; widen the target so writing them back neither
    # truncates them nor trips pandas' incompatible-dtype checks.
    base_test[TARGET] = base_test[TARGET].astype(np.float64)

    all_recursive_preds = []
    rolling_features_to_recalc = {col for col in features_to_use if 'rolling' in col}
    lags_to_recalc = {col for col in features_to_use if f'{TARGET}_lag_' in col}

    for predict_step in range(1, P_HORIZON + 1):
        current_pred_week_abs = first_pred_week + predict_step - 1
        print(f"  Predicting recursive | Step: {predict_step}/{P_HORIZON} (Week {current_pred_week_abs})")
        step_start_time = time.time()

        # Rows of the week being predicted; features are prepared before predicting.
        current_features_df = base_test[base_test['absolute_week_id'] == current_pred_week_abs].copy()

        # Step 1 uses the precomputed features; later steps depend on earlier
        # predictions and must be recomputed from the updated target.
        if predict_step > 1:
            print(f"   Recalculating features for step {predict_step}...")
            step_index = current_features_df.index

            for lag in lag_weeks:
                lag_col_name = f'{TARGET}_lag_{lag}'
                if lag_col_name in lags_to_recalc:
                    shifted_values = base_test.groupby('category')[TARGET].shift(lag)
                    current_features_df[lag_col_name] = shifted_values.loc[step_index]

            # Group the shifted target again so rolling windows stay inside a category.
            shifted_target = base_test.groupby('category')[TARGET].shift(lag_for_roll)
            shifted_by_category = shifted_target.groupby(base_test['category'])
            for window in rolling_windows:
                mean_col_name = f'rolling_mean_{lag_for_roll}_{window}'
                std_col_name = f'rolling_std_{lag_for_roll}_{window}'

                if mean_col_name in rolling_features_to_recalc:
                    rolled_mean = shifted_by_category.transform(
                        lambda s, w=window: s.rolling(w, min_periods=1).mean()
                    )
                    current_features_df[mean_col_name] = rolled_mean.loc[step_index]
                if std_col_name in rolling_features_to_recalc:
                    rolled_std = shifted_by_category.transform(
                        lambda s, w=window: s.rolling(w, min_periods=2).std()
                    )
                    # std is NaN with fewer than two observations.
                    current_features_df[std_col_name] = rolled_std.loc[step_index].fillna(0)

            current_features_df = reduce_mem_usage(current_features_df, verbose=False)

        current_step_preds_data = []
        for category_name in categories:
            model_r = trained_models_recursive.get(category_name)
            if model_r is None:
                continue
            category_rows = current_features_df[current_features_df['category'] == category_name]
            if category_rows.empty:
                continue

            features_for_pred = category_rows[features_to_use]
            if features_for_pred.isnull().any().any():
                print(f"    Warning: NaNs found in features for {category_name} at step {predict_step}. Filling with 0.")
                features_for_pred = features_for_pred.fillna(0)
            predictions = model_r.predict(features_for_pred, num_iteration=model_r.best_iteration)
            current_step_preds_data.append(pd.DataFrame({
                'index': category_rows.index,  # row labels in base_test
                TARGET: predictions.clip(0),
                'category': category_name,
                'absolute_week_id': current_pred_week_abs,
                'week_id': predict_step
            }))

        # Write this step's predictions back BEFORE the next recursion step.
        if current_step_preds_data:
            current_preds_df = pd.concat(current_step_preds_data).set_index('index')
            base_test.loc[current_preds_df.index, TARGET] = current_preds_df[TARGET]
            all_recursive_preds.append(current_preds_df[['category', 'absolute_week_id', 'week_id', TARGET]])
        else:
            print(f"   Warning: No predictions made for step {predict_step}")

        print(f"   Week {predict_step} processed in {(time.time() - step_start_time):.2f} sec")
        gc.collect()

    # Assemble the recursive forecasts into wide format.
    if all_recursive_preds:
        recursive_preds_long = pd.concat(all_recursive_preds).reset_index(drop=True)
        recursive_preds_final = recursive_preds_long.pivot(index='category', columns='week_id', values=TARGET)
        recursive_preds_final.columns = [f'F{int(step)}' for step in recursive_preds_final.columns]
        print(" Recursive predictions generated.")
    else:
        print(" ERROR: No recursive predictions were generated.")
        recursive_preds_final = None

    del base_test; gc.collect()
    print(f"\n--- Stage 3: Completed in {(time.time() - overall_start_time)/60:.2f} min ---")
    return recursive_preds_final, nonrecursive_preds_final

# --- Stage 4: Ensembling ---
def run_ensembling(recursive_preds_final, nonrecursive_preds_final):
    """Stage 4: blend the two forecast tables with a plain average.

    Both inputs are restricted to the categories (index labels) they share
    before being averaged element-wise.

    Args:
        recursive_preds_final: Wide forecast frame indexed by category, or None.
        nonrecursive_preds_final: Wide forecast frame indexed by category, or None.

    Returns:
        The averaged frame, or ``None`` when either input is missing.
    """
    print("\n--- Starting Stage 4: Ensembling ---")
    inputs = (recursive_preds_final, nonrecursive_preds_final)
    if any(frame is None for frame in inputs):
        print(" ERROR: One or both prediction sets are missing. Cannot ensemble.")
        return None

    # Keep only the categories present in both tables.
    shared = recursive_preds_final.index.intersection(nonrecursive_preds_final.index)
    rec_part = recursive_preds_final.loc[shared]
    nonrec_part = nonrecursive_preds_final.loc[shared]
    print(f"Ensembling predictions for {len(shared)} categories.")

    blended = rec_part.add(nonrec_part).div(2.0)
    print("Ensemble created (simple average).")
    print("--- Stage 4: Completed ---")
    return blended

# --- Stage 5: Building the final submission file ---
def format_submission(ensembled_preds, test_template_path, output_path):
    """Stage 5: convert wide forecasts to the ``id,num_papers`` submission file.

    The wide frame (index = category, columns ``F<step>``) is melted to one
    row per category and step with ``id = '<category>__<step>'``. The result
    is left-joined onto the template so the file follows the template's ids
    and order; ids without a forecast get 0 and negative values are clipped
    to 0.

    Args:
        ensembled_preds: Wide forecast frame, or ``None``.
        test_template_path: CSV with either an ``id`` column or ``category``
            and ``week_id`` columns from which the id is built.
        output_path: Destination CSV path.

    Returns:
        The submission DataFrame that was written, or ``None`` when
        ``ensembled_preds`` is ``None``.

    Raises:
        KeyError: if the template offers no way to obtain an ``id`` column.
    """
    print("\n--- Starting Stage 5: Formatting Final Submission ---")
    if ensembled_preds is None:
        print(" ERROR: Ensembled predictions are missing. Cannot format submission.")
        return

    # The output column is always 'num_papers', so name it that from the start.
    final_sub_long = ensembled_preds.stack().reset_index()
    final_sub_long.columns = ['category', 'forecast_week_col', 'num_papers']

    # 'F3' -> 3
    final_sub_long['week_id'] = final_sub_long['forecast_week_col'].str[1:].astype(int)
    final_sub_long['id'] = final_sub_long['category'] + '__' + final_sub_long['week_id'].astype(str)
    final_submission_df = final_sub_long[['id', 'num_papers']]

    # The template guarantees every expected id and the expected row order.
    submission_template = pd.read_csv(test_template_path)
    if 'category' in submission_template.columns and 'week_id' in submission_template.columns:
        submission_template['id'] = submission_template['category'] + '__' + submission_template['week_id'].astype(str)
        submission_template = submission_template[['id']].copy()
    elif 'id' not in submission_template.columns:
        raise KeyError(
            f"Template {test_template_path} needs an 'id' column or 'category' and 'week_id' columns"
        )
    # A placeholder target column in the template would collide with ours in
    # the merge (num_papers_x / num_papers_y), so drop it first.
    submission_template = submission_template.drop(columns=['num_papers'], errors='ignore')

    final_submission = submission_template.merge(final_submission_df, on='id', how='left')
    # Assign the result back: an in-place fillna on a selected column does not
    # modify the frame under pandas Copy-on-Write.
    final_submission['num_papers'] = final_submission['num_papers'].fillna(0).clip(0)

    # Rounding is deliberately not applied; enable it if integers are required:
    # final_submission['num_papers'] = final_submission['num_papers'].round().astype(int)

    final_submission.to_csv(output_path, index=False)
    print(f"Final submission saved to {output_path}")
    print(final_submission.head())
    print("--- Stage 5: Completed ---")
    return final_submission

# --- Main pipeline ---
if __name__ == '__main__':
    # Run the five stages in order and report the total wall-clock time.
    total_pipeline_start_time = time.time()
    template_csv = os.path.join(raw_data_dir, 'test-7.csv')
    submission_csv = os.path.join(submission_dir, 'submission_ensemble_arxiv_final.csv')

    # Stage 1: feature grid
    grid_df, model_features, categorical_for_lgbm, categories = run_preprocessing()

    # Stage 2: per-category models (the returned categorical list is not needed here)
    trained_models_r, trained_models_nr, features_to_use, _ = run_training(
        grid_df, model_features, categorical_for_lgbm, categories
    )

    # Stage 3: both forecast variants
    recursive_preds, nonrecursive_preds = run_prediction(
        trained_models_r, trained_models_nr, features_to_use, categories
    )

    # Stages 4-5: blend and write the submission
    ensemble_predictions = run_ensembling(recursive_preds, nonrecursive_preds)
    final_sub = format_submission(ensemble_predictions, template_csv, submission_csv)

    elapsed_minutes = (time.time() - total_pipeline_start_time) / 60
    print(f"\n Pipeline Finished. Total execution time: {elapsed_minutes:.2f} minutes.")