# Дорогой дневник

--------------------------------------------

1) Сначала я решил попробовать без генерации новых фичей и без подкрутки параметров запустить LightGBM и посмотреть, что будет. Скор 0.720
При этом пока еще нет ни генерации новых признаков, ни использования таблицы поиска и тд. Попробуем добавить.

2) Просто запускаем код из "baseline_1_pandas.ipynb" и получаем обещаный скор 0.817

3) Оптюнил 10 минут, безрезультатно. Буду придумывать новые признаки. Думаю начать с кластеризации и knn. 

4) Сначала решил просто расширить плавающее окно с 4 до 5 месяцев, результат 0.8192.

    Есть огромное количество идей, только что заменил пандас на поларс, потому что он реально на порядок быстрее. В первую очередь хочется применить знания с семинара по интерпретации бустингов, но сначала заменить катбуст на lgbm, ибо Илья утверждал, что при должном обращении он рвет и мечет.

--------------------------------------------

# Другой подход к кластеризации

Пробуем новую идею, если не получается кластеризовать все обьекты, будем делать так: мы же уже знаем какие есть кластеры, просто раздадим метки по ключевым словам, олценим сколько осталось и проведем еще одну кластеризацию. 

In [1]:
import polars as pl
import pandas as pd
import numpy as np
import catboost
import os
from datetime import date, timedelta
from itertools import combinations

from local_utils import *
import lightgbm as lgb

%load_ext autoreload
%autoreload 2

test_start_date = date(2024, 8, 1)
val_start_date = date(2024, 7, 1)
val_end_date = date(2024, 7, 31)
train_end_date = date(2024, 6, 30)
data_path = "C:\\Users\\Admin\\Desktop\\AIM 2сем\\ML2\\hw2"

actions_history = pl.scan_parquet(os.path.join(data_path, 'actions_history/*.parquet')).collect()
search_history = pl.scan_parquet(os.path.join(data_path, 'cluster_search/*.parquet')).collect()
product_information = pl.read_csv(
    os.path.join(data_path, 'cluster_product_information.csv'),
    ignore_errors=True
)

val_target = (
    actions_history
    .filter(pl.col('timestamp').dt.date() >= val_start_date)
    .filter(pl.col('timestamp').dt.date() <= val_end_date)
    .select('user_id', (pl.col('action_type_id') == 3).alias('has_order'))
    .group_by('user_id')
    .agg(pl.max('has_order').cast(pl.Int32).alias('target'))
)

val_target.group_by('target').agg(pl.count('user_id'))

target,user_id
i32,u32
0,1227381
1,647575


In [2]:
actions_aggs = {}
actions_id_to_suf = {
    1: "click",
    2: "favorite", 
    3: "order",
    5: "to_cart",
}

# Сначала соберем все агрегированные данные
all_aggs = []
numeric_features = []

for id_, suf in actions_id_to_suf.items():
    aggs = (
        actions_history
        .filter(pl.col('timestamp').dt.date() <= train_end_date)
        .filter(pl.col('timestamp').dt.date() >= train_end_date - timedelta(days=30 * 4))
        .filter(pl.col('action_type_id') == id_)
        .join(
            product_information.select('product_id', 'discount_price'),
            on='product_id',
        )
        .group_by('user_id')
        .agg(
            pl.count('product_id').cast(pl.Int32).alias(f'num_products_{suf}'),
            pl.sum('discount_price').cast(pl.Float32).alias(f'sum_discount_price_{suf}'),
            pl.max('discount_price').cast(pl.Float32).alias(f'max_discount_price_{suf}'),
            pl.max('timestamp').alias(f'last_{suf}_time'),
            pl.min('timestamp').alias(f'first_{suf}_time'),
        )
        .with_columns([
            (pl.lit(val_start_date) - pl.col(f'last_{suf}_time'))
            .dt.total_days()
            .cast(pl.Int32)
            .alias(f'days_since_last_{suf}'),
            
            (pl.lit(val_start_date) - pl.col(f'first_{suf}_time'))
            .dt.total_days()
            .cast(pl.Int32)
            .alias(f'days_since_first_{suf}'),
        ])
    )
    
    # Сохраняем имена числовых фичей для последующего создания полиномов
    numeric_features.extend([
        f'num_products_{suf}',
        f'sum_discount_price_{suf}', 
        f'max_discount_price_{suf}',
        f'days_since_last_{suf}',
        f'days_since_first_{suf}',
    ])
    
    actions_aggs[id_] = aggs
    all_aggs.append(aggs)

# Объединяем все агрегации по user_id с указанием суффиксов
combined = all_aggs[0]
for i, agg in enumerate(all_aggs[1:], 1):
    combined = combined.join(
        agg, 
        on='user_id', 
        how='left',
        suffix=f"_{i}"  # Добавляем уникальный суффикс для каждого соединения
    )

In [3]:
# search_aggs
id_ = 4
suf = 'search'

# Сначала вычислим value_counts отдельно и развернем их в плоскую структуру
cluster_counts = (
    search_history
    .filter(pl.col('action_type_id') == id_)
    .filter(pl.col('timestamp').dt.date() <= train_end_date)
    .filter(pl.col('timestamp').dt.date() >= train_end_date - timedelta(days=30 * 5))
    .group_by('user_id')
    .agg(
        pl.col('cluster').value_counts().alias('cluster_counts')
    )
    .explode('cluster_counts')
    .with_columns(
        pl.col('cluster_counts').struct.field('cluster').alias('cluster_name'),
        pl.col('cluster_counts').struct.field('count').alias('cluster_count')
    )
    .group_by('user_id')
    .agg(
        pl.col('cluster_name').sort_by('cluster_count', descending=True).head(3).alias('top3_clusters'),
        pl.col('cluster_count').sort(descending=True).head(3).alias('top3_counts')
    )
)

actions_aggs[id_] = (
    search_history
    .filter(pl.col('action_type_id') == id_)
    .filter(pl.col('timestamp').dt.date() <= train_end_date)
    .filter(pl.col('timestamp').dt.date() >= train_end_date - timedelta(days=30 * 5))
    .group_by('user_id')
    .agg(
        # Общее количество поисков за 5 месяцев
        pl.count('search_query').cast(pl.Int32).alias(f'num_{suf}'),
        pl.col('search_query').n_unique().alias(f'unique_{suf}_queries'),
        
        # Количество поисков за последний месяц (30 дней)
        pl.col('search_query')
            .filter(pl.col('timestamp').dt.date() >= train_end_date - timedelta(days=30))
            .count()
            .cast(pl.Int32)
            .alias(f'num_{suf}_last_month'),
        
        # Количество поисков за последнюю неделю (7 дней)
        pl.col('search_query')
            .filter(pl.col('timestamp').dt.date() >= train_end_date - timedelta(days=7))
            .count()
            .cast(pl.Int32)
            .alias(f'num_{suf}_last_week'),

        (pl.count() / (pl.max('timestamp') - pl.min('timestamp')).dt.total_days()).alias(f'{suf}_daily_rate'),

        pl.col('cluster').n_unique().alias(f'num_{suf}_clusters'),
        pl.col('cluster').mode().first().alias(f'main_{suf}_cluster'),
        
        # Динамика кластеров
        pl.col('cluster')
            .filter(pl.col('timestamp').dt.date() >= train_end_date - timedelta(days=30))
            .mode().first()
            .alias(f'recent_{suf}_cluster'),

        (pl.col('cluster').value_counts().struct.field('count').max() / pl.col('cluster').count()).alias(f'{suf}_cluster_concentration'),
        
        # Энтропия кластеров (мера разнообразия)
        (-(pl.col('cluster').value_counts().struct.field('count') / pl.col('cluster').count()).log()
            * (pl.col('cluster').value_counts().struct.field('count') / pl.col('cluster').count())
            .sum()).alias(f'{suf}_cluster_entropy'),
        
        # Переключения между кластерами
        pl.col('cluster').diff().fill_null(0).abs().sum().alias(f'{suf}_cluster_switches'),
        
        # Стабильность кластеров (процент повторяющихся)
        ((pl.col('cluster').count() - pl.col('cluster').n_unique()) / pl.col('cluster').count())
            .alias(f'{suf}_cluster_stability'),
        
        # Время в основном кластере
        (pl.col('timestamp')
            .filter(pl.col('cluster') == pl.col('cluster').mode().first())
            .count() / pl.col('timestamp').count())
            .alias(f'main_{suf}_cluster_time_ratio'),

        pl.col('timestamp').filter(pl.col('cluster').diff().fill_null(0) != 0)
            .diff()
            .dt.total_days()
            .mean()
            .alias(f'{suf}_mean_cluster_switch_days'),

        pl.col('search_query').str.len_chars().mean().alias(f'{suf}_mean_query_len'),
        
        (pl.col('search_query').str.len_chars()
            .filter(pl.col('cluster') == pl.col('cluster').mode().first()).mean() - 
            pl.col('search_query').str.len_chars()
                .filter(pl.col('cluster') != pl.col('cluster').mode().first()).mean())
                .alias(f'{suf}_main_cluster_query_len_diff'),

        pl.max('timestamp').alias(f'last_{suf}_time'),
        pl.min('timestamp').alias(f'first_{suf}_time'),
    )
    .join(cluster_counts, on='user_id', how='left')
    .with_columns([
        (pl.lit(val_start_date) - pl.col(f'last_{suf}_time'))
            .dt.total_days()
            .cast(pl.Int32)
            .alias(f'days_since_last_{suf}'),

        (pl.lit(val_start_date) - pl.col(f'first_{suf}_time'))
            .dt.total_days()
            .cast(pl.Int32)
            .alias(f'days_since_first_{suf}'),
    ])
    .select(
        'user_id',
        f'num_{suf}',
        f'unique_{suf}_queries',
        f'num_{suf}_last_month',
        f'num_{suf}_last_week',
        f'{suf}_daily_rate',
        f'num_{suf}_clusters',
        f'main_{suf}_cluster',
        pl.col('top3_clusters').alias(f'top3_{suf}_clusters'),
        pl.col('top3_counts').alias(f'top3_{suf}_counts'),
        f'recent_{suf}_cluster',
        f'{suf}_cluster_concentration',
        f'{suf}_cluster_entropy',
        f'{suf}_cluster_switches',
        f'{suf}_cluster_stability',
        f'main_{suf}_cluster_time_ratio',
        f'{suf}_mean_cluster_switch_days',
        f'{suf}_mean_query_len',
        f'{suf}_main_cluster_query_len_diff',
        f'days_since_last_{suf}',
        f'days_since_first_{suf}',
        f'last_{suf}_time',
        f'first_{suf}_time',
    )
)

  (pl.count() / (pl.max('timestamp') - pl.min('timestamp')).dt.total_days()).alias(f'{suf}_daily_rate'),


In [4]:
product_cluster_aggs = (
    actions_history
    .filter(pl.col('timestamp').dt.date() <= train_end_date)
    .filter(pl.col('timestamp').dt.date() >= train_end_date - timedelta(days=30 * 5))
    .join(
        product_information.select('product_id', 'cluster'),
        on='product_id'
    )
    .group_by('user_id')
    .agg(
        # Общие агрегаты по кластерам продуктов
        pl.col('cluster').n_unique().alias('num_product_clusters'),
        pl.col('cluster').mode().first().alias('main_product_cluster'),
        
        # Аналогичные агрегаты как для search
        (pl.col('cluster').value_counts().struct.field('count').max() / pl.col('cluster').count()).alias('product_cluster_concentration'),
        
        # Энтропия кластеров продуктов
        (-(pl.col('cluster').value_counts().struct.field('count') / pl.col('cluster').count()).log()
            * (pl.col('cluster').value_counts().struct.field('count') / pl.col('cluster').count())
            .sum()).alias('product_cluster_entropy'),
        
        # Стабильность кластеров продуктов
        ((pl.col('cluster').count() - pl.col('cluster').n_unique()) / pl.col('cluster').count())
            .alias('product_cluster_stability'),
        
        # Время в основном кластере продуктов
        (pl.col('timestamp')
            .filter(pl.col('cluster') == pl.col('cluster').mode().first())
            .count() / pl.col('timestamp').count()
        ).alias('main_product_cluster_time_ratio'),
            
        # Top 3 кластеров продуктов
        pl.col('cluster').value_counts().struct.field('cluster').alias('top_product_clusters'),
        pl.col('cluster').value_counts().struct.field('count').alias('top_product_counts')
    )
    .with_columns(
        pl.col('top_product_clusters').list.head(3).alias('top3_product_clusters'),
        pl.col('top_product_counts').list.head(3).alias('top3_product_counts')
    )
    .drop(['top_product_clusters', 'top_product_counts'])
)

In [5]:
train_last_month_features = (
    actions_history
    .filter(
        (pl.col('timestamp').dt.date() < val_start_date) &  # до валидации
        (pl.col('timestamp').dt.date() >= val_start_date - timedelta(days=30))  # последние 30 дней
    )
    .join(
        product_information.select('product_id', 'discount_price', 'cluster'),
        on='product_id'
    )
    .group_by('user_id')
    .agg(
        # Общая активность
        pl.count().alias('total_actions_30d'),
        
        # Разбивка по типам действий
        (pl.col('action_type_id') == 1).sum().alias('clicks_30d'),
        (pl.col('action_type_id') == 2).sum().alias('favorites_30d'),
        (pl.col('action_type_id') == 5).sum().alias('cart_adds_30d'),
        
        # Финансовые метрики
        pl.sum('discount_price').alias('total_spent_30d'),
        pl.mean('discount_price').alias('avg_price_30d'),
        
        # Временные характеристики (исправлено!)
        (val_start_date - pl.col('timestamp').max().dt.date()).dt.total_days().alias('days_since_last_action'),
        (pl.col('timestamp').max() - pl.col('timestamp').min()).dt.total_days().alias('active_days_30d'),
        
        # Метрики кластеров
        pl.col('cluster').n_unique().alias('unique_clusters_30d'),
        (pl.col('cluster').value_counts().struct.field('count').max() / pl.count()).alias('main_cluster_ratio_30d')
    )
    .with_columns(
        # Производные признаки
        (pl.col('total_actions_30d') / pl.col('active_days_30d')).alias('daily_actions_rate_30d'),
        (pl.col('cart_adds_30d') / pl.col('total_actions_30d')).alias('cart_add_ratio_30d'),
        (pl.col('favorites_30d') / pl.col('total_actions_30d')).alias('favorite_ratio_30d')
    )
)

  pl.count().alias('total_actions_30d'),
  (pl.col('cluster').value_counts().struct.field('count').max() / pl.count()).alias('main_cluster_ratio_30d')


In [6]:
df_main = val_target
for _, actions_aggs_df in actions_aggs.items():
    df_main = (
        df_main
        .join(actions_aggs_df, on='user_id', how='left')
    )

df_main = df_main.join(product_cluster_aggs, on='user_id', how='left')
df_main = df_main.join(train_last_month_features, on='user_id', how='left')
    
df_pd = df_main.to_pandas()

columns_to_log = ['max_discount_price_click', 'num_products_favorite', 'sum_discount_price_favorite', 'max_discount_price_favorite',  'num_products_order', 'sum_discount_price_order', 'sum_discount_price_order',  'num_products_to_cart', 'max_discount_price_to_cart', 'num_search', 'unique_search_queries', 'num_search_last_month', 'num_search_last_week', 'search_daily_rate', 'search_cluster_switches', 'search_mean_query_len', 'search_main_cluster_query_len_diff']

df_pd = apply_log_transform(df_pd, columns_to_log, drop_original=True)

In [7]:
from local_utils import *

pca_cols = list(set(df_pd.columns) - {'user_id', 'target', 'last_click_time', 'first_click_time', 'last_favorite_time', 'first_favorite_time', 
                                'last_order_time', 'first_order_time', 'last_to_cart_time', 'first_to_cart_time', 'last_search_time', 'first_search_time',
                                'top3_search_clusters', 'top3_search_counts', 'search_cluster_entropy', 'top3_product_counts', 'product_cluster_entropy', 'top3_product_clusters'})
df_pd = add_pca_columns(df_pd,  pca_cols,  n_components = 2)

Nans filled


In [8]:
knn_cols = ['days_since_first_order', 'days_since_last_order', 'sum_discount_price_to_cart', 'num_products_click', 'main_search_cluster', 'search_cluster_stability', 'product_cluster_stability']

df_pd = add_knn_features_faiss(df_pd, knn_cols, n_neighbors=5, use_gpu=True)

Nans filled
Data scaled
Using CPU
FAISS index built
KNN search done


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)


KNN features created


------------------------------------------------------------

------------------------------------------------------------

------------------------------------------------------------

------------------------------------------------------------

------------------------------------------------------------

------------------------------------------------------------

In [9]:
actions_aggs = {}
actions_id_to_suf = {
    1: "click",
    2: "favorite", 
    3: "order",
    5: "to_cart",
}

# Сначала соберем все агрегированные данные
all_aggs = []
numeric_features = []

for id_, suf in actions_id_to_suf.items():
    aggs = (
        actions_history
        .filter(pl.col('timestamp').dt.date() <= val_end_date)
        .filter(pl.col('timestamp').dt.date() >= val_end_date - timedelta(days=30 * 5))
        .filter(pl.col('action_type_id') == id_)
        .join(
            product_information.select('product_id', 'discount_price'),
            on='product_id',
        )
        .group_by('user_id')
        .agg(
            pl.count('product_id').cast(pl.Int32).alias(f'num_products_{suf}'),
            pl.sum('discount_price').cast(pl.Float32).alias(f'sum_discount_price_{suf}'),
            pl.max('discount_price').cast(pl.Float32).alias(f'max_discount_price_{suf}'),
            pl.max('timestamp').alias(f'last_{suf}_time'),
            pl.min('timestamp').alias(f'first_{suf}_time'),
        )
        .with_columns([
            (pl.lit(test_start_date) - pl.col(f'last_{suf}_time'))
            .dt.total_days()
            .cast(pl.Int32)
            .alias(f'days_since_last_{suf}'),
            
            (pl.lit(test_start_date) - pl.col(f'first_{suf}_time'))
            .dt.total_days()
            .cast(pl.Int32)
            .alias(f'days_since_first_{suf}'),
        ])
    )
    
    # Сохраняем имена числовых фичей для создания полиномов
    numeric_features.extend([
        f'num_products_{suf}',
        f'sum_discount_price_{suf}', 
        f'max_discount_price_{suf}',
        f'days_since_last_{suf}',
        f'days_since_first_{suf}',
    ])
    
    actions_aggs[id_] = aggs
    all_aggs.append(aggs)

# Объединяем все агрегации по user_id с суффиксами
combined_val = all_aggs[0]
for i, agg in enumerate(all_aggs[1:], 1):
    combined_val = combined_val.join(
        agg, 
        on='user_id', 
        how='outer',
        suffix=f"_{i}"
    )

  combined_val = combined_val.join(


In [10]:
id_ = 4
suf = 'search'

# Вычисляем top3 кластеров для валидации (аналогично трейну)
val_cluster_counts = (
    search_history
    .filter(pl.col('action_type_id') == id_)
    .filter(pl.col('timestamp').dt.date() <= val_end_date)
    .filter(pl.col('timestamp').dt.date() >= val_end_date - timedelta(days=30 * 5))
    .group_by('user_id')
    .agg(
        pl.col('cluster').value_counts().alias('cluster_counts')
    )
    .explode('cluster_counts')
    .with_columns(
        pl.col('cluster_counts').struct.field('cluster').alias('cluster_name'),
        pl.col('cluster_counts').struct.field('count').alias('cluster_count')
    )
    .group_by('user_id')
    .agg(
        pl.col('cluster_name').sort_by('cluster_count', descending=True).head(3).alias('top3_clusters'),
        pl.col('cluster_count').sort(descending=True).head(3).alias('top3_counts')
    )
)

actions_aggs[id_] = (
    search_history
    .filter(pl.col('action_type_id') == id_)
    .filter(pl.col('timestamp').dt.date() <= val_end_date)
    .filter(pl.col('timestamp').dt.date() >= val_end_date - timedelta(days=30 * 5))
    .group_by('user_id')
    .agg(
        # Общее количество поисков за 5 месяцев
        pl.count('search_query').cast(pl.Int32).alias(f'num_{suf}'),
        pl.col('search_query').n_unique().alias(f'unique_{suf}_queries'),
        
        # Количество поисков за последний месяц (30 дней)
        pl.col('search_query')
            .filter(pl.col('timestamp').dt.date() >= val_end_date - timedelta(days=30))
            .count()
            .cast(pl.Int32)
            .alias(f'num_{suf}_last_month'),
        
        # Количество поисков за последнюю неделю (7 дней)
        pl.col('search_query')
            .filter(pl.col('timestamp').dt.date() >= val_end_date - timedelta(days=7))
            .count()
            .cast(pl.Int32)
            .alias(f'num_{suf}_last_week'),

        (pl.count() / (pl.max('timestamp') - pl.min('timestamp')).dt.total_days()).alias(f'{suf}_daily_rate'),

        pl.col('cluster').n_unique().alias(f'num_{suf}_clusters'),
        pl.col('cluster').mode().first().alias(f'main_{suf}_cluster'),
        
        # Динамика кластеров
        pl.col('cluster')
            .filter(pl.col('timestamp').dt.date() >= val_end_date - timedelta(days=30))
            .mode().first()
            .alias(f'recent_{suf}_cluster'),

        (pl.col('cluster').value_counts().struct.field('count').max() / pl.col('cluster').count()).alias(f'{suf}_cluster_concentration'),
        
        # Энтропия кластеров
        (-(pl.col('cluster').value_counts().struct.field('count') / pl.col('cluster').count()).log()
            * (pl.col('cluster').value_counts().struct.field('count') / pl.col('cluster').count())
            .sum()).alias(f'{suf}_cluster_entropy'),
        
        # Переключения между кластерами
        pl.col('cluster').diff().fill_null(0).abs().sum().alias(f'{suf}_cluster_switches'),
        
        # Стабильность кластеров
        ((pl.col('cluster').count() - pl.col('cluster').n_unique()) / pl.col('cluster').count())
            .alias(f'{suf}_cluster_stability'),
        
        # Время в основном кластере
        (pl.col('timestamp')
            .filter(pl.col('cluster') == pl.col('cluster').mode().first())
            .count() / pl.col('timestamp').count())
            .alias(f'main_{suf}_cluster_time_ratio'),

        pl.col('timestamp').filter(pl.col('cluster').diff().fill_null(0) != 0)
            .diff()
            .dt.total_days()
            .mean()
            .alias(f'{suf}_mean_cluster_switch_days'),

        pl.col('search_query').str.len_chars().mean().alias(f'{suf}_mean_query_len'),
        
        (pl.col('search_query').str.len_chars()
            .filter(pl.col('cluster') == pl.col('cluster').mode().first()).mean() - 
            pl.col('search_query').str.len_chars()
                .filter(pl.col('cluster') != pl.col('cluster').mode().first()).mean())
                .alias(f'{suf}_main_cluster_query_len_diff'),

        pl.max('timestamp').alias(f'last_{suf}_time'),
        pl.min('timestamp').alias(f'first_{suf}_time'),
    )
    .join(val_cluster_counts, on='user_id', how='left')
    .with_columns([
        (pl.lit(test_start_date) - pl.col(f'last_{suf}_time'))
            .dt.total_days()
            .cast(pl.Int32)
            .alias(f'days_since_last_{suf}'),

        (pl.lit(test_start_date) - pl.col(f'first_{suf}_time'))
            .dt.total_days()
            .cast(pl.Int32)
            .alias(f'days_since_first_{suf}'),
    ])
    .select(
        'user_id',
        f'num_{suf}',
        f'unique_{suf}_queries',
        f'num_{suf}_last_month',
        f'num_{suf}_last_week',
        f'{suf}_daily_rate',
        f'num_{suf}_clusters',
        f'main_{suf}_cluster',
        pl.col('top3_clusters').alias(f'top3_{suf}_clusters'),
        pl.col('top3_counts').alias(f'top3_{suf}_counts'),
        f'recent_{suf}_cluster',
        f'{suf}_cluster_concentration',
        f'{suf}_cluster_entropy',
        f'{suf}_cluster_switches',
        f'{suf}_cluster_stability',
        f'main_{suf}_cluster_time_ratio',
        f'{suf}_mean_cluster_switch_days',
        f'{suf}_mean_query_len',
        f'{suf}_main_cluster_query_len_diff',
        f'days_since_last_{suf}',
        f'days_since_first_{suf}',
        f'last_{suf}_time',
        f'first_{suf}_time',
    )
)

  (pl.count() / (pl.max('timestamp') - pl.min('timestamp')).dt.total_days()).alias(f'{suf}_daily_rate'),


In [11]:
val_product_cluster_aggs = (
    actions_history
    .filter(pl.col('timestamp').dt.date() <= val_end_date)
    .filter(pl.col('timestamp').dt.date() >= val_end_date - timedelta(days=30 * 5))
    .join(
        product_information.select('product_id', 'cluster'),
        on='product_id'
    )
    .group_by('user_id')
    .agg(
        # Общие агрегаты по кластерам продуктов
        pl.col('cluster').n_unique().alias('num_product_clusters'),
        pl.col('cluster').mode().first().alias('main_product_cluster'),
        
        # Аналогичные агрегаты как для search
        (pl.col('cluster').value_counts().struct.field('count').max() / pl.col('cluster').count()).alias('product_cluster_concentration'),
        
        # Энтропия кластеров продуктов
        (-(pl.col('cluster').value_counts().struct.field('count') / pl.col('cluster').count()).log()
            * (pl.col('cluster').value_counts().struct.field('count') / pl.col('cluster').count())
            .sum()).alias('product_cluster_entropy'),
        
        # Стабильность кластеров продуктов
        ((pl.col('cluster').count() - pl.col('cluster').n_unique()) / pl.col('cluster').count())
            .alias('product_cluster_stability'),
        
        # Время в основном кластере продуктов
        (pl.col('timestamp')
            .filter(pl.col('cluster') == pl.col('cluster').mode().first())
            .count() / pl.col('timestamp').count()
        ).alias('main_product_cluster_time_ratio'),
            
        # Top 3 кластеров продуктов
        pl.col('cluster').value_counts().struct.field('cluster').alias('top_product_clusters'),
        pl.col('cluster').value_counts().struct.field('count').alias('top_product_counts')
    )
    .with_columns(
        pl.col('top_product_clusters').list.head(3).alias('top3_product_clusters'),
        pl.col('top_product_counts').list.head(3).alias('top3_product_counts')
    )
    .drop(['top_product_clusters', 'top_product_counts'])
)

In [12]:
test_last_month_features = (
    actions_history
    .filter(
        (pl.col('timestamp').dt.date() < test_start_date) &  # до теста
        (pl.col('timestamp').dt.date() >= test_start_date - timedelta(days=30))  # последние 30 дней
    )
    .join(
        product_information.select('product_id', 'discount_price', 'cluster'),
        on='product_id'
    )
    .group_by('user_id')
    .agg(
        # Те же метрики, что и для трейна
        pl.count().alias('total_actions_30d'),
        (pl.col('action_type_id') == 1).sum().alias('clicks_30d'),
        (pl.col('action_type_id') == 2).sum().alias('favorites_30d'),
        (pl.col('action_type_id') == 5).sum().alias('cart_adds_30d'),
        
        pl.sum('discount_price').alias('total_spent_30d'),
        pl.mean('discount_price').alias('avg_price_30d'),
        
        (test_start_date - pl.col('timestamp').max().dt.date()).dt.total_days().alias('days_since_last_action'),
        (pl.col('timestamp').max() - pl.col('timestamp').min()).dt.total_days().alias('active_days_30d'),
        
        pl.col('cluster').n_unique().alias('unique_clusters_30d'),
        (pl.col('cluster').value_counts().struct.field('count').max() / pl.count()).alias('main_cluster_ratio_30d')
    )
    .with_columns(
        (pl.col('total_actions_30d') / pl.col('active_days_30d')).alias('daily_actions_rate_30d'),
        (pl.col('cart_adds_30d') / pl.col('total_actions_30d')).alias('cart_add_ratio_30d'),
        (pl.col('favorites_30d') / pl.col('total_actions_30d')).alias('favorite_ratio_30d')
    )
)

  pl.count().alias('total_actions_30d'),
  (pl.col('cluster').value_counts().struct.field('count').max() / pl.count()).alias('main_cluster_ratio_30d')


In [13]:
from local_utils import *

test_users_submission = (
    pl.read_csv(os.path.join(data_path, 'test_users.csv'))
)

test_df_main = test_users_submission
for _, actions_aggs_df in actions_aggs.items():
    test_df_main = (
        test_df_main
        .join(actions_aggs_df, on='user_id', how='left')
    )
test_df_main = test_df_main.join(val_product_cluster_aggs, on='user_id', how='left')
test_df_main = test_df_main.join(test_last_month_features, on='user_id', how='left')

test_df_pd = test_df_main.to_pandas()

test_df_pd = apply_log_transform(test_df_pd, columns_to_log, drop_original=True)

In [14]:
test_df_pd = add_pca_columns(test_df_pd,  pca_cols,  n_components = 2)

Nans filled


In [15]:
test_df_pd = add_knn_features_faiss(test_df_pd, knn_cols, n_neighbors=5, use_gpu=True)

Nans filled
Data scaled
Using CPU
FAISS index built
KNN search done


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)


KNN features created


----------------------------------

NameError: name 'ы' is not defined

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# 1. Полностью очищаем данные от временных столбцов и нечисловых типов
cols = list(set(df_pd.columns) - {'user_id', 'target', 'last_click_time', 'first_click_time', 'last_favorite_time', 'first_favorite_time', 
                                  'last_order_time', 'first_order_time', 'last_to_cart_time', 'first_to_cart_time', 'last_search_time', 'first_search_time',
                                  'top3_search_clusters', 'top3_search_counts', 'search_cluster_entropy', 'top3_product_counts', 'product_cluster_entropy', 'top3_product_clusters'})

# 2. Явно преобразуем все данные в float и заменяем оставшиеся NaT/NaN
X_train = df_pd[cols].astype(float).values
y_train = df_pd['target'].values

X_test = test_df_pd[cols].astype(float).values

# Базовые модели с поддержкой NaN
base_models = {
    "catboost": CatBoostClassifier(
        iterations=150, learning_rate=0.05, depth=5, random_state=42,
        verbose=0, allow_writing_files=False
    ),
    "lgbm": lgb.LGBMClassifier(
        n_estimators=150, learning_rate=0.05, max_depth=5, random_state=42
    ),
    # Новые модели с совершенно другими параметрами
    "lgbm_deep": lgb.LGBMClassifier(
        n_estimators=300, learning_rate=0.01, max_depth=10, num_leaves=64,
        min_child_samples=20, reg_alpha=0.1, reg_lambda=0.1,
        subsample=0.8, colsample_bytree=0.7, random_state=42
    ),
    "lgbm_fast": lgb.LGBMClassifier(
        n_estimators=80, learning_rate=0.2, max_depth=3,
        min_data_in_leaf=10, boosting_type='dart',
        random_state=42, bagging_freq=1, bagging_fraction=0.9
    ),
    "lgbm_goss": lgb.LGBMClassifier(
        boosting_type='goss',
        n_estimators=200, learning_rate=0.02,
        max_depth=7, num_leaves=50,
        top_rate=0.2, other_rate=0.1,
        random_state=42
    ),
    "catboost_balanced": CatBoostClassifier(
        iterations=200, learning_rate=0.03, depth=8,
        l2_leaf_reg=5, scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train),
        random_seed=42, verbose=0, allow_writing_files=False,
        grow_policy='Lossguide'
    ),
    "catboost_tuned": CatBoostClassifier(
        iterations=100, learning_rate=0.1, depth=4,
        border_count=128, random_strength=0.5,
        bagging_temperature=0.8, od_type='Iter',
        od_wait=50, random_seed=42, verbose=0
    )
}

# Подготовка OOF-прогнозов
n_models = len(base_models)
n_train = len(X_train)
n_test = len(X_test)

meta_train = np.zeros((n_train, n_models))
meta_test = np.zeros((n_test, n_models))

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for i, (name, model) in enumerate(base_models.items()):
    print(f"Training {name}...")
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        if name.startswith('lgbm'):
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])
        elif name.startswith('catboost'):
            model.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=0)
        else:
            model.fit(X_tr, y_tr)
        
        meta_train[val_idx, i] = model.predict_proba(X_val)[:, 1]
            
    
    try:
        meta_test[:, i] = model.predict_proba(X_test)[:, 1]
    except Exception as e:
        print(f"Error in {name} predict: {str(e)}")
        meta_test[:, i] = 0.5

Training catboost...
Training lgbm...
[LightGBM] [Info] Number of positive: 518271, number of negative: 981693
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.282290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24857
[LightGBM] [Info] Number of data points in the train set: 1499964, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.345522 -> initscore=-0.638780
[LightGBM] [Info] Start training from score -0.638780
[LightGBM] [Info] Number of positive: 518137, number of negative: 981828
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.122892 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24841
[LightGBM] [Info] Number of data points in the train set: 1499965, number of used features: 124
[LightGBM] [Info] [binary:BoostF

In [21]:
meta_test

array([[0.21186571, 0.17236895, 0.18420134, ..., 0.18192606, 0.27707778,
        0.17589141],
       [0.68631946, 0.78158597, 0.74764962, ..., 0.76131111, 0.82645268,
        0.6579162 ],
       [0.25964519, 0.21470835, 0.22364766, ..., 0.24045381, 0.34127315,
        0.21091497],
       ...,
       [0.61983921, 0.63479345, 0.56406308, ..., 0.59530627, 0.73613724,
        0.64297975],
       [0.50364527, 0.52507877, 0.52562602, ..., 0.52954635, 0.67915716,
        0.52082136],
       [0.4883099 , 0.50563655, 0.50518336, ..., 0.52611808, 0.66105509,
        0.50868375]])

In [29]:
import optuna
from sklearn.metrics import roc_auc_score
import joblib

best_model = None
best_score = -1

def objective(trial):
    global best_model, best_score
    
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 39,
    }
    
    # Используем 3-fold CV для оценки
    scores = []
    models = []
    kf = KFold(n_splits=3, shuffle=True, random_state=39)
    
    for train_idx, val_idx in kf.split(meta_train):
        X_tr, X_val = meta_train[train_idx], meta_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(X_tr, y_tr)
        models.append(model)
        
        preds = model.predict_proba(X_val)[:, 1]
        score = roc_auc_score(y_val, preds)
        scores.append(score)
    
    mean_score = np.mean(scores)
    
    # Сохраняем лучшую модель
    if mean_score > best_score:
        best_score = mean_score
        # Обучаем модель на всех данных с этими параметрами
        best_model = lgb.LGBMClassifier(**params, n_estimators=900)
        best_model.fit(meta_train, y_train)
        
        # Сохраняем модель на диск
        joblib.dump(best_model, 'best_meta_model.pkl')
    
    return mean_score

# Оптимизация
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=60*20)  # 20 минут

# Если нужно загрузить сохраненную модель
best_model = joblib.load('best_meta_model.pkl')

# Используем сохраненную лучшую модель
test_df_pd['stacking_predict'] = best_model.predict_proba(meta_test)[:, 1]
result = test_df_pd[['user_id', 'stacking_predict']]
result

[I 2025-04-04 20:13:23,364] A new study created in memory with name: no-name-30de6478-0393-497e-9f6d-5949957239cb
[I 2025-04-04 20:13:37,149] Trial 0 finished with value: 0.759961494263126 and parameters: {'learning_rate': 0.00986099751972483, 'num_leaves': 153, 'max_depth': 3, 'min_child_samples': 12, 'subsample': 0.5609515483773001, 'colsample_bytree': 0.7121200852634904, 'reg_alpha': 0.4709557756637617, 'reg_lambda': 4.868078072391354}. Best is trial 0 with value: 0.759961494263126.
[I 2025-04-04 20:13:59,084] Trial 1 finished with value: 0.7602531098355813 and parameters: {'learning_rate': 0.0072400055643950575, 'num_leaves': 176, 'max_depth': 7, 'min_child_samples': 92, 'subsample': 0.541197862106498, 'colsample_bytree': 0.5291111180254446, 'reg_alpha': 8.70224153237558, 'reg_lambda': 9.228144134812505}. Best is trial 1 with value: 0.7602531098355813.
[I 2025-04-04 20:14:06,338] Trial 2 finished with value: 0.7600513580609557 and parameters: {'learning_rate': 0.002502939792935633,

Unnamed: 0,user_id,stacking_predict
0,1342,0.138443
1,9852,0.741076
2,10206,0.152573
3,11317,0.179707
4,13289,0.682962
...,...,...
2068419,11157283,0.164151
2068420,11160395,0.126374
2068421,11165052,0.588383
2068422,11168218,0.553448


In [30]:
test_df_pd[['user_id', 'stacking_predict']].to_csv('stacking_opt.csv', index=False)

In [None]:
# Параметры для LightGBM (упрощенные, без early stopping)
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'max_depth': 120,
    'num_leaves': 75,
    'min_data_in_leaf': 30,
    'n_estimators': 900,
    'random_state': 42,
    'verbose': 1,
    'random_state':39
}

# Создаем и обучаем мета-модель на всех данных
meta_model = lgb.LGBMClassifier(**lgb_params)
meta_model.fit(meta_train, y_train)  # Просто fit без валидации

test_df_pd['stacking_predict'] = meta_model.predict_proba(meta_test)[:, 1]
test_df_pd[['user_id', 'stacking_predict']]

[LightGBM] [Info] Number of positive: 647575, number of negative: 1227381
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 1874956, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.345381 -> initscore=-0.639403
[LightGBM] [Info] Start training from score -0.639403


array([0.16636209, 0.76811173, 0.20386324, ..., 0.58945946, 0.53982926,
       0.50582742])

In [26]:
test_df_pd[['user_id', 'stacking_predict']].to_csv('stacking.csv', index=False)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# Функция для обработки пропусков
def handle_missing_values(df):
    # Создаем копию DataFrame
    processed = df.copy()
    
    # Определяем группы столбцов
    columns = processed.columns
    zero_fill_cols = [col for col in columns if col.startswith(('num_', 'sum_', 'max_', 'days_', 'log_', 'avg_', 'main_', 'search_', 'product_', 'recent_', 'knn'))]
    zero_fill_cols_end = [col for col in columns if col.endswith(('_30d'))]
    time_fill_cols = [col for col in columns if col.endswith('_time')]
    
    # Заполняем пропуски
    processed[zero_fill_cols] = processed[zero_fill_cols].fillna(0)
    processed[zero_fill_cols_end] = processed[zero_fill_cols_end].fillna(0)
    # Проверяем, есть ли временные столбцы перед заполнением
    if time_fill_cols:
        processed[time_fill_cols] = processed[time_fill_cols].fillna(pd.Timestamp('2024-01-01 00:00:00'))

    # Для остальных числовых признаков
    numeric_cols = processed.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col not in zero_fill_cols and col not in zero_fill_cols_end and col not in time_fill_cols:
            processed[col] = processed[col].fillna(-999)
    
    return processed

# Обработка данных (предполагаем, что df_pd и test_df_pd уже определены)
df_processed = handle_missing_values(df_pd)
test_processed = handle_missing_values(test_df_pd)

# Выбор фичей (исключаем user_id и target)
cols_to_exclude = {'user_id', 'target', 'last_click_time', 'first_click_time', 'last_favorite_time', 
                   'first_favorite_time', 'last_order_time', 'first_order_time', 'last_to_cart_time', 
                   'first_to_cart_time', 'last_search_time', 'first_search_time', 'top3_search_clusters', 
                   'top3_search_counts', 'search_cluster_entropy', 'top3_product_counts', 
                   'product_cluster_entropy', 'top3_product_clusters'}

cols_2 = [col for col in df_pd.columns if col not in cols_to_exclude]

# Подготовка данных
X_train_2 = df_processed[cols_2].values
y_train_2 = df_processed['target'].values
X_test_2 = test_processed[cols_2].values

# Базовые модели с индивидуальными параметрами
base_models = {
    "lgbm_native": LGBMClassifier(
        n_estimators=150, learning_rate=0.05, max_depth=5, random_state=42,
        verbose=-1  # Используем -1 для silence вместо параметра в fit
    ),
    "xgb_sklearn": XGBClassifier(
        n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42,
        tree_method='hist', enable_categorical=True, missing=-999,
        verbosity=0  # Используем verbosity вместо verbose
    ),
    "xgb_native": XGBClassifier(
        n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42,
        tree_method='hist', enable_categorical=True, missing=-999,
        verbosity=0
    ),
}

# Подготовка OOF-прогнозов
n_models_2 = len(base_models)
n_train_2 = len(X_train_2)
n_test_2 = len(X_test_2)

meta_train_2 = np.zeros((n_train_2, n_models_2))
meta_test_2 = np.zeros((n_test_2, n_models_2))

kf_2 = KFold(n_splits=5, shuffle=True, random_state=42)

for i, (name, model) in enumerate(base_models.items()):
    print(f"\nTraining {name}...")
    for fold_idx, (train_idx, val_idx) in enumerate(kf_2.split(X_train_2, y_train_2)):
        X_tr, X_val = X_train_2[train_idx], X_train_2[val_idx]
        y_tr, y_val = y_train_2[train_idx], y_train_2[val_idx]
        
        if name == 'lgbm_native':
            model.fit(X_tr, y_tr, 
                     eval_set=[(X_val, y_val)], 
                     eval_metric='auc',
                     callbacks=[lgb.log_evaluation(period=0)])  # Отключаем вывод
        elif name in ['xgb_native', 'xgb_sklearn']:
            model.fit(X_tr, y_tr, 
                     eval_set=[(X_val, y_val)],
                     verbose=False)  # Используем verbose=False для XGBoost
        else:
            model.fit(X_tr, y_tr)
        
        meta_train_2[val_idx, i] = model.predict_proba(X_val)[:, 1]
    
    # Делаем предсказания для тестовых данных
    meta_test_2[:, i] = model.predict_proba(X_test_2)[:, 1]
    print(f"{name} trained successfully with {n_models_2} base models!")


Training lgbm_native...
lgbm_native trained successfully with 3 base models!

Training xgb_sklearn...


XGBoostError: [18:05:18] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\common\../data/gradient_index.h:94: Check failed: valid: Input data contains `inf` or a value too large, while `missing` is not set to `inf`

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# 1. Полностью очищаем данные от временных столбцов и нечисловых типов
cols = list(set(df_pd.columns) - {'user_id', 'target', 'last_click_time', 'first_click_time', 'last_favorite_time', 'first_favorite_time', 
                                  'last_order_time', 'first_order_time', 'last_to_cart_time', 'first_to_cart_time', 'last_search_time', 'first_search_time',
                                  'top3_search_clusters', 'top3_search_counts', 'search_cluster_entropy', 'top3_product_counts', 'product_cluster_entropy', 'top3_product_clusters'})

# 2. Явно преобразуем все данные в float и заменяем оставшиеся NaT/NaN
X_train = df_pd[cols].astype(float).fillna(-999).values
y_train = df_pd['target'].values

X_test = test_df_pd[cols].astype(float).fillna(-999).values

# Базовые модели с поддержкой NaN
base_models = {
    "lgbm_sklearn": LGBMClassifier(
        n_estimators=150, learning_rate=0.05, max_depth=5, random_state=42
    ),
    "catboost": CatBoostClassifier(
        iterations=150, learning_rate=0.05, depth=5, random_state=42,
        verbose=0, allow_writing_files=False
    )
}   

# Подготовка OOF-прогнозов
n_models = len(base_models)
n_train = len(X_train)
n_test = len(X_test)

meta_train = np.zeros((n_train, n_models))
meta_test = np.zeros((n_test, n_models))

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for i, (name, model) in enumerate(base_models.items()):
    print(f"Training {name}...")
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        try:
            if name == 'lgbm_native':
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=0)
            elif name == 'xgb_native':
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=0)
            elif name == 'catboost':
                model.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=0)
            else:
                model.fit(X_tr, y_tr)
            
            meta_train[val_idx, i] = model.predict_proba(X_val)[:, 1]
            
        except Exception as e:
            print(f"Error in {name}: {str(e)}")
            meta_train[val_idx, i] = 0.5
    
    try:
        meta_test[:, i] = model.predict_proba(X_test)[:, 1]
    except Exception as e:
        print(f"Error in {name} predict: {str(e)}")
        meta_test[:, i] = 0.5