In [2]:
import polars as pl
import pandas as pd
import numpy as np
import catboost
import os
from datetime import date, timedelta

In [3]:
# First day of the test period. The next cell assigns the same value again.
test_start_date = date(year=2024, month=8, day=1)

In [4]:
# Time split (all bounds inclusive). Each boundary is derived from its
# neighbour so the periods stay contiguous:
#   features up to train_end_date -> target in [val_start_date, val_end_date]
#   features up to val_end_date   -> predictions from test_start_date onwards
test_start_date = date(2024, 8, 1)
val_end_date = test_start_date - timedelta(days=1)
val_start_date = val_end_date.replace(day=1)
train_end_date = val_start_date - timedelta(days=1)

# Directory holding the contest tables read below.
data_path = './contest_1_data'

# Read data

In [5]:
# Load the raw tables from data_path: two parquet datasets and one CSV.
actions_history, search_history = (
    pd.read_parquet(os.path.join(data_path, table_name))
    for table_name in ('actions_history', 'search_history')
)
product_information = pd.read_csv(
    os.path.join(data_path, 'product_information.csv')
)

In [6]:
# Display the action_type_id -> action_type lookup for reference.
action_type_info_path = os.path.join(data_path, 'action_type_info.csv')
pd.read_csv(action_type_info_path)

Unnamed: 0,action_type,action_type_id
0,click,1
1,favorite,2
2,order,3
3,search,4
4,to_cart,5
5,view,6


In [7]:
# Target for the validation period: one row per user that has at least one
# action dated within [val_start_date, val_end_date]; `target` is 1 when any
# of those actions is an order (action_type_id == 3), otherwise 0.
action_dates = actions_history['timestamp'].dt.date
in_val_period = (action_dates >= val_start_date) & (action_dates <= val_end_date)
val_actions = actions_history[in_val_period]

val_target = (
    val_actions
    .assign(has_order=val_actions['action_type_id'].eq(3).astype(int))
    .groupby('user_id', as_index=False)
    .agg(target=('has_order', 'max'))
)

In [8]:
# Class balance of the validation target.
val_target['target'].value_counts()

0    1227381
1     647575
Name: target, dtype: int64

# Simple pipeline

## Features

In [9]:
# Per-user features for each action type, built from the 120 days of history
# ending on train_end_date (inclusive). Result: actions_aggs[action_type_id]
# is a frame with one row per user and the columns
#   num_products_<suf>, sum_discount_price_<suf>, max_discount_price_<suf>,
#   days_since_last_<suf>, days_since_first_<suf>.
actions_aggs = {}
actions_id_to_suf = {
    1: "click",
    2: "favorite",
    3: "order",
    5: "to_cart",
}

# Loop-invariant work is hoisted: the date window and the price lookup are
# the same for every action type. Timestamps are compared directly instead of
# through `.dt.date`, which builds an object array of Python dates on every
# use. `timestamp` is tz-naive (it is subtracted from a naive Timestamp
# below), so "date <= train_end_date" equals "timestamp < next midnight".
window_start = pd.Timestamp(train_end_date - timedelta(days=30 * 4))
window_stop = pd.Timestamp(train_end_date + timedelta(days=1))  # exclusive
reference_time = pd.to_datetime(val_start_date)
in_window = (
    (actions_history['timestamp'] >= window_start) &
    (actions_history['timestamp'] < window_stop)
)
product_prices = product_information[['product_id', 'discount_price']]

for id_, suf in actions_id_to_suf.items():
    filtered_data = actions_history[
        in_window & (actions_history['action_type_id'] == id_)
    ]

    # NOTE(review): the counts below assume product_id is unique in
    # product_information; duplicates would multiply action rows — confirm.
    merged_data = filtered_data.merge(product_prices, on='product_id', how='left')

    # Named aggregation writes the final column names directly.
    aggs = merged_data.groupby('user_id', as_index=False).agg(**{
        f'num_products_{suf}': ('product_id', 'count'),
        f'sum_discount_price_{suf}': ('discount_price', 'sum'),
        f'max_discount_price_{suf}': ('discount_price', 'max'),
        f'last_{suf}_time': ('timestamp', 'max'),
        f'first_{suf}_time': ('timestamp', 'min'),
    })

    # Recency features are measured in whole days before val_start_date.
    aggs[f'days_since_last_{suf}'] = (reference_time - aggs[f'last_{suf}_time']).dt.days
    aggs[f'days_since_first_{suf}'] = (reference_time - aggs[f'first_{suf}_time']).dt.days

    # The raw timestamps only serve to derive the day counts above.
    actions_aggs[id_] = aggs.drop(columns=[f'last_{suf}_time', f'first_{suf}_time'])

In [10]:
# search_aggs
# Per-user search features from the 120 days of search history ending on
# train_end_date (inclusive), stored as actions_aggs[4] with the columns
# num_search, days_since_last_search, days_since_first_search.
id_ = 4
suf = 'search'

# Timestamps are compared directly instead of through `.dt.date`, which builds
# an object array of Python dates. `timestamp` is tz-naive (it is subtracted
# from a naive Timestamp below), so "date <= train_end_date" equals
# "timestamp < next midnight".
window_start = pd.Timestamp(train_end_date - timedelta(days=30 * 4))
window_stop = pd.Timestamp(train_end_date + timedelta(days=1))  # exclusive
reference_time = pd.to_datetime(val_start_date)

filtered_data = search_history[
    (search_history['action_type_id'] == id_) &
    (search_history['timestamp'] >= window_start) &
    (search_history['timestamp'] < window_stop)
]

# Named aggregation writes the final column names directly.
aggs = filtered_data.groupby('user_id', as_index=False).agg(**{
    f'num_{suf}': ('search_query', 'count'),
    f'last_{suf}_time': ('timestamp', 'max'),
    f'first_{suf}_time': ('timestamp', 'min'),
})

# Recency features are measured in whole days before val_start_date.
aggs[f'days_since_last_{suf}'] = (reference_time - aggs[f'last_{suf}_time']).dt.days
aggs[f'days_since_first_{suf}'] = (reference_time - aggs[f'first_{suf}_time']).dt.days

# The raw timestamps only serve to derive the day counts above.
actions_aggs[id_] = aggs.drop(columns=[f'last_{suf}_time', f'first_{suf}_time'])

In [11]:
# Sanity check: action type ids that have a feature table (insertion order).
actions_aggs.keys()

dict_keys([1, 2, 3, 5, 4])

In [12]:
# Left-join every feature table onto the target frame by user_id; users
# missing from a table get NaN in that table's columns.
df = val_target
for feature_table in actions_aggs.values():
    df = df.merge(feature_table, how='left', on='user_id')

In [13]:
# Preview the merged frame: target plus all feature columns.
df

Unnamed: 0,user_id,target,num_products_click,sum_discount_price_click,max_discount_price_click,days_since_last_click,days_since_first_click,num_products_favorite,sum_discount_price_favorite,max_discount_price_favorite,...,days_since_last_order,days_since_first_order,num_products_to_cart,sum_discount_price_to_cart,max_discount_price_to_cart,days_since_last_to_cart,days_since_first_to_cart,num_search,days_since_last_search,days_since_first_search
0,12,0,,,,,,,,,...,,,,,,,,,,
1,16,0,1.0,335.0,335.0,118.0,118.0,,,,...,,,1.0,335.0,335.0,118.0,118.0,,,
2,34,0,,,,,,,,,...,,,,,,,,,,
3,36,1,9.0,20407.0,17257.0,49.0,73.0,,,,...,,,,,,,,1.0,28.0,28.0
4,53,0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874951,11184150,0,4.0,2181.0,780.0,2.0,112.0,,,,...,,,11.0,3113.0,420.0,2.0,120.0,22.0,2.0,120.0
1874952,11184151,1,66.0,33559.0,5476.0,3.0,120.0,,,,...,9.0,27.0,137.0,44696.0,3218.0,3.0,120.0,133.0,3.0,120.0
1874953,11184159,0,8.0,10462.0,7199.0,25.0,76.0,,,,...,,,,,,,,3.0,25.0,25.0
1874954,11184164,1,4.0,502.0,188.0,2.0,69.0,,,,...,4.0,4.0,2.0,174.0,87.0,2.0,69.0,4.0,2.0,69.0


In [14]:
# Alias only: df_pd refers to the same object as df (no copy is made).
df_pd = df

In [15]:
# Deterministic split by user_id: rows with user_id % 10 <= 6 form the
# training pool, the remaining rows form the evaluation pool.
mask = (df_pd['user_id'] % 10).le(6)

In [16]:
# List the available columns; the feature list in the next cell is taken from here.
df_pd.columns

Index(['user_id', 'target', 'num_products_click', 'sum_discount_price_click',
       'max_discount_price_click', 'days_since_last_click',
       'days_since_first_click', 'num_products_favorite',
       'sum_discount_price_favorite', 'max_discount_price_favorite',
       'days_since_last_favorite', 'days_since_first_favorite',
       'num_products_order', 'sum_discount_price_order',
       'max_discount_price_order', 'days_since_last_order',
       'days_since_first_order', 'num_products_to_cart',
       'sum_discount_price_to_cart', 'max_discount_price_to_cart',
       'days_since_last_to_cart', 'days_since_first_to_cart', 'num_search',
       'days_since_last_search', 'days_since_first_search'],
      dtype='object')

In [17]:
# Model feature columns, in the order they appear in the merged frame:
# five statistics for each product action type, then the search features.
cols = [
    f'{stat}_{action}'
    for action in ('click', 'favorite', 'order', 'to_cart')
    for stat in (
        'num_products',
        'sum_discount_price',
        'max_discount_price',
        'days_since_last',
        'days_since_first',
    )
] + [
    'num_search',
    'days_since_last_search',
    'days_since_first_search',
]

In [18]:
# Build the CatBoost pools: rows where mask is True train the model,
# rows where it is False are held out for evaluation.
train_rows = df_pd.loc[mask]
eval_rows = df_pd.loc[~mask]

train_pool = catboost.Pool(train_rows[cols], label=train_rows['target'])
eval_pool = catboost.Pool(eval_rows[cols], label=eval_rows['target'])

In [19]:
# (rows, features) of the training and evaluation pools.
train_pool.shape, eval_pool.shape

((1311636, 23), (563320, 23))

In [20]:
# CatBoost training parameters: Logloss objective, AUC reported on the
# eval set, fixed seed, CPU training.
params = dict(
    iterations=200,
    depth=7,
    learning_rate=0.1,
    random_state=1,
    eval_metric='AUC',
    loss_function='Logloss',
    task_type='CPU',
)

In [21]:
# Train on train_pool while monitoring eval_pool. The fit logs every 10
# iterations, uses early_stopping_rounds=50 and use_best_model=True.
fit_options = dict(
    eval_set=eval_pool,
    use_best_model=True,
    verbose=10,
    early_stopping_rounds=50,
)
model = catboost.CatBoost(params)
model.fit(train_pool, **fit_options)

0:	test: 0.7434893	best: 0.7434893 (0)	total: 153ms	remaining: 30.5s
10:	test: 0.7525779	best: 0.7525779 (10)	total: 996ms	remaining: 17.1s
20:	test: 0.7548277	best: 0.7548277 (20)	total: 1.84s	remaining: 15.7s
30:	test: 0.7557195	best: 0.7557195 (30)	total: 2.73s	remaining: 14.9s
40:	test: 0.7563668	best: 0.7563668 (40)	total: 3.64s	remaining: 14.1s
50:	test: 0.7568255	best: 0.7568255 (50)	total: 4.58s	remaining: 13.4s
60:	test: 0.7571397	best: 0.7571397 (60)	total: 5.46s	remaining: 12.4s
70:	test: 0.7574463	best: 0.7574463 (70)	total: 6.33s	remaining: 11.5s
80:	test: 0.7575498	best: 0.7575615 (77)	total: 7.2s	remaining: 10.6s
90:	test: 0.7577310	best: 0.7577310 (90)	total: 8.1s	remaining: 9.7s
100:	test: 0.7578848	best: 0.7578848 (100)	total: 8.99s	remaining: 8.82s
110:	test: 0.7579867	best: 0.7579867 (110)	total: 9.9s	remaining: 7.94s
120:	test: 0.7581014	best: 0.7581014 (120)	total: 10.8s	remaining: 7.05s
130:	test: 0.7582063	best: 0.7582063 (130)	total: 11.7s	remaining: 6.17s
140:

<catboost.core.CatBoost at 0x60d4be8b0>

In [22]:
# Persist the fitted model to "<name>.bin" in the working directory.
name = 'baseline_1'
model.save_model(f"{name}.bin")

In [23]:
# Feature importances computed with eval_pool as the data argument, returned
# as a table (prettified=True); show up to the first 50 rows.
fi = model.get_feature_importance(eval_pool, prettified=True)
fi.head(50)

Unnamed: 0,Feature Id,Importances
0,days_since_last_order,19.832468
1,sum_discount_price_order,19.545872
2,num_products_order,10.261542
3,days_since_first_order,7.882187
4,sum_discount_price_to_cart,6.784568
5,num_products_to_cart,5.113728
6,num_search,4.913192
7,num_products_click,4.568658
8,days_since_last_to_cart,4.333416
9,days_since_first_search,2.859644


In [24]:
# Users for whom the submission needs a prediction.
test_users_path = os.path.join(data_path, 'test_users.csv')
test_users_submission = pd.read_csv(test_users_path)

In [25]:
# Per-user features for each action type for the submission frame, built from
# the 120 days of history ending on val_end_date (inclusive). Result:
# actions_aggs[action_type_id] is a frame with one row per user and the columns
#   num_products_<suf>, sum_discount_price_<suf>, max_discount_price_<suf>,
#   days_since_last_<suf>, days_since_first_<suf>.
actions_aggs = {}
actions_id_to_suf = {
    1: "click",
    2: "favorite",
    3: "order",
    5: "to_cart",
}

# Loop-invariant work is hoisted: the date window and the price lookup are
# the same for every action type. Timestamps are compared directly instead of
# through `.dt.date`, which builds an object array of Python dates on every
# use. `timestamp` is tz-naive (it is subtracted from a naive Timestamp
# below), so "date <= val_end_date" equals "timestamp < next midnight".
window_start = pd.Timestamp(val_end_date - timedelta(days=30 * 4))
window_stop = pd.Timestamp(val_end_date + timedelta(days=1))  # exclusive
reference_time = pd.to_datetime(test_start_date)
in_window = (
    (actions_history['timestamp'] >= window_start) &
    (actions_history['timestamp'] < window_stop)
)
product_prices = product_information[['product_id', 'discount_price']]

for id_, suf in actions_id_to_suf.items():
    filtered_data = actions_history[
        in_window & (actions_history['action_type_id'] == id_)
    ]

    # NOTE(review): the counts below assume product_id is unique in
    # product_information; duplicates would multiply action rows — confirm.
    merged_data = filtered_data.merge(product_prices, on='product_id', how='left')

    # Named aggregation writes the final column names directly.
    aggs = merged_data.groupby('user_id', as_index=False).agg(**{
        f'num_products_{suf}': ('product_id', 'count'),
        f'sum_discount_price_{suf}': ('discount_price', 'sum'),
        f'max_discount_price_{suf}': ('discount_price', 'max'),
        f'last_{suf}_time': ('timestamp', 'max'),
        f'first_{suf}_time': ('timestamp', 'min'),
    })

    # Recency features are measured in whole days before test_start_date.
    aggs[f'days_since_last_{suf}'] = (reference_time - aggs[f'last_{suf}_time']).dt.days
    aggs[f'days_since_first_{suf}'] = (reference_time - aggs[f'first_{suf}_time']).dt.days

    # The raw timestamps only serve to derive the day counts above.
    actions_aggs[id_] = aggs.drop(columns=[f'last_{suf}_time', f'first_{suf}_time'])

In [26]:
# search_aggs
# Per-user search features for the submission frame, from the 120 days of
# search history ending on val_end_date (inclusive), stored as actions_aggs[4]
# with the columns num_search, days_since_last_search, days_since_first_search.
id_ = 4
suf = 'search'

# Timestamps are compared directly instead of through `.dt.date`, which builds
# an object array of Python dates. `timestamp` is tz-naive (it is subtracted
# from a naive Timestamp below), so "date <= val_end_date" equals
# "timestamp < next midnight".
window_start = pd.Timestamp(val_end_date - timedelta(days=30 * 4))
window_stop = pd.Timestamp(val_end_date + timedelta(days=1))  # exclusive
reference_time = pd.to_datetime(test_start_date)

filtered_data = search_history[
    (search_history['action_type_id'] == id_) &
    (search_history['timestamp'] >= window_start) &
    (search_history['timestamp'] < window_stop)
]

# Named aggregation writes the final column names directly.
aggs = filtered_data.groupby('user_id', as_index=False).agg(**{
    f'num_{suf}': ('search_query', 'count'),
    f'last_{suf}_time': ('timestamp', 'max'),
    f'first_{suf}_time': ('timestamp', 'min'),
})

# Recency features are measured in whole days before test_start_date.
aggs[f'days_since_last_{suf}'] = (reference_time - aggs[f'last_{suf}_time']).dt.days
aggs[f'days_since_first_{suf}'] = (reference_time - aggs[f'first_{suf}_time']).dt.days

# The raw timestamps only serve to derive the day counts above.
actions_aggs[id_] = aggs.drop(columns=[f'last_{suf}_time', f'first_{suf}_time'])

In [27]:
# Left-join every feature table onto the submission users by user_id; users
# missing from a table get NaN in that table's columns.
df = test_users_submission
for feature_table in actions_aggs.values():
    df = df.merge(feature_table, how='left', on='user_id')

In [28]:
# Alias only: df_pd now refers to the submission frame (no copy is made).
df_pd = df

In [29]:
# (rows, columns) of the submission frame: user_id plus the 23 features.
df_pd.shape

(2068424, 24)

In [30]:
# Same shape check as the previous cell (duplicate).
df_pd.shape

(2068424, 24)

In [31]:
# Score the submission users and keep column 1 of the per-class probability
# matrix (the positive class, per CatBoost's "Probability" output).
class_probabilities = model.predict(df_pd[cols], prediction_type="Probability")
df_pd['predict'] = class_probabilities[:, 1]

In [32]:
# Preview the two columns that go into the submission file.
df_pd[['user_id', 'predict']]

Unnamed: 0,user_id,predict
0,1342,0.174245
1,9852,0.774801
2,10206,0.222960
3,11317,0.220542
4,13289,0.606761
...,...,...
2068419,11157283,0.222019
2068420,11160395,0.147336
2068421,11165052,0.599892
2068422,11168218,0.527906


In [33]:
# Write the submission: user_id and predicted probability, without the index.
submission = df_pd[['user_id', 'predict']]
submission.to_csv('baseline_1_submission.csv', index=False)