In [4]:
import polars as pl
import pandas as pd
import numpy as np
import catboost
import os
from datetime import date, timedelta

In [5]:
# Reference date used later for the days_since_* features of the submission users.
# The following cell assigns the same value again.
test_start_date = date(2024, 8, 1)

In [6]:
# Period boundaries. The validation window ends the day before the test period
# starts, and the training window ends the day before validation starts.
one_day = timedelta(days=1)

test_start_date = date(2024, 8, 1)
val_start_date = date(2024, 7, 1)
val_end_date = test_start_date - one_day      # 2024-07-31
train_end_date = val_start_date - one_day     # 2024-06-30

# Directory that holds the input files.
data_path = './'

# Read data

In [7]:
# Load the two parquet event logs in one pass, then the product catalogue CSV.
actions_history, search_history = (
    pd.read_parquet(os.path.join(data_path, dataset_name))
    for dataset_name in ('actions_history', 'search_history')
)
product_information = pd.read_csv(os.path.join(data_path, 'product_information.csv'))

In [8]:
# Display the action-type lookup table so the numeric action_type_id values
# used in the filters below can be read off.
pd.read_csv(os.path.join(data_path, 'action_type_info.csv'))

Unnamed: 0,action_type,action_type_id
0,click,1
1,favorite,2
2,order,3
3,search,4
4,to_cart,5
5,view,6


In [9]:
# Per-user binary target: 1 if the user has at least one action with
# action_type_id == 3 inside [val_start_date, val_end_date], else 0.
# Only users with any action in that window appear in the result.
val_target = (
    actions_history
    .loc[lambda frame: frame['timestamp'].dt.date.between(val_start_date, val_end_date)]
    .assign(has_order=lambda frame: frame['action_type_id'].eq(3).astype(int))
    .groupby('user_id', as_index=False)
    .agg(target=('has_order', 'max'))
)

In [10]:
# Class balance of the validation target.
val_target['target'].value_counts()

target
0    1227381
1     647575
Name: count, dtype: int64

# Simple pipeline

## Features

In [11]:
# Per-user aggregates for each action type taken from actions_history.
# The feature window is the 120 days (30 * 4) up to and including train_end_date;
# recency features are measured relative to val_start_date.
# Action type 4 (search) is not handled here: it is built from search_history.
actions_aggs = {}
actions_id_to_suf = {
    1: "click",
    2: "favorite",
    3: "order",
    5: "to_cart",
}

# Everything below is independent of the action type, so it is computed once
# instead of once per loop iteration. `.dt.date` in particular materialises a
# Python date object for every row of actions_history and is expensive.
window_start = train_end_date - timedelta(days=30 * 4)
action_dates = actions_history['timestamp'].dt.date
actions_in_window = actions_history[
    (action_dates <= train_end_date) & (action_dates >= window_start)
]
del action_dates  # large object-dtype column; not needed once the window is cut

reference_ts = pd.to_datetime(val_start_date)
product_prices = product_information[['product_id', 'discount_price']]

for id_, suf in actions_id_to_suf.items():
    filtered_data = actions_in_window[actions_in_window['action_type_id'] == id_]

    # Attach the discount price of every touched product (NaN when the
    # product is missing from product_information).
    merged_data = filtered_data.merge(product_prices, on='product_id', how='left')

    # Named aggregation writes the final, suffixed column names directly.
    aggs = merged_data.groupby('user_id', as_index=False).agg(**{
        f'num_products_{suf}': ('product_id', 'count'),
        f'sum_discount_price_{suf}': ('discount_price', 'sum'),
        f'max_discount_price_{suf}': ('discount_price', 'max'),
        f'last_{suf}_time': ('timestamp', 'max'),
        f'first_{suf}_time': ('timestamp', 'min'),
    })

    # Whole days between the reference date and the user's last / first action.
    aggs[f'days_since_last_{suf}'] = (reference_ts - aggs[f'last_{suf}_time']).dt.days
    aggs[f'days_since_first_{suf}'] = (reference_ts - aggs[f'first_{suf}_time']).dt.days

    # Keep only the model features; the raw timestamps are dropped.
    aggs = aggs[[
        'user_id',
        f'num_products_{suf}',
        f'sum_discount_price_{suf}',
        f'max_discount_price_{suf}',
        f'days_since_last_{suf}',
        f'days_since_first_{suf}'
    ]]

    actions_aggs[id_] = aggs

In [12]:
# search_aggs
# Per-user search features over the 120 days up to and including train_end_date,
# with recency measured relative to val_start_date.
id_, suf = 4, 'search'

filtered_data = (
    search_history
    .loc[lambda frame: frame['action_type_id'].eq(id_)]
    .loc[lambda frame: frame['timestamp'].dt.date.between(
        train_end_date - timedelta(days=30 * 4), train_end_date
    )]
)

aggs = (
    filtered_data
    .groupby('user_id', as_index=False)
    .agg(**{
        f'num_{suf}': ('search_query', 'count'),
        f'last_{suf}_time': ('timestamp', 'max'),
        f'first_{suf}_time': ('timestamp', 'min'),
    })
    .assign(**{
        # Whole days between the reference date and the last / first search.
        f'days_since_last_{suf}': lambda frame: (
            pd.to_datetime(val_start_date) - frame[f'last_{suf}_time']
        ).dt.days,
        f'days_since_first_{suf}': lambda frame: (
            pd.to_datetime(val_start_date) - frame[f'first_{suf}_time']
        ).dt.days,
    })
    # Keep only the model features; the raw timestamps are dropped.
    .loc[:, [
        'user_id',
        f'num_{suf}',
        f'days_since_last_{suf}',
        f'days_since_first_{suf}',
    ]]
)

actions_aggs[id_] = aggs

In [13]:
# Sanity check: list the action_type_id keys for which a feature frame was built.
actions_aggs.keys()

dict_keys([1, 2, 3, 5, 4])

In [14]:
# Start from the per-user target table and attach every feature frame.
# Left joins keep all target users; a user with no activity of a given type
# in the feature window gets NaN in that type's columns.
df = val_target

# Iterate over the values directly: binding the unused key to `_` would
# overwrite the `_` name that IPython uses for the last cell output.
for actions_aggs_df in actions_aggs.values():
    # validate='many_to_one' makes the merge raise if a feature frame ever holds
    # several rows for one user_id, instead of silently duplicating target rows.
    df = df.merge(actions_aggs_df, on='user_id', how='left', validate='many_to_one')

In [15]:
# Display the merged frame: user_id, target and the per-action feature columns.
df

Unnamed: 0,user_id,target,num_products_click,sum_discount_price_click,max_discount_price_click,days_since_last_click,days_since_first_click,num_products_favorite,sum_discount_price_favorite,max_discount_price_favorite,...,days_since_last_order,days_since_first_order,num_products_to_cart,sum_discount_price_to_cart,max_discount_price_to_cart,days_since_last_to_cart,days_since_first_to_cart,num_search,days_since_last_search,days_since_first_search
0,12,0,,,,,,,,,...,,,,,,,,,,
1,16,0,1.0,335.0,335.0,118.0,118.0,,,,...,,,1.0,335.0,335.0,118.0,118.0,,,
2,34,0,,,,,,,,,...,,,,,,,,,,
3,36,1,9.0,20407.0,17257.0,49.0,73.0,,,,...,,,,,,,,1.0,28.0,28.0
4,53,0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874951,11184150,0,4.0,2181.0,780.0,2.0,112.0,,,,...,,,11.0,3113.0,420.0,2.0,120.0,22.0,2.0,120.0
1874952,11184151,1,66.0,33559.0,5476.0,3.0,120.0,,,,...,9.0,27.0,137.0,44696.0,3218.0,3.0,120.0,133.0,3.0,120.0
1874953,11184159,0,8.0,10462.0,7199.0,25.0,76.0,,,,...,,,,,,,,3.0,25.0,25.0
1874954,11184164,1,4.0,502.0,188.0,2.0,69.0,,,,...,4.0,4.0,2.0,174.0,87.0,2.0,69.0,4.0,2.0,69.0


In [16]:
# Plain alias, not a copy: df_pd and df refer to the same DataFrame object.
df_pd = df

In [17]:
# Deterministic train/eval split keyed on user_id: rows whose user_id modulo 10
# is at most 6 are flagged True (train); the remaining rows are used for eval.
mask = df_pd['user_id'].mod(10).le(6)

In [18]:
# List every column of the merged frame (used to pick the feature list below).
df_pd.columns

Index(['user_id', 'target', 'num_products_click', 'sum_discount_price_click',
       'max_discount_price_click', 'days_since_last_click',
       'days_since_first_click', 'num_products_favorite',
       'sum_discount_price_favorite', 'max_discount_price_favorite',
       'days_since_last_favorite', 'days_since_first_favorite',
       'num_products_order', 'sum_discount_price_order',
       'max_discount_price_order', 'days_since_last_order',
       'days_since_first_order', 'num_products_to_cart',
       'sum_discount_price_to_cart', 'max_discount_price_to_cart',
       'days_since_last_to_cart', 'days_since_first_to_cart', 'num_search',
       'days_since_last_search', 'days_since_first_search'],
      dtype='object')

In [19]:
# Model feature columns, in the order they are fed to CatBoost:
# five aggregates for each actions_history action type, then three search features.
cols = [
    f'{stat}_{suf}'
    for suf in ('click', 'favorite', 'order', 'to_cart')
    for stat in (
        'num_products',
        'sum_discount_price',
        'max_discount_price',
        'days_since_last',
        'days_since_first',
    )
] + ['num_search', 'days_since_last_search', 'days_since_first_search']

In [20]:
# Build the CatBoost pools: rows selected by `mask` form the training pool,
# the complementary rows form the evaluation pool.
train_pool, eval_pool = (
    catboost.Pool(
        df_pd.loc[row_mask, cols],
        label=df_pd.loc[row_mask, 'target'],
    )
    for row_mask in (mask, ~mask)
)

In [21]:
# Show the (rows, features) shape of the train and eval pools.
train_pool.shape, eval_pool.shape

((1311636, 23), (563320, 23))

In [22]:
# CatBoost training configuration: Logloss objective, AUC as the eval metric,
# fixed random_state for reproducibility.
params = dict(
    iterations=200,
    depth=7,
    learning_rate=0.1,
    random_state=1,
    eval_metric='AUC',
    loss_function='Logloss',
    task_type='CPU',
)

In [23]:
# Train on train_pool and monitor the eval metric on eval_pool.
# use_best_model keeps the iteration with the best eval score; training stops
# after 50 rounds without improvement. Progress is logged every 10 iterations.
# fit() returns the fitted model, so construction and training are chained.
model = catboost.CatBoost(params).fit(
    train_pool,
    eval_set=eval_pool,
    use_best_model=True,
    verbose=10,
    early_stopping_rounds=50,
)
model

0:	test: 0.7434883	best: 0.7434883 (0)	total: 198ms	remaining: 39.3s
10:	test: 0.7531217	best: 0.7531217 (10)	total: 902ms	remaining: 15.5s
20:	test: 0.7546796	best: 0.7546796 (20)	total: 1.57s	remaining: 13.4s
30:	test: 0.7556549	best: 0.7556549 (30)	total: 2.25s	remaining: 12.3s
40:	test: 0.7562939	best: 0.7562939 (40)	total: 2.92s	remaining: 11.3s
50:	test: 0.7566664	best: 0.7566664 (50)	total: 3.56s	remaining: 10.4s
60:	test: 0.7570016	best: 0.7570016 (60)	total: 4.22s	remaining: 9.62s
70:	test: 0.7572529	best: 0.7572529 (70)	total: 4.89s	remaining: 8.89s
80:	test: 0.7574683	best: 0.7574683 (80)	total: 5.56s	remaining: 8.17s
90:	test: 0.7575881	best: 0.7575881 (90)	total: 6.23s	remaining: 7.46s
100:	test: 0.7577279	best: 0.7577279 (100)	total: 6.89s	remaining: 6.75s
110:	test: 0.7579467	best: 0.7579539 (109)	total: 7.57s	remaining: 6.07s
120:	test: 0.7580197	best: 0.7580197 (120)	total: 8.23s	remaining: 5.37s
130:	test: 0.7581182	best: 0.7581182 (130)	total: 8.87s	remaining: 4.67s


<catboost.core.CatBoost at 0x1e83a297350>

In [24]:
# Persist the trained model next to the notebook as "<name>.bin".
name = 'baseline_1'
model.save_model(name + ".bin")

In [25]:
# Feature importances computed on the evaluation pool; prettified=True yields
# a table with one row per feature (Feature Id, Importances).
fi = model.get_feature_importance(data=eval_pool, prettified=True)
# Show up to the first 50 rows.
fi.head(50)

Unnamed: 0,Feature Id,Importances
0,sum_discount_price_order,24.107865
1,num_products_order,11.200589
2,days_since_last_order,10.969459
3,max_discount_price_order,9.063226
4,days_since_first_order,8.498767
5,sum_discount_price_to_cart,5.573428
6,num_products_to_cart,5.167236
7,num_search,3.930325
8,num_products_click,3.909462
9,days_since_last_to_cart,3.014358


In [26]:
# Users for whom a prediction has to be submitted.
test_users_submission = pd.read_csv(os.path.join(data_path, 'test_users.csv'))

In [27]:
# Per-user aggregates for each action type taken from actions_history, this time
# for scoring: the feature window is the 120 days (30 * 4) up to and including
# val_end_date, and recency features are measured relative to test_start_date.
# NOTE: this rebinds `actions_aggs`, replacing the training-window frames built
# earlier in the notebook.
actions_aggs = {}
actions_id_to_suf = {
    1: "click",
    2: "favorite",
    3: "order",
    5: "to_cart",
}

# Everything below is independent of the action type, so it is computed once
# instead of once per loop iteration. `.dt.date` in particular materialises a
# Python date object for every row of actions_history and is expensive.
window_start = val_end_date - timedelta(days=30 * 4)
action_dates = actions_history['timestamp'].dt.date
actions_in_window = actions_history[
    (action_dates <= val_end_date) & (action_dates >= window_start)
]
del action_dates  # large object-dtype column; not needed once the window is cut

reference_ts = pd.to_datetime(test_start_date)
product_prices = product_information[['product_id', 'discount_price']]

for id_, suf in actions_id_to_suf.items():
    filtered_data = actions_in_window[actions_in_window['action_type_id'] == id_]

    # Attach the discount price of every touched product (NaN when the
    # product is missing from product_information).
    merged_data = filtered_data.merge(product_prices, on='product_id', how='left')

    # Named aggregation writes the final, suffixed column names directly.
    aggs = merged_data.groupby('user_id', as_index=False).agg(**{
        f'num_products_{suf}': ('product_id', 'count'),
        f'sum_discount_price_{suf}': ('discount_price', 'sum'),
        f'max_discount_price_{suf}': ('discount_price', 'max'),
        f'last_{suf}_time': ('timestamp', 'max'),
        f'first_{suf}_time': ('timestamp', 'min'),
    })

    # Whole days between the reference date and the user's last / first action.
    aggs[f'days_since_last_{suf}'] = (reference_ts - aggs[f'last_{suf}_time']).dt.days
    aggs[f'days_since_first_{suf}'] = (reference_ts - aggs[f'first_{suf}_time']).dt.days

    # Keep only the model features; the raw timestamps are dropped.
    aggs = aggs[[
        'user_id',
        f'num_products_{suf}',
        f'sum_discount_price_{suf}',
        f'max_discount_price_{suf}',
        f'days_since_last_{suf}',
        f'days_since_first_{suf}'
    ]]

    actions_aggs[id_] = aggs

In [28]:
# search_aggs
# Per-user search features over the 120 days up to and including val_end_date,
# with recency measured relative to test_start_date.
id_, suf = 4, 'search'

filtered_data = (
    search_history
    .loc[lambda frame: frame['action_type_id'].eq(id_)]
    .loc[lambda frame: frame['timestamp'].dt.date.between(
        val_end_date - timedelta(days=30 * 4), val_end_date
    )]
)

aggs = (
    filtered_data
    .groupby('user_id', as_index=False)
    .agg(**{
        f'num_{suf}': ('search_query', 'count'),
        f'last_{suf}_time': ('timestamp', 'max'),
        f'first_{suf}_time': ('timestamp', 'min'),
    })
    .assign(**{
        # Whole days between the reference date and the last / first search.
        f'days_since_last_{suf}': lambda frame: (
            pd.to_datetime(test_start_date) - frame[f'last_{suf}_time']
        ).dt.days,
        f'days_since_first_{suf}': lambda frame: (
            pd.to_datetime(test_start_date) - frame[f'first_{suf}_time']
        ).dt.days,
    })
    # Keep only the model features; the raw timestamps are dropped.
    .loc[:, [
        'user_id',
        f'num_{suf}',
        f'days_since_last_{suf}',
        f'days_since_first_{suf}',
    ]]
)

actions_aggs[id_] = aggs

In [29]:
# Start from the submission user list and attach every feature frame.
# Left joins keep all submission users; a user with no activity of a given
# type in the feature window gets NaN in that type's columns.
df = test_users_submission

# Iterate over the values directly: binding the unused key to `_` would
# overwrite the `_` name that IPython uses for the last cell output.
for actions_aggs_df in actions_aggs.values():
    # validate='many_to_one' makes the merge raise if a feature frame ever holds
    # several rows for one user_id, instead of silently duplicating user rows.
    df = df.merge(actions_aggs_df, on='user_id', how='left', validate='many_to_one')

In [30]:
# Plain alias, not a copy: df_pd now refers to the merged submission frame.
df_pd = df

In [31]:
# Show the (rows, columns) shape of the submission frame.
df_pd.shape

(2068424, 24)

In [32]:
# Score the submission users. With prediction_type="Probability" the model
# returns one column per class; column 1 is kept as the prediction.
class_probabilities = model.predict(df_pd[cols], prediction_type="Probability")
df_pd['predict'] = class_probabilities[:, 1]

In [33]:
# Display the user_id / predict pairs that will be written to the submission file.
df_pd[['user_id', 'predict']]

Unnamed: 0,user_id,predict
0,1342,0.176841
1,9852,0.767904
2,10206,0.226078
3,11317,0.222756
4,13289,0.573964
...,...,...
2068419,11157283,0.221028
2068420,11160395,0.151824
2068421,11165052,0.583338
2068422,11168218,0.525602


In [35]:
# Write the submission file: only user_id and predict, without the index column.
df_pd.to_csv('prep_res.csv', columns=['user_id', 'predict'], index=False)