In [3]:
import polars as pl

import lightgbm

from recommenders.utils import fix_dtypes

In [4]:
val_path = './data/features/merged_features_val.parquet.gzip'
merged_val = fix_dtypes(pl.read_parquet(val_path))

print(f'Val shape: {merged_val.shape}')

Val shape: (149810474, 36)


In [6]:
unique_users = merged_val['user_id'].unique().to_list()

user_id2idx = dict(zip(unique_users, range(len(unique_users))))
merged_val = merged_val.with_columns([pl.col('user_id').apply(lambda x: user_id2idx[x]).alias('user_id')])

In [7]:
train_pt = merged_val.filter(pl.col('user_id') <= 0.85 * len(unique_users))
val_pt = merged_val.filter(pl.col('user_id') > 0.85 * len(unique_users))
print(train_pt.shape, val_pt.shape)

(127341341, 36) (22469133, 36)


In [15]:
train_pt_grouped = train_pt.groupby('user_id').agg(pl.col('timespent').sum())
user_idx = train_pt_grouped['user_id'].to_list()
timespent = train_pt_grouped['timespent'].to_list()
timespent_dict = dict(zip(user_idx, timespent))

In [17]:
def filter_zero_groups(df: pl.DataFrame) -> pl.DataFrame:
    train_pt_grouped = train_pt.groupby('user_id').agg(pl.col('timespent').sum())
    user_idx = train_pt_grouped['user_id'].to_list()
    timespent = train_pt_grouped['timespent'].to_list()
    timespent_dict = dict(zip(user_idx, timespent))
    nz_users = [user_id for user_id in timespent_dict if timespent_dict[user_id] > 0]
    return df.filter(pl.col('user_id').is_in(nz_users))

def downsample_negatives(df: pl.DataFrame, keep: float = None, proportion: int = None) -> pl.DataFrame:
    positives_mask = (pl.col('timespent') != 0)
    positives = df.filter(positives_mask)
    negatives = df.filter(~positives_mask)
    print(f'Negatives shape: {negatives.shape[0]}, Positives shape: {positives.shape[0]}')
    if keep is not None:
        keep_num_negatives = int(keep * negatives.shape[0])
    elif proportion is not None:
        keep_num_negatives = proportion * positives.shape[0]
    print(f'Negatives to keep: {keep_num_negatives}')
    negatives = negatives.sample(n=keep_num_negatives, shuffle=True)
    return pl.concat([positives, negatives]).sample(frac=1, shuffle=True)

print(f'Train shape: {train_pt.shape}')
train_pt_downsampled = downsample_negatives(filter_zero_groups(train_pt), keep=0.3)
print(f'Train shape: {train_pt_downsampled.shape}')
print(f'Train positives: {train_pt_downsampled.filter(pl.col("timespent") != 0).shape}')

Train shape: (127341341, 36)
Negatives shape: 102677274, Positives shape: 521012
Negatives to keep: 30803182
Train shape: (31324194, 36)
Train positives: (521012, 36)


In [18]:
features_columns = [col for col in merged_val.columns if col not in ['user_id', 'item_id', 'timespent', 'source_id']]
print(f'Feature columns number: {len(features_columns)}')

Feature columns number: 32


In [21]:
nz_timespent_mask = (train_pt_downsampled['timespent'] == 0)
print(f"Train WZ: {train_pt_downsampled.filter(nz_timespent_mask).shape[0]:_}")
print(f'Train NZ: {train_pt_downsampled.filter(~nz_timespent_mask).shape[0]:_}')

nz_timespent_mask = (train_pt_downsampled['timespent'] == 0)
print(f"Train downsampled WZ: {train_pt_downsampled.filter(nz_timespent_mask).shape[0]:_}")
print(f'Train downsampled NZ : {train_pt_downsampled.filter(~nz_timespent_mask).shape[0]:_}')

nz_timespent_mask = (val_pt['timespent'] == 0)
print(f"Val WZ: {val_pt.filter(nz_timespent_mask).shape[0]:_}")
print(f'Val NZ: {val_pt.filter(~nz_timespent_mask).shape[0]:_}')

Train WZ: 30_803_182
Train NZ: 521_012
Train downsampled WZ: 30_803_182
Train downsampled NZ : 521_012
Val WZ: 22_377_883
Val NZ: 91_250


In [22]:
train_pt_downsampled = train_pt_downsampled.sort(['user_id'])
val_pt = val_pt.sample(frac=1, shuffle=True)
val_pt = val_pt.sort(['user_id'])

SUPPP_COLS = ['user_id', 'item_id', 'timespent', 'source_id']

X_train = train_pt_downsampled.drop(SUPPP_COLS).to_numpy()
X_val = val_pt.drop(SUPPP_COLS).to_numpy()

y_train = train_pt_downsampled['timespent'].to_numpy()
y_val = val_pt['timespent'].to_numpy()

group_train = train_pt_downsampled.groupby('user_id').agg(pl.col('timespent').count())['timespent'].to_list()
group_val = val_pt.groupby('user_id').agg(pl.col('timespent').count())['timespent'].to_list()

In [23]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(len(group_train), len(group_val))
print(sum(group_train), sum(group_val))
print(max(group_train), max(group_val))

(31324194, 32) (31324194,)
(22469133, 32) (22469133,)
138050 29999
31324194 22469133
301 843


In [24]:
# clear memory
val_pt_cols = val_pt.drop(SUPPP_COLS).columns
del train_pt_downsampled
del train_pt
del val_pt

In [25]:
data_params = None

dataset_train = lightgbm.Dataset(
    data=X_train, 
    label=y_train, 
    group=group_train,
    params=data_params
    )

dataset_val = lightgbm.Dataset(
    data=X_val, 
    label=y_val, 
    group=group_val,
    reference=dataset_train, 
    params=data_params
    )

In [26]:
params = {
    "objective": "lambdarank",
    "metric": ["ndcg", "map"],
    "eval_at": 20,
    "boosting_type": "gbdt",
    "is_unbalance": True,
    "learning_rate": 0.05,
    'lambda_l1': 0.05,
    'lambda_l2': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    "label_gain": [i for i in range(60)],
    'force_col_wise': True,
    'bin_construct_sample_cnt': 1_500_000,
    # 'max_bin': 1023,
    }

ranker = lightgbm.train(
    params=params,
    train_set=dataset_train,
    num_boost_round=10_000,
    valid_sets=[dataset_val],
    # feval=custom_recall
    callbacks=[
        lightgbm.early_stopping(100),
        lightgbm.log_evaluation(50)
    ],
)

[LightGBM] [Info] Total groups: 138050, total data: 31324194
[LightGBM] [Info] Total Bins 8148
[LightGBM] [Info] Number of data points in the train set: 31324194, number of used features: 32
[LightGBM] [Info] Total groups: 29999, total data: 22469133
Training until validation scores don't improve for 100 rounds
[50]	valid_0's ndcg@20: 0.334795	valid_0's map@20: 0.262632
[100]	valid_0's ndcg@20: 0.344661	valid_0's map@20: 0.26893
[150]	valid_0's ndcg@20: 0.350643	valid_0's map@20: 0.272869
[200]	valid_0's ndcg@20: 0.35479	valid_0's map@20: 0.275617
[250]	valid_0's ndcg@20: 0.358015	valid_0's map@20: 0.277824
[300]	valid_0's ndcg@20: 0.360646	valid_0's map@20: 0.279695
[350]	valid_0's ndcg@20: 0.362662	valid_0's map@20: 0.281141
[400]	valid_0's ndcg@20: 0.364737	valid_0's map@20: 0.282634
[450]	valid_0's ndcg@20: 0.366377	valid_0's map@20: 0.283846
[500]	valid_0's ndcg@20: 0.367602	valid_0's map@20: 0.284714
[550]	valid_0's ndcg@20: 0.368606	valid_0's map@20: 0.285524
[600]	valid_0's ndc

In [27]:
ranker.save_model('./models/lgb/ranker_32feat_2902it.lgb')

<lightgbm.basic.Booster at 0x2866df05c30>

In [28]:
from operator import itemgetter

fim = dict(zip(val_pt_cols, ranker.feature_importance()))
sorted(fim.items(), key=itemgetter(1), reverse=True)

[('als_sim_max', 3850),
 ('als_sim_score', 3793),
 ('item_id_mean_timespent_shift_0', 3674),
 ('als_sim_rank', 3499),
 ('source_item_retention_perc', 3498),
 ('source_item_timespent_perc_nz', 3420),
 ('source_item_count_norm', 3256),
 ('source_item_timespent_norm', 3083),
 ('source_item_timespent_perc', 2991),
 ('content_sim_max', 2984),
 ('source_id_mean_timespent_shift_0', 2978),
 ('views_per_artist', 2885),
 ('timespent_per_view', 2881),
 ('als_sim_mean', 2851),
 ('timespent_per_artist', 2816),
 ('item_id_mean_timespent_shift_10', 2804),
 ('total_timespent', 2622),
 ('als_sim_std', 2607),
 ('content_sim_mean', 2407),
 ('item_id_mean_timespent_shift_50', 2395),
 ('source_id_mean_timespent_shift_100', 2378),
 ('bm25_sim_mean', 2377),
 ('source_id_mean_timespent_shift_10', 2360),
 ('content_sim_std', 2284),
 ('als_sim_min', 2244),
 ('source_item_rank', 2168),
 ('item_id_mean_timespent_shift_100', 2145),
 ('source_id_mean_timespent_shift_50', 2109),
 ('content_sim_min', 2092),
 ('bm25_s