In [1]:
import pickle
import numpy as np
import pandas as pd
import polars as pl

from recommenders.als_recommender import get_als_similarity_features
from recommenders.bm25_recommender import get_bm25_similarity_features
from recommenders.content_recommender import get_content_similarity_features

In [2]:
# Read the merged candidate tables for the val and test splits and report their sizes.
merged_candidates_val = pd.read_parquet('./data/candidates/merged_candidates_val.parquet.gzip')
# shape[0] is the row count; the `:_` format spec groups digits with underscores.
print(f'Val candidates shape: {merged_candidates_val.shape[0]:_}')

merged_candidates_test = pd.read_parquet('./data/candidates/merged_candidates_test.parquet.gzip')
print(f'Test candidates shape: {merged_candidates_test.shape[0]:_}')

Val candidates shape: 149_810_474
Test candidates shape: 146_573_625


In [3]:
# Interaction log from train.parquet.gzip; its index is named 'timestamp' here so that
# calculate_rank() finds a 'timestamp' column after reset_index().
train_df = pd.read_parquet('./data/train.parquet.gzip').rename_axis('timestamp')

# Pre-computed split files.
# NOTE(review): train_split is also passed to calculate_rank(), but its index is not renamed
# here — presumably the split file already stores an index named 'timestamp'. Confirm.
train_split = pd.read_parquet('./data/splits/train.parquet.gzip')
val_no_targets = pd.read_parquet('./data/splits/val_no_targets.parquet.gzip')
# NOTE(review): val_targets is not referenced again in the visible cells.
val_targets = pd.read_parquet('./data/splits/val_targets.parquet.gzip')

test_df = pd.read_parquet('./data/test.parquet.gzip')
items_meta_df = pd.read_parquet('./data/items_meta.parquet.gzip')
# NOTE(review): fresh_candidates_df is not referenced again in the visible cells.
fresh_candidates_df = pd.read_parquet('./data/fresh_candidates.parquet.gzip')

### ALS features

In [4]:
# ALS similarity features for the validation candidates, using the model stored in
# recommender_val.pkl and val_no_targets as the users' history.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only load
# model artifacts produced by this project.
with open('./models/als/recommender_val.pkl', 'rb') as f1:
    als_recommender_val = pickle.load(f1)

als_features_val = get_als_similarity_features(
    als_model = als_recommender_val,
    candidates_df = merged_candidates_val,
    history_df = val_no_targets
    )


# Same for the test candidates; the history is the train interactions of the users
# that appear in test_df.
with open('./models/als/recommender_test.pkl', 'rb') as f2:
    als_recommender_test = pickle.load(f2)

als_features_test = get_als_similarity_features(
    als_model = als_recommender_test,
    candidates_df = merged_candidates_test,
    history_df = train_df[train_df.user_id.isin(test_df.user_id.values)]
    )

Users candidates df w shape: 149_810_474
Users history df w shape: 20_379_394
Users candidates list len: 200_000
Users history list len: 200_000


100%|██████████| 200000/200000 [04:18<00:00, 772.91it/s]


Users candidates df w shape: 146_573_625
Users history df w shape: 32_219_777
Users candidates list len: 200_000
Users history list len: 200_000


100%|██████████| 200000/200000 [07:04<00:00, 470.69it/s]


In [5]:
# Row counts of the ALS feature frames, for comparison with the candidate row counts printed at load time.
print(f'Val features shape: {als_features_val.shape[0]:_}')
print(f'Test features shape: {als_features_test.shape[0]:_}')

Val features shape: 149_810_474
Test features shape: 146_573_625


In [6]:
# Preview the first three rows of the validation ALS features.
als_features_val.head(3)

Unnamed: 0,user_id,item_id,als_sim_mean,als_sim_min,als_sim_max,als_sim_std,als_sim_score,als_sim_rank
0,4,93615,0.294215,0.03284,0.73446,0.146107,0.514802,0
1,4,87797,0.37896,-0.143124,0.745198,0.158539,0.234164,1
2,4,159229,0.340893,0.048582,0.912253,0.20539,0.224879,2


In [7]:
# Persist the ALS feature frames as gzip-compressed parquet files.
als_features_val.to_parquet('./data/features/als_features_val.parquet.gzip', compression='gzip')
als_features_test.to_parquet('./data/features/als_features_test.parquet.gzip', compression='gzip')

### BM25 features

In [8]:
# --- validation split -------------------------------------------------------
# Load the pickled item-to-item model and score the validation candidates
# against each user's val_no_targets history.
with open('./models/bm25/recommender_val.pkl', 'rb') as f1:
    i2i_model_val = pickle.load(f1)

bm25_features_val = get_bm25_similarity_features(
    history_df=val_no_targets,
    candidates_df=merged_candidates_val,
    i2i_model=i2i_model_val,
)

# --- test split -------------------------------------------------------------
# Same procedure; the history is restricted to train rows of users found in test_df.
with open('./models/bm25/recommender_test.pkl', 'rb') as f2:
    i2i_model_test = pickle.load(f2)

bm25_features_test = get_bm25_similarity_features(
    history_df=train_df[train_df['user_id'].isin(test_df['user_id'].values)],
    candidates_df=merged_candidates_test,
    i2i_model=i2i_model_test,
)

Users candidates df w shape: 149_810_474
Users history df w shape: 20_379_394
Users candidates list len: 200_000
Users history list len: 200_000


100%|██████████| 200000/200000 [15:03<00:00, 221.33it/s]


Users candidates df w shape: 146_573_625
Users history df w shape: 32_219_777
Users candidates list len: 200_000
Users history list len: 200_000


100%|██████████| 200000/200000 [13:52<00:00, 240.17it/s]


In [9]:
# Row counts of the BM25 feature frames, for comparison with the candidate row counts printed at load time.
print(f'Val features shape: {bm25_features_val.shape[0]:_}')
print(f'Test features shape: {bm25_features_test.shape[0]:_}')

Val features shape: 149_810_474
Test features shape: 146_573_625


In [10]:
# Persist the BM25 feature frames as gzip-compressed parquet files.
bm25_features_val.to_parquet('./data/features/bm25_features_val.parquet.gzip', compression='gzip')
bm25_features_test.to_parquet('./data/features/bm25_features_test.parquet.gzip', compression='gzip')

### Content features


In [11]:
# Stack the per-item embedding vectors into one 2-D array once and reuse it for both
# splits instead of rebuilding the identical matrix for every call.
# NOTE(review): row i of the matrix is row i of items_meta_df — presumably the helper
# expects that row position to equal item_id; confirm against content_recommender.
item_embeddings_matrix = np.stack(items_meta_df['embeddings'].to_numpy(), axis=0)

# Validation candidates scored against each user's val_no_targets history.
content_features_val = get_content_similarity_features(
    embeddings_matrix = item_embeddings_matrix,
    candidates_df = merged_candidates_val,
    history_df = val_no_targets
    )

# Test candidates scored against the train history of the users found in test_df.
content_features_test = get_content_similarity_features(
    embeddings_matrix = item_embeddings_matrix,
    candidates_df = merged_candidates_test,
    history_df = train_df[train_df.user_id.isin(test_df.user_id.values)]
    )

Users candidates df w shape: 149_810_474
Users history df w shape: 20_379_394
Users candidates list len: 200_000
Users history list len: 200_000


100%|██████████| 200000/200000 [09:28<00:00, 351.60it/s]


Users candidates df w shape: 146_573_625
Users history df w shape: 32_219_777
Users candidates list len: 200_000
Users history list len: 200_000


100%|██████████| 200000/200000 [09:07<00:00, 365.43it/s]


In [12]:
# Row counts of the content feature frames, for comparison with the candidate row counts printed at load time.
print(f'Val features shape: {content_features_val.shape[0]:_}')
print(f'Test features shape: {content_features_test.shape[0]:_}')

Val features shape: 149_810_474
Test features shape: 146_573_625


In [13]:
# Preview the first three rows of the validation content features.
content_features_val.head(3)

Unnamed: 0,user_id,item_id,content_sim_mean,content_sim_min,content_sim_max,content_sim_std
0,4,93615,0.688007,0.4575,0.865685,0.07055
1,4,87797,0.766864,0.538591,0.93528,0.076561
2,4,159229,0.759494,0.542229,0.887457,0.076913


In [14]:
# Persist the content feature frames as gzip-compressed parquet files.
content_features_val.to_parquet('./data/features/content_features_val.parquet.gzip', compression='gzip')
content_features_test.to_parquet('./data/features/content_features_test.parquet.gzip', compression='gzip')

### User - Item - Source features



In [16]:
def calculate_rank(df_pd: pd.DataFrame) -> pd.DataFrame:
    """Attach a per-user ``rank`` column to an interaction frame.

    The frame's index is moved into a regular column, the rows are sorted by
    ``user_id`` and ``timestamp`` in polars, and ``rank`` is filled with a
    reverse cumulative count of ``item_id`` computed inside each ``user_id``
    window.

    Args:
        df_pd: pandas frame with ``user_id`` and ``item_id`` columns. After
            ``reset_index()`` it must expose a ``timestamp`` column
            (presumably the index is named ``timestamp`` — confirm for every
            caller).

    Returns:
        A pandas frame ordered by (``user_id``, ``timestamp``), indexed by
        ``timestamp``, with the additional ``rank`` column.
    """
    df_pl = pl.from_pandas(df_pd.reset_index())
    df_pl = df_pl.sort(['user_id', 'timestamp'])
    # The count runs from the end of each user's time-ordered rows, so presumably the
    # latest interaction receives the smallest rank — verify on the installed polars.
    # NOTE(review): `cumcount` is the older polars spelling; newer releases name it
    # `cum_count` and may use a different starting value — check before upgrading.
    df_pl = df_pl.with_columns([
        pl.col('item_id')
        .cumcount(reverse=True)
        .over(['user_id'])
        .alias('rank')
        ])
    return df_pl.to_pandas().set_index('timestamp')

# Rebind both interaction frames to their ranked versions; from here on they carry
# the `rank` column and are ordered by (user_id, timestamp).
train_split = calculate_rank(train_split)
train_df = calculate_rank(train_df)

In [18]:
def calc_source_item_stats(train_df, items_meta_df):
    """Aggregate interaction statistics for every (source_id, item_id) pair.

    Args:
        train_df: interactions with ``user_id``, ``item_id`` and ``timespent``
            columns. The frame is not modified.
        items_meta_df: item metadata with ``item_id``, ``source_id`` and
            ``embeddings`` columns; ``embeddings`` is dropped before the join.

    Returns:
        A frame sorted by ``source_id`` (ascending) and
        ``source_item_timespent_perc`` (descending) with the columns
        ``source_id``, ``item_id``, ``source_item_retention_perc`` (share of
        rows with non-zero timespent), ``source_item_timespent_perc`` (total
        timespent / row count), ``source_item_timespent_perc_nz`` (total
        timespent / non-zero row count), ``source_item_rank`` (0-based position
        inside the source after the sort), ``source_item_count_norm`` and
        ``source_item_timespent_norm`` (the pair's share of its source's row
        count / total timespent). Divisions by zero are left as pandas
        produces them (NaN or inf).
    """
    interactions = pd.merge(train_df, items_meta_df.drop(columns='embeddings'), on=['item_id'])
    # Vectorised "timespent is non-zero" flag: summing it per group gives the non-zero
    # row count without calling a Python lambda once per (source, item) group.
    interactions = interactions.assign(_timespent_nz=interactions['timespent'].ne(0))

    si_stats_df = interactions.groupby(['source_id', 'item_id'], as_index=False).agg(
        source_item_cnt=('user_id', 'count'),
        source_item_total_timespent=('timespent', 'sum'),
        source_item_cnt_nz=('_timespent_nz', 'sum'),
    )

    si_stats_df['source_item_retention_perc'] = si_stats_df['source_item_cnt_nz'] / si_stats_df['source_item_cnt']
    si_stats_df['source_item_timespent_perc'] = si_stats_df['source_item_total_timespent'] / si_stats_df['source_item_cnt']
    si_stats_df['source_item_timespent_perc_nz'] = si_stats_df['source_item_total_timespent'] / si_stats_df['source_item_cnt_nz']

    # Rank items inside each source by mean timespent per row, best first.
    si_stats_df = si_stats_df.sort_values(['source_id', 'source_item_timespent_perc'], ascending=[True, False])
    si_stats_df['source_item_rank'] = si_stats_df.groupby('source_id', sort=False).cumcount()

    # Share of the source's totals that this item accounts for.
    per_source = si_stats_df.groupby('source_id')
    si_stats_df['source_item_count_norm'] = (
        si_stats_df['source_item_cnt'] / per_source['source_item_cnt'].transform('sum')
    )
    si_stats_df['source_item_timespent_norm'] = (
        si_stats_df['source_item_total_timespent']
        / per_source['source_item_total_timespent'].transform('sum')
    )

    # The raw counters were only needed to derive the ratios above.
    return si_stats_df.drop(
        columns=['source_item_cnt', 'source_item_cnt_nz', 'source_item_total_timespent']
    )

# Validation statistics are built from train_split, test statistics from train_df.
si_stats_val = calc_source_item_stats(train_split, items_meta_df)
si_stats_test = calc_source_item_stats(train_df, items_meta_df)

In [19]:
# Persist the source-item statistics as gzip-compressed parquet files.
si_stats_val.to_parquet('./data/features/source_item_features_val.parquet.gzip', compression='gzip')
si_stats_test.to_parquet('./data/features/source_item_features_test.parquet.gzip', compression='gzip')

In [20]:
def get_stats(train_df, shifts, column):
    """Mean ``timespent`` per ``column`` value for several ``rank`` thresholds.

    For every threshold in ``shifts`` the rows whose ``rank`` value is at most
    that threshold are kept, and the mean ``timespent`` is computed per
    distinct ``column`` value. The per-threshold results are joined on
    ``column`` into one frame.

    Args:
        train_df: frame with ``rank``, ``timespent`` and ``column`` columns.
        shifts: iterable of rank thresholds, in any order; repeated values
            are used once.
        column: name of the grouping column (e.g. ``'item_id'``).

    Returns:
        A frame with ``column`` plus one
        ``{column}_mean_timespent_shift_{shift}`` column per threshold, in the
        order the thresholds were given. A key that has no rows under a given
        threshold gets 0 for that threshold.

    Raises:
        ValueError: if ``shifts`` is empty.
    """
    # Drop repeated thresholds (keeping first-seen order) so the join below never
    # meets two identically named statistic columns.
    unique_shifts = list(dict.fromkeys(shifts))
    if not unique_shifts:
        raise ValueError('shifts must contain at least one rank threshold')

    stats_dfs = None
    for shift in unique_shifts:
        train_part = train_df.loc[train_df['rank'] <= shift, [column, 'timespent']]
        stats = train_part.groupby(column, as_index=False).agg(
            **{f'{column}_mean_timespent_shift_{shift}': ('timespent', 'mean')})

        if stats_dfs is None:
            stats_dfs = stats
        else:
            # Outer join on the explicit key: keeps every key seen under any threshold,
            # so the result no longer depends on `shifts` being sorted ascending.
            stats_dfs = stats_dfs.merge(stats, on=column, how='outer')
    return stats_dfs.fillna(0)

def get_sources_popularity_stats(train_df, items_meta_df, shifts):
    """Per-source mean-timespent statistics for the given rank thresholds.

    Joins the interactions with the item metadata (without ``embeddings``) on
    ``item_id`` and hands the result to ``get_stats`` grouped by ``source_id``.
    """
    meta_without_embeddings = items_meta_df.drop(columns='embeddings')
    interactions_with_source = train_df.merge(meta_without_embeddings, on=['item_id'])
    return get_stats(interactions_with_source, shifts, 'source_id')


# Validation statistics use train_split, test statistics use train_df.
sources_stats_val = get_sources_popularity_stats(
    train_split, items_meta_df, [0, 10, 50, 100]
)
sources_stats_test = get_sources_popularity_stats(
    train_df, items_meta_df, [0, 10, 50, 100]
)

In [21]:
# Persist the per-source statistics as gzip-compressed parquet files.
sources_stats_val.to_parquet('./data/features/sources_features_val.parquet.gzip', compression='gzip')
sources_stats_test.to_parquet('./data/features/sources_features_test.parquet.gzip', compression='gzip')

In [22]:
def get_items_popularity_stats(train_df, offsets):
    """Per-item mean-timespent statistics: ``get_stats`` grouped by ``item_id``."""
    return get_stats(train_df, offsets, 'item_id')


# Validation statistics use train_split, test statistics use train_df.
item_stats_val = get_items_popularity_stats(
    train_split, [0, 10, 50, 100]
)
item_stats_test = get_items_popularity_stats(
    train_df, [0, 10, 50, 100]
)

In [23]:
# Persist the per-item statistics as gzip-compressed parquet files.
item_stats_val.to_parquet('./data/features/items_features_val.parquet.gzip', compression='gzip')
item_stats_test.to_parquet('./data/features/items_features_test.parquet.gzip', compression='gzip')

In [24]:
def get_user_source_stats(val_test_df, items_meta_df):
    """Mean ``timespent`` per (user_id, source_id) pair.

    The interactions are joined with the item metadata (``embeddings``
    removed) on ``item_id``; the result has the columns ``user_id``,
    ``source_id`` and ``timespent_per_view``.
    """
    meta_without_embeddings = items_meta_df.drop(columns='embeddings')
    interactions = val_test_df.merge(meta_without_embeddings, on=['item_id'])

    grouped = interactions[['user_id', 'source_id', 'timespent']].groupby(
        ['user_id', 'source_id'], as_index=False
    )
    return grouped.agg(timespent_per_view=('timespent', 'mean'))

# Validation uses val_no_targets as history; test uses the train_df rows of users present in test_df.
user_source_stats_val= get_user_source_stats(val_no_targets, items_meta_df)
user_source_stats_test = get_user_source_stats(train_df[train_df.user_id.isin(test_df.user_id.values)], items_meta_df)

In [25]:
# Persist the user-source statistics as gzip-compressed parquet files.
user_source_stats_val.to_parquet('./data/features/user_source_features_val.parquet.gzip', compression='gzip')
user_source_stats_test.to_parquet('./data/features/user_source_features_test.parquet.gzip', compression='gzip')

In [26]:
def get_user_stats(val_test_df, items_meta_df):
    """Per-user activity statistics.

    After joining the interactions with the item metadata (``embeddings``
    removed) on ``item_id``, each user gets ``total_timespent`` plus the
    number of rows and the summed timespent divided by the number of
    distinct sources (``views_per_artist`` and ``timespent_per_artist``).
    """
    meta_without_embeddings = items_meta_df.drop(columns='embeddings')
    interactions = val_test_df.merge(meta_without_embeddings, on=['item_id'])

    per_user = (
        interactions[['user_id', 'source_id', 'timespent']]
        .groupby('user_id', as_index=False)
        .agg(
            total_views=('source_id', 'count'),
            total_timespent=('timespent', 'sum'),
            unique_sources=('source_id', 'nunique'),
        )
    )

    # Ratios per distinct source the user interacted with.
    source_count = per_user['unique_sources']
    per_user = per_user.assign(
        views_per_artist=per_user['total_views'] / source_count,
        timespent_per_artist=per_user['total_timespent'] / source_count,
    )

    # Only the ratios and total_timespent are kept as features.
    return per_user.drop(columns=['total_views', 'unique_sources'])

# Validation uses val_no_targets as history; test uses the train_df rows of users present in test_df.
user_stats_val = get_user_stats(val_no_targets, items_meta_df)
user_stats_test = get_user_stats(train_df[train_df.user_id.isin(test_df.user_id.values)], items_meta_df)

In [27]:
# Persist the per-user statistics as gzip-compressed parquet files.
user_stats_val.to_parquet('./data/features/user_features_val.parquet.gzip', compression='gzip')
user_stats_test.to_parquet('./data/features/user_features_test.parquet.gzip', compression='gzip')