In [1]:
import polars as pl
import pandas as pd 
import numpy as np

from recommenders.als_recommender import train_als, ALSRecommender
from recommenders.bm25_recommender import train_bm25, Item2ItemRecommender
from recommenders.popular_recommender import calculate_popularity, PopularRecommender
from recommenders.content_recommender import calculate_similar_items, ContentRecommender
from recommenders.utils import recalculate_target_all_time, recalculate_target_users_hist

In [2]:
# Full training data; rename_axis labels the index axis 'timestamp'.
train_df = pd.read_parquet('./data/train.parquet.gzip')
train_df = train_df.rename_axis('timestamp')

# Validation setup, read in order: training split, validation frame without
# targets, validation targets (roles as suggested by the file names).
train_split, val_no_targets, val_targets = (
    pd.read_parquet(split_path)
    for split_path in (
        './data/splits/train.parquet.gzip',
        './data/splits/val_no_targets.parquet.gzip',
        './data/splits/val_targets.parquet.gzip',
    )
)

# Test frame, item metadata and the fresh-candidates table.
test_df, items_meta_df, fresh_candidates_df = (
    pd.read_parquet(table_path)
    for table_path in (
        './data/test.parquet.gzip',
        './data/items_meta.parquet.gzip',
        './data/fresh_candidates.parquet.gzip',
    )
)

### ALS Recommender

In [None]:
# NOTE(review): disabled step, kept for reference. When uncommented it
# replaces both `train_split` and `train_df` with the output of
# `recalculate_target_all_time` (alpha_coeff=100.0, reaction_coeff=6.0) and
# prints the resulting row counts. Because it overwrites its own inputs,
# re-running the cell would feed already-transformed frames back in —
# presumably not idempotent; confirm before re-enabling.

# train_split = recalculate_target_all_time(
#     train_split, 
#     alpha_coeff = 100.0, 
#     reaction_coeff = 6.0)

# train_df = recalculate_target_all_time(
#     train_df, 
#     alpha_coeff = 100.0, 
#     reaction_coeff = 6.0)

# print(f"Train split: {train_split.shape[0]:_}")
# print(f'Train full: {train_df.shape[0]:_}')

In [3]:
# Hyperparameters shared by the validation and the full-data ALS runs.
# They live in one place so the two models cannot silently drift apart when
# one copy of the literals is edited and the other is forgotten.
# NOTE(review): the recorded cell output shows a 25-step progress bar while
# iterations is 75 here — the saved output may predate this setting; re-run
# to confirm.
ALS_TRAIN_PARAMS = dict(
    factors = 512,
    iterations = 75,
    regularization = 2.5,
    save_result = True,
    root_dir = './models/als/',
)

# Validation setup: fit on the training split only.
recommender_train_split, sparse_train_split = train_als(
    train_split,
    model_name = 'val',
    **ALS_TRAIN_PARAMS
    )

als_recommender_val = ALSRecommender(
    als_model = recommender_train_split,
    sparse_user_item = sparse_train_split
    )

# Test setup: fit on the full training data with the same hyperparameters.
recommender_train_full, sparse_train_full = train_als(
    train_df,
    model_name = 'test',
    **ALS_TRAIN_PARAMS
    )

als_recommender_test = ALSRecommender(
    als_model = recommender_train_full,
    sparse_user_item = sparse_train_full
    )

Converted data to sparse: (1000183, 227606)


  0%|          | 0/25 [00:00<?, ?it/s]

Saved model to: ./models/als/recommender_val.pkl
Saved sparse matrix to: ./models/als/matrix_val.npz
Converted data to sparse: (1000183, 227606)


  0%|          | 0/25 [00:00<?, ?it/s]

Saved model to: ./models/als/recommender_test.pkl
Saved sparse matrix to: ./models/als/matrix_test.npz


In [4]:
# Validation: request 300 ALS recommendations for every distinct user in
# val_no_targets, with no item filter.
val_user_ids = val_no_targets['user_id'].unique()
als_candidates_val = als_recommender_val.recommend(
    user_ids=val_user_ids,
    n_recs=300,
    filter_items=None,
)

# Test: same request for the test users, passing as filter_items every item id
# from the metadata table that is absent from the fresh candidates
# (presumably these are excluded from the recommendations — confirm).
test_user_ids = test_df['user_id'].unique()
in_fresh_pool = items_meta_df['item_id'].isin(fresh_candidates_df['item_id'].values)
non_fresh_item_ids = items_meta_df.loc[~in_fresh_pool, 'item_id'].values
als_candidates_test = als_recommender_test.recommend(
    user_ids=test_user_ids,
    n_recs=300,
    filter_items=non_fresh_item_ids,
)

In [5]:
# Row counts of the ALS candidate tables, printed with '_' digit grouping.
als_val_rows = len(als_candidates_val)
als_test_rows = len(als_candidates_test)
print(f'Recommendations val no targets: {als_val_rows:_}')
print(f'Recommendations test: {als_test_rows:_}')

Recommendations val no targets: 60_000_000
Recommendations test: 60_000_000


In [6]:
# Preview the first three ALS validation candidates (rich display of the cell value).
als_candidates_val.head(n=3)

Unnamed: 0,user_id,item_id,als_sim_score,als_sim_rank
0,4,93615,0.514802,0
1,4,87797,0.234164,1
2,4,159229,0.224879,2


In [7]:
# Persist both ALS candidate tables as gzip-compressed parquet files.
als_val_out = './data/candidates/als_candidates_300_val.parquet.gzip'
als_test_out = './data/candidates/als_candidates_300_test.parquet.gzip'
als_candidates_val.to_parquet(als_val_out, compression='gzip')
als_candidates_test.to_parquet(als_test_out, compression='gzip')

### BM25 Recommender 

In [8]:
# Hyperparameters shared by the validation and the full-data BM25 runs.
# They live in one place so the two models are guaranteed to be trained with
# the same settings.
# NOTE(review): the recorded cell output reports models saved under
# '../models/bm25/' while root_dir here is './models/bm25/' — the saved output
# may predate this path; re-run to confirm.
BM25_TRAIN_PARAMS = dict(
    K = 500,
    K1 = 1.5,
    B = 0.75,
    save_result = True,
    root_dir = './models/bm25/',
)

# Validation setup: fit on the training split only.
i2i_model_val = train_bm25(
    train_split,
    model_name = 'val',
    **BM25_TRAIN_PARAMS
    )

i2i_recommender_val = Item2ItemRecommender(i2i_model_val)

# Test setup: fit on the full training data with the same hyperparameters.
i2i_model_test = train_bm25(
    train_df,
    model_name = 'test',
    **BM25_TRAIN_PARAMS
    )

i2i_recommender_test = Item2ItemRecommender(i2i_model_test)

Converted data to sparse: (1000183, 227606)




  0%|          | 0/227606 [00:00<?, ?it/s]

Saved model to: ../models/bm25/recommender_val.pkl
Converted data to sparse: (1000183, 227606)




  0%|          | 0/227606 [00:00<?, ?it/s]

Saved model to: ../models/bm25/recommender_test.pkl


In [9]:
# NOTE(review): the recorded output logs n_recs=100 for these calls although
# 200 is requested — presumably n_recs is split across max_els inside the
# recommender in 'MZ' mode; confirm in Item2ItemRecommender.

# Validation: recommend from the validation users' history, no item filter.
bm25_candidates_val = i2i_recommender_val.recommend(
    users_history=val_no_targets,
    n_recs=200,
    filter_items=None,
    mode='MZ',
    kwargs={'max_els': 2},
)

# Test: history is the slice of train_df belonging to test users; filter_items
# holds every metadata item id that is absent from the fresh candidates.
bm25_in_fresh_pool = items_meta_df['item_id'].isin(fresh_candidates_df['item_id'].values)
bm25_non_fresh_item_ids = items_meta_df.loc[~bm25_in_fresh_pool, 'item_id'].values
bm25_candidates_test = i2i_recommender_test.recommend(
    users_history=train_df.loc[train_df['user_id'].isin(test_df['user_id'].values)],
    n_recs=200,
    filter_items=bm25_non_fresh_item_ids,
    mode='MZ',
    kwargs={'max_els': 2},
)

Recommender with args: mode=MZ, n_recs=100, max_els=2


100%|██████████| 200000/200000 [02:40<00:00, 1245.14it/s]


Recommender with args: mode=MZ, n_recs=100, max_els=2


100%|██████████| 200000/200000 [09:43<00:00, 342.79it/s]


In [10]:
# Row counts of the BM25 candidate tables, printed with '_' digit grouping.
bm25_val_rows = len(bm25_candidates_val)
bm25_test_rows = len(bm25_candidates_test)
print(f'Recommendations val no targets: {bm25_val_rows:_}')
print(f'Recommendations test: {bm25_test_rows:_}')

Recommendations val no targets: 33_156_657
Recommendations test: 32_697_962


In [11]:
# Preview the first three BM25 validation candidates (rich display of the cell value).
bm25_candidates_val.head(n=3)

Unnamed: 0,user_id,item_id,bm25_sim_score,bm25_sim_rank
0,4,214442,1178.543958,1
1,4,192478,1130.758259,2
2,4,198859,1073.323649,3


In [12]:
# Persist both BM25 candidate tables as gzip-compressed parquet files.
bm25_val_out = './data/candidates/bm25_candidates_2mz_200_val.parquet.gzip'
bm25_test_out = './data/candidates/bm25_candidates_2mz_200_test.parquet.gzip'
bm25_candidates_val.to_parquet(bm25_val_out, compression='gzip')
bm25_candidates_test.to_parquet(bm25_test_out, compression='gzip')

### Content Recommender

In [13]:
# Stack the per-item embedding vectors into one dense matrix with one row per
# row of items_meta_df. Both runs below use the same matrix, so it is built
# once instead of being re-stacked for every call.
# NOTE(review): assumes calculate_similar_items does not modify the matrix in
# place — confirm, otherwise pass a copy to each call.
item_embeddings = np.stack(items_meta_df['embeddings'].to_numpy(), axis=0)

# Validation setup: no candidate restriction (candidates_df=None).
similarities_val = calculate_similar_items(
    embeddings_matrix = item_embeddings,
    n_recs = 200,
    candidates_df = None,
    save_result = True,
    file_name = 'val',
    root_dir = './models/content/'
    )

# Test setup: same embeddings, with the fresh-candidates table passed as
# candidates_df (presumably limiting similar items to that pool — confirm).
similarities_test = calculate_similar_items(
    embeddings_matrix = item_embeddings,
    n_recs = 200,
    candidates_df = fresh_candidates_df,
    save_result = True,
    file_name = 'test',
    root_dir = './models/content/'
    )

100%|██████████| 227606/227606 [49:59<00:00, 75.88it/s] 
100%|██████████| 227606/227606 [09:48<00:00, 386.45it/s]


In [14]:
# Build one content recommender per setup from the in-memory similarity tables.
content_recommender_val, content_recommender_test = (
    ContentRecommender(similarities_df=similarities_val),
    ContentRecommender(similarities_df=similarities_test),
)

# Alternative kept for reference: construct the recommenders from parquet
# files under ./models/content/ instead of the in-memory frames (presumably
# the files written by calculate_similar_items with save_result=True — confirm).
# content_recommender_val = ContentRecommender(similarities_filepath = './models/content/similarities_val.parquet.gzip')
# content_recommender_test = ContentRecommender(similarities_filepath = './models/content/similarities_test.parquet.gzip')

In [15]:
# NOTE(review): the recorded output logs n_recs=75 for these calls although
# 150 is requested — presumably n_recs is split across max_els inside the
# recommender in 'MZ' mode; confirm in ContentRecommender.

# Validation: recommend from the validation users' history.
content_candidates_val = content_recommender_val.recommend(
    users_history=val_no_targets,
    n_recs=150,
    mode='MZ',
    kwargs={'max_els': 2},
)

# Test: history is the slice of train_df belonging to test users.
content_candidates_test = content_recommender_test.recommend(
    users_history=train_df.loc[train_df['user_id'].isin(test_df['user_id'].values)],
    n_recs=150,
    mode='MZ',
    kwargs={'max_els': 2},
)

Recommender with args: mode=MZ, n_recs=75, max_els=2


100%|██████████| 200000/200000 [01:50<00:00, 1808.55it/s]


Recommender with args: mode=MZ, n_recs=75, max_els=2


100%|██████████| 200000/200000 [01:54<00:00, 1754.23it/s]


In [16]:
# Row counts of the content candidate tables, printed with '_' digit grouping.
content_val_rows = len(content_candidates_val)
content_test_rows = len(content_candidates_test)
print(f'Recommendations val no targets: {content_val_rows:_}')
print(f'Recommendations test: {content_test_rows:_}')

Recommendations val no targets: 29_242_627
Recommendations test: 29_095_094


In [17]:
# Preview the first three content-based validation candidates (rich display of the cell value).
content_candidates_val.head(n=3)

Unnamed: 0,user_id,item_id,content_sim_score,content_sim_rank
0,4,105185,0.908647,0
1,4,172462,0.904849,1
2,4,24481,0.904252,2


In [18]:
# Persist both content candidate tables as gzip-compressed parquet files.
content_val_out = './data/candidates/content_candidates_2mz_150_val.parquet.gzip'
content_test_out = './data/candidates/content_candidates_2mz_150_test.parquet.gzip'
content_candidates_val.to_parquet(content_val_out, compression='gzip')
content_candidates_test.to_parquet(content_test_out, compression='gzip')

### Popular Recommender 


In [19]:
# The same last_n value is passed to both popularity computations.
popularity_last_n = 100

# Validation setup: popularity computed from the training split.
train_split_popularity = calculate_popularity(train_split, last_n=popularity_last_n)
popular_recommender_val_no_targets = PopularRecommender(train_split_popularity)

# Test setup: popularity computed from the full training data.
train_df_popularity = calculate_popularity(train_df, last_n=popularity_last_n)
popular_recommender_test = PopularRecommender(train_df_popularity)

In [20]:
# Validation: popular-item candidates for the validation history, no item filter.
popular_candidates_val = popular_recommender_val_no_targets.recommend(
    history_df=val_no_targets,
    n_recs=200,
    filter_items=None,
)

# Test: history is the slice of train_df belonging to test users; filter_items
# holds every metadata item id that is absent from the fresh candidates.
popular_in_fresh_pool = items_meta_df['item_id'].isin(fresh_candidates_df['item_id'].values)
popular_non_fresh_item_ids = items_meta_df.loc[~popular_in_fresh_pool, 'item_id'].values
popular_candidates_test = popular_recommender_test.recommend(
    history_df=train_df.loc[train_df['user_id'].isin(test_df['user_id'].values)],
    n_recs=200,
    filter_items=popular_non_fresh_item_ids,
)

In [21]:
# Row counts of the popular candidate tables, printed with '_' digit grouping.
popular_val_rows = len(popular_candidates_val)
popular_test_rows = len(popular_candidates_test)
print(f'Recommendations val no targets: {popular_val_rows:_}')
print(f'Recommendations test: {popular_test_rows:_}')

Recommendations val no targets: 39_979_012
Recommendations test: 39_941_434


In [22]:
# Preview the first three popular-item validation candidates (rich display of the cell value).
popular_candidates_val.head(n=3)

Unnamed: 0,user_id,item_id,mean_timespent
0,4,4628,25.8
1,4,169029,14.2
2,4,146586,13.283721


In [24]:
# Persist both popular candidate tables as gzip-compressed parquet files.
popular_val_out = './data/candidates/popular_candidates_200_val.parquet.gzip'
popular_test_out = './data/candidates/popular_candidates_200_test.parquet.gzip'
popular_candidates_val.to_parquet(popular_val_out, compression='gzip')
popular_candidates_test.to_parquet(popular_test_out, compression='gzip')