In [1]:
!pip install rectools

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rectools
  Downloading RecTools-0.3.0-py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 2.6 MB/s 
[?25hCollecting nmslib<3.0.0,>=2.0.4
  Downloading nmslib-2.1.1-cp38-cp38-manylinux2010_x86_64.whl (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 17.9 MB/s 
Collecting lightfm<2.0,>=1.16
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 17.7 MB/s 
Collecting Markdown<3.3,>=3.2
  Downloading Markdown-3.2.2-py3-none-any.whl (88 kB)
[K     |████████████████████████████████| 88 kB 4.6 MB/s 
[?25hCollecting attrs<22.0.0,>=19.1.0
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 4.6 MB/s 
[?25hCollecting implicit==0.4.4
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 37.8 MB/s 
Collecting pybind11<2.6.2


In [2]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pickle
import dill
import numpy as np
import pandas as pd
import scipy as sp
import tqdm

from collections import Counter
from pathlib import Path

from implicit.nearest_neighbours import  BM25Recommender, CosineRecommender
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models.popular import PopularModel
from rectools.model_selection import TimeRangeSplitter


In [3]:
!mkdir ../data

mkdir: cannot create directory ‘../data’: File exists


In [4]:
!wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip -O ../data/data_original.zip

--2022-12-11 10:15:45--  https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78795385 (75M) [application/zip]
Saving to: ‘../data/data_original.zip’


2022-12-11 10:15:52 (13.5 MB/s) - ‘../data/data_original.zip’ saved [78795385/78795385]



In [5]:
!unzip ../data/data_original.zip -d ../data

Archive:  ../data/data_original.zip
replace ../data/kion_train/interactions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [19]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

In [20]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])


In [46]:
_, bins = pd.qcut(items['release_year'], 10, retbins=True)
labels = bins[:-1]

year_feature = pd.DataFrame(
    {
        'id': items['item_id'],
        'value': pd.cut(items['release_year'], bins=bins, labels=bins[:-1]),
        'feature': 'release_year'
    }
)

In [47]:
items['genre'] = items['genres'].str.split(',')
items[['genre', 'genres']].head(3)

Unnamed: 0,genre,genres
0,"[драмы, зарубежные, детективы, мелодрамы]","драмы, зарубежные, детективы, мелодрамы"
1,"[зарубежные, приключения, комедии]","зарубежные, приключения, комедии"
2,"[криминал, зарубежные, триллеры, боевики, ...","криминал, зарубежные, триллеры, боевики, комедии"


In [48]:
genre_feature = items[['item_id', 'genre']].explode('genre')
genre_feature.columns = ['id', 'value']
genre_feature['feature'] = 'genre'
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [49]:
item_feat = pd.concat([genre_feature, year_feature])
item_feat = item_feat[item_feat['id'].isin(interactions['item_id'])]

In [50]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=item_feat,
    cat_item_features=['genre', 'release_year']
)

In [51]:
popular_model = PopularModel()
popular_model.fit(dataset);

In [52]:
# join titles from items 
popular_recommendations = popular_model.recommend(
    dataset.user_id_map.external_ids[:1], 
    dataset=dataset, 
    k=20, 
    filter_viewed=False
).merge(items[['item_id', 'title']], 
       on='item_id',
       how='left')

In [53]:
popular_recommendations.head(10)

Unnamed: 0,user_id,item_id,score,rank,title
0,176549,10440,202457.0,1,Хрустальный
1,176549,15297,193123.0,2,Клиника счастья
2,176549,9728,132865.0,3,Гнев человеческий
3,176549,13865,122119.0,4,Девятаев
4,176549,4151,91167.0,5,Секреты семейной жизни
5,176549,3734,74803.0,6,Прабабушка легкого поведения
6,176549,2657,68581.0,7,Подслушано
7,176549,4880,55043.0,8,Афера
8,176549,142,45367.0,9,Маша
9,176549,6809,40372.0,10,Дуров


In [54]:
# train test split 
# test = last 1 week 

n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [55]:
# we have just 1 test fold - no need to iterate over fold
(train_ids, test_ids, fold_info) = cv.split(Interactions(interactions), collect_fold_stats=True).__next__()

In [56]:
# Prepare train matrix 
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]


In [57]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")


users_mapping amount: 842129
items_mapping amount: 15404


In [59]:
 def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [60]:
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)


In [61]:
models = {
    'Cosine_10': CosineRecommender(K=10),
    'Cosine_30': CosineRecommender(K=30),
    'Cosine_50': CosineRecommender(K=50),
    'BM_10' : BM25Recommender(K=10),
    'BM_30' : BM25Recommender(K=30),
    'BM_50' : BM25Recommender(K=50)
}

metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()

In [62]:
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

for model_name, model in tqdm.tqdm(models.items()):
    model.fit(interaction_matrix)
    with open(models_dir / f'{model_name}.dill', 'wb') as f:
        dill.dump(model, f)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/842129 [00:00<?, ?it/s]

 17%|█▋        | 1/6 [29:02<2:25:14, 1742.86s/it]

  0%|          | 0/842129 [00:00<?, ?it/s]

 33%|███▎      | 2/6 [58:47<1:57:50, 1767.71s/it]

  0%|          | 0/842129 [00:00<?, ?it/s]

 50%|█████     | 3/6 [1:26:15<1:25:38, 1712.82s/it]

  0%|          | 0/842129 [00:00<?, ?it/s]

 67%|██████▋   | 4/6 [1:54:28<56:50, 1705.10s/it]  

  0%|          | 0/842129 [00:00<?, ?it/s]

 83%|████████▎ | 5/6 [2:21:32<27:55, 1675.74s/it]

  0%|          | 0/842129 [00:00<?, ?it/s]

100%|██████████| 6/6 [2:48:19<00:00, 1683.21s/it]


In [63]:
models_dir = Path('../models')

for model_name in tqdm.tqdm(models.keys()):
    with open(models_dir / f'{model_name}.dill', 'rb') as f:
        models[model_name] = dill.load(f)

100%|██████████| 6/6 [00:15<00:00,  2.53s/it]
