In [1]:
!pip install rectools

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rectools
  Downloading RecTools-0.3.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.0/89.0 KB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightfm<2.0,>=1.16
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 KB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting Markdown<3.3,>=3.2
  Downloading Markdown-3.2.2-py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.0/89.0 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.0.1
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting implicit==0.4.4
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m38.

In [2]:
pip install dill

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m512.7 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.6


In [3]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP
from rectools.model_selection import TimeRangeSplitter
from rectools.models import ImplicitItemKNNWrapperModel, PopularModel, PureSVDModel

from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

import dill

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [4]:
N = 10

Get KION dataset

In [5]:
!wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip

--2023-04-09 17:04:45--  https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78795385 (75M) [application/zip]
Saving to: ‘kion_train.zip’


2023-04-09 17:04:52 (13.2 MB/s) - ‘kion_train.zip’ saved [78795385/78795385]



In [6]:
!unzip kion_train.zip

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [7]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [8]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight},
                    inplace=True)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

# Interactions

In [9]:
pd.concat([interactions.head(), interactions.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [10]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [11]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [12]:
rec_interactions = Interactions(interactions)

In [13]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique():_}")

Users dataframe shape (840197, 5)
Unique users: 840_197


In [14]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique():_}")

Items dataframe shape (15963, 14)
Unique item_id: 15_963


# Create Blending model

In [15]:
class BlendingModel():
    
    def __init__(self, model1, model2):
        self._model1 = model1
        self._model2 = model2
        
    def fit(self, X):
        self._model1.fit(X)
        self._model2.fit(X)
    
    def recommend(self, *args, **kwargs):
        reco1 = self._model1.recommend(*args, **kwargs)
        reco2 = self._model2.recommend(*args, **kwargs)
        mreco = pd.concat((reco1,reco2)).sort_values(["user_id", "rank"])
        mreco = mreco.drop_duplicates(subset=["user_id", "item_id"], keep="first")
        reco_groups = mreco.groupby("user_id")
        reco = reco_groups.head(kwargs["k"])
        reco["rank"] = np.concatenate([list(range(x)) for x in reco.groupby("user_id")["user_id"].count()])
        return reco

Split

In [16]:
# For cv 
n_folds = 7
unit = "W" #week
n_units = 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-06-27 00:00:00'), Timestamp('2021-08-22 00:00:00'))


# Test fold

In [17]:
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

start_date: 2021-06-27 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 1W

Test fold borders: ['2021-06-27' '2021-07-04' '2021-07-11' '2021-07-18' '2021-07-25'
 '2021-08-01' '2021-08-08' '2021-08-15']


In [18]:
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "map@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

In [19]:


# few simple models to compare
models = {
    "cosine_itemknn": CosineRecommender(K=N),
    "tfidf_itemknn": TFIDFRecommender(K=N),
    "bm25_itemknn": BM25Recommender(K=N),
    "pure_svd": PureSVDModel(),
    "blend_cosine_tfidf": BlendingModel(
        ImplicitItemKNNWrapperModel(CosineRecommender(K=N)),
        ImplicitItemKNNWrapperModel(TFIDFRecommender(K=N)),
    ),
    "blend_cosine_bm25": BlendingModel(
        ImplicitItemKNNWrapperModel(CosineRecommender(K=N)),
        ImplicitItemKNNWrapperModel(BM25Recommender(K=N)),
    ),
    "blend_cosine_pure_svd": BlendingModel(
        ImplicitItemKNNWrapperModel(CosineRecommender(K=N)),
        PureSVDModel(),
    ),
    "blend_tfidf_bm25": BlendingModel(
        ImplicitItemKNNWrapperModel(TFIDFRecommender(K=N)),
        ImplicitItemKNNWrapperModel(BM25Recommender(K=N)),
    ),
    "blend_tfidf_pure_svd": BlendingModel(
        ImplicitItemKNNWrapperModel(TFIDFRecommender(K=N)),
        PureSVDModel(),
    ),
    "blend_bm25_pure_svd": BlendingModel(
        ImplicitItemKNNWrapperModel(BM25Recommender(K=N)),
        PureSVDModel(),
    ),
}




# Model training cv

In [20]:
%%time

results = []

fold_iterator = cv.split(rec_interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in tqdm(enumerate(fold_iterator), total=cv.get_n_splits(rec_interactions)):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    interactions_train  = Dataset.construct(
        interactions_df=df_train,
        user_features_df=None,
        item_features_df=None
    )
    
    catalog = df_train[Columns.Item].unique()
    
    for model_name, model in models.items():
        if model_name == "pure_svd":
            model = model
        elif model_name.startswith("blend_"):
            modle = model
        else:
            model = ImplicitItemKNNWrapperModel(model=model)
        model.fit(interactions_train)
        
        recs_itemknn = model.recommend(
            df_test['user_id'].unique(), 
            dataset=interactions_train, 
            k=N, 
            filter_viewed=False  # False - same items to every user
        )
        metric_values = calc_metrics(
            metrics,
            reco=recs_itemknn,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        pprint(fold)
        results.append(fold)

  0%|          | 0/7 [00:00<?, ?it/s]


{'End date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-06-27 00:00:00', freq='W-SUN'),
 'Test': 237414,
 'Test items': 5947,
 'Test users': 98930,
 'Train': 2533586,
 'Train items': 14092,
 'Train users': 536802}
{'fold': 0,
 'map@10': 0.030780478331709257,
 'model': 'cosine_itemknn',
 'novelty': 8.639375623023268,
 'prec@10': 0.027854038208834527,
 'recall@10': 0.16991309024149737,
 'serendipity': 3.4622805775989926e-05}
{'fold': 0,
 'map@10': 0.037825762097538115,
 'model': 'tfidf_itemknn',
 'novelty': 7.289713840587525,
 'prec@10': 0.03565248155261296,
 'recall@10': 0.20874656011549036,
 'serendipity': 5.425983082852559e-05}
{'fold': 0,
 'map@10': 0.07093214185768328,
 'model': 'bm25_itemknn',
 'novelty': 4.471581947568645,
 'prec@10': 0.06021227130294147,
 'recall@10': 0.3450943240365177,
 'serendipity': 2.271946787767931e-05}
{'fold': 0,
 'map@10': 0.07638328154893521,
 'model': 'pure_svd',
 'novelty': 5.864210907769001,
 'prec@10': 0.04536439

# Metrics

In [21]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,map@10,novelty,serendipity
0,0,cosine_itemknn,0.027854,0.169913,0.030780,8.639376,0.000035
1,0,tfidf_itemknn,0.035652,0.208747,0.037826,7.289714,0.000054
2,0,bm25_itemknn,0.060212,0.345094,0.070932,4.471582,0.000023
3,0,pure_svd,0.045364,0.277383,0.076383,5.864211,0.000012
4,0,blend_cosine_tfidf,0.031060,0.184352,0.042869,8.031990,0.000046
...,...,...,...,...,...,...,...
65,6,blend_cosine_bm25,0.027005,0.143719,0.049881,6.067187,0.000044
66,6,blend_cosine_pure_svd,0.021547,0.118069,0.054092,6.265267,0.000035
67,6,blend_tfidf_bm25,0.029594,0.156538,0.054864,5.440941,0.000066
68,6,blend_tfidf_pure_svd,0.024157,0.131257,0.057680,5.750193,0.000054


In [22]:
df_metrics.to_pickle("df_metrics.pickle")

Groupby models

In [23]:
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,map@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
blend_bm25_pure_svd,0.036358,0.20297,0.08907,4.749459,2.2e-05
blend_cosine_bm25,0.033377,0.187231,0.062353,6.190634,3.7e-05
blend_cosine_pure_svd,0.026297,0.151696,0.067104,6.491201,3.1e-05
blend_cosine_tfidf,0.024674,0.14118,0.047577,7.637336,6.8e-05
blend_tfidf_bm25,0.036192,0.201456,0.067582,5.572789,5.4e-05
blend_tfidf_pure_svd,0.028867,0.164598,0.070673,5.991766,4.6e-05
bm25_itemknn,0.042135,0.230893,0.062387,4.090628,2.6e-05
cosine_itemknn,0.021627,0.126067,0.029981,8.189264,5.1e-05
pure_svd,0.028351,0.160396,0.061973,5.316291,1.4e-05
tfidf_itemknn,0.028271,0.159305,0.037134,6.908348,8e-05


If a diff between model metrics less than an std value => there is no significant difference observed

In [24]:
df_metrics.groupby('model').std()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,map@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
blend_bm25_pure_svd,0.006915,0.046826,0.023304,0.255851,2e-06
blend_cosine_bm25,0.006428,0.044941,0.015169,0.203095,7e-06
blend_cosine_pure_svd,0.005053,0.035481,0.018634,0.26194,5e-06
blend_cosine_tfidf,0.004939,0.034913,0.011853,0.175843,1.4e-05
blend_tfidf_bm25,0.007078,0.048665,0.015658,0.182415,1e-05
blend_tfidf_pure_svd,0.005266,0.036749,0.018939,0.260999,8e-06
bm25_itemknn,0.010479,0.068746,0.014453,0.179437,3e-06
cosine_itemknn,0.00473,0.033791,0.007399,0.20252,1.1e-05
pure_svd,0.008489,0.05874,0.016331,0.307187,2e-06
tfidf_itemknn,0.005514,0.038618,0.008259,0.172059,1.8e-05


# Fit best model

In [25]:
dataset  = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

In [26]:
model = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=N))
model.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f35ac8c8340>

In [27]:
with open('bm25_itemknn.dill', 'wb') as f:
    dill.dump(model, f)

# model for cold users

In [29]:
popular_model = PopularModel(add_cold=True)
popular_model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7f35acf45340>

In [30]:
with open('popular_model.dill', 'wb') as f:
    dill.dump(model, f)


# Offline models

In [31]:
with open("bm25_itemknn.dill", "rb") as f:
    model = dill.load(f)

In [32]:
users_ids = interactions['user_id'].unique()

In [33]:
offline_recs = model.recommend(
    users_ids, 
    dataset=dataset, 
    k=10, 
    filter_viewed=False  # False - same items to every user
)

In [34]:
offline_recs = offline_recs.groupby('user_id').agg({'item_id': list})

In [35]:
offline_recs_list = [ None ] * (max(users_ids) + 1)
for i, item in tqdm(offline_recs.iterrows(), total = len(users_ids)):
    offline_recs_list[item.name] = item["item_id"]

  0%|          | 0/962179 [00:00<?, ?it/s]

In [36]:
offline_recs

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[15297, 10440, 4151, 13865, 9728, 3734, 12192, 142, 2657, 4880]"
1,"[10440, 15297, 13865, 9728, 4151, 2657, 3734, 142, 9996, 4880]"
2,"[7571, 9728, 13865, 16166, 3734, 10440, 15297, 5693, 3182, 12841]"
3,"[10440, 15297, 9728, 4151, 13865, 3734, 4880, 2657, 142, 9996]"
4,"[4700, 9728, 13865, 10440, 15297, 8636, 3734, 4151, 4457, 142]"
...,...
1097553,"[13058, 12463, 24, 15297, 10440, 101, 4151, 1916, 14470, 15531]"
1097554,"[1053, 849, 9728, 10440, 13865, 12463, 3509, 142, 9169, 11237]"
1097555,"[4880, 9728, 10440, 13865, 15297, 4151, 3734, 2657, 142, 9996]"
1097556,"[12812, 15297, 3734, 9728, 10440, 13865, 4151, 4880, 12192, 142]"


In [37]:
offline_recs.to_csv (r'my_data.csv', index= "user_id")

In [38]:
with open("popular_model.dill", "rb") as f:
    popular_model = dill.load(f)

In [39]:
popular_model_recs = list(popular_model.recommend(
    [0], 
    dataset=dataset, 
    k=10, 
    filter_viewed=False  # False - same items to every user
)['item_id'])

In [40]:
with open('offline_bm25_itemknn.dill', 'wb') as f:
    dill.dump(
        {
            "recs": offline_recs_list,
            "popular_recs": popular_model_recs,
        },
        f,
    )

In [57]:
popular_model_recs

[15297, 10440, 4151, 13865, 9728, 3734, 12192, 142, 2657, 4880]

In [41]:
df_metrics.groupby('model').std()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,map@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
blend_bm25_pure_svd,0.006915,0.046826,0.023304,0.255851,2e-06
blend_cosine_bm25,0.006428,0.044941,0.015169,0.203095,7e-06
blend_cosine_pure_svd,0.005053,0.035481,0.018634,0.26194,5e-06
blend_cosine_tfidf,0.004939,0.034913,0.011853,0.175843,1.4e-05
blend_tfidf_bm25,0.007078,0.048665,0.015658,0.182415,1e-05
blend_tfidf_pure_svd,0.005266,0.036749,0.018939,0.260999,8e-06
bm25_itemknn,0.010479,0.068746,0.014453,0.179437,3e-06
cosine_itemknn,0.00473,0.033791,0.007399,0.20252,1.1e-05
pure_svd,0.008489,0.05874,0.016331,0.307187,2e-06
tfidf_itemknn,0.005514,0.038618,0.008259,0.172059,1.8e-05


In [42]:

recos = offline_recs.copy()

recos

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[15297, 10440, 4151, 13865, 9728, 3734, 12192, 142, 2657, 4880]"
1,"[10440, 15297, 13865, 9728, 4151, 2657, 3734, 142, 9996, 4880]"
2,"[7571, 9728, 13865, 16166, 3734, 10440, 15297, 5693, 3182, 12841]"
3,"[10440, 15297, 9728, 4151, 13865, 3734, 4880, 2657, 142, 9996]"
4,"[4700, 9728, 13865, 10440, 15297, 8636, 3734, 4151, 4457, 142]"
...,...
1097553,"[13058, 12463, 24, 15297, 10440, 101, 4151, 1916, 14470, 15531]"
1097554,"[1053, 849, 9728, 10440, 13865, 12463, 3509, 142, 9169, 11237]"
1097555,"[4880, 9728, 10440, 13865, 15297, 4151, 3734, 2657, 142, 9996]"
1097556,"[12812, 15297, 3734, 9728, 10440, 13865, 4151, 4880, 12192, 142]"


In [43]:
recos[:10]

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[15297, 10440, 4151, 13865, 9728, 3734, 12192, 142, 2657, 4880]"
1,"[10440, 15297, 13865, 9728, 4151, 2657, 3734, 142, 9996, 4880]"
2,"[7571, 9728, 13865, 16166, 3734, 10440, 15297, 5693, 3182, 12841]"
3,"[10440, 15297, 9728, 4151, 13865, 3734, 4880, 2657, 142, 9996]"
4,"[4700, 9728, 13865, 10440, 15297, 8636, 3734, 4151, 4457, 142]"
5,"[7825, 5115, 4179, 5324, 696, 7653, 12314, 9785, 8450, 7043]"
7,"[8710, 10440, 15297, 4880, 13865, 16228, 4151, 805, 14266, 9996]"
8,"[6809, 15297, 10440, 4151, 9728, 13865, 142, 4880, 3734, 4740]"
9,"[15297, 10440, 13865, 9728, 4151, 3734, 2657, 142, 4880, 9996]"
10,"[13865, 10440, 9728, 15297, 3734, 4151, 4880, 2657, 142, 10464]"
