In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
from scipy.stats import mode
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset, Interactions, IdMap

from service.rec_models.userknn import UserKnn

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

# Get KION dataset

<a href="https://ods.ai/competitions/competition-recsys-21/data"> Dataset description [ru] </a>


In [49]:

# download dataset by chunks
# url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"
#
# req = requests.get(url, stream=True)
#
# with open('../datasets/kion_train.zip', "wb") as fd:
#     total_size_in_bytes = int(req.headers.get('Content-Length', 0))
#     progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
#     for chunk in req.iter_content(chunk_size=2 ** 20):
#         progress_bar.update(len(chunk))
#         fd.write(chunk)

In [50]:
!unzip kion_train.zip

"unzip" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.


# EDA

In [2]:
interactions = pd.read_csv('../datasets/kion_train/interactions.csv')
users = pd.read_csv('../datasets/kion_train/users.csv')
items = pd.read_csv('../datasets/kion_train/items.csv')

In [4]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight},
                    inplace=True)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

## interactions

In [53]:
pd.concat([interactions.head(), interactions.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [54]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [55]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [56]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


## users

In [57]:
pd.concat([users.head(), users.tail()])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,,,,0
840195,590706,,,Ж,0
840196,166555,age_65_inf,income_20_40,Ж,0


In [58]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique():_}")

Users dataframe shape (840197, 5)
Unique users: 840_197


## items

In [59]:
# TODO
pd.concat([items.head(2), items.tail(2)])[["item_id", "content_type", "title", "release_year"]]

Unnamed: 0,item_id,content_type,title,release_year
0,10711,film,Поговори с ней,2002.0
1,2508,film,Голые перцы,2014.0
15961,4538,series,Среди камней,2019.0
15962,3206,series,Гоша,2019.0


In [60]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique():_}")

Items dataframe shape (15963, 14)
Unique item_id: 15_963


## IdMap

In [61]:
userMap = IdMap.from_values(interactions[Columns.User])
itemMap = IdMap.from_values(interactions[Columns.Item])

#  userkNN model  CV

Compare implicit `CosineRecommender` and `TFIDFRecommender` as an ItemKnn base



In [5]:
# setting for cv
n_folds = 1
unit = "W"
n_units = 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))


### Test fold borders

In [6]:
from rectools.model_selection import TimeRangeSplitter

periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

rec_interactions = Interactions.from_raw(interactions, userMap, itemMap)
print(f"Real number of folds: {cv.get_n_splits(rec_interactions)}")

start_date: 2021-08-08 00:00:00
last_date: 2021-08-22 00:00:00
periods: 2
freq: 1W

Test fold borders: ['2021-08-08' '2021-08-15']


NameError: name 'userMap' is not defined

In [7]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP, calc_metrics

# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
    "MAP@10": MAP(k=10),
}

# few simple models to compare
models = {
    "cosine_itemknn": CosineRecommender(),
    "tfidf_itemknn": TFIDFRecommender(),
}


# Model training by fold

In [65]:
%%time

results = []

fold_iterator = cv.split(rec_interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = rec_interactions.df.iloc[train_ids].copy()
    df_test = rec_interactions.df.iloc[test_ids].copy()

    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        if model_name == "cosine_itemknn":
            userknn_model = UserKnn(model)
            userknn_model.fit(df_train, False)
            # userknn_model = UserKnn.load("userknn_cosine.dill")
        else:
            userknn_model = UserKnn(model)
            userknn_model.fit(df_train, False)
            # userknn_model = UserKnn.load("userknn_tfidf.dill")

        recos = userknn_model.recommend(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)
        print()
        print()



{'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'),
 'Test': 276699,
 'Test items': 6715,
 'Test users': 101983,
 'Train': 4587708,
 'Train items': 15404,
 'Train users': 842129}

0
1
2

3
4



0
1
2

3
4


CPU times: total: 8h 35min 8s
Wall time: 52min 53s


HBox(children=(FloatProgress(value=0.0, max=842129.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=101983.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=842129.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=101983.0), HTML(value='')))

# 👌 Metrics

`Metrics by fold`



## Metric description
- ### <a href="https://rectools.readthedocs.io/en/latest/api/rectools.metrics.novelty.MeanInvUserFreq.html#rectools.metrics.novelty.MeanInvUserFreq"> Mean Inverse User Frequency (novelty)</a>

- ### <a href="https://rectools.readthedocs.io/en/latest/api/rectools.metrics.serendipity.Serendipity.html"> Serendipity = novelty and relevance</a>

In [66]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,MAP@10,novelty,serendipity
0,0,cosine_itemknn,0.020312,0.108145,0.036107,6.682183,5.6e-05
1,0,tfidf_itemknn,0.021802,0.11102,0.037483,6.936814,6.6e-05


In [67]:
df_metrics.to_pickle("df_metrics.pickle")

## Metrics mean by fold
`we can compare two models`

In [68]:
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity,MAP@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cosine_itemknn,0.020312,0.108145,6.682183,5.6e-05,0.036107
tfidf_itemknn,0.021802,0.11102,6.936814,6.6e-05,0.037483


## Metrics std by fold

`If a diff between model metrics less than an std value => there is no significant difference observed`

- For instance, for the serendipity metric there is no such difference between cosine_itemknn and tfidf_itemknn model results

In [69]:
df_metrics.groupby('model').std()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity,MAP@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cosine_itemknn,,,,,
tfidf_itemknn,,,,,


In [70]:
df = df_metrics.groupby('model').mean()[metrics.keys()]

In [71]:
diff = df.loc['cosine_itemknn'] - df.loc['tfidf_itemknn']
diff

prec@10       -0.001489
recall@10     -0.002875
novelty       -0.254631
serendipity   -0.000010
MAP@10        -0.001376
dtype: float64

# Fit

In [72]:
# userknn_model = UserKnn(TFIDFRecommender())
# userknn_model.fit(rec_interactions.df, True)

# Проверка

In [10]:
userknn_model = UserKnn.load("userknn_tfidf.dill")

x = userknn_model.watched.watched_dict[176549]

In [13]:
%%time

for _ in range(0, 1000):
    userknn_model.predict(176549, 10)
print(userknn_model.predict(176549, 10))
print(set(userknn_model.predict(176549, 10)) - set(x))

[14, 24, 101, 142, 341, 512, 657, 849, 1132, 1287]
{512, 1287, 14, 142, 657, 849, 341, 24}
CPU times: total: 609 ms
Wall time: 629 ms


In [75]:
%%time

# for _ in range(0, 1000):
#     userknn_model._predict_userknn(176549, 10)
userknn_model._predict_userknn(176549, 10)



CPU times: total: 0 ns
Wall time: 458 µs


array([ 875, 2015, 3238, 2337, 1061,  229, 1408, 1730,  538,  585])