In [1]:
import warnings
from itertools import islice
from pprint import pprint

# Silence library warnings before the heavy third-party imports below.
warnings.filterwarnings("ignore")

import pandas as pd
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
from rectools import Columns
from rectools.dataset import Interactions, IdMap
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP, calc_metrics
from rectools.model_selection import TimeRangeSplitter

from service.rec_models.userknn import UserKnn

# Notebook-wide pandas display settings: show every column and allow long cell text.
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_colwidth', 200)):
    pd.set_option(option_name, option_value)

# EDA

In [11]:
# Load the three raw tables; they all live in the same dataset directory.
data_dir = '../datasets/kion_train'
interactions = pd.read_csv(f'{data_dir}/interactions.csv')
users = pd.read_csv(f'{data_dir}/users.csv')
items = pd.read_csv(f'{data_dir}/items.csv')

In [12]:
# Rename the raw columns to the rectools column names, then parse the timestamp.
# `rename` returns a new frame instead of mutating in place, and the datetime
# column is addressed through `Columns.Datetime` in both statements so the rename
# target and the parsed column cannot drift apart.
interactions = interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)

interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

## interactions

In [6]:
# Preview both ends of the interactions table.
first_rows, last_rows = interactions.head(), interactions.tail()
pd.concat([first_rows, last_rows])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [7]:
# Size of the interaction log and how many distinct users / items it covers.
print(f"Interactions dataframe shape: {interactions.shape}")

unique_users = interactions['user_id'].nunique()
print(f"Unique users in interactions: {unique_users:_}")

unique_items = interactions['item_id'].nunique()
print(f"Unique items in interactions: {unique_items:_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [8]:
# Date span covered by the interaction log.
watch_dates = interactions['datetime']
min_date = watch_dates.min()
max_date = watch_dates.max()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [9]:
# Column dtypes and memory footprint of the interactions frame.
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


## users

In [10]:
# Preview both ends of the users table.
first_users, last_users = users.head(), users.tail()
pd.concat([first_users, last_users])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,,,,0
840195,590706,,,Ж,0
840196,166555,age_65_inf,income_20_40,Ж,0


In [11]:
# Shape of the users table and number of distinct user ids in it.
print(f"Users dataframe shape {users.shape}")

unique_user_ids = users['user_id'].nunique()
print(f"Unique users: {unique_user_ids:_}")

Users dataframe shape (840197, 5)
Unique users: 840_197


## items

In [12]:
# Preview the first and last two items, restricted to a few readable columns.
preview_columns = ["item_id", "content_type", "title", "release_year"]
pd.concat([items.head(2), items.tail(2)])[preview_columns]

Unnamed: 0,item_id,content_type,title,release_year
0,10711,film,Поговори с ней,2002.0
1,2508,film,Голые перцы,2014.0
15961,4538,series,Среди камней,2019.0
15962,3206,series,Гоша,2019.0


In [13]:
# Shape of the items table and number of distinct item ids in it.
print(f"Items dataframe shape {items.shape}")

unique_item_ids = items['item_id'].nunique()
print(f"Unique item_id: {unique_item_ids:_}")

Items dataframe shape (15963, 14)
Unique item_id: 15_963


## IdMap

In [13]:
# Build one rectools IdMap per id column, from the user and item ids that
# actually occur in the interactions.
userMap, itemMap = (
    IdMap.from_values(interactions[id_column])
    for id_column in (Columns.User, Columns.Item)
)

# userkNN model CV

Compare implicit `CosineRecommender` and `TFIDFRecommender` as an ItemKnn base

In [6]:
# Cross-validation settings: fold count and the length of one test fold.
n_folds = 7  # due to fit time only the first 2 folds will be used
n_units, unit = 1, "W"

# The CV window ends at the last interaction date (time-of-day stripped) and
# starts `n_folds * n_units + 1` units earlier.
last_date = interactions[Columns.Datetime].max().normalize()
cv_window = pd.Timedelta(n_folds * n_units + 1, unit=unit)
start_date = last_date - cv_window
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-06-27 00:00:00'), Timestamp('2021-08-22 00:00:00'))


### Test fold borders

In [15]:
# `TimeRangeSplitter` is imported in the notebook's top import cell.

# n_folds test folds need n_folds + 1 border dates, spaced one fold length apart.
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

# Wrap the raw frame into a rectools Interactions object using the id maps built earlier.
rec_interactions = Interactions.from_raw(interactions, userMap, itemMap)
print(f"Real number of folds: {cv.get_n_splits(rec_interactions)}")

start_date: 2021-06-27 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 1W

Test fold borders: ['2021-06-27' '2021-07-04' '2021-07-11' '2021-07-18' '2021-07-25'
 '2021-08-01' '2021-08-08' '2021-08-15']
Real number of folds: 7


### Validation parameters and metrics

In [13]:
# The metric classes and `calc_metrics` are imported in the notebook's top import cell.

# Metrics reported for every run, all computed at cut-off k=10.
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
    "MAP@10": MAP(k=10),
}

# few simple models to compare
# NOTE(review): both base models are created with implicit's default settings.
# If UserKnn does not override the base model's neighbour count `K`, values of
# `N_users` above that default may have no effect (the reported metrics for
# N_users=50 and N_users=80 are identical) — confirm against UserKnn.
models = {
    "cosine_itemknn": CosineRecommender(),
    "tfidf_itemknn": TFIDFRecommender(),
}

# Values passed to UserKnn as `N_users`.
user_count = [50, 80]

# Values passed to UserKnn as `popularity`.
popularities = ["n_users", "n_interactions", "mean_weight", "sum_weight"]

## Model training by fold

`Training UserKNN model with different parameters`

In [18]:
%%time

results = []

fold_iterator = islice(cv.split(rec_interactions, collect_fold_stats=True), 2)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = rec_interactions.df.iloc[train_ids].copy()
    df_test = rec_interactions.df.iloc[test_ids].copy()

    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        for N_users in user_count:
            for popularity in popularities:
                userknn_model = UserKnn(model, popularity=popularity, N_users=N_users)
                userknn_model.fit(df_train, False)

                recos = userknn_model.recommend(df_test)

                metric_values = calc_metrics(
                    metrics,
                    reco=recos,
                    interactions=df_test,
                    prev_interactions=df_train,
                    catalog=catalog,
                )

                fold = {"fold": i_fold,
                        "model": model_name,
                        "N_users": N_users,
                        "popularity": popularity}
                fold.update(metric_values)
                results.append(fold)



{'End date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-06-27 00:00:00', freq='W-SUN'),
 'Test': 237414,
 'Test items': 5947,
 'Test users': 98930,
 'Train': 2533586,
 'Train items': 14092,
 'Train users': 536802}

























{'End date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'),
 'Test': 211146,
 'Test items': 6209,
 'Test users': 86167,
 'Train': 2886800,
 'Train items': 14357,
 'Train users': 595902}
























CPU times: total: 3d 1h 17min 17s
Wall time: 7h 33min 11s


HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=536802.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595902.0), HTML(value='')))

# 👌 Metrics

## Metric description
- ### <a href="https://rectools.readthedocs.io/en/latest/api/rectools.metrics.novelty.MeanInvUserFreq.html#rectools.metrics.novelty.MeanInvUserFreq"> Mean Inverse User Frequency (novelty)</a>

- ### <a href="https://rectools.readthedocs.io/en/latest/api/rectools.metrics.serendipity.Serendipity.html"> Serendipity = novelty and relevance</a>

In [10]:
# Turn the collected result rows into a frame and persist it next to the notebook.
metrics_pickle_path = "df_metrics.pickle"

df_metrics = pd.DataFrame(results)
df_metrics.to_pickle(metrics_pickle_path)

## Metrics mean by fold
`We can compare two models`

In [14]:
# Average every metric over the folds for each (model, N_users, popularity) combination.
group_keys = ['model', 'N_users', 'popularity']
metric_columns = list(metrics)
df_metrics.groupby(group_keys).mean()[metric_columns]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prec@10,recall@10,novelty,serendipity,MAP@10
model,N_users,popularity,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cosine_itemknn,50,mean_weight,0.005875,0.032338,11.891396,3.8e-05,0.00573
cosine_itemknn,50,n_interactions,0.039523,0.240064,6.313828,3.8e-05,0.074456
cosine_itemknn,50,n_users,0.039523,0.240064,6.313828,3.8e-05,0.074456
cosine_itemknn,50,sum_weight,0.03487,0.215191,6.62811,4.1e-05,0.073966
cosine_itemknn,80,mean_weight,0.005875,0.032338,11.891396,3.8e-05,0.00573
cosine_itemknn,80,n_interactions,0.039523,0.240064,6.313828,3.8e-05,0.074456
cosine_itemknn,80,n_users,0.039523,0.240064,6.313828,3.8e-05,0.074456
cosine_itemknn,80,sum_weight,0.03487,0.215191,6.62811,4.1e-05,0.073966
tfidf_itemknn,50,mean_weight,0.009276,0.052973,11.775528,3.9e-05,0.009889
tfidf_itemknn,50,n_interactions,0.041433,0.24297,6.518878,4e-05,0.076768


`Increasing the number of nearest neighbours does not affect the metric values.
That's why N_users = 50 is sufficient.`

In [15]:
# Keep only the N_users == 50 runs. This rebinds `df_metrics`, so every later
# cell works on the filtered frame.
is_n50 = df_metrics['N_users'] == 50
df_metrics = df_metrics.loc[is_n50]

# Average every metric over the folds for each (model, popularity) combination.
df_metrics.groupby(['model', 'popularity']).mean()[list(metrics)]

Unnamed: 0_level_0,Unnamed: 1_level_0,prec@10,recall@10,novelty,serendipity,MAP@10
model,popularity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cosine_itemknn,mean_weight,0.005875,0.032338,11.891396,3.8e-05,0.00573
cosine_itemknn,n_interactions,0.039523,0.240064,6.313828,3.8e-05,0.074456
cosine_itemknn,n_users,0.039523,0.240064,6.313828,3.8e-05,0.074456
cosine_itemknn,sum_weight,0.03487,0.215191,6.62811,4.1e-05,0.073966
tfidf_itemknn,mean_weight,0.009276,0.052973,11.775528,3.9e-05,0.009889
tfidf_itemknn,n_interactions,0.041433,0.24297,6.518878,4e-05,0.076768
tfidf_itemknn,n_users,0.041433,0.24297,6.518878,4e-05,0.076768
tfidf_itemknn,sum_weight,0.036995,0.219179,6.817797,4.2e-05,0.076404


`MAP is the most important metric: it is the score we want to increase.`

`The 'n_users' and 'n_interactions' popularity strategies combined with the
TF-IDF KNN model show better results than the rest. 'n_users' was chosen as the default variant.`

`However, the best MAP@10 result comes with decreased novelty.`

## Metrics std by fold

`If the difference between model metrics is less than the std value =>
no significant difference is observed`

- For instance, for the serendipity metric there is no such difference between cosine_itemknn and tfidf_itemknn model results

In [16]:
# Spread of every metric across the folds for each (model, popularity) combination.
metric_columns = list(metrics)
df_metrics.groupby(['model', 'popularity']).std()[metric_columns]

Unnamed: 0_level_0,Unnamed: 1_level_0,prec@10,recall@10,novelty,serendipity,MAP@10
model,popularity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cosine_itemknn,mean_weight,6e-05,0.000372,0.279949,4e-06,8.8e-05
cosine_itemknn,n_interactions,0.007132,0.050245,0.047737,4e-06,0.004218
cosine_itemknn,n_users,0.007132,0.050245,0.047737,4e-06,0.004218
cosine_itemknn,sum_weight,0.006825,0.050053,0.019884,3e-06,0.012427
tfidf_itemknn,mean_weight,8.5e-05,0.001344,0.278584,5e-06,0.000302
tfidf_itemknn,n_interactions,0.006741,0.049426,0.088305,5e-06,0.004174
tfidf_itemknn,n_users,0.006741,0.049426,0.088305,5e-06,0.004174
tfidf_itemknn,sum_weight,0.006441,0.04851,0.058318,4e-06,0.012314


`The std values for the combinations of interest are low enough
to keep the previously chosen combination as the most desirable.`

# Fit

In [2]:
# Final model: TF-IDF base model with UserKnn's remaining settings left at their
# defaults, fitted on all interactions with `save=True`.
base_model = TFIDFRecommender()
userknn_model = UserKnn(base_model)
userknn_model.fit(rec_interactions.df, save=True)

# Check

In [3]:
%%time

for _ in range(0, 1000):
    userknn_model.predict(176549, 10)

CPU times: total: 641 ms
Wall time: 613 ms


In [4]:
# Call the private userknn prediction helper directly for user 176549, asking for 10 items.
userknn_model._predict_userknn(176549, 10)

array([2777, 5297,  875, 2015,  948,  806, 3038, 1407, 4340, 2337])

In [6]:
# Batch-style recommendation for a single user.
# NOTE(review): the item_ids shown for this call differ from the predict() output
# for the same user — presumably a different id space or code path; confirm.
single_user = pd.DataFrame({"user_id": [176549]})
userknn_model.recommend(single_user)

Unnamed: 0,user_id,item_id,score,rank
0,176549,25,202457.0,1
1,176549,21,193123.0,2
2,176549,32,132865.0,3
3,176549,16,122119.0,4
4,176549,174,91167.0,5
5,176549,84,74803.0,6
6,176549,93,68581.0,7
7,176549,142,55043.0,8
8,176549,370,45367.0,9
9,176549,122,40372.0,10


In [5]:
# Public single-user prediction: 10 items for user 176549.
userknn_model.predict(176549, 10)


[2777, 5297, 875, 2015, 948, 806, 3038, 1407, 4340, 2337]