In [1]:
%%capture

!pip install implicit
!pip install rectools

In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
from scipy.stats import mode
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import warnings

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics, Precision, Recall
from rectools.model_selection import TimeRangeSplitter

from userknn import UserKnn
from rectools.models import PopularModel
# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")
# Show all columns of a frame and up to 200 characters per cell.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

# Датасет KION

In [3]:
# The URL contains a commit hash, so it points at a fixed revision of the archive.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# Stream the archive to disk in 1 MiB chunks instead of holding it in memory.
# The context managers release the HTTP connection, the output file and the
# progress bar even if the transfer is interrupted.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on 4xx/5xx instead of saving an error page as a zip file.
    req.raise_for_status()
    # Content-Length may be absent; fall back to 0 in that case.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open('kion_train.zip', "wb") as fd, tqdm(
        desc='Downloading the kion dataset...',
        # An unknown size is passed as None so tqdm shows a plain counter.
        total=total_size_in_bytes or None,
        unit='iB', unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

Downloading the kion dataset...:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [4]:
!unzip kion_train.zip -x '__MACOSX/*'

Archive:  kion_train.zip
   creating: data_original/
  inflating: data_original/interactions.csv  
  inflating: data_original/users.csv  
  inflating: data_original/items.csv  


In [2]:
# Read the three tables extracted from the archive.
interactions_df = (
    pd.read_csv('data_original/interactions.csv')
    # Rename the raw columns to the names defined by rectools' Columns.
    .rename(columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight})
)
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

# Interactions casts the column types and keeps its own DataFrame in `.df`.
interactions = Interactions(interactions_df)

# To run this notebook quickly, uncomment the next line: it keeps a 1% sample of the data.
#interactions = Interactions(interactions_df.sample(frac=0.01))

## Задаем фолды для кросс-валидации

In [3]:
def cross_validation(models, metrics, splitter, interactions, n_users=50, verbose=True):
    """Cross-validate user-kNN recommenders over the folds of ``splitter``.

    For every fold each model in ``models`` is wrapped in ``UserKnn``, fitted
    on the train part, asked for recommendations for the test part and scored
    with ``calc_metrics``.

    Parameters
    ----------
    models : dict
        Maps a model name to the object passed to ``UserKnn(model=...)``.
    metrics : dict
        Maps a metric name to a metric object; forwarded to ``calc_metrics``.
    splitter
        Object whose ``split(interactions, collect_fold_stats=True)`` yields
        ``(train_ids, test_ids, fold_info)`` triples; the ids are used as
        positional row indices into ``interactions.df``.
    interactions
        Object exposing the interactions table as ``interactions.df``.
    n_users : int, default 50
        Forwarded to ``UserKnn`` as ``N_users``.
    verbose : bool, default True
        When true, print the fold statistics and every model's recommendations.

    Returns
    -------
    pandas.DataFrame
        One row per model name (index ``model``) with the mean over folds of
        each column: the metrics and also the ``fold`` number itself, which is
        kept only so the shape of the result stays unchanged.
    """
    results = []
    fold_iterator = splitter.split(interactions, collect_fold_stats=True)

    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        if verbose:
            print(f"\n==================== Fold {i_fold}")
            pprint(fold_info)

        df_train = interactions.df.iloc[train_ids].copy()
        # Only the (user, item) pairs are needed as ground truth for the metrics.
        df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

        # Items known at train time; passed to calc_metrics as the catalog.
        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            userknn_model = UserKnn(model=model, N_users=n_users)
            userknn_model.fit(df_train)

            recos = userknn_model.predict(df_test)
            if verbose:
                print(recos)

            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            results.append({"fold": i_fold, "model": model_name, **metric_values})

    return pd.DataFrame(results).groupby("model").mean()

In [4]:
# Number of test folds requested from the splitter.
N_SPLITS = 7
# Length of each test fold (14 days), passed to the splitter as test_size.
TEST_SIZE = '14D'

In [5]:
# Time-based splitter producing N_SPLITS test windows of TEST_SIZE each.
splitter = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    # NOTE(review): per the rectools docs these flags control whether test
    # pairs already seen in train, and test items/users absent from train,
    # are removed from the test part — confirm against the installed version.
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=False,
)

In [6]:
# Sanity check: show the (start, end) timestamps of every test fold.
splitter.get_test_fold_borders(interactions)

[(Timestamp('2021-05-17 00:00:00', freq='14D'),
  Timestamp('2021-05-31 00:00:00', freq='14D')),
 (Timestamp('2021-05-31 00:00:00', freq='14D'),
  Timestamp('2021-06-14 00:00:00', freq='14D')),
 (Timestamp('2021-06-14 00:00:00', freq='14D'),
  Timestamp('2021-06-28 00:00:00', freq='14D')),
 (Timestamp('2021-06-28 00:00:00', freq='14D'),
  Timestamp('2021-07-12 00:00:00', freq='14D')),
 (Timestamp('2021-07-12 00:00:00', freq='14D'),
  Timestamp('2021-07-26 00:00:00', freq='14D')),
 (Timestamp('2021-07-26 00:00:00', freq='14D'),
  Timestamp('2021-08-09 00:00:00', freq='14D')),
 (Timestamp('2021-08-09 00:00:00', freq='14D'),
  Timestamp('2021-08-23 00:00:00', freq='14D'))]

In [7]:
# watched_count = df_test.groupby(['user_id'])['item_id'].unique().reset_index()
# watched_count['count_films'] = watched_count['item_id'].apply(lambda x: len(x))
# watched_count.sort_values(['count_films'], ascending=False)

## Задаем метрики и модели, по которым будем делать CV

In [8]:
# Metrics passed to the cross-validation; every one of them uses k=10.
metrics = {
    'map@10': MAP(k=10),
    'novelty@10': MeanInvUserFreq(k=10),
    'precision@10': Precision(k=10),
    'recall@10': Recall(k=10),
}

# Candidate models from implicit; only TF-IDF is enabled here —
# uncomment the other entries to include them in the comparison.
models = {
    #'cosine_userknn': CosineRecommender(), # implicit
    'tfidf_userknn': TFIDFRecommender(),
    #'bm25_userknn': BM25Recommender()
}

# Метрики качества по фолдам

In [9]:
%%time

# Run the cross-validation for every model; the recorded run on the full
# dataset took about 2h40m of wall time, so expect this cell to be slow.
df_metrics = cross_validation(models, metrics, splitter, interactions)
df_metrics


{'end': Timestamp('2021-05-31 00:00:00', freq='14D'),
 'i_split': 0,
 'start': Timestamp('2021-05-17 00:00:00', freq='14D'),
 'test': 325465,
 'test_items': 6656,
 'test_users': 124224,
 'train': 1163422,
 'train_items': 12451,
 'train_users': 272611}


  0%|          | 0/272611 [00:00<?, ?it/s]

         user_id item_id score  rank
0        1097551   10440  -inf     1
4        1097551    4151  -inf     2
7        1097551    4880  -inf     3
8        1097551     142  -inf     4
9        1097551    6809  -inf     5
...          ...     ...   ...   ...
2104750        2   15297   NaN     6
2104751        2    9728   NaN     7
2104752        2   13865   NaN     8
2104754        2    3734   NaN     9
2104755        2    2657   NaN    10

[1242240 rows x 4 columns]

{'end': Timestamp('2021-06-14 00:00:00', freq='14D'),
 'i_split': 1,
 'start': Timestamp('2021-05-31 00:00:00', freq='14D'),
 'test': 407766,
 'test_items': 7061,
 'test_users': 162372,
 'train': 1511667,
 'train_items': 13084,
 'train_users': 340978}


  0%|          | 0/340978 [00:00<?, ?it/s]

         user_id item_id     score  rank
0        1097548   10440      -inf     1
1        1097548   15297      -inf     2
4        1097548    4151      -inf     3
6        1097548    2657      -inf     4
7        1097548    4880      -inf     5
...          ...     ...       ...   ...
2665942        2    4436  3.016876     6
2665944        2    7107  2.648447     7
2665945        2    4880  2.372415     8
2665946        2   10440      -inf     9
2665947        2   15297      -inf    10

[1623720 rows x 4 columns]

{'end': Timestamp('2021-06-28 00:00:00', freq='14D'),
 'i_split': 2,
 'start': Timestamp('2021-06-14 00:00:00', freq='14D'),
 'test': 573256,
 'test_items': 7169,
 'test_users': 212510,
 'train': 1979424,
 'train_items': 13649,
 'train_users': 439529}


  0%|          | 0/439529 [00:00<?, ?it/s]

         user_id item_id     score  rank
0        1097555    6916  5.524388     1
2        1097555    4662  4.905972     2
3        1097555     140  4.537893     3
6        1097555    6939  3.162708     4
16       1097555    9091  3.010074     5
...          ...     ...       ...   ...
3546271        2    3541  2.525025     6
3546266        2   16029  2.507703     7
3546260        2    4475  2.507529     8
3546263        2   15266  2.470475     9
3546270        2     565  2.457829    10

[2125100 rows x 4 columns]

{'end': Timestamp('2021-07-12 00:00:00', freq='14D'),
 'i_split': 3,
 'start': Timestamp('2021-06-28 00:00:00', freq='14D'),
 'test': 622283,
 'test_items': 7459,
 'test_users': 231774,
 'train': 2582489,
 'train_items': 14107,
 'train_users': 543840}


  0%|          | 0/543840 [00:00<?, ?it/s]

         user_id item_id score  rank
0        1097556   10440  -inf     1
1        1097556   15297  -inf     2
2        1097556    9728  -inf     3
3        1097556   13865  -inf     4
4        1097556    4151  -inf     5
...          ...     ...   ...   ...
4014276        4    4151  -inf     6
4014277        4    3734  -inf     7
4014278        4    2657  -inf     8
4014279        4    4880  -inf     9
4014280        4     142  -inf    10

[2317740 rows x 4 columns]

{'end': Timestamp('2021-07-26 00:00:00', freq='14D'),
 'i_split': 4,
 'start': Timestamp('2021-07-12 00:00:00', freq='14D'),
 'test': 637836,
 'test_items': 7851,
 'test_users': 216920,
 'train': 3239125,
 'train_items': 14730,
 'train_users': 646423}


  0%|          | 0/646423 [00:00<?, ?it/s]

         user_id item_id score  rank
0        1097557   10440  -inf     1
1        1097557   15297  -inf     2
2        1097557    9728  -inf     3
3        1097557   13865  -inf     4
4        1097557    4151  -inf     5
...          ...     ...   ...   ...
4075263        0    3734  -inf     6
4075264        0    2657  -inf     7
4075265        0    4880  -inf     8
4075266        0     142  -inf     9
4075267        0    6809  -inf    10

[2169200 rows x 4 columns]

{'end': Timestamp('2021-08-09 00:00:00', freq='14D'),
 'i_split': 5,
 'start': Timestamp('2021-07-26 00:00:00', freq='14D'),
 'test': 726066,
 'test_items': 8191,
 'test_users': 241149,
 'train': 3892558,
 'train_items': 15085,
 'train_users': 742256}


  0%|          | 0/742256 [00:00<?, ?it/s]

         user_id item_id     score  rank
0        1097557   15297  3.344003     1
1        1097557   10440      -inf     2
2        1097557    9728      -inf     3
3        1097557   13865      -inf     4
4        1097557    4151      -inf     5
...          ...     ...       ...   ...
4722085        3    9070  3.529739     6
4722089        3    6455  3.173744     7
4722076        3    4436  3.119949     8
4722087        3    6443  2.987237     9
4722090        3   16509  2.978089    10

[2411490 rows x 4 columns]

{'end': Timestamp('2021-08-23 00:00:00', freq='14D'),
 'i_split': 6,
 'start': Timestamp('2021-08-09 00:00:00', freq='14D'),
 'test': 787191,
 'test_items': 8115,
 'test_users': 257877,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}


  0%|          | 0/850489 [00:00<?, ?it/s]

         user_id item_id score  rank
0        1097549   10440  -inf     1
1        1097549   15297  -inf     2
2        1097549    9728  -inf     3
3        1097549   13865  -inf     4
4        1097549    4151  -inf     5
...          ...     ...   ...   ...
5388349        1    3734  -inf     6
5388350        1    2657  -inf     7
5388351        1    4880  -inf     8
5388352        1     142  -inf     9
5388353        1    6809  -inf    10

[2578770 rows x 4 columns]
CPU times: user 3h 10min 21s, sys: 3min 9s, total: 3h 13min 30s
Wall time: 2h 39min 17s


Unnamed: 0_level_0,fold,precision@10,recall@10,map@10,novelty@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tfidf_userknn,3.0,0.049031,0.277276,0.123579,6.704977


## Metrics mean


In [10]:
# cross_validation already returns one row of fold-averaged values per model,
# so a second groupby/mean is redundant: just select the metric columns
# (this drops the averaged "fold" column, which carries no meaning).
df_metrics[list(metrics)]

Unnamed: 0_level_0,map@10,novelty@10,precision@10,recall@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tfidf_userknn,0.123579,6.704977,0.049031,0.277276


In [11]:
# Refit the TF-IDF user-kNN on all interactions (no hold-out) for export;
# N_users=50 is the same value that was used inside cross_validation.
userknn_model = UserKnn(model=TFIDFRecommender(), N_users=50)
userknn_model.fit(interactions.df)

  0%|          | 0/962179 [00:00<?, ?it/s]

In [12]:
import pickle

In [53]:
# NOTE(review): absolute path that looks Colab-specific — adjust when running elsewhere.
MODEL_PATH = "/content/user_knn.pkl"

# A context manager guarantees the file is flushed and closed before it is read back.
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(userknn_model, model_file)

In [14]:
# import lzma

In [15]:
# with lzma.open("user_knn.xz", "wb") as file:
#     pickle.dump(userknn_model, file)

In [54]:
# Round-trip check: reload the model that was just written, closing the file
# afterwards. pickle.load can execute arbitrary code, so only load files that
# this notebook produced itself.
with open(MODEL_PATH, "rb") as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Smoke-test the reloaded model; presumably 700 is a user_id — confirm against UserKnn.recommend.
pickled_model.recommend(700)

In [None]:
# from logging import exception
# for user in users['user_id'].unique():
#   try:
#     all_users['item_id'] = all_users['user_id'].apply(lambda x: pickled_model.recommend(x))
#   except:
#     print(1)