#  Домашнее задание 3



In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import warnings

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics
from rectools.model_selection import TimeRangeSplitter

from userknn import UserKnn

# Notebook-wide settings: silence all warnings and let pandas show every column
# with cell text up to 200 characters wide.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

  from .autonotebook import tqdm as notebook_tqdm


# Датасет KION 

In [2]:
# %%time
# !wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O ml-1m.zip
# !unzip -o ml-1m.zip
# !rm ml-1m.zip
# Disabled one-off download: fetches the KION dataset archive, unpacks it and deletes the archive.
# NOTE(review): the archive is saved as ml-1m.zip although it holds KION data - the name looks
# like a leftover; also confirm the unpack location matches the ../data_original/ paths read below.

In [3]:
# Fraction of interactions kept for a quick run of the notebook.
# Set to None to run on the full dataset.
SAMPLE_FRAC = 0.01
# Fixed seed so the subsample (and every fold/metric derived from it) is reproducible.
RANDOM_STATE = 42

# Load the raw tables; the interaction columns are renamed to the names rectools expects.
interactions_df = pd.read_csv('../data_original/interactions.csv').rename(
    columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight}
)
users = pd.read_csv('../data_original/users.csv')
items = pd.read_csv('../data_original/items.csv')

# Subsample only when requested, and build the Interactions object exactly once.
if SAMPLE_FRAC is not None:
    interactions_source = interactions_df.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
else:
    interactions_source = interactions_df

# Interactions casts column types and keeps its own DataFrame in `.df`.
interactions = Interactions(interactions_source)

interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
4042379,673874,10275,2021-06-18,1496.0,38.0
4873938,482711,16197,2021-07-31,2519.0,43.0
4655681,286562,15915,2021-05-13,11491.0,100.0
3852255,591189,8771,2021-07-21,29.0,0.0
4896893,654195,10440,2021-07-27,151.0,0.0


# Создаем заготовку рекомендаций для холодных пользователей

In [4]:
# Keep for training only the interactions strictly older than
# (latest interaction date - 7 days); the most recent week is held out.
max_date = interactions.df['datetime'].max()
train_cutoff = max_date - pd.Timedelta(days=7)
is_train = interactions.df['datetime'] < train_cutoff

train = interactions.df[is_train]

In [5]:
# `Dataset` is already imported in the top import cell.
# Interactions-only dataset: no user or item features are supplied.
dataset = Dataset.construct(train, user_features_df=None, item_features_df=None)

In [6]:
from rectools.models.popular import PopularModel

# Popularity baseline fitted on the train dataset; it provides the fallback
# recommendations for cold users prepared in the next cell.
pop = PopularModel()
pop.fit(dataset)

<rectools.models.popular.PopularModel at 0x7f66c09b8df0>

In [7]:
# Top-10 popular items for every user known to the dataset.
pop_recs = pop.recommend(
    dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=False  # True would remove already viewed items from the recommendations
)
# Export user, item and rank. The original selection listed 'item_id' twice,
# which wrote two identical columns and dropped the user identifier.
pop_recs[['user_id', 'item_id', 'rank']].to_csv('../processed_data/popular_10_recs.csv')
pop_recs.head()

Unnamed: 0,user_id,item_id,score,rank
0,673874,10440,1799.0,1
1,673874,15297,1766.0,2
2,673874,9728,1187.0,3
3,673874,13865,1176.0,4
4,673874,4151,812.0,5


# Задаем фолды для кросс-валидации

In [8]:
# Number of cross-validation folds.
N_SPLITS = 10
# Length of each test fold, passed to TimeRangeSplitter as test_size
# ('10D' - the fold borders printed below are ten days apart).
TEST_SIZE = '10D'

In [9]:
# Time-based fold generator: N_SPLITS test windows of TEST_SIZE each.
# NOTE(review): the three filter_* flags presumably drop from each test fold the
# interactions already present in train and the users/items unseen in train -
# confirm against the rectools TimeRangeSplitter documentation.
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [10]:
# Show the (start, end) timestamps of every test fold for this set of interactions.
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-05-15 00:00:00', freq='10D'),
  Timestamp('2021-05-25 00:00:00', freq='10D')),
 (Timestamp('2021-05-25 00:00:00', freq='10D'),
  Timestamp('2021-06-04 00:00:00', freq='10D')),
 (Timestamp('2021-06-04 00:00:00', freq='10D'),
  Timestamp('2021-06-14 00:00:00', freq='10D')),
 (Timestamp('2021-06-14 00:00:00', freq='10D'),
  Timestamp('2021-06-24 00:00:00', freq='10D')),
 (Timestamp('2021-06-24 00:00:00', freq='10D'),
  Timestamp('2021-07-04 00:00:00', freq='10D')),
 (Timestamp('2021-07-04 00:00:00', freq='10D'),
  Timestamp('2021-07-14 00:00:00', freq='10D')),
 (Timestamp('2021-07-14 00:00:00', freq='10D'),
  Timestamp('2021-07-24 00:00:00', freq='10D')),
 (Timestamp('2021-07-24 00:00:00', freq='10D'),
  Timestamp('2021-08-03 00:00:00', freq='10D')),
 (Timestamp('2021-08-03 00:00:00', freq='10D'),
  Timestamp('2021-08-13 00:00:00', freq='10D')),
 (Timestamp('2021-08-13 00:00:00', freq='10D'),
  Timestamp('2021-08-23 00:00:00', freq='10D'))]

## Задаем метрики и модели, по которым будем делать CV

In [11]:
# Metrics computed on every fold: ranking quality (MAP@10) and a
# "beyond accuracy" metric (MeanInvUserFreq@10, reported as novelty).
metrics = {
    'map@10': MAP(k=10),
    'novelty': MeanInvUserFreq(k=10),
}

# Three implicit nearest-neighbour recommenders to compare inside UserKnn.
models = {
    name: recommender_cls()
    for name, recommender_cls in (
        ('cosine_userknn', CosineRecommender),
        ('tfidf_userknn', TFIDFRecommender),
        ('bm25_userknn', BM25Recommender),
    )
}
# A second, independent set of instances under the same keys.
models_fitted = {
    name: recommender_cls()
    for name, recommender_cls in (
        ('cosine_userknn', CosineRecommender),
        ('tfidf_userknn', TFIDFRecommender),
        ('bm25_userknn', BM25Recommender),
    )
}

# Тюнинг моделей Knn

In [12]:
import copy

In [13]:
%%time

results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.df.iloc[train_ids].copy()
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()
    
    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)

        # models_fitted[model_name] = copy.deepcopy(userknn_model)
    
        recos = userknn_model.predict(df_test)
    
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)
        


{'end': Timestamp('2021-05-25 00:00:00', freq='10D'),
 'i_split': 0,
 'start': Timestamp('2021-05-15 00:00:00', freq='10D'),
 'test': 144,
 'test_items': 132,
 'test_users': 136,
 'train': 11106,
 'train_items': 2733,
 'train_users': 10352}


100%|██████████| 10352/10352 [00:00<00:00, 396981.32it/s]
100%|██████████| 10352/10352 [00:00<00:00, 395935.14it/s]
100%|██████████| 10352/10352 [00:00<00:00, 393409.58it/s]



{'end': Timestamp('2021-06-04 00:00:00', freq='10D'),
 'i_split': 1,
 'start': Timestamp('2021-05-25 00:00:00', freq='10D'),
 'test': 139,
 'test_items': 132,
 'test_users': 132,
 'train': 13571,
 'train_items': 2999,
 'train_users': 12578}


100%|██████████| 12578/12578 [00:00<00:00, 385085.59it/s]
100%|██████████| 12578/12578 [00:00<00:00, 410593.72it/s]
100%|██████████| 12578/12578 [00:00<00:00, 360379.50it/s]



{'end': Timestamp('2021-06-14 00:00:00', freq='10D'),
 'i_split': 2,
 'start': Timestamp('2021-06-04 00:00:00', freq='10D'),
 'test': 190,
 'test_items': 163,
 'test_users': 180,
 'train': 16175,
 'train_items': 3256,
 'train_users': 14958}


100%|██████████| 14958/14958 [00:00<00:00, 410325.77it/s]
100%|██████████| 14958/14958 [00:00<00:00, 398402.28it/s]
100%|██████████| 14958/14958 [00:00<00:00, 386605.86it/s]



{'end': Timestamp('2021-06-24 00:00:00', freq='10D'),
 'i_split': 3,
 'start': Timestamp('2021-06-14 00:00:00', freq='10D'),
 'test': 281,
 'test_items': 199,
 'test_users': 271,
 'train': 19841,
 'train_items': 3500,
 'train_users': 18294}


100%|██████████| 18294/18294 [00:00<00:00, 378866.01it/s]
100%|██████████| 18294/18294 [00:00<00:00, 377516.46it/s]
100%|██████████| 18294/18294 [00:00<00:00, 397796.65it/s]



{'end': Timestamp('2021-07-04 00:00:00', freq='10D'),
 'i_split': 4,
 'start': Timestamp('2021-06-24 00:00:00', freq='10D'),
 'test': 291,
 'test_items': 214,
 'test_users': 282,
 'train': 24109,
 'train_items': 3746,
 'train_users': 22150}


100%|██████████| 22150/22150 [00:00<00:00, 391319.03it/s]
100%|██████████| 22150/22150 [00:00<00:00, 398048.97it/s]
100%|██████████| 22150/22150 [00:00<00:00, 364135.98it/s]



{'end': Timestamp('2021-07-14 00:00:00', freq='10D'),
 'i_split': 5,
 'start': Timestamp('2021-07-04 00:00:00', freq='10D'),
 'test': 300,
 'test_items': 218,
 'test_users': 288,
 'train': 28961,
 'train_items': 3976,
 'train_users': 26543}


100%|██████████| 26543/26543 [00:00<00:00, 378555.58it/s]
100%|██████████| 26543/26543 [00:00<00:00, 385972.12it/s]
100%|██████████| 26543/26543 [00:00<00:00, 377549.09it/s]



{'end': Timestamp('2021-07-24 00:00:00', freq='10D'),
 'i_split': 6,
 'start': Timestamp('2021-07-14 00:00:00', freq='10D'),
 'test': 354,
 'test_items': 270,
 'test_users': 343,
 'train': 33423,
 'train_items': 4226,
 'train_users': 30540}


100%|██████████| 30540/30540 [00:00<00:00, 371495.98it/s]
100%|██████████| 30540/30540 [00:00<00:00, 328718.51it/s]
100%|██████████| 30540/30540 [00:00<00:00, 385144.47it/s]



{'end': Timestamp('2021-08-03 00:00:00', freq='10D'),
 'i_split': 7,
 'start': Timestamp('2021-07-24 00:00:00', freq='10D'),
 'test': 422,
 'test_items': 327,
 'test_users': 399,
 'train': 37946,
 'train_items': 4430,
 'train_users': 34532}


100%|██████████| 34532/34532 [00:00<00:00, 366627.61it/s]
100%|██████████| 34532/34532 [00:00<00:00, 377455.77it/s]
100%|██████████| 34532/34532 [00:00<00:00, 377517.75it/s]



{'end': Timestamp('2021-08-13 00:00:00', freq='10D'),
 'i_split': 8,
 'start': Timestamp('2021-08-03 00:00:00', freq='10D'),
 'test': 493,
 'test_items': 380,
 'test_users': 464,
 'train': 43253,
 'train_items': 4656,
 'train_users': 39244}


100%|██████████| 39244/39244 [00:00<00:00, 343755.06it/s]
100%|██████████| 39244/39244 [00:00<00:00, 337566.76it/s]
100%|██████████| 39244/39244 [00:00<00:00, 319211.22it/s]



{'end': Timestamp('2021-08-23 00:00:00', freq='10D'),
 'i_split': 9,
 'start': Timestamp('2021-08-13 00:00:00', freq='10D'),
 'test': 557,
 'test_items': 434,
 'test_users': 535,
 'train': 48751,
 'train_items': 4864,
 'train_users': 44058}


100%|██████████| 44058/44058 [00:00<00:00, 317026.16it/s]
100%|██████████| 44058/44058 [00:00<00:00, 352421.65it/s]
100%|██████████| 44058/44058 [00:00<00:00, 349131.76it/s]


CPU times: user 18.8 s, sys: 2.07 s, total: 20.9 s
Wall time: 12.9 s


# Метрики качества по фолдам 

In [21]:
# One row per (fold, model) with the metric values collected in the CV loop.
df_metrics = pd.DataFrame(results)
# df_metrics

## Metrics mean 


In [22]:
# Average each metric over the folds, separately for every model.
metric_columns = list(metrics)
df_metrics.groupby('model')[metric_columns].mean()

Unnamed: 0_level_0,map@10,novelty
model,Unnamed: 1_level_1,Unnamed: 2_level_1
bm25_userknn,8.7e-05,10.077202
cosine_userknn,8.7e-05,10.023727
tfidf_userknn,8.7e-05,10.038238


In [36]:
# Fit the chosen BM25 variant on all interactions loaded in this notebook.
# NOTE(review): N_users is set to the number of distinct users here, while the
# cross-validation above used N_users=50, so the CV metrics describe a
# different configuration - confirm this is intended.
userknn_model = UserKnn(model=models['bm25_userknn'], N_users=interactions.df.user_id.nunique())
userknn_model.fit(interactions.df)

100%|██████████| 49297/49297 [00:00<00:00, 337314.34it/s]


In [32]:
# Inspect the interactions the final model was fitted on (pandas abbreviates the middle rows).
interactions.df

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
4042379,673874,10275,2021-06-18,1496.0,38.0
4873938,482711,16197,2021-07-31,2519.0,43.0
4655681,286562,15915,2021-05-13,11491.0,100.0
3852255,591189,8771,2021-07-21,29.0,0.0
4896893,654195,10440,2021-07-27,151.0,0.0
...,...,...,...,...,...
2433002,545741,5424,2021-08-09,5269.0,98.0
516893,17099,4151,2021-07-03,3354.0,38.0
5110384,848963,10440,2021-07-08,68962.0,40.0
1559789,236904,4151,2021-08-17,3870.0,13.0


In [37]:
# Run the fitted UserKnn over the same interactions it was trained on
# and store the resulting frame as CSV for later use.
ready_recs = userknn_model.predict(interactions.df)
ready_recs.to_csv('../processed_data/knn_bm25.csv')

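Выводы:
bm25 и cosine выдают более качественные рекомендации, чем tfidf;

при этом bm25 даёт наиболее разнообразные рекомендации (наибольшее значение novelty) при лучшем из имеющихся значений MAP.

Замечание: на показанной выше выборке (1% данных) map@10 у всех трёх моделей совпадает (8.7e-05), поэтому различие в качестве нужно подтверждать на полном датасете.

В итоговом варианте выбираем модель bm25.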

In [19]:
import dill

# Persist the BM25 recommender to disk with dill.
# NOTE(review): this serialises the inner implicit model taken from `models`,
# not the `userknn_model` wrapper fitted above - confirm the consumer of the
# pickle does not need the wrapper. The cell's execution count (19) is also
# lower than that of the final fit (36), so verify the saved file comes from a
# fresh top-to-bottom run.
with open('../models/model_bm25.pkl', 'wb') as f:
    dill.dump(models['bm25_userknn'], f)