In [120]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Homework

### MRR
Исходные данные - результат `generate_subsample` 

**Задача** - по аналогии с precision написать три версии функции подсчета Mean Reciprocal Rank (naive, numba, pandas) и протестировать на разных размерах выборки
- Протестируйте для всех комбинаций (users_count, top_k):
  - users_count - [100, 1000, 10000, 100000]
  - top_k - [10, 50, 100]
- Результатом тестирования должен быть график, где будут отражены следующие показатели:
  - Алгоритм - naive, numba, pandas
  - Скорость работы (время)
  - users_count
  - top_k

In [121]:
# !pip install rectools

In [122]:
import pandas as pd
import numpy as np
import numba as nb
import plotly.express as px
import timeit

from rectools import Columns

In [123]:
# Seed NumPy's global RNG: the np.random.choice sampling below is repeatable
# as long as the cells are executed in the same order.
np.random.seed(23)

# Directory on the mounted Drive that holds the CSV files read below.
PATH = '/content/drive/MyDrive/kion_train/'

In [124]:
# Load the interaction log and map its column names onto the rectools
# `Columns` constants in one chained expression.
interactions = (
    pd.read_csv(PATH + 'interactions.csv')
    .rename(columns={
        'track_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    })
)

# Parse the timestamp column into datetime64 values.
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

In [125]:
# User and item metadata tables.
# NOTE(review): nothing in the cells shown here reads these frames, and the name
# `users` is later rebound to the sampled user ids returned by generate_subsample.
users = pd.read_csv(PATH + 'users.csv')
items = pd.read_csv(PATH + 'items.csv')

In [126]:
def generate_subsample(users_count, top_k):
    """Draw a random user subsample and random recommendations for it.

    Reads the module-level ``interactions`` frame and NumPy's global RNG.

    Parameters
    ----------
    users_count : int
        Number of distinct users to sample (without replacement).
    top_k : int
        Number of recommended items generated per user.

    Returns
    -------
    df : pandas.DataFrame
        Interactions of the sampled users with the datetime, weight and
        ``watched_pct`` columns removed.
    users : numpy.ndarray
        The sampled user ids.
    recs : numpy.ndarray
        Array of shape ``(users_count, top_k)`` with item ids drawn (with
        replacement) from the item column of ``df``.
    """
    all_users = interactions[Columns.User].unique()
    users = np.random.choice(all_users, users_count, replace=False)

    in_sample = interactions[Columns.User].isin(users)
    df = interactions[in_sample].reset_index(drop=True)
    for unused_column in (Columns.Datetime, Columns.Weight, 'watched_pct'):
        del df[unused_column]

    recs_shape = (users_count, top_k)
    recs = np.random.choice(df[Columns.Item], size=recs_shape)
    return df, users, recs

In [127]:
def mrr_naive(target, users, recs, k):
    """Mean Reciprocal Rank at ``k`` computed with plain Python loops.

    Parameters
    ----------
    target : 2-D array
        Ground-truth interactions; column 0 holds user ids, column 1 item ids.
    users : sequence
        User ids; ``recs[i]`` holds the recommendations for ``users[i]``.
    recs : 2-D array
        Recommended item ids, best first.
    k : int
        Cut-off: only the first ``k`` recommendations of every user are scored.

    Returns
    -------
    float
        Mean over ``users`` of ``1 / rank`` of the first relevant item among the
        top ``k`` recommendations. Users without a hit contribute 0; the result
        is ``0.0`` when ``users`` is empty.
    """
    n_users = len(users)
    if n_users == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    rr_sum = 0.0
    for i, user in enumerate(users):
        # Items this user actually interacted with.
        user_target = target[target[:, 0] == user][:, 1]
        # Honour the cut-off: look at the first k recommendations only.
        for rank, rec in enumerate(recs[i][:k], start=1):
            if rec in user_target:
                rr_sum += 1 / rank
                break  # only the first (best-ranked) hit counts
    return rr_sum / n_users

In [128]:
@nb.njit(cache=True, parallel=True)
def mrr_numba(target, users, recs, k):
    """Mean Reciprocal Rank at ``k``, JIT-compiled with numba and run in parallel over users.

    Parameters
    ----------
    target : 2-D array
        Ground-truth interactions; column 0 holds user ids, column 1 item ids.
    users : 1-D array
        User ids; ``recs[i]`` holds the recommendations for ``users[i]``.
    recs : 2-D array
        Recommended item ids, best first.
    k : int
        Cut-off: only the first ``k`` recommendations of every user are scored.

    Returns
    -------
    float
        Mean over ``users`` of ``1 / rank`` of the first relevant item among the
        top ``k`` recommendations. Users without a hit contribute 0; the result
        is ``0.0`` when ``users`` is empty.
    """
    n_users = len(users)
    if n_users == 0:
        # Nothing to average over; avoid taking the mean of an empty array.
        return 0.0

    # One slot per user, so parallel iterations never write to the same element.
    rr = np.zeros(n_users)
    for i in nb.prange(n_users):
        user = users[i]
        # Items this user actually interacted with.
        user_target = target[target[:, 0] == user][:, 1]
        # Honour the cut-off: look at the first k recommendations only.
        for rank, rec in enumerate(recs[i][:k]):
            if rec in user_target:
                rr[i] = 1 / (rank + 1)
                break  # only the first (best-ranked) hit counts
    return rr.mean()

In [129]:
def mrr_pandas(df, users, recs, k):
    """Mean Reciprocal Rank at ``k`` computed with vectorised pandas operations.

    Parameters
    ----------
    df : pandas.DataFrame
        Ground-truth interactions; must contain the ``Columns.UserItem`` columns.
        The frame is only read, never modified.
    users : array-like
        User ids; row ``i`` of ``recs`` holds the recommendations for ``users[i]``.
    recs : 2-D array-like
        Recommended item ids, best first.
    k : int
        Cut-off: only the first ``k`` recommendations of every user are scored.

    Returns
    -------
    float
        Sum over users of ``1 / rank`` of the best-ranked relevant item, divided
        by ``len(users)``. Users without a hit add 0; the result is ``0.0`` when
        ``users`` is empty.
    """
    n_users = len(users)
    if n_users == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    # Honour the cut-off even when `recs` carries more than k columns.
    top_recs = np.asarray(recs)[:, :k]
    df_recs = pd.DataFrame({
        Columns.User: np.repeat(users, top_recs.shape[1]),
        Columns.Item: top_recs.ravel(),
    })
    # Rows of one user are contiguous and ordered best-first, so the running
    # count inside each user group is the 1-based rank.
    df_recs[Columns.Rank] = df_recs.groupby(Columns.User).cumcount() + 1

    # Inner join keeps only the recommended items the user really interacted
    # with; the caller's `df` is left untouched.
    hits = df_recs.merge(df[Columns.UserItem], how='inner', on=Columns.UserItem)
    hits['rr'] = 1 / hits[Columns.Rank]
    # Best reciprocal rank per user; users without hits are absent and add 0.
    return hits.groupby(Columns.User)['rr'].max().sum() / n_users

In [130]:
# Sample used for the correctness check and the %timeit cells below.
top_k = 10
df, users, recs = generate_subsample(10000, top_k)
# Plain 2-D array of df's remaining columns; the naive/numba versions index it
# positionally (column 0 as user, column 1 as item).
target = df.values

In [131]:
# MRR from the pure-Python implementation (reference value).
mrr_naive(target, users, recs, top_k)

0.07836801587301619

In [132]:
# MRR from the numba implementation; this first call also triggers JIT compilation.
mrr_numba(target, users, recs, top_k)

0.07836801587301577

In [133]:
# MRR from the pandas implementation; it takes the DataFrame, not the raw array.
mrr_pandas(df, users, recs, top_k)

0.07836801587301587

Результаты метрик совпадают с точностью до погрешности округления (различия появляются лишь в 15-м знаке после запятой).

Посмотрим время работы:

In [134]:
# Time the pure-Python implementation on the 10 000-user sample.
%timeit mrr_naive(target, users, recs, top_k)

2.86 s ± 1.57 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [135]:
# Time the numba implementation on the same sample.
%timeit mrr_numba(target, users, recs, top_k)

1.13 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [136]:
# Time the pandas implementation on the same sample.
%timeit mrr_pandas(df, users, recs, top_k)

58.4 ms ± 7.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Результаты предсказуемы: на 10 000 пользователей и top_k = 10 pandas сильно выигрывает по скорости работы (≈58 мс против ≈1,1 с у numba и ≈2,9 с у naive).

In [137]:
# Benchmark grid: sample sizes, cut-offs and the labels of the compared implementations.
# NOTE: `top_k` is rebound here from the single cut-off used above to a list of cut-offs.
users_count = [100, 1000, 10000, 100000]
top_k = [10, 50, 100]
algs = ['naive', 'numba', 'pandas']

In [149]:
# Time every MRR implementation on each (users_count, top_k) combination.
#
# The label lists (`count`, `top`, `algs`) are appended to in lockstep with
# `time`, so the four lists always have equal length, follow the grid
# automatically, and re-running this cell yields the same result (the label
# list is rebuilt from scratch instead of being multiplied in place).
n = 5  # calls per measurement; the stored value is the mean time of one call, in seconds

time, count, top, algs = [], [], [], []

for user_count in users_count:
    for k in top_k:
        df, users, recs = generate_subsample(user_count, k)
        target = df.values
        # The lambdas are invoked within this iteration, so they see the
        # current df / target / users / recs / k.
        candidates = (
            ('naive', lambda: mrr_naive(target, users, recs, k)),
            ('numba', lambda: mrr_numba(target, users, recs, k)),
            ('pandas', lambda: mrr_pandas(df, users, recs, k)),
        )
        for alg_name, run in candidates:
            time.append(timeit.timeit(run, number=n) / n)
            count.append(user_count)
            top.append(k)
            algs.append(alg_name)

In [150]:
# Collect the measurements into one long-format table, one row per
# (top_k, user_count, algorithm) measurement.
# The column name 'algoritm' is spelled exactly as the plotting cell expects it.
res = pd.DataFrame({
    column: np.array(values)
    for column, values in (
        ('top_k', top),
        ('user_count', count),
        ('algoritm', algs),
        ('time', time),
    )
})

In [151]:
# Show the collected timings.
res

Unnamed: 0,top_k,user_count,algoritm,time
0,10,100,naive,0.008622
1,10,100,numba,0.00179
2,10,100,pandas,0.008326
3,50,100,naive,0.013309
4,50,100,numba,0.000301
5,50,100,pandas,0.009365
6,100,100,naive,0.016326
7,100,100,numba,0.000174
8,100,100,pandas,0.010878
9,10,1000,naive,0.134295


In [155]:
# One panel per top_k, one line per implementation. Both axes are logarithmic
# because users_count spans 100 to 100 000 and the timings span several orders
# of magnitude. Title and axis labels make the figure readable on its own.
fig = px.line(
    res,
    x="user_count",
    y="time",
    color="algoritm",
    facet_col="top_k",
    log_x=True,
    log_y=True,
    labels={
        "user_count": "users_count",
        "time": "mean time per call, s",
        "algoritm": "algorithm",
    },
    title="MRR computation time: naive vs numba vs pandas",
)
fig.show()

С увеличением количества вычислений векторизация начинает превосходить другие способы; на самых маленьких выборках (100 пользователей) быстрее всех оказывается numba.