In [94]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [95]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [97]:
data = pd.read_csv('drive/MyDrive/Rec_systems/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [98]:
popular = data.groupby('item_id')['sales_value'].sum().reset_index()
popular.sort_values('sales_value', ascending=False, inplace=True)
popular['log_sales'] = np.log(popular['sales_value'], where = popular['sales_value'] != 0)
log_sales_sum = popular['log_sales'].sum()

In [99]:
popular['log_sales_scaled'] = popular['log_sales'] / log_sales_sum
popular['log_sales_scaled'].sum()

1.0000000000000002

In [100]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [101]:
from numpy.random import choice

item_weights = data.groupby('item_id')['sales_value'].sum().reset_index()
item_weights['weight'] = item_weights['sales_value'] / item_weights['sales_value'].sum()

def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации

    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """

    recs = choice(items_weights['item_id'], n, p=items_weights['weight'])
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code

    return recs.tolist()

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [102]:
result = pd.read_csv('drive/MyDrive/Rec_systems/predictions_basic.csv', sep=',')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_recommendation
0,1,[ 821867 834484 856942 865456 889248 ...,"[6533798, 1028243, 838824, 8156088, 7097610]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1085604, 884945, 979452, 8090532, 12351790]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[6544308, 1044558, 1045659, 7166566, 9676954]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1052294, 9420325, 945652, 883003, 863447]"


In [103]:
def precision_at_k(recommended_list, bought_list, k=5):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]

    flags = np.isin(recommended_list, bought_list)

    precision = flags.sum() / len(recommended_list)


    return precision

In [104]:
import re

def str_to_list(line):
  pattern = r'[\[\]]'
  try:
    return [x for x in map(int, re.sub(pattern, '', line).split(','))]
  except TypeError:
    print(line)

def str_to_list_actual(line):
  pattern = r'\[ | \]|\[|\]|\n'
  try:
    return [x for x in map(int, re.sub(pattern, '', line).split())]
  except TypeError:
    print(line)
    return None

for col in result.columns.drop(['user_id', 'actual']):
  result[col] = result[col].transform(str_to_list)

result['actual'] = result['actual'].transform(str_to_list_actual)

result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6533798, 1028243, 838824, 8156088, 7097610]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1085604, 884945, 979452, 8090532, 12351790]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6544308, 1044558, 1045659, 7166566, 9676954]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1052294, 9420325, 945652, 883003, 863447]"


In [105]:
result.apply(lambda x: precision_at_k(x[2],x[1], 5),1).mean()

0.0004897159647404506

In [106]:
for i, col in enumerate(result.columns.drop(['user_id', 'actual'])):
  print(f'{col}: {round(result.apply(lambda x: precision_at_k(x[i + 2],x[1], 5), 1).mean(), 4)}')

random_recommendation: 0.0005
popular_recommendation: 0.1552
weighted_random_recommendation: 0.0251


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [108]:
%%time
item_weights = data.groupby('item_id')['sales_value'].sum().reset_index()
item_weights['weight'] = item_weights['sales_value'] / item_weights['sales_value'].sum()

result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(item_weights, n=5))
result.head(2)

CPU times: user 2.53 s, sys: 30 ms, total: 2.56 s
Wall time: 2.57 s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6533798, 1028243, 838824, 8156088, 7097610]","[6534178, 6533889, 1029743, 6534166, 1082185]","[882305, 6979485, 841116, 1082185, 866211]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6544308, 1044558, 1045659, 7166566, 9676954]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1101173, 1105917, 1083111, 1136719, 1058939]"


In [109]:
%%time

item_weights_5000 = data.sort_values('sales_value', ascending=False).head(5000).groupby('item_id')['sales_value'].sum().reset_index()
item_weights_5000['weight'] = item_weights_5000['sales_value'] / item_weights_5000['sales_value'].sum()

result['weighted_random_recommendation_top5000'] = result['user_id'].apply(lambda x: weighted_random_recommendation(item_weights_5000, n=5))
result.head(2)

CPU times: user 1.04 s, sys: 91.5 ms, total: 1.13 s
Wall time: 1.15 s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_recommendation,weighted_random_recommendation_top5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6533798, 1028243, 838824, 8156088, 7097610]","[6534178, 6533889, 1029743, 6534166, 1082185]","[882305, 6979485, 841116, 1082185, 866211]","[970747, 6534178, 12484608, 6534178, 6534166]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6544308, 1044558, 1045659, 7166566, 9676954]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1101173, 1105917, 1083111, 1136719, 1058939]","[6534178, 12582517, 9798997, 9419393, 916381]"


In [111]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""

    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)

    return recs.tolist()

In [112]:
%%time

popularity = data.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

items = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

result['random_recommendation_top5000'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result.head(2)

CPU times: user 1.2 s, sys: 28.3 ms, total: 1.23 s
Wall time: 1.24 s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_recommendation,weighted_random_recommendation_top5000,random_recommendation_top5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6533798, 1028243, 838824, 8156088, 7097610]","[6534178, 6533889, 1029743, 6534166, 1082185]","[882305, 6979485, 841116, 1082185, 866211]","[970747, 6534178, 12484608, 6534178, 6534166]","[5563693, 7166865, 835347, 1089108, 1050721]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6544308, 1044558, 1045659, 7166566, 9676954]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1101173, 1105917, 1083111, 1136719, 1058939]","[6534178, 12582517, 9798997, 9419393, 916381]","[947267, 1109778, 868319, 9553031, 948232]"


In [113]:
for i, col in enumerate(result.columns.drop(['user_id', 'actual'])):
  print(f'{col}: {round(result.apply(lambda x: precision_at_k(x[i + 2],x[1], 5), 1).mean(), 4)}')

random_recommendation: 0.0005
popular_recommendation: 0.1552
weighted_random_recommendation: 0.0229
weighted_random_recommendation_top5000: 0.0965
random_recommendation_top5000: 0.0071


Точность улучшилась, но незначительно и всё равно результ сильно хуже, чем просто выдача самых популярных позиций.
Вторую часть задания не могу выполнить, ядро крашится и в гугле и в юпитере при попытке запустить код из вебинара