In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
data = pd.read_csv('data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

В этом же задании выполним пункт из следующего:
- Попробуйте улучшить бейзлайны, считая случаный на топ-5000 товаров

In [4]:
def get_weights(data, at_k=None):
    
    items_weights = data.groupby('item_id')['sales_value'].sum().reset_index()
    items_weights['sales_value'] = np.log(items_weights['sales_value'] + 1.0000001)
    items_weights.sort_values('sales_value', ascending=False, inplace=True)
    
    if at_k:
        items_weights = items_weights[:at_k]
    
    items_weights['sales_value'] = items_weights['sales_value'] / items_weights['sales_value'].sum()
    items_weights.columns = ['item_id', 'weight']
    
    return items_weights


def weighted_random_recommendation(items_weights, n=5):
    
    recs = np.random.choice(items_weights['item_id'], size=n, replace=False, p=items_weights['weight'])
    
    return recs.tolist()

In [5]:
def precision(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    return flags.sum() / len(recommended_list)

In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

In [7]:
%%time

items_weights = get_weights(data_train)
items_weights_5000 = get_weights(data_train, at_k=5000)
items_weights_1000 = get_weights(data_train, at_k=1000)
items_weights_100 = get_weights(data_train, at_k=100)

result['weighted_rand_rec'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))
result['weighted_rand_rec_5000'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights_5000, n=5))
result['weighted_rand_rec_1000'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights_1000, n=5))
result['weighted_rand_rec_100'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights_100, n=5))

Wall time: 4.76 s


In [8]:
result.head(5)

Unnamed: 0,user_id,actual,weighted_rand_rec,weighted_rand_rec_5000,weighted_rand_rec_1000,weighted_rand_rec_100
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1934138, 5693713, 3833740, 828507, 1614511]","[841016, 12324841, 1125943, 991999, 1075786]","[1065593, 926905, 1138189, 866211, 1036495]","[840361, 1000753, 6534166, 878996, 893018]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1106872, 1040486, 9420077, 1044637, 100505]","[917427, 1122547, 6463742, 880427, 1109615]","[1092303, 1013167, 882308, 854852, 1055403]","[1082185, 962229, 961554, 6533765, 995965]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[6555314, 939323, 12172294, 6979518, 709249]","[5570882, 12810422, 948509, 1127624, 899459]","[1037894, 1057260, 940766, 845307, 9553193]","[893018, 993638, 12810393, 5569845, 1037840]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[988791, 17169697, 831808, 1880808, 1095557]","[872382, 821741, 895166, 879528, 1116096]","[7155012, 12582517, 5568758, 1029688, 952163]","[874972, 880150, 878996, 854405, 965267]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[990936, 1734922, 921870, 948826, 13007710]","[852014, 1128240, 884518, 10341234, 892314]","[898068, 1034686, 918046, 6534030, 1052912]","[6534178, 12810393, 849202, 844179, 993638]"


In [9]:
for name_col in result.columns[1:]:
    print(f"{round(result.apply(lambda row: precision(row[name_col], row['actual']), axis=1).mean(), 4)} : {name_col}")

1.0 : actual
0.0014 : weighted_rand_rec
0.0071 : weighted_rand_rec_5000
0.015 : weighted_rand_rec_1000
0.0523 : weighted_rand_rec_100


## Задание 2. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая случаный на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

Первый пункт задания выполнен в предыдущем, со второй частью, к сожалению, не успел разобраться.