In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
data['week_no'].nunique()

95

In [4]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [5]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity.describe()

Unnamed: 0,item_id,sales_value
count,89051.0,89051.0
mean,5115772.0,83.458481
std,5178973.0,1628.715079
min,25671.0,0.0
25%,966583.0,3.5
50%,1448516.0,10.78
75%,9553042.0,46.105
max,18024560.0,467993.62


In [6]:
popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
popularity.describe()

Unnamed: 0,item_id,user_id
count,89051.0,89051.0
mean,5115772.0,14.759767
std,5178973.0,45.904111
min,25671.0,1.0
25%,966583.0,1.0
50%,1448516.0,2.0
75%,9553042.0,10.0
max,18024560.0,2039.0


In [7]:
item_features = pd.read_csv('product.csv')
item_features.head(2)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
user_features = pd.read_csv('hh_demographic.csv')
user_features.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


### Train-test split

В рекомендательных системах корректнее использовать train-test split по времени, а не случайно  
Я возьму последние 3 недели в качестве теста

In [9]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [10]:
data_train.shape[0], data_test.shape[0]

(2278490, 118314)

# 1. Бейзлайны

Создадим датафрейм с покупками юзеров на тестовом датасете (последние 3 недели)

In [11]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [12]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


### 1.1 Random recommendation

In [13]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [14]:
%%time

items = data_train.item_id.unique()

result['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result.head(2)

Wall time: 10.5 s


Unnamed: 0,user_id,actual,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1000157, 1022284, 1009924, 1111610, 479576]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[7441790, 12988219, 13944932, 1089588, 6457655]"


### 1.2 Popularity-based recommendation

In [15]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [16]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = popularity_recommendation(data_train, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

Wall time: 340 ms


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1000157, 1022284, 1009924, 1111610, 479576]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[7441790, 12988219, 13944932, 1089588, 6457655]","[6534178, 6533889, 1029743, 6534166, 1082185]"


### 1.3 Weighted random recommender

- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [17]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()

In [18]:
popularity['weight'] = popularity['sales_value'].apply(lambda x: np.log10(x))

In [19]:
popularity.head(2)

Unnamed: 0,item_id,sales_value,weight
0,25671,20.94,1.320977
1,26081,0.99,-0.004365


*Пример*  
item_1 - 5, item_2 - 7, item_3 - 4  # / sum  
item_1 - 5 / 16, item_2 - 7 / 16, item_3 - 4 / 16

In [22]:
popularity['weight'].value_counts()

0.475671    1562
0.298853    1263
0.000000    1248
0.301030    1183
0.600973    1098
            ... 
2.175541       1
1.976304       1
2.473049       1
1.526985       1
2.262166       1
Name: weight, Length: 23409, dtype: int64

In [23]:
popularity['weight'].isnull().values.any()

False

In [24]:
popularity['weight'].isnull().sum()

0

In [25]:
popularity['weight'].isnull().sum().sum()

0

In [30]:
popularity[(popularity['weight'] < 0)]

Unnamed: 0,item_id,sales_value,weight
1,26081,0.99,-0.004365
20,27346,0.99,-0.004365
25,27509,0.88,-0.055517
26,27510,0.69,-0.161151
34,27697,0.89,-0.050610
...,...,...,...
88920,17284401,0.00,-inf
88945,17291554,0.00,-inf
88991,17381856,0.00,-inf
89013,17901795,0.50,-0.301030


In [49]:
a = popularity.replace([np.inf, -np.inf], np.nan)

In [50]:
a[(a['weight'] < 0)]

Unnamed: 0,item_id,sales_value,weight
1,26081,0.99,-0.004365
20,27346,0.99,-0.004365
25,27509,0.88,-0.055517
26,27510,0.69,-0.161151
34,27697,0.89,-0.050610
...,...,...,...
88659,17170702,0.70,-0.154902
88859,17248297,0.99,-0.004365
88860,17248306,0.79,-0.102373
89013,17901795,0.50,-0.301030


In [63]:
a[(a['weight'] == 'nan')]

Unnamed: 0,item_id,sales_value,weight


In [45]:
popularity[(popularity['weight'] < 0)]

Unnamed: 0,item_id,sales_value,weight
1,26081,0.99,-0.004365
20,27346,0.99,-0.004365
25,27509,0.88,-0.055517
26,27510,0.69,-0.161151
34,27697,0.89,-0.050610
...,...,...,...
88920,17284401,0.00,-inf
88945,17291554,0.00,-inf
88991,17381856,0.00,-inf
89013,17901795,0.50,-0.301030


In [58]:
popularity = a

In [53]:
import random

In [54]:
def weighted_random_recommendation(items, items_weights, k=5):  
    items = np.array(items)
    items_weights = np.array(items_weights)
    recs = random.choices(items, items_weights, k=5)#, size = n, replace=False)    
    return recs#.tolist()

In [59]:
%%time

items_weights = popularity['weight']
items = popularity['item_id']
result['weight_random_recommendation'] = result['user_id'].apply(lambda x: 
                                                                 weighted_random_recommendation(items, items_weights, k=5))
result.head(2)

Wall time: 1min 5s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weight_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1000157, 1022284, 1009924, 1111610, 479576]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[7441790, 12988219, 13944932, 1089588, 6457655]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024..."


In [64]:
popularity_n = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity_n['weight'] = popularity_n['sales_value'].apply(lambda x: np.log10(x))

In [65]:
popularity_n[(popularity_n['weight'] < 0)]

Unnamed: 0,item_id,sales_value,weight
1,26081,0.99,-0.004365
20,27346,0.99,-0.004365
25,27509,0.88,-0.055517
26,27510,0.69,-0.161151
34,27697,0.89,-0.050610
...,...,...,...
88920,17284401,0.00,-inf
88945,17291554,0.00,-inf
88991,17381856,0.00,-inf
89013,17901795,0.50,-0.301030


In [66]:
popularity_n = popularity_n.replace([np.inf, -np.inf], np.nan)

In [73]:
popularity_n[(popularity_n['weight'] < -5)]

Unnamed: 0,item_id,sales_value,weight
36317,1093910,8.881784e-16,-15.0515


In [68]:
def weighted_random_recommendation(items_weights):
    items_weights = np.array(items_weights)
    recs = random.choices(items, weights = items_weights, k=5)

    return recs


In [69]:

%%time
items_weights = popularity_n['weight']
items = popularity_n['item_id']

result['w_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights))
result.head(2)

Wall time: 1min 2s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weight_random_recommendation,w_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1000157, 1022284, 1009924, 1111610, 479576]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[7441790, 12988219, 13944932, 1089588, 6457655]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."


4.5 Измерим качество по precision@5

In [74]:
result.head(5)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weight_random_recommendation,w_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1000157, 1022284, 1009924, 1111610, 479576]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[7441790, 12988219, 13944932, 1089588, 6457655]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[7443450, 5588109, 15507023, 476841, 1036014]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[979452, 903044, 15596337, 13876788, 1127571]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[841024, 9797934, 8197773, 7167865, 12266673]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."


In [42]:
result.to_csv('predictions_basic.csv', index=False)

In [51]:
def precision_at_k(recommended_list, bought_list, k=10):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

k = 5

In [44]:
for column_name in result.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.000686
popular_recommendation: 0.155240
weight_random_recommendation: 0.001469


k=4

In [46]:
for column_name in result.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.000857
popular_recommendation: 0.107003
weight_random_recommendation: 0.001469


In [48]:
#k=3
for column_name in result.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.001143
popular_recommendation: 0.137773
weight_random_recommendation: 0.001469


In [50]:
#k=6
for column_name in result.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.000686
popular_recommendation: 0.155240
weight_random_recommendation: 0.001469


In [52]:
#k=10
for column_name in result.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.000686
popular_recommendation: 0.155240
weight_random_recommendation: 0.001469


для random_recommendation луший k=4, для popular_recommendation: k>= 5

In [61]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [62]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [65]:
def random_recommendation_top(items, n=5):
    """Случайные рекоммендации"""
    
    #items_top = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [68]:
result_top = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_top.columns=['user_id', 'actual']
result_top.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [69]:
%%time

items = top_5000

result_top['random_recommendation'] = result_top['user_id'].apply(lambda x: random_recommendation(items, n=5))
result_top.head(2)

Wall time: 1.9 s


Unnamed: 0,user_id,actual,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[964133, 1120532, 7024990, 10356231, 861615]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[12384657, 931579, 828525, 941313, 831517]"


In [73]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
    popularity.sort_values('n_sold', ascending=False).head(5000)
    
    recs = data.head(n).item_id
    
    return recs.tolist()

In [75]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = popularity_recommendation(data_train, n=5)

result_top['popular_recommendation'] = result_top['user_id'].apply(lambda x: popular_recs)
result_top.head(2)

Wall time: 299 ms


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[964133, 1120532, 7024990, 10356231, 861615]","[1004906, 1033142, 1036325, 1082185, 8160430]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[12384657, 931579, 828525, 941313, 831517]","[1004906, 1033142, 1036325, 1082185, 8160430]"


In [76]:
popularity_w = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity_w.rename(columns={'quantity': 'n_sold'}, inplace=True)
popularity_w.sort_values('n_sold', ascending=False).head(5000)
popularity_w.head(2)

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1


In [78]:
popularity_w['weight'] = popularity_w['n_sold'].apply(lambda x: np.log10(x))

In [79]:
popularity_w.head(2)

Unnamed: 0,item_id,n_sold,weight
0,25671,6,0.778151
1,26081,1,0.0


In [82]:
def weighted_random_recommendation(items_weights_w):
    items_weights_w = np.array(items_weights_w)
    recs = random.choices(items_w, weights = items_weights_w, k=5)

    return recs

In [83]:
%%time
items_weights_w = popularity_w['weight']
items_w = popularity_w['item_id']

result_top['w_random_recommendation'] = result_top['user_id'].apply(lambda x: weighted_random_recommendation(items_weights_w))
result.head(2)

Wall time: 1min 2s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weight_random_recommendation,w_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[835243, 857554, 1045121, 987557, 8019716]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1026586, 1048483, 832760, 1122112, 1118533]","[6534178, 6533889, 1029743, 6534166, 1082185]","[18024556, 18024556, 18024556, 18024556, 18024...","[18024556, 18024556, 18024556, 18024556, 18024..."


In [96]:
def precision_at_k_w(recommended_list, bought_list, k=7):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [88]:
#k=5
for column_name in result_top.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k_w(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.004897
popular_recommendation: 0.155240
w_random_recommendation: 0.000294


In [93]:
#k=4
for column_name in result_top.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k_w(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.004775
popular_recommendation: 0.107003
w_random_recommendation: 0.000367


In [91]:
#k=3
for column_name in result_top.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k_w(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.004775
popular_recommendation: 0.107003
w_random_recommendation: 0.000367


In [95]:
#k=6
for column_name in result_top.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k_w(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.004897
popular_recommendation: 0.155240
w_random_recommendation: 0.000294


In [97]:
#k=7
for column_name in result_top.columns[1:]:
    print('%s: %f' % (column_name, result.apply(lambda row: precision_at_k_w(row[column_name], row['actual']), axis=1).mean()))

actual: 1.000000
random_recommendation: 0.004897
popular_recommendation: 0.155240
w_random_recommendation: 0.000294


#### для random_recommendation луший k>5, для popular_recommendation: k>= 5, для weighted_random_recommendation k<5