In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys

In [2]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    try:
        recommended_list = recommended_list[:k]
    except:
        recommended_list = []
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [3]:
def prefilter_items(data):
    # Уберем самые популярные товары (их и так купят)
    popularity = data_train.groupby('item_id')['user_id'].nunique().reset_index() / data_train['user_id'].nunique()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    
    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]
    
    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]
    
    # Уберем товары, которые не продавались за последние 12 месяцев
    data = data[data['sales_value'] != 0]
    
    # Уберем не интересные для рекоммендаций категории (department)
    step1 = item_features.groupby('department')['item_id'].count().sort_values(ascending=False)
    step2 = list(step1.loc[step1>=1000].index)
    item_features = item_features[item_features['department'].isin(step2)]
    
    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
    # pricepp - price per product
    data_train = data_train[data_train['sales_value'] != 0]
    data_train['pricepp'] = data_train['sales_value']/data_train['quantity'] 
    data_train = data_train[data_train['pricepp'] > data_train['pricepp'].min()]
    
    # Уберем слишком дорогие товары
    # уберем инфинитум
    data_train = data_train[data_train['pricepp'] != data_train['pricepp'].max()]
    # ...
    
def postfilter_items(user_id, recommednations):
    pass

In [4]:
data = pd.read_csv('K:/2020/Programming/Data Science/Рекомендательные системы/Урок 4/transaction_data.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
item_features = pd.read_csv('K:/2020/Programming/Data Science/Рекомендательные системы/Урок 4/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(200)

Unnamed: 0,user_id,actual
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65..."
1,3,"[823704, 834117, 840244, 913785, 917816, 93870..."
2,5,"[913077, 1118028, 1386668]"
3,6,"[825541, 859676, 999318, 1055646, 1067606, 108..."
4,7,"[929248, 948622, 1013572, 1022003, 1049892, 10..."
...,...,...
195,250,"[844165, 847374, 855257, 859191, 868548, 87620..."
196,251,"[853756, 867065, 6533437, 15595996, 15596467, ..."
197,253,"[827858, 834103, 839419, 851508, 857849, 86215..."
198,254,"[823758, 833025, 845715, 901666, 994928, 99611..."


In [7]:
n_items_before = data_train['item_id'].nunique()

In [8]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

In [9]:
n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 90386 to 5001


In [10]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват другие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15926885,15926886,15926887,15926927,15927033,15927403,15927661,15927850,16809471,17105257
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [12]:
user_item_matrix = bm25_weight(user_item_matrix.T).T  # Применяется к item-user матрице ! 

In [13]:
%%time

model = AlternatingLeastSquares(factors=44,
                                regularization=0.001,
                                iterations=20,
                                calculate_training_loss=True, 
                                use_gpu=False)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)



  0%|          | 0/20 [00:00<?, ?it/s]

CPU times: total: 12.5 s
Wall time: 1.88 s


In [14]:
def get_recommendations(user, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]],  # !!! 
                                    recalculate_user=True)]
    return res

In [15]:
%%time
    
result['bm25'] = result['user_id'].apply(lambda x: get_recommendations(x, model=model, N=5))

CPU times: total: 1min 50s
Wall time: 27.6 s


In [16]:
result.apply(lambda row: precision_at_k(row['bm25'], row['actual']), axis=1).mean()

0.1077850326469111

In [17]:
result.head(2)

Unnamed: 0,user_id,actual,bm25
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65...","[9487534, 1051211, 1100972, 874617, 958046]"
1,3,"[823704, 834117, 840244, 913785, 917816, 93870...","[962229, 951590, 913278, 5569230, 1110244]"


In [18]:
item_features = item_features[item_features['item_id'].isin(itemid_to_id.keys())]
item_features[item_features['brand'] == 'Private'].item_id.nunique()

1469

In [19]:
own_items = item_features[item_features['brand'] == 'Private'].\
    groupby('department')['item_id'].nunique().\
    sort_values(ascending=False)
own_items

department
GROCERY            1231
MEAT-PCKGD           74
PRODUCE              32
PASTRY               31
DELI                 27
MISC SALES TRAN      20
DRUG GM              16
KIOSK-GAS            13
SEAFOOD-PCKGD        11
MEAT                  8
NUTRITION             5
FLORAL                1
Name: item_id, dtype: int64

In [20]:
categories_with_own_items = own_items.index.tolist()

In [21]:
categories_with_own_items

['GROCERY',
 'MEAT-PCKGD',
 'PRODUCE',
 'PASTRY',
 'DELI',
 'MISC SALES TRAN',
 'DRUG GM',
 'KIOSK-GAS',
 'SEAFOOD-PCKGD',
 'MEAT',
 'NUTRITION',
 'FLORAL']

In [22]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import cm
import pickle

from scipy.spatial.distance import cdist

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import seaborn as sns


def reduce_dims(df, dims=2, method='pca'):
    
    assert method in ['pca', 'tsne'], 'Неверно указан метод'
    
    if method=='pca':
        pca = PCA(n_components=dims)
        components = pca.fit_transform(df)
    elif method == 'tsne':
        tsne = TSNE(n_components=dims, learning_rate=250, random_state=42, n_iter=300, n_iter_without_progress=20)
        components = tsne.fit_transform(df)
    else:
        print('Error')
        
    colnames = ['component_' + str(i) for i in range(1, dims+1)]
    return pd.DataFrame(data = components, columns = colnames) 


def display_components_in_2D_space(components_df, labels='category', marker='D'):
    
    groups = components_df.groupby(labels)

    # Plot
    fig, ax = plt.subplots(figsize=(12,8))
    ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
    for name, group in groups:
        ax.plot(group.component_1, group.component_2, 
                marker='o', ms=6,
                linestyle='',
                alpha=0.7,
                label=name)
    ax.legend(loc='center left', bbox_to_anchor=(1.02, 0.5))

    plt.xlabel('component_1')
    plt.ylabel('component_2') 
    plt.show()

In [23]:
model.item_factors.shape

(5001, 44)

In [24]:
category = []

for idx in range(model.item_factors.shape[0]):

    try:
        cat = item_features.loc[item_features['item_id'] == id_to_itemid[idx], 'department'].values[0]
        category.append(cat)
    except:
        category.append('UNKNOWN')

In [25]:
popularity = data_train.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
popularity.sort_values('quantity', ascending=False, inplace=True)

popularity = popularity[popularity['item_id'] != 999999]

popularity = popularity.groupby('user_id').head(5)

popularity.sort_values('user_id', ascending=False, inplace=True)
popularity.head(5)

Unnamed: 0,user_id,item_id,quantity
709114,2500,859237,16
709374,2500,1082185,23
709350,2500,1065538,10
709467,2500,6534178,13
709343,2500,1058997,11


In [26]:
%%time

def get_similar_items_recommendation(model, x):
    recs = model.similar_items(itemid_to_id[x], N=3)
    top_rec = recs[1][0] # потому что на позии recs[0][0] - находится id самого x
    rec = id_to_itemid[top_rec]
    # заглушка, можно заменять на "менее похожий товар" в случае если попался товар не из ТОП-5000  
    if rec == 999999:
        #rec = x
        rec = id_to_itemid[recs[2][0]]
    return rec

CPU times: total: 0 ns
Wall time: 0 ns


In [27]:
popularity['similar_recommendation'] = popularity['item_id'].apply(lambda x: get_similar_items_recommendation(model, x))
popularity.head(3)

Unnamed: 0,user_id,item_id,quantity,similar_recommendation
709114,2500,859237,16,936508
709374,2500,1082185,23,981760
709350,2500,1065538,10,1034176


In [28]:
recommendation_similar_items = popularity.groupby('user_id')['similar_recommendation'].unique().reset_index()
recommendation_similar_items.columns=['user_id', 'similar_recommendation']
recommendation_similar_items.head(2)

Unnamed: 0,user_id,similar_recommendation
0,1,"[9526410, 5582712, 981760, 1138467]"
1,2,"[985999, 1133018, 8090537, 5569845, 1106523]"


In [29]:
result = result.merge(recommendation_similar_items, on='user_id', how='left')
result.head(2)

Unnamed: 0,user_id,actual,bm25,similar_recommendation
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65...","[9487534, 1051211, 1100972, 874617, 958046]","[9526410, 5582712, 981760, 1138467]"
1,3,"[823704, 834117, 840244, 913785, 917816, 93870...","[962229, 951590, 913278, 5569230, 1110244]","[1133018, 1075979, 1053690, 1092026, 960318]"


In [30]:
result.apply(lambda row: precision_at_k(row['similar_recommendation'], row['actual']), axis=1).mean()

  precision = flags.sum() / len(recommended_list)


0.13385390428211585

In [31]:
def get_similar_users_recommendation(model, x):
    _users = model.similar_users(userid_to_id[x], N=3)
    top_users = _users[1][0] # потому что на позии recs[0][0] - находится id самого x
    users = id_to_userid[top_users]
    # заглушка, можно заменять на "менее похожий товар" в случае если попался товар не из ТОП-5000  
    #if rec == 999999:
        #rec = x
        #rec = id_to_itemid[recs[2][0]]
    return users

In [32]:
popularity['similar_users'] = popularity['user_id'].apply(lambda x: get_similar_users_recommendation(model, x))
popularity.head(3)

Unnamed: 0,user_id,item_id,quantity,similar_recommendation,similar_users
709114,2500,859237,16,936508,1891
709374,2500,1082185,23,981760,1891
709350,2500,1065538,10,1034176,1891


In [None]:
#popularity['similar_users_recommendation'] = popularity['similar_users'].apply(lambda x: get_recommendations(x, model=model, N=5))
#popularity.head(3)

In [33]:
recommendation_similar_users = popularity.groupby('similar_users')['similar_recommendation'].unique().reset_index()


In [34]:
recommendation_similar_users.columns=['user_id', 'similar_recommendation']
recommendation_similar_users.head(2)

Unnamed: 0,user_id,similar_recommendation
0,1,"[981760, 959316, 1135834, 889731]"
1,3,"[1092026, 1133018, 1106523, 1053690, 981760, 8..."


In [35]:
result = result.merge(recommendation_similar_items, on='user_id', how='left')
result.head(2)

Unnamed: 0,user_id,actual,bm25,similar_recommendation_x,similar_recommendation_y
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65...","[9487534, 1051211, 1100972, 874617, 958046]","[9526410, 5582712, 981760, 1138467]","[9526410, 5582712, 981760, 1138467]"
1,3,"[823704, 834117, 840244, 913785, 917816, 93870...","[962229, 951590, 913278, 5569230, 1110244]","[1133018, 1075979, 1053690, 1092026, 960318]","[1133018, 1075979, 1053690, 1092026, 960318]"


In [40]:
# дальше исполнение программой кода выдает ошибку

In [44]:
%%time

own = ItemItemRecommender(K=1, num_threads=4) # K - кол-во билжайших соседей

own.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: total: 1.33 s
Wall time: 370 ms


In [49]:
def own_recommender(user, model, N=5):
    recs = [userid_to_id[rec[0]] for rec in
        model.recommend(userid=userid_to_id[user],
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5,
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)]
    return recs

In [53]:
popularity['own_recommendation'] = popularity['similar_users'].apply(lambda x: get_recommendations(x, model=own, N=5))
popularity.head(300)

Unnamed: 0,user_id,item_id,quantity,similar_recommendation,similar_users,own_recommendation
709114,2500,859237,16,936508,1891,"[1055425, 1132198, 5572828, 965772, 1045478]"
709374,2500,1082185,23,981760,1891,"[1055425, 1132198, 5572828, 965772, 1045478]"
709350,2500,1065538,10,1034176,1891,"[1055425, 1132198, 5572828, 965772, 1045478]"
709467,2500,6534178,13,1082185,1891,"[1055425, 1132198, 5572828, 965772, 1045478]"
709343,2500,1058997,11,1126899,1891,"[1055425, 1132198, 5572828, 965772, 1045478]"
...,...,...,...,...,...,...
689786,2441,1023958,17,909894,1287,"[981591, 1015781, 1130375, 865196, 943169]"
689618,2440,1070015,11,6533765,1617,"[997796, 997011, 902659, 1064574, 1127758]"
689593,2440,1043590,9,1044078,1617,"[997796, 997011, 902659, 1064574, 1127758]"
689536,2440,971325,10,1103629,1617,"[997796, 997011, 902659, 1064574, 1127758]"
