<a href="https://colab.research.google.com/github/VitalyGladyshev/gb_rec_sys/blob/main/coursework/cw_rec_sys_glvv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ПРОЕКТ "Рекомендательные системы"


# ЧАСТЬ 2: Моделирование

In [8]:
# install implicit (it was installing by conda: conda install -c conda-forge implicit)

In [9]:
# install lightfm (it was installing by conda: conda install -c conda-forge lightfm)

In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys

In [2]:
path_to_src = './src'

In [126]:
if path_to_src not in sys.path:
    sys.path.append(path_to_src)

# Написанные функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [7]:
path_to_files = './prep12'

In [34]:
data          = pd.read_csv(os.path.join(path_to_files, "data.csv"))
user_features = pd.read_csv(os.path.join(path_to_files, "user_features.csv"))
item_features = pd.read_csv(os.path.join(path_to_files, "item_features.csv"))

In [35]:
data

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,999999,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,999999,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,999999,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,999999,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,999999,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2396799,1613,41655820646,663,999999,1,2.00,3262,-1.15,1231,95,0.0,0.0
2396800,1001,41655829421,663,999999,1,1.69,3131,0.00,2231,95,0.0,0.0
2396801,1001,41655829421,663,999999,1,1.69,3131,0.00,2231,95,0.0,0.0
2396802,1167,41656790510,663,999999,22451,43.98,3385,-0.65,1059,95,0.0,0.0


In [11]:
user_features

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,bill_avg,week_last_bill,items_per_week
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,2.492077,95,18.264368
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,2.673405,95,11.780220
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,2.798250,95,20.505882
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13,5.693402,95,26.750000
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16,2.927602,94,6.412500
...,...,...,...,...,...,...,...,...,...,...,...
796,35-44,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2494,3.560885,93,6.230769
797,45-54,A,75-99K,Homeowner,Unknown,3,1,2496,2.891049,95,18.205128
798,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown,2497,3.533885,95,23.349398
799,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2498,3.012473,95,10.544304


In [12]:
item_features

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
...,...,...,...,...,...,...,...
92348,18293142,6384,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,
92349,18293439,6393,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,
92350,18293696,6406,DRUG GM,National,BOOKSTORE,PAPERBACK BEST SELLER,
92351,18294080,6442,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,


### Проверочные и тестовые мероприятия

In [13]:
user_item_matrix = pd.pivot_table(data, 
                                          index='user_id',
                                          columns='item_id', 
                                          values='quantity', # Можно пробовать другие варианты
                                          aggfunc='count', 
                                          fill_value=0)
user_item_matrix

item_id,201704,818981,818996,819063,819112,819115,819210,819255,819304,819308,...,17104444,17105058,17105257,17106039,17168855,17169131,17179856,17179873,17208470,17214946
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

#user_item_matrix_tfidf = tfidf_weight(user_item_matrix.T).T
user_item_matrix_tfidf = tfidf_weight(user_item_matrix)
user_item_matrix_tfidf

<2499x12744 sparse matrix of type '<class 'numpy.float64'>'
	with 606929 stored elements in COOrdinate format>

In [16]:
user_item_matrix_bm25 = bm25_weight(user_item_matrix.T).T
user_item_matrix_bm25

<2499x12744 sparse matrix of type '<class 'numpy.float64'>'
	with 606929 stored elements in COOrdinate format>

In [17]:
def prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

In [18]:
id_to_itemid, id_to_userid, \
        itemid_to_id, userid_to_id = prepare_dicts(user_item_matrix)


In [19]:
id_to_itemid

{0: 201704,
 1: 818981,
 2: 818996,
 3: 819063,
 4: 819112,
 5: 819115,
 6: 819210,
 7: 819255,
 8: 819304,
 9: 819308,
 10: 819330,
 11: 819400,
 12: 819487,
 13: 819518,
 14: 819534,
 15: 819578,
 16: 819590,
 17: 819594,
 18: 819612,
 19: 819765,
 20: 819840,
 21: 819845,
 22: 819849,
 23: 819927,
 24: 819969,
 25: 819978,
 26: 820011,
 27: 820082,
 28: 820122,
 29: 820291,
 30: 820296,
 31: 820301,
 32: 820321,
 33: 820347,
 34: 820352,
 35: 820429,
 36: 820486,
 37: 820518,
 38: 820551,
 39: 820582,
 40: 820612,
 41: 820701,
 42: 820752,
 43: 820788,
 44: 820895,
 45: 820923,
 46: 820950,
 47: 821007,
 48: 821025,
 49: 821032,
 50: 821083,
 51: 821134,
 52: 821174,
 53: 821200,
 54: 821209,
 55: 821219,
 56: 821316,
 57: 821324,
 58: 821344,
 59: 821464,
 60: 821465,
 61: 821556,
 62: 821565,
 63: 821695,
 64: 821741,
 65: 821761,
 66: 821787,
 67: 821788,
 68: 821825,
 69: 821845,
 70: 821863,
 71: 821890,
 72: 821895,
 73: 821923,
 74: 821930,
 75: 821970,
 76: 821974,
 77: 8219

In [34]:
itemid_to_id

{201704: 0,
 818981: 1,
 818996: 2,
 819063: 3,
 819112: 4,
 819115: 5,
 819210: 6,
 819255: 7,
 819304: 8,
 819308: 9,
 819330: 10,
 819400: 11,
 819487: 12,
 819518: 13,
 819534: 14,
 819578: 15,
 819590: 16,
 819594: 17,
 819612: 18,
 819765: 19,
 819840: 20,
 819845: 21,
 819849: 22,
 819927: 23,
 819969: 24,
 819978: 25,
 820011: 26,
 820082: 27,
 820122: 28,
 820291: 29,
 820296: 30,
 820301: 31,
 820321: 32,
 820347: 33,
 820352: 34,
 820429: 35,
 820486: 36,
 820518: 37,
 820551: 38,
 820582: 39,
 820612: 40,
 820701: 41,
 820752: 42,
 820788: 43,
 820895: 44,
 820923: 45,
 820950: 46,
 821007: 47,
 821025: 48,
 821032: 49,
 821083: 50,
 821134: 51,
 821174: 52,
 821200: 53,
 821209: 54,
 821219: 55,
 821316: 56,
 821324: 57,
 821344: 58,
 821464: 59,
 821465: 60,
 821556: 61,
 821565: 62,
 821695: 63,
 821741: 64,
 821761: 65,
 821787: 66,
 821788: 67,
 821825: 68,
 821845: 69,
 821863: 70,
 821890: 71,
 821895: 72,
 821923: 73,
 821930: 74,
 821970: 75,
 821974: 76,
 821976: 

In [22]:
user_features

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,bill_avg,week_last_bill,items_per_week
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,2.492077,95,18.264368
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,2.673405,95,11.780220
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,2.798250,95,20.505882
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13,5.693402,95,26.750000
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16,2.927602,94,6.412500
...,...,...,...,...,...,...,...,...,...,...,...
796,35-44,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2494,3.560885,93,6.230769
797,45-54,A,75-99K,Homeowner,Unknown,3,1,2496,2.891049,95,18.205128
798,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown,2497,3.533885,95,23.349398
799,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2498,3.012473,95,10.544304


In [20]:
user_feat = pd.DataFrame(user_item_matrix.index)
user_feat

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5
...,...
2494,2496
2495,2497
2496,2498
2497,2499


In [22]:
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat

Unnamed: 0,user_id,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,bill_avg,week_last_bill,items_per_week
0,1,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,2.492077,95.0,18.264368
1,2,,,,,,,,,,
2,3,,,,,,,,,,
3,4,,,,,,,,,,
4,5,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2494,2496,45-54,A,75-99K,Homeowner,Unknown,3,1,2.891049,95.0,18.205128
2495,2497,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown,3.533885,95.0,23.349398
2496,2498,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,3.012473,95.0,10.544304
2497,2499,25-34,U,Under 15K,Unknown,2 Adults Kids,3,1,2.966347,92.0,11.535714


In [23]:
user_feat.set_index('user_id', inplace=True)
user_feat

Unnamed: 0_level_0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,bill_avg,week_last_bill,items_per_week
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,2.492077,95.0,18.264368
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
2496,45-54,A,75-99K,Homeowner,Unknown,3,1,2.891049,95.0,18.205128
2497,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown,3.533885,95.0,23.349398
2498,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,3.012473,95.0,10.544304
2499,25-34,U,Under 15K,Unknown,2 Adults Kids,3,1,2.966347,92.0,11.535714


In [24]:
user_feat = user_feat.fillna("")
user_feat

Unnamed: 0_level_0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,bill_avg,week_last_bill,items_per_week
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,2.492077,95.0,18.264368
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
2496,45-54,A,75-99K,Homeowner,Unknown,3,1,2.891049,95.0,18.205128
2497,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown,3.533885,95.0,23.349398
2498,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,3.012473,95.0,10.544304
2499,25-34,U,Under 15K,Unknown,2 Adults Kids,3,1,2.966347,92.0,11.535714


In [39]:
data['week_no'].max()

95

### Cхема обучения и валидации

In [None]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data[''] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]


Провнряем правильность разбиения по неделям

In [44]:
data_train_lvl_1['week_no'].min(), data_train_lvl_1['week_no'].max()

(1, 85)

In [45]:
data_val_lvl_1['week_no'].min(), data_val_lvl_1['week_no'].max()

(86, 91)

In [46]:
data_train_lvl_2['week_no'].min(), data_train_lvl_2['week_no'].max()

(86, 91)

In [47]:
data_val_lvl_2['week_no'].min(), data_val_lvl_2['week_no'].max()

(92, 95)

Создаем экземпляр класса MainRecommender

In [27]:
recommender = MainRecommender(data_train_lvl_1, item_features, user_features)

user_item_matrix
user_feat
item_feat




ALS model
ALS TF-IDF model
ALS BM25 model
LightFM model
Own


In [94]:
users_lvl_2_pred_als = pd.DataFrame(data_val_lvl_2['user_id'].unique())
users_lvl_2_pred_als

Unnamed: 0,0
0,338
1,2120
2,2324
3,514
4,1762
...,...
2037,1386
2038,472
2039,90
2040,1635


In [117]:
users_lvl_2_pred_als.columns = ['user_id']
users_lvl_2_pred_als

Unnamed: 0,user_id
0,338
1,2120
2,2324
3,514
4,1762
...,...
2037,1386
2038,472
2039,90
2040,1635


In [116]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']

In [48]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2

Unnamed: 0,user_id,actual
0,1,"[999999, 856942, 865456, 907957, 914190, 94331..."
1,3,"[835476, 999999, 920626, 958154, 1053690, 1096..."
2,6,"[999999, 1006718, 1104227, 1108624, 1110392, 8..."
3,7,"[840386, 999999, 898068, 909714, 993838, 10031..."
4,8,"[835098, 872137, 910439, 924610, 992977, 99999..."
...,...,...
2037,2496,[999999]
2038,2497,"[1016709, 9835695, 999999, 845294, 871756, 873..."
2039,2498,"[999999, 901776, 914190, 958382, 972437, 10398..."
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813..."


In [124]:
k_n = 5
num = 40

In [50]:
popularity = data.groupby('item_id')['quantity'].sum().reset_index()
popularity

Unnamed: 0,item_id,quantity
0,201704,53
1,818981,69
2,818996,24
3,819063,224
4,819112,82
...,...,...
12739,17169131,23
12740,17179856,16
12741,17179873,16
12742,17208470,45


In [51]:
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
popularity

Unnamed: 0,item_id,n_sold
0,201704,53
1,818981,69
2,818996,24
3,819063,224
4,819112,82
...,...,...
12739,17169131,23
12740,17179856,16
12741,17179873,16
12742,17208470,45


In [53]:
popularity.sort_values('n_sold', ascending=False).head(num+1)

Unnamed: 0,item_id,n_sold
4964,999999,239310436
4804,995242,13491
5773,1029743,9598
8551,1133018,7478
4439,981760,6737
7840,1106523,6583
8400,1127831,5310
8906,5569230,4669
2485,908531,4541
3670,951590,4284


In [56]:
item_list = popularity.sort_values('n_sold', ascending=False).head(num+1).item_id.tolist()
item_list

[999999,
 995242,
 1029743,
 1133018,
 981760,
 1106523,
 1127831,
 5569230,
 908531,
 951590,
 862349,
 916122,
 1058997,
 1044078,
 1126899,
 961554,
 844179,
 859075,
 1005186,
 904360,
 1065593,
 1070820,
 844165,
 923746,
 1053690,
 5569471,
 1096036,
 914190,
 854852,
 903325,
 878996,
 1004906,
 962229,
 986912,
 866211,
 1092026,
 962568,
 8090521,
 1081177,
 8090537,
 1024306]

In [69]:
len(item_list)

40

In [70]:
item_list = item_list[1:num+1]
item_list

[1029743,
 1133018,
 981760,
 1106523,
 1127831,
 5569230,
 908531,
 951590,
 862349,
 916122,
 1058997,
 1044078,
 1126899,
 961554,
 844179,
 859075,
 1005186,
 904360,
 1065593,
 1070820,
 844165,
 923746,
 1053690,
 5569471,
 1096036,
 914190,
 854852,
 903325,
 878996,
 1004906,
 962229,
 986912,
 866211,
 1092026,
 962568,
 8090521,
 1081177,
 8090537,
 1024306]

In [59]:
# Подготовим список наиболее покупаемых товаров
popularity = data.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
item_list = popularity.sort_values('n_sold', ascending=False).head(num+1).item_id.tolist()
item_list = item_list[1:num+1]
len(item_list)

40

Проводим локальную проверку работоспособности модели ALS

In [71]:
n_factors, regularization, iterations, num_threads =50, 0.0001, 15, 4

In [74]:
from implicit.als import AlternatingLeastSquares

In [75]:
model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=False)

In [76]:
model.recommend(userid=userid_to_id[1], 
                                             user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                             N=5, 
                                             filter_already_liked_items=False, 
                                             filter_items=[itemid_to_id[999999]],  # !!! 
                                             recalculate_user=True)

[(1826, 1.089351418887143),
 (4804, 1.061091063779555),
 (9059, 0.9991109451136332),
 (2344, 0.9898791262291128),
 (8912, 0.9380612761971857)]

In [80]:
user = 1
rec_num = 5
[id_to_itemid[rec[0]] for rec in 
                        model.recommend(userid=userid_to_id[user], 
                                             user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                             N= rec_num, 
                                             filter_already_liked_items=False, 
                                             filter_items=[itemid_to_id[999999]],  # !!! 
                                             recalculate_user=True)]

[885290, 995242, 5978656, 904360, 5569374]

In [84]:
itemid_to_id[995242]

4804

In [None]:
Вроде все в порядке

Готовим подпрограммы для трейна первого уровня

In [86]:
user_train_list = data_train_lvl_1["user_id"].unique().tolist()
user_train_list[:3]

[2375, 1364, 1130]

In [103]:
def get_pred_als(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_als_recommendations(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

In [104]:
def get_pred_als_tfidf(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_als_tfidf_recommendations(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

In [105]:
def get_pred_als_bm25(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_als_bm25_recommendations(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

In [106]:
def get_pred_own(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_own_recommendations(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

In [107]:
def get_pred_similar_items(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_similar_items_recommendation(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

Код для запуска рекомендаций  топ-N товаров и расчета метрики precision_at_k

In [None]:
users_lvl_2_pred_als['predictions'] = users_lvl_2_pred_als['user_id'].apply(\
                            lambda x: get_pred_als(recommender, user_train_list, item_list, x, num=num))
    
users_lvl_2_pred_als["prec"] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: precision_at_k(users_lvl_2_pred_als.loc[users_lvl_2_pred_als["user_id"] == x, "predictions"].values[0], 
                                 list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))

users_lvl_2_pred_als["prec"].mean()

### РАСЧЕТ ЗАВИС!!!
### Видимо, у меня слабый компьютер.
### Поэтому решил уменьшить количество пользователей до 500
(users_lvl_2_pred_als_part)

In [118]:
users_lvl_2_pred_als


Unnamed: 0,user_id
0,338
1,2120
2,2324
3,514
4,1762
...,...
2037,1386
2038,472
2039,90
2040,1635


In [156]:
# Делаем выборку 500 пользователей
users_lvl_2_pred_als_part = users_lvl_2_pred_als.head(500)
users_lvl_2_pred_als_part

Unnamed: 0,user_id
0,338
1,2120
2,2324
3,514
4,1762
...,...
495,2425
496,511
497,1020
498,1146


In [170]:
# Осуществляем подгонку валидационного датафрейма по 500 пользователям
result_lvl_2_part = result_lvl_2.loc[result_lvl_2['user_id'].\
                                 isin(users_lvl_2_pred_als_part['user_id']. tolist ())]


In [157]:
# Считаем предсказания
users_lvl_2_pred_als_part['predictions'] = users_lvl_2_pred_als_part['user_id'] \
.apply(lambda x: get_pred_als(recommender, user_train_list, item_list, x, num=num))

In [158]:
# Проверяем срабатывание
users_lvl_2_pred_als_part

Unnamed: 0,user_id,predictions
0,338,"[1037863, 845208, 1026118, 1133018, 904360, 98..."
1,2120,"[1106523, 1029743, 995242, 981760, 1058997, 11..."
2,2324,"[995242, 1029743, 5978656, 981760, 961554, 112..."
3,514,"[878996, 1127831, 866211, 1005186, 1024306, 10..."
4,1762,"[1114597, 1003188, 1068719, 961554, 1110572, 9..."
...,...,...
495,2425,"[1029743, 995242, 981760, 904360, 854852, 1127..."
496,511,"[1133018, 1106523, 1044078, 981760, 1038217, 9..."
497,1020,"[1004906, 1037863, 862349, 995242, 916122, 961..."
498,1146,"[1133018, 904360, 6533765, 1033615, 1005186, 1..."


## У меня не получилось обратиться к файлу с метриками 

### Я так понял, что при импорте функции с метрикой, почему то не учитывается подключение библиотеки "import numpy as np"

### Поэтому перенс метрики в этот файл.

In [151]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [171]:
# Считаем метрику precision_at_k для выборки из 500 пользователей
users_lvl_2_pred_als_part["prec"] = users_lvl_2_pred_als_part['user_id'].apply(\
        lambda x: precision_at_k((users_lvl_2_pred_als_part.loc[users_lvl_2_pred_als_part["user_id"] == x, "predictions"].values[0]), 
                                 (result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))

In [172]:
users_lvl_2_pred_als_part

Unnamed: 0,user_id,predictions,prec
0,338,"[1037863, 845208, 1026118, 1133018, 904360, 98...",0.6
1,2120,"[1106523, 1029743, 995242, 981760, 1058997, 11...",0.0
2,2324,"[995242, 1029743, 5978656, 981760, 961554, 112...",0.6
3,514,"[878996, 1127831, 866211, 1005186, 1024306, 10...",0.6
4,1762,"[1114597, 1003188, 1068719, 961554, 1110572, 9...",0.2
...,...,...,...
495,2425,"[1029743, 995242, 981760, 904360, 854852, 1127...",0.2
496,511,"[1133018, 1106523, 1044078, 981760, 1038217, 9...",0.0
497,1020,"[1004906, 1037863, 862349, 995242, 916122, 961...",0.8
498,1146,"[1133018, 904360, 6533765, 1033615, 1005186, 1...",0.6


In [173]:
users_lvl_2_pred_als_part["prec"].mean()

0.23600000000000085

In [175]:
def recall_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall

In [176]:
# Считаем метрику recall_at_k для выборки из 500 пользователей
users_lvl_2_pred_als_part["rec"] = users_lvl_2_pred_als_part['user_id'].apply(\
        lambda x: recall_at_k((users_lvl_2_pred_als_part.loc[users_lvl_2_pred_als_part["user_id"] == x, "predictions"].values[0]), 
                                 (result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))


In [177]:
users_lvl_2_pred_als_part["rec"].mean()

0.03179968066943935

Код для запуска рекомендаций  топ-N товаров с учетом представления tfidf и расчета метрик

In [178]:
users_lvl_2_pred_als_part['predictions_tfidf'] = users_lvl_2_pred_als_part['user_id'].apply(\
                            lambda x: get_pred_als_tfidf(recommender, user_train_list, item_list, x, num=num))

In [179]:
# Считаем метрику precision_at_k для выборки из 500 пользователей
users_lvl_2_pred_als_part["prec_tfidf"] = users_lvl_2_pred_als_part['user_id'].apply(\
        lambda x: precision_at_k((users_lvl_2_pred_als_part.loc[users_lvl_2_pred_als_part["user_id"] == x, "predictions_tfidf"].values[0]), 
                                 (result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))

In [181]:
users_lvl_2_pred_als_part["prec_tfidf"].mean()

0.17160000000000067

In [182]:
# Считаем метрику recall_at_k для выборки из 500 пользователей
users_lvl_2_pred_als_part["rec_tfidf"] = users_lvl_2_pred_als_part['user_id'].apply(\
        lambda x: recall_at_k((users_lvl_2_pred_als_part.loc[users_lvl_2_pred_als_part["user_id"] == x, "predictions_tfidf"].values[0]), 
                                 (result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))

users_lvl_2_pred_als_part["rec_tfidf"].mean()

0.022713076586009944

Код для запуска рекомендаций  топ-N товаров с учетом представления bm25 и расчета метрик

In [183]:
users_lvl_2_pred_als_part['predictions_bm25'] = users_lvl_2_pred_als_part['user_id'].apply(\
                            lambda x: get_pred_als_tfidf(recommender, user_train_list, item_list, x, num=num))

In [186]:
# Считаем метрику precision_at_k для выборки из 500 пользователей
users_lvl_2_pred_als_part["prec_bm25"] = users_lvl_2_pred_als_part['user_id'].apply(\
        lambda x: precision_at_k((users_lvl_2_pred_als_part.loc[users_lvl_2_pred_als_part["user_id"] == x, "predictions_bm25"].values[0]), 
                                 (result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))
users_lvl_2_pred_als_part["prec_bm25"].mean()

0.17160000000000067

In [187]:
# Считаем метрику recall_at_k для выборки из 500 пользователей
users_lvl_2_pred_als_part["rec_bm25"] = users_lvl_2_pred_als_part['user_id'].apply(\
        lambda x: recall_at_k((users_lvl_2_pred_als_part.loc[users_lvl_2_pred_als_part["user_id"] == x, "predictions_bm25"].values[0]), 
                                 (result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))

users_lvl_2_pred_als_part["rec_tfidf"].mean()

0.022713076586009944

In [188]:
# Просматриваем получившийся датафрейм
users_lvl_2_pred_als_part.head()

Unnamed: 0,user_id,predictions,prec,rec,predictions_tfidf,prec_tfidf,rec_tfidf,predictions_bm25,Unnamed: 9,prec_bm25,rec_bm25
0,338,"[1037863, 845208, 1026118, 1133018, 904360, 98...",0.6,0.042857,"[845208, 1026118, 1037863, 904360, 1081177, 98...",0.4,0.028571,"[845208, 1026118, 1037863, 904360, 1081177, 98...",0.4,0.4,0.028571
1,2120,"[1106523, 1029743, 995242, 981760, 1058997, 11...",0.0,0.0,"[1029743, 995242, 981760, 1127831, 1106523, 11...",0.0,0.0,"[1029743, 995242, 981760, 1127831, 1106523, 11...",0.0,0.0,0.0
2,2324,"[995242, 1029743, 5978656, 981760, 961554, 112...",0.6,0.076923,"[1005186, 5978656, 1058997, 5569230, 1136257, ...",0.2,0.025641,"[1005186, 5978656, 1058997, 5569230, 1136257, ...",0.2,0.2,0.025641
3,514,"[878996, 1127831, 866211, 1005186, 1024306, 10...",0.6,0.096774,"[1062002, 1105488, 1127179, 878996, 1023720, 8...",0.4,0.064516,"[1062002, 1105488, 1127179, 878996, 1023720, 8...",0.4,0.4,0.064516
4,1762,"[1114597, 1003188, 1068719, 961554, 1110572, 9...",0.2,0.012346,"[1084331, 1003188, 1065593, 1136257, 976199, 1...",0.4,0.024691,"[1084331, 1003188, 1065593, 1136257, 976199, 1...",0.4,0.4,0.024691


In [30]:
users_lvl_2_pred_als.head()

Unnamed: 0,user_id,predictions,prec,rec,predictions_tfidf,prec_tfidf,rec_tfidf,predictions_bm25,prec_bm25,rec_bm25
0,338,"[1037863, 981760, 845208, 904360, 1082185, 113...",0.4,0.027778,"[845208, 923746, 896613, 904360, 1068292, 1037...",0.0,0.0,"[871611, 819840, 1084036, 1026118, 1025650, 84...",0.2,0.013889
1,2120,"[1082185, 1106523, 981760, 995242, 1029743, 10...",0.0,0.0,"[1082185, 1029743, 995242, 981760, 1127831, 11...",0.0,0.0,"[1082185, 981760, 849843, 1041476, 995055, 113...",0.0,0.0
2,2324,"[995242, 1029743, 1082185, 5978656, 981760, 91...",0.6,0.075,"[1082185, 1136257, 1039156, 1027569, 5978656, ...",0.6,0.075,"[10356272, 878611, 967760, 1108894, 942045, 96...",0.2,0.025
3,514,"[866211, 878996, 1127831, 1082185, 1005186, 10...",0.6,0.09375,"[866211, 1105488, 1005186, 965267, 1024306, 87...",0.4,0.0625,"[1126786, 1082185, 878996, 923169, 835595, 639...",0.8,0.125
4,1762,"[1098066, 1136257, 826249, 1068719, 1084331, 1...",0.0,0.0,"[1096036, 1084331, 1110572, 1003188, 1044078, ...",0.2,0.012048,"[5569471, 1029743, 1048200, 868548, 1096036, 8...",0.2,0.012048


### ВЫВОД: лучшую модель показала чистая АLS (не tfidf и не bm25).

### Целевая метрика precision@5 получилась равной 0.23.

### Будем использовать ALS в качестве входной для классификатора второго уровня LGBMClassifier



## LGBMClassifier

### Строим классификатор

In [190]:
data_train_lvl_2.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,999999,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,999999,1,0.99,443,0.0,101,86,0.0,0.0
2107469,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,0.0,0.0
2107470,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,0.0,0.0
2107471,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,0.0,0.0


In [192]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']
users_lvl_2


Unnamed: 0,user_id
0,2070
1,2021
2,1753
3,2120
4,1346
...,...
2149,1446
2150,1784
2151,436
2152,1697


In [None]:
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply\
(lambda x: get_pred_als(recommender, user_train_list, item_list, x, num=num))

### РАСЧЕТ ОПЯТЬ ЗАВИС!!!
### Видимо, у меня слабый компьютер.
### Поэтому решил уменьшить количество пользователей до 500
(users_lvl_2_pred_als_part)

In [193]:
users_lvl_2_part = users_lvl_2.head(500)
users_lvl_2_part

Unnamed: 0,user_id
0,2070
1,2021
2,1753
3,2120
4,1346
...,...
495,2149
496,491
497,378
498,603


In [194]:
# Считаем предсказания
users_lvl_2_part['predictions'] = users_lvl_2_part['user_id'] \
.apply(lambda x: get_pred_als(recommender, user_train_list, item_list, x, num=num))

In [195]:
users_lvl_2_part.head()

Unnamed: 0,user_id,predictions
0,2070,"[923746, 1029743, 1044078, 1096036, 896369, 10..."
1,2021,"[951590, 981760, 1044078, 904360, 844179, 1029..."
2,1753,"[1005186, 879755, 1085604, 986912, 1106523, 11..."
3,2120,"[1106523, 1029743, 995242, 981760, 1058997, 11..."
4,1346,"[1127831, 1058997, 1126899, 866211, 878996, 98..."


In [199]:
users_lvl_2_part.rename(columns={'predictions': 'candidates'}, inplace=True)
users_lvl_2_part.head()

Unnamed: 0,user_id,candidates
0,2070,"[923746, 1029743, 1044078, 1096036, 896369, 10..."
1,2021,"[951590, 981760, 1044078, 904360, 844179, 1029..."
2,1753,"[1005186, 879755, 1085604, 986912, 1106523, 11..."
3,2120,"[1106523, 1029743, 995242, 981760, 1058997, 11..."
4,1346,"[1127831, 1058997, 1126899, 866211, 878996, 98..."


In [200]:
# Т.к. очень долго считается, сохраним файл
users_lvl_2_part.to_csv("./prep12/users_lvl_2_part.csv", index=False)

In [207]:
users_lvl_2_part.candidates[0][:10]

[923746,
 1029743,
 1044078,
 1096036,
 896369,
 1005186,
 981760,
 910032,
 859075,
 833025]

In [212]:
# Проверим еще раз на пропуски
users_lvl_2_part.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     500 non-null    int64 
 1   candidates  500 non-null    object
dtypes: int64(1), object(1)
memory usage: 7.9+ KB


In [216]:
users_lvl_2_part.candidates.values

array([list([923746, 1029743, 1044078, 1096036, 896369, 1005186, 981760, 910032, 859075, 833025, 871756, 893018, 908531, 1107553, 1038217, 1106523, 5568378, 1006184, 1068719, 1133018, 904360, 1042438, 1127831, 1085604, 1024306, 878996, 909894, 847982, 1070015, 1062966, 995242, 866211, 899624, 962568, 1043590, 865456, 961554, 1056509, 965267, 879755]),
       list([951590, 981760, 1044078, 904360, 844179, 1029743, 995242, 845208, 1004906, 923746, 1106523, 1037863, 1133018, 962229, 1081177, 1127831, 910032, 871756, 1096036, 1068719, 847982, 9527290, 859075, 1043590, 866211, 909894, 854852, 9526410, 5592931, 916122, 899624, 12301109, 1065593, 883932, 987562, 952163, 1062966, 944534, 1039156, 965766]),
       list([1005186, 879755, 1085604, 986912, 1106523, 1133018, 1110572, 1110843, 1065593, 5978656, 1037894, 861445, 844165, 859075, 863802, 8090537, 1091365, 1075368, 1107553, 5569230, 981760, 1053690, 5568378, 1092026, 979707, 859237, 908531, 1136257, 8090521, 1068719, 901543, 1106301, 10

In [223]:
np.concatenate(users_lvl_2_part.candidates.values)

array([ 923746, 1029743, 1044078, ..., 1118787, 1022066,  897752],
      dtype=int64)

In [225]:
users_lvl_2_part.user_id.values.repeat(len(users_lvl_2_part.candidates[0]))

array([2070, 2070, 2070, ..., 1388, 1388, 1388], dtype=int64)

In [227]:
# Создадим "технический" датафрейм df с разбиением на пары "юзер-айтем"
df=pd.DataFrame({'user_id':users_lvl_2_part.user_id.values.repeat(len(users_lvl_2_part.candidates[0])),
                 'item_id':np.concatenate(users_lvl_2_part.candidates.values)})
df

Unnamed: 0,user_id,item_id
0,2070,923746
1,2070,1029743
2,2070,1044078
3,2070,1096036
4,2070,896369
...,...,...
19995,1388,1053690
19996,1388,1016800
19997,1388,1118787
19998,1388,1022066


In [232]:
# Создадим датафрейм targets_lvl_2 из исходного data_train_lvl_2
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
#Введем столбец покупок
targets_lvl_2['target'] = 1  # тут только покупки 

In [238]:
targets_lvl_2.shape

(169711, 3)

In [233]:
# Выберем также тех же 500 пользователей, чтобы совпадали с вынужденно укороченным датафреймом users_lvl_2_part
targets_lvl_2_part = targets_lvl_2.loc[targets_lvl_2['user_id'].isin(df['user_id'].tolist ())]
targets_lvl_2_part.user_id.nunique()

500

In [234]:
targets_lvl_2_part

Unnamed: 0,user_id,item_id,target
2104867,2070,999999,1
2107468,2021,999999,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1
...,...,...,...
2279981,1769,926549,1
2279982,1769,943137,1
2279983,1769,1039156,1
2279984,23,999999,1


In [235]:
# Объединяем датафреймы
targets_lvl_2_part = df.merge(targets_lvl_2_part, on=['user_id', 'item_id'], how='left')
targets_lvl_2_part

Unnamed: 0,user_id,item_id,target
0,2070,923746,
1,2070,1029743,
2,2070,1044078,
3,2070,1096036,
4,2070,896369,
...,...,...,...
22868,1388,1016800,
22869,1388,1118787,
22870,1388,1022066,1.0
22871,1388,1022066,1.0


In [239]:
# Заполняем пропуски в 'target'

targets_lvl_2_part['target'].fillna(0, inplace= True)

In [240]:
targets_lvl_2_part.shape

(22873, 3)

In [310]:
# Объединяем датафреймы с характеристиками "юзеров" и "айтемов"
targets_lvl_2_part = targets_lvl_2_part.merge(item_features, on='item_id', how='left') #
targets_lvl_2_part = targets_lvl_2_part.merge(user_features, on='user_id', how='left') #

targets_lvl_2_part.head()

Unnamed: 0,user_id,item_id,target,manufacturer_x,department_x,brand_x,commodity_desc_x,sub_commodity_desc_x,curr_size_of_product_x,age_desc_x,...,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,bill_avg,week_last_bill,items_per_week
0,2070,923746,0.0,69,GROCERY,Private,EGGS,EGGS - LARGE,18 CT,45-54,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,2.891039,95.0,29.814815
1,2070,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,2.891039,95.0,29.814815
2,2070,1044078,0.0,2845,MEAT,National,BEEF,LEAN,,45-54,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,2.891039,95.0,29.814815
3,2070,1096036,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,2.891039,95.0,29.814815
4,2070,896369,0.0,69,GROCERY,Private,PNT BTR/JELLY/JAMS,PEANUT BUTTER,18 OZ,45-54,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,2.891039,95.0,29.814815


In [243]:
targets_lvl_2_part.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22873 entries, 0 to 22872
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_id               22873 non-null  int64  
 1   item_id               22873 non-null  int64  
 2   target                22873 non-null  float64
 3   manufacturer          22873 non-null  int64  
 4   department            22873 non-null  object 
 5   brand                 22873 non-null  object 
 6   commodity_desc        22873 non-null  object 
 7   sub_commodity_desc    22873 non-null  object 
 8   curr_size_of_product  22873 non-null  object 
 9   age_desc              12428 non-null  object 
 10  marital_status_code   12428 non-null  object 
 11  income_desc           12428 non-null  object 
 12  homeowner_desc        12428 non-null  object 
 13  hh_comp_desc          12428 non-null  object 
 14  household_size_desc   12428 non-null  object 
 15  kid_category_desc  

In [244]:
# Видно наличие пустых ячеек. Заполняем их нулями.
targets_lvl_2_part.fillna(0, inplace=True)

In [245]:
# Готовим X_train и у_train
X_train = targets_lvl_2_part.drop('target', axis=1)
y_train = targets_lvl_2_part[['target']]

In [246]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'bill_avg',
 'week_last_bill',
 'items_per_week']

In [248]:
lgb = LGBMClassifier(objective='binary', 
                     n_estimators=1500,
                     learning_rate=0.008,
                     max_depth=7, 
                     verbose=1,
                     categorical_column=cat_feats)

hist = lgb.fit(X_train, y_train, 
               eval_set=[(X_train, y_train)],
               eval_metric=['logloss']
               )

train_preds = lgb.predict_proba(X_train)

[LightGBM] [Info] Number of positive: 6617, number of negative: 16256
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1803
[LightGBM] [Info] Number of data points in the train set: 22873, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.289293 -> initscore=-0.898820
[LightGBM] [Info] Start training from score -0.898820


In [259]:
train_preds

array([[0.63368842, 0.36631158],
       [0.41158958, 0.58841042],
       [0.66827877, 0.33172123],
       ...,
       [0.17141922, 0.82858078],
       [0.17141922, 0.82858078],
       [0.87687579, 0.12312421]])

In [260]:
hist.evals_result_['training']['binary_logloss'][-10:]

[0.35633661910648684,
 0.35627424783613804,
 0.35617925316004145,
 0.35609588325084385,
 0.3560279722403082,
 0.35596658632930744,
 0.35590413175142205,
 0.35584129434514417,
 0.3557865761166094,
 0.35571688791467027]

In [251]:
# Создадим датафрейм с вероятностями
classifer_prediction = X_train[['user_id', 'item_id']]
classifer_prediction["pred"] = train_preds[:, 1]
classifer_prediction.tail()

Unnamed: 0,user_id,item_id,pred
22868,1388,1016800,0.096407
22869,1388,1118787,0.121448
22870,1388,1022066,0.828581
22871,1388,1022066,0.828581
22872,1388,897752,0.123124


In [262]:
# Оставим итемы с вероятностями больше 0,5
res_val = classifer_prediction.loc[classifer_prediction["pred"]>0.5, ["user_id", "item_id"]]
res_val.tail()

Unnamed: 0,user_id,item_id
22831,1388,1029743
22857,1388,1081189
22858,1388,1081189
22870,1388,1022066
22871,1388,1022066


In [263]:
res_val.shape

(4297, 2)

In [264]:
res_val.user_id.nunique()

282

In [265]:
# Создаем датафрейм рекомендаций на основе датафрейма res_val
pred_lvl_2_part = res_val.groupby('user_id')['item_id'].unique().reset_index()
pred_lvl_2_part.columns=['user_id', 'pred']
pred_lvl_2_part.head()

Unnamed: 0,user_id,pred
0,13,"[859075, 1126899, 1029743, 5978656, 1058997, 9..."
1,19,"[965766, 865456, 899624, 1101173, 1046545, 844..."
2,20,"[1058997, 1126899, 1075368, 1123106, 894360, 9..."
3,23,"[1029743, 1075368, 844165, 962229, 1106523]"
4,27,"[1077555, 1052920, 1102416, 9419470, 1029252, ..."


In [257]:
result_lvl_2_part = result_lvl_2_part.merge(pred_lvl_2_part, on='user_id', how='left')
result_lvl_2_part.head()

Unnamed: 0,user_id,actual,pred
0,13,"[999999, 1104146, 862070, 886317, 950439, 9815...","[859075, 1126899, 1029743, 5978656, 1058997, 9..."
1,14,"[840601, 867293, 933067, 951590, 952408, 96569...",
2,18,"[831628, 999999, 914697, 995242, 1118878, 1128...",
3,20,"[819112, 944419, 945611, 999999, 1058997, 5592...","[1058997, 1126899, 1075368, 1123106, 894360, 9..."
4,22,"[999999, 849470, 858743, 916758, 935968, 10105...",


In [269]:
# Определяем количество НЕ ПУСТЫХ ячеек
user_pred_nan = result_lvl_2_part.loc[result_lvl_2_part["pred"] != result_lvl_2_part["pred"], 'user_id'].values
len(user_pred_nan)

375

In [274]:
set_tr_1 = set(data_train_lvl_1.user_id.unique())
len(set_tr_1)

2498

In [275]:
# Находим пересечение пользователей из валидационного и обучаемого датасетов
user_list = set(user_pred_nan).intersection(set_tr_1)
len(user_list)

375

In [277]:
result_lvl_2_part.loc[result_lvl_2_part["user_id"].isin(user_list), "user_id"]

1        14
2        18
4        22
6        28
7        35
       ... 
493    2461
494    2462
495    2465
497    2481
499    2500
Name: user_id, Length: 375, dtype: int64

In [276]:
result_lvl_2_part.loc[result_lvl_2_part["user_id"].isin(user_list), "pred"] =\
result_lvl_2_part.loc[result_lvl_2_part["user_id"].isin(user_list), "user_id"].apply(lambda x: recommender.get_als_recommendations(x, rec_num=k_n))
result_lvl_2_part.head()

Unnamed: 0,user_id,actual,pred
0,13,"[999999, 1104146, 862070, 886317, 950439, 9815...","[859075, 1126899, 1029743, 5978656, 1058997, 9..."
1,14,"[840601, 867293, 933067, 951590, 952408, 96569...","[995242, 1029743, 981760, 951590, 6533765]"
2,18,"[831628, 999999, 914697, 995242, 1118878, 1128...","[903325, 916122, 951412, 995965, 961554]"
3,20,"[819112, 944419, 945611, 999999, 1058997, 5592...","[1058997, 1126899, 1075368, 1123106, 894360, 9..."
4,22,"[999999, 849470, 858743, 916758, 935968, 10105...","[1070820, 862349, 5978656, 5978648, 854852]"


In [285]:
result_lvl_2_part.loc[result_lvl_2["user_id"] == 14]

Unnamed: 0,user_id,actual,pred
7,35,"[1089888, 5591609, 6423775]","[1058997, 1126899, 908531, 1029743, 995242]"


In [286]:
# Выделим пять наиболее популярных товаров (исходя из количества продаж) из всего исходного датасета 
popularity = recommender.prep_data.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5 = popularity.sort_values('n_sold', ascending=False).head(6).item_id.tolist()[1:6]
top_5

[995242, 1029743, 1133018, 981760, 1106523]

In [291]:
# Внедряем эти товары
result_lvl_2_part.loc[result_lvl_2_part["user_id"] == 14, "pred"] =\
        result_lvl_2_part.loc[result_lvl_2["user_id"] == 14, "user_id"].apply(lambda x: top_5)

In [299]:
# Проверяем работоспособность
result_lvl_2_part.loc[result_lvl_2["user_id"] == 14]

Unnamed: 0,user_id,actual,pred
7,35,"[1089888, 5591609, 6423775]","[995242, 1029743, 1133018, 981760, 1106523]"


In [300]:
len(result_lvl_2_part.loc[result_lvl_2["user_id"] == 14, "pred"].values[0])

5

In [301]:
# Формируем рабочий список
res = []
res.extend(result_lvl_2_part.loc[result_lvl_2["user_id"] == 14, "pred"].values[0])
res

[995242, 1029743, 1133018, 981760, 1106523]

In [306]:
# Подпрограмма для заполнения недостающих рекомендуемых значений до пяти
def fill_if_not_5(recommender, x):
    res = []
    res.extend(result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "pred"].values[0])
    pred_num = len(result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "pred"].values[0])
    if pred_num < 5:
        res.extend(recommender.get_als_recommendations(x, rec_num=5-pred_num))
    return res

In [307]:
# Если рекомендуемых товаров получилось меньше пяти - "добиваем" их до пяти
result_lvl_2_part["pred"] = result_lvl_2_part["user_id"].apply(lambda x: fill_if_not_5(recommender, x))

In [312]:
result_lvl_2_part.head()

Unnamed: 0,user_id,actual,pred
0,13,"[999999, 1104146, 862070, 886317, 950439, 9815...","[859075, 1126899, 1029743, 5978656, 1058997, 9..."
1,14,"[840601, 867293, 933067, 951590, 952408, 96569...","[995242, 1029743, 981760, 951590, 6533765]"
2,18,"[831628, 999999, 914697, 995242, 1118878, 1128...","[903325, 916122, 951412, 995965, 961554]"
3,20,"[819112, 944419, 945611, 999999, 1058997, 5592...","[1058997, 1126899, 1075368, 1123106, 894360, 9..."
4,22,"[999999, 849470, 858743, 916758, 935968, 10105...","[1070820, 862349, 5978656, 5978648, 854852]"


In [313]:
result_lvl_2_part["rec"] = result_lvl_2_part['user_id'].apply(\
        lambda x: precision_at_k(result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "pred"].values[0], 
                                 list(result_lvl_2_part.loc[result_lvl_2_part["user_id"] == x, "actual"].values[0]), 
                                 k=5))

result_lvl_2_part['rec'].mean()

0.2800000000000004

## Целевая метрика precision@5 получилась равной 0.28

### Готовим выходной файл с рекомендациями recommendations.csv

In [317]:
recom = result_lvl_2_part.drop(['actual', 'rec'], axis=1)
recom

Unnamed: 0,user_id,pred
0,13,"[859075, 1126899, 1029743, 5978656, 1058997, 9..."
1,14,"[995242, 1029743, 981760, 951590, 6533765]"
2,18,"[903325, 916122, 951412, 995965, 961554]"
3,20,"[1058997, 1126899, 1075368, 1123106, 894360, 9..."
4,22,"[1070820, 862349, 5978656, 5978648, 854852]"
...,...,...
495,2465,"[981760, 986912, 923746, 904360, 993638]"
496,2475,"[1070820, 883003, 5569471, 971922, 1070845, 10..."
497,2481,"[1029743, 833025, 995242, 904360, 1133018]"
498,2498,"[1070820, 1053690, 5569230, 862349, 1070820]"


In [318]:
recom.to_csv("./prep12/recommendations.csv", index=False)