In [1]:
import pandas as pd
import numpy as np

import datetime
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns

from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import DictVectorizer
from collections import defaultdict, Counter

from IPython.display import HTML
import warnings
warnings.filterwarnings('ignore')




Я использовал набор данных e-commerce 

Источник https://www.kaggle.com/retailrocket/ecommerce-dataset


In [2]:
# Категории товаров:

category = pd.read_csv('data/category_tree.csv')
category


Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0
...,...,...
1664,49,1125.0
1665,1112,630.0
1666,1336,745.0
1667,689,207.0


In [3]:
# События

events = pd.read_csv('data/events.csv')
events


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,
2756097,1438399813142,762376,view,115946,
2756098,1438397820527,1251746,view,78144,
2756099,1438398530703,1184451,view,283392,


In [4]:
# Файл (первая часть) со свойствами элементов:

items_1 = pd.read_csv('data/item_properties_part1.csv')
items_1


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
...,...,...,...,...
10999994,1439694000000,86599,categoryid,618
10999995,1435460400000,153032,1066,n1020.000 424566
10999996,1440298800000,421788,888,35975 856003 37346
10999997,1437879600000,159792,400,n552.000 639502 n720.000 424566


In [5]:
# Файл (вторая часть) со свойствами элементов:

items_2 = pd.read_csv('data/item_properties_part2.csv')
items_2


Unnamed: 0,timestamp,itemid,property,value
0,1433041200000,183478,561,769062
1,1439694000000,132256,976,n26.400 1135780
2,1435460400000,420307,921,1149317 1257525
3,1431831600000,403324,917,1204143
4,1435460400000,230701,521,769062
...,...,...,...,...
9275898,1433646000000,236931,929,n12.000
9275899,1440903600000,455746,6,150169 639134
9275900,1439694000000,347565,686,610834
9275901,1433646000000,287231,867,769062


In [6]:
# Объединим их в один файл:

items = pd.concat([items_1, items_2])
items


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
...,...,...,...,...
9275898,1433646000000,236931,929,n12.000
9275899,1440903600000,455746,6,150169 639134
9275900,1439694000000,347565,686,610834
9275901,1433646000000,287231,867,769062


### Допущение: Будем считать, что под каждый id уникальный пользователь

# EDA

### Category

In [7]:
category.head()


Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [8]:
category.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1669 entries, 0 to 1668
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   categoryid  1669 non-null   int64  
 1   parentid    1644 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 26.2 KB


In [9]:
category['categoryid'].nunique()


1669

In [10]:
category['parentid'].nunique()


362

In [11]:
category.isnull().sum()


categoryid     0
parentid      25
dtype: int64

### Item properties

In [12]:
items.head()


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [13]:
items.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 20275902 entries, 0 to 9275902
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   timestamp  int64 
 1   itemid     int64 
 2   property   object
 3   value      object
dtypes: int64(2), object(2)
memory usage: 773.5+ MB


In [14]:
# Странно, что у property тип object, но в нем много числовых значений


In [15]:
items.isnull().sum()


timestamp    0
itemid       0
property     0
value        0
dtype: int64

In [16]:
# Пропусков нет


In [17]:
items.nunique()


timestamp         18
itemid        417053
property        1104
value        1966868
dtype: int64

In [18]:
# property - это какие-то свойства объектов, которые можно использовать для item-based подхода


In [19]:
items['property'].unique()


array(['categoryid', '888', '400', ..., '1091', '522', '769'],
      dtype=object)

In [20]:
items[items['property'] == '400']


Unnamed: 0,timestamp,itemid,property,value
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
11,1435460400000,244127,400,n552.000 639502 n720.000 424566
89,1440298800000,368230,400,1297729 n720.000 1178208 n900.000 424566
95,1441508400000,428812,400,n552.000 639502 n720.000 424566
138,1439694000000,370490,400,n410.400 424566
...,...,...,...,...
9275396,1433041200000,144776,400,n720.000 424566
9275626,1436670000000,30934,400,n552.000 639502 n720.000 424566
9275705,1440903600000,76321,400,n552.000 639502 n720.000 424566
9275707,1440298800000,53256,400,1297729 n720.000 1178208 n900.000 424566


In [21]:
# Сложно будет что-то понять их этого, поэтому оставим)


### Events

In [22]:
events.head()


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [23]:
events.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


In [24]:
events.isnull().sum()


timestamp              0
visitorid              0
event                  0
itemid                 0
transactionid    2733644
dtype: int64

In [25]:
# В transactionid много пропусков, 
# так как данное поле должно быть заполнено, только тогда, когда в поле event стоит transaction.


In [26]:
events['event'].value_counts()


view           2664312
addtocart        69332
transaction      22457
Name: event, dtype: int64

In [27]:
events[events['event'] == 'transaction']['transactionid'].isnull().sum()


0

In [28]:
# Посмотрим, нет ли среди других типов операций значений в поле transactionid:
print(events[events['event'] != 'transaction']['transactionid'].shape,
      events[events['event'] != 'transaction']['transactionid'].isnull().sum())


(2733644,) 2733644


In [29]:
# Случайных значений нет - все хорошо)


In [30]:
events['visitorid'].nunique()


1407580

# Описание системы рекомендаций

1. Для новых пользователей, о которых у нас нет никакой информации будем показывать самые покупаемые продукты.
2. Для пользователей, у которых есть покупки, будем показывать товары, которые приобретали похожие на него пользователи. (подход User-Based через LightFM)
3. Если есть товар в корзине, то будем рекомендовать похожие товары на основе корзин других пользователей. (подход Item-Based через LightFM)
4. Для пользователей, о которых у нас есть информация лишь о просмотрах, будем предлагать те товары, которые покупали похожие на него по просмотрам пользователи. (User-Based путем суммирования коэффициентов схожести пользователей по каждому товару)


# Создание отдельных моделей

In [31]:
# Получим список всех id пользователей:
all_user = events['visitorid'].unique()
len(all_user)


1407580

## 1. Рекомендация для новых пользователей

In [32]:
# Выделим данные в которых была покупка:
events_with_bay = events[events['transactionid'].notnull()]
events_with_bay.shape


(22457, 5)

In [33]:
# Получим список купленных товаров, т.е. их id:

product_purchased = events_with_bay['itemid'].unique()
len(product_purchased)


12025

In [34]:
popular_items = {}

for i in events_with_bay['itemid']:
    if i in popular_items:
        popular_items[i] = popular_items[i] + 1
    else:
        popular_items[i] = 1
len(popular_items)


12025

In [35]:
keys_list = list(popular_items.keys())
values_list = list(popular_items.values())
print(len(keys_list), len(values_list))


12025 12025


In [36]:
# Получим отсортированный список товаров по популярности покупок:
sort_index_rec = list(np.argsort(values_list))
sort_index_rec.reverse()

recomend_for_new_client = [keys_list[i] for i in sort_index_rec[:5]]
recomend_for_new_client


[461686, 119736, 213834, 312728, 7943]

In [37]:
# Данный список рекомендаций будет одинаков для всех новых пользователей.


## 2. Для тех, кто уже что-то покупал:


In [38]:
# Выделим id пользователей, которые что-то купили, т.е. стали клиентами

customer_purchased = events_with_bay['visitorid'].unique()
len(customer_purchased)


11719

In [39]:
%%time

# Создадим список покупок каждого клиента:

purchased_items = []    
for customer in customer_purchased:
    purchased_items.append(list(events_with_bay[events_with_bay['visitorid'] == customer]['itemid'].values))

len(purchased_items)


CPU times: user 9.15 s, sys: 72.5 ms, total: 9.22 s
Wall time: 9.28 s


11719

In [40]:
purchased_items[:3]


[[356475],
 [15335,
  380775,
  237753,
  317178,
  12836,
  400969,
  105792,
  25353,
  200793,
  80582,
  302422],
 [81345]]

In [41]:
%%time 

# Получим матрицу Покупатель/Товар:

v = DictVectorizer(sparse=False)
d = [{item: 1 for item in product_purchased if item in ui} for ui in purchased_items]
user_interest_matrix = v.fit_transform(d)
user_interest_matrix.shape


CPU times: user 25.3 s, sys: 261 ms, total: 25.6 s
Wall time: 25.8 s


(11719, 12025)

In [42]:
user_interest_matrix[:5]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
user_interest_df = pd.DataFrame(user_interest_matrix, index = customer_purchased, 
                                       columns = product_purchased)
user_interest_df.head()


Unnamed: 0,356475,15335,81345,150318,310791,54058,284871,150100,243566,245400,...,29220,325384,439734,14731,132074,15212,419980,160671,384823,446271
599528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
552148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
189384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Переведем нашу матрицу в Разряженную матрицу:
purchases_sparse = sparse.csr_matrix(user_interest_matrix)


In [45]:
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() 
    test_set[test_set != 0] = 1 
    training_set = ratings.copy() 
    nonzero_inds = training_set.nonzero() 
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) 
    random.seed(0) 
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) 
    samples = random.sample(nonzero_pairs, num_samples) 
    user_inds = [index[0] for index in samples] 
    item_inds = [index[1] for index in samples] 
    training_set[user_inds, item_inds] = 0 
    training_set.eliminate_zeros() 
    return training_set, test_set, list(set(user_inds))  


In [46]:
product_train, product_test, product_users_altered = make_train(purchases_sparse, pct_test = 0.2)


In [47]:
import pickle
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score


model = LightFM(loss='warp', no_components = 150, k = 50)
model.fit_partial(product_train, epochs = 30, num_threads = 2)

with open('saved_model','wb') as f:
            saved_model={'model':model}
            pickle.dump(saved_model, f)

train_auc = auc_score(model, product_train).mean()
test_auc = auc_score(model, product_test).mean()

print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))


AUC: train 1.00, test 0.89.


In [48]:
# Результат неплохой, поэтому будет использовать данную модель в нашей рекомендации:


In [49]:
# Построим функцию рекомендации:

def display_recommended_items(user_ids):
    scores = model.predict([1], np.arange(user_interest_matrix.shape[1]))
    top_items = np.argsort(-scores)
    return list(top_items[:5])
    
display_recommended_items(1)


[9846, 2716, 612, 311, 5111]

## 3. Рекомендации по корзине

In [50]:
# Отфильтруем всех, кто добавлял товары в корзину:
events_add_tocart = events[events['event'] == 'addtocart']


In [51]:
events_add_tocart


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
17,1433223236124,287857,addtocart,5206,
19,1433221078505,158090,addtocart,10572,
63,1433223543021,1193904,addtocart,255275,
112,1433221941632,599528,addtocart,356475,
179,1433220880956,105775,addtocart,312728,
...,...,...,...,...,...
2755956,1438400400805,831605,addtocart,57810,
2756056,1438398156086,10670,addtocart,419736,
2756074,1438400994744,144106,addtocart,141241,
2756078,1438399807937,804736,addtocart,447661,


In [52]:
# Получим список всех пользователей, которые что-то добавляли в корзину:
users_add_tocart = events_add_tocart['visitorid']
len(users_add_tocart)


69332

In [53]:
users_tocart = [i for i in users_add_tocart if i not in customer_purchased]

len(users_tocart)


41865

In [54]:
# Создадим список товаров из корзины каждого клиента:

addtocard_items = []    
for customer in users_tocart:
    addtocard_items.append(list(events_add_tocart[events_add_tocart['visitorid'] == customer]['itemid'].values))

len(addtocard_items)


41865

In [55]:
# Получим список товаров из корзины, т.е. их id:

product_addtocard = events_add_tocart['itemid'].unique()
len(product_addtocard)


23903

In [56]:
%%time 

# Получим матрицу Покупатель / Товар в корзине:

v = DictVectorizer(sparse = False)
d = [{item: 1 for item in product_addtocard if item in ui} for ui in addtocard_items]
user_tocard_matrix = v.fit_transform(d)
user_tocard_matrix.shape


CPU times: user 4min, sys: 1.06 s, total: 4min 1s
Wall time: 4min 2s


(41865, 18138)

In [57]:
# Переведем нашу матрицу в Разряженную матрицу:
user_tocard_sparse = sparse.csr_matrix(user_tocard_matrix)


In [58]:
product_train_2, product_test_2, product_users_altered_2 = make_train(user_tocard_sparse, pct_test = 0.2)


In [59]:
model_3 = LightFM(loss='warp', no_components = 50, k = 50)
model_3.fit_partial(product_train_2, epochs = 30, num_threads = 2)

with open('saved_model_tocard','wb') as f:
            saved_model={'model':model_3}
            pickle.dump(saved_model, f)

train_auc = auc_score(model_3, product_train_2).mean()
test_auc = auc_score(model_3, product_test_2).mean()

print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))


AUC: train 1.00, test 0.94.


In [60]:
# Результаты модели неплохие, будем использовать ее.


In [61]:
# Построим фукнцию рекомендации похожих товаров:

def display_item_to_items_recommendations(item_id):
    sort_ind = list(np.argsort(cosine_similarity(model_3.item_embeddings)[item_id]))
    sort_ind.reverse()
    
    return sort_ind[1:6]

display_item_to_items_recommendations(0)


[7781, 9452, 2206, 15908, 8627]

## 4. Рекомендация по списку просмотренных:

In [62]:
events


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,
2756097,1438399813142,762376,view,115946,
2756098,1438397820527,1251746,view,78144,
2756099,1438398530703,1184451,view,283392,


In [63]:
# Отфильтруем данные, в которых просматривали товар:
events_view = events[events['event'] == 'view']
events_view


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,
2756097,1438399813142,762376,view,115946,
2756098,1438397820527,1251746,view,78144,
2756099,1438398530703,1184451,view,283392,


In [64]:
%%time

# соберем все просмотры каждого покупателя, который совершил покупку:

list_ite = []
for i in customer_purchased:
    a = events_view[events_view['visitorid'] == i]['itemid'].unique().tolist()
    list_ite.append(a)

len(list_ite)


CPU times: user 42 s, sys: 179 ms, total: 42.2 s
Wall time: 42.3 s


11719

In [65]:
list_ite[0]


[356475, 64279]

In [66]:
%%time

v = DictVectorizer(sparse = False)
d = [{item: 1 for item in ui} for ui in list_ite]
user_interest_mat = v.fit_transform(d)
user_interest_mat


CPU times: user 258 ms, sys: 72.2 ms, total: 331 ms
Wall time: 330 ms


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [67]:
user_interest_mat.shape


(11719, 35813)

In [68]:
%%time

user_cosine = cosine_similarity(user_interest_mat, user_interest_mat)
len(user_cosine)


CPU times: user 4min 20s, sys: 4.96 s, total: 4min 25s
Wall time: 1min 14s


11719

#### Построим модель для обучения:

In [69]:
# Построим функцию рекомендации товаров:

def most_similar_users_to(user_id):
    user = user_cosine[user_id]
    
    pairs = [(other_user_id, similarity) 
             for other_user_id, similarity in enumerate(user) 
             if user_id != other_user_id and similarity > 0]  

    return sorted(pairs, key = lambda x: x[1], reverse = True)

def user_based_suggestions(user_id):
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in list_ite[other_user_id]:
            suggestions[interest] += similarity

    suggestions = sorted(suggestions.items(), key = lambda x: x[1], reverse = True)

    return [i[0] for i in suggestions][:5]
    
print(user_based_suggestions(275))


[448044, 393144, 210279, 309499, 187374]


In [70]:
from scipy.spatial.distance import cosine

def get_recommend_item(id_client):
    if id_client in customer_purchased:
        id_cl = list(customer_purchased).index(id_client)
        return user_based_suggestions(id_cl)
    
    else:
        k = events_view[events_view['visitorid'] == id_client]['itemid'].unique().tolist()
        
        list_ite_2 = list_ite.copy()
        list_ite_2.append(k)
        
        v = DictVectorizer(sparse = False)
        d = [{item: 1 for item in ui} for ui in list_ite_2]
        user_inter_matrix = v.fit_transform(d)
        
        cos_list = cosine_similarity([user_inter_matrix[-1]], user_inter_matrix)
        
        pairs = [(other_user_id, similarity) 
                 for other_user_id, similarity in enumerate(cos_list[0][:-1]) 
                 if similarity > 0]
        
        sor_rai = sorted(pairs, key = lambda x: x[1], reverse = True)
        
        suggestions = defaultdict(float)
        for other_user_id, similarity in sor_rai:
            for interest in list_ite[other_user_id]:
                suggestions[interest] += similarity

        suggestions = sorted(suggestions.items(), key = lambda x: x[1], reverse = True)

        return [i[0] for i in suggestions][:5]
        

In [71]:
3432 in customer_purchased


False

In [72]:
get_recommend_item(3432)


[13193, 157846, 139770, 114250, 260152]

In [73]:
customer_purchased[0]


599528

In [74]:
get_recommend_item(599528)


[356475, 64279, 58059, 268755, 317823]

# Итоговая рекомендация:

In [75]:
rest_user = [i for i in all_user if (i not in customer_purchased) or (i not in users_tocart)]
len(rest_user)


1407580

In [76]:
def itog_recomend(id_client):
    if id_client in customer_purchased:
        id_cl = list(customer_purchased).index(id_client)
        return display_recommended_items(id_cl)
    elif id_client in users_tocart:
        id_cl = list(users_tocart).index(id_client)
        return display_item_to_items_recommendations(id_cl)
    elif id_client in rest_user:
        return get_recommend_item(id_client)
    else:
        return recomend_for_new_client
        

In [77]:
# Сначала проверим на новом пользователе:

322222999 in all_user


False

In [78]:
itog_recomend(322222999)


[461686, 119736, 213834, 312728, 7943]

In [79]:
# Теперь посмотрим на существующих клиентах:


In [80]:
print(customer_purchased[0],
      users_tocart[5],
      rest_user[1])
      

599528 772309 992329


In [81]:
itog_recomend(599528)


[9846, 2716, 612, 311, 5111]

In [82]:
itog_recomend(772309)


[6458, 1426, 11637, 13176, 3710]

In [83]:
itog_recomend(992329)


[48030, 446522, 340825, 372169, 119736]

In [84]:
# Несовсем наглядно получается, но такие безымянные данные)
