In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.sparse import csc_matrix
from sklearn.neighbors import NearestNeighbors
%matplotlib inline

# Threshold on the per-product order count: the selection cell keeps products
# whose count is strictly greater than this value.
count_votes = 11

In [4]:
# Product catalogue: semicolon-separated CSV, indexed by product_id.
products = pd.read_csv(
    '../data/products.csv.gz',
    sep=';',
    encoding='UTF-8',
    index_col='product_id',
)

In [5]:
# Order lines: semicolon-separated CSV, default integer row index.
order_products = pd.read_csv(
    '../data/order_products.csv.gz',
    sep=';',
    encoding='UTF-8',
)

In [6]:
# (rows, columns) of the product catalogue and of the order lines.
products.shape, order_products.shape

((17540, 1), (74574, 2))

In [7]:
# Preview the first rows of the product catalogue.
products.head()

Unnamed: 0_level_0,name
product_id,Unnamed: 1_level_1
154,АК-47
6236,Комар
6237,Куда летим?
152,DOZOR
6197,Угловой цветочный узор


In [8]:
# Preview the first (product_id, order_id) rows of the order lines.
order_products.head()

Unnamed: 0,product_id,order_id
0,259,37
1,154,37
2,156,47
3,178,46
4,156,25


In [9]:
# Per product: number of non-null order_id rows, stored in a single 'count' column.
order_counts = (
    order_products
    .groupby('product_id')['order_id']
    .agg(['count'])
)
# Show the five products with the highest counts.
order_counts.sort_values('count', ascending=False).head()

Unnamed: 0_level_0,count
product_id,Unnamed: 1_level_1
505,797
156,751
521,698
968,609
1275,575


In [10]:
# Summary statistics of the per-product counts.
order_counts.describe()

Unnamed: 0,count
count,9097.0
mean,8.197648
std,24.977427
min,1.0
25%,1.0
50%,3.0
75%,7.0
max,797.0


In [11]:
# Print the 75%..100% quantiles of the per-product counts, one line per percent.
quantiles = np.arange(75, 101, 1, dtype=int)
line_template = "Кватниль {}% с количеством оценок: {} шт."

for quantile in quantiles:
    # Convert the integer percent to a fraction, rounded to two decimals.
    level = round(quantile / 100, 2)
    c = order_counts['count'].quantile(level)
    print(line_template.format(quantile, c))

Кватниль 75% с количеством оценок: 7.0 шт.
Кватниль 76% с количеством оценок: 7.0 шт.
Кватниль 77% с количеством оценок: 8.0 шт.
Кватниль 78% с количеством оценок: 8.0 шт.
Кватниль 79% с количеством оценок: 8.0 шт.
Кватниль 80% с количеством оценок: 9.0 шт.
Кватниль 81% с количеством оценок: 9.0 шт.
Кватниль 82% с количеством оценок: 10.0 шт.
Кватниль 83% с количеством оценок: 10.0 шт.
Кватниль 84% с количеством оценок: 11.0 шт.
Кватниль 85% с количеством оценок: 12.0 шт.
Кватниль 86% с количеством оценок: 12.0 шт.
Кватниль 87% с количеством оценок: 14.0 шт.
Кватниль 88% с количеством оценок: 15.0 шт.
Кватниль 89% с количеством оценок: 16.0 шт.
Кватниль 90% с количеством оценок: 17.0 шт.
Кватниль 91% с количеством оценок: 19.0 шт.
Кватниль 92% с количеством оценок: 21.0 шт.
Кватниль 93% с количеством оценок: 23.0 шт.
Кватниль 94% с количеством оценок: 26.0 шт.
Кватниль 95% с количеством оценок: 29.0 шт.
Кватниль 96% с количеством оценок: 35.0 шт.
Кватниль 97% с количеством оценок: 44.0

In [17]:
# Keep only the products whose count is strictly greater than `count_votes`.
selected_products = order_counts[order_counts['count'] > count_votes].index
# The comparison is strict, so a product with exactly `count_votes` rows is
# excluded; the message therefore reads "more than N orders" rather than
# "N or more", which would misreport the filter.
print("{} товаров, которые встречаются более чем в {} заказах".format(selected_products.shape[0], count_votes))

1368 товаров, которые встречаются в 11 и более заказах


In [18]:
# Restrict the order lines to rows whose product_id is among the selected products.
keep_rows = order_products['product_id'].isin(selected_products)
selected_order_products = order_products[keep_rows]

In [19]:
# Build a pivot table with product_id as rows and order_id as columns.
# The aggregator returns the constant 1 for every (product, order) group that
# exists; combinations that never occur come out as NaN and are filled with 0.
# NOTE(review): no `values=` column is passed, so the result depends on how the
# installed pandas version handles a pivot without value columns - confirm
# before upgrading pandas.
selected_order_products_pivot = selected_order_products.pivot_table(index='product_id', 
                                                                    columns='order_id',
                                                                    aggfunc=lambda x : 1
                                                                   ).fillna(0)

In [20]:
# Pivot dimensions: (number of product rows, number of order columns).
selected_order_products_pivot.shape

(1368, 22052)

In [21]:
# Preview the first rows of the product x order pivot.
selected_order_products_pivot.head()

order_id,25,30,33,37,46,47,49,54,60,65,...,86647,86665,86682,86683,86686,86698,86717,86748,86754,86831
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Row index of the pivot (product_id labels); used to translate row positions
# returned by the neighbour search back into product ids.
product_id_map = selected_order_products_pivot.index

In [23]:
# Convert the pivot into a SciPy compressed-sparse-column matrix
# (rows follow the pivot's product order).
order_products_compressed = csc_matrix(selected_order_products_pivot)

# Обучение

In [24]:
# Neighbour search over the product rows using the cosine metric.
nearest = NearestNeighbors(metric='cosine')
# Fit on the sparse product x order matrix; %time reports how long the fit takes.
%time nearest.fit(order_products_compressed)

CPU times: user 1.68 ms, sys: 280 µs, total: 1.96 ms
Wall time: 1.77 ms


NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [36]:
def _neighbor_product_ids(product_id, n):
    """Return up to ``n`` product ids closest to ``product_id``, query excluded.

    Shared lookup behind ``similar`` and ``related_product``.

    The query product is removed by comparing ids, not by dropping the first
    result: when another product has an identical row in the pivot (distance
    tie), the query is not guaranteed to come back in the first position, and
    a positional drop would discard a real neighbour and return the query.

    Parameters
    ----------
    product_id : label present in ``selected_order_products_pivot.index``.
    n : maximum number of neighbours to return.

    Returns
    -------
    pandas.Index (named like the pivot index) ordered as returned by
    ``nearest.kneighbors``, i.e. by increasing distance.

    Raises
    ------
    KeyError if ``product_id`` is not a row of the pivot.
    """
    catalog = selected_order_products_pivot.index
    # Single-row 2D array holding the query product's order vector.
    query = selected_order_products_pivot.loc[[product_id]].values
    # One extra neighbour leaves room for the query itself; never request more
    # neighbours than there are product rows.
    k = min(n + 1, len(catalog))
    positions = nearest.kneighbors(query, n_neighbors=k, return_distance=False)[0]
    neighbor_ids = catalog[positions]
    return neighbor_ids[neighbor_ids != product_id][:n]


def similar(product_id=156, n=6):
    """Return a DataFrame with one ``product_id`` column listing the ``n``
    products nearest to ``product_id`` (the query product is excluded)."""
    return pd.DataFrame(data=_neighbor_product_ids(product_id, n))


def related_product(product_id, n=6):
    """Return an Index with the ids of the ``n`` products nearest to
    ``product_id`` (the query product is excluded)."""
    return _neighbor_product_ids(product_id, n)


In [26]:
# Names of the products returned by similar() for product 156.
products.loc[similar(156)['product_id']]

Unnamed: 0_level_0,name
product_id,Unnamed: 1_level_1
456,"ВДВ, Никто кроме нас 2"
460,&quot;За ВДВ&quot;
458,"ВДВ, парашют"
459,"ВДВ, парашют. Вариант 2."
897,Наклейка круглая &quot;ВДВ&quot;
8418,Бывших десантников не бывает


In [27]:
# Names of the products returned by similar() for product 505.
products.loc[similar(505)['product_id']]

Unnamed: 0_level_0,name
product_id,Unnamed: 1_level_1
590,One life... One love...
521,Бесплатная пробная наклейка (машина)
632,Долбит нормально
968,Иероглиф &quot;Ангел-хранитель&quot; (вертикал...
667,Yahooею с этих дорог
622,Моя жизнь... Мои правила...


In [168]:
# Same lookup for product 5822; %time reports how long the query and name lookup take.
%time products.loc[similar(5822)['product_id']]

CPU times: user 13.1 ms, sys: 0 ns, total: 13.1 ms
Wall time: 12.4 ms


Unnamed: 0_level_0,name
product_id,Unnamed: 1_level_1
14607,Стильбешеный
4329,Классика всегда в моде. Ваз.
12178,"Ломай, чини, жигули"
193,sedan syndicate 2107
1165,"Не жди принца на белом PORSHE, люби пацана на ..."
5538,Dolbit pizdato (Долбит пиздато)


In [41]:
# Names of the products returned by related_product() for product 5538.
products.loc[related_product(5538)]

Unnamed: 0_level_0,name
product_id,Unnamed: 1_level_1
10043,Heineken
5500,Ребенок в машине. Миньон
6701,Череп 101
14435,alphard
3845,Кот и паук
4687,bbs


In [38]:
# Raw product ids returned by related_product() for product 5538.
related_product(5538)

Int64Index([10043, 5500, 6701, 14435, 3845, 4687], dtype='int64', name='product_id')

In [78]:
# For every selected product, collect its 12 related products as
# (product_id, related_id) pairs.
# The pairs are gathered in a list and the frame is built once: assigning rows
# one at a time with `out_pd.loc[i] = ...` enlarges the DataFrame on every
# iteration, which is very slow for thousands of rows.
related_pairs = [
    (product_id, related_id)
    for product_id in selected_products
    for related_id in related_product(product_id, 12)
]
out_pd = pd.DataFrame(related_pairs, columns=['product_id', 'related_id'])

In [82]:
# Write the (product_id, related_id) pairs as a semicolon-separated CSV,
# without the row index.
export_columns = ['product_id', 'related_id']
out_pd.to_csv('../data/product_related.csv', sep=';', columns=export_columns, index=False)