In [1]:
import re
import gensim
import numpy as np
import pandas as pd
from scipy import spatial
from lightfm import LightFM
from sklearn.metrics import pairwise_distances
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix
from sklearn.metrics.pairwise import cosine_similarity
from math import atan2
from scipy.sparse import load_npz
import scipy.sparse
from sklearn.decomposition import TruncatedSVD



In [2]:
Products_features = load_npz('item_features_matrix.npz')

In [3]:
I = pd.read_csv("Interactions.csv", delimiter=',')

In [4]:
I.shape

(12913475, 3)

In [5]:
print(I.product_id.max())
print(I.product_id.nunique())

509399
383771


In [6]:
#Отделяем id товаров от фичей
Products_features = Products_features.tocsc()
ids = Products_features[:,0]
Products_features = Products_features[:,1:]

In [7]:
svd = TruncatedSVD(n_components=2)
svd.fit(Products_features)
Products_features = svd.transform(Products_features)

In [8]:
Products_features

array([[ 0.06762964,  0.03255733],
       [ 0.10391546,  0.05141103],
       [ 0.04870719,  0.01914751],
       ...,
       [ 0.1627247 ,  0.43340185],
       [ 0.03302475,  0.00945652],
       [ 0.08098272, -0.00817672]])

In [9]:
t_df = pd.DataFrame(data={'id': [i[0] for i in ids[:,0].toarray().astype(int)],'x':[i for i in Products_features[:,0]], 'y':[i for i in Products_features[:,1]]})

In [10]:
t_df.head()

Unnamed: 0,id,x,y
0,335486,0.06763,0.032557
1,322530,0.103915,0.051411
2,57032,0.048707,0.019148
3,56996,0.061338,0.028638
4,37084,0.116709,0.24688


# Content-based


In [11]:
#отсортируем все по отношению к вектору (1,0) используя в качестве меры угол. Эта метрика в случае двумерного пространства идентична косинусной мере между двумя товарами. Нужно только отдельно рассмотреть пятерку товаров по краям списка ( изза свойства угла fi = fi + 2pi). Но в данном случае они (товары) очень близки к друг другу по краям списка. Такая мера в разы ускоряет вычисления
def calc_angle(point):
    return atan2(point[1],point[0])


In [12]:
#t_df['cos']= cosine_similarity(np.array([1,0]).reshape(1,2), t_df[['x','y']].values[:,:])[0]

In [13]:
t_df['angle'] = [calc_angle(point) for point in t_df[['x','y']].values[:,:]]

In [14]:
t_df.head()

Unnamed: 0,id,x,y,angle
0,335486,0.06763,0.032557,0.448662
1,322530,0.103915,0.051411,0.45943
2,57032,0.048707,0.019148,0.374557
3,56996,0.061338,0.028638,0.436804
4,37084,0.116709,0.24688,1.129198


In [15]:
t_df = t_df.sort_values(by='angle')
t_df = t_df.reset_index(drop=True)
t_df.head()

Unnamed: 0,id,x,y,angle
0,108281,0.372808,-0.277538,-0.639942
1,143309,0.372808,-0.277538,-0.639942
2,425780,0.372808,-0.277538,-0.639942
3,121940,0.372808,-0.277538,-0.639942
4,326344,0.372808,-0.277538,-0.639942


In [16]:
#save csv
with open('RusetskyAlexander1.csv','w') as out:
    for idx in range(t_df.shape[0]-5):
        row =        t_df.iloc[idx] 
        cur_angle = row.angle
        for i in range(1,6):
            out.write(str(int(row.id))+' '+str(int(t_df.iloc[idx + i].id))+' '+str(t_df.iloc[idx + i].angle - cur_angle)+'\n' )
    for idx in range(t_df.shape[0]-1, t_df.shape[0]-1-5,-1):
        row =        t_df.iloc[idx] 
        cur_angle = row.angle
        for i in range(1,6):
            out.write(str(int(row.id))+' '+str(int(t_df.iloc[idx - i].id))+' '+str(t_df.iloc[idx - i].angle - cur_angle)+'\n' )

# Коллаборативная фильтрация

In [17]:
I.head()

Unnamed: 0,vid,product_id,page_type
0,0,0,PRODUCT
1,1,1,PRODUCT
2,3,3,CART
3,4,4,PURCHASE
4,5,5,PRODUCT


In [18]:
I = I[pd.to_numeric(I['vid'], errors='coerce').notnull()]
I = I[pd.to_numeric(I['product_id'], errors='coerce').notnull()]

In [19]:
print(I.product_id.nunique(), I.product_id.max())

383771 509399


In [20]:
rates = {'PRODUCT':1, 'CART':2, 'PURCHASE':3}

In [21]:
#создаем матрицу взаимодейстий (товар х пользователи)
coo = coo_matrix((np.array([rates[i] for i in I.page_type.values]),(I.product_id.values, I.vid.values)))
#I2 = coo_matrix((I.product_id.max(), I.vid.max()), dtype=np.int8)

In [22]:
#понижаем ее размерность
svd = TruncatedSVD(n_components=2)
coo2 =svd.fit_transform(coo)


In [23]:
p_df = pd.DataFrame(data={'product_id':range(len(coo2[:,0])),'x':[i for i in coo2[:,0]], 'y':[i for i in coo2[:,1]]})

In [24]:
p_df.head()
p_df =p_df[p_df.product_id.isin(I.product_id)]

In [25]:
#дата фрейм - товаром и их двумерных фичей
p_df.head()

Unnamed: 0,product_id,x,y
0,0,0.000337,4.634345e-05
1,1,9.7e-05,-0.0001843927
3,3,2e-06,8.782123e-05
4,4,3.4e-05,4.494056e-07
5,5,0.002145,0.0007440253


In [26]:
p_df['angle'] = [calc_angle(point) for point in p_df[['x','y']].values[:,:]]

In [27]:
p_df=t_df.sort_values(by='angle')
p_df.head()

Unnamed: 0,id,x,y,angle
0,108281,0.372808,-0.277538,-0.639942
636,398257,0.372808,-0.277538,-0.639942
637,260095,0.372808,-0.277538,-0.639942
638,390836,0.372808,-0.277538,-0.639942
639,445126,0.372808,-0.277538,-0.639942


In [28]:
#save csv
with open('RusetskyAlexander2.csv','w') as out:
    for idx in range(p_df.shape[0]-5):
        row =        p_df.iloc[idx] 
        cur_angle = row.angle
        for i in range(1,6):
            out.write(str(int(row.id))+' '+str(int(p_df.iloc[idx + i].id))+' '+str(p_df.iloc[idx + i].angle - cur_angle)+'\n' )
    for idx in range(p_df.shape[0]-1, p_df.shape[0]-1-5,-1):
        row =        p_df.iloc[idx] 
        cur_angle = row.angle
        for i in range(1,6):
            out.write(str(int(row.id))+' '+str(int(p_df.iloc[idx - i].id))+' '+str(p_df.iloc[idx - i].angle - cur_angle)+'\n' )

# Матричная факторизация

In [29]:
coo.shape

(509400, 975006)

In [30]:
model = LightFM(loss='warp', no_components=100, learning_rate=0.03, learning_schedule="adadelta")

In [31]:
model.fit(coo, epochs=5, num_threads=40, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


<lightfm.lightfm.LightFM at 0x1a2662aad0>

In [32]:
user_feature_bias, user_feature_embeddings = model.get_user_representations()
item_feature_bias, item_feature_embeddings = model.get_item_representations()

In [33]:
#use user_feature_emb (такая входная матрица)
#сжимаем 
svd = TruncatedSVD(n_components=2)
coo3 =svd.fit_transform(user_feature_embeddings)

In [34]:
m_df = pd.DataFrame(data={'product_id':range(len(coo3[:,0])),'x':[i for i in coo3[:,0]], 'y':[i for i in coo3[:,1]]})
m_df =m_df[m_df.product_id.isin(I.product_id)]

In [35]:
m_df['angle'] = [calc_angle(point) for point in m_df[['x','y']].values[:,:]]

In [36]:
m_df=t_df.sort_values(by='angle')
m_df.head()

Unnamed: 0,id,x,y,angle
0,108281,0.372808,-0.277538,-0.639942
636,398257,0.372808,-0.277538,-0.639942
637,260095,0.372808,-0.277538,-0.639942
638,390836,0.372808,-0.277538,-0.639942
639,445126,0.372808,-0.277538,-0.639942


In [37]:
with open('RusetskyAlexander3.csv','w') as out:
    for idx in range(m_df.shape[0]-5):
        row =        m_df.iloc[idx] 
        cur_angle = row.angle
        for i in range(1,6):
            out.write(str(int(row.id))+' '+str(int(m_df.iloc[idx + i].id))+' '+str(m_df.iloc[idx + i].angle - cur_angle)+'\n' )
    for idx in range(m_df.shape[0]-1, m_df.shape[0]-1-5,-1):
        row =        m_df.iloc[idx] 
        cur_angle = row.angle
        for i in range(1,6):
            out.write(str(int(row.id))+' '+str(int(m_df.iloc[idx - i].id))+' '+str(m_df.iloc[idx - i].angle - cur_angle)+'\n' )