In [132]:
# импорт необходимых библиотек

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse as sparse

from implicit.als import AlternatingLeastSquares

In [133]:
# загрузка файлов

ratings = pd.read_csv("ml-latest-small/ratings.csv")
tags= pd.read_csv("ml-latest-small/tags.csv")
movies = pd.read_csv("ml-latest-small/movies.csv")

In [148]:
ratings[['userId', 'movieId', 'rating']]

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [135]:
movies['genres'].str.split("|")

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [136]:
# джойним / мержим два датафрейма
# ссылка на документацию: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

movies = movies.merge(ratings, on = 'movieId', how = 'inner')

In [137]:
df = movies.groupby(by=['userId','movieId'], as_index=False, sort=False).mean()
df = df[['userId','movieId','rating']]
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5
...,...,...,...
100831,184,193581,4.0
100832,184,193583,3.5
100833,184,193585,3.5
100834,184,193587,3.5


In [149]:
users_ = list(np.sort(df.userId.unique()))
len(users_)

610

In [140]:
movies_ = list(np.sort(df.movieId.unique()))
len(movies_)

9724

In [150]:
cols = np.array(df.userId)
rows = np.array(df.movieId)
ratings_ = np.array(df.rating)

In [106]:
print(len(rows), '\n', len(cols), '\n', len(ratings_))

100836 
 100836 
 100836


In [151]:
df_sparse = sparse.csr_matrix( (ratings_, (cols, rows)))
df_sparse.shape

(611, 193610)

In [159]:
df_sparse.toarray()

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 4. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 2.5, 2. , ..., 0. , 0. , 0. ],
       [0. , 3. , 0. , ..., 0. , 0. , 0. ],
       [0. , 5. , 0. , ..., 0. , 0. , 0. ]])

In [153]:
model = AlternatingLeastSquares(factors=50)
model.fit(df_sparse)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [154]:
model.item_factors

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02786652,  0.02147464, -0.00460342, ...,  0.01346934,
         0.02956764,  0.04700077],
       [-0.00407796,  0.01597944, -0.00648023, ...,  0.00037356,
         0.00284217, -0.00409868],
       ...,
       [ 0.07211667, -0.01578344, -0.0062459 , ...,  0.00738454,
         0.05241868,  0.0621392 ],
       [-0.00074078, -0.00011511,  0.0071687 , ...,  0.00574105,
         0.01179282,  0.00308968],
       [-0.00346134,  0.01374036,  0.03526111, ...,  0.06542251,
         0.03764272, -0.03696858]], dtype=float32)

In [155]:
userid = 8

user_items = df_sparse.T.tocsr()
recommendations = model.recommend(userid, user_items)

In [112]:
recommendations

[(240, 0.3423484),
 (117, 0.315558),
 (84, 0.30968314),
 (181, 0.3074137),
 (19, 0.28690326),
 (140, 0.28248367),
 (136, 0.27643985),
 (40, 0.2688672),
 (559, 0.2667442),
 (385, 0.2657174)]

In [157]:
rec_np = np.array(recommendations)
rec_np = rec_np[:,0].astype(int)
rec_np

array([ 19, 240,  84, 181, 385, 117, 232, 136, 492, 559])

In [158]:
movies[movies['movieId'].isin(rec_np)].title.unique()

array(['Ace Ventura: When Nature Calls (1995)',
       "Young Poisoner's Handbook, The (1995)",
       'Mighty Morphin Power Rangers: The Movie (1995)',
       'Eat Drink Man Woman (Yin shi nan nu) (1994)', 'Hideaway (1995)',
       'Man of No Importance, A (1994)',
       'Manhattan Murder Mystery (1993)'], dtype=object)

In [168]:
movies.loc[movies['userId'] == 8].head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
216,2,Jumanji (1995),Adventure|Children|Fantasy,8,4.0,839463806
614,10,GoldenEye (1995),Action|Adventure|Thriller,8,2.0,839463509
746,11,"American President, The (1995)",Comedy|Drama|Romance,8,4.0,839463806
1148,21,Get Shorty (1995),Comedy|Crime|Thriller,8,4.0,839463564
1504,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,8,3.0,839463624


In [169]:
model.explain(8, user_items, 240)

(0.32830786605421164,
 [(43, 0.14158580294522433),
  (6, 0.0991188933022389),
  (436, 0.06596790114743485),
  (501, 0.012984827611518432),
  (274, 0.011993509436535321),
  (20, 0.007666277901330336),
  (414, -0.004159165988348607),
  (372, -0.006850180301721912)],
 (array([[ 4.72093605e-01,  9.30281623e-03,  4.49717226e-02, ...,
           1.69625675e-02,  2.54130770e-02,  7.15818469e-02],
         [ 4.39180005e-03,  4.80538911e-01,  2.21992540e-02, ...,
           1.20035160e-02,  3.40256099e-02,  2.21469561e-02],
         [ 2.12308627e-02,  1.10859690e-02,  4.85155563e-01, ...,
           6.01713256e-02,  6.26151048e-03,  7.89746105e-02],
         ...,
         [ 8.00791966e-03,  5.92595613e-03,  3.02217583e-02, ...,
           4.67924840e-01, -2.16559894e-04,  9.39297465e-03],
         [ 1.19973511e-02,  1.65870427e-02,  4.93601965e-03, ...,
           7.99428586e-03,  4.51949850e-01,  1.67712659e-02],
         [ 3.37933321e-02,  1.13083869e-02,  4.20257764e-02, ...,
           2.16

In [194]:
similarm = model.similar_items(6)
similarm

[(6, 0.18267044),
 (181, 0.15779851),
 (43, 0.14128067),
 (240, 0.13771954),
 (117, 0.13305882),
 (559, 0.13162501),
 (411, 0.12850104),
 (58, 0.12682943),
 (436, 0.12527357),
 (109, 0.121393666)]