## Получаем данные

In [2]:
import pandas as pd

In [3]:
import numpy as np
from tqdm import tqdm_notebook

In [4]:
df_ratings = pd.read_csv('ratings1M.csv')
df_movies = pd.read_csv('movies1M.csv', encoding= 'unicode_escape')
df_users = pd.read_csv('users1M.csv')

In [5]:
df_ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
df_movies.head()

Unnamed: 0,movie_id,movie,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
df_ratings.columns = ['userId','movieId','rating','timestamp']

In [8]:
df_movies.columns = ['movieId','movie','genres']

In [9]:
df = pd.merge(df_ratings, df_movies, on='movieId')

In [10]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,movie,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [11]:
del df['timestamp']
del df['genres']

In [12]:
df.head()

Unnamed: 0,userId,movieId,rating,movie
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975)


In [13]:
df['rating'].value_counts()

4    348971
3    261197
5    226310
2    107557
1     56174
Name: rating, dtype: int64

## Формируем векторное описание для фильма

In [14]:
df['userId'].max()

6040

In [15]:
MAX_USER_ID = 6040
MIN_USER_ID = 1

In [16]:
movie_names = df['movie'].unique()

In [17]:
movie_names = movie_names.tolist()

In [18]:
movie_names[:10]

["One Flew Over the Cuckoo's Nest (1975)",
 'James and the Giant Peach (1996)',
 'My Fair Lady (1964)',
 'Erin Brockovich (2000)',
 "Bug's Life, A (1998)",
 'Princess Bride, The (1987)',
 'Ben-Hur (1959)',
 'Christmas Story, A (1983)',
 'Snow White and the Seven Dwarfs (1937)',
 'Wizard of Oz, The (1939)']

In [19]:
movie_to_vector = {}

In [20]:
for movie in tqdm_notebook(movie_names):
    movie_to_vector[movie] = np.zeros((MAX_USER_ID,))
    for r in df[df['movie'] == movie].iterrows():
        movie_to_vector[movie][r[1]['userId'] - MIN_USER_ID] = r[1]['rating']

HBox(children=(IntProgress(value=0, max=3706), HTML(value='')))




In [21]:
movie_to_vector['Toy Story (1995)']

array([5., 0., 0., ..., 0., 0., 3.])

## Ищем похожие

In [22]:
def find_similar(movie, dist_func, top=10):
    distances = {}
    target_movie = movie_to_vector[movie]
    for m in movie_names:
        distances[m] = dist_func(target_movie, movie_to_vector[m])
        
    distances_with_idx = [(i, distances[m]) for i, m in enumerate(movie_names)]
    distances_with_idx = sorted(distances_with_idx, key=lambda t: t[1], reverse=False)
    distances_with_idx = distances_with_idx[:top]
    
    return [(movie_names[i], d) for i, d in distances_with_idx]

In [23]:
from scipy.spatial.distance import cosine, euclidean, cityblock

In [24]:
distances = {}

In [25]:
movie = 'Toy Story (1995)'

In [26]:
target_movie = movie_to_vector[movie]

In [27]:
target_movie

array([5., 0., 0., ..., 0., 0., 3.])

In [28]:
for m in movie_names:
    distances[m] = euclidean(target_movie, movie_to_vector[m])

In [29]:
distances['Heat (1995)']

189.18509455028428

In [30]:
list(distances.keys())[2]

'My Fair Lady (1964)'

In [31]:
distances_with_idx = [(i, distances[m]) for i, m in enumerate(movie_names)]

In [32]:
distances_with_idx = sorted(distances_with_idx, key=lambda t: t[1], reverse=False)

In [33]:
distances_with_idx[:10]

[(40, 0.0),
 (33, 155.80436450882883),
 (50, 157.34675084030175),
 (390, 164.0579166026437),
 (4, 165.4297433957993),
 (10, 166.30093204789924),
 (537, 167.29614460590537),
 (20, 169.42254867637897),
 (381, 170.21457046915813),
 (514, 170.220445305492)]

In [34]:
[(movie_names[i], d) for i, d in distances_with_idx[:10]]

[('Toy Story (1995)', 0.0),
 ('Aladdin (1992)', 155.80436450882883),
 ('Toy Story 2 (1999)', 157.34675084030175),
 ('Lion King, The (1994)', 164.0579166026437),
 ("Bug's Life, A (1998)", 165.4297433957993),
 ('Beauty and the Beast (1991)', 166.30093204789924),
 ("Wayne's World (1992)", 167.29614460590537),
 ('Pleasantville (1998)', 169.42254867637897),
 ('Babe (1995)', 170.21457046915813),
 ('Mask, The (1994)', 170.220445305492)]

In [35]:
# [i for i in smth] <- List Comprehension

In [36]:
find_similar('Toy Story (1995)', cityblock)

[('Toy Story (1995)', 0.0),
 ('Toy Story 2 (1999)', 6224.0),
 ('Aladdin (1992)', 6275.0),
 ('Lion King, The (1994)', 6747.0),
 ('Beauty and the Beast (1991)', 6882.0),
 ("Wayne's World (1992)", 7052.0),
 ("Bug's Life, A (1998)", 7071.0),
 ('Pleasantville (1998)', 7210.0),
 ('Little Mermaid, The (1989)', 7239.0),
 ('Austin Powers: International Man of Mystery (1997)', 7306.0)]

## User 2 Item

In [37]:
#!pip install surprise

In [38]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import SVD

from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

In [39]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [40]:
df_for_surpise = df_ratings[['userId', 'movieId', 'rating']]

In [41]:
df_for_surpise.columns = ['uid', 'iid', 'rating']

In [42]:
df_for_surpise.head()

Unnamed: 0,uid,iid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [43]:
reader = Reader(rating_scale=(0.5, 5))

In [44]:
dataset = Dataset.load_from_df(df_for_surpise, reader)

In [45]:
trainset, testset = train_test_split(dataset, test_size=0.2)

In [46]:
#algo = KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True})
algo = SVD()

In [47]:
kf = KFold(n_splits=10)

for trainset, testset in kf.split(dataset):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.8681
RMSE: 0.8625
RMSE: 0.8648
RMSE: 0.8680
RMSE: 0.8649
RMSE: 0.8657
RMSE: 0.8639
RMSE: 0.8666
RMSE: 0.8675
RMSE: 0.8669


In [48]:
#cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [49]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x83c573990>

In [50]:
predictions = algo.test(testset)

In [55]:
accuracy.rmse(predictions)

RMSE: 0.8657


0.8657172163581556