In [1]:
import pandas as pd

In [2]:
data_movies = pd.read_csv('/Users/andrejzivoj/Documents/datasets/movies.csv')
data_users = pd.read_csv('/Users/andrejzivoj/Documents/datasets/ratings.csv')

display(data_movies.head())
display(data_users.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# EDA

In [3]:
print(data_movies.isnull().sum())
print(data_users.isnull().sum())

movieId    0
title      0
genres     0
dtype: int64
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [4]:
mean_rating = data_users.groupby(['userId']).agg({'movieId':['count'],'rating':['mean']})
mean_rating.columns = mean_rating.columns.map('_'.join)
mean_rating.reset_index(inplace=True)
display(mean_rating)

mean_watched = data_users.groupby(['movieId']).agg({'userId':['count'],'rating':['mean']})
mean_watched.columns = mean_watched.columns.map('_'.join)
mean_watched.reset_index(inplace=True)
display(mean_watched)

high_rating_film_id = mean_watched[mean_watched.index == mean_watched.rating_mean.idxmax()]['movieId'].values
low_rating_film_id = mean_watched[mean_watched.index == mean_watched.rating_mean.idxmin()]['movieId'].values
print('Movie with high rate')
display(data_movies[data_movies['movieId'] == high_rating_film_id[0]])
print('Movie with low rate')
display(data_movies[data_movies['movieId'] == low_rating_film_id[0]])

Unnamed: 0,userId,movieId_count,rating_mean
0,1,232,4.366379
1,2,29,3.948276
2,3,39,2.435897
3,4,216,3.555556
4,5,44,3.636364
...,...,...,...
605,606,1115,3.657399
606,607,187,3.786096
607,608,831,3.134176
608,609,37,3.270270


Unnamed: 0,movieId,userId_count,rating_mean
0,1,215,3.920930
1,2,110,3.431818
2,3,52,3.259615
3,4,7,2.357143
4,5,49,3.071429
...,...,...,...
9719,193581,1,4.000000
9720,193583,1,3.500000
9721,193585,1,3.500000
9722,193587,1,3.500000


Movie with high rate


Unnamed: 0,movieId,title,genres
48,53,Lamerica (1994),Adventure|Drama


Movie with low rate


Unnamed: 0,movieId,title,genres
2689,3604,Gypsy (1962),Musical


In [5]:
print(f'All films: {mean_watched.movieId.count()} | All users: {mean_rating.movieId_count.count()}')
print(f'Mean films wathced by user: {mean_rating.movieId_count.mean()}')
print(f'Mean rating from user: {mean_rating.rating_mean.mean()}')
print(f'High rate: {data_movies['title'].loc[data_movies['movieId'] == high_rating_film_id[0]].values[0]} \
| Rate: {mean_watched['rating_mean'].loc[mean_watched.index == mean_watched.rating_mean.idxmax()].values[0]}')
print(f'Low rate:: {data_movies['title'].loc[data_movies['movieId'] == low_rating_film_id[0]].values[0]} \
| Rate: {mean_watched['rating_mean'].loc[mean_watched.index == mean_watched.rating_mean.idxmin()].values[0]}')

All films: 9724 | All users: 610
Mean films wathced by user: 165.30491803278687
Mean rating from user: 3.6572223377474
High rate: Lamerica (1994) | Rate: 5.0
Low rate:: Gypsy (1962) | Rate: 0.5


# Prepare data

In [6]:
import numpy as np
from scipy.sparse import csr_matrix

def create_matrix(df):
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())
    
    # ID в индексы
    user_mapper = dict(zip(np.unique(df['userId']), list(range(N))))
    movie_mapper = dict(zip(np.unique(df['movieId']), list(range(M))))

    # Индексы в ID
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df['userId'])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df['movieId']))) 

    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df['rating'], (movie_index, user_index)), shape=(M, N))
                        
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
                        
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(data_users)

# K-Nearest-Neighbors

In [7]:
from sklearn.neighbors import NearestNeighbors

# Find similar movies using KNN
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    neighbor_ids = []
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_id]
    k+=1
    knn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=metric)
    knn.fit(X)
    movie_vec = movie_vec.reshape(1, -1)
    neighbor = knn.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0,k):
        n = neighbor.item(i)
        neighbor_ids.append(movie_inv_mapper[n])
    neighbor_ids.pop(0)
    return neighbor_ids

movie_titles = dict(zip(data_movies['movieId'], data_movies['title']))
movie_id=13
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
print(f"Если вам понравился фильм: {movie_title}\n")
for i in similar_ids:
    print(movie_titles[i])

Если вам понравился фильм: Balto (1995)

Mighty Aphrodite (1995)
Dead Man Walking (1995)
Leaving Las Vegas (1995)
Primal Fear (1996)
City Hall (1996)
Crucible, The (1996)
Time to Kill, A (1996)
Anne Frank Remembered (1995)
Sense and Sensibility (1995)
Mother (1996)


# Make Recommendation for all movies

In [8]:
rec_movies = []
for movie_id in mean_watched['movieId']:
    try:
        similar_ids = find_similar_movies(movie_id, X, k=10)
        rec_movies.append(similar_ids)
    except:
        rec_movies.append([0])

len(rec_movies)

9724

In [9]:
mean_watched['recommendation_movieId'] = pd.Series(rec_movies)
mean_watched.loc[(mean_watched['rating_mean'] > 3.1) & (mean_watched['rating_mean'] <= 4.4)].head()

Unnamed: 0,movieId,userId_count,rating_mean,recommendation_movieId
0,1,215,3.92093,"[364, 500, 367, 480, 586, 551, 588, 595, 19, 317]"
1,2,110,3.431818,"[3450, 762, 788, 736, 5, 95, 65, 708, 7, 141]"
2,3,52,3.259615,"[113, 336, 979, 987, 243, 775, 324, 359, 510, 55]"
5,6,102,3.946078,"[708, 5, 62, 141, 852, 3, 648, 17, 11, 539]"
6,7,54,3.185185,"[271, 174, 502, 217, 542, 484, 259, 250, 374, ..."
