In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')

In [2]:
ratings = pd.read_csv('./the-movies-dataset/ratings_small.csv', usecols=['userId', 'movieId', 'rating'])

print('Unique users count: ', len(ratings['userId'].unique()))
print('Unique movies count: ', len(ratings['movieId'].unique()))
print('DataFrame shape: ', ratings.shape)

ratings.head()

Unique users count:  671
Unique movies count:  9066
DataFrame shape:  (100004, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [3]:
tmp = pd.read_csv("./the-movies-dataset/movies_metadata_fixed.csv", usecols=['id', 'title'])
tmp.head()

info = pd.merge(ratings, tmp, left_on='movieId', right_on='id').sort_values('userId')

info = info.drop('id', 1)

info.query("userId == 1")

Unnamed: 0,userId,movieId,rating,title
0,1,1371,2.5,Rocky III
182,1,2294,2.0,Jay and Silent Bob Strike Back
235,1,2455,2.5,Confidentially Yours
47,1,1405,1.0,Greed
140,1,2193,2.0,My Tutor
93,1,2105,4.0,American Pie


In [4]:
movie_ids = info['movieId'].unique()

def scale_movie_id(movie_id):
    scaled = np.where(movie_ids == movie_id)[0][0] + 1
    return scaled

info['movieId'] = info['movieId'].apply(scale_movie_id)


user_ids = info['userId'].unique()

def scale_user_id(user_id):
    scaled = np.where(user_ids == user_id)[0][0] + 1
    return scaled

info['userId'] = info['userId'].apply(scale_user_id)

n_movies = len(info['movieId'].unique())
n_users = len(info['userId'].unique())

info.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,2.5,Rocky III
182,1,2,2.0,Jay and Silent Bob Strike Back
235,1,3,2.5,Confidentially Yours
47,1,4,1.0,Greed
140,1,5,2.0,My Tutor


In [5]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(info, test_size=0.2)

print('Train shape: ', train_data.shape)
print('Test shape: ', test_data.shape)

Train shape:  (35995, 4)
Test shape:  (8999, 4)


In [6]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    prediction = np.nan_to_num(prediction)[ground_truth.nonzero()].flatten()
    ground_truth = np.nan_to_num(ground_truth)[ground_truth.nonzero()].flatten()
    mse = mean_squared_error(ground_truth, prediction)
    return sqrt(mse)

In [7]:
train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    
test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

In [8]:
from  sklearn.metrics.pairwise import pairwise_distances
pd.set_option('display.max_columns', 500)
# считаем косинусное расстояние для пользователей и фильмов 
# (по строкам и по колонкам соотвественно).
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [9]:
def k_fract_mean_predict(top):
    top_similar = np.zeros((n_users, top))
    
    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = user_sim.argsort()[1:top + 1]

        top_similar[i] = top_sim_users

    pred = np.zeros((n_users, n_movies))
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int) 
        numerator = user_similarity[i][indexes] 
        
        mean_rating = np.array([x for x in train_data_matrix[i] if x > 0]).mean() 
        
        diff_ratings = train_data_matrix[indexes] - train_data_matrix[indexes].mean() 
        
        product = np.dot(numerator, diff_ratings)
        
        denominator = numerator.sum() 
      
        pred[i] =  mean_rating + product / denominator
        
    return pred



def k_fract_mean_predict_item(top):
    top_similar = np.zeros((n_movies, top))
    
    for i in range(n_movies): 
        movie_sim = item_similarity[i]
       
        top_sim_movies = movie_sim.argsort()[1:top + 1]
        
        top_similar[i] = top_sim_movies
           
    pred = np.zeros((n_movies, n_users))
    
    for i in range(n_movies):
        indexes = top_similar[i].astype(np.int)
        numerator = item_similarity[i][indexes]

        mean_rating = np.array([x for x in train_data_matrix.T[i] if x > 0]).mean()
        
        diff_ratings = train_data_matrix.T[indexes] - train_data_matrix.T[indexes].mean()
        
        product = np.dot(numerator, diff_ratings)
        
        denominator = numerator.sum()
        
        pred[i] = mean_rating + product / denominator
        
    return pred.T

k_predict = k_fract_mean_predict(7)
print('User-based CF RMSE: ', rmse(k_predict, test_data_matrix))

k_predict_item = k_fract_mean_predict_item(7)
print('Item-based CF RMSE: ', rmse(k_predict_item, test_data_matrix))

User-based CF RMSE:  1.5290458760586616
Item-based CF RMSE:  1.5093303250774526
