In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
%matplotlib inline



In [2]:
ratings = pd.read_csv('../../../data/movies/ratings.csv')
movies = pd.read_csv('../../../data/movies/movies.csv')

In [3]:
count_review_by_user = ratings.groupby('userId')['rating'].count()
count_review_by_movie = ratings.groupby('movieId')['rating'].count()

In [4]:
count_review_by_movie.describe()

count    26744.000000
mean       747.841123
std       3085.818268
min          1.000000
25%          3.000000
50%         18.000000
75%        205.000000
max      67310.000000
Name: rating, dtype: float64

In [5]:
count_review_by_user.describe()

count    138493.000000
mean        144.413530
std         230.267257
min          20.000000
25%          35.000000
50%          68.000000
75%         155.000000
max        9254.000000
Name: rating, dtype: float64

In [6]:
count_review_by_user = count_review_by_user.reset_index()
count_review_by_movie = count_review_by_movie.reset_index()
count_review_by_user.columns = ['userId','count_rating_user']
count_review_by_movie.columns = ['movieId','count_rating_movie']

In [7]:
df_rating = pd.merge(ratings, count_review_by_user, how='left', on='userId')
df_rating = pd.merge(df_rating, count_review_by_movie, how='left', on='movieId')

In [8]:
rating_subset = df_rating[(df_rating.count_rating_user >= 500) & (df_rating.count_rating_movie >= 500)]

In [9]:
rating_subset.shape, ratings.shape

((5776627, 6), (20000263, 4))

In [22]:
len(ratings.userId.unique()), len(ratings.movieId.unique())

(138493, 26744)

In [10]:
len(rating_subset.userId.unique()), len(rating_subset.movieId.unique())

(7491, 4489)

In [11]:
n_users = len(rating_subset.userId.unique())
n_movies = len(rating_subset.movieId.unique())

In [12]:
#del rating_subset['timestamp']
#del rating_subset['count_rating_user']
#del rating_subset['count_rating_movie']
rating_subset.head()

Unnamed: 0,userId,movieId,rating,timestamp,count_rating_user,count_rating_movie
960,11,1,4.5,1230858821,504,49695
961,11,10,2.5,1230858959,504,29005
962,11,19,3.5,1230783704,504,20938
963,11,32,5.0,1230783095,504,44980
964,11,39,4.5,1230859032,504,26254


In [13]:
train, test = cv.train_test_split(rating_subset, test_size=0.25)

In [14]:
train_mtx = train.pivot_table(values='rating', index='userId', columns='movieId')  
train_mtx.fillna(0, inplace=True)
movie_indextain = train_mtx.columns
train_mtx.head()  

movieId,1,2,3,4,5,6,7,8,9,10,...,112183,112290,112552,112556,112623,112852,115569,116797,116823,118696
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,0.0,0.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,0.0,3.5,3.0,0.0,0.0,0.0,2.5,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
test_mtx = test.pivot_table(values='rating', index='userId', columns='movieId')  
test_mtx.fillna(0, inplace=True)
movie_indextest = test_mtx.columns
test_mtx.head()  

movieId,1,2,3,4,5,6,7,8,9,10,...,112183,112290,112552,112556,112623,112852,115569,116797,116823,118696
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
trainm = train_mtx.as_matrix()
testm = test_mtx.as_matrix()

In [17]:
user_similarity = pairwise_distances(trainm, metric='cosine')
item_similarity = pairwise_distances(trainm.T, metric='cosine')

In [18]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [19]:
item = predict(trainm, item_similarity, type='item')
user = predict(trainm, user_similarity, type='user')

In [20]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [21]:
print 'User-user RMSE: ' + str(rmse(user, testm))
print 'Item-item RMSE: ' + str(rmse(item, testm))

User-based CF RMSE: 2.67298258491
Item-based CF RMSE: 3.09259650612
