In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('Recommend.csv',names=['user_id', 'movie_id', 'rating', 'timestamp'])
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
from sklearn.model_selection import train_test_split
n_users = df.user_id.unique().shape[0] 
n_movies = df.movie_id.unique().shape[0]
train_data, test_data = train_test_split(df, test_size=0.25)

In [5]:
train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    train_data_matrix[line[1]-1, line[2]-1] = line[3] 
train_data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [7]:
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
movie_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
mean_user_rating = train_data_matrix.mean(axis=1)[:, np.newaxis] 
ratings_diff = (train_data_matrix - mean_user_rating) 
user_pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
user_pred

array([[ 1.61697473,  0.5603393 ,  0.50247415, ...,  0.31157734,
         0.31143834,  0.31131238],
       [ 1.34999323,  0.25775824,  0.16137518, ..., -0.06105039,
        -0.05976504, -0.05963342],
       [ 1.35771782,  0.21119056,  0.1230913 , ..., -0.10478045,
        -0.10310644, -0.10287322],
       ...,
       [ 1.22126885,  0.1852287 ,  0.09340132, ..., -0.11823767,
        -0.1174208 , -0.11741112],
       [ 1.39452659,  0.28381022,  0.21563665, ..., -0.0066506 ,
        -0.00581635, -0.00560113],
       [ 1.44299925,  0.36616951,  0.30826493, ...,  0.11262159,
         0.11245875,  0.11259888]])

In [8]:
movie_pred = train_data_matrix.dot(movie_similarity) / np.array([np.abs(movie_similarity).sum(axis=1)])
movie_pred

array([[0.37531446, 0.39357354, 0.40997923, ..., 0.46247436, 0.4512    ,
        0.447758  ],
       [0.09065327, 0.1039924 , 0.1007212 , ..., 0.1052556 , 0.1063919 ,
        0.10682482],
       [0.0608167 , 0.06364403, 0.06200853, ..., 0.06054141, 0.06301548,
        0.0641633 ],
       ...,
       [0.03213712, 0.04121977, 0.03982021, ..., 0.0459713 , 0.04512997,
        0.04504078],
       [0.13064921, 0.13898878, 0.1474112 , ..., 0.15133262, 0.1504511 ,
        0.1519039 ],
       [0.21035659, 0.21446936, 0.23023174, ..., 0.26487973, 0.25470325,
        0.25632439]])

In [9]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(pred, test):
    pred = pred[test.nonzero()].flatten() 
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, test))

In [10]:
rmse(user_pred, test_data_matrix)

3.123022403131597

In [11]:
rmse(movie_pred, test_data_matrix)

3.449118406150586