# Import some libraries you will need:

In [16]:
import numpy as np
import pandas as pd

# Read the data using pandas in Dataframe df

In [31]:
df_data = pd.read_csv("recommend.csv", names=["UserID","MovieID","Rating","Timestamp"])
df_data

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [32]:
df_data.shape

(100000, 4)

In [33]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   UserID     100000 non-null  int64
 1   MovieID    100000 non-null  int64
 2   Rating     100000 non-null  int64
 3   Timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


# Output both userid and movieid columns and split them into train and test data 

In [36]:
from sklearn.model_selection import train_test_split
n_users = df_data.UserID.unique().shape[0]
n_movies = df_data.MovieID.unique().shape[0]
train_data,test_data = train_test_split(df_data,test_size=0.25)

In [37]:
train_data_matrix = np.zeros((n_users,n_movies))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1,line[2]-1] = line[3]
train_data_matrix

array([[5., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [38]:
test_data_matrix = np.zeros((n_users,n_movies))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1,line[2]-1] = line[3]
test_data_matrix

array([[0., 3., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Aplly cosine to the none zero datapoints

In [43]:
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix,metric='cosine')
movie_similarity = pairwise_distances(train_data_matrix.T,metric='cosine')
# mean_user_rating = train_data_matrix.mean(axis=1)[:,np.newaxis]
# ratings_diff = (train_data_matrix - mean_user_rating)
# user_pred = mean_user_rating + user_similarity.dot(ratings_diff)/np.array([np.abs(user_similarity).sum(axis=1)]).T

# Create a predict function

In [56]:
# user_pred
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings- mean_user_rating[:,np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

# Apply the predict function defined above

In [53]:
movie_prediction = predict(train_data_matrix, movie_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

# Time to Evaluate our model. We choose to use Root mMean Squared Error (RMSE)

In [54]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction,ground_truth))

In [55]:
print('User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-base CF RMSE: ' + str(rmse(movie_prediction,train_data_matrix)))

User based CF RMSE: 3.1286192077511146
Item-base CF RMSE: 3.456277909084724
