In [12]:
import numpy as np
import pandas as pd

In [13]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('H:/ml-100k/u.data', sep='\t', names=header)

In [14]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [15]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [16]:
#Memory Based

#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
    
print (train_data_matrix)
print (test_data_matrix)

[[ 5.  3.  0. ...,  0.  0.  0.]
 [ 4.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 5.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  5.  0. ...,  0.  0.  0.]]
[[ 0.  0.  4. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [17]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

print (user_similarity)
print (item_similarity)

[[ 0.          0.85893314  0.98960083 ...,  0.92999753  0.86784395
   0.70414727]
 [ 0.85893314  0.          0.97988022 ...,  0.80153321  0.90653031
   0.94235079]
 [ 0.98960083  0.97988022  0.         ...,  0.9736328   0.88662572  1.        ]
 ..., 
 [ 0.92999753  0.80153321  0.9736328  ...,  0.          0.97098849
   0.928715  ]
 [ 0.86784395  0.90653031  0.88662572 ...,  0.97098849  0.          0.85312118]
 [ 0.70414727  0.94235079  1.         ...,  0.928715    0.85312118  0.        ]]
[[ 0.          0.68242707  0.76228579 ...,  1.          0.94457792  1.        ]
 [ 0.68242707  0.          0.74138164 ...,  1.          0.91240643
   0.91240643]
 [ 0.76228579  0.74138164  0.         ...,  1.          1.          0.88835156]
 ..., 
 [ 1.          1.          1.         ...,  0.          1.          1.        ]
 [ 0.94457792  0.91240643  1.         ...,  1.          0.          1.        ]
 [ 1.          0.91240643  0.88835156 ...,  1.          1.          0.        ]]


In [18]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [19]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

print (item_prediction)
print (user_prediction)

[[ 0.38303414  0.39003859  0.41779599 ...,  0.45922246  0.45015666
   0.44579602]
 [ 0.08596167  0.09910549  0.0963776  ...,  0.09953534  0.10073425
   0.10108989]
 [ 0.05435555  0.0569474   0.05391551 ...,  0.05208471  0.05529066
   0.05536142]
 ..., 
 [ 0.03132754  0.03912343  0.03788369 ...,  0.04479157  0.04391392
   0.04391478]
 [ 0.11536825  0.12259707  0.12926399 ...,  0.13379592  0.1330611
   0.13353281]
 [ 0.21077053  0.2026069   0.22427093 ...,  0.26191712  0.2540892
   0.25451158]]
[[ 1.55437986  0.60111124  0.49766422 ...,  0.30766763  0.30765272
   0.30751807]
 [ 1.25267872  0.30126524  0.14991276 ..., -0.06559955 -0.06435775
  -0.06414722]
 [ 1.26727097  0.25416574  0.10488549 ..., -0.11356036 -0.111789   -0.111738  ]
 ..., 
 [ 1.14641512  0.23187527  0.08228055 ..., -0.1200224  -0.11927425
  -0.11913055]
 [ 1.30412294  0.31464514  0.18928677 ..., -0.02471823 -0.02383961
  -0.02372699]
 [ 1.36880542  0.3939633   0.29361678 ...,  0.11021199  0.11018769
   0.11036611]]


In [20]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [21]:
print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.127978924337624
Item-based CF RMSE: 3.457999108515081


In [None]:
# MODEL BASED
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print 'The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%'

In [22]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print ('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.724406819215572
