In [1]:
import pandas as pd
import matplotlib
from surprise import Dataset, SVD, NormalPredictor, BaselineOnly, KNNBasic, NMF
from surprise.model_selection import cross_validate, KFold



In [2]:
data = Dataset.load_builtin('ml-100k')

In [3]:
rf = data.ratings_file
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
raw_data = pd.read_table(rf,names=col_names)
print(raw_data.head(5))
raw_data['rating'].value_counts().sort_index().plot.bar()

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


<matplotlib.axes._subplots.AxesSubplot at 0x12fb36b70>

In [4]:
raw_data['rating'].value_counts()

4    34174
3    27145
5    21201
2    11370
1     6110
Name: rating, dtype: int64

# Model 1: Random

We want to first get a baseline value for our model.
That can be best done with random model as this algorithm is not personalized to the desire of any users - we just assign them movie ratings based on the initial distribution of the data.

In [5]:
model = NormalPredictor()

In [6]:
model_random_results = cross_validate(model, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5202  1.5165  1.5168  1.5250  1.5198  1.5197  0.0031  
Fit time          0.14    0.16    0.15    0.17    0.15    0.15    0.01    
Test time         0.23    0.21    0.17    0.24    0.15    0.20    0.03    


# Model 2: User-Based Collaborative Filtering

User-Based Collaborative Filtering model will use the user-user defined notion of similarity to implement collaborative filtering.

In [7]:
sim_options={'name':'cosine','user_based':True}
model_KNN_user_based=KNNBasic(sim_options=sim_options)

In [8]:
model_KNN_user_based_results = cross_validate(model_KNN_user_based, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0088  1.0195  1.0196  1.0152  1.0173  1.0161  0.0040  
Fit time          1.22    1.20    1.21    1.07    1.18    1.18    0.05    
Test time         4.17    4.19    5.12    3.84    4.53    4.37    0.43    


# Model 3: Item-Based Collaborative Filtering

This model uses item-item defined notion of similarity to once again implement collaborative filtering.

In [9]:
sim_options={'name':'cosine','user_based':False}
model_KNN_item_based=KNNBasic(sim_options=sim_options)

In [10]:
model_KNN_item_based_results = cross_validate(model_KNN_item_based, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0295  1.0290  1.0257  1.0280  1.0242  1.0273  0.0020  
Fit time          2.34    2.14    2.86    2.25    2.27    2.37    0.25    
Test time         5.88    6.11    7.09    5.83    4.94    5.97    0.69    


# Model 4: Matrix Factorization

Our final model will use the matrix factorization approach with the SVD algrithm to try to predict user's movie ratings. Here, we try to determine some underlying mathematical structure in the user rating matrix, which can help missing ratings in the future.

In [11]:
model_SVD = SVD()

In [12]:
model_SVD_results = cross_validate(model_SVD,data,measures=['RMSE'],cv=5,verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9388  0.9296  0.9441  0.9398  0.9310  0.9367  0.0055  
Fit time          5.71    6.18    5.85    5.78    5.71    5.85    0.18    
Test time         0.25    0.17    0.24    0.19    0.16    0.20    0.04    


# Precision and Recall

In [13]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    # First map the predictions to each user.
    user_est_true = dict()
    for uid,_,true_r,est,_ in predictions:
        current = user_est_true.get(uid,list())
        current.append((est,true_r))
        user_est_true[uid] = current
        
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_,true_r) in user_ratings)
        
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est,_) in user_ratings[:k])
        
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
    return precisions, recalls

        

We will compute the precision and recall at `K` = 5 and 10 for each of the 4 models. We use 5-fold cross validation again to average the results across the entire dataset.

In [14]:
K = [5,10]
models = [model,model_KNN_user_based,model_KNN_item_based,model_SVD]
kf = KFold(n_splits=5)
for k in K:
    for mod in models:
        p=[]
        r=[]
        for train_set, test_set in kf.split(data):
            mod.fit(train_set)
            predictions = model.test(test_set,verbose=False)
            precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=3.5)
            
            p.append(sum(prec for prec in precisions.values())/len(precisions))
            r.append(sum(rec for rec in recalls.values())/len(recalls))
        
        print('>>> precision :', round(sum(p)/len(p),3))
        print('>>> recall    :', round(sum(r)/len(r),3))

>>> precision : 0.585
>>> recall    : 0.347
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
>>> precision : 0.589
>>> recall    : 0.343
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
>>> precision : 0.591
>>> recall    : 0.346
>>> precision : 0.589
>>> recall    : 0.347
>>> precision : 0.588
>>> recall    : 0.433
Computing the cosine

# Top-`n` Predictions

In [15]:
def get_top_n(predictions, n=5):
    # First map the predictions to each user.
    top_n = dict()
    for uid, iid, true_r, est, _ in predictions:
        current = top_n.get(uid, [])
        current.append((iid,est))
        top_n[uid] = current
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

In [16]:
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

In [17]:
for model in models:
    model.fit(trainset)
    predictions = model.test(testset)
    top_n = get_top_n(predictions, n=5)
    user = list(top_n.keys())[0]
    print(f'model: {model}, {user}:{top_n[user]}')

model: <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x12fc6d048>, 196:[('1014', 5), ('392', 5), ('1081', 5), ('219', 5), ('416', 5)]
Computing the cosine similarity matrix...
Done computing similarity matrix.
model: <surprise.prediction_algorithms.knns.KNNBasic object at 0x12fc51588>, 196:[('1189', 5), ('1500', 5), ('814', 5), ('1536', 5), ('1293', 5)]
Computing the cosine similarity matrix...
Done computing similarity matrix.
model: <surprise.prediction_algorithms.knns.KNNBasic object at 0x12fc58eb8>, 196:[('1309', 4.5), ('1310', 4.5), ('1676', 4.25), ('1675', 4.25), ('1593', 4.090909090909091)]
model: <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x12d90e9b0>, 196:[('64', 4.637528442418255), ('318', 4.630783225870791), ('408', 4.505150613313384), ('427', 4.504829890156369), ('178', 4.451072219072106)]
