In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [91]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [92]:
ratings = pd.read_csv("/content/gdrive/MyDrive/RS Datasets/ratings_small.csv")

In [93]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [94]:
ratings.shape

(100004, 4)

In [95]:
ratings.isnull().sum() # no null values here

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


### Train-test split

In [96]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(ratings, test_size = 0.30, random_state = 42)

print(X_train.shape)
print(X_test.shape)

(70002, 4)
(30002, 4)


## User-User collaborative filtering

In [97]:
# pivot to create user-movie interaction matrix
user_data = X_train.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)
print(user_data.shape)

(671, 8043)


In [98]:
user_data.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,160656,160718,161084,161155,161594,161830,161918,161944,162542,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
dummy_train = X_train.copy()
dummy_test = X_test.copy()

# The movies not rated by user is marked as 1 for prediction
dummy_train['rating'] = dummy_train['rating'].apply(lambda x: 0 if x > 0 else 1)
dummy_train = dummy_train.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(1)

# The movies not rated by user is marked as 0 for evaluation
dummy_test['rating'] = dummy_test['rating'].apply(lambda x: 1 if x > 0 else 0)
dummy_test = dummy_test.pivot(index ='userId', columns = 'movieId', values = 'rating').fillna(0)

In [100]:
dummy_train.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,160656,160718,161084,161155,161594,161830,161918,161944,162542,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [101]:
dummy_test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,157296,157667,159462,160271,160440,160563,160567,160590,162376,162672
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### User-User Similarity matrix

In [102]:
from sklearn.metrics.pairwise import cosine_similarity

# User Similarity Matrix using Cosine similarity as a similarity measure between Users
user_similarity = cosine_similarity(user_data)
user_similarity[np.isnan(user_similarity)] = 0
print(user_similarity)
print(user_similarity.shape)

[[1.         0.         0.         ... 0.         0.         0.02393547]
 [0.         1.         0.1038815  ... 0.02461364 0.05242847 0.04278715]
 [0.         0.1038815  1.         ... 0.03033982 0.03282569 0.07569511]
 ...
 [0.         0.02461364 0.03033982 ... 1.         0.         0.        ]
 [0.         0.05242847 0.03282569 ... 0.         1.         0.18450998]
 [0.02393547 0.04278715 0.07569511 ... 0.         0.18450998 1.        ]]
(671, 671)


###  Predicting the User ratings on the movies

In [103]:
user_predicted_ratings = np.dot(user_similarity, user_data)
user_predicted_ratings

array([[1.30337799e+01, 5.44926060e+00, 2.45313737e+00, ...,
        1.20464562e-01, 1.34956662e-01, 1.80611822e-01],
       [7.66222363e+01, 5.30689619e+01, 1.29076933e+01, ...,
        1.32398358e-01, 1.26234991e-01, 3.43261458e-01],
       [5.55388120e+01, 2.36795818e+01, 6.63921027e+00, ...,
        3.15486686e-01, 2.13397912e-01, 3.47930392e-01],
       ...,
       [1.70060137e+01, 8.05160470e+00, 3.14684783e+00, ...,
        7.22830712e-02, 0.00000000e+00, 2.47564924e-01],
       [6.25718199e+01, 2.58449970e+01, 1.02997955e+01, ...,
        1.64231594e-01, 2.45318594e-01, 4.67767180e-01],
       [1.05562842e+02, 3.51821767e+01, 1.24033851e+01, ...,
        4.99106902e-01, 3.00308694e-01, 5.61785507e-01]])

In [104]:
user_predicted_ratings.shape

(671, 8043)

In [105]:
# np.multiply for cell-by-cell multiplication to remove the ratings for the movies user already rated
user_final_ratings = np.multiply(user_predicted_ratings, dummy_train)
user_final_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,160656,160718,161084,161155,161594,161830,161918,161944,162542,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,13.03378,5.449261,2.453137,0.679754,1.964817,4.875659,2.370815,0.102612,0.952908,5.808705,...,0.126428,0.144489,0.090306,0.018473,0.158857,0.044694,0.067042,0.120465,0.134957,0.180612
2,76.622236,53.068962,12.907693,4.011674,15.803771,34.441235,12.232038,0.830773,5.546404,0.0,...,0.240283,0.274609,0.171631,0.055903,0.277516,0.11159,0.167385,0.132398,0.126235,0.343261
3,55.538812,23.679582,6.63921,1.530928,7.409841,19.558837,4.684573,0.304889,2.288593,24.726943,...,0.243551,0.278344,0.173965,0.048133,0.242163,0.061612,0.092418,0.315487,0.213398,0.34793
4,71.958256,35.47002,11.032536,2.562926,11.58586,26.054558,9.99126,0.824395,3.921522,33.82743,...,0.312294,0.356907,0.223067,0.086111,0.456994,0.17562,0.26343,0.532784,0.232701,0.446134
5,61.071668,29.297462,8.109153,1.918585,11.516647,19.224022,7.814428,0.583283,1.951332,27.676196,...,0.229508,0.262295,0.163935,0.073583,0.514617,0.119646,0.179469,0.358784,0.349602,0.327869


### Top 5 movie recommendations for the User 37

In [106]:
user_final_ratings.iloc[37].sort_values(ascending = False)[0:5]

Unnamed: 0_level_0,38
movieId,Unnamed: 1_level_1
356,88.179028
2571,78.891349
260,72.057547
480,65.630475
1196,61.134543


## Evaluating User-User Collaborative Filtering

---



In [107]:
test_features = X_test.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)
test_similarity = cosine_similarity(test_features)
test_similarity[np.isnan(test_similarity)] = 0

predicted_ratings_test = np.dot(test_similarity, test_features)

test_final_rating = np.multiply(predicted_ratings_test, dummy_test)
test_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,157296,157667,159462,160271,160440,160563,160567,160590,162376,162672
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.653968,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,8.334717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
ratings['rating'].describe()

Unnamed: 0,rating
count,100004.0
mean,3.543608
std,1.058064
min,0.5
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [109]:
from sklearn.preprocessing import MinMaxScaler

X = test_final_rating.copy()
X = X[X > 0] # only consider non-zero values as 0 means the user haven't rated the movies

scaler = MinMaxScaler(feature_range = (0.5, 5))
scaler.fit(X)
pred = scaler.transform(X)

test = X_test.pivot(index = 'userId', columns = 'movieId', values = 'rating')
total_non_nan = np.count_nonzero(~np.isnan(pred))

# Mean abslute error
mae = np.abs(pred - test).sum().sum()/total_non_nan
print(mae)

1.1947405439554972


It means that on an average our User-based recommendation engine is making an error of 1.2 in predicting the User ratings.

## Item-based collaborative filtering

In [110]:
movie_features = X_train.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


### Item-Item Similarity matrix

In [111]:
from sklearn.metrics.pairwise import cosine_similarity

# Item Similarity Matrix using Cosine similarity as a similarity measure between Items
item_similarity = cosine_similarity(movie_features)
item_similarity[np.isnan(item_similarity)] = 0
print(item_similarity)
print("- "*10)
print(item_similarity.shape)

[[1.         0.29312319 0.2089444  ... 0.09364215 0.         0.06554951]
 [0.29312319 1.         0.19586877 ... 0.         0.         0.        ]
 [0.2089444  0.19586877 1.         ... 0.         0.         0.        ]
 ...
 [0.09364215 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.06554951 0.         0.         ... 0.         0.         1.        ]]
- - - - - - - - - - 
(8043, 8043)


### Predicting the User ratings on the movies

In [112]:
item_predicted_ratings = np.dot(movie_features.T, item_similarity)
item_predicted_ratings

array([[ 7.06685729,  5.5669184 ,  3.43240972, ...,  0.75762196,
         0.26548932,  2.69538099],
       [48.00838796, 54.11267831, 24.22708097, ...,  1.30102204,
         0.23151015, 10.85351411],
       [23.77265718, 17.94221111,  8.5326474 , ...,  1.86113847,
         0.48008536,  6.20111701],
       ...,
       [11.10438842,  9.6850143 ,  6.10573827, ...,  0.27299815,
         0.        ,  5.71027824],
       [21.29294039, 15.32788347,  8.58332684, ...,  0.81837386,
         0.30285268,  5.97997039],
       [78.22073194, 49.54884076, 24.70523659, ...,  5.07466113,
         0.8710096 , 18.85811619]])

In [113]:
item_predicted_ratings.shape

(671, 8043)

In [114]:
dummy_train.shape

(671, 8043)

In [115]:
# np.multiply for cell-by-cell multiplication to remove the ratings for the movies user already rated

item_final_ratings = np.multiply(item_predicted_ratings, dummy_train)
item_final_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,160656,160718,161084,161155,161594,161830,161918,161944,162542,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.066857,5.566918,3.43241,2.310125,3.659288,4.613999,4.144598,0.991955,2.292581,5.446323,...,2.695381,2.695381,2.695381,1.885681,3.377039,2.39267,2.39267,0.757622,0.265489,2.695381
2,48.008388,54.112678,24.227081,19.095393,30.375522,36.477313,28.697831,9.221338,18.370782,0.0,...,10.853514,10.853514,10.853514,8.476851,9.21413,10.775891,10.775891,1.301022,0.23151,10.853514
3,23.772657,17.942211,8.532647,5.180997,10.713019,15.274718,7.863384,3.27488,5.255132,17.867196,...,6.201117,6.201117,6.201117,5.596308,7.511046,4.715341,4.715341,1.861138,0.480085,6.201117
4,109.859407,98.244427,51.248676,31.622443,60.991106,70.423507,61.029445,22.989648,31.428696,85.186125,...,27.853303,27.853303,27.853303,31.48182,37.817179,40.400585,40.400585,11.268766,0.910888,27.853303
5,56.458438,45.395535,20.158634,12.192075,30.626761,34.194423,23.777144,9.296242,7.644094,43.798221,...,10.264494,10.264494,10.264494,16.415521,24.243311,15.381262,15.381262,3.622707,1.325247,10.264494


### Top 5 movie recommendations for the User 37

In [116]:
item_final_ratings.iloc[37].sort_values(ascending = False)[0:5]

Unnamed: 0_level_0,38
movieId,Unnamed: 1_level_1
2858,51.401046
1270,50.697382
2571,50.471459
1704,50.466411
1198,48.995437


## Evaluating Item-Item Collaborative Filtering

---

In [117]:
test_features = X_test.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
test_similarity = cosine_similarity(test_features)
test_similarity[np.isnan(test_similarity)] = 0

predicted_ratings_test = np.dot(test_similarity, test_features)

test_final_rating = np.multiply(predicted_ratings_test.transpose(), dummy_test)
test_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,157296,157667,159462,160271,160440,160563,160567,160590,162376,162672
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.325192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,18.743378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:
from sklearn.preprocessing import MinMaxScaler

X = test_final_rating.copy()
X = X[X > 0] # only consider non-zero values as 0 means the user haven't rated the movies

scaler = MinMaxScaler(feature_range = (0.5, 5))
scaler.fit(X)
pred = scaler.transform(X)

test = X_test.pivot(index = 'movieId', columns = 'userId', values = 'rating')
total_non_nan = np.count_nonzero(~np.isnan(pred))

# Mean abslute error
mae = np.abs(pred - test.transpose()).sum().sum()/total_non_nan
print(mae)

2.2282980637061653


It means that on an average our Item-based recommendation engine is making an error of 2.21 in predicting the User ratings.
