In [2]:
#Loading libraries and files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Location of the ratings CSV -- please change it to match your environment.
file_path = r"sample.csv"

# Load the CSV into a DataFrame.
ratings = pd.read_csv(filepath_or_buffer=file_path)

# Print the first five rows as a quick sanity check that the load worked.
print(ratings.head(n=5))

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [8]:
# Number of distinct movies that appear in the ratings table.
ratings['movieId'].nunique()

9724

In [4]:
# Print the column dtypes, non-null counts and memory footprint of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rating rows for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test = train_test_split(ratings, test_size=0.30, random_state=1)

# Shapes of the training and test partitions, one per line.
print(X_train.shape, X_test.shape, sep="\n")

ModuleNotFoundError: No module named 'sklearn'

In [5]:
# Reshape the training ratings into a user x movie matrix:
# one row per userId, one column per movieId, cells without a rating set to 0.
user_data = (
    X_train
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
user_data.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create a Copy of train and test dataset
These datasets will be used for prediction and evaluation.

Dummy train will be used later to predict ratings for the movies that have not been rated by the user. To ignore the movies already rated by the user, we mark them as 0 during prediction. The movies not rated by the user are marked as 1 for prediction.

Dummy test will be used for evaluation. To evaluate, we only make predictions on the movies rated by the user, so these are marked as 1. This is just the opposite of dummy_train.

In [6]:
# Work on copies so the original train and test splits stay untouched.
dummy_train = X_train.copy()
dummy_test = X_test.copy()

# Turn the rating column into 0/1 indicators using vectorised comparisons
# (no per-element Python lambda):
#   dummy_train: 0 where a rating > 0 exists, 1 otherwise
#   dummy_test : 1 where a rating > 0 exists, 0 otherwise
# A NaN rating compares as "not > 0", so it maps to 1 in dummy_train and 0 in dummy_test.
dummy_train['rating'] = (~(dummy_train['rating'] > 0)).astype('int64')
dummy_test['rating'] = (dummy_test['rating'] > 0).astype('int64')

In [7]:
# Prediction mask in user x movie form: (user, movie) pairs that are missing
# from the training rows are filled with 1, i.e. "not rated, may be recommended".
dummy_train = (
    dummy_train
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(1)
)

# Evaluation mask in user x movie form: (user, movie) pairs that are missing
# from the test rows are filled with 0, i.e. "not rated, nothing to evaluate".
dummy_test = (
    dummy_test
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

## User-User Similarity matrix Using Cosine Similarity

In [8]:
# Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# User-user similarity matrix: cosine similarity between the rows (users) of user_data.
user_similarity = cosine_similarity(user_data)

# Replace any NaN similarity with 0, in place.
nan_cells = np.isnan(user_similarity)
user_similarity[nan_cells] = 0

# Show the matrix followed by its shape, one per line.
print(user_similarity, user_similarity.shape, sep="\n")

[[1.         0.01988249 0.03910678 ... 0.1730224  0.05084945 0.11089502]
 [0.01988249 1.         0.         ... 0.04399393 0.04145616 0.05915412]
 [0.03910678 0.         1.         ... 0.00935101 0.         0.03426788]
 ...
 [0.1730224  0.04399393 0.00935101 ... 1.         0.09995235 0.22243842]
 [0.05084945 0.04145616 0.         ... 0.09995235 1.         0.04167675]
 [0.11089502 0.05915412 0.03426788 ... 0.22243842 0.04167675 1.        ]]
(610, 610)


In [9]:
# Score every (user, movie) pair as the similarity-weighted sum of the training
# ratings of all users. These are raw scores, not values on the rating scale.
user_predicted_ratings = user_similarity.dot(user_data)
user_predicted_ratings

array([[7.29951976e+01, 3.31637993e+01, 1.77295056e+01, ...,
        0.00000000e+00, 0.00000000e+00, 1.15997809e-01],
       [2.17006904e+01, 9.93524271e+00, 3.14339134e+00, ...,
        2.92531477e-01, 2.92531477e-01, 4.16107181e-01],
       [5.16588383e+00, 2.69825726e+00, 1.52946104e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [8.99191130e+01, 5.01638130e+01, 2.45898151e+01, ...,
        3.21292680e-02, 3.21292680e-02, 6.21699830e-01],
       [6.43931897e+01, 3.09947967e+01, 1.36332240e+01, ...,
        0.00000000e+00, 0.00000000e+00, 7.16628867e-02],
       [8.18734614e+01, 3.70332145e+01, 1.38145363e+01, ...,
        3.02982474e-01, 3.02982474e-01, 8.30445406e-01]])

In [10]:
# Dimensions of the score matrix: rows follow the users of user_data,
# columns follow its movies.
user_predicted_ratings.shape

(610, 8531)

In [11]:
# Cell-by-cell product with the prediction mask: scores of movies the user has
# already rated are multiplied by 0, all other scores are kept as they are.
user_final_ratings = dummy_train * user_predicted_ratings
user_final_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,33.163799,17.729506,0.944557,11.898072,0.0,12.901289,2.415614,4.144356,42.129841,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115998
2,21.70069,9.935243,3.143391,0.268158,3.386079,10.931683,3.000615,0.678489,0.551667,10.349057,...,0.376112,0.292531,0.250741,0.334322,0.292531,0.334322,0.292531,0.292531,0.292531,0.416107
3,5.165884,2.698257,1.529461,0.112824,0.817166,3.128891,0.881711,0.206225,0.180813,2.655929,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47.50601,21.876166,10.610106,0.982457,7.863567,23.208589,11.016966,1.515071,1.588939,26.819153,...,0.05635,0.043828,0.037567,0.050089,0.043828,0.050089,0.043828,0.043828,0.043828,0.159751
5,48.383667,28.985985,14.732788,1.919722,13.292613,29.207201,16.573275,2.850799,2.273294,42.990794,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Top 5 movie recommendations for the user with userId 30.
# The row is selected by its userId label (.loc), not by its position (.iloc):
# position 29 in this matrix holds userId 30, so a positional lookup is easy to
# misread as "user 29". Change target_user_id to get recommendations for another user.
target_user_id = 30
user_final_ratings.loc[target_user_id].sort_values(ascending=False).head(5)

movieId
2571    89.156238
356     81.120554
318     75.187547
260     74.077550
296     72.034695
Name: 30, dtype: float64

## Evaluation

Evaluation will be the same as you have seen above for the prediction. The only difference is that you will evaluate on the movies already rated by the user instead of predicting for the movies not rated by the user.

In [13]:
# User x movie matrix of the held-out ratings; cells without a rating are 0.
test_user_features = (
    X_test
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

# User-user cosine similarity computed on the test matrix, with NaN replaced by 0.
# NOTE(review): the similarity is derived from the held-out ratings themselves,
# so the evaluation below sees the ratings it is trying to predict -- confirm
# this is intended.
test_user_similarity = cosine_similarity(test_user_features)
nan_cells_test = np.isnan(test_user_similarity)
test_user_similarity[nan_cells_test] = 0

# Matrix, a separator line, then the shape.
print(test_user_similarity, "- "*10, test_user_similarity.shape, sep="\n")

[[1.         0.         0.         ... 0.07142113 0.         0.02771632]
 [0.         1.         0.         ... 0.02771399 0.         0.06031321]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.07142113 0.02771399 0.         ... 1.         0.04681269 0.05868601]
 [0.         0.         0.         ... 0.04681269 1.         0.        ]
 [0.02771632 0.06031321 0.         ... 0.05868601 0.         1.        ]]
- - - - - - - - - - 
(610, 610)


In [14]:
# Similarity-weighted sum of the test ratings for every (user, movie) pair.
user_predicted_ratings_test = test_user_similarity.dot(test_user_features)
user_predicted_ratings_test

array([[1.52716205e+01, 6.60581461e+00, 6.10743927e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.02714151e+00, 2.59495900e+00, 3.22131514e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.06098685e+00, 2.33534007e-01, 8.81138497e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.36749870e+01, 8.84880068e+00, 2.43966623e+00, ...,
        3.71749020e-02, 3.71749020e-02, 0.00000000e+00],
       [8.97268309e+00, 3.57602366e+00, 7.46545417e-01, ...,
        1.19633684e-01, 1.19633684e-01, 0.00000000e+00],
       [8.95126179e+00, 5.73858990e+00, 1.19312726e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.05331433e-01]])

In [15]:
# Keep scores only where the user actually has a test rating (mask value 1);
# every other cell is multiplied by 0.
test_user_final_rating = dummy_test * user_predicted_ratings_test
test_user_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,188751,188797,189333,189713,190207,190209,190213,190219,190221,193573
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,6.107439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,29.620289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Summary statistics of the raw ratings; the min and max show the rating scale.
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Keep only the strictly positive scores; every other cell becomes NaN, because
# a 0 here means the user has not rated the movie.
X = test_user_final_rating.where(test_user_final_rating > 0)

# Rescale the scores into the rating range [0.5, 5].
# NOTE(review): MinMaxScaler rescales each column (movie) independently, so a
# score is only compared with other users' scores for the same movie -- confirm
# that per-movie scaling is what is wanted here.
scaler = MinMaxScaler(feature_range=(0.5, 5))
pred = scaler.fit_transform(X)

print(pred)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


[[       nan        nan 5.         ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 ...
 [2.99566417        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]]


In [18]:
# Number of cells in pred that hold a value (all cells minus the NaN cells).
total_non_nan = pred.size - np.count_nonzero(np.isnan(pred))
total_non_nan

30251

In [19]:
# Actual held-out ratings in user x movie form; cells without a rating stay NaN.
test = X_test.pivot(values='rating', columns='movieId', index='userId')
test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,188751,188797,189333,189713,190207,190209,190213,190219,190221,193573
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [20]:
# RMSE score for user-user based collaborative filtering

# Squared difference between actual and predicted rating for every cell;
# a cell stays NaN when either side is NaN.
diff_sqr_matrix = (test - pred).pow(2)

# Sum per column, then sum those totals; DataFrame.sum skips NaN by default.
sum_of_squares_err = diff_sqr_matrix.sum().sum()

# Root of the mean squared error over the cells that have a prediction.
rmse = np.sqrt(sum_of_squares_err / total_non_nan)
print(rmse)

1.581456668523464
