# Assignment 3 by: Devanshu Poonith (s3970304)


# Task 1: User-based Collaborative Filtering


### KNNCF

In [1]:
# Importing relevant libraries for usage
import numpy as np 
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.neighbors import NearestNeighbors
import os
from sklearn.model_selection import train_test_split

In [2]:
# Going to the ml-1m directory or folder
os.chdir('ml-1m/')


In [3]:
# listing all the files present in the ml-1m folder
!ls

README.txt  movies.dat  ratings.dat users.dat


In [4]:
# Viewing the first 10 movies in the movies.data file
!head movies.dat

1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller
7::Sabrina (1995)::Comedy|Romance
8::Tom and Huck (1995)::Adventure|Children's
9::Sudden Death (1995)::Action
10::GoldenEye (1995)::Action|Adventure|Thriller


In [5]:
# Viewing the first 10 items in the ratings data

!head ratings.dat

1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
1::3408::4::978300275
1::2355::5::978824291
1::1197::3::978302268
1::1287::5::978302039
1::2804::5::978300719
1::594::4::978302268
1::919::4::978301368


In [6]:
# Viewing the first 10 items in the users data
!head users.dat

1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455
6::F::50::9::55117
7::M::35::1::06810
8::M::25::12::11413
9::M::25::17::61614
10::F::35::1::95370


In [7]:
names = ['UserID','MovieID','Rating','Timestamp'] # giving the names of the columns
df = pd.read_csv('ratings.dat',sep='::',names=names,engine='python') # reading the ratings data into a dataframe
df.head(20) # listing the first 5 rows of the data rating 

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [8]:
df.shape

(1000209, 4)

In [9]:
df = df[df['Rating'] > 0]# selecting users that has more than 0 rating on the movies
df.shape

(1000209, 4)

In [10]:
!head movies.dat

1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller
7::Sabrina (1995)::Comedy|Romance
8::Tom and Huck (1995)::Adventure|Children's
9::Sudden Death (1995)::Action
10::GoldenEye (1995)::Action|Adventure|Thriller


In [11]:
# Reading the movie.dat file
df_1 = pd.read_csv('movies.dat', sep ='::',names = ['MovieID','MovieTitle','MovieGenre'],encoding = 'latin-1',engine= 'python')
df_1.head()

Unnamed: 0,MovieID,MovieTitle,MovieGenre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
# Joining the tables on the 'MovieID' column
merged_df = pd.merge(df, df_1, on='MovieID',how = 'inner')
merged_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp,MovieTitle,MovieGenre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,White Boys (1999),Drama
1000207,5851,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


In [13]:

# Create a group by object based on 'MovieTitle'
grouped = merged_df.groupby('MovieTitle')

# Count the number of ratings for each movie and filter movies that have more than 30 ratings
movie_rating_count = grouped['Rating'].count()
popular_movies = movie_rating_count[movie_rating_count > 100].index.tolist()

# Filter the DataFrame to include only those popular movies
df = merged_df[merged_df['MovieTitle'].isin(popular_movies)]
df

Unnamed: 0,UserID,MovieID,Rating,Timestamp,MovieTitle,MovieGenre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...
991639,5841,3445,3,961508809,Eyes of Laura Mars (1978),Mystery|Thriller
991640,5852,3445,2,958103360,Eyes of Laura Mars (1978),Mystery|Thriller
991641,5916,3445,3,957461368,Eyes of Laura Mars (1978),Mystery|Thriller
991642,5954,3445,3,957706442,Eyes of Laura Mars (1978),Mystery|Thriller


In [14]:
genre_df = df_1['MovieGenre'].str.get_dummies(sep='|')

# Concatenating the original DataFrame with the new genre DataFrame
final_df = pd.concat([df_1, genre_df], axis=1)

# Dropping the MovieGenre column
final_df.drop('MovieGenre', axis=1, inplace=True)

final_df.head()

Unnamed: 0,MovieID,MovieTitle,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# Checking the size of the data
n_users = df.UserID.unique().shape[0]
n_movies = df.MovieID.unique().shape[0]
print (str(n_users) + ' users')
print (str(n_movies) + ' movies')

6040 users
2006 movies


In [20]:
# Factorizing UserID and MovieID
# Factorizing UserID and MovieID using .loc
df.loc[:, 'UserID'] = pd.factorize(df['UserID'])[0]
df.loc[:, 'MovieID'] = pd.factorize(df['MovieID'])[0]


In [21]:
# Splitting dataset into training and testing
train_df, test_df = train_test_split(df, test_size=0.97)
train_df, test_df

(        UserID  MovieID  Rating  Timestamp  \
 101749    1389       95       2  962642543   
 552467    1849      726       3  975872721   
 247571      33      242       4  977529882   
 596659    4249      812       4  958180253   
 758670    1217     1194       3  966355716   
 ...        ...      ...     ...        ...   
 768007    1510     1221       4  960520162   
 948919     617     1886       3  974496809   
 484518    5136      619       4  974424140   
 346351    3020      395       3  977372387   
 475010    1038      602       4  966133069   
 
                                         MovieTitle  \
 101749   Close Encounters of the Third Kind (1977)   
 552467                           Waterworld (1995)   
 247571                       One False Move (1991)   
 596659                         Arrival, The (1996)   
 758670                       Risky Business (1983)   
 ...                                            ...   
 768007                          French Kiss (199

In [22]:
# Training Dataset
train_ds = np.zeros((n_users, n_movies))
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
train_ds = pd.DataFrame(train_ds)

# Testing Dataset
test_ds = np.zeros((n_users, n_movies))
for row in test_df.itertuples():
    test_ds[row[1]-1, row[2]-1] = row[3]
test_ds = pd.DataFrame(test_ds)

train_ds, test_ds

(      0     1     2     3     4     5     6     7     8     9     ...  1996  \
 0      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 1      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 2      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 3      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 4      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 ...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
 6035   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 6036   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 6037   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 6038   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 6039   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0   0.0   0.0  ...   0.0   
 
       1997  1998  1999  2000  2001  2

### Using the cosine similarity algorithm for finding the user similarity 

In [23]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Using the cosine similarity on the training data
user_similarity = cosine_similarity(train_ds)
user_similarity

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

### Using K = 100

In [24]:

np_predictions = np.zeros((n_users, n_movies))
K = 100
EPSILON = 1e-9

# Selecting one random user
random_user_index = np.random.choice(n_users)

for j, rating in enumerate(test_ds.values[random_user_index]):
    if rating > 0:
        # Finding top-K most similar users 
        sim_user_ids = np.argsort(user_similarity[random_user_index])[-(K + 1):-1]

        # Finding the coefficient values of similar users
        sim_val = user_similarity[random_user_index][sim_user_ids]

        # Calculating the average value of the current user's ratings
        sim_users = train_ds.values[sim_user_ids]
        user_mean = np.sum(train_ds.values[random_user_index]) / (np.sum(np.clip(train_ds.values[random_user_index], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # Selecting the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])

       
        
        np_predictions[random_user_index][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        np_predictions[random_user_index][j] = np.clip(np_predictions[random_user_index][j], 0, 5)


In [25]:
labels = test_ds.values

# squared error on all ratings
squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (User-based),k= 100: " + str(RMSE));

RMSE on Tesing set (User-based),k= 100: 3.7765305365246853


### Using k=50

In [26]:
np_predictions = np.zeros((n_users, n_movies))
K = 50
EPSILON = 1e-9

# Selecting one random user
random_user_index = np.random.choice(n_users)

for j, rating in enumerate(test_ds.values[random_user_index]):
    if rating > 0:
        # Finding top-K most similar users 
        sim_user_ids = np.argsort(user_similarity[random_user_index])[-(K + 1):-1]

        # Finding the coefficient values of similar users
        sim_val = user_similarity[random_user_index][sim_user_ids]

        # Calculating the average value of the current user's ratings
        sim_users = train_ds.values[sim_user_ids]
        user_mean = np.sum(train_ds.values[random_user_index]) / (np.sum(np.clip(train_ds.values[random_user_index], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # Selecting the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])

       
        
        np_predictions[random_user_index][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        np_predictions[random_user_index][j] = np.clip(np_predictions[random_user_index][j], 0, 5)


In [27]:
labels = test_ds.values

# squared error on all ratings
squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (User-based), k= 50: " + str(RMSE));

RMSE on Tesing set (User-based), k= 50: 3.7761478469737697


### Using k=30

In [28]:
np_predictions = np.zeros((n_users, n_movies))
K = 30
EPSILON = 1e-9

# Selecting one random user
random_user_index = np.random.choice(n_users)

for j, rating in enumerate(test_ds.values[random_user_index]):
    if rating > 0:
        # Finding top-K most similar users 
        sim_user_ids = np.argsort(user_similarity[random_user_index])[-(K + 1):-1]

        # Finding the coefficient values of similar users
        sim_val = user_similarity[random_user_index][sim_user_ids]

        # Calculating the average value of the current user's ratings
        sim_users = train_ds.values[sim_user_ids]
        user_mean = np.sum(train_ds.values[random_user_index]) / (np.sum(np.clip(train_ds.values[random_user_index], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # Selecting the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])

       
        
        np_predictions[random_user_index][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        np_predictions[random_user_index][j] = np.clip(np_predictions[random_user_index][j], 0, 5)


In [29]:
labels = test_ds.values

# squared error on all ratings
squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (User-based), k= 30: " + str(RMSE));

RMSE on Tesing set (User-based), k= 30: 3.776628083464645


### Uisng k = 20

In [30]:
np_predictions = np.zeros((n_users, n_movies))
K = 20
EPSILON = 1e-9

# Selecting one random user
random_user_index = np.random.choice(n_users)

for j, rating in enumerate(test_ds.values[random_user_index]):
    if rating > 0:
        # Finding top-K most similar users 
        sim_user_ids = np.argsort(user_similarity[random_user_index])[-(K + 1):-1]

        # Finding the coefficient values of similar users
        sim_val = user_similarity[random_user_index][sim_user_ids]

        # Calculating the average value of the current user's ratings
        sim_users = train_ds.values[sim_user_ids]
        user_mean = np.sum(train_ds.values[random_user_index]) / (np.sum(np.clip(train_ds.values[random_user_index], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # Selecting the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])

       
        
        np_predictions[random_user_index][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        np_predictions[random_user_index][j] = np.clip(np_predictions[random_user_index][j], 0, 5)


In [31]:
labels = test_ds.values

# squared error on all ratings
squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (User-based): " + str(RMSE));

RMSE on Tesing set (User-based): 3.775956670681962


 ### Using k = 10

In [32]:
np_predictions = np.zeros((n_users, n_movies))
K = 20
EPSILON = 1e-9

# Selecting one random user
random_user_index = np.random.choice(n_users)

for j, rating in enumerate(test_ds.values[random_user_index]):
    if rating > 0:
        # Finding top-K most similar users 
        sim_user_ids = np.argsort(user_similarity[random_user_index])[-(K + 1):-1]

        # Finding the coefficient values of similar users
        sim_val = user_similarity[random_user_index][sim_user_ids]

        # Calculating the average value of the current user's ratings
        sim_users = train_ds.values[sim_user_ids]
        user_mean = np.sum(train_ds.values[random_user_index]) / (np.sum(np.clip(train_ds.values[random_user_index], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # Selecting the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])

       
        
        np_predictions[random_user_index][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        np_predictions[random_user_index][j] = np.clip(np_predictions[random_user_index][j], 0, 5)


In [33]:
labels = test_ds.values

# squared error on all ratings
squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (User-based), k=10: " + str(RMSE));

RMSE on Tesing set (User-based), k=10: 3.776655180721159


# Task 2: Item-based Filtering

### Using the cosine similarity test for item-item similarity

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
# Using the transpose function, we can make the movies become the rows
movie_similarity = cosine_similarity(train_ds.transpose())
movie_similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.00799019],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.0207289 ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.00799019, 0.0207289 , ..., 0.        , 0.        ,
        1.        ]])

In [40]:
import numpy as np


K = 3
EPSILON = 1e-9

# Pre-compute the mean rating for each movie
movie_means = np.nanmean(train_ds.values, axis=0)

# Create a placeholder for predicted ratings
np_predictions = np.zeros((n_users, n_movies))


# Randomly choose a movie for prediction
random_movie_index = np.random.choice(n_movies)

# Loop through each user and rated movie to predict the rating
for i, user_ratings in enumerate(test_ds.values):
    # Indices of movies that the user has rated
    rated_movies = np.where(user_ratings > 0)[0]
    
    # If the user has rated at least one movie
    if len(rated_movies) > 0:
        # For each rated movie
        for j in rated_movies:
            # Get indices of top-K similar movies, excluding the movie itself
            sim_movie_ids = np.argsort(movie_similarity[j])[-(K + 1):-1]
            
            # Extract the similarities and ratings of the top-K similar movies
            sim_values = movie_similarity[j][sim_movie_ids]
            sim_movie_ratings = train_ds.values[i, sim_movie_ids]
            
            # Identify the movies among the top-K that the user has actually rated
            mask_rated = sim_movie_ratings > 0
            
            # Compute predicted rating using a weighted sum
            if np.sum(mask_rated) > 0:
                numerator = np.dot(sim_values[mask_rated], sim_movie_ratings[mask_rated] - movie_means[sim_movie_ids][mask_rated])
                denominator = np.sum(np.abs(sim_values[mask_rated])) + EPSILON
                predicted_rating = movie_means[j] + (numerator / denominator)
                
                # Clip the predicted rating to be between 0 and 5
                np_predictions[i, j] = np.clip(predicted_rating, 0, 5)


In [41]:
labels = test_ds.values

# squared error on all ratings
squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (Item-based): " + str(RMSE))

RMSE on Tesing set (Item-based): 3.767983436875267


### Using the jaccard similarity test for item-item similarity

In [34]:


n_movies = train_ds.shape[1]

# Initialize a zero matrix for storing item-item Jaccard similarities
np_item_jaccard_sim = np.zeros((n_movies, n_movies))

for i, item_i_vec in enumerate(train_ds.T.values):
    for j, item_j_vec in enumerate(train_ds.T.values):
        
        # Boolean masks for rated items
        mask_i = item_i_vec > 0
        mask_j = item_j_vec > 0
        
        # Find intersection and union of two masks
        intersection = np.sum(mask_i & mask_j)
        union = np.sum(mask_i | mask_j)
        
        # Calculate Jaccard similarity
        if union == 0:
            sim = 0
        else:
            sim = intersection / union
        
        np_item_jaccard_sim[i][j] = sim


In [35]:
np_item_jaccard_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.01587302],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.01098901],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.01587302, 0.01098901, ..., 0.        , 0.        ,
        1.        ]])

In [37]:
# Randomly choose a movie for prediction
random_movie_index = np.random.choice(n_movies)
movie_means = np.nanmean(train_ds.values, axis=0)
# Create a placeholder for predicted ratings for this movie only
np_predictions_random_movie = np.zeros(n_users)

# Loop through each user and rated movie to predict the rating
for i, user_ratings in enumerate(test_ds.values):
    # Check if the user has rated the randomly chosen movie
    if user_ratings[random_movie_index] > 0:
        
        # Get indices of top-K similar movies, excluding the movie itself
        sim_movie_ids = np.argsort(np_item_jaccard_sim[random_movie_index])[-(K + 1):-1]
        
        # Extract the Jaccard similarities and ratings of the top-K similar movies
        sim_values = np_item_jaccard_sim[random_movie_index][sim_movie_ids]
        sim_movie_ratings = train_ds.values[i, sim_movie_ids]
        
        # Identify the movies among the top-K that the user has actually rated
        mask_rated = sim_movie_ratings > 0
        
        # Compute predicted rating using a weighted sum
        if np.sum(mask_rated) > 0:
            numerator = np.dot(sim_values[mask_rated], sim_movie_ratings[mask_rated] - movie_means[sim_movie_ids][mask_rated])
            denominator = np.sum(np.abs(sim_values[mask_rated])) + EPSILON
            predicted_rating = movie_means[random_movie_index] + (numerator / denominator)
            
            # Clip the predicted rating to be between 0 and 5
            np_predictions_random_movie[i] = np.clip(predicted_rating, 0, 5)




In [38]:
labels = test_ds.values

# squared error on all ratings
squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (Item-based): " + str(RMSE))

RMSE on Tesing set (Item-based): 3.776655180721159


# Task 3: A Better Recommender System

#### Option Chosen: `Option1RecSys`

Recommender system used: NEURAL COLLABORATIVE FILTERING 

Reference: He, X., Liao, L., Zhang, H., Nie, L., Hu, X., & Chua, T. S. (2017). Neural Collaborative Filtering.

In [16]:
# Installing libraries for carrying neural collaboratuve filtering
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,Embedding,Flatten,Concatenate,Dense
from tensorflow.keras.optimizers import Adam

2023-10-18 14:10:16.491716: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Loading the data

In [17]:
# Using and loading the merged_df dataframe 
merged_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp,MovieTitle,MovieGenre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,White Boys (1999),Drama
1000207,5851,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


### Processing the data 

In [18]:
user_id = df["UserID"].unique().tolist()# Making the userID unique and in a well sequenced list 
user_id_encoded = {x: i for i,x in enumerate(user_id)}# Making a dictionary mapping the unique value of userid from the sequenced integer number
user_id_encoded


movie_id = df["MovieID"].unique().tolist() # Making the MovieID unique and in a well sequenced list 
Movie_id_encoded = {x: i for i,x in enumerate(movie_id)}# Making a dictionary mapping the unique value of MovieID from the sequenced integer number


# creating new dataframes for movie and user

df["UserID"] = df["UserID"].map(user_id_encoded)# mapping the userID to corresponding interger in user_id_encoded
df["MovieID"] = df["MovieID"].map(Movie_id_encoded)# mapping the movieID to corresponding integer in movie_id_encoded




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["UserID"] = df["UserID"].map(user_id_encoded)# mapping the userID to corresponding interger in user_id_encoded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["MovieID"] = df["MovieID"].map(Movie_id_encoded)# mapping the movieID to corresponding integer in movie_id_encoded


In [19]:
# Getting the number of users and movies

n_users = len(user_id_encoded)
n_movies = len(Movie_id_encoded)
print("Number of users in the merged_df data is " + str(n_users))
print("Number of movies in the merged_df data is " + str(n_movies))

Number of users in the merged_df data is 6040
Number of movies in the merged_df data is 2006


In [20]:
# Training-splitting data
df_train,df_test = train_test_split(merged_df,test_size = 0.2, random_state = 42)

In [25]:
# Example
max_user_id = df_train['UserID'].max() 
max_movie_id = df_train['MovieID'].max()  
print("Maximum user id is : "+  str(max_user_id))
print("Maximum movie id is : "+  str(max_movie_id))

Maximum user id is : 6040
Maximum movie id is : 3952


In [21]:
n_unique_users = df_train['UserID'].nunique()
n_unique_movies = df_train['MovieID'].nunique()


In [23]:
# Building model 
embedding_size = 50

# User embedding
user_input = Input(shape=(1,))
user_embedding = Embedding(input_dim=6041, output_dim=embedding_size, input_length=1)(user_input)
user_vec = Flatten()(user_embedding)

# Movie embedding
movie_input = Input(shape=(1,))
movie_embedding = Embedding(input_dim=3953, output_dim=embedding_size, input_length=1)(movie_input)
movie_vec = Flatten()(movie_embedding)

# Concatenate features that are movie and users 
merged = Concatenate()([user_vec, movie_vec])



# Establishing Neural network collaborative filtering

hidden = Dense(128,activation = 'relu')(merged)
hidden = Dense(64,activation = 'relu')(hidden)


# Output layer
output = Dense(1)(hidden)

model = Model([user_input,movie_input],output)
model.compile(optimizer = Adam(0.001), loss = 'mean_squared_error')

In [24]:
# Fitting the model

history = model.fit(
    [df_train["UserID"], df_train["MovieID"]],
    df_train["Rating"],
    batch_size=2048,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
# Make prediction
predictions = model.predict([df_test["UserID"], df_test["MovieID"]])






In [34]:
df_test.sample()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,MovieTitle,MovieGenre
106749,6016,982,2,956958317,Picnic (1955),Drama


In [37]:
import numpy as np

# For a single prediction
user_id_to_predict = np.array([6016])  
movie_id_to_predict = np.array([982])  

# Make the prediction
predicted_rating = model.predict([user_id_to_predict, movie_id_to_predict])

print(f"The predicted rating for user {user_id_to_predict[0]} and movie {movie_id_to_predict[0]} is {predicted_rating[0][0]}")



The predicted rating for user 6016 and movie 982 is 3.422943115234375


In [38]:
df_train.sample()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,MovieTitle,MovieGenre
908725,2785,3185,3,972988761,Snow Falling on Cedars (1999),Drama


In [39]:
import numpy as np

# For a single prediction
user_id_to_predict = np.array([2785]) 
movie_id_to_predict = np.array([3185]) 

# Make the prediction
predicted_rating = model.predict([user_id_to_predict, movie_id_to_predict])

print(f"The predicted rating for user {user_id_to_predict[0]} and movie {movie_id_to_predict[0]} is {predicted_rating[0][0]}")



The predicted rating for user 2785 and movie 3185 is 2.9553656578063965


In [42]:
df_train.sample()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,MovieTitle,MovieGenre
23642,932,1270,2,975186059,Back to the Future (1985),Comedy|Sci-Fi


In [43]:
import numpy as np

# For a single prediction
user_id_to_predict = np.array([932])  
movie_id_to_predict = np.array([1270]) 

# Make the prediction
predicted_rating = model.predict([user_id_to_predict, movie_id_to_predict])

print(f"The predicted rating for user {user_id_to_predict[0]} and movie {movie_id_to_predict[0]} is {predicted_rating[0][0]}")



The predicted rating for user 932 and movie 1270 is 2.9649417400360107


### Task 3.2

### MovieAvg

In [48]:
# Making the ratinng matrix {user-item}
ratings = np.zeros((n_users, n_movies))
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
print(ratings)

[[0. 0. 0. ... 0. 0. 5.]
 [0. 0. 0. ... 0. 0. 4.]
 [0. 0. 4. ... 0. 0. 4.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [3. 3. 4. ... 0. 0. 5.]]


In [51]:
# Making the rating number matrix from users and movies 
ratingsNum = np.zeros((n_users, n_movies))
for row in df.itertuples():
    ratingsNum[row[1]-1, row[2]-1] = 1
print(ratingsNum)


[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 1. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 1.]]


In [54]:
# Calculating the movie average 
itemRateNum = ratingsNum.sum(axis=0)
itemRateSum = ratings.sum(axis=0)
itemRateAvg = itemRateSum/itemRateNum
print(itemRateAvg)


[3.4647619  4.15408805 3.86387833 ... 2.91176471 3.1942446  4.39072464]
[ 525.  636. 1315. ...  102.  139. 1725.]


In [66]:
import random
top_n = 30 # setting the movies to be recommended to be 30 


def evaluate_AP_NDCG(recommended, actual):
    # Computing Average Precision (AP) and Normalized Discounted Cumulative Gain (NDCG)
    AP = 0
    NDCG = 0
    num_hits = 0
    for i, rec in enumerate(recommended):
        if rec in actual:
            num_hits += 1
            AP += num_hits / (i + 1)
    AP /= len(actual)
    
    DCG = sum([(rec in actual) / np.log2(i + 2) for i, rec in enumerate(recommended)])
    IDCG = sum([1 / np.log2(i + 2) for i in range(len(actual))])
    NDCG = DCG / IDCG
    
    return AP, NDCG

# Identifying users who have rated more than 100 movies
users_to_consider = np.where(np.sum(ratings > 0, axis=1) > 100)[0]
chosen_users = random.sample(list(users_to_consider), 5)



for user in chosen_users:
    mask_user = ratings[user, :] > 0  # Movies rated by the current 'user'
    
    itemRateAvgCurrent = itemRateAvg.copy()
    itemRateAvgCurrent[mask_user] = 0  # Set to zero the ratings of movies already rated by 'user'
    
    itemSortInd = itemRateAvgCurrent.argsort()
    print("")
    print(f'Recommendations for User {user}')
    print('movie ID' + '\t movie title')
    print(final_df['MovieTitle'][itemSortInd[range(len(itemSortInd)-1, len(itemSortInd)-top_n-1, -1)]])



Recommendations for User 1902
movie ID	 movie title
27                                     Persuasion (1995)
496                                Mrs. Doubtfire (1993)
1309    Blood For Dracula (Andy Warhol's Dracula) (1974)
123                        Flirting With Disaster (1996)
1823                            Perfect Murder, A (1998)
1171                            Strictly Ballroom (1992)
20                                     Get Shorty (1995)
45                  How to Make an American Quilt (1995)
1129                                   Snowriders (1996)
68                                         Friday (1995)
312                               Specialist, The (1994)
2005     Night Porter, The (Il Portiere di notte) (1974)
617                            My Favorite Season (1993)
1650                         Big Bang Theory, The (1994)
775                                       Kingpin (1996)
1344                                     Ridicule (1996)
500                                