# Import the necessary libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

## Load MovieLens dataset

In [None]:
# Download location of the MovieLens 100k zip archive.
DATASET_LINK='http://files.grouplens.org/datasets/movielens/ml-100k.zip'

In [None]:
!wget -nc http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -n ml-100k.zip

--2022-12-03 11:32:58--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2022-12-03 11:32:59 (11.3 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

Loading u.info     -- The number of users, items, and ratings in the u data set.

In [None]:
# u.info lists the user, item and rating counts, one entry per line.
overall_stats = pd.read_csv('ml-100k/u.info', header=None)
print(
    "Details of users, items and ratings involved in the loaded movielens dataset: ",
    overall_stats[0].tolist(),
)

Details of users, items and ratings involved in the loaded movielens dataset:  ['943 users', '1682 items', '100000 ratings']


In [None]:
# u.data is tab-separated and has no header row; its item id column is
# labelled 'movie id' here.
column_names1 = ['user id', 'movie id', 'rating', 'timestamp']
dataset = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=column_names1,
    header=None,
)
dataset.head()


Unnamed: 0,user id,movie id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
# Column headers for u.item in file order: five descriptive fields followed by
# the genre indicator flags. `d` keeps the ' | '-delimited form.
d = ' | '.join([
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
])
column_names2 = d.split(' | ')
column_names2

['movie id',
 'movie title',
 'release date',
 'video release date',
 'IMDb URL',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [None]:
# u.item is pipe-delimited, has no header row and is read as Latin-1.
items_dataset = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    header=None,
    names=column_names2,
    encoding='latin-1',
)
items_dataset

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Keep just the id and title columns of the item table.
movie_dataset = items_dataset.loc[:, ['movie id', 'movie title']]
movie_dataset.head()

Unnamed: 0,movie id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


## Merging required datasets

In [None]:
# Attach the movie title to every rating row by joining on 'movie id'.
merged_dataset = dataset.merge(movie_dataset, on='movie id', how='inner')
merged_dataset.head()

Unnamed: 0,user id,movie id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [None]:
# One row per (user, movie): repeated ratings of the same pair are averaged.
refined_dataset = (
    merged_dataset
    .groupby(['user id', 'movie id', 'movie title'], as_index=False)
    .agg({'rating': 'mean'})
)

refined_dataset

Unnamed: 0,user id,movie id,movie title,rating
0,1,1,Toy Story (1995),5.0
1,1,2,GoldenEye (1995),3.0
2,1,3,Four Rooms (1995),4.0
3,1,4,Get Shorty (1995),3.0
4,1,5,Copycat (1995),3.0
...,...,...,...,...
99995,943,1067,Bottle Rocket (1996),2.0
99996,943,1074,Reality Bites (1994),4.0
99997,943,1188,Young Guns II (1990),3.0
99998,943,1228,Under Siege 2: Dark Territory (1995),3.0



# Nearest Neighbour model to build user-user based collaborative Recommender System
This part of the code performs movie recommendation using KNN with input as User ID, size of the neighbourhood, and the number of movies to be recommended:


## Transformation - Reshaping the Dataframe
We need to transform the data so that each row of the dataframe represents a user and each column represents a different movie. Because the user is the subject here (i.e., we are looking for similar users), we want the data as a [users, movies] array. To reshape, we pivot the dataframe into a format with users as rows and movies as columns. Not all users watch all the movies, so we can expect a lot of missing values. 

However, in the MovieLens 100k dataset, every user has rated at least 20 movies. The missing observations are filled with 0s to avoid NaN values while performing linear algebra operations (in this case, calculating cosine distances between vectors). Finally, we transform the values of the dataframe into a scipy sparse matrix for efficient calculations. This sparse matrix is then fed into a KNN model. 

In [None]:
# Pivot into a user x movie-title ratings matrix; movies a user has not rated
# are filled with 0.
user_to_movie_df = (
    refined_dataset
    .pivot_table(index='user id', columns='movie title', values='rating')
    .fillna(0)
)

user_to_movie_df.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


The data is reshaped so that each user is a point in an n-dimensional rating space, where n is the number of movies in the dataset.

The KNN model is trained to find the most similar users to the active user given as input. The system recommends the top movies based on the rating of the similar users in the considered neighbourhood. 

In [None]:
# Store the user x movie ratings in SciPy CSR (compressed sparse row) form.
user_to_movie_sparse_df = csr_matrix(user_to_movie_df.to_numpy())
user_to_movie_sparse_df

<943x1664 sparse matrix of type '<class 'numpy.float64'>'
	with 99693 stored elements in Compressed Sparse Row format>

Fitting K-Nearest Neighbours model to the scipy sparse matrix:

In [None]:
# Brute-force nearest-neighbour search using cosine distance, fitted on the
# sparse user x movie matrix so that each fitted sample is one user.
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_to_movie_sparse_df)

NearestNeighbors(algorithm='brute', metric='cosine')

In [None]:
## function to find top n similar users of the given input user
def get_similar_users(user, n = 5):
  """Print and return the `n` users most similar to `user`.

  The query row is looked up by user id in the notebook-level
  `user_to_movie_df` and matched against `knn_model`, which must have been
  fitted on the same rows in the same order.

  Args:
    user: user id; must be a label of `user_to_movie_df.index`.
    n: number of similar users to return.

  Returns:
    Tuple `(user_ids, distances)` of equal-length arrays, nearest first.

  Raises:
    IndexError: if `user` is not in `user_to_movie_df.index`. Positional
      lookup with `user-1` used to wrap around silently for ids such as 0 or
      negatives and return another user's neighbours.
  """
  if user not in user_to_movie_df.index:
    raise IndexError("user id {!r} is not present in user_to_movie_df".format(user))

  # Resolve the row by label instead of assuming ids are exactly 1..N in order.
  user_pos = user_to_movie_df.index.get_loc(user)
  knn_input = user_to_movie_df.values[user_pos].reshape(1, -1)
  # One extra neighbour is requested because the query user is in the fitted data.
  distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
  distances, indices = distances.flatten(), indices.flatten()

  # Drop the query user wherever it appears; with tied distances it is not
  # guaranteed to be the first hit. If it is absent, keep the n nearest.
  is_self = indices == user_pos
  keep = ~is_self if is_self.any() else np.arange(len(indices)) < n
  distances, indices = distances[keep], indices[keep]
  similar_users = user_to_movie_df.index[indices].to_numpy()

  print("Top",n,"users who are very much similar to the User-",user, "are: ")
  print(" ")
  for i, (similar_user, distance) in enumerate(zip(similar_users, distances), start=1):
    print(i,". User:", similar_user, "separated by distance of",distance)
  return similar_users, distances


Specify the user ID and number of similar users / size of neighbourhood:

In [None]:
from pprint import pprint
# Active user for the step-by-step walkthrough; filtered_movie_recommendations
# reads this module-level name to find the movies the user has already rated.
user_id = 5
#print(" Few of movies seen by the User:")
#pprint(list(refined_dataset[refined_dataset['user id'] == user_id]['movie title'])[:10])
# Ids and cosine distances of the 5 users closest to the active user.
similar_user_list, distance_list = get_similar_users(user_id,5)

Top 5 users who are very much similar to the User- 5 are: 
 
1 . User: 307 separated by distance of 0.5211146313938015
2 . User: 648 separated by distance of 0.521592822610484
3 . User: 407 separated by distance of 0.5308693742726247
4 . User: 497 separated by distance of 0.5355527193614126
5 . User: 660 separated by distance of 0.5419679287503053


The neighbourhood of similar users is now built using the KNN model. The next step is to select the top-N movies to recommend.

This can be implemented in multiple ways. The simplest is to take the average of the existing ratings given by the similar users and pick the top 10 or 15 movies to recommend to our current user.

However, a better recommendation system would weight each user's ratings based on how close they are to the input/active user. Defining these weights gives more accurate recommendations, because it limits the influence of users who are far from the input user.

In [None]:
# Neighbour user ids paired with their cosine distances to the active user.
similar_user_list, distance_list

(array([307, 648, 407, 497, 660]),
 array([0.52111463, 0.52159282, 0.53086937, 0.53555272, 0.54196793]))

In [None]:
# Turn cosine distances into weights that favour closer users: similarity is
# 1 - distance, normalised to sum to 1. Normalising the raw distances instead
# would hand the largest weight to the least similar neighbour.
weightage_list = (1 - distance_list)/np.sum(1 - distance_list)
weightage_list

array([0.19656562, 0.196746  , 0.20024514, 0.2020117 , 0.20443154])

Getting ratings of all movies by derived similar users

In [None]:
# similar_user_list holds user ids (index labels), not row positions, so the
# rows are selected by label. Positional indexing with the ids returns the
# ratings of the wrong users (each shifted by one row) and fails for the last id.
mov_rtngs_sim_users = user_to_movie_df.loc[similar_user_list].values
mov_rtngs_sim_users

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Column labels of the user x movie matrix, i.e. the movie titles in column order.
movies_list = user_to_movie_df.columns
movies_list

Index([''Til There Was You (1997)', '1-900 (1994)', '101 Dalmatians (1996)',
       '12 Angry Men (1957)', '187 (1997)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '3 Ninjas: High Noon At Mega Mountain (1998)', '39 Steps, The (1935)',
       ...
       'Yankee Zulu (1994)', 'Year of the Horse (1997)', 'You So Crazy (1994)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Poisoner's Handbook, The (1995)',
       'Zeus and Roxanne (1997)', 'unknown',
       'Á köldum klaka (Cold Fever) (1994)'],
      dtype='object', name='movie title', length=1664)

In [None]:
# Sanity check before combining: number of weights, shape of the neighbours'
# ratings matrix, and number of movie columns.
print("Weightage list shape:", len(weightage_list))
print("mov_rtngs_sim_users shape:", mov_rtngs_sim_users.shape)
print("Number of movies:", len(movies_list))

Weightage list shape: 5
mov_rtngs_sim_users shape: (5, 1664)
Number of movies: 1664


**Broadcasting the weight vector to the shape of the similar users' rating matrix, so that the two can be multiplied element-wise**

In [None]:
# Expand the per-user weights to one column per movie, giving shape
# (n_similar_users, n_movies). Only the first column of the current value is
# used, so re-running this cell yields the same array instead of stacking an
# extra axis onto the already expanded weights.
weight_column = np.reshape(weightage_list, (len(weightage_list), -1))[:, :1]
weightage_list = np.repeat(weight_column, len(movies_list), axis=1)
weightage_list.shape

(5, 1664)

In [None]:
# Inspection only: the result is not assigned, so weightage_list is unchanged.
weightage_list[:,np.newaxis]

array([[[0.19656562, 0.19656562, 0.19656562, ..., 0.19656562,
         0.19656562, 0.19656562]],

       [[0.196746  , 0.196746  , 0.196746  , ..., 0.196746  ,
         0.196746  , 0.196746  ]],

       [[0.20024514, 0.20024514, 0.20024514, ..., 0.20024514,
         0.20024514, 0.20024514]],

       [[0.2020117 , 0.2020117 , 0.2020117 , ..., 0.2020117 ,
         0.2020117 , 0.2020117 ]],

       [[0.20443154, 0.20443154, 0.20443154, ..., 0.20443154,
         0.20443154, 0.20443154]]])

In [None]:
# Inspection only: a zero vector with one entry per movie; nothing is assigned.
np.zeros(len(movies_list))

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Display the expanded weights: every row repeats a single neighbour's weight.
weightage_list


array([[0.19656562, 0.19656562, 0.19656562, ..., 0.19656562, 0.19656562,
        0.19656562],
       [0.196746  , 0.196746  , 0.196746  , ..., 0.196746  , 0.196746  ,
        0.196746  ],
       [0.20024514, 0.20024514, 0.20024514, ..., 0.20024514, 0.20024514,
        0.20024514],
       [0.2020117 , 0.2020117 , 0.2020117 , ..., 0.2020117 , 0.2020117 ,
        0.2020117 ],
       [0.20443154, 0.20443154, 0.20443154, ..., 0.20443154, 0.20443154,
        0.20443154]])

In [None]:
# Scale every neighbour's ratings by that neighbour's weight, then add the
# rows up to get one aggregate score per movie (unrated movies contribute 0).
new_rating_matrix = np.multiply(weightage_list, mov_rtngs_sim_users)
mean_rating_list = np.sum(new_rating_matrix, axis=0)
mean_rating_list

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
from pprint import pprint
def recommend_movies(n):
  """Pretty-print the titles of the `n` highest-scoring movies.

  Reads the notebook-level `mean_rating_list` (one score per movie) and
  `movies_list` (titles aligned with those scores). No filtering is applied.
  """
  n = min(len(mean_rating_list),n)
  # Ascending argsort reversed gives highest score first; keep the first n.
  top_positions = np.argsort(mean_rating_list)[::-1][:n]
  top_titles = list(movies_list[top_positions])
  pprint(top_titles)


In [None]:
# Unfiltered top 10 by aggregate score; this can include movies the active
# user has already rated.
print("Movies recommended based on similar users are: ")
recommend_movies(10)

Movies recommended based on similar users are: 
['Star Wars (1977)',
 '2001: A Space Odyssey (1968)',
 'Blade Runner (1982)',
 'Return of the Jedi (1983)',
 'North by Northwest (1959)',
 'Apocalypse Now (1979)',
 'Raging Bull (1980)',
 'Toy Story (1995)',
 'Psycho (1960)',
 'Monty Python and the Holy Grail (1974)']


# Post-processing

By overcoming drawbacks of this system, it can be made more efficient. Here are the post processing steps to make the system better:

1. Remove movies that have already been watched by the user.

2. Do not recommend movies that have not been seen by any of the similar users.

In [None]:
#function to remove movies already watched by the user and movies not watched by any of the similar users.
def filtered_movie_recommendations(n):
  """Pretty-print up to `n` recommendations the active user has not rated yet.

  Reads the notebook-level `mean_rating_list` (one aggregate score per movie),
  `movies_list` (titles aligned with those scores), `refined_dataset` and
  `user_id`.

  Args:
    n: maximum number of titles to print; values <= 0 yield no titles.

  Returns:
    None. Prints the titles, or an explanatory message when nothing qualifies.
  """
  ranked_index = np.argsort(mean_rating_list)[::-1]
  # Keep only movies with a positive score, i.e. rated by at least one similar
  # user. Filtering on the score itself does not depend on where tied zeros
  # land in the sort order and also works when no score is zero.
  ranked_index = ranked_index[mean_rating_list[ranked_index] > 0]

  # A set makes the "already watched" check O(1) per title.
  movies_watched = set(refined_dataset[refined_dataset['user id'] == user_id]['movie title'])
  unseen_titles = [title for title in movies_list[ranked_index] if title not in movies_watched]
  final_movie_list = unseen_titles[:max(n, 0)]

  if not final_movie_list:
    print("There are no movies left which are not seen by the input users and seen by similar users. May be increasing the number of similar users who are to be considered may give a chance of suggesting an unseen good movie.")
  else:
    pprint(final_movie_list)


In [None]:
# Up to 10 recommendations for the active user (module-level user_id) after filtering.
filtered_movie_recommendations(10)

['North by Northwest (1959)',
 'Apocalypse Now (1979)',
 'Raging Bull (1980)',
 'Raising Arizona (1987)',
 'Godfather, The (1972)',
 'Boot, Das (1981)',
 'Graduate, The (1967)',
 'Shawshank Redemption, The (1994)',
 'Clockwork Orange, A (1971)',
 'Amadeus (1984)']


# User-User Collaborative Filtering Recommender System

A single function that takes in a user ID, the size of the neighbourhood to be considered, and the number of movies to recommend, and prints the top recommended movies for the user.

In [None]:
from pprint import pprint

def recommender_system(user_id, n_similar_users, n_movies):
  """Print a user's rated movies, nearest neighbours and recommendations.

  Relies on the notebook-level `refined_dataset`, `user_to_movie_df` and
  `knn_model` (fitted on the rows of `user_to_movie_df`, in the same order).

  Args:
    user_id: id of the active user; must be a label of `user_to_movie_df.index`.
    n_similar_users: size of the neighbourhood used to score movies.
    n_movies: maximum number of movies to recommend.

  Returns:
    None. All results are printed.

  Raises:
    IndexError: if `user_id` is not in `user_to_movie_df.index`.
  """
  movies_watched = list(refined_dataset[refined_dataset['user id'] == user_id]['movie title'])
  print("Movie seen by the User:")
  pprint(movies_watched)
  print("")

  def get_similar_users(user, n = 10):
    """Print and return (user ids, cosine distances) of the n nearest users."""
    if user not in user_to_movie_df.index:
      raise IndexError("user id {!r} is not present in user_to_movie_df".format(user))

    # Resolve the row by label instead of assuming ids are exactly 1..N in order.
    user_pos = user_to_movie_df.index.get_loc(user)
    knn_input = user_to_movie_df.values[user_pos].reshape(1, -1)
    # One extra neighbour is requested because the query user is in the fitted data.
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
    distances, indices = distances.flatten(), indices.flatten()

    # Drop the query user wherever it appears; with tied distances it is not
    # guaranteed to be the first hit. If it is absent, keep the n nearest.
    is_self = indices == user_pos
    keep = ~is_self if is_self.any() else np.arange(len(indices)) < n
    distances, indices = distances[keep], indices[keep]
    similar_users = user_to_movie_df.index[indices].to_numpy()

    print("Top",n,"users who are very much similar to the User-",user, "are: ")
    print(" ")
    for i, (similar_user, distance) in enumerate(zip(similar_users, distances), start=1):
      print(i,". User:", similar_user, "separated by distance of",distance)
    print("")
    return similar_users, distances

  def filtered_movie_recommendations(mean_rating_list, movies_list, n = 10):
    """Print up to n positively scored titles the active user has not rated."""
    ranked_index = np.argsort(mean_rating_list)[::-1]
    # Filter on the score itself: this does not depend on where tied zeros
    # land in the sort order and also works when no score is zero.
    ranked_index = ranked_index[mean_rating_list[ranked_index] > 0]
    seen_titles = set(movies_watched)
    unseen_titles = [title for title in movies_list[ranked_index] if title not in seen_titles]
    final_movie_list = unseen_titles[:max(n, 0)]
    if not final_movie_list:
      print("There are no movies left which are not seen by the input users and seen by similar users. May be increasing the number of similar users who are to be considered may give a chance of suggesting an unseen good movie.")
    else:
      pprint(final_movie_list)

  similar_user_list, distance_list = get_similar_users(user_id,n_similar_users)

  # Closer neighbours must count for more, so weight by cosine similarity
  # (1 - distance); normalising raw distances would favour the farthest users.
  similarity_list = np.clip(1 - distance_list, 0, None)
  total_similarity = np.sum(similarity_list)
  if total_similarity > 0:
    weightage_list = similarity_list/total_similarity
  else:
    # No usable similarity (or an empty neighbourhood): weight neighbours equally.
    weightage_list = np.ones(len(similarity_list))/max(len(similarity_list), 1)

  # similar_user_list holds user ids, so rows are selected by label; using the
  # ids as row positions would read the ratings of the wrong users.
  mov_rtngs_sim_users = user_to_movie_df.loc[similar_user_list].values
  movies_list = user_to_movie_df.columns
  # Broadcasting applies each neighbour's weight across all of that user's ratings.
  mean_rating_list = (weightage_list[:,np.newaxis]*mov_rtngs_sim_users).sum(axis =0)
  print("")
  print("Movies recommended based on similar users are: ")
  print("")
  filtered_movie_recommendations(mean_rating_list, movies_list, n_movies)

## Demonstration of the function's results:

In [None]:
# Interactive demo: each value is read from standard input and parsed as an int.
# Note that this rebinds the module-level user_id used by the walkthrough cells.
print("Enter user id")
user_id= int(input())
print("number of similar users to be considered")
sim_users = int(input())
print("Enter number of movies to be recommended:")
n_movies = int(input())
recommender_system(user_id,sim_users,n_movies)

Enter user id
5
number of similar users to be considered
10
Enter number of movies to be recommended:
10
Movie seen by the User:
['Toy Story (1995)',
 'GoldenEye (1995)',
 'From Dusk Till Dawn (1996)',
 'Muppet Treasure Island (1996)',
 'Rumble in the Bronx (1995)',
 'Birdcage, The (1996)',
 'Batman Forever (1995)',
 'To Wong Foo, Thanks for Everything! Julie Newmar (1995)',
 'Clerks (1994)',
 'Star Wars (1977)',
 'Stargate (1994)',
 'Santa Clause, The (1994)',
 'While You Were Sleeping (1995)',
 'Forrest Gump (1994)',
 'Four Weddings and a Funeral (1994)',
 'Fugitive, The (1993)',
 'Hot Shots! Part Deux (1993)',
 'Blade Runner (1982)',
 'So I Married an Axe Murderer (1993)',
 'Home Alone (1990)',
 'Aladdin (1992)',
 'Silence of the Lambs, The (1991)',
 'Snow White and the Seven Dwarfs (1937)',
 'Fargo (1996)',
 'Heavy Metal (1981)',
 'Aristocats, The (1970)',
 'Sgt. Bilko (1996)',
 'Mystery Science Theater 3000: The Movie (1996)',
 'Operation Dumbo Drop (1995)',
 'Independence Day (ID