# Initial Setup

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd
import numpy as np

Import required libraries

# Read csv files

In [12]:
df_movies = pd.read_csv('/content/drive/My Drive/set_your_path/movies.csv',sep=';',encoding='latin-1').drop('Unnamed: 3',axis=1)
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


Dropping NaN column 'Unnamed: 3' which is created due to format error in dataset is done.
Dataset is not comma seperated, its semicolen seperated. Therefore, read the dataset as above.

In [13]:
df_ratings = pd.read_csv('/content/drive/My Drive/set_your_path/ratings.csv',sep=';')
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [14]:
df_users = pd.read_csv('/content/drive/My Drive/set_your_path/users.csv',sep=';')
df_users

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,1060


In [15]:
df_movies.shape

(3883, 3)

Check unique values in genres column.

In [16]:
df_movies['genres'].unique()

array(["Animation|Children's|Comedy", "Adventure|Children's|Fantasy",
       'Comedy|Romance', 'Comedy|Drama', 'Comedy',
       'Action|Crime|Thriller', "Adventure|Children's", 'Action',
       'Action|Adventure|Thriller', 'Comedy|Drama|Romance',
       ' Dead and Loving It (1995)', "Animation|Children's", 'Drama',
       'Action|Adventure|Romance', 'Drama|Thriller', 'Drama|Romance',
       'Thriller', ' When Nature Calls (1995)', 'Action|Comedy|Drama',
       'Crime|Drama|Thriller', 'Drama|Sci-Fi', 'Romance',
       'Adventure|Sci-Fi', 'Adventure|Romance', "Children's|Comedy|Drama",
       'Documentary', 'Drama|War', 'Action|Crime|Drama',
       'Action|Adventure', 'Crime|Thriller',
       "Animation|Children's|Musical|Romance", 'Action|Drama|Thriller',
       "Children's|Comedy", 'Drama|Mystery', ' Beyond Cyberspace (1996)',
       'Action|Comedy|Crime|Horror|Thriller', 'Drama|Musical',
       'Sci-Fi|Thriller', 'Crime|Drama|Romance', 'Adventure|Drama',
       'Action|Thriller', ' Ho

Genres column includes some false data such as movies names. Therefore we need to handle this.

In [17]:
# List of genres to filter by
specified_genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western"
    ]

# Filter the DataFrame to keep rows with at least one of the specified genres
df_movies_filtered = df_movies[df_movies['genres'].str.split('|').apply(lambda x: any(item in specified_genres for item in x))]
df_movies_filtered

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


README file says dataset includes only above types of genres. Therefore remove datarecords that doesnt have at least one of above genre in genres column.

In [18]:
df_movies_filtered['genres'].unique()

array(["Animation|Children's|Comedy", "Adventure|Children's|Fantasy",
       'Comedy|Romance', 'Comedy|Drama', 'Comedy',
       'Action|Crime|Thriller', "Adventure|Children's", 'Action',
       'Action|Adventure|Thriller', 'Comedy|Drama|Romance',
       "Animation|Children's", 'Drama', 'Action|Adventure|Romance',
       'Drama|Thriller', 'Drama|Romance', 'Thriller',
       'Action|Comedy|Drama', 'Crime|Drama|Thriller', 'Drama|Sci-Fi',
       'Romance', 'Adventure|Sci-Fi', 'Adventure|Romance',
       "Children's|Comedy|Drama", 'Documentary', 'Drama|War',
       'Action|Crime|Drama', 'Action|Adventure', 'Crime|Thriller',
       "Animation|Children's|Musical|Romance", 'Action|Drama|Thriller',
       "Children's|Comedy", 'Drama|Mystery',
       'Action|Comedy|Crime|Horror|Thriller', 'Drama|Musical',
       'Sci-Fi|Thriller', 'Crime|Drama|Romance', 'Adventure|Drama',
       'Action|Thriller', "Adventure|Children's|Comedy|Musical",
       'Action|Drama|War', 'Action|Adventure|Crime', 'Crime'

Check unique values in genres column to ensure that false data are removed.

# Handling missing values

In [19]:
df_movies.shape

(3883, 3)

In [20]:
df_movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [21]:
df_ratings.shape

(1000209, 4)

In [22]:
df_ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [23]:
df_users.shape

(6040, 5)

In [24]:
df_users.isnull().sum()

userId        0
gender        0
age           0
occupation    0
zip-code      0
dtype: int64

There are no missing values in any column. Therefore we dont have to handle mising values.

# Handling outliers

In [25]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userId      6040 non-null   int64 
 1   gender      6040 non-null   object
 2   age         6040 non-null   int64 
 3   occupation  6040 non-null   int64 
 4   zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [26]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  3883 non-null   int64 
 1   title    3883 non-null   object
 2   genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [27]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   userId     1000209 non-null  int64
 1   movieId    1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


Since there are no any continuous data fields, we dont have to handle outliers.

# Encoding genres

Since genres are categorical we need to encode them to represent them in numbers to make the clustering process efficient.

In [28]:
data = pd.merge(pd.merge(df_ratings, df_users), df_movies)
data

Unnamed: 0,userId,movieId,rating,timestamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western


Mearge all 3 datasets.

In [29]:
data['genres'].unique()

array(['Drama', "Animation|Children's|Musical", 'Musical|Romance',
       "Animation|Children's|Comedy", 'Action|Adventure|Comedy|Romance',
       'Action|Adventure|Drama', 'Comedy|Drama',
       "Adventure|Children's|Drama|Musical", 'Musical', 'Comedy',
       "Animation|Children's", 'Comedy|Fantasy',
       ' The Best of Aardman Animation (1996)', 'Comedy|Sci-Fi',
       'Drama|War', 'Romance', "Animation|Children's|Musical|Romance",
       "Children's|Drama|Fantasy|Sci-Fi", 'Drama|Romance',
       'Animation|Comedy|Thriller',
       "Adventure|Animation|Children's|Comedy|Musical",
       "Animation|Children's|Comedy|Musical", 'Thriller',
       'Action|Crime|Romance', ' Episode IV - A New Hope (1977)',
       "Children's|Comedy|Musical", 'Action|Drama|War',
       "Children's|Drama", 'Crime|Drama|Thriller', 'Action|Crime|Drama',
       ' Impossible (1996)', 'Crime|Drama',
       ' Episode I - The Phantom Menace (1999)',
       'Action|Adventure|Sci-Fi|Thriller',
       ' Episode VI 

In [30]:
# Split the genres using the '|' delimiter and create a list of lists
genre_lists = df_movies['genres'].str.split('|')

# Flatten the list of lists to get a single list of genres
all_genres = [genre for genre_list in genre_lists for genre in genre_list]

# Find the unique genres
unique_genres = set(all_genres)

# Count the number of unique genres
num_unique_genres = len(unique_genres)

print(f"Number of unique genres: {num_unique_genres}")

Number of unique genres: 214


In [31]:
# One hot encoding genres
genres = df_movies['genres'].str.get_dummies('|')
genres
data = pd.concat([data, genres], axis=1)
data

Unnamed: 0,userId,movieId,rating,timestamp,gender,age,occupation,zip-code,title,genres,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary,...,,,,,,,,,,
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama,...,,,,,,,,,,
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama,...,,,,,,,,,,
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western,...,,,,,,,,,,


In [32]:
# User genre matrix for content basrd filtering
user_genre_matrix = data.groupby('userId')[genres.columns].sum()
user_genre_matrix

Unnamed: 0_level_0,3D (1982),A Cinderella Story (1998),A Film That Was Never Made (1994),A Journal of Murder (1995),"A Mediaeval Odyssey, The (1988)",A New Beginning (1985),A New Generation (1993),A Power Rangers Movie (1997),A Sense of Life (1997),A Space Odyssey (1968),...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Content based filtering

Process the column genres in movies dataset by removing unneccesary words such as stop words in the description. And then create a matrix of genre counts where rows represent movies and columns represent terms in genres column.

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
genres = vectorizer.fit_transform(df_movies.genres).toarray()
contents = pd.DataFrame(genres,columns=vectorizer.get_feature_names_out())
print('Shape of the content table :',contents.shape)
contents.head()

Shape of the content table : (3883, 347)


Unnamed: 0,1919,1956,1963,1968,1974,1977,1978,1979,1980,1981,...,wight,willowbrook,witch,worrying,wrath,years,yellow,yes,york,yu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Set up the nearest neighbors model with the cosine distance metric to find similar items in the dataset of movie genres.

In [34]:
from sklearn.neighbors import NearestNeighbors
nn_algo = NearestNeighbors(metric='cosine')
nn_algo.fit(contents)

Recommender class creates the movie recommendation system that allows to get movie recommendations based on an input movie or based on the history of previously recommended movies. Nearest neighbors model and the genre descriptions of movies are used to make recommendations.

In [35]:
class Recommender:
    def __init__(self):
        # Store movies that called at least ones using recommend_on_movie method
        self.hist = []
        # Check whether history is empty
        self.ishist = False

    # Recommend movies based on a movie that passed as the parameter
    def recommend_on_movie(self,movie,n_recommend = 5):
        self.ishist = True
        iloc = df_movies[df_movies['title']==movie].index[0]
        self.hist.append(iloc)
        distance,neighbors = nn_algo.kneighbors([contents.iloc[iloc]],n_neighbors=n_recommend+1)
        recommeds = [df_movies.iloc[i]['title'] for i in neighbors[0] if i not in [iloc]]
        return recommeds[:n_recommend]

    # Recommend movies based on history stored in self.hist list
    def recommend_on_history(self,n_recommend = 5):
        if self.ishist == False:
            return print('No history found')
        history = np.array([list(contents.iloc[iloc]) for iloc in self.hist])
        distance,neighbors = nn_algo.kneighbors([np.average(history,axis=0)],n_neighbors=n_recommend + len(self.hist))
        recommeds = [df_movies.iloc[i]['title'] for i in neighbors[0] if i not in self.hist]
        return recommeds[:n_recommend]

Create an instance of Recommender class.

In [36]:
recommender = Recommender()

recommend_on_history() method is called. Since no prior history, No history found output is given.

In [37]:
recommender.recommend_on_history()

No history found


Recommend movies based on the movie that is passed as the parameter using recommend_on_movie() method.

In [38]:
recommender.recommend_on_movie('Father of the Bride Part II (1995)')



['Waiting for Guffman (1996)',
 'Jimmy Hollywood (1994)',
 'Kolya (1996)',
 'Life with Mikey (1993)',
 '8 1/2 Women (1999)']

recommend_on_history() method is called. Since now prior history is available, recommendations given.

In [39]:
recommender.recommend_on_history()



['Waiting for Guffman (1996)',
 'Jimmy Hollywood (1994)',
 'Kolya (1996)',
 'Life with Mikey (1993)',
 '8 1/2 Women (1999)']

In [40]:
recommender.recommend_on_movie('Tigerland (2000)')



['Breaking the Waves (1996)',
 'Jails, Hospitals & Hip-Hop (2000)',
 'They Bite (1996)',
 'Black Tights (Les Collants Noirs) (1960)',
 'Identification of a Woman (Identificazione di una donna) (1982)']

In [41]:
recommender.recommend_on_history()



['Sleepover (1995)',
 'Seven Beauties (Pasqualino Settebellezze) (1976)',
 'Virgin Suicides, The (1999)',
 'Man on the Moon (1999)',
 'Two Girls and a Guy (1997)']

In [42]:
recommender.recommend_on_movie('Dracula')



['Nemesis 2',
 'Best of the Best 3',
 'Mighty Morphin Power Rangers',
 'Gumby',
 'Die Hard']

In [43]:
recommender.recommend_on_history()



['Play it to the Bone (1999)',
 'Seven Beauties (Pasqualino Settebellezze) (1976)',
 "Swept Away (Travolti da un insolito destino nell'azzurro mare d'Agosto) (1975)",
 'Muse, The (1999)',
 'Carriers Are Waiting, The (Les Convoyeurs Attendent) (1999)']

In [44]:
recommender.recommend_on_movie('Money Train (1995)')



['Shadow, The (1994)',
 'Black Mask (Hak hap) (1996)',
 'Stranger, The (1994)',
 'Shanghai Noon (2000)',
 'Thunderball (1965)']

In [45]:
recommender.recommend_on_history()



['Get Shorty (1995)',
 'Buffalo 66 (1998)',
 'Faster Pussycat! Kill! Kill! (1965)',
 'Lethal Weapon 4 (1998)',
 'Lethal Weapon 2 (1989)']

In [46]:
recommender.recommend_on_movie('GoldenEye (1995)')



['Anaconda (1997)',
 'Clear and Present Danger (1994)',
 'Surviving the Game (1994)',
 'Chain Reaction (1996)',
 'Rock, The (1996)']

In [47]:
recommender.recommend_on_history()



['Runaway Train (1985)',
 'Daylight (1996)',
 'Con Air (1997)',
 'Fire Down Below (1997)',
 'Outbreak (1995)']