In [87]:
# importing required libraries
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [88]:
# read the movies and tags tables from CSV files in the working directory
movies_df = pd.read_csv(filepath_or_buffer='movies.csv')
tags_df = pd.read_csv(filepath_or_buffer='tags.csv')

In [89]:
# Drop the columns that no later cell reads (userId, timestamp).
# errors='ignore' makes the cell safe to re-run: tags_df is overwritten here, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
tags_df = tags_df.drop(columns=['userId', 'timestamp'], errors='ignore').reset_index(drop=True)

In [90]:
# count the missing entries in each column of the movies table
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [91]:
# count the missing entries in each column of the tags table
tags_df.isna().sum()

movieId     0
tag        16
dtype: int64

In [92]:
# display the movies table as loaded from movies.csv (columns: movieId, title, genres)
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [93]:
# display the tags table after userId and timestamp were dropped (columns: movieId, tag)
tags_df

Unnamed: 0,movieId,tag
0,260,classic
1,260,sci-fi
2,1732,dark comedy
3,1732,great dialogue
4,7569,so bad it's good
...,...,...
1093355,66934,Neil Patrick Harris
1093356,103341,cornetto trilogy
1093357,189169,comedy
1093358,189169,disabled


In [94]:
# Drop rows whose tag is missing instead of turning them into empty strings.
# An empty-string placeholder would be joined like a real tag, leaving stray
# separators in the combined string (e.g. '|funny' or 'a||b') and keeping movies
# whose only tags are missing in the grouped table with an empty tag.
tags_df = tags_df.dropna(subset=['tag'])

# One row per movieId, with all of that movie's tags joined by '|'
# (the separator str.get_dummies splits on by default).
tags_combined_df = (
    tags_df.groupby('movieId')['tag']
    .agg('|'.join)
    .reset_index()
)

# show result
tags_combined_df

Unnamed: 0,movieId,tag
0,1,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Robin Williams|time travel|fantasy|based on ch...
2,3,funny|best friend|duringcreditsstinger|fishing...
3,4,based on novel or book|chick flick|divorce|int...
4,5,aging|baby|confidence|contraception|daughter|g...
...,...,...
45246,208813,might like
45247,208933,black and white|deal with the devil
45248,209035,computer animation|Japan|mass behavior|mass sc...
45249,209037,chameleon|computer animation|gluttony|humorous...


In [95]:
# Left-join the combined tags onto the movies table by movieId;
# a movie with no entry in tags_combined_df gets NaN in the 'tag' column
merged = movies_df.merge(tags_combined_df, how='left', on='movieId')
merged

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
...,...,...,...,...
62418,209157,We (2018),Drama,
62419,209159,Window of the Soul (2001),Documentary,
62420,209163,Bad Poems (2018),Comedy|Drama,
62421,209169,A Girl Thing (2001),(no genres listed),


In [96]:
# Replace the NaN tags left by the left join with an empty string
merged = merged.assign(tag=merged['tag'].fillna(''))

In [97]:
# keep only the movies whose tag string is non-empty
merged = merged.loc[merged['tag'].ne("")]

In [98]:
# keep only the movies whose genres field is not the "(no genres listed)" placeholder
merged = merged.loc[merged['genres'].ne("(no genres listed)")]

In [99]:
# display the filtered table with its four columns in a fixed order
merged.loc[:, ['movieId', 'title', 'genres', 'tag']]

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
...,...,...,...,...
62340,208800,Lady and the Tramp (2019),Comedy|Romance,might like
62342,208804,Spell (2018),Comedy|Drama|Thriller,Music|Patrick Stump
62346,208813,Noelle (2019),Children,might like
62373,208933,The Devil's Partner (1961),Horror,black and white|deal with the devil


In [100]:
# Draw 1000 rows at random; the fixed random_state makes the draw repeatable
sampled_merged = merged.sample(n=1000, random_state=10)

# Indicator (0/1) columns, one per distinct genre and one per distinct tag,
# obtained by splitting each string on '|'
genres_one_hot = sampled_merged['genres'].str.get_dummies(sep='|')
tags_one_hot = sampled_merged['tag'].str.get_dummies(sep='|')

# Sparse (CSR) copies of the two indicator tables
genres_sparse = csr_matrix(genres_one_hot.to_numpy())
tags_sparse = csr_matrix(tags_one_hot.to_numpy())

# Column totals: number of sampled movies carrying each genre / each tag
genres_sum = genres_sparse.sum(axis=0)
tags_sum = tags_sparse.sum(axis=0)

print('Genres:', genres_sum, sep='\n')
print('\nTags:', tags_sum, sep='\n')

Genres:
[[135  65  42  47 302  94  94 478  37   9 100   3  20  63 122  51 143  37
   26]]

Tags:
[[1 1 1 ... 1 1 1]]


In [101]:
# order the sampled movies alphabetically by title, then display them
sampled_merged = sampled_merged.sort_values(by='title', ascending=True)
sampled_merged

Unnamed: 0,movieId,title,genres,tag
7669,8198,"1000 Eyes of Dr. Mabuse, The (Die 1000 Augen d...",Crime|Horror|Mystery|Thriller,dr. mabuse|manipulation|Fritz Lang|Fritz Lang|...
30345,136010,12 Days of Christmas Eve (2004),Children|Comedy|Drama|Fantasy,christmas eve|holiday|woman director
26876,127573,13 Lakes (2004),Documentary,James Benning
5842,5954,25th Hour (2002),Crime|Drama,Philip Seymour Hoffman|Edward Norton|powerful ...
41608,162070,3 Days of Normal (2012),Children|Comedy|Romance,new hampshire|small town|small town sheriff
...,...,...,...,...
14155,73266,Youth in Revolt (2009),Comedy|Drama|Romance,based on novel or book|coming of age|duringcre...
40416,159367,Zappa (1983),Drama,coming of age
12743,62851,Zen Noir (2004),Comedy|Drama|Mystery,religion:Buddism|dark comedy|death/fatality|DV...
34096,144646,Zombie Ninjas vs Black Ops (2015),Action|Horror,zombie


In [104]:
# Assemble the feature table: identifying columns plus the one-hot genre and tag
# indicators. concat(axis=1) matches rows by index label; the reindex then pins
# the row order to that of sampled_merged, so the positional lookups below follow
# its order no matter how concat combined the differently ordered indexes.
features = (
    pd.concat([sampled_merged[['movieId', 'title']], genres_one_hot, tags_one_hot], axis=1)
    .reindex(sampled_merged.index)
    .set_index('movieId')
)

# Only the numeric indicator columns take part in the similarity; 'title' is text
numeric_features = features.select_dtypes(include=['number'])
similarities = cosine_similarity(numeric_features)

# movie_index is a position in the row order of sampled_merged (sorted by title in
# the previous cell), e.g. 0 is the first title alphabetically; top_n is how many
# recommendations to show
movie_index = 4
top_n = 5

# Rank every movie by similarity, highest first, then remove the query movie by
# position. Simply slicing off the last argsort entry is not enough: a movie with
# an identical feature vector ties at the top score and may sort after the query,
# which would recommend the query movie itself and drop a genuine match.
ranked_indices = similarities[movie_index].argsort()[::-1]
similar_indices = ranked_indices[ranked_indices != movie_index][:top_n]
similar_movie_ids = numeric_features.iloc[similar_indices].index
similar_movies = features.loc[similar_movie_ids, 'title']

print('Recommendations for', features.iloc[movie_index]['title'])
print(similar_movies)

Recommendations for 3 Days of Normal (2012)
movieId
147047    Caramuru - A Invenção do Brasil (2001)
117744            The Farmer Takes a Wife (1935)
126233                     Seventh Heaven (1993)
121057         The Well-Digger's Daughter (1940)
27805                             Lucky 7 (2003)
Name: title, dtype: object
