In [119]:
# importing required libraries
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [120]:
# load the dataset
movies_df = pd.read_csv('movies.csv')
tags_df = pd.read_csv('tags.csv')

In [121]:
# dropping any unused columns
tags_df = tags_df.drop(['userId', 'timestamp'], axis = 1).reset_index(drop=True)

In [122]:
# check for missing values
movies_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [123]:
# check for missing values
tags_df.isnull().sum()

movieId     0
tag        16
dtype: int64

In [124]:
# show result
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [125]:
# show result
tags_df

Unnamed: 0,movieId,tag
0,260,classic
1,260,sci-fi
2,1732,dark comedy
3,1732,great dialogue
4,7569,so bad it's good
...,...,...
1093355,66934,Neil Patrick Harris
1093356,103341,cornetto trilogy
1093357,189169,comedy
1093358,189169,disabled


In [126]:
# removing null values from tag
tags_df['tag'] = tags_df['tag'].apply(lambda x: '' if pd.isnull(x) else x)

# grouping tags with the same movieId
tags_combined_df = tags_df.groupby('movieId')['tag'].agg(lambda x: '|'.join(x)).reset_index()

# show result
tags_combined_df

Unnamed: 0,movieId,tag
0,1,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Robin Williams|time travel|fantasy|based on ch...
2,3,funny|best friend|duringcreditsstinger|fishing...
3,4,based on novel or book|chick flick|divorce|int...
4,5,aging|baby|confidence|contraception|daughter|g...
...,...,...
45246,208813,might like
45247,208933,black and white|deal with the devil
45248,209035,computer animation|Japan|mass behavior|mass sc...
45249,209037,chameleon|computer animation|gluttony|humorous...


In [127]:
# Merge tags with movies
merged = pd.merge(movies_df, tags_combined_df, on='movieId', how='left')
merged

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
...,...,...,...,...
62418,209157,We (2018),Drama,
62419,209159,Window of the Soul (2001),Documentary,
62420,209163,Bad Poems (2018),Comedy|Drama,
62421,209169,A Girl Thing (2001),(no genres listed),


In [128]:
# Fill NaN values in 'tag' column with an empty string
merged['tag'] = merged['tag'].fillna('')

In [129]:
# removing any movies with no tags
merged = merged[merged['tag'] != ""]

In [130]:
# removing any movies with no genres
merged = merged[merged['genres'] != "(no genres listed)"]

In [131]:
# show result
merged[['movieId', 'title', 'genres', 'tag']]

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
...,...,...,...,...
62340,208800,Lady and the Tramp (2019),Comedy|Romance,might like
62342,208804,Spell (2018),Comedy|Drama|Thriller,Music|Patrick Stump
62346,208813,Noelle (2019),Children,might like
62373,208933,The Devil's Partner (1961),Horror,black and white|deal with the devil


In [139]:
# Randomly sample 1000 rows from data
sampled_merged = merged.sample(n=1000, random_state=10)

# One-hot encode genres
genres_one_hot = sampled_merged['genres'].str.get_dummies()

# One-hot encode tags
tags_one_hot = sampled_merged['tag'].str.get_dummies()

# Convert the one-hot encoded dataframes to sparse matrices
genres_sparse = csr_matrix(genres_one_hot.values)
tags_sparse = csr_matrix(tags_one_hot.values)

# Sum one-hot encoded genres and tags
genres_sum = genres_sparse.sum(axis=0)
tags_sum = tags_sparse.sum(axis=0)

print('Genres:')
print(genres_sum)
print('\nTags:')
print(tags_sum)

Genres:
[[135  65  42  47 302  94  94 478  37   9 100   3  20  63 122  51 143  37
   26]]

Tags:
[[1 1 1 ... 1 1 1]]


In [140]:
# sort the samples in alphabetical order
sampled_merged = sampled_merged.sort_values('movieId')
sampled_merged

Unnamed: 0,movieId,title,genres,tag
8,9,Sudden Death (1995),Action,explosive|hostage|terrorist|vice president|Jea...
38,39,Clueless (1995),Comedy|Romance,teen movie|teen movie|chick flick|quotable|tee...
52,53,Lamerica (1994),Adventure|Drama,independent film|Gianni Amelio|immigrants|in n...
114,116,Anne Frank Remembered (1995),Documentary,auschwitz|burglary|fascism|interview|prejudice...
121,123,Chungking Express (Chung Hing sam lam) (1994),Drama|Mystery|Romance,BRIEF ENCOUNTERS|dreamlike|fanciful|lyrical|me...
...,...,...,...,...
61558,206216,Billion Dollar Bully (2019),Documentary,yelp
61811,206949,Untouchable (2019),Documentary,Metoo
61832,207039,The Calamari Wrestler (2004),Comedy,B-movie|over the top|trash
62105,207888,Black and Blue (2019),Action|Crime|Drama,might like


In [141]:
# Combine features (genres and tags) with other movie features
features = pd.concat([sampled_merged[['movieId', 'title']], genres_one_hot, tags_one_hot], axis=1)

# Assuming 'features' is a DataFrame containing movie features
features.set_index('movieId', inplace=True)

# Exclude non-numeric columns ('title' in this case) from similarity calculation
numeric_features = features.select_dtypes(include=['number'])
similarities = cosine_similarity(numeric_features)

# Ask the user to input a movie title
user_input = input("Enter a movie title: ")

# Find the index of the movie in the DataFrame that matches the input title
movie_index = features[features['title'].str.lower() == user_input.lower()].index

# Check if the movie exists in the DataFrame
if not movie_index.empty:
    movie_index = movie_index[0]
    similar_indices = similarities[movie_index].argsort()[-6:-1][::-1]  # Exclude the movie itself
    similar_movie_ids = numeric_features.iloc[similar_indices].index
    similar_movies = features.loc[similar_movie_ids, 'title']

    # Print recommendations for the input movie title
    print('Recommendations for', user_input)
    print(similar_movies)
else:
    print("Movie not found.")

Enter a movie title:  Clueless (1995)


Recommendations for Clueless (1995)
movieId
58209     Alex in Wonder (Sex and a Girl) (2001)
167540                María (y los demás) (2016)
175097              All the Wrong Reasons (2013)
181031         The Last Days of Chez Nous (1992)
100370          Prinsessa (Starring Maja) (2009)
Name: title, dtype: object
