In [1]:
# importing required libraries
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

from itertools import combinations

In [2]:
# load the dataset
movies_df = pd.read_csv('movies.csv', nrows=62424)
tags_df = pd.read_csv('tags.csv', nrows=62424)

In [3]:
# dropping any unused columns
tags_df = tags_df.drop(['userId', 'timestamp'], axis = 1).reset_index(drop=True)

In [4]:
# check for missing values
movies_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [5]:
# check for missing values
tags_df.isnull().sum()

movieId    0
tag        0
dtype: int64

In [6]:
# show result
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [7]:
# show result
tags_df

Unnamed: 0,movieId,tag
0,260,classic
1,260,sci-fi
2,1732,dark comedy
3,1732,great dialogue
4,7569,so bad it's good
...,...,...
62419,3289,money
62420,3289,mountain
62421,3289,promise
62422,3289,teacher


In [8]:
# removing null values from tag
tags_df['tag'] = tags_df['tag'].apply(lambda x: '' if pd.isnull(x) else x)

# grouping tags with the same movieId
tags_combined_df = tags_df.groupby('movieId')['tag'].agg(lambda x: '|'.join(x)).reset_index()

# show result
tags_combined_df

Unnamed: 0,movieId,tag
0,1,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Robin Williams|time travel|fantasy|based on ch...
2,3,funny|best friend|duringcreditsstinger|fishing...
3,4,based on novel or book|chick flick|divorce|int...
4,5,aging|baby|confidence|contraception|daughter|g...
...,...,...
6995,204878,funny|Wesley Snipes
6996,205383,breaking bad
6997,206024,MMsWL
6998,206399,MMsWL


In [9]:
# Merge tags with movies
merged = pd.merge(movies_df, tags_combined_df, on='movieId', how='left')
merged

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
...,...,...,...,...
62418,209157,We (2018),Drama,
62419,209159,Window of the Soul (2001),Documentary,
62420,209163,Bad Poems (2018),Comedy|Drama,
62421,209169,A Girl Thing (2001),(no genres listed),


In [10]:
# Fill NaN values in 'tag' column with an empty string
merged['tag'] = merged['tag'].fillna('')

In [11]:
# removing any movies with no tags
merged = merged[merged['tag'] != ""]

In [12]:
# removing any movies with no genres
merged = merged[merged['genres'] != "(no genres listed)"]

In [13]:
# show result
merged[['movieId', 'title', 'genres', 'tag']]

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
...,...,...,...,...
61073,204878,Dolemite Is My Name (2019),Comedy|Drama,funny|Wesley Snipes
61253,205383,El Camino: A Breaking Bad Movie (2019),Crime|Drama|Thriller,breaking bad
61482,206024,The Swallows of Kabul (2019),Animation,MMsWL
61625,206399,For Sama (2019),Documentary,MMsWL


In [14]:
# One-hot encode genres
genres_one_hot = merged['genres'].str.get_dummies()

# One-hot encode tags
tags_one_hot = merged['tag'].str.get_dummies()

# Convert the one-hot encoded dataframes to sparse matrices
genres_sparse = csr_matrix(genres_one_hot.values)
tags_sparse = csr_matrix(tags_one_hot.values)

# Sum one-hot encoded genres and tags
genres_sum = genres_sparse.sum(axis=0)
tags_sum = tags_sparse.sum(axis=0)

print('Genres:')
print(genres_sum)
print('\nTags:')
print(tags_sum)

Genres:
[[1188  842  430  457 2313  846  263 3364  549   89  750  115  223  463
  1148  720 1431  272  114]]

Tags:
[[1 2 1 ... 1 1 1]]


In [15]:
# sort the samples in alphabetical order
merged = merged.sort_values('movieId')
merged.head(60)

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
5,6,Heat (1995),Action|Crime|Thriller,imdb top 250|great acting|realistic action|sus...
6,7,Sabrina (1995),Comedy|Romance,remake|chauffeur|fusion|long island|millionair...
8,9,Sudden Death (1995),Action,explosive|hostage|terrorist|vice president
9,10,GoldenEye (1995),Action|Adventure|Thriller,007|Bond|boys with toys|gadgets|secret service...
10,11,"American President, The (1995)",Comedy|Drama|Romance,Romance|white house|new love|usa president|whi...


In [16]:
# Combine features (genres and tags) with other movie features
features = pd.concat([merged[['movieId', 'title']], genres_one_hot, tags_one_hot], axis=1)

# Assuming 'features' is a DataFrame containing movie features
features.set_index('movieId', inplace=True)

# Exclude non-numeric columns ('title' in this case) from similarity calculation
numeric_features = features.select_dtypes(include=['number'])
similarities = cosine_similarity(numeric_features)

# Ask the user to input a movie title
user_input = input("Enter a movie title: ")

# Find the index of the movie in the DataFrame that matches the input title
movie_index = features[features['title'].str.lower() == user_input.lower()].index

# Check if the movie exists in the DataFrame
if not movie_index.empty:
    movie_index = movie_index[0]
    if movie_index < len(similarities):
        similar_indices = similarities[movie_index].argsort()[-6:-1][::-1]  # Exclude the movie itself
        similar_movie_ids = numeric_features.iloc[similar_indices].index
        similar_movies = features.loc[similar_movie_ids, 'title']

        # Print recommendations for the input movie title
        print('Recommendations for', user_input)
        print(similar_movies)
    else:
        print("Movie index out of bounds.")
else:
    print("Movie not found.")

Enter a movie title:  Big Green, The (1995)


Recommendations for Big Green, The (1995)
movieId
5394                      Don't Bother to Knock (1952)
32025                             Walk on Water (2004)
1165                          Bloody Child, The (1996)
2621      Xiu Xiu: The Sent-Down Girl (Tian yu) (1998)
102684                        Only God Forgives (2013)
Name: title, dtype: object


In [None]:
# Calculate content-based similarity using genres
genres_matrix = merged['genres'].str.get_dummies(sep='|')
genres_similarity = cosine_similarity(genres_matrix)

# Calculate content-based similarity using tags
tags_matrix = merged['tag'].str.get_dummies(sep='|')
tags_similarity = cosine_similarity(tags_matrix)

# Create a mapping between movie titles and indices
title_to_index = {title: idx for idx, title in enumerate(merged['title'])}

# Calculate diversity
pairwise_combinations = list(combinations(merged['title'], 2))
diversity_scores = []
for movie1, movie2 in pairwise_combinations:
    index1 = title_to_index[movie1]
    index2 = title_to_index[movie2]
    diversity_score = 1 - genres_similarity[index1][index2]
    diversity_scores.append(diversity_score)
diversity = sum(diversity_scores) / len(diversity_scores)

# Calculate feature coverage
total_features = set(merged['genres'].str.split('|').sum() + merged['tag'].str.split('|').sum())
recommended_features = set(merged.loc[1:3, 'genres'].str.split('|').sum() + merged.loc[1:3, 'tag'].str.split('|').sum())
feature_coverage = len(recommended_features) / len(total_features)

print(f'Content-Based Similarity (Genres): \n{genres_similarity}\n')
print(f'Content-Based Similarity (Tags): \n{tags_similarity}\n')
print(f'Feature Coverage: {feature_coverage}\n')
print(f'Diversity: {diversity}\n')