In [1]:
# importing required libraries
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, precision_score, recall_score

In [2]:
# load the dataset
movies_df = pd.read_csv('movies.csv', low_memory=False)
tags_df = pd.read_csv('tags.csv', low_memory=False)

In [3]:
# dropping any unused columns
tags_df = tags_df.drop(['userId', 'timestamp'], axis = 1).reset_index(drop=True)

In [4]:
# check for missing values
movies_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [5]:
# check for missing values
tags_df.isnull().sum()

movieId     0
tag        16
dtype: int64

In [6]:
# show result
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [7]:
# show result
tags_df

Unnamed: 0,movieId,tag
0,260,classic
1,260,sci-fi
2,1732,dark comedy
3,1732,great dialogue
4,7569,so bad it's good
...,...,...
1093355,66934,Neil Patrick Harris
1093356,103341,cornetto trilogy
1093357,189169,comedy
1093358,189169,disabled


In [8]:
# removing null values from tag
tags_df['tag'] = tags_df['tag'].apply(lambda x: '' if pd.isnull(x) else x)

# grouping tags with the same movieId
tags_combined_df = tags_df.groupby('movieId')['tag'].agg(lambda x: '|'.join(x)).reset_index()

# show result
tags_combined_df

Unnamed: 0,movieId,tag
0,1,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Robin Williams|time travel|fantasy|based on ch...
2,3,funny|best friend|duringcreditsstinger|fishing...
3,4,based on novel or book|chick flick|divorce|int...
4,5,aging|baby|confidence|contraception|daughter|g...
...,...,...
45246,208813,might like
45247,208933,black and white|deal with the devil
45248,209035,computer animation|Japan|mass behavior|mass sc...
45249,209037,chameleon|computer animation|gluttony|humorous...


In [9]:
# Merge tags with movies
merged = pd.merge(movies_df, tags_combined_df, on='movieId', how='left')
merged

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
...,...,...,...,...
62418,209157,We (2018),Drama,
62419,209159,Window of the Soul (2001),Documentary,
62420,209163,Bad Poems (2018),Comedy|Drama,
62421,209169,A Girl Thing (2001),(no genres listed),


In [10]:
# Fill NaN values in 'tag' column with an empty string
merged['tag'] = merged['tag'].fillna('')

In [11]:
# removing any movies with no tags
merged = merged[merged['tag'] != ""]

In [12]:
# removing any movies with no genres
merged = merged[merged['genres'] != "(no genres listed)"]

In [13]:
# show result
merged[['movieId', 'title', 'genres', 'tag']]

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned|imdb top 250|Pixar|Pixar|time travel|chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny|best friend|duringcreditsstinger|fishing...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...
4,5,Father of the Bride Part II (1995),Comedy,aging|baby|confidence|contraception|daughter|g...
...,...,...,...,...
62340,208800,Lady and the Tramp (2019),Comedy|Romance,might like
62342,208804,Spell (2018),Comedy|Drama|Thriller,Music|Patrick Stump
62346,208813,Noelle (2019),Children,might like
62373,208933,The Devil's Partner (1961),Horror,black and white|deal with the devil


In [19]:
# Randomly sample 1500 rows from data
sampled_merged = merged.sample(n=1500, random_state=1)

# One-hot encode genres
genres_one_hot = sampled_merged['genres'].str.get_dummies()

# One-hot encode tags
tags_one_hot = sampled_merged['tag'].str.get_dummies()

# Convert the one-hot encoded dataframes to sparse matrices
genres_sparse = csr_matrix(genres_one_hot.values)
tags_sparse = csr_matrix(tags_one_hot.values)

# Sum one-hot encoded genres and tags
genres_sum = genres_sparse.sum(axis=0)
tags_sum = tags_sparse.sum(axis=0)

print('Genres:')
print(genres_sum)
print('\nTags:')
print(tags_sum)

Genres:
[[153 113  86  86 447 139 135 661  72   7 169   5  29  84 227 103 229  48
   48]]

Tags:
[[1 1 1 ... 1 1 1]]


In [20]:
# sort the samples in alphabetical order
sampled_merged = sampled_merged.sort_values('movieId')
sampled_merged.head(25)

Unnamed: 0,movieId,title,genres,tag
10,11,"American President, The (1995)",Comedy|Drama|Romance,Romance|white house|new love|usa president|whi...
45,46,How to Make an American Quilt (1995),Drama|Romance,advice|extramarital affair|family holiday|gran...
71,72,Kicking and Screaming (1995),Comedy|Drama,chris eigeman|noah baumbach|quotable dialogue|...
111,113,Before and After (1996),Drama|Mystery,cover up|Massachusetts|murder|mystery|teenager
113,115,Happiness Is in the Field (Bonheur est dans le...,Comedy,best friend|country|identity swap|twins|woman ...
120,122,Boomerang (1992),Comedy|Romance,casanova|chefin|ladies' man|ladykiller|role of...
126,128,Jupiter's Wife (1994),Documentary,Michel Negroponte|DVD-Video
205,207,"Walk in the Clouds, A (1995)",Drama|Romance,own|abandoned woman|grape|harvest|love|pregnan...
216,218,Boys on the Side (1995),Comedy|Drama,aids|car journey|escape|friends|friendship|hom...
218,220,Castle Freak (1995),Horror,cannibalism|castle|death|voyeurism|H.P. Lovecr...


In [21]:
# Combine features (genres and tags) with other movie features
features = pd.concat([sampled_merged[['movieId', 'title']], genres_one_hot, tags_one_hot], axis=1)

# Assuming 'features' is a DataFrame containing movie features
features.set_index('movieId', inplace=True)

# Exclude non-numeric columns ('title' in this case) from similarity calculation
numeric_features = features.select_dtypes(include=['number'])
similarities = cosine_similarity(numeric_features)

# Ask the user to input a movie title
user_input = input("Enter a movie title: ")

# Find the index of the movie in the DataFrame that matches the input title
movie_index = features[features['title'].str.lower() == user_input.lower()].index

# Check if the movie exists in the DataFrame
if not movie_index.empty:
    movie_index = movie_index[0]
    if movie_index < len(similarities):
        similar_indices = similarities[movie_index].argsort()[-6:-1][::-1]  # Exclude the movie itself
        similar_movie_ids = numeric_features.iloc[similar_indices].index
        similar_movies = features.loc[similar_movie_ids, 'title']

        # Print recommendations for the input movie title
        print('Recommendations for', user_input)
        print(similar_movies)
    else:
        print("Movie index out of bounds.")
else:
    print("Movie not found.")

Enter a movie title:  Santa with Muscles (1996)


Recommendations for Santa with Muscles (1996)
movieId
157196    Mobile Suit Gundam 00: Awakening of the Trailb...
195185                                  Project A-Ko (1986)
125960                       The Trip to Squash Land (1967)
181813                   The Story of Anyburg U.S.A. (1957)
175913                       How A Sausage Dog Works (1971)
Name: title, dtype: object


In [None]:
# Split data into training and test sets
train_features, test_features = train_test_split(features, test_size=0.2)

# Exclude non-numeric columns ('title' in this case) from similarity calculation
train_numeric_features = train_features.select_dtypes(include=['number'])
test_numeric_features = test_features.select_dtypes(include=['number'])

# Calculate cosine similarity on the training set
similarities = cosine_similarity(train_numeric_features)

# Function to get movie recommendations for a given movie title
def get_recommendations(user_input):
    # Find the index of the movie in the DataFrame that matches the input title
    movie_index = features[features['title'].str.lower() == user_input.lower()].index

    # Check if the movie exists in the DataFrame
    if not movie_index.empty:
        movie_index = movie_index[0]
        similar_indices = similarities[movie_index].argsort()[-6:-1][::-1]  # Exclude the movie itself
        similar_movie_ids = train_numeric_features.iloc[similar_indices].index
        similar_movies = features.loc[similar_movie_ids, 'title']

        # Return recommendations for the input movie title
        return similar_movies.tolist()
    else:
        return None

# Evaluate the recommender system
predictions = []
actual_movies = []

for index, row in test_features.iterrows():
    user_input = row['title']
    actual_movie = row['title']
    actual_movies.append(actual_movie)
    recommendations = get_recommendations(user_input)
    if recommendations:
        predictions.append(recommendations)
    else:
        predictions.append([])

# Flatten the predictions list
predictions_flat = [item for sublist in predictions for item in sublist]

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_movies, predictions_flat))

# Calculate Precision and Recall
precision = precision_score(actual_movies, predictions_flat, average='micro')
recall = recall_score(actual_movies, predictions_flat, average='micro')

print(f"RMSE: {rmse}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")