Author: Nicolas Saenz
Class: DSC 630
Week 9 Assignment

https://www.geeksforgeeks.org/machine-learning/build-a-recommendation-engine-with-collaborative-filtering/

Link to the source used to make the recommender

In [294]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from functools import reduce

In [296]:
movies = pd.read_csv('movies.csv')
links = pd.read_csv('links.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

In [298]:

# Get dataset info
tags.info()
movies.info()
ratings.info()
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null 

In [300]:
# Attempt to drop duplicate titles
new_movies = movies.drop_duplicates('title')

In [302]:
movies.info()
new_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 9737 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9737 non-null   int64 
 1   title    9737 non-null   object
 2   genres   9737 non-null   object
dtypes: int64(1), object(2)
memory usage: 304.3+ KB


No change in structure means there are no duplicates. So we can procede with the original unaltered dataframe.

In [305]:
print(movies)


# Drop non-relevant columns
#movies.drop(['genres'], axis = 1, inplace = True)

print(movies)

      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                  

In [307]:
#Remove the year of creation from every title
movies['title'] = movies['title'].str[:-7]
print(movies.head())

   movieId                        title  \
0        1                    Toy Story   
1        2                      Jumanji   
2        3             Grumpier Old Men   
3        4            Waiting to Exhale   
4        5  Father of the Bride Part II   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [317]:
# Merge ratings and movie df
ratings_with_title = pd.merge(ratings, movies, on='movieId', how='left')


In [319]:
print(ratings_with_title.head())

   userId  movieId  rating  timestamp                 title  \
0       1        1     4.0  964982703             Toy Story   
1       1        3     4.0  964981247      Grumpier Old Men   
2       1        6     4.0  964982224                  Heat   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en)   
4       1       50     5.0  964982931   Usual Suspects, The   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [321]:
# Filter down 'users_ratings_matrix' on the basis of users who gave many movie ratings
x = ratings_with_title.groupby('userId').count()['rating'] > 150
knowledgeable_users = x[x].index
filtered_users_ratings = ratings_with_title[ratings_with_title['userId'].isin(knowledgeable_users)]

# Filter down 'users_ratings_matrix' on the basis of movies with most ratings with a rating of 3.5 or higher
y = filtered_users_ratings.groupby('movieId').count()['rating'] >= 3.0
famous_movies = y[y].index
final_users_ratings = filtered_users_ratings[filtered_users_ratings['movieId'].isin(famous_movies)]

In [323]:
# Pivot table creation
pivot_table = final_users_ratings.pivot_table(index = 'title', columns = 'movieId', values = 'rating')

# Filling the NA values with '0'
pivot_table.fillna(0, inplace = True)
pivot_table.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,179819,180031,180985,182715,182823,183897,184471,187593,187595,188301
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*batteries not included,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And Justice for All,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [324]:
# Standardize the pivot table
scaler = StandardScaler(with_mean=True, with_std=True)
pivot_table_normalized = scaler.fit_transform(pivot_table)

In [325]:
# Calculate the similarity matrix for all the movie
similarity_score = cosine_similarity(pivot_table_normalized)

In [327]:
def recommend(title):
    
    # Returns the numerical index for the movie title
    index = np.where(pivot_table.index==title)[0][0]
    
    # Sorts the similarities for the movie_name in descending order
    similar_movies = sorted(list(enumerate(similarity_score[index])),key=lambda x:x[1], reverse=True)[1:6]
    
    # To return result in list format
    data = []
    
    for index,similarity in similar_movies:
        item = []
        # Get the movie details by index
        temp_df = movies[movies['title'] == pivot_table.index[index]]
        
        # Only add the title, author, and image-url to the result
        item.extend(temp_df['title'].values)
        #item.extend(temp_df['Movie-Author'].values)
        #item.extend(temp_df['Image-URL-M'].values)
        
        data.append(item)
    return data


In [329]:
# Call the recommend method
recommend('Shawshank Redemption, The')

[['Stargate'],
 ['Only You'],
 ['Quick and the Dead, The'],
 ['Ready to Wear (Pret-A-Porter)'],
 ['Santa Clause, The']]