In [31]:
import pandas as pd
from google.colab import files
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer  # used to convert a collection of text documents to a vector of term/token counts
from sklearn.metrics.pairwise import cosine_similarity
import re
import sklearn.metrics.pairwise as pw
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [32]:
upload = files.upload()

Saving movies.csv to movies (1).csv
Saving ratings.csv to ratings (1).csv


In [47]:
ratings = pd.read_csv('ratings.csv', sep=",")
movies = pd.read_csv('movies.csv', sep=",")

## Checking for duplicated movieIDs
display(movies['movieId'].value_counts().sort_values(ascending=False).head()) 
# Checking for duplicated titles
display(movies['title'].value_counts().sort_values(ascending=False).head(10) > 1)

# Extracting duplicated movie ids
duplicate_movies = movies.groupby('title').filter(lambda x: len(x) == 2)
duplic_ids = duplicate_movies['movieId'].values
#Duplicated titles
duplicate_movies = duplicate_movies[['movieId','title']] 


1     1
8     1
41    1
18    1
3     1
Name: movieId, dtype: int64

Emma (1996)                                True
Confessions of a Dangerous Mind (2002)     True
Eros (2004)                                True
Saturn 3 (1980)                            True
War of the Worlds (2005)                   True
Tom and Huck (1995)                       False
Richard III (1995)                        False
Four Rooms (1995)                         False
Grumpier Old Men (1995)                   False
Waiting to Exhale (1995)                  False
Name: title, dtype: bool

In [48]:
# Checking the id with most reviews
review_count = pd.DataFrame(ratings[ratings['movieId'].isin(duplic_ids)]['movieId'].value_counts()) # make df has "duplicated" movId and its # of duplications
review_count.reset_index(inplace=True)
review_count.columns = ['movieId','count']
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId')     # has duplicated movies titles, ids, # of duplications
display(duplicated_df)

## Getting duplicates with low review count
duplicated_df.sort_values(by=['title','count'],ascending=[True,False])
duplicated_ids = duplicated_df.drop_duplicates(subset ="title",               # drop duplicated titles, keeping the last duplicate (it has the Lowest count) and get the ids
                     keep = 'last', inplace = False)['movieId']

# Removing duplicated ids with low review count from movie database
movies = movies.loc[~movies['movieId'].isin(duplicated_ids)]
# Removing duplicated ids with low review count from rating database
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]


Unnamed: 0,movieId,title,count
0,838,Emma (1996),30
1,2851,Saturn 3 (1980),4
2,6003,Confessions of a Dangerous Mind (2002),15
3,26958,Emma (1996),1
4,32600,Eros (2004),1
5,34048,War of the Worlds (2005),50
6,64997,War of the Worlds (2005),2
7,144606,Confessions of a Dangerous Mind (2002),1
8,147002,Eros (2004),1
9,168358,Saturn 3 (1980),1


In [49]:
# Creating a column for each genre, val = 1 if this film has this genre and val = 0 otherwise

#creating list with unique genres
genres = list(set('|'.join(list(movies["genres"].unique())).split('|')))
genres.remove('(no genres listed)')

#Creating dummy columns for each genre
for genre in genres:
    movies[genre] = movies['genres'].map(lambda val: 1 if genre in val else 0)

#Droping columns
ratings.drop('timestamp', axis=1,inplace= True) 
movies.drop('genres', axis=1,inplace= True) 
#Drop null values
ratings.dropna(axis=0, inplace=True)
movies.dropna(axis=0, inplace=True)

In [50]:
# Take the first "watched" 200 film by first 200 user
movies = movies.iloc[:200]
ratings = ratings[ratings['userId'] <= 200]

# Joining ratings, movies Data Frames
df = pd.merge(ratings, movies, on='movieId')

# Item-based collaborative recommender
def item_based_recom(input_dataframe,input_film_name):    
    pivot_item_based = pd.pivot_table(input_dataframe,                      # it is a df of films and users' ratings
                                      index='title',                        # the rows/ index of result df are titles, they should be the index as it's item based
                                      columns=['userId'], values='rating')  # each column is a userId, and the val of cell is userId's rating to film title
    
    sparse_pivot = sparse.csr_matrix(pivot_item_based.fillna(0))
    recommender = pw.cosine_similarity(sparse_pivot)               # returns kernel matrix, ndarray of shape (n_samples_X, n_samples_Y)
    recommender_df = pd.DataFrame(recommender, 
                                  columns=pivot_item_based.index,  # it's a df of film titles row/col and val is their similarity
                                  index=pivot_item_based.index)
    
    # Item Rating Based Cosine Similarity
    cosine_df = pd.DataFrame(recommender_df[input_film_name].sort_values(ascending=False))  # df: rows are titles, col is film name, val is similarity
    cosine_df.reset_index(level=0, inplace=True)        # make index/ rows for df starting from 0, columns are titles And similarity (but this col still has its name= film_name)
    cosine_df.columns = ['title','cosine_sim']          # rename cols
    return cosine_df      # a df of all titles and their similarity with film_name

## Item and Genre-based recommender
def item_and_genre_based_recom(cosine_df,movies_df,categories):    
    cos_genre = pd.merge(cosine_df, movies, on='title')         # df has titles, similarity between them and film_name, titles' genres
    # Creating column with genre cosine similarity
    cos_genre['genre_similarity'] = [pairwise_row_diff(cos_genre,0,row,categories)    # get genre similarity between film(row 0) and other films(row)
                                          for row in cos_genre.index.values]
    return cos_genre[['title','cosine_sim','genre_similarity']]

def pairwise_row_diff(dataframe,row1, row2,column_names):
    # Creates 2 Matrices to compare cosine similarity
    matrix_row1 = [[dataframe.loc[row1,cat] for cat in column_names]] # 2d matrix, of all row1 categories values (0/1)
    matrix_row2 = [[dataframe.loc[row2,cat] for cat in column_names]] 
    return round(pw.cosine_similarity(matrix_row1,matrix_row2)[0][0],5) # pw.cosine_similarity(matrix_row1,matrix_row2) -> returns 2d mat of one value (the similarity between mat_row1, mat_row2)

print(df)

      userId  movieId  rating                             title  Romance  \
0          1        1     4.0                  Toy Story (1995)        0   
1          5        1     4.0                  Toy Story (1995)        0   
2          7        1     4.5                  Toy Story (1995)        0   
3         15        1     2.5                  Toy Story (1995)        0   
4         17        1     4.5                  Toy Story (1995)        0   
...      ...      ...     ...                               ...      ...   
2137     191      148     5.0  Awfully Big Adventure, An (1995)        0   
2138     191      155     4.0             Beyond Rangoon (1995)        0   
2139     191      178     1.0       Love & Human Remains (1993)        0   
2140     191      229     5.0       Death and the Maiden (1994)        0   
2141     199      229     3.0       Death and the Maiden (1994)        0   

      Musical  Animation  Action  Adventure  Crime  ...  Mystery  War  \
0           0 

In [55]:
def generate_recomendations(df,film_name,top_results=10,cat=genres):
    print("***********************************************************************************************\n")
    print("Movie Recommender by Aya Ahmed:\n")
    print("Films you might enjoy based that you watched ", film_name)
    ## Item Rating Based Cosine Similarity
    cos_sim = item_based_recom(df,film_name)
    cos_genre_sim = item_and_genre_based_recom(cos_sim,movies,cat)
    cos_genre_sim['sim_mean'] = cos_genre_sim[['cosine_sim', 'genre_similarity']].mean(axis = 1)
    
    display(cos_genre_sim\
            .sort_values('sim_mean',ascending=False)[1:top_results+1])
    return None
    

generate_recomendations(df,"Toy Story (1995)")
generate_recomendations(df,"Waiting to Exhale (1995)")

***********************************************************************************************

Movie Recommender by Aya Ahmed:

Films you might enjoy based that you watched  Toy Story (1995)


Unnamed: 0,title,cosine_sim,genre_similarity,sim_mean
14,Jumanji (1995),0.364634,0.7746,0.569617
35,"Indian in the Cupboard, The (1995)",0.280137,0.7746,0.527369
24,Muppet Treasure Island (1996),0.331861,0.67082,0.50134
16,Dumb & Dumber (Dumb and Dumber) (1994),0.360625,0.63246,0.496543
19,Casper (1995),0.343982,0.63246,0.488221
72,"NeverEnding Story III, The (1994)",0.155499,0.7746,0.465049
4,Happy Gilmore (1996),0.452382,0.44721,0.449796
9,Batman Forever (1995),0.407208,0.44721,0.427209
138,Balto (1995),0.06271,0.7746,0.418655
12,Ace Ventura: When Nature Calls (1995),0.383177,0.44721,0.415194


***********************************************************************************************

Movie Recommender by Aya Ahmed:

Films you might enjoy based that you watched  Waiting to Exhale (1995)


Unnamed: 0,title,cosine_sim,genre_similarity,sim_mean
13,Something to Talk About (1995),0.492366,1.0,0.746183
25,Don Juan DeMarco (1995),0.382276,1.0,0.691138
5,Mad Love (1995),0.5,0.8165,0.65825
12,How to Make an American Quilt (1995),0.494975,0.8165,0.655737
14,"Scarlet Letter, The (1995)",0.447214,0.8165,0.631857
47,Eat Drink Man Woman (Yin shi nan nu) (1994),0.257513,1.0,0.628757
18,Up Close and Personal (1996),0.414644,0.8165,0.615572
61,Beautiful Girls (1996),0.223607,1.0,0.611803
19,"Bridges of Madison County, The (1995)",0.402492,0.8165,0.609496
24,Sense and Sensibility (1995),0.387843,0.8165,0.602171
