#Content Based Recommender

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Import Datasets

imdb=pd.read_csv('imdb_data.csv')
movies=pd.read_csv('movies.csv')
train=pd.read_csv('train.csv')

In [11]:
# Generate a dataframe df by mergin the train, movies and imdb datsets on movieId

df=pd.merge(movies,train,how='outer',on='movieId') # movies and train outer join
df=pd.merge(df,imdb,how='left',on='movieId') # movies and train left join

In [12]:
# Drop columns not needed

column_list=['timestamp','runtime','budget','plot_keywords']
df.drop(column_list,axis=1,inplace=True)

In [13]:
# Check df shape

row,column=df.shape
print('The dataframe has {} rows and {} columns.'.format(row, column))

The dataframe has 10014248 rows and 7 columns.


In [14]:
# Check df dataframe

df.head(5)

Unnamed: 0,movieId,title,genres,userId,rating,title_cast,director
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,158849.0,5.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,97203.0,5.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,161871.0,3.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,45117.0,4.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27431.0,5.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter


In [15]:
# Function takes in aan input and removes pipe

def remove_pipe(text):
  text=str (text)
  text=text.replace("|",",")
  return text

In [16]:
# Apply remove_pipe to columns title_cast and genres

df['genres']=df['genres'].fillna('') #fill missing values with blanks
df['title_cast']=df['title_cast'].fillna('') #fill missing values with blanks

df['title_cast']=df['title_cast'].apply(remove_pipe) #apply remove_pipe sunction
df['genres']=df['genres'].apply(remove_pipe)
df.head(2)

Unnamed: 0,movieId,title,genres,userId,rating,title_cast,director
0,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",158849.0,5.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter
1,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",97203.0,5.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter


Taking the df dataframe. A filtered dataframe with unique movie titles can be generated to remove duplicate movie titles due to multiple user ratings


In [17]:
# A content column can be generated from the title_cast and genres columns
# This column will be used to build the content recommender

df['content']=df['title_cast']+' '+ df['genres']
df['content'][0]

'Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wallace Shawn,John Ratzenberger,Annie Potts,John Morris,Erik von Detten,Laurie Metcalf,R. Lee Ermey,Sarah Freeman,Penn Jillette,Jack Angel,Spencer Aste Adventure,Animation,Children,Comedy,Fantasy'

In [18]:
# A ratings dataframe can be generated from uniw\que movie titles and their average ratings across all users

ratings=df.groupby(['title'])['rating'].mean() #group by title and aggregate ratings
ratings=pd.DataFrame(ratings)
ratings=ratings.fillna(2) # fill missing values with generic score of 2 (40%)
ratings['num of ratings']=df.groupby(['title'])['rating'].count() #generate a column witht the total number of ratings
ratings.head(5)

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"""BLOW THE NIGHT!"" Let's Spend the Night Together (1983)",3.0,1
"""Great Performances"" Cats (1998)",2.753731,67
#1 Cheerleader Camp (2010),1.9,5
#Captured (2017),2.0,0
#Female Pleasure (2018),4.0,1


In [19]:
content=df.groupby(['title'])['content'].unique() #group by title and display unique values in a list
content=pd.DataFrame(content)
content.head(5)

Unnamed: 0_level_0,content
title,Unnamed: 1_level_1
"""BLOW THE NIGHT!"" Let's Spend the Night Together (1983)","[ Documentary,Drama]"
"""Great Performances"" Cats (1998)","[Elaine Paige,John Mills,Ken Page,Rosemarie Fo..."
#1 Cheerleader Camp (2010),"[ Comedy,Drama]"
#Captured (2017),[ Horror]
#Female Pleasure (2018),[ Documentary]


In [20]:
# Generate single dataframe

movie_df=pd.merge(ratings,content,on='title') #join on index
row,column=movie_df.shape
print('The dataframe has {} rows (representing unique movie titles) and {} columns.'.format(row, column))
movie_df.head(5)

The dataframe has 62325 rows (representing unique movie titles) and 3 columns.


Unnamed: 0_level_0,rating,num of ratings,content
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"""BLOW THE NIGHT!"" Let's Spend the Night Together (1983)",3.0,1,"[ Documentary,Drama]"
"""Great Performances"" Cats (1998)",2.753731,67,"[Elaine Paige,John Mills,Ken Page,Rosemarie Fo..."
#1 Cheerleader Camp (2010),1.9,5,"[ Comedy,Drama]"
#Captured (2017),2.0,0,[ Horror]
#Female Pleasure (2018),4.0,1,[ Documentary]


In [21]:
# Sort move_df from highest to lowest num of ratings

movie_df=movie_df.sort_values('num of ratings',ascending=False)
movie_df.head(5)

Unnamed: 0_level_0,rating,num of ratings,content
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Shawshank Redemption, The (1994)",4.417624,32831,"[Tim Robbins,Morgan Freeman,Bob Gunton,William..."
Forrest Gump (1994),4.047216,32383,"[Tom Hanks,Rebecca Williams,Sally Field,Michae..."
Pulp Fiction (1994),4.195097,31697,"[Tim Roth,Amanda Plummer,Laura Lovelace,John T..."
"Silence of the Lambs, The (1991)",4.144172,29444,"[Jodie Foster,Lawrence A. Bonney,Kasi Lemmons,..."
"Matrix, The (1999)",4.154253,29014,"[Keanu Reeves,Laurence Fishburne,Carrie-Anne M..."


In [22]:
# Function will remove items from lists into strings

def string(input):
  string=' '
  for i in input:
    string=string+str (i)
  return string

In [23]:
# Generate strings from lists

movie_df['content']=movie_df['content'].apply(string) #generate strings
movie_df['content'][1]

' Tom Hanks,Rebecca Williams,Sally Field,Michael Conner Humphreys,Harold G. Herthum,George Kelly,Bob Penny,John Randall,Sam Anderson,Margo Moorer,Ione M. Telech,Christine Seabrook,John Worsham,Peter Dobson,Siobhan Fallon Hogan Comedy,Drama,Romance,War'

The movie_df dataframe is the base from which the recommender is built. The content column will be the feature used, which will be vectorized and converted to a similarity matrix mxm


In [24]:
# A subset of the movies will be used. The greater the subset, the more memory is committed
# The cosine similiarity size is proportional to feature size
#First 200000 movies are selected

feature=movie_df['content'][0:20000]

In [25]:
# Set up vectorizer

vec=TfidfVectorizer(stop_words='english') # term frequency inverse document frequency

In [26]:
# Vectorize combined column

feature_vec=vec.fit_transform(feature)
shape,column=feature_vec.shape
print('feature_vec is a sparse matrix with {} rows (representing unique movie titles) and {} columns.'.format(row, column))

feature_vec is a sparse matrix with 62325 rows (representing unique movie titles) and 51902 columns.


The cosine similarity will be used to determine how similar two entities are based off the angle between their two vectors formed. The closer a value is to 1, to higher the similarity

The cosine similarity is formed from the dot product of the feature_vec matrix and its transpose divided by the product of their  magniudes.

In [27]:
# Use cosine similarity to get similarity matrix

sim_matrix =cosine_similarity(feature_vec,feature_vec)
row,column=sim_matrix.shape
print('The sim_mat matrix has {} rows and {} columns.'.format(row, column))

The sim_mat matrix has 20000 rows and 20000 columns.


In [28]:
del imdb
del movies

In [29]:
# Double check
# Reset the move_df index as it will be used to generate a Series that will map sim_mat

movie_df=movie_df.reset_index()
movie_df.head(5)

Unnamed: 0,title,rating,num of ratings,content
0,"Shawshank Redemption, The (1994)",4.417624,32831,"Tim Robbins,Morgan Freeman,Bob Gunton,William..."
1,Forrest Gump (1994),4.047216,32383,"Tom Hanks,Rebecca Williams,Sally Field,Michae..."
2,Pulp Fiction (1994),4.195097,31697,"Tim Roth,Amanda Plummer,Laura Lovelace,John T..."
3,"Silence of the Lambs, The (1991)",4.144172,29444,"Jodie Foster,Lawrence A. Bonney,Kasi Lemmons,..."
4,"Matrix, The (1999)",4.154253,29014,"Keanu Reeves,Laurence Fishburne,Carrie-Anne M..."


In [30]:
# Index mapping

index_map = pd.Series(movie_df.index,index = movie_df['title']) # contains a series of the movie titles in movie_df
index_map

title
Shawshank Redemption, The (1994)        0
Forrest Gump (1994)                     1
Pulp Fiction (1994)                     2
Silence of the Lambs, The (1991)        3
Matrix, The (1999)                      4
                                    ...  
Montana Trap (1976)                 62320
The Karate Dog (2004)               62321
Monte Cristo (1922)                 62322
The Karen Carpenter Story (1989)    62323
The Head Of The Family (1967)       62324
Length: 62325, dtype: int64

In [31]:
# Function takes in a movie title as an input and returns 10 movie title recommendations

def recommendation(title):

  index = index_map[title] # returns the index of the argument title in index_map

  similarity = list(enumerate(sim_matrix[index])) # returns a list of with index and similarity

  similarity = sorted(similarity, key=lambda x: x[1], reverse=True) # sorts list in descending order

  similarity = similarity[1:11] # top 10 recommendations

  #return movie names using the mapping series

  movie_indices = [i[0] for i in similarity] # obtain indices for recommended movies

  return (movie_df['title'].iloc[movie_indices]) # filter movie_df for titles

Using the animated movie Aladin from 1992. The recommender returns viewing options consisting of animated movies such as  Moana, Minions and Dragons: Dawn Of The Dragon Racers.

In [32]:
# Lets search for recommendations based off Aladdin

recommendations=recommendation('Aladdin (1992)')
recommend_list=list(recommendations)
recommend_list

['Moana (2016)',
 'The Good Dinosaur (2015)',
 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)',
 'Missing Link (2019)',
 "Olaf's Frozen Adventure (2017)",
 'Wonder Park (2019)',
 'Dragons: Dawn Of The Dragon Racers (2014)',
 'UglyDolls (2019)',
 'American Tail, An (1986)',
 'Minions (2015)']

Retrying the recommender with a different genre of movie, this time with the 1995 action classic Bad Boys.This returns the sequel Bad Boys II, Another 48 Hrs and Beverly Hills Cop II which are cop action movies. In addition, the stand up special Martin Lawrence: You So Crazy is recommended since Martin Lawrence starred in Bad Boys

In [33]:
recommendations=recommendation('Bad Boys (1995)')
recommend_list=list(recommendations)
recommend_list

['Spawn (1997)',
 'Martin Lawrence: You So Crazy (1994)',
 'Bad Boys II (2003)',
 'Another 48 Hrs. (1990)',
 'Midnight Run (1988)',
 'Beverly Hills Cop II (1987)',
 'Kindergarten Cop (1990)',
 'Tango & Cash (1989)',
 "Ocean's 8 (2018)",
 'Action Jackson (1988)']

In addtion. We could try predict what a user would rate the movie watched based off the movies recommended and their average ratings given by other user/viewers.


In [34]:
# Set the title column back to the index for movie_df
ave_rating_df=movie_df.copy()
ave_rating_df.set_index('title',inplace=True)
ave_rating_df.head(5)

Unnamed: 0_level_0,rating,num of ratings,content
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Shawshank Redemption, The (1994)",4.417624,32831,"Tim Robbins,Morgan Freeman,Bob Gunton,William..."
Forrest Gump (1994),4.047216,32383,"Tom Hanks,Rebecca Williams,Sally Field,Michae..."
Pulp Fiction (1994),4.195097,31697,"Tim Roth,Amanda Plummer,Laura Lovelace,John T..."
"Silence of the Lambs, The (1991)",4.144172,29444,"Jodie Foster,Lawrence A. Bonney,Kasi Lemmons,..."
"Matrix, The (1999)",4.154253,29014,"Keanu Reeves,Laurence Fishburne,Carrie-Anne M..."


In [35]:
# Function will take in a list of movies, and return a combined average for all movies

def ave_rating(input):
  
  score=[ave_rating_df.loc[i][0] for i in input]

  return round([sum(score)/len(score)  for i in score][0],1)

In the case for the movie Bad Boys. It can be predicted that the user would give a rating close to the outpuit of the ave_rating function

In [37]:
ave_rating(recommend_list)

2.9