# Content Based Movie Recommendation

In [1]:
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_columns', None)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from scipy.sparse import csr_matrix

### Import Dataset

In [5]:
# Import movie dataset (combined metadata), subset the data to 10k rows for computational conviencies
whole_df = pd.read_csv('/home/Movie_Recommendation/notebooks/combined_metadata_table.csv')

### Pick A Movie (Fake Search Engine)

In [6]:
def identify_movie(your_pick, whole_df):
    return whole_df[whole_df['title'].str.contains(your_pick, flags=re.IGNORECASE, regex=True)]

In [12]:
your_pick = 'Sky High'
identify_movie(your_pick, whole_df)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics,id,overview,popularity,revenue,tagline
19712,tt0398563,Sky High,Sky High,2003,12/5/03,"Fantasy, Mystery, Action",122,Japan,Japanese,Ryûhei Kitamura,"Tsutomu Takahashi, Isao Kiriyama",Toei Company,"Yumiko Shaku, Takao Ohsawa, Shôsuke Tanihara, ...",A detective hunts a killer who is removing gir...,6.1,1361,,26.0,24.0,5846,A serial-murderer is on the loose who just so ...,0.791678,0.0,
21012,tt0405325,Sky High,Sky High,2005,10/21/05,"Adventure, Comedy, Family",100,USA,"English, Cantonese",Mike Mitchell,"Paul Hernandez, Robert Schooley",Walt Disney Pictures,"Michael Angarano, Kurt Russell, Kelly Preston,...",Set in an era where superheroes are commonly k...,6.2,73753,62.0,236.0,135.0,11459,Set in a world where superheroes are commonly ...,8.061423,86369815.0,Saving The World... One Homework Assignment At...


### Subset Dataset

In [13]:
def subset_by_genre(your_pick, whole_df):
    genres = whole_df[whole_df['title'] == your_pick]['genre']
    genres = list(genres)[0].split(', ')
    masks = []
    for genre in genres:
        mask = whole_df['genre'].str.contains(genre)
        masks.append(mask)
    mask = masks[0]
    for i in range(1,len(masks)):
        mask = mask | masks[i]
    return whole_df[mask]

In [14]:
def subset_by_year(your_pick, whole_df, year_range=30):
    year = whole_df[whole_df['title'] == your_pick]['year']
    year = list(year)[0]
    window = [year-year_range, year+year_range]
    mask = (whole_df['year'] >= window[0]) & (whole_df['year'] <= window[1])
    return whole_df[mask]

In [11]:
def subset_by_language(your_pick, whole_df):
    languages = whole_df[whole_df['title'] == your_pick]['language']
    languages = list(languages)[0].split(', ')
    masks = []
    for language in languages:
        mask = whole_df['language'].str.contains(language)
        masks.append(mask)
    mask = masks[0]
    for i in range(1,len(masks)):
        mask = mask | masks[i]
    return whole_df[mask]

In [None]:
source = subset_by_language(your_pick, whole_df)

In [8]:
subset_df = subset_by_genre(your_pick, whole_df)
source = subset_by_year(your_pick, subset_df)
source = subset_by_language(your_pick, source)

In [9]:
print('Subset Size:', source.shape[0])
source.head(3)

Subset Size: 11610


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics,id,overview,popularity,revenue,tagline
3525,tt0045808,Genevieve,Genevieve,1953,8/14/53,Comedy,86,UK,English,Henry Cornelius,William Rose,Sirius Productions,"Dinah Sheridan, John Gregson, Kay Kendall, Ken...",It's time for the annual London to Brighton an...,7.1,2578,,51.0,19.0,43346,Two friends driving in the London to Brighton ...,1.152132,0.0,
3526,tt0046387,The Sword and the Rose,The Sword and the Rose,1953,12/18/53,"Adventure, Drama, Family",92,UK,English,Ken Annakin,"Lawrence Edward Watkin, Charles Major",Walt Disney Productions,"Glynis Johns, Richard Todd, James Robertson Ju...","During the reign of Henry VIII, Mary Tudor see...",6.5,669,,14.0,8.0,64851,Tells the story of Mary Tudor and her troubled...,0.444989,0.0,A Forbidden Love That Threatened Two Kingdoms!
3528,tt0046436,The Titfield Thunderbolt,The Titfield Thunderbolt,1953,3/30/53,Comedy,84,UK,English,Charles Crichton,T.E.B. Clarke,Ealing Studios,"Stanley Holloway, George Relph, Naunton Wayne,...",Volunteers take over their local passenger tra...,7.1,2355,,48.0,16.0,24381,When British Railways announce the closure of ...,2.069436,0.0,


### Data Processing

In [10]:
# Delete spaces in names to make names more unique
# eg. 'firstname lastname' -> 'firstnamelastname'
column_with_names = ['director', 'writer', 'production_company', 'actors']
source = source.copy()
for col in column_with_names:
    source[str(col)] = source[str(col)].str.replace(' ', '', regex=True)
    source[str(col)] = source[str(col)].str.replace(',', ' ', regex=True)

### Compute TFIDF

In [11]:
# Suggestion: remove the spaces for the names, to solving clustering ambiguity
# Issue: how to deal with numbers ???
columns = ['country', 'director','writer', 
           'production_company', 'actors',
           'description','overview', 'tagline']

In [12]:
# Converts the source dataframe into single string for tfidf computation
df = {}
movies = []
titles = []
imdbid = []
for i in range(source.shape[0]):
    row = source.iloc[i]
    row_str = ''
    titles.append(row['title'])
    imdbid.append(row['imdb_title_id'])
    for column in columns:
        row_str += str(row[column])
        row_str += ' '
    movies.append(row_str)

df['IMDBid'] = imdbid
df['Title'] = titles
df['Content'] = movies
df = pd.DataFrame(df)

In [13]:
df.head()

Unnamed: 0,IMDBid,Title,Content
0,tt0045808,Genevieve,UK HenryCornelius WilliamRose SiriusProduction...
1,tt0046387,The Sword and the Rose,UK KenAnnakin LawrenceEdwardWatkin CharlesMajo...
2,tt0046436,The Titfield Thunderbolt,UK CharlesCrichton T.E.B.Clarke EalingStudios ...
3,tt0046307,Rough Shoot,UK RobertParrish GeoffreyHousehold EricAmbler ...
4,tt0046286,Sea Devils,UK RaoulWalsh BordenChase CoronadoProductions ...


In [14]:
# Tranform to tfidf space
V = TfidfVectorizer()
X = V.fit_transform(df['Content'])
print('X shape:', X.shape)

X shape: (11610, 146793)


### Compute Similarity

In [15]:
'''
# Dimensionality reduction
svd = TruncatedSVD(n_components=10)
X = svd.fit_transform(X)
print('X shape after SVD:', X.shape)


# Compute similarity of movie for SVD
index = df[df['Title'] == your_pick].index[0]
d1 = X[index]
mag_d1 = np.linalg.norm(d1)
dist = []
for i in range(X.shape[0]):
    row = X[i]
    dot_product_xy = np.dot(d1, row)
    mag_row = np.linalg.norm(row)
    x_time_y = mag_d1 * mag_row
    dist.append(dot_product_xy/x_time_y) 
dist_series = pd.Series(dist)
dist_series = dist_series.sort_values(ascending=False)
dist_series.iloc[1:6]
dist_series = pd.DataFrame(dist_series)
'''

"\n# Dimensionality reduction\nsvd = TruncatedSVD(n_components=10)\nX = svd.fit_transform(X)\nprint('X shape after SVD:', X.shape)\n\n\n# Compute similarity of movie for SVD\nindex = df[df['Title'] == your_pick].index[0]\nd1 = X[index]\nmag_d1 = np.linalg.norm(d1)\ndist = []\nfor i in range(X.shape[0]):\n    row = X[i]\n    dot_product_xy = np.dot(d1, row)\n    mag_row = np.linalg.norm(row)\n    x_time_y = mag_d1 * mag_row\n    dist.append(dot_product_xy/x_time_y) \ndist_series = pd.Series(dist)\ndist_series = dist_series.sort_values(ascending=False)\ndist_series.iloc[1:6]\ndist_series = pd.DataFrame(dist_series)\n"

In [16]:
# Compute similarity of movie: Melvin and Howard
index = df[df['Title'] == your_pick].index[0]
d1 = list(csr_matrix.toarray(X[index]))
mag_d1 = np.linalg.norm(d1)
dist = []
for i in range(X.shape[0]):
    row = list(csr_matrix.toarray(X[i]))
    dot_product_xy = np.multiply(d1, row).sum(1)
    mag_row = np.linalg.norm(row)
    x_time_y = mag_d1 * mag_row
    dist.append(dot_product_xy/x_time_y) 
dist_series = pd.Series(dist)
dist_series = dist_series.sort_values(ascending=False)
dist_series.iloc[1:6]
dist_series = pd.DataFrame(dist_series)

### Recommend Top 5 Movies

In [17]:
# Merge similarity scores with original dataframe to visualize
result = pd.merge(dist_series, df, how='inner', left_index=True, right_index=True)
result = result.rename({0: 'Cosine Similarity Score'}, axis='columns')

In [18]:
# Show closest results
result.head(6)

Unnamed: 0,Cosine Similarity Score,IMDBid,Title,Content
2965,[1.0000000000000002],tt0085995,National Lampoon's Vacation,USA HaroldRamis JohnHughes JohnHughes WarnerBr...
4023,[0.23926701067907094],tt0097958,National Lampoon's Christmas Vacation,USA JeremiahS.Chechik JohnHughes WarnerBros. C...
5548,[0.1860146536597548],tt0120434,Vegas Vacation,USA StephenKessler ElisaBell BobDucsay WarnerB...
3307,[0.15279257267950555],tt0089670,National Lampoon's European Vacation,USA AmyHeckerling JohnHughes RobertKlane Warne...
4660,[0.14477408723062454],tt0107290,Jurassic Park,USA StevenSpielberg MichaelCrichton MichaelCri...
4180,[0.1159607089009845],tt0099785,Home Alone,USA ChrisColumbus JohnHughes HughesEntertainme...


### Some Ideas for Future

In [19]:
# make a window of year range
# take the difference between year to see the closest
# make it fast: compute the tfidf once per day, put in cache
# Use PCA to speed up cosine similarity computation