In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import warnings;
warnings.simplefilter('ignore')

In [None]:
# Basic recommender: rank movies by average user rating, restricted to titles
# with more than 100 votes so a handful of ratings cannot dominate the list.
def _read_csv_with_int_id(path):
    """Load a CSV and return it with a clean integer ``id`` column.

    Rows whose ``id`` cannot be parsed as a number are dropped, the column is
    cast to ``int``, and only the first row per ``id`` is kept. Merging on a
    key that repeats multiplies rows, so de-duplicating here guards the merges
    below against any repeated ids in the files.
    """
    frame = pd.read_csv(path)
    frame['id'] = pd.to_numeric(frame['id'], errors="coerce")
    # .copy() makes the cleaned frame independent before a column is reassigned.
    frame = frame.dropna(subset=['id']).copy()
    frame['id'] = frame['id'].astype('int')
    return frame.drop_duplicates(subset='id')


d_frame = _read_csv_with_int_id('./data/movies_metadata.csv')
keywords = _read_csv_with_int_id('./data/keywords.csv')
production = _read_csv_with_int_id('./data/credits.csv')

# Inner joins: only movies present in all three files survive.
d_frame = pd.merge(d_frame, keywords, on='id')
d_frame = pd.merge(d_frame, production, on='id')
d_frame_byratings = d_frame[d_frame['vote_count'] > 100].sort_values('vote_average', ascending=False)
print("Top 10 of movies with more than 100 votes ranked by average user rating")
d_frame_byratings.head(10)

In [None]:
def director_and_writer(crew):
    """Return the normalised names of directors and screenplay writers.

    Parameters
    ----------
    crew : list of dict
        Crew records; the ones of interest carry ``'job'`` and ``'name'`` keys.
        Anything that is not a list (for example a NaN cell) yields an empty
        list, and records without a usable job or name are skipped.

    Returns
    -------
    list of str
        Names, in input order, of the records whose job is exactly
        ``'Director'`` or ``'Screenplay'``, lower-cased with spaces removed so
        each person becomes a single token.
    """
    if not isinstance(crew, list):
        return []
    wanted_jobs = ('Director', 'Screenplay')
    return [
        rec['name'].lower().replace(" ", "")
        for rec in crew
        if isinstance(rec, dict) and rec.get('job') in wanted_jobs and rec.get('name')
    ]

In [None]:
# Vote-count cut-offs: the 75th percentile feeds base_df, while the 90th and
# 70th percentiles (combined with rating floors) feed base_df_pv.
v_counts = d_frame[d_frame['vote_count'].notnull()]['vote_count'].astype('int')
# NOTE(review): the int cast truncates fractional ratings; v_averages is not used in this cell.
v_averages = d_frame[d_frame['vote_average'].notnull()]['vote_average'].astype('int')
v_75q = v_counts.quantile(0.75)
v_70q = v_counts.quantile(0.70)
count_threshold = v_counts.quantile(0.90)

# Filter the movies by vote count.
has_votes = d_frame['vote_count'].notnull()
movies_v75q = d_frame[has_votes & (d_frame['vote_count'] >= v_75q)]
# Keep a movie if it is very widely rated and at least 6.0, or fairly widely rated and at least 7.0.
widely_rated = (d_frame['vote_count'] >= count_threshold) & (d_frame['vote_average'] >= 6.0)
well_liked = (d_frame['vote_count'] >= v_70q) & (d_frame['vote_average'] >= 7.0)
filtered_movies = d_frame[has_votes & (widely_rated | well_liked)]

# Working frames. Explicit copies so the column assignments below act on
# independent frames rather than on slices of d_frame.
base_df = movies_v75q[['id','title', 'original_title', 'genres', 'release_date', 'vote_average', 'vote_count', 'tagline', 'overview', 'keywords', 'crew']].copy()
base_df_pv = filtered_movies[['id','title', 'genres', 'overview', 'tagline', 'vote_count', 'vote_average', 'keywords', 'crew', 'production_countries', 'production_companies']].copy()
print(base_df_pv.shape)


def _parse_names(column, squash=False):
    """Convert a column of stringified lists of dicts into lists of their 'name' values.

    Missing cells are treated as empty lists, and parsed values that are not
    lists also become empty lists. With ``squash=True`` every name is
    lower-cased and has its spaces removed.
    """
    def names_of(parsed):
        if not isinstance(parsed, list):
            return []
        if squash:
            return [item['name'].lower().replace(" ", "") for item in parsed]
        return [item['name'] for item in parsed]

    return column.fillna('[]').apply(literal_eval).apply(names_of)


# Genres: keep only the names.
base_df['genres'] = _parse_names(base_df['genres'])
base_df_pv['genres'] = _parse_names(base_df_pv['genres'])

# Keywords: keep only the tags; base_df_pv additionally gets each tag stemmed.
w_snow = SnowballStemmer('english')
base_df['keywords'] = _parse_names(base_df['keywords'])
base_df_pv['keywords'] = _parse_names(base_df_pv['keywords'])
base_df_pv['keywords'] = base_df_pv['keywords'].apply(lambda tags: [w_snow.stem(tag) for tag in tags])

# Crew: reduce to what director_and_writer returns for each parsed crew list.
base_df_pv['crew'] = base_df_pv['crew'].fillna('[]').apply(literal_eval).apply(director_and_writer)

# Production countries and companies: names squashed into single tokens.
base_df_pv['production_countries'] = _parse_names(base_df_pv['production_countries'], squash=True)
base_df_pv['production_companies'] = _parse_names(base_df_pv['production_companies'], squash=True)

# Rankings by raw average rating and by raw number of votes.
df_by_va = base_df_pv.sort_values('vote_average', ascending=False)
df_by_nv = base_df_pv.sort_values('vote_count', ascending=False)

# Median rating of the filtered movies.
median_rating = base_df_pv['vote_average'].median()

# Ingredients of the Bayesian average that combines rating and popularity:
# the mean rating, the mean vote count, and their product.
overall_average_rating = base_df_pv['vote_average'].mean()

average_counts = base_df_pv['vote_count'].mean()

global_c = average_counts * overall_average_rating

def bayesian_average(mrow):
    """Blend one movie's rating with the notebook-wide prior.

    ``mrow`` must expose ``'vote_average'`` and ``'vote_count'``. The result is
    ``(vote_average * vote_count + global_c) / (average_counts + vote_count)``,
    where ``global_c`` and ``average_counts`` are the module-level values
    computed earlier in the notebook.
    """
    votes = mrow['vote_count']
    weighted_sum = (mrow['vote_average'] * votes) + global_c
    return weighted_sum / (average_counts + votes)


# Score every row of base_df_pv, then rank from the highest blended rating down.
base_df_pv['bavg_rating'] = base_df_pv.apply(bayesian_average, axis=1)

df_by_bavg = base_df_pv.sort_values(by='bavg_rating', ascending=[False])

In [None]:
# base_df.iloc[0][10]

In [None]:
# Show the first ten rows of df_by_va, the frame sorted by vote_average (highest first).
print("Top 10 movies from all genres by average ratings:")
df_by_va.head(10)

In [None]:
# Show the first ten rows of df_by_nv, the frame sorted by vote_count (most-voted first).
print("Top 10 movies from all genres by number of ratings (popularity)")
df_by_nv.head(10)

In [None]:
# Show the first ten rows of df_by_bavg, the frame sorted by bavg_rating (highest first).
print("Top 10 movies from all genres by the Bayesian average of popularity and ratings ")
df_by_bavg.head(10)

In [None]:
# a method to recommend the top movies from each genre 
def movies_by_genre(genre, query_type):
    match query_type:
        case "popularity":
            result = df_by_nv[(df_by_nv["genres"].notnull()) & (df_by_nv["genres"].apply(lambda x: genre in x))]
            return result
        case "ratings":
            result = df_by_va[(df_by_nv["genres"].notnull()) & (df_by_va["genres"].apply(lambda x: genre in x))]
            return result 
        case "bayesian":
            result = df_by_bavg[(df_by_nv["genres"].notnull()) & (df_by_bavg["genres"].apply(lambda x: genre in x))]
            return result 

# Example: the first ten Romance movies from the 'ratings' ranking.
movies_by_genre('Romance', 'ratings').head(10)

In [None]:
# Content-based recommender over base_df_pv.
# Each movie is described by one text blob built from overview, tagline,
# keywords, genres, crew and production countries. The studio string is
# derived as well but is not part of the blob.
print(base_df_pv.shape)
for text_col in ('overview', 'tagline'):
    base_df_pv[text_col] = base_df_pv[text_col].fillna('')

# Flatten every list-valued column into a space-separated string.
list_to_text = {
    'keywords': 'kw_str',
    'genres': 'genres_str',
    'crew': 'crew_str',
    'production_countries': 'countries_str',
    'production_companies': 'studios_str',
}
for list_col, text_col in list_to_text.items():
    base_df_pv[text_col] = base_df_pv[list_col].apply(' '.join)

# Concatenate the parts left to right, separated by single spaces.
description = base_df_pv['overview']
for part in ('tagline', 'kw_str', 'genres_str', 'crew_str', 'countries_str'):
    description = description + ' ' + base_df_pv[part]
base_df_pv['desc'] = description

# TF-IDF over word 1- to 3-grams with English stop words removed.
tfid_vec = TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english')
df_matrix = tfid_vec.fit_transform(base_df_pv['desc'])

# Pairwise similarity between all movies: dot products of the TF-IDF rows.
cos_score = linear_kernel(df_matrix, df_matrix)

In [None]:
# Shape of the TF-IDF matrix built from base_df_pv['desc'].
df_matrix.shape

In [None]:
# First row of the pairwise score matrix: the first movie scored against every movie.
cos_score[0]

In [None]:
# Spot check: show the processed row(s) for one title.
#base_df_pv[base_df_pv['title'] == 'Toy Story']
base_df_pv[base_df_pv['title'] == 'Your Name.']

In [None]:
# Spot check: show the processed row(s) for one title.
base_df_pv[base_df_pv['title'] == 'Brokeback Mountain']

In [None]:
# Spot check: show the processed row(s) for one title.
base_df_pv[base_df_pv['title'] == 'After the Wedding']

In [None]:
# Renumber the rows 0..n-1 so index labels coincide with row positions in
# cos_score. The previous index is kept as a column.
# NOTE: this reassigns base_df_pv, so each re-run of the cell adds another index column.
base_df_pv = base_df_pv.reset_index()
movie_ids = base_df_pv['title']
# Title -> row position lookup.
indices = pd.Series(base_df_pv.index, index=base_df_pv['title'])
def sim_movies_by_desc(movie_id):
    """Return up to 25 movies most similar to the given one, best match first.

    Parameters
    ----------
    movie_id : str
        Title of the query movie (the lookup key of ``indices``). If several
        rows share the title, the first one is used.

    Returns
    -------
    pandas.DataFrame
        Rows of ``base_df_pv`` ordered by descending score in ``cos_score``.
        The query row itself is always excluded; ties keep their row order.
    """
    idx = indices[movie_id]
    if isinstance(idx, pd.Series):
        # A duplicated title makes the lookup return several positions.
        idx = idx.iloc[0]
    row_scores = np.asarray(cos_score[idx])
    # A stable sort on the negated scores gives descending order with ties in row order.
    ranked = np.argsort(-row_scores, kind='stable')
    # Drop the query movie by position rather than assuming it sorted first.
    top_match = ranked[ranked != idx][:25]
    return base_df_pv.iloc[top_match]

In [None]:
# Example: the description-based matches for one title.
sim_movies_by_desc('Castle in the Sky').head(25)

In [None]:
# Custom similarity score on a 0 - 9.99 scale for the candidate movies.
def calculate_score(target_movie, src_kw, src_gr, src_st):
    """Score how much of the source movie's metadata a candidate shares.

    Keywords, genres and production companies each contribute up to 3.33
    points: the fraction of the source's distinct items that also appear in
    the candidate, times 3.33.

    Parameters
    ----------
    target_movie : mapping
        Candidate movie exposing ``'keywords'``, ``'genres'`` and
        ``'production_companies'`` collections.
    src_kw, src_gr, src_st : list
        The source movie's keywords, genres and production companies. Each may
        be a flat list or a list of lists (as produced by ``Series.to_list()``
        on a list-valued column); nested lists are flattened one level.

    Returns
    -------
    float
        Sum of the three components. An empty source list contributes 0
        instead of raising ``ZeroDivisionError``.
    """
    def overlap_share(source_values, target_values):
        # Collect the distinct source items, flattening one level of nesting.
        source_items = set()
        for value in source_values:
            if isinstance(value, (list, tuple, set)):
                source_items.update(value)
            else:
                source_items.add(value)
        if not source_items:
            return 0.0
        return len(source_items.intersection(target_values)) / len(source_items)

    kw_sim = overlap_share(src_kw, target_movie['keywords']) * 3.33
    gr_sim = overlap_share(src_gr, target_movie['genres']) * 3.33
    st_sim = overlap_share(src_st, target_movie['production_companies']) * 3.33
    overall_sim = kw_sim + gr_sim + st_sim
    return overall_sim

In [None]:
def find_matches(movie_title):
    """Return the 20 best-scoring movies among the description-based matches.

    The first 25 rows from ``sim_movies_by_desc(movie_title)`` are re-scored
    with ``calculate_score`` against the source movie's keywords, genres and
    production companies, then sorted by that score in descending order.

    Parameters
    ----------
    movie_title : str
        Title of the source movie as it appears in ``base_df_pv['title']``.

    Returns
    -------
    pandas.DataFrame
        At most 20 candidate rows with an added ``'s_score'`` column.
    """
    # Copy so the new column is written to an independent frame, not a slice.
    target_movies = sim_movies_by_desc(movie_title).head(25).copy()
    # Source attributes; to_list() yields one inner list per row matching the title.
    src = base_df_pv[base_df_pv['title'] == movie_title]
    source_lists = (
        src['keywords'].to_list(),
        src['genres'].to_list(),
        src['production_companies'].to_list(),
    )
    # Similarity score for each candidate row.
    target_movies['s_score'] = target_movies.apply(calculate_score, axis=1, args=source_lists)
    sorted_list = target_movies.sort_values(by=['s_score'], ascending=False)
    return sorted_list.head(20)

In [None]:
# Example: re-scored matches for one title.
find_matches('A Brighter Summer Day')