In [None]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Data Wrangling as Usual

In [None]:
# Columns whose CSV cells hold JSON-encoded text and are decoded right after loading.
json_cols = ['collection','genres','production_companies','production_countries']

# Movie metadata; release_date is parsed into datetimes while reading.
mv = pd.read_csv('movies_metadata.csv', parse_dates=['release_date'])
mv = mv.assign(**{name: mv[name].apply(json.loads) for name in json_cols})

# Credits; only the cast column is decoded here.
cr = pd.read_csv('credits.csv')
cr['cast'] = cr['cast'].apply(json.loads)

def access_json(data, index):
    """Follow a path of keys/positions into a nested JSON-like object.

    Parameters
    ----------
    data : dict, list or any other object
        Decoded JSON value to start from.
    index : iterable
        Successive keys (for dicts) or positions (for lists) to apply,
        e.g. ``[0, 'name']`` for ``data[0]['name']``.

    Returns
    -------
    object
        The value found at the end of the path (``data`` itself for an empty
        path), or ``numpy.nan`` when the path cannot be followed.
    """
    result = data
    try:
        for idx in index:
            result = result[idx]
    # A tuple is required here: ``except IndexError or KeyError`` evaluates to
    # ``except IndexError`` and lets missing keys escape. TypeError covers
    # nodes that cannot be subscripted this way (None, a number, or a list
    # indexed with a string key).
    except (IndexError, KeyError, TypeError):
        return np.nan
    return result

def _pluck(column, path):
    """Apply access_json with a fixed path to every element of a Series."""
    return column.apply(lambda item: access_json(item, path))


# Flatten the decoded JSON columns into plain scalar columns.
mv['franchise'] = _pluck(mv['collection'], ['name'])
mv['collection_poster'] = _pluck(mv['collection'], ['poster_path'])

# Names of the first two genres of each movie.
genre_cols = ['genre1','genre2']
for i, col in enumerate(genre_cols):
    mv[col] = _pluck(mv['genres'], [i, 'name'])

# Names of the first two production companies of each movie.
prod_comp_cols = ['company1','company2']
for i, col in enumerate(prod_comp_cols):
    mv[col] = _pluck(mv['production_companies'], [i, 'name'])

mv['production_country'] = _pluck(mv['production_countries'], [0, 'name'])

# First entry of the cast list.
cr['actor_lead'] = _pluck(cr['cast'], [0, 'name'])

# Join metadata and credits on the shared movie id.
df_movies = pd.merge(mv, cr, on='id')

# Keep only the columns used downstream. The explicit copy makes the column
# assignments below write to an independent frame instead of a slice of the
# merge result (which raises SettingWithCopyWarning).
df_movies = df_movies[['id','franchise','collection_poster','title','release_date','actor_lead','Director','genre1','genre2','original_language'
                   ,'production_country','company1','company2','runtime','vote_average','vote_count'
                   ,'budget','adjusted_budget','revenue','adjusted_revenue','overview','tagline','poster_path']].copy()

# Express the monetary columns in millions; missing amounts become 0.
for col in ['budget','revenue','adjusted_budget','adjusted_revenue']:
    df_movies[col] = (df_movies[col] / 1000000).fillna(0)

# Missing release dates produce year/month 0.
df_movies['year'] = df_movies['release_date'].dt.year.fillna(0).astype('int')
df_movies['month'] = df_movies['release_date'].dt.month.fillna(0).astype('int')

df_movies.head()

## Prepare Data for Recommender 

In [None]:
# Movies ordered by ascending vote count on a fresh 0..n-1 index;
# missing vote counts are stored as integer 0.
vote_df = (
    df_movies
    .sort_values('vote_count')
    .reset_index(drop=True)
    .assign(vote_count=lambda frame: frame['vote_count'].fillna(0).astype('int'))
)

If you look at the data, there is a large imbalance between `vote_count` and `vote_average`: many movies have a rating of 10 but only a single vote. To filter out such entries, we take the 95th percentile of `vote_count` as a cutoff, so that only movies with at least as many votes as 95% of the movies in our data remain.

In [None]:
# 95th percentile of the vote counts.
np.percentile(vote_df['vote_count'], q=95)

So to be listed in our chart, a movie's `vote_count` has to be at least 430.

In [None]:
# Keep only the movies with at least 430 votes.
enough_votes = vote_df['vote_count'] >= 430
vote_df = vote_df[enough_votes]
vote_df.shape

# Genre Recommender
We'll build a recommender that returns the highest-rated movies in a particular genre, optionally restricted to movies released in or after a given year.

In [None]:
def genre_recommender(gen, year_=1900):
    """Return the movies of one genre from ``vote_df``, best rated first.

    Parameters
    ----------
    gen : str
        Genre name; a movie matches when it equals ``genre1`` or ``genre2``.
    year_ : int, default 1900
        Earliest release year to include.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``vote_df`` sorted by ``vote_average``, descending.
    """
    in_genre = vote_df['genre1'].eq(gen) | vote_df['genre2'].eq(gen)
    recent_enough = vote_df['year'].ge(year_)
    return vote_df[in_genre & recent_enough].sort_values('vote_average', ascending=False)

gen = 'Science Fiction'
# Earliest release year to include. This must be assigned before the call:
# leaving it undefined raises NameError on a fresh kernel. Raise it
# (e.g. to 2012) to narrow the list to recent movies.
year_ = 1900
rec_gen = genre_recommender(gen, year_)
rec_gen[['title','vote_average','year']].head(10)

# Content Recommender
Now, let's make a recommender based on the similarity of each movie's overview and tagline.

In [None]:
# Restrict the content-based recommender to movies whose original language is English.
is_english = df_movies['original_language'] == 'en'
con_rec = df_movies[is_english]
con_rec.shape

In [None]:
# Replace missing text with empty strings. assign() returns a new frame, so the
# columns are not written into a filtered slice of another frame (which raises
# SettingWithCopyWarning).
con_rec = con_rec.assign(
    tagline=con_rec['tagline'].fillna(''),
    overview=con_rec['overview'].fillna(''),
)
# Join with a space so the last word of the tagline and the first word of the
# overview remain separate tokens; strip the stray space left by an empty side.
con_rec['description'] = (con_rec['tagline'] + ' ' + con_rec['overview']).str.strip()

We can use a TF-IDF vectorizer together with cosine similarity to obtain a quantitative measure of how similar two descriptions are.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Unigram + bigram TF-IDF features over the descriptions, English stop words removed.
# min_df=1 keeps every term that occurs in at least one description (i.e. all
# of them); an integer min_df of 0 is rejected by parameter validation in
# current scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(con_rec['description'])

In [None]:
# Dimensions of the fitted TF-IDF matrix: one row per description passed to fit_transform.
tfidf_matrix.shape

In [None]:
# Pairwise dot products between all TF-IDF rows (the second argument defaults
# to the first). TfidfVectorizer L2-normalises rows by default, so these dot
# products are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix)

# Similarities of the first movie to every movie.
cosine_sim[0]

In [None]:
# Renumber rows 0..n-1 so that a row label equals its position in cosine_sim;
# the previous index is kept as a regular column.
con_rec = con_rec.reset_index()
titles = con_rec['title']

# Lookup table: movie title -> row position.
ind = pd.Series(con_rec.index, index=titles)
ind.head()

In [None]:
def content_recommender(title, top_n=30):
    """Return the titles of the movies whose descriptions are most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``ind``. When several movies share the
        title, the first one is used.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles``, ordered from most to least
        similar. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``ind``.
    """
    matches = ind[title]
    # A duplicated title yields a Series of positions; take the first one.
    idx = matches.iloc[0] if isinstance(matches, pd.Series) else matches

    scores = cosine_sim[idx]
    # Stable descending sort: rows with equal scores stay in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried movie by position rather than assuming it is ranked
    # first: with tied scores (duplicate or empty descriptions) another row can
    # come first, and slicing off element 0 would drop that row instead.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]

In [None]:
# First 10 recommendations for 'The Dark Knight'.
content_recommender('The Dark Knight').head(10)

We can see that even though the title doesn't contain the word 'Batman', our recommender succeeds in recommending other Batman movies. Let's try another one.

In [None]:
# First 10 recommendations for 'Justice League: The Flashpoint Paradox'.
content_recommender('Justice League: The Flashpoint Paradox').head(10)