In [1]:
!wget https://github.com/ChanCheeKean/datasets/blob/main/compressed/movie_dataset.zip?raw=true
!unzip movie_dataset.zip?raw=true

--2022-12-16 18:40:54--  https://github.com/ChanCheeKean/datasets/blob/main/compressed/hourly_electricity.zip?raw=true
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/ChanCheeKean/datasets/raw/main/compressed/hourly_electricity.zip [following]
--2022-12-16 18:40:54--  https://github.com/ChanCheeKean/datasets/raw/main/compressed/hourly_electricity.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ChanCheeKean/datasets/main/compressed/hourly_electricity.zip [following]
--2022-12-16 18:40:54--  https://raw.githubusercontent.com/ChanCheeKean/datasets/main/compressed/hourly_electricity.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.co

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity

# Data Loading

In [None]:
metadata = pd.read_csv('./movies_metadata.csv', low_memory=True)
metadata = metadata.copy().loc[metadata['vote_count'] >= metadata['vote_count'].quantile(0.50)]
metadata.head(2)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [None]:
# weighted rating: average rating and the number of votes it has accumulated
def weighted_rating(x, m, C):
    v = x['vote_count']
    R = x['vote_average']
    
    # Calculation based on the IMDB formula
    return (v / (v + m) * R) + (m / (m + v) * C)

C = metadata['vote_average'].mean()
m = metadata['vote_count'].quantile(0.90)
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]

q_movies['score'] = q_movies.apply(lambda x: weighted_rating(x, m=m, C=C), axis=1)
q_movies = q_movies.sort_values('score', ascending=False)
indices = {v: k for k, v in q_movies['title'].to_dict().items()}
q_movies[['title', 'vote_count', 'vote_average', 'score']].head()

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.386181
834,The Godfather,6024.0,8.5,8.34502
12481,The Dark Knight,12269.0,8.3,8.228007
2843,Fight Club,9678.0,8.3,8.209552
292,Pulp Fiction,8670.0,8.3,8.199534


# Content-Based Recommender Through Overview

*   Compute TF-iDF of overview in each documents
*   Calculate Pairwise Similarity of each document
*   Return the Top 10 highest similarity



In [None]:
# smaller sample
print(metadata.shape)
metadata['overview'].head()

(22931, 24)


0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [None]:
tfidf = TfidfVectorizer(stop_words='english')
metadata['overview'] = metadata['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(metadata['overview'])
print(tfidf_matrix.shape)

# pairwise cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)

(22931, 49603)
(22931, 22931)


In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim):
    idx = indices[title]

    # pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return metadata['title'].iloc[movie_indices]

get_recommendations('The Dark Knight Rises', cosine_sim)

34757             Gutterballs
22658                   Enemy
494             Mr. Wonderful
8752        Trilogy of Terror
12602               Meet Bill
36175                  Le Mac
11561                  Norbit
3409     Smoking / No Smoking
18624             Miss Nobody
6229              The In-Laws
Name: title, dtype: object

# Content-Based Recommender Through Metadata

*   Combine metadata ['title', 'cast', 'director', 'keywords'] into one sentences
*   Compute Keyword Count of combined sentence in each documents
*   Calculate Pairwise Similarity of each document
*   Return the Top 10 highest similarity

In [None]:
credits = pd.read_csv('./credits.csv')
keywords = pd.read_csv('./keywords.csv')

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [None]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        return names[:3]
    return []

def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

# Parse the stringified features into their corresponding python objects
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)
    
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

metadata['soup'] = metadata.apply(create_soup, axis=1)
metadata[['title', 'cast', 'director', 'keywords', 'genres', 'soup']].head(3)

Unnamed: 0,title,cast,director,keywords,genres,soup
0,Toy Story,"[tomhanks, timallen, donrickles]",johnlasseter,"[jealousy, toy, boy]","[animation, comedy, family]",jealousy toy boy tomhanks timallen donrickles ...
1,Jumanji,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]",boardgame disappearance basedonchildren'sbook ...
2,Grumpier Old Men,"[waltermatthau, jacklemmon, ann-margret]",howarddeutch,"[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]",fishing bestfriend duringcreditsstinger walter...


In [None]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])
get_recommendations('The Dark Knight Rises', cosine_sim2)

9750          The Dark Knight
7907            Batman Begins
7269                   Shiner
6136                 Mitchell
426         Romeo Is Bleeding
8861             The Prestige
15607               Quicksand
21542                    Sara
8382           Helter Skelter
22959    Payback: Straight Up
Name: title, dtype: object