# Modules

In [1]:
import os
import numpy as np
import pandas as pd
from ast import literal_eval
from google.colab import drive

# Dataset

In [5]:
drive.mount('gdrive')

Mounted at gdrive


In [6]:
%%shell

pip install -q kaggle
mkdir -p ~/.kaggle
cp gdrive/MyDrive/'Colab Notebooks'/kaggle.json ~/.kaggle/
chmod 600 /root/.kaggle/kaggle.json

ls ~/.kaggle

kaggle.json




In [7]:
%%shell
kaggle datasets download -d rounakbanik/the-movies-dataset
unzip the-movies-dataset.zip
ls

Downloading the-movies-dataset.zip to /content
 99% 226M/228M [00:02<00:00, 59.6MB/s]
100% 228M/228M [00:03<00:00, 79.0MB/s]
Archive:  the-movies-dataset.zip
  inflating: credits.csv             
  inflating: keywords.csv            
  inflating: links.csv               
  inflating: links_small.csv         
  inflating: movies_metadata.csv     
  inflating: ratings.csv             
  inflating: ratings_small.csv       
credits.csv   links.csv		   ratings.csv	      the-movies-dataset.zip
gdrive	      links_small.csv	   ratings_small.csv
keywords.csv  movies_metadata.csv  sample_data




In [8]:
# !wget --no-check-certificate smaller dataset : https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !unzip ml-latest-small.zip

# !wget --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-latest.zip
# !unzip ml-latest.zip

In [2]:
data = pd.read_csv('movies_metadata.csv', dtype={'popularity' : 'string'})
print(data.shape)
data.sample(3)

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
11782,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",,13352,tt0329737,en,It's a Very Merry Muppet Christmas Movie,The owner of a bank (Miss Bitterman) wants to ...,1.991539,/yUWnJnoVKJgD8rLfQizwQXCtd2A.jpg,"[{'name': 'National Broadcasting Company', 'id...","[{'iso_3166_1': 'US', 'name': 'United States o...",2002-11-29,0.0,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,It's a Very Merry Muppet Christmas Movie,False,5.4,15.0
28508,False,"{'id': 479319, 'name': 'George Carlin Comedy C...",0,"[{'id': 35, 'name': 'Comedy'}]",http://www.georgecarlin.com,13643,tt0963207,en,George Carlin: It's Bad for Ya!,"It's Bad For Ya, Carlin's Emmy nominated 14th ...",2.31176,/qRKCt403iRtcIgBDzWCiPln4fUz.jpg,"[{'name': 'Cable Stuff Productions', 'id': 2789}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2008-03-01,0.0,70.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Let's cut the crap.,George Carlin: It's Bad for Ya!,False,8.1,33.0
21743,False,,30000000,"[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",http://www.runnerrunnermovie.com/,146238,tt2364841,en,Runner Runner,When a poor college student who cracks an onli...,8.864607,/vh3pmcySB3sWkNwLRIyrbTugSrD.jpg,"[{'name': 'Double Feature Films', 'id': 215}, ...","[{'iso_3166_1': 'US', 'name': 'United States o...",2013-09-24,62616646.0,91.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The house always wins.,Runner Runner,False,5.5,547.0


# Methodology
ref: [link](https://www.datacamp.com/community/tutorials/recommender-systems-python)

## Simple RS

$
Weighted Rating, WR = (\frac{v}{v+m} \cdot R) + (\frac{m}{v+m} \cdot C)
$
<br><br>
v : votes

m : minimum votes required

R : average rating

C : average vote

In [7]:
# vote describe quality of the movie

C = data['vote_average'].mean()
C
# average vote is 5.6 out of 10

5.618207215133889

In [8]:
# vote count describes movie popularity

m = data['vote_count'].quantile(0.90)
m
# 90% of movies has 160 vote counts

160.0

In [9]:
data_filtered = data.loc[data['vote_count'] >= m]
data_filtered.shape
# sized down data size to almost 10%

(4555, 24)

In [10]:
def weighted_rating(data, m=m, C=C):
    v = data['vote_count']
    R = data['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [11]:
data_filtered['vote_weighted'] = data_filtered.apply(weighted_rating, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
data_filtered.sort_values('vote_weighted', ascending=False)[['title','vote_count','vote_average','vote_weighted']]

# System recommended movies based on top average vote with vote counts more than 90th percentile 

Unnamed: 0,title,vote_count,vote_average,vote_weighted
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
...,...,...,...,...
9710,Son of the Mask,346.0,3.6,4.238168
12911,Disaster Movie,250.0,3.1,4.082715
3471,Battlefield Earth,259.0,3.0,3.999793
11557,Epic Movie,334.0,3.2,3.983225


## Content-Based

In [19]:
def get_recommendations(title, cos_sim_mtx, movie_title_idx, data):
    idx = movie_title_idx[title]
    sim_scores = list(enumerate(cos_sim_mtx[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]       # get top 10 similarity score ([0] is itself)
    movie_idx = [i[0] for i in sim_scores]
    return data['title'].iloc[movie_idx]

### Univariate - [Overview]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [12]:
N = int(0.6 * len(data))    # limit sample size to avoid notebook from crashing due to ram limit
data_cb = data.head(N).copy()
data_cb['overview'] = data_cb['overview'].fillna('')
data_cb.shape

(27279, 24)

In [13]:
# Apply TF-IDF to extract word importance in each `overview`

tfidf = TfidfVectorizer(stop_words='english', max_features=25000, lowercase=True, token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')       # ignore pure numeric tokens  # token_pattern=r'(?u)\b[A-Za-z]+\b'
# remove English stop words, e.g. the, a, that...
# limit vocabs to 30,000 to avoid notebook from crashing due to ram limit

tfidf_mtx = tfidf.fit_transform(data_cb['overview'])
print(tfidf_mtx.shape)
print(tfidf.get_feature_names()[:15])

(27279, 25000)
['10th', '11th', '12th', '13th', '14th', '15th', '1600s', '16mm', '16th', '1700s', '17th', '1800s', '1830s', '1840s', '1850s']


In [14]:
# Apply cosine similarity to recommend movie with similar overview

cos_sim_mtx = linear_kernel(tfidf_mtx, tfidf_mtx)
cos_sim_mtx.shape

(27279, 27279)

In [15]:
# reverse index & title

movie_idx = pd.Series(data_cb.index, index=data_cb['title']).drop_duplicates()
movie_idx

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                     ...  
The Great Rupert                                     27274
The Legend of Hell's Gate: An American Conspiracy    27275
Multiple Sarcasms                                    27276
A Woman Called Golda                                 27277
These Wilder Years                                   27278
Length: 27279, dtype: int64

In [16]:
get_recommendations('Batman Begins', cos_sim_mtx, movie_idx, data_cb)

# System recommended movies related to Batman based on movie overview.

21194    Batman Unmasked: The Psychology of the Dark Kn...
19792              Batman: The Dark Knight Returns, Part 1
18035                                     Batman: Year One
15511                           Batman: Under the Red Hood
585                                                 Batman
150                                         Batman Forever
3095                          Batman: Mask of the Phantasm
1328                                        Batman Returns
21400                      Batman: Mystery of the Batwoman
18252                                The Dark Knight Rises
Name: title, dtype: object

### Multivariate - [cast, crew, director, genres]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
N = int(0.55 * len(data))    # limit sample size to avoid notebook from crashing due to ram limit
data_cb = data.head(N).copy()
data_cb['overview'] = data_cb['overview'].fillna('')
data_cb.shape

(25006, 24)

In [5]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [6]:
# remove non-numeric value in ['id']
mask = pd.to_numeric(data_cb['id'], errors='coerce').isna() 
data_cb = data_cb.loc[~mask]

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
data_cb['id'] = data_cb['id'].astype('int')

data_cb = data_cb.merge(credits, on='id')
data_cb = data_cb.merge(keywords, on='id')

In [7]:
# find director name from [crew]
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# extract only top 3 elements from list in features
def get_top_3(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]

        if len(names) > 3:
            names = names[:3]
        return names

    return []   # return empty list if list is not found

# strip spaces and 
def preprocess_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    elif isinstance(x, str):
        return str.lower(x.replace(" ", ""))

    return ''

def create_criteria_col(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [8]:
features = ['cast', 'crew', 'keywords', 'genres']

for feature in features:
    data_cb[feature] = data_cb[feature].apply(literal_eval)     # convert raw string to dictionary format

    if feature == 'crew':
        data_cb['director'] = data_cb['crew'].apply(get_director)
    else:
        data_cb[feature] = data_cb[feature].apply(get_top_3)

In [9]:
features = ['cast', 'director', 'keywords', 'genres']

for feature in features:
    data_cb[feature] = data_cb[feature].apply(preprocess_data)

In [10]:
data_cb['criteria'] = data_cb.apply(create_criteria_col, axis=1)
data_cb['criteria']

0        jealousy toy boy tomhanks timallen donrickles ...
1        boardgame disappearance basedonchildren'sbook ...
2        fishing bestfriend duringcreditsstinger walter...
3        basedonnovel interracialrelationship singlemot...
4        baby midlifecrisis confidence stevemartin dian...
                               ...                        
25152    adolescence jessejames reileymcclendon stephen...
25153    suicide stv independentfilm winonaryder wesben...
25154     stephanbender maximillianroeg thomasjayryan j...
25155    netherlands kidnapping delinquent theomaassen ...
25156    recycling communityservice salesman thurelindh...
Name: criteria, Length: 25157, dtype: object

In [11]:
cv = CountVectorizer(stop_words='english', max_features=35000, lowercase=True, token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b' )
cv_matrix = cv.fit_transform(data_cb['criteria'])
cv_matrix.shape     # (# sample, # vocabs)

(25157, 35000)

In [12]:
cos_sim_mtx = cosine_similarity(cv_matrix, cv_matrix)
cos_sim_mtx.shape

(25157, 25157)

In [13]:
# prep `for get_recommendation`
movie_idx = pd.Series(data_cb.index, index=data_cb['title']).drop_duplicates()      

In [28]:
get_recommendations('The Godfather', cos_sim_mtx, movie_idx, data_cb)

1930            The Godfather: Part III
1195             The Godfather: Part II
15571                   The Rain People
18894                         Last Exit
7981     The Night of the Following Day
18215                 The Son of No One
23494               Hide in Plain Sight
7752                           Mitchell
3636                            Serpico
16736                     The Organizer
Name: title, dtype: object

## Collaborative

### User-Based

### Item Based