# Requirement

## Download Data

In [1]:
! mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d rounakbanik/the-movies-dataset

Downloading the-movies-dataset.zip to /content
100% 227M/228M [00:05<00:00, 52.8MB/s]
100% 228M/228M [00:05<00:00, 44.2MB/s]


In [2]:
import zipfile
zipfile.ZipFile('/content/the-movies-dataset.zip').extractall()

In [3]:
import pandas as pd
import numpy as np

movies = pd.read_csv('/content/movies_metadata.csv')


  exec(code_obj, self.user_global_ns, self.user_ns)


## Useful Functions

In [4]:
# Get ID By Name
def GetIDByName(data,name):
    temp = data.loc[data['title'] == name]
    return temp['imdb_id'].value

In [5]:
# Convert to float and handling float
def to_float(x):
    try:
        x = float(x)
    except: 
        x = np.nan
    return x

In [6]:
# Extract Genres
def ExtractGenres(items):
    output = []
    for item in items:
        output.append(item["name"])
    return output

In [7]:
#Function to compute the IMDB weighted rating for each movie
def weighted_rating(data):
    m = np.quantile(data['vote_count'], 0.8)
    C = data['vote_average'].mean()
    v = data['vote_count']
    R = data['vote_average']
    # Compute the weighted score
    return (v/(v+m) * R) + (m/(m+v) * C)

In [8]:
# Function to convert all non-integer IDs to NaN
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

In [9]:
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

In [10]:
# Returns the list top 3 elements or entire list; whichever is more.
def generate_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [11]:
# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase
def sanitize(x):
    if isinstance(x, list):
        #Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [12]:
#Function that creates a soup out of the desired metadata
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

# Analysing

In [13]:
# Check columns
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [14]:
# Preview
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [15]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [16]:
# Chcek NANs
for col in range(movies.shape[1]):
  print(movies.columns[col],"\t",movies.iloc[:,col].isna().sum())

adult 	 0
belongs_to_collection 	 40972
budget 	 0
genres 	 0
homepage 	 37684
id 	 0
imdb_id 	 17
original_language 	 11
original_title 	 0
overview 	 954
popularity 	 5
poster_path 	 386
production_companies 	 3
production_countries 	 3
release_date 	 87
revenue 	 6
runtime 	 263
spoken_languages 	 6
status 	 87
tagline 	 25054
title 	 6
video 	 6
vote_average 	 6
vote_count 	 6


In [17]:
# Chekck status of movies
movies['status'].value_counts()

Released           45014
Rumored              230
Post Production       98
In Production         20
Planned               15
Canceled               2
Name: status, dtype: int64

## Cleaning and data processing

### Drop Nan

In [18]:
movies = movies.dropna(subset=['title'],how="all")

In [19]:
for col in range(movies.shape[1]):
  print(movies.columns[col],"\t",movies.iloc[:,col].isna().sum())

adult 	 0
belongs_to_collection 	 40970
budget 	 0
genres 	 0
homepage 	 37682
id 	 0
imdb_id 	 17
original_language 	 11
original_title 	 0
overview 	 954
popularity 	 0
poster_path 	 383
production_companies 	 0
production_countries 	 0
release_date 	 84
revenue 	 0
runtime 	 257
spoken_languages 	 0
status 	 81
tagline 	 25048
title 	 0
video 	 0
vote_average 	 0
vote_count 	 0


### Drop except "Release" column

In [20]:
movies = movies[movies.status == "Released"]

### Drop 0.0 runtime

### ِConvert data to float

In [21]:
#Apply the to_float function to all values in the budget column
movies['budget'] = movies['budget'].apply(to_float)

#Try converting to float using pandas astype
movies['budget'] = movies['budget'].astype('float')


## Calculating weighted rating

In [22]:
movies['score'] = weighted_rating(movies)

# Mess around with data

In [23]:
smallData = movies[['title', 'release_date', 'budget', 'revenue', 'runtime', 'genres', 'score']]

In [24]:
smallData

Unnamed: 0,title,release_date,budget,revenue,runtime,genres,score
0,Toy Story,1995-10-30,30000000.0,373554033.0,81.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",7.680627
1,Jumanji,1995-12-15,65000000.0,262797249.0,104.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.873583
2,Grumpier Old Men,1995-12-22,0.0,0.0,101.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",6.187467
3,Waiting to Exhale,1995-12-22,16000000.0,81452156.0,127.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",5.814209
4,Father of the Bride Part II,1995-02-10,0.0,76578911.0,106.0,"[{'id': 35, 'name': 'Comedy'}]",5.682624
...,...,...,...,...,...,...,...
45461,Subdue,,0.0,0.0,90.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",5.592457
45462,Century of Birthing,2011-11-17,0.0,0.0,360.0,"[{'id': 18, 'name': 'Drama'}]",5.811255
45463,Betrayal,2003-08-01,0.0,0.0,90.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",5.431715
45464,Satan Triumphant,1917-10-21,0.0,0.0,87.0,[],5.623682


In [25]:
#Convert release_date into pandas datetime format
smallData['release_date'] = pd.to_datetime(smallData['release_date'], errors='coerce')

#Extract year from the datetime
smallData['year'] = smallData['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [26]:
import ast
import pandas as pd



smallData["genres"] = smallData["genres"].apply(lambda x: ast.literal_eval(x))
smallData["genres"] = smallData["genres"].apply(lambda x: ExtractGenres(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [27]:
smallData['overview'], smallData['id'], smallData['imdb_id'] = movies['overview'],movies['id'], movies['imdb_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
smallData.head()

Unnamed: 0,title,release_date,budget,revenue,runtime,genres,score,year,overview,id,imdb_id
0,Toy Story,1995-10-30,30000000.0,373554033.0,81.0,"[Animation, Comedy, Family]",7.680627,1995,"Led by Woody, Andy's toys live happily in his ...",862,tt0114709
1,Jumanji,1995-12-15,65000000.0,262797249.0,104.0,"[Adventure, Fantasy, Family]",6.873583,1995,When siblings Judy and Peter discover an encha...,8844,tt0113497
2,Grumpier Old Men,1995-12-22,0.0,0.0,101.0,"[Romance, Comedy]",6.187467,1995,A family wedding reignites the ancient feud be...,15602,tt0113228
3,Waiting to Exhale,1995-12-22,16000000.0,81452156.0,127.0,"[Comedy, Drama, Romance]",5.814209,1995,"Cheated on, mistreated and stepped on, the wom...",31357,tt0114885
4,Father of the Bride Part II,1995-02-10,0.0,76578911.0,106.0,[Comedy],5.682624,1995,Just when George Banks has recovered from his ...,11862,tt0113041


In [29]:
# Slice moives directed after 2005 (for control memory size)
smallData = smallData[smallData['year'] > '2003']

In [30]:
smallData.to_csv('metadata_cleaned.csv')

TODO : 1) Drop Nans for genres , years etc.
       2) Drop Score nans and cal. score again.
       3) Show vote average eq. 10 and explain why we should use weighted values
       4)
       

In [31]:
smallData = pd.read_csv('metadata_cleaned.csv',low_memory=True)

# TF-IDF

## Tensorflow

In [32]:

# import tensorflow as tf
# tk = tf.keras.preprocessing.text.Tokenizer(num_words=30000)
# tk.fit_on_texts(smallData['overview'])

# tfidf_matrix = tk.sequences_to_matrix(tk.texts_to_sequences(smallData['overview']), mode='tfidf')
# tfidf_matrix.shape


Take much time, so ignored!

## Scikit learn

In [33]:
#Import TfIdfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
smallData['overview'] = smallData['overview'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(smallData['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(20408, 50598)



---



In [34]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [35]:
indices = pd.Series(smallData.index, index=smallData['title']).drop_duplicates()

In [36]:
# Function that takes in movie title as input and gives recommendations 
def content_recommender(title, cosine_sim=cosine_sim, df=smallData, indices=indices):
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [37]:
try:
  print(content_recommender('Now You See Me 2'))
except:
  print("Invalid input!")

14335                                  Cold Deck
14154                          World of Tomorrow
12125                                After Death
4874                                       Setup
14792                                   Triple 9
17784                                 The Graves
373      Miss Congeniality 2: Armed and Fabulous
7049                                   Repeaters
9197                          Closer to the Moon
657                                 Just Friends
Name: title, dtype: object


# Crew & Keywords Dataset

In [38]:
# Load the keywords and credits files
credits = pd.read_csv('/content/credits.csv')
keywords = pd.read_csv('/content/keywords.csv')

In [39]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [40]:
#Clean the ids of df
smallData['id'] = smallData['id'].apply(clean_ids)

#Filter all rows that have a null ID
smallData = smallData[smallData['id'].notnull()]

In [41]:
# Convert IDs into integer
smallData['id'] = smallData['id'].astype('int')
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
smallData = smallData.merge(credits, on='id')
smallData = smallData.merge(keywords, on='id')

#Display the head of df
smallData.head()

Unnamed: 0.1,Unnamed: 0,title,release_date,budget,revenue,runtime,genres,score,year,overview,id,imdb_id,cast,crew,keywords
0,711,War Stories Our Mother Never Told Us,,0.0,0.0,95.0,[],5.623682,NaT,Seven New Zealand women speak about their live...,365371,tt0114894,[],[],[]
1,734,Vermont Is for Lovers,,0.0,0.0,88.0,[],5.623682,NaT,Vermont is for Lovers is an independently prod...,215107,tt0105737,"[{'cast_id': 3, 'character': 'George', 'credit...","[{'credit_id': '52fe4df4c3a368484e20642b', 'de...",[]
2,868,Venice,2010-05-25,0.0,0.0,110.0,"['Drama', 'Romance']",5.760141,2010,An atmospheric coming-of-age story featuring a...,79782,tt1684935,"[{'cast_id': 1005, 'character': 'Marek', 'cred...","[{'credit_id': '52fe49e5c3a368484e145fb7', 'de...",[]
3,1081,The Sleepover,2013-10-12,0.0,0.0,6.0,"['Comedy', 'Horror']",5.66938,2013,"The town of Derry has a secret, but no one tol...",141210,tt2250194,"[{'cast_id': 2, 'character': 'Rachel', 'credit...","[{'credit_id': '52fe4aaf9251416c750ea6f1', 'de...",[]
4,2114,The Farmer's Wife,2012-06-20,0.0,0.0,18.0,['Drama'],5.707841,2012,"As her surroundings are invaded by outsiders, ...",143750,tt2140519,"[{'cast_id': 10, 'character': 'The Auctioneer'...","[{'credit_id': '52fe4b169251416c750f7cd5', 'de...","[{'id': 214549, 'name': 'short'}]"


In [42]:
# Convert the stringified objects into the native python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    smallData[feature] = smallData[feature].apply(literal_eval)

In [43]:
#Define the new director feature
smallData['director'] = smallData['crew'].apply(get_director)
smallData['cast'] = smallData['cast'].apply(generate_list)
smallData['keywords'] = smallData['keywords'].apply(generate_list)
smallData['genres'] = smallData['genres'].apply(lambda x: x[:3])

In [44]:
smallData[['title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,title,cast,director,keywords,genres
0,War Stories Our Mother Never Told Us,[],,[],[]
1,Vermont Is for Lovers,"[George Thrush, Marya Cohn, Ann O'Brien]",John O'Brien,[],[]
2,Venice,"[Marcin Walewski, Magdalena Cielecka, Mariusz ...",Jan Jakub Kolski,[],"[Drama, Romance]"
3,The Sleepover,"[Josh Feldman, Gus Kamp, Carolyn Jania]",Chris Cullari,[],"[Comedy, Horror]"
4,The Farmer's Wife,"[James Cartwright, Geraldine James, Alex Kelly]",Francis Lee,[short],[Drama]


In [45]:
#Apply the generate_list function to cast, keywords, director and genres
for feature in ['cast', 'director', 'genres', 'keywords']:
    smallData[feature] = smallData[feature].apply(sanitize)

In [46]:
smallData['soup'] = smallData.apply(create_soup, axis=1)

## Processing soup

In [47]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Define a new CountVectorizer object and create vectors for the soup
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(smallData['soup'])

In [48]:
#Import cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity score (equivalent to dot product for tf-idf vectors)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [49]:
# Reset index of your df and construct reverse mapping again
smallData = smallData.reset_index()
smallData.to_csv('metaData.csv')
indices2 = pd.Series(smallData.index, index=smallData['title'])

In [50]:
content_recommender('Now You See Me 2', cosine_sim2, smallData, indices2)

4686                30 Minutes or Less
9998     I Accidentally Domed Your Son
103        Around the World in 80 Days
14841                       Bang Bang!
14842                       Bang Bang!
20094                      InAlienable
6419             G.I. Joe: Retaliation
1234                        Epic Movie
1821                Witless Protection
1927                  Virgin Territory
Name: title, dtype: object