In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
def download_data():
    api.dataset_download_files('tmdb/tmdb-movie-metadata',path='../')
download_data()

In [4]:
import zipfile

with zipfile.ZipFile('../tmdb-movie-metadata.zip','r') as zipref:
    zipref.extractall('../data')

import os
os.remove('../tmdb-movie-metadata.zip')

In [1]:
import pandas as pd
import numpy as np
import json
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies= pd.read_csv('../data/tmdb_5000_movies.csv')
credits = pd.read_csv('../data/tmdb_5000_credits.csv')


In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [5]:
movies = movies.merge(credits, on='title')
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [6]:
# genres
# id
# keywords
# title
# overview
# cast
# crew


movies = movies[['movie_id','title','overview','keywords','cast','crew','genres']]
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   keywords  4809 non-null   object
 4   cast      4809 non-null   object
 5   crew      4809 non-null   object
 6   genres    4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.6+ KB


In [7]:
# missing data
movies.isnull().sum()

movie_id    0
title       0
overview    3
keywords    0
cast        0
crew        0
genres      0
dtype: int64

In [8]:
# drop the those overview records
movies.dropna(inplace=True)

movies.isnull().sum()

movie_id    0
title       0
overview    0
keywords    0
cast        0
crew        0
genres      0
dtype: int64

In [9]:
# check for duplicated records


movies.duplicated().sum()

0

In [10]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [11]:
# getting the genre tags

def convert(obj):
    L = []

    obj = json.loads(obj)
    for i in obj:
        L.append(i['name'])
    
    return L




In [12]:
movies['genres'] = movies['genres'].apply(convert)

movies.head()

Unnamed: 0,movie_id,title,overview,keywords,cast,crew,genres
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[Action, Adventure, Fantasy, Science Fiction]"
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[Adventure, Fantasy, Action]"
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[Action, Adventure, Crime]"
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[Action, Crime, Drama, Thriller]"
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[Action, Adventure, Science Fiction]"


In [13]:
movies['keywords'] = movies['keywords'].apply(convert)

movies.head(1)

Unnamed: 0,movie_id,title,overview,keywords,cast,crew,genres
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[Action, Adventure, Fantasy, Science Fiction]"


In [14]:
def convert3(obj):
    L = []
    counter = 0
    obj = json.loads(obj)
    for i in obj:
        if counter != 3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [15]:
movies['cast'] = movies['cast'].apply(convert3)
movies['cast']

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4806, dtype: object

In [16]:


def fetch_director(obj):
    L = []

    for i in json.loads(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L



In [17]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [18]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])


In [19]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [20]:
new_df = movies[['movie_id','title','tags']]

In [21]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [22]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())
new_df['lower_title'] = new_df['title'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['lower_title'] = new_df['title'].apply(lambda x: x.lower())


In [23]:
new_df.head()

Unnamed: 0,movie_id,title,tags,lower_title
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",avatar
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",pirates of the caribbean: at world's end
2,206647,Spectre,a cryptic message from bond’s past sends him o...,spectre
3,49026,The Dark Knight Rises,following the death of district attorney harve...,the dark knight rises
4,49529,John Carter,"john carter is a war-weary, former military ca...",john carter


In [24]:
ps = PorterStemmer()

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(text))
    
    return " ".join(y)



In [25]:
new_df['tags'] =  new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] =  new_df['tags'].apply(stem)


In [26]:
# text vectorization
cv = CountVectorizer(max_features=5000,stop_words='english')


In [27]:
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors.shape

(4806, 5000)

In [28]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [29]:
similarities = cosine_similarity(vectors)

In [30]:
def recommend(movie: str):
    movie = movie.lower()
    movie_id = new_df[new_df['lower_title'] == movie].index
    best5 = np.argsort(similarities[movie_id])[0,-6:-1]
    return new_df['title'].values[best5]
    

recommend('inception')



array(['Chicago Overcoat', 'Transformers: Revenge of the Fallen',
       'Timecop', 'Star Trek II: The Wrath of Khan', 'Duplex'],
      dtype=object)

In [32]:
import pickle

In [33]:
pickle.dump(new_df.to_dict(), open('movies.pkl','wb'))

In [34]:
pickle.dump(similarities, open('movie_similarities.pkl','wb'))