In [1]:
import pandas as pd
import numpy as np
import regex as re

import nltk
from nltk.tokenize import sent_tokenize
import contractions
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw CSV tables from the current working directory.
# `credits` and `keywords` are later merged into `metadata` on the `id` column.
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
# NOTE(review): `links` and `ratings` are not referenced by any later cell
# shown in this notebook — confirm they are needed before paying the load cost.
links = pd.read_csv('links.csv')
metadata = pd.read_csv('movies_metadata.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
# Remove the columns that no later cell in this notebook reads.
metadata = metadata.drop(
    columns=[
        'belongs_to_collection', 'homepage', 'poster_path', 'tagline',
        'video', 'popularity', 'spoken_languages', 'original_title',
    ]
)

# Keep only rows whose `id` parses as a number, instead of dropping rows by
# hard-coded position: fixed row labels silently target the wrong rows as soon
# as the CSV is re-exported, re-ordered, or this cell runs on a re-indexed frame.
# NOTE(review): this assumes the previously hard-coded labels 19730, 29503 and
# 35587 were exactly the rows with a non-numeric id — verify the resulting row
# count matches the old behaviour on the current CSV.
has_numeric_id = pd.to_numeric(metadata['id'], errors='coerce').notna()
metadata = metadata.loc[has_numeric_id].reset_index(drop=True)

# `id` is the join key for the credits/keywords merges, so it must be an int.
metadata['id'] = metadata['id'].astype(int)
metadata.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,overview,production_companies,production_countries,release_date,revenue,runtime,status,title,vote_average,vote_count
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,en,"Led by Woody, Andy's toys live happily in his ...","[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,Released,Toy Story,7.7,5415.0
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,en,When siblings Judy and Peter discover an encha...,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,Released,Jumanji,6.9,2413.0
2,False,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,en,A family wedding reignites the ancient feud be...,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,Released,Grumpier Old Men,6.5,92.0
3,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,Released,Waiting to Exhale,6.1,34.0
4,False,0,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,en,Just when George Banks has recovered from his ...,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,Released,Father of the Bride Part II,5.7,173.0


In [4]:
# Attach cast/crew to each movie, keeping only ids present in both tables,
# then discard every row that has a missing value in any column.
df = pd.merge(metadata, credits, on='id', how='inner')
df = df.dropna()

# Same again for the keyword table.
df = pd.merge(df, keywords, on='id', how='inner')
df = df.dropna()

# An inner merge emits one row per matching (left, right) pair, so an id that
# is repeated in any of the three tables would appear several times in `df`.
# Keep a single row per movie; this is a no-op when every id is already unique.
# NOTE(review): confirm whether the source CSVs actually repeat ids.
df = df.drop_duplicates(subset='id')

In [5]:
def get_name_value(x):
    """Return every ``'name'`` value found in a stringified list of dicts.

    Parameters
    ----------
    x : str
        Text such as ``"[{'id': 16, 'name': 'animation'}, ...]"``.

    Returns
    -------
    list of str
        The values in order of appearance; an empty list when none match.

    Notes
    -----
    Python's ``repr`` writes a string that contains an apostrophe with double
    quotes (``'name': "director's cut"``). Both quoting styles are matched so
    those values are no longer silently skipped. Values using backslash-escaped
    quotes are still not handled.
    """
    pattern = r"'name':\s*(?:'([^']*)'" r'|"([^"]*)")'
    # Each match is a (single_quoted, double_quoted) pair; only one alternative
    # can participate in a match, so the other group is the empty string.
    return [single or double for single, double in re.findall(pattern, x)]

def get_director(x):
    """Return the names of all crew entries whose job is ``'director'``.

    Parameters
    ----------
    x : str
        Stringified list of crew dicts. The pattern matches the literal
        lowercase text ``'job': 'director', 'name': ...``, so the caller must
        lowercase the text first and ``name`` must directly follow ``job``.

    Returns
    -------
    list of str
        Matching names in order of appearance; empty when there is none.

    Notes
    -----
    Names containing an apostrophe are written by ``repr`` with double quotes
    (``'name': "dan o'bannon"``); both quoting styles are accepted so such
    directors are not dropped.
    """
    pattern = r"'job': 'director', 'name':\s*(?:'([^']*)'" r'|"([^"]*)")'
    # One of the two capture groups is always empty; keep the one that matched.
    return [single or double for single, double in re.findall(pattern, x)]

def get_character(x):
    """Return up to the first five ``'character'`` values from a cast string.

    Parameters
    ----------
    x : str
        Stringified list of cast dicts, e.g.
        ``"[{'character': 'woody (voice)', 'name': 'tom hanks'}, ...]"``.

    Returns
    -------
    list of str
        At most five character names, in order of appearance.

    Notes
    -----
    Character names with an apostrophe (``"andy's mom"``) are written by
    ``repr`` with double quotes; both quoting styles are matched so they are
    not skipped, which would otherwise pull a later entry into the top five.
    """
    pattern = r"'character':\s*(?:'([^']*)'" r'|"([^"]*)")'
    # One of the two capture groups is always empty; keep the one that matched.
    found = [single or double for single, double in re.findall(pattern, x)]
    return found[:5]

def get_cast(x):
    """Return up to the first five ``'name'`` values from a cast string.

    Parameters
    ----------
    x : str
        Stringified list of cast dicts, e.g.
        ``"[{'character': 'woody (voice)', 'name': 'tom hanks'}, ...]"``.

    Returns
    -------
    list of str
        At most five names, in order of appearance.

    Notes
    -----
    Names with an apostrophe (``"peter o'toole"``) are written by ``repr``
    with double quotes; both quoting styles are matched so those cast members
    are not skipped in favour of a later entry.
    """
    pattern = r"'name':\s*(?:'([^']*)'" r'|"([^"]*)")'
    # One of the two capture groups is always empty; keep the one that matched.
    found = [single or double for single, double in re.findall(pattern, x)]
    return found[:5]

In [6]:

# Replace each stringified list-of-dicts column with the lowercase list of
# its 'name' values.
for column in ('genres', 'production_companies', 'production_countries'):
    df[column] = df[column].apply(lambda raw: get_name_value(raw.lower()))

# Directors come from the lowercased crew text.
df['director'] = df['crew'].apply(lambda raw: get_director(raw.lower()))

# Cast names and character names are both read from the lowercased cast text.
cast_lower = df['cast'].apply(lambda raw: raw.lower())
df['film_cast'] = cast_lower.apply(get_cast)
df['film_characters'] = cast_lower.apply(get_character)

# Keywords are flattened straight into one space-separated string.
df['keywords'] = df['keywords'].apply(
    lambda raw: ' '.join(get_name_value(raw.lower()))
)

# The raw crew/cast text is no longer needed once the fields are extracted.
df = df.drop(columns=['crew', 'cast'])
df.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,overview,production_companies,production_countries,release_date,revenue,runtime,status,title,vote_average,vote_count,keywords,director,film_cast,film_characters
0,False,30000000,"[animation, comedy, family]",862,tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",[pixar animation studios],[united states of america],1995-10-30,373554033.0,81.0,Released,Toy Story,7.7,5415.0,jealousy toy boy friendship friends rivalry bo...,[john lasseter],"[tom hanks, tim allen, don rickles, jim varney...","[woody (voice), buzz lightyear (voice), mr. po..."
1,False,65000000,"[adventure, fantasy, family]",8844,tt0113497,en,When siblings Judy and Peter discover an encha...,"[tristar pictures, teitler film, interscope co...",[united states of america],1995-12-15,262797249.0,104.0,Released,Jumanji,6.9,2413.0,board game disappearance new home recluse gian...,[joe johnston],"[robin williams, jonathan hyde, kirsten dunst,...","[alan parrish, samuel alan parrish / van pelt,..."
2,False,0,"[romance, comedy]",15602,tt0113228,en,A family wedding reignites the ancient feud be...,"[warner bros., lancaster gate]",[united states of america],1995-12-22,0.0,101.0,Released,Grumpier Old Men,6.5,92.0,fishing best friend duringcreditsstinger old men,[howard deutch],"[walter matthau, jack lemmon, ann-margret, sop...","[max goldman, john gustafson, ariel gustafson,..."
3,False,16000000,"[comedy, drama, romance]",31357,tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",[twentieth century fox film corporation],[united states of america],1995-12-22,81452156.0,127.0,Released,Waiting to Exhale,6.1,34.0,based on novel interracial relationship single...,[forest whitaker],"[whitney houston, angela bassett, loretta devi...","[robin stokes, marvin king, kenneth dawkins, j..."
4,False,0,[comedy],11862,tt0113041,en,Just when George Banks has recovered from his ...,"[sandollar productions, touchstone pictures]",[united states of america],1995-02-10,76578911.0,106.0,Released,Father of the Bride Part II,5.7,173.0,baby midlife crisis confidence aging daughter ...,[charles shyer],"[steve martin, diane keaton, martin short, kim...","[george banks, nina banks, franck eggelhoffer,..."


In [7]:
def convert_2_to_1(x):
    """Return a new list with every space removed from each string in ``x``.

    For example ``['tom hanks']`` becomes ``['tomhanks']``, turning a
    multi-word value into a single token. ``x`` itself is left untouched.
    """
    return [entry.replace(' ', '') for entry in x]

In [8]:
# Squash multi-word genres, directors and cast names into single tokens
# (e.g. 'tom hanks' -> 'tomhanks').
for column in ('genres', 'director', 'film_cast'):
    df[column] = df[column].apply(convert_2_to_1)
df.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,overview,production_companies,production_countries,release_date,revenue,runtime,status,title,vote_average,vote_count,keywords,director,film_cast,film_characters
0,False,30000000,"[animation, comedy, family]",862,tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",[pixar animation studios],[united states of america],1995-10-30,373554033.0,81.0,Released,Toy Story,7.7,5415.0,jealousy toy boy friendship friends rivalry bo...,[johnlasseter],"[tomhanks, timallen, donrickles, jimvarney, wa...","[woody (voice), buzz lightyear (voice), mr. po..."
1,False,65000000,"[adventure, fantasy, family]",8844,tt0113497,en,When siblings Judy and Peter discover an encha...,"[tristar pictures, teitler film, interscope co...",[united states of america],1995-12-15,262797249.0,104.0,Released,Jumanji,6.9,2413.0,board game disappearance new home recluse gian...,[joejohnston],"[robinwilliams, jonathanhyde, kirstendunst, br...","[alan parrish, samuel alan parrish / van pelt,..."
2,False,0,"[romance, comedy]",15602,tt0113228,en,A family wedding reignites the ancient feud be...,"[warner bros., lancaster gate]",[united states of america],1995-12-22,0.0,101.0,Released,Grumpier Old Men,6.5,92.0,fishing best friend duringcreditsstinger old men,[howarddeutch],"[waltermatthau, jacklemmon, ann-margret, sophi...","[max goldman, john gustafson, ariel gustafson,..."
3,False,16000000,"[comedy, drama, romance]",31357,tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",[twentieth century fox film corporation],[united states of america],1995-12-22,81452156.0,127.0,Released,Waiting to Exhale,6.1,34.0,based on novel interracial relationship single...,[forestwhitaker],"[whitneyhouston, angelabassett, lorettadevine,...","[robin stokes, marvin king, kenneth dawkins, j..."
4,False,0,[comedy],11862,tt0113041,en,Just when George Banks has recovered from his ...,"[sandollar productions, touchstone pictures]",[united states of america],1995-02-10,76578911.0,106.0,Released,Father of the Bride Part II,5.7,173.0,baby midlife crisis confidence aging daughter ...,[charlesshyer],"[stevemartin, dianekeaton, martinshort, kimber...","[george banks, nina banks, franck eggelhoffer,..."


In [9]:
def fix_contractions(x):
    """Expand contractions in ``x`` one token at a time.

    The text is split on single spaces, ``contractions.fix`` is applied to
    each piece separately, and the pieces are re-joined with single spaces.
    """
    tokens = x.split(' ')
    return ' '.join(map(contractions.fix, tokens))

def Lemmatize(x):
    """Lemmatize every space-separated token of ``x`` and re-join with spaces.

    A fresh ``WordNetLemmatizer`` is created on each call and ``lemmatize`` is
    invoked with the token only, so the library's default part of speech is
    used for every word.

    NOTE(review): the rendered notebook output shows "has" turning into "ha"
    and "banks" into "bank" — consider POS-aware lemmatization if that matters.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = x.split(' ')
    return ' '.join(map(lemmatizer.lemmatize, tokens))

In [10]:
# Free-text columns: lowercase, expand contractions, then lemmatize.
for text_column in ('overview', 'keywords'):
    normalized = df[text_column].apply(lambda text: text.lower())
    normalized = normalized.apply(fix_contractions)
    df[text_column] = normalized.apply(Lemmatize)

# List columns: flatten each list into one space-separated string.
for list_column in ('genres', 'director', 'film_cast'):
    df[list_column] = df[list_column].apply(' '.join)
df.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,overview,production_companies,production_countries,release_date,revenue,runtime,status,title,vote_average,vote_count,keywords,director,film_cast,film_characters
0,False,30000000,animation comedy family,862,tt0114709,en,"led by woody, andy's toy live happily in his r...",[pixar animation studios],[united states of america],1995-10-30,373554033.0,81.0,Released,Toy Story,7.7,5415.0,jealousy toy boy friendship friend rivalry boy...,johnlasseter,tomhanks timallen donrickles jimvarney wallace...,"[woody (voice), buzz lightyear (voice), mr. po..."
1,False,65000000,adventure fantasy family,8844,tt0113497,en,when sibling judy and peter discover an enchan...,"[tristar pictures, teitler film, interscope co...",[united states of america],1995-12-15,262797249.0,104.0,Released,Jumanji,6.9,2413.0,board game disappearance new home recluse gian...,joejohnston,robinwilliams jonathanhyde kirstendunst bradle...,"[alan parrish, samuel alan parrish / van pelt,..."
2,False,0,romance comedy,15602,tt0113228,en,a family wedding reignites the ancient feud be...,"[warner bros., lancaster gate]",[united states of america],1995-12-22,0.0,101.0,Released,Grumpier Old Men,6.5,92.0,fishing best friend duringcreditsstinger old men,howarddeutch,waltermatthau jacklemmon ann-margret sophialor...,"[max goldman, john gustafson, ariel gustafson,..."
3,False,16000000,comedy drama romance,31357,tt0114885,en,"cheated on, mistreated and stepped on, the wom...",[twentieth century fox film corporation],[united states of america],1995-12-22,81452156.0,127.0,Released,Waiting to Exhale,6.1,34.0,based on novel interracial relationship single...,forestwhitaker,whitneyhouston angelabassett lorettadevine lel...,"[robin stokes, marvin king, kenneth dawkins, j..."
4,False,0,comedy,11862,tt0113041,en,just when george bank ha recovered from his da...,"[sandollar productions, touchstone pictures]",[united states of america],1995-02-10,76578911.0,106.0,Released,Father of the Bride Part II,5.7,173.0,baby midlife crisis confidence aging daughter ...,charlesshyer,stevemartin dianekeaton martinshort kimberlywi...,"[george banks, nina banks, franck eggelhoffer,..."


In [11]:
# Build one space-separated text field per movie, in the order
# overview, keywords, genres, director, film_cast.
trailing_parts = df[['keywords', 'genres', 'director', 'film_cast']]
df['tags'] = df['overview'].str.cat(trailing_parts, sep=' ')
df.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,overview,production_companies,production_countries,release_date,...,runtime,status,title,vote_average,vote_count,keywords,director,film_cast,film_characters,tags
0,False,30000000,animation comedy family,862,tt0114709,en,"led by woody, andy's toy live happily in his r...",[pixar animation studios],[united states of america],1995-10-30,...,81.0,Released,Toy Story,7.7,5415.0,jealousy toy boy friendship friend rivalry boy...,johnlasseter,tomhanks timallen donrickles jimvarney wallace...,"[woody (voice), buzz lightyear (voice), mr. po...","led by woody, andy's toy live happily in his r..."
1,False,65000000,adventure fantasy family,8844,tt0113497,en,when sibling judy and peter discover an enchan...,"[tristar pictures, teitler film, interscope co...",[united states of america],1995-12-15,...,104.0,Released,Jumanji,6.9,2413.0,board game disappearance new home recluse gian...,joejohnston,robinwilliams jonathanhyde kirstendunst bradle...,"[alan parrish, samuel alan parrish / van pelt,...",when sibling judy and peter discover an enchan...
2,False,0,romance comedy,15602,tt0113228,en,a family wedding reignites the ancient feud be...,"[warner bros., lancaster gate]",[united states of america],1995-12-22,...,101.0,Released,Grumpier Old Men,6.5,92.0,fishing best friend duringcreditsstinger old men,howarddeutch,waltermatthau jacklemmon ann-margret sophialor...,"[max goldman, john gustafson, ariel gustafson,...",a family wedding reignites the ancient feud be...
3,False,16000000,comedy drama romance,31357,tt0114885,en,"cheated on, mistreated and stepped on, the wom...",[twentieth century fox film corporation],[united states of america],1995-12-22,...,127.0,Released,Waiting to Exhale,6.1,34.0,based on novel interracial relationship single...,forestwhitaker,whitneyhouston angelabassett lorettadevine lel...,"[robin stokes, marvin king, kenneth dawkins, j...","cheated on, mistreated and stepped on, the wom..."
4,False,0,comedy,11862,tt0113041,en,just when george bank ha recovered from his da...,"[sandollar productions, touchstone pictures]",[united states of america],1995-02-10,...,106.0,Released,Father of the Bride Part II,5.7,173.0,baby midlife crisis confidence aging daughter ...,charlesshyer,stevemartin dianekeaton martinshort kimberlywi...,"[george banks, nina banks, franck eggelhoffer,...",just when george bank ha recovered from his da...


In [14]:
# Strip the possessive suffix ("andy's" -> "andy").
# The trailing \b restricts the match to an "'s" that ends a word; a plain
# substring replace also removed the opening of single-quoted words starting
# with "s" (e.g. "'shining'" became "hining'").
df['tags'] = df['tags'].str.replace(r"'s\b", "", regex=True)
df.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,overview,production_companies,production_countries,release_date,...,runtime,status,title,vote_average,vote_count,keywords,director,film_cast,film_characters,tags
0,False,30000000,animation comedy family,862,tt0114709,en,"led by woody, andy's toy live happily in his r...",[pixar animation studios],[united states of america],1995-10-30,...,81.0,Released,Toy Story,7.7,5415.0,jealousy toy boy friendship friend rivalry boy...,johnlasseter,tomhanks timallen donrickles jimvarney wallace...,"[woody (voice), buzz lightyear (voice), mr. po...","led by woody, andy toy live happily in his roo..."
1,False,65000000,adventure fantasy family,8844,tt0113497,en,when sibling judy and peter discover an enchan...,"[tristar pictures, teitler film, interscope co...",[united states of america],1995-12-15,...,104.0,Released,Jumanji,6.9,2413.0,board game disappearance new home recluse gian...,joejohnston,robinwilliams jonathanhyde kirstendunst bradle...,"[alan parrish, samuel alan parrish / van pelt,...",when sibling judy and peter discover an enchan...
2,False,0,romance comedy,15602,tt0113228,en,a family wedding reignites the ancient feud be...,"[warner bros., lancaster gate]",[united states of america],1995-12-22,...,101.0,Released,Grumpier Old Men,6.5,92.0,fishing best friend duringcreditsstinger old men,howarddeutch,waltermatthau jacklemmon ann-margret sophialor...,"[max goldman, john gustafson, ariel gustafson,...",a family wedding reignites the ancient feud be...
3,False,16000000,comedy drama romance,31357,tt0114885,en,"cheated on, mistreated and stepped on, the wom...",[twentieth century fox film corporation],[united states of america],1995-12-22,...,127.0,Released,Waiting to Exhale,6.1,34.0,based on novel interracial relationship single...,forestwhitaker,whitneyhouston angelabassett lorettadevine lel...,"[robin stokes, marvin king, kenneth dawkins, j...","cheated on, mistreated and stepped on, the wom..."
4,False,0,comedy,11862,tt0113041,en,just when george bank ha recovered from his da...,"[sandollar productions, touchstone pictures]",[united states of america],1995-02-10,...,106.0,Released,Father of the Bride Part II,5.7,173.0,baby midlife crisis confidence aging daughter ...,charlesshyer,stevemartin dianekeaton martinshort kimberlywi...,"[george banks, nina banks, franck eggelhoffer,...",just when george bank ha recovered from his da...


In [15]:
# Persist the cleaned frame without the row index.
# Note: `production_companies`, `production_countries` and `film_characters`
# still hold Python lists at this point, so the CSV stores their string form.
df.to_csv('cleaned.csv',index = False)