# Data Preprocessing

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import json

# Set seaborn's global plotting defaults, using the 'whitegrid' style.
sns.set(style='whitegrid')

## 1 Data Integration

### 1.1 Read tmdb_5000_movies.csv 

In [3]:
# Load the raw movie metadata and preview its first five rows.
movies_df = pd.read_csv(filepath_or_buffer='data/tmdb_5000_movies.csv')
movies_df.iloc[:5]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
# (number of rows, number of columns) of the movies table.
len(movies_df.index), len(movies_df.columns)

(4803, 20)

Look into the first data input for more details of the features

In [5]:
# Scalar lookup of the overview text stored under row label 0.
movies_df.at[0, 'overview']

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [6]:
# Decode the JSON-encoded 'genres' cell of row 0 into one table row per entry.
avatar_genres = pd.DataFrame(data=json.loads(movies_df.at[0, 'genres']))
avatar_genres

Unnamed: 0,id,name
0,28,Action
1,12,Adventure
2,14,Fantasy
3,878,Science Fiction


In [7]:
# Decode the JSON-encoded 'keywords' cell of row 0 into one table row per entry.
avatar_keywords = pd.DataFrame(data=json.loads(movies_df.at[0, 'keywords']))
avatar_keywords

Unnamed: 0,id,name
0,1463,culture clash
1,2964,future
2,3386,space war
3,3388,space colony
4,3679,society
5,3801,space travel
6,9685,futuristic
7,9840,romance
8,9882,space
9,9951,alien


In [8]:
# Decode the JSON-encoded 'production_companies' cell of row 0 into a table.
avatar_production_companies = pd.DataFrame(
    data=json.loads(movies_df.at[0, 'production_companies'])
)
avatar_production_companies

Unnamed: 0,name,id
0,Ingenious Film Partners,289
1,Twentieth Century Fox Film Corporation,306
2,Dune Entertainment,444
3,Lightstorm Entertainment,574


In [9]:
# Decode the JSON-encoded 'production_countries' cell of row 0 into a table.
avatar_production_countries = pd.DataFrame(
    data=json.loads(movies_df.at[0, 'production_countries'])
)
avatar_production_countries

Unnamed: 0,iso_3166_1,name
0,US,United States of America
1,GB,United Kingdom


In [10]:
# Decode the JSON-encoded 'spoken_languages' cell of row 0 into a table.
avatar_spoken_languages = pd.DataFrame(
    data=json.loads(movies_df.at[0, 'spoken_languages'])
)
avatar_spoken_languages

Unnamed: 0,iso_639_1,name
0,en,English
1,es,Español


### 1.2 Read tmdb_5000_credits.csv

In [11]:
# Load the raw credits table and preview its first five rows.
credits_df = pd.read_csv(filepath_or_buffer='data/tmdb_5000_credits.csv')
credits_df.iloc[:5]

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [12]:
# (number of rows, number of columns) of the credits table.
len(credits_df.index), len(credits_df.columns)

(4803, 4)

Look into the first data input for more details of the features

In [13]:
# Show the first credits record as a Series (looked up via its index label).
credits_df.loc[credits_df.index[0]]

movie_id                                                19995
title                                                  Avatar
cast        [{"cast_id": 242, "character": "Jake Sully", "...
crew        [{"credit_id": "52fe48009251416c750aca23", "de...
Name: 0, dtype: object

In [14]:
# Decode the JSON-encoded 'cast' cell of row 0 and show at most 20 entries.
avatar_cast = pd.DataFrame(data=json.loads(credits_df.at[0, 'cast']))
avatar_cast.iloc[:20]

Unnamed: 0,cast_id,character,credit_id,gender,id,name,order
0,242,Jake Sully,5602a8a7c3a3685532001c9a,2,65731,Sam Worthington,0
1,3,Neytiri,52fe48009251416c750ac9cb,1,8691,Zoe Saldana,1
2,25,Dr. Grace Augustine,52fe48009251416c750aca39,1,10205,Sigourney Weaver,2
3,4,Col. Quaritch,52fe48009251416c750ac9cf,2,32747,Stephen Lang,3
4,5,Trudy Chacon,52fe48009251416c750ac9d3,1,17647,Michelle Rodriguez,4
5,8,Selfridge,52fe48009251416c750ac9e1,2,1771,Giovanni Ribisi,5
6,7,Norm Spellman,52fe48009251416c750ac9dd,2,59231,Joel David Moore,6
7,9,Moat,52fe48009251416c750ac9e5,1,30485,CCH Pounder,7
8,11,Eytukan,52fe48009251416c750ac9ed,2,15853,Wes Studi,8
9,10,Tsu'Tey,52fe48009251416c750ac9e9,2,10964,Laz Alonso,9


In [15]:
# Decode the JSON-encoded 'crew' cell of row 0 and show at most 20 entries.
avatar_crew = pd.DataFrame(data=json.loads(credits_df.at[0, 'crew']))
avatar_crew.iloc[:20]

Unnamed: 0,credit_id,department,gender,id,job,name
0,52fe48009251416c750aca23,Editing,0,1721,Editor,Stephen E. Rivkin
1,539c47ecc3a36810e3001f87,Art,2,496,Production Design,Rick Carter
2,54491c89c3a3680fb4001cf7,Sound,0,900,Sound Designer,Christopher Boyes
3,54491cb70e0a267480001bd0,Sound,0,900,Supervising Sound Editor,Christopher Boyes
4,539c4a4cc3a36810c9002101,Production,1,1262,Casting,Mali Finn
5,5544ee3b925141499f0008fc,Sound,2,1729,Original Music Composer,James Horner
6,52fe48009251416c750ac9c3,Directing,2,2710,Director,James Cameron
7,52fe48009251416c750ac9d9,Writing,2,2710,Writer,James Cameron
8,52fe48009251416c750aca17,Editing,2,2710,Editor,James Cameron
9,52fe48009251416c750aca29,Production,2,2710,Producer,James Cameron


### 1.3 Merge the two data frames

In [16]:
# Number of distinct non-null movie ids.
len(movies_df['id'].dropna().unique())

4803

ID is unique for each row in the data frame

In [17]:
# Number of distinct non-null movie titles.
len(movies_df['title'].dropna().unique())

4800

Duplicated titles are found in the data frame

In [18]:
# Titles of the rows whose title already appeared in an earlier row.
movies_df.loc[movies_df['title'].duplicated(), 'title']

2877           The Host
3693    Out of the Blue
4267             Batman
Name: title, dtype: object

In [19]:
# All rows titled 'The Host'.
movies_df.loc[movies_df['title'].eq('The Host')]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
972,44000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",,72710,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,The Host,A parasitic alien soul is injected into the bo...,42.933027,"[{""name"": ""Nick Wechsler Productions"", ""id"": 8...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2013-03-22,63327201,125.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,You will be one of us,The Host,6.0,1817
2877,11000000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 18, ""nam...",http://www.hostmovie.com/,1255,"[{""id"": 1261, ""name"": ""river""}, {""id"": 1880, ""...",ko,괴물,Gang-du is a dim-witted man working at his fat...,27.65527,"[{""name"": ""Cineclick Asia"", ""id"": 685}, {""name...","[{""iso_3166_1"": ""KR"", ""name"": ""South Korea""}]",2006-07-27,88489643,119.0,"[{""iso_639_1"": ""ko"", ""name"": ""\ud55c\uad6d\uc5...",Released,Monsters are real.,The Host,6.7,537


In [20]:
# All rows titled 'Out of the Blue'.
movies_df.loc[movies_df['title'].eq('Out of the Blue')]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
3647,0,"[{""id"": 18, ""name"": ""Drama""}]",,39269,"[{""id"": 4470, ""name"": ""punk""}, {""id"": 10183, ""...",en,Out of the Blue,Dennis Hopper is a hard-drinking truck driver ...,0.679351,"[{""name"": ""Robson Street"", ""id"": 71953}]","[{""iso_3166_1"": ""CA"", ""name"": ""Canada""}]",1980-05-01,0,94.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A harrowing drama from the director of Easy Rider,Out of the Blue,6.5,17
3693,0,"[{""id"": 18, ""name"": ""Drama""}]",,10844,"[{""id"": 2658, ""name"": ""new zealand""}, {""id"": 3...",en,Out of the Blue,Ordinary people find extraordinary courage in ...,0.706355,[],"[{""iso_3166_1"": ""NZ"", ""name"": ""New Zealand""}]",2006-10-12,0,103.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The true story of a small town massacre,Out of the Blue,5.9,18


In [21]:
# All rows titled 'Batman'.
movies_df.loc[movies_df['title'].eq('Batman')]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
1359,35000000,"[{""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""na...",,268,"[{""id"": 848, ""name"": ""double life""}, {""id"": 84...",en,Batman,The Dark Knight of Gotham City begins his war ...,44.104469,"[{""name"": ""PolyGram Filmed Entertainment"", ""id...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",1989-06-23,411348924,126.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Have you ever danced with the devil in the pal...,Batman,7.0,2096
4267,1377800,"[{""id"": 10751, ""name"": ""Family""}, {""id"": 12, ""...",,2661,"[{""id"": 339, ""name"": ""submarine""}, {""id"": 849,...",en,Batman,The Dynamic Duo faces four super-villains who ...,9.815394,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1966-07-30,0,105.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,He's Here Big As Life In A Real Bat-Epic,Batman,6.1,203


Duplicated titles in the data frame come from different movies having the same title

In [22]:
# Count the row positions where the movie id matches the credits movie_id.
(movies_df['id'] == credits_df['movie_id']).sum()

4803

The ID column and the Movie ID column of the two data frames are perfectly aligned

In [23]:
# Attach cast/crew by joining on the movie key instead of copying columns by
# row position, so the result stays correct even if the two files are ordered
# differently. validate='one_to_one' raises if either key column has duplicates.
tmdb_df = movies_df.merge(
    credits_df[['movie_id', 'cast', 'crew']],
    how='left',
    left_on='id',
    right_on='movie_id',
    validate='one_to_one',
).drop(columns='movie_id')  # the join key duplicates 'id'; keep only one
tmdb_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [24]:
# Put 'id' and 'title' first, followed by every other column in alphabetical order.
tmdb_features = sorted(
    column for column in tmdb_df.columns if column not in ('id', 'title')
)
tmdb_df = tmdb_df.reindex(columns=['id', 'title', *tmdb_features])
tmdb_df.iloc[:5]

Unnamed: 0,id,title,budget,cast,crew,genres,homepage,keywords,original_language,original_title,...,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count
0,19995,Avatar,237000000,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,...,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,7.2,11800
1,285,Pirates of the Caribbean: At World's End,300000000,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",6.9,4500
2,206647,Spectre,245000000,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,...,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,6.3,4466
3,49026,The Dark Knight Rises,250000000,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,...,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,7.6,9106
4,49529,John Carter,260000000,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",6.1,2124


In [25]:
# Persist the merged table without writing the row index.
tmdb_df.to_csv(path_or_buf='output/tmdb_5000.csv', index=False)

## 2 Data Cleaning

In [42]:
# Reload the merged table written by the integration step and preview five rows.
tmdb_df = pd.read_csv(filepath_or_buffer='output/tmdb_5000.csv')
tmdb_df.iloc[:5]

Unnamed: 0,id,title,budget,cast,crew,genres,homepage,keywords,original_language,original_title,...,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count
0,19995,Avatar,237000000,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,...,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,7.2,11800
1,285,Pirates of the Caribbean: At World's End,300000000,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",6.9,4500
2,206647,Spectre,245000000,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,...,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,6.3,4466
3,49026,The Dark Knight Rises,250000000,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,...,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,7.6,9106
4,49529,John Carter,260000000,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",6.1,2124


### 2.1 Feature "Homepage"

In [43]:
# Number of non-null homepage entries, and their share of all rows.
valid_homepage = tmdb_df['homepage'].notna().sum()
valid_homepage, valid_homepage / tmdb_df.shape[0]

(1712, 0.3564438892358942)

In [44]:
# Remove the sparsely populated 'homepage' column. Rebinding to the returned
# frame (rather than inplace=True) keeps the step free of in-place mutation.
tmdb_df = tmdb_df.drop(columns=['homepage'])

Fewer than 40% of the movies in the table have a valid homepage; the feature is inconsistent and irrelevant, so it is dropped

### 2.2 Feature "Status"

In [45]:
# Tally how many movies fall under each distinct 'status' value.
tmdb_df['status'].value_counts()

Released           4795
Rumored               5
Post Production       3
Name: status, dtype: int64

In [46]:
# All rows whose status is 'Rumored'.
tmdb_df.loc[tmdb_df['status'].eq('Rumored')]

Unnamed: 0,id,title,budget,cast,crew,genres,keywords,original_language,original_title,overview,...,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count
4401,43630,The Helix... Loaded,0,[],[],"[{""id"": 28, ""name"": ""Action""}, {""id"": 35, ""nam...",[],en,The Helix... Loaded,,...,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2005-01-01,0,97.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Rumored,,4.8,2
4453,57294,Crying with Laughter,0,"[{""cast_id"": 3, ""character"": ""Joey Frisk"", ""cr...","[{""credit_id"": ""52fe492dc3a36847f818d031"", ""de...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 1930, ""name"": ""kidnapping""}, {""id"": 97...",en,Crying with Laughter,Powerfully redemptive and darkly comedic reven...,...,"[{""name"": ""Scottish Screen"", ""id"": 698}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",2009-06-01,0,93.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Rumored,A Bad Trip Down Memory Lane,7.0,1
4508,70875,The Harvest (La Cosecha),56000,[],"[{""credit_id"": ""52fe4816c3a368484e0e8bbd"", ""de...","[{""id"": 99, ""name"": ""Documentary""}]","[{""id"": 1729, ""name"": ""migration""}, {""id"": 190...",en,The Harvest (La Cosecha),The story of the children who work 12-14 hour ...,...,[],[],2011-07-29,0,80.0,[],Rumored,,0.0,0
4662,40963,Little Big Top,0,"[{""cast_id"": 0, ""character"": ""Seymour"", ""credi...",[],"[{""id"": 35, ""name"": ""Comedy""}]","[{""id"": 10183, ""name"": ""independent film""}]",en,Little Big Top,An aging out of work clown returns to his smal...,...,"[{""name"": ""Fly High Films"", ""id"": 24248}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2006-01-01,0,0.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Rumored,,10.0,1
4754,84659,The Naked Ape,0,"[{""cast_id"": 1, ""character"": ""Alex"", ""credit_i...","[{""credit_id"": ""52fe49049251416c910a00b3"", ""de...","[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""name...",[],en,The Naked Ape,The Naked Ape is a coming-of-age film followin...,...,[],[],2006-09-16,0,110.0,[],Rumored,,5.0,1


In [47]:
# Share of movies whose status is 'Rumored'. The status is selected by label:
# an integer key on the label-indexed value_counts() result relies on the
# deprecated positional fallback and on how the counts happen to be ordered.
(tmdb_df['status'] == 'Rumored').mean()

0.0010410160316468874

In [48]:
# All rows whose status is 'Post Production'.
tmdb_df.loc[tmdb_df['status'].eq('Post Production')]

Unnamed: 0,id,title,budget,cast,crew,genres,keywords,original_language,original_title,overview,...,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count
2906,357837,Dancin' It's On,0,"[{""cast_id"": 0, ""character"": ""Jennifer"", ""cred...","[{""credit_id"": ""55e85c30c3a3682c62002869"", ""de...","[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...","[{""id"": 246, ""name"": ""dancing""}, {""id"": 271, ""...",cs,Dancin' It's On,"This coming of age Dance Film, in the spirit o...",...,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2015-10-16,0,89.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Post Production,Dancin' Like You've Never Seen Before,4.3,2
4169,295886,Brotherly Love,1900000,"[{""cast_id"": 1, ""character"": ""Jackie Taylor"", ...","[{""credit_id"": ""54d3585cc3a3686ab90036ff"", ""de...","[{""id"": 18, ""name"": ""Drama""}]",[],en,Brotherly Love,West Philadelphia basketball star Sergio Taylo...,...,[],[],2015-04-24,0,89.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Post Production,,6.9,21
4178,50875,Higher Ground,2000000,"[{""cast_id"": 6, ""character"": ""Corinne Walker"",...","[{""credit_id"": ""52fe47dac3a36847f814bcb5"", ""de...","[{""id"": 18, ""name"": ""Drama""}, {""id"": 53, ""name...","[{""id"": 1009, ""name"": ""baby""}, {""id"": 1157, ""n...",en,Higher Ground,A chronicle of one woman's lifelong struggle w...,...,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2011-08-26,841733,109.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Post Production,,5.3,14


In [49]:
# Share of movies whose status is 'Post Production'. The status is selected by
# label: an integer key on the label-indexed value_counts() result relies on
# the deprecated positional fallback and on how the counts happen to be ordered.
(tmdb_df['status'] == 'Post Production').mean()

0.0006246096189881324

In [50]:
# Keep only released movies, then remove the now-constant 'status' column.
# Chaining a non-inplace drop avoids mutating a filtered slice in place,
# which is what triggers pandas' SettingWithCopyWarning.
tmdb_df = tmdb_df.loc[tmdb_df['status'] == 'Released'].drop(columns=['status'])

Only an insignificant portion of the movies, most of which are not popular, have the status 'Rumored' or 'Post Production', so only 'Released' movies are kept

In [51]:
# Per column: True when the column has no missing values.
tmdb_df.notna().all()

id                       True
title                    True
budget                   True
cast                     True
crew                     True
genres                   True
keywords                 True
original_language        True
original_title           True
overview                False
popularity               True
production_companies     True
production_countries     True
release_date            False
revenue                  True
runtime                 False
spoken_languages         True
tagline                 False
vote_average             True
vote_count               True
dtype: bool

### 2.3 Feature "Overview"

In [52]:
# Tally the Python type of each 'overview' value.
tmdb_df['overview'].map(type).value_counts()

<class 'str'>      4792
<class 'float'>       3
Name: overview, dtype: int64

In [53]:
# Rows whose 'overview' value is not a plain str.
tmdb_df.loc[[type(text) is not str for text in tmdb_df['overview']]]

Unnamed: 0,id,title,budget,cast,crew,genres,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,vote_average,vote_count
2656,370980,Chiamatemi Francesco - Il Papa della gente,15000000,"[{""cast_id"": 5, ""character"": ""Jorge Mario Berg...","[{""credit_id"": ""5660019ac3a36875f100252b"", ""de...","[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 717, ""name"": ""pope""}, {""id"": 5565, ""na...",it,Chiamatemi Francesco - Il Papa della gente,,0.738646,"[{""name"": ""Taodue Film"", ""id"": 45724}]","[{""iso_3166_1"": ""IT"", ""name"": ""Italy""}]",2015-12-03,0,,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",,7.3,12
4140,459488,"To Be Frank, Sinatra at 100",2,"[{""cast_id"": 0, ""character"": ""Narrator"", ""cred...","[{""credit_id"": ""592b25e4c3a368783e065a2f"", ""de...","[{""id"": 99, ""name"": ""Documentary""}]","[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,...",en,"To Be Frank, Sinatra at 100",,0.050625,"[{""name"": ""Eyeline Entertainment"", ""id"": 60343}]","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",2015-12-12,0,,[],,0.0,0
4431,292539,Food Chains,913000,[],"[{""credit_id"": ""5470c3b1c3a368085e000abd"", ""de...","[{""id"": 99, ""name"": ""Documentary""}]",[],de,Food Chains,,0.795698,[],[],2014-04-26,0,83.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",,7.4,8


In [54]:
# Summary statistics of the overview text length, in characters.
tmdb_df['overview'].str.len().describe()

count    4792.000000
mean      305.595576
std       162.663506
min        23.000000
25%       178.000000
50%       284.000000
75%       392.000000
max      1000.000000
Name: overview, dtype: float64

In [55]:
# Drop the rows without an overview. dropna(subset=...) is the built-in,
# vectorized way to remove missing values, replacing a per-row Python type check.
tmdb_df = tmdb_df.dropna(subset=['overview'])

Only an insignificant portion of the movies, most of which are not popular, lack a proper overview, so they are dropped

In [57]:
# Per column: True when the column has no missing values.
tmdb_df.notna().all()

id                       True
title                    True
budget                   True
cast                     True
crew                     True
genres                   True
keywords                 True
original_language        True
original_title           True
overview                 True
popularity               True
production_companies     True
production_countries     True
release_date            False
revenue                  True
runtime                  True
spoken_languages         True
tagline                 False
vote_average             True
vote_count               True
dtype: bool

### 2.4 Feature "Release Date"

In [56]:
# Tally the Python type of each 'release_date' value.
tmdb_df['release_date'].map(type).value_counts()

<class 'str'>      4791
<class 'float'>       1
Name: release_date, dtype: int64

In [59]:
# Rows whose 'release_date' value is not a plain str.
tmdb_df.loc[[type(date) is not str for date in tmdb_df['release_date']]]

Unnamed: 0,id,title,budget,cast,crew,genres,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,vote_average,vote_count
4553,380097,America Is Still the Place,0,[],[],[],[],en,America Is Still the Place,1971 post civil rights San Francisco seemed li...,0.0,[],[],,0,0.0,[],,0.0,0


In [60]:
# Drop the rows without a release date. dropna(subset=...) is the built-in,
# vectorized way to remove missing values, replacing a per-row Python type check.
tmdb_df = tmdb_df.dropna(subset=['release_date'])

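The only movie without a release date is obviously invalid data (empty cast, crew and genres, zero budget and runtime), so it is dropped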

In [61]:
# Per column: True when the column has no missing values.
tmdb_df.notna().all()

id                       True
title                    True
budget                   True
cast                     True
crew                     True
genres                   True
keywords                 True
original_language        True
original_title           True
overview                 True
popularity               True
production_companies     True
production_countries     True
release_date             True
revenue                  True
runtime                  True
spoken_languages         True
tagline                 False
vote_average             True
vote_count               True
dtype: bool

### 2.5 Feature "Tagline"

In [62]:
# Tally the Python type of each 'tagline' value.
tmdb_df['tagline'].map(type).value_counts()

<class 'str'>      3957
<class 'float'>     834
Name: tagline, dtype: int64

In [63]:
# Share of movies with a missing tagline. Computed directly from the null mask:
# an integer key on a value_counts() result indexed by type objects relies on
# the deprecated positional fallback and on how the counts happen to be ordered.
tmdb_df['tagline'].isna().mean()

0.17407639323731997

In [64]:
# Summary statistics of the tagline text length, in characters.
tmdb_df['tagline'].str.len().describe()

count    3957.000000
mean       41.993935
std        24.268613
min         3.000000
25%        26.000000
50%        36.000000
75%        52.000000
max       252.000000
Name: tagline, dtype: float64

In [73]:
# Preview the first five taglines.
tmdb_df['tagline'].iloc[:5]

0                       Enter the World of Pandora.
1    At the end of the world, the adventure begins.
2                             A Plan No One Escapes
3                                   The Legend Ends
4              Lost in our world, found in another.
Name: tagline, dtype: object

In [74]:
# Remove the 'tagline' column. tmdb_df is a filtered subset at this point, so
# rebinding to the returned frame (rather than inplace=True) avoids pandas'
# SettingWithCopyWarning.
tmdb_df = tmdb_df.drop(columns=['tagline'])

Tagline is not important for data mining in this case as more detailed information is given by the overview

In [75]:
# Per column: True when the column has no missing values.
tmdb_df.notna().all()

id                      True
title                   True
budget                  True
cast                    True
crew                    True
genres                  True
keywords                True
original_language       True
original_title          True
overview                True
popularity              True
production_companies    True
production_countries    True
release_date            True
revenue                 True
runtime                 True
spoken_languages        True
vote_average            True
vote_count              True
dtype: bool

In [80]:
# Persist the cleaned table without writing the row index.
tmdb_df.to_csv(path_or_buf='output/tmdb_5000_cleaned.csv', index=False)