In [1]:
import pandas as pd 
import numpy as np 
import re

In [2]:
# Load the datasets
movies = pd.read_csv('Datasets//tmdb_5000_movies.csv')

In [3]:
# look and understand the dataset
movies.tail()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
4798,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]","[{""iso_3166_1"": ""MX"", ""name"": ""Mexico""}, {""iso...",1992-09-04,2040920,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238
4799,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],[],2011-12-26,0,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5
4800,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2013-10-13,0,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6
4801,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-05-03,0,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7
4802,0,"[{""id"": 99, ""name"": ""Documentary""}]",,25975,"[{""id"": 1523, ""name"": ""obsession""}, {""id"": 224...",en,My Date with Drew,Ever since the second grade when he first saw ...,1.929883,"[{""name"": ""rusty bear entertainment"", ""id"": 87...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2005-08-05,0,90.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,My Date with Drew,6.3,16


In [4]:
# lets look at the datatype of each column
movies.dtypes

budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
dtype: object

In [5]:
# Drop unnecessary columns
cols_to_drop = ['original_title', 'homepage', 'status', 'vote_average', 'id', 'spoken_languages']
movies.drop(cols_to_drop, axis=1, inplace=True)

In [6]:
# change "release_date" to datetime data type
movies['release_date'] = pd.to_datetime(movies['release_date'], infer_datetime_format=True)

### Now let's deal with Missing values

In [7]:
# A glimpse at what's missing
for column in movies.columns:
    null_count = movies[column].isnull().sum()
    print(f'{column} — {round((null_count/movies.shape[0]) * 100)}% MISSING and total missing is: {null_count}')

budget — 0% MISSING and total missing is: 0
genres — 0% MISSING and total missing is: 0
keywords — 0% MISSING and total missing is: 0
original_language — 0% MISSING and total missing is: 0
overview — 0% MISSING and total missing is: 3
popularity — 0% MISSING and total missing is: 0
production_companies — 0% MISSING and total missing is: 0
production_countries — 0% MISSING and total missing is: 0
release_date — 0% MISSING and total missing is: 1
revenue — 0% MISSING and total missing is: 0
runtime — 0% MISSING and total missing is: 2
tagline — 18% MISSING and total missing is: 844
title — 0% MISSING and total missing is: 0
vote_count — 0% MISSING and total missing is: 0


### Tagline has the most missing values: 18% OF THE DATA and 844 values in total. We can fill the missing with 'No Tagline'

In [8]:
movies.tagline.fillna('No Tagline', inplace=True)

In [9]:
# let's drop the remaining rows with missing values
movies.dropna(axis=0, inplace=True)

In [10]:
# let's take another look at the dataset
movies.head(3)

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,tagline,title,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,Enter the World of Pandora.,Avatar,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,A Plan No One Escapes,Spectre,4466


In [11]:
# extract the values of the key 'name' in genres column
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
# regex pattern to subtract any non string characters
pattern = r"[^a-zA-Z_]+"

# function to wrap up to regex
def extract(column):
    rows_list = []
    for row in column:
        string = re.sub(pattern=pattern, repl=' ', string=row, flags=re.IGNORECASE)
        rows_list.append(string.replace('id', '').replace('name', '').split('  '))
    return list(rows_list)

In [13]:
# update the column
movies['genres'] = extract(movies['genres'])

In [14]:
movies['genres'][0]

['', ' Action', ' Adventure', ' Fantasy', ' Science Fiction ']

## strip the spaces and remove the empty string

In [15]:
def strip(col):
    column_list = []
    for row in col:
        row = [x.strip(' ') for x in row if x]
        column_list.append(row)
    return column_list

In [16]:
movies['genres'] = strip(movies['genres'])

In [17]:
movies['genres'][0]

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

## Exactly what we want

In [18]:
# let's look at keywords column
movies['keywords'][0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [19]:
# we can apply previous function to this column too
movies['keywords'] = extract(movies['keywords'])

In [20]:
# strip the white spaces and empty string
movies['keywords'] = strip(movies['keywords'])

In [21]:
# what we've done so far
movies['keywords'][0]

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 'd']

In [22]:
# let's take a closer look at this column
movies['production_companies'][1]

'[{"name": "Walt Disney Pictures", "id": 2}, {"name": "Jerry Bruckheimer Films", "id": 130}, {"name": "Second Mate Productions", "id": 19936}]'

In [23]:
# regex pattern to work with
pattern = r"[^a-zA-Z_]+"

# function to do the work
def extract_values(column):
    rows_list = []
    for row in column:
        string = re.sub(pattern=pattern, repl=' ', string=row, flags=re.IGNORECASE)
        rows_list.append(string.replace('id', '').replace('name', '').split('  '))
    return list(rows_list)

In [24]:
# let's apply previous function here too
movies['production_companies'] = extract_values(movies['production_companies'])

In [25]:
movies['production_companies'][1]

['',
 'Walt Disney Pictures',
 ' Jerry Bruckheimer Films',
 ' Second Mate Productions',
 '']

In [26]:
movies['production_companies'] = strip(movies['production_companies'])

In [27]:
# see what the function does
movies['production_companies'][1]

['Walt Disney Pictures', 'Jerry Bruckheimer Films', 'Second Mate Productions']

In [28]:
# now let's look at production countries
movies['production_countries'][2]

'[{"iso_3166_1": "GB", "name": "United Kingdom"}, {"iso_3166_1": "US", "name": "United States of America"}]'

In [29]:
# we need to modify previous function a bit

pattern = r"[^a-zA-Z]+"
def extract_prod_countries(column):
    rows_list = []
    for row in column:
        string = re.sub(pattern=pattern, repl=' ', string=row, flags=re.IGNORECASE)
        rows_list.append(string.replace('iso', '').replace('name', '').split('  '))
    return list(rows_list)

In [30]:
# update the column
movies['production_countries'] = extract_prod_countries(movies['production_countries'])

In [31]:
movies['production_countries'][2]

['', 'GB', 'United Kingdom', 'US', 'United States of America ']

In [32]:
movies['production_countries'] = strip(movies['production_countries'])

In [33]:
movies['production_countries'][2]

['GB', 'United Kingdom', 'US', 'United States of America']

### now let's take a look at all the columns we cleaned

In [34]:
movies.head()

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,tagline,title,vote_count
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...","[US, United States of America, GB, United King...",2009-12-10,2787965087,162.0,Enter the World of Pandora.,Avatar,11800
1,300000000,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...","[US, United States of America]",2007-05-19,961000000,169.0,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,4500
2,245000000,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",en,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq, B]","[GB, United Kingdom, US, United States of Amer...",2015-10-26,880674609,148.0,A Plan No One Escapes,Spectre,4466
3,250000000,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret e...",en,Following the death of District Attorney Harve...,112.31295,"[Legendary Pictures, Warner Bros, DC Entertain...","[US, United States of America]",2012-07-16,1084939099,165.0,The Legend Ends,The Dark Knight Rises,9106
4,260000000,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",en,"John Carter is a war-weary, former military ca...",43.926995,[Walt Disney Pictures],"[US, United States of America]",2012-03-07,284139100,132.0,"Lost in our world, found in another.",John Carter,2124


### let's add another column to calculate the profit of each movie

In [35]:
movies['Profit'] = movies['revenue'] - movies['budget']

In [36]:
movies.isna().sum()

budget                  0
genres                  0
keywords                0
original_language       0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
tagline                 0
title                   0
vote_count              0
Profit                  0
dtype: int64

In [37]:
# everything is cleaned
movies.head(3)

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,tagline,title,vote_count,Profit
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...","[US, United States of America, GB, United King...",2009-12-10,2787965087,162.0,Enter the World of Pandora.,Avatar,11800,2550965087
1,300000000,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...","[US, United States of America]",2007-05-19,961000000,169.0,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,4500,661000000
2,245000000,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",en,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq, B]","[GB, United Kingdom, US, United States of Amer...",2015-10-26,880674609,148.0,A Plan No One Escapes,Spectre,4466,635674609


In [38]:
movies.to_csv('../EDA/Datasets/tmdb_cleaned.csv')