## Preparation for Feature Engineer of Movie Data

#### Convert Nested Categorical Columns to 'name' Key Value:
   - **genres**
   - **keywords**
   - **production_companies**
   - **production_countries**
   - **spoken_languages**

#### Convert movie_df Datetime object to:
   - **release_day_of_week** (string) Day of Week movie was released on
   - **release_month** (numerical) Month of movie release

In [1]:
%load_ext watermark
%watermark -a "Emily Schoof" -d -t -v -p numpy,pandas,re

Emily Schoof 2019-08-22 15:34:28 

CPython 3.7.3
IPython 7.4.0

numpy 1.16.2
pandas 0.24.2
re 2.2.1


In [2]:
# Load the dataset
%store -r movie_df_cleaner
movie_df = movie_df_cleaner.copy()
print(movie_df.shape)
movie_df.head()

(4641, 10)


Unnamed: 0,title,revenue,genres,keywords,original_language,production_companies,production_countries,runtime,spoken_languages,release_date_dt
0,Avatar,2787965000.0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",2009-12-10
1,Titanic,1845034000.0,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...","[{""id"": 2580, ""name"": ""shipwreck""}, {""id"": 298...",en,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",194.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",1997-11-18
2,The Avengers,1519558000.0,"[{""id"": 878, ""name"": ""Science Fiction""}, {""id""...","[{""id"": 242, ""name"": ""new york""}, {""id"": 5539,...",en,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",143.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",2012-04-25
3,Jurassic World,1513529000.0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1299, ""name"": ""monster""}, {""id"": 1718,...",en,"[{""name"": ""Universal Studios"", ""id"": 13}, {""na...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",124.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",2015-06-09
4,Furious 7,1506249000.0,"[{""id"": 28, ""name"": ""Action""}]","[{""id"": 830, ""name"": ""car race""}, {""id"": 3428,...",en,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...","[{""iso_3166_1"": ""JP"", ""name"": ""Japan""}, {""iso_...",137.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",2015-04-01


## Extract 'name' key from Nested Columns

In [3]:
# Import necessary modules
import re
import pandas as pd
import numpy as np

In [4]:
# Define patterns to strip from nested data columns
name_pattern = re.compile('[A-Z]+\s?[A-Za-z]+')
keyword_pattern = re.compile('[a-z]+\s?[a-z]+')

In [5]:
def extract_names(series):
    """ Create New Series of Nested Names """
    
    new_series = []
    
    for row in series:
        s = ' '.join(re.findall(name_pattern, row))
        new_series.append(s)
        
    return pd.Series(new_series)

In [6]:
def extract_keywords(series):
    """ Create New Series of Nested Keyword Names """
    
    new_series = []
    
    for row in series:
        s = ' '.join(re.findall(keyword_pattern, row))
        new_s = s.replace('id', '').replace('name', '')
        new_series.append(new_s)
        
    return pd.Series(new_series)

Genres

In [7]:
# Extract names
new_genres = extract_names(movie_df.genres)

# Replace original column
movie_df['genres'] = new_genres

Keywords

In [8]:
# Extract names
new_keywords = extract_keywords(movie_df.keywords)

# Replace original column
movie_df['keywords'] = new_keywords

Production Companies

In [9]:
# Extract names
new_production_companies = extract_names(movie_df.production_companies)

# Replace original column
movie_df['production_companies'] = new_production_companies

Production Countries

In [10]:
# Extract names
new_production_countries = extract_names(movie_df.production_countries)

# Replace original column
movie_df['production_countries'] = new_production_countries

Spoken Languages

In [11]:
# Extract names
new_spoken_languages = extract_names(movie_df.spoken_languages)

# Replace original column
movie_df['spoken_languages'] = new_spoken_languages

## Convert Timestamp Object to String Values

Create a function that converts timestamp objects to integers

In [12]:
def to_integer(dt_time):
    return 10000*dt_time.year + 100*dt_time.month + dt_time.day

In [13]:
def dt_column_to_int(df, column):
    
    list_int = []
    
    for dt in df[column]:
        new_dt = to_integer(dt)
        list_int.append(new_dt)
    
    return list_int

Release Date

In [14]:
# Replace release_date_dt with int values
movie_df['release_date'] = dt_column_to_int(movie_df, 'release_date_dt')

Release Day of Week

In [15]:
len(movie_df['release_date'])

4641

In [16]:
# Identify which month and day of the week of the movie release
months = []
days = []
for date in movie_df['release_date_dt']:
    day = date.day_name()
    month = date.month
    days.append(day)
    months.append(month)

In [17]:
# Add days to dataframe
movie_df['release_day_of_week'] = pd.Series(days, index=np.arange(0, 4641))
movie_df['release_month'] = pd.Series(months, index=np.arange(0, 4641))

In [18]:
movie_df.dtypes

title                           object
revenue                        float64
genres                          object
keywords                        object
original_language               object
production_companies            object
production_countries            object
runtime                        float64
spoken_languages                object
release_date_dt         datetime64[ns]
release_date                     int64
release_day_of_week             object
release_month                    int64
dtype: object

In [19]:
# Verify Output
movie_df.head()

Unnamed: 0,title,revenue,genres,keywords,original_language,production_companies,production_countries,runtime,spoken_languages,release_date_dt,release_date,release_day_of_week,release_month
0,Avatar,2787965000.0,Action Adventure Fantasy Science Fiction,culture clash future space war space c...,en,Ingenious Film Partners Twentieth Century Fox ...,US United States America GB United Kingdom,162.0,English Espa,2009-12-10,20091210,Thursday,12
1,Titanic,1845034000.0,Drama Romance Thriller,shipwreck iceberg ship panic titanic...,en,Paramount Pictures Twentieth Century Fox Film ...,US United States America,194.0,English Fran Deutsch Italiano,1997-11-18,19971118,Tuesday,11
2,The Avengers,1519558000.0,Science Fiction Action Adventure,new york shield marvel comic superhero...,en,Paramount Pictures Marvel Studios,US United States America,143.0,English,2012-04-25,20120425,Wednesday,4
3,Jurassic World,1513529000.0,Action Adventure Science Fiction Thriller,monster dna tyrannosaurus rex velocira...,en,Universal Studios Amblin Entertainment Legenda...,US United States America,124.0,English,2015-06-09,20150609,Tuesday,6
4,Furious 7,1506249000.0,Action,car race speed revenge suspense car ...,en,Universal Pictures Original Film Fuji Televisi...,JP Japan US United States America,137.0,English,2015-04-01,20150401,Wednesday,4


In [20]:
movie_df.shape

(4641, 13)

In [21]:
# Store modified movie_df globally
movie_df_unnested = movie_df.copy().drop(columns='release_date_dt')
%store movie_df_unnested 

Stored 'movie_df_unnested' (DataFrame)
