## Feature Engineer of Movie Data

#### A. Create Variables for Categorical Columns with Nested Data:
   - **genres**
   - **keywords**
   - **production_companies**
   - **production_countries**
   - **spoken_languages**
- Organize Nested List of Dictionaries by Movie Title
- Create feature variables [via pd.dummies] based on Column trends of each Movie Title
- Merge column dataframes to movie_df data
   
#### B.  Apply Random Forest for *tagline* column predictions

In [1]:
%load_ext watermark
%watermark -a "Emily Schoof" -d -t -v -p numpy,pandas,matplotlib

Emily Schoof 2019-08-14 21:30:55 

CPython 3.7.3
IPython 7.4.0

numpy 1.16.2
pandas 0.24.2
matplotlib 3.0.3


In [2]:
# Import necessary modules
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import datetime
import os
PROJ_ROOT = os.path.join(os.pardir)

In [3]:
# Load the dataset
%store -r movie_df
movie_df.shape

(4651, 15)

#### Part 1: Organize Nested List of Dictionaries by Movie Title

In [4]:
# Import necessary modules
import ast

Create function to convert strings to list of dictionaries 

In [5]:
def listdictstr_to_listdictkey(data):
    """Convert Column of Dictionary Strings to Column of Lists of Dictionaries and Return Unique Keys"""
    
    dict_list = []
    unique_keys = []
    
    # Convert dictionary string to list of lists of dictionaries
    for instance in data:     
        dl = ast.literal_eval(instance)
        dict_list.append(dl)
    
    # Select unique keys
    for lists in dict_list:
        for d in lists:
            for key in d:
                unique_keys.append(key)
    unique_keys = set(unique_keys)
    
    return dict_list, unique_keys

Create a function that produces a dataframe from the column list of dictionaries

In [6]:
def dictlist_to_dataframe(dict_list, unique_keys):
    """Convert Column of Lists of Dictionaries to one Merged Dataframe"""
    
    columns = list(unique_keys)
    new_df = pd.DataFrame(columns=columns)

    for i in range(len(dict_list)):
    
        # Select movie title by matching index in list
        dlist = dict_list[i]
        movie = movie_df.iloc[i, 0]
    
        # Create dataframe with columns for nested data
        df = pd.DataFrame(dlist, columns=columns) 
        df['title'] = movie
    
        # Merge movie title instances into one combined dataframe
        new_df = new_df.append(df, sort=False, ignore_index=True)
    
    return new_df

Genres

In [7]:
# Genres to list/keys
genres_list, genres_keys = listdictstr_to_listdictkey(movie_df.genres)
print(len(genres_list))

# Genres Dataframe
genres_df = dictlist_to_dataframe(genres_list, genres_keys)
print(len(genres_df))

4651
11818


In [8]:
# Drop id column
genres_df = genres_df.drop(columns='id')

# Define unique entries
unique_genres = genres_df['name'].unique()
len(unique_genres)

20

In [9]:
unique_genres

array(['Action', 'Adventure', 'Fantasy', 'Science Fiction', 'Crime',
       'Drama', 'Thriller', 'Animation', 'Family', 'Western', 'Comedy',
       'Romance', 'Horror', 'Mystery', 'History', 'War', 'Music',
       'Documentary', 'Foreign', 'TV Movie'], dtype=object)

Keywords

In [10]:
# Keywords to list/kets
keywords_list, keywords_keys = listdictstr_to_listdictkey(movie_df.keywords)
print(len(keywords_list))

# Keywords Dataframe
keywords_df = dictlist_to_dataframe(keywords_list, keywords_keys)
print(len(keywords_df))

4651
35049


In [11]:
# Drop id column
keywords_df = keywords_df.drop(columns='id')

# Define unique entries
unique_keys = keywords_df['name'].unique()
len(unique_keys)

9595

In [12]:
unique_keys

array(['culture clash', 'future', 'space war', ..., 'paper knife',
       'guitar case', 'postal worker'], dtype=object)

Production Companies

In [13]:
# Production Companies to list/keys
prod_comp_list, prod_comp_keys = listdictstr_to_listdictkey(movie_df.production_companies)
print(len(prod_comp_list))

# Production Companies Dataframe
prod_companies_df = dictlist_to_dataframe(prod_comp_list, prod_comp_keys)
print(len(prod_companies_df))

4651
13461


In [14]:
# Drop id column
prod_companies_df = prod_companies_df.drop(columns='id')

# Define unique entries
unique_companies = prod_companies_df['name'].unique()
len(unique_companies)

4931

In [15]:
unique_companies

array(['Ingenious Film Partners',
       'Twentieth Century Fox Film Corporation', 'Dune Entertainment',
       ..., 'Front Street Pictures', 'rusty bear entertainment',
       'lucky crow films'], dtype=object)

Production Countries

In [16]:
# Production Countries to list/keys
prod_count_list, prod_count_keys = listdictstr_to_listdictkey(movie_df.production_countries)
print(len(prod_count_list))

# Production Countries Dataframe
prod_countries_df = dictlist_to_dataframe(prod_count_list, prod_count_keys)
print(len(prod_countries_df))
prod_countries_df.head(1)

4651
6276


Unnamed: 0,name,iso_3166_1,title
0,United States of America,US,Avatar


In [17]:
# Drop name column
prod_countries_df= prod_countries_df.drop(columns='name')

# Rename iso column
prod_countries_df.columns = ['name', 'title']

# Define unique entries
unique_countries = prod_countries_df['name'].unique()
len(unique_countries)

88

In [18]:
unique_countries

array(['US', 'GB', 'JM', 'BS', 'DM', 'CZ', 'PL', 'SI', 'NZ', 'DE', 'CN',
       'CA', 'IT', 'JP', 'MT', 'AU', 'FR', 'BE', 'IN', 'NL', 'ES', 'AE',
       'HK', 'TW', 'IE', 'MA', 'HU', 'SG', 'NO', 'SE', 'ZA', 'RU', 'RO',
       'MX', 'MC', 'CH', 'PK', 'MY', 'FI', 'IS', 'DK', 'TN', 'PH', 'BG',
       'KR', 'BR', 'PE', 'LU', 'BA', 'KZ', 'PT', 'AW', 'LY', 'RS', 'UA',
       'CL', 'AR', 'PA', 'AT', 'GR', 'LT', 'KH', 'TH', 'SK', 'IL', 'FJ',
       'CS', 'TR', 'NG', 'CY', 'JO', 'BO', 'EC', 'CO', 'EG', 'BT', 'LB',
       'KG', 'DZ', 'ID', 'GY', 'IR', 'GP', 'AF', 'AO', 'DO', 'CM', 'KE'],
      dtype=object)

Spoken Languages

In [19]:
# Spoken Languages to list/keys
spoken_lang_list, spoken_lang_keys = listdictstr_to_listdictkey(movie_df.spoken_languages)
print(len(spoken_lang_list))

# Spoken Languages Dataframe
spoken_lang_df = dictlist_to_dataframe(spoken_lang_list, spoken_lang_keys)
print(len(spoken_lang_df))
spoken_lang_df.head(1)

4651
6742


Unnamed: 0,name,iso_639_1,title
0,English,en,Avatar


In [20]:
# Drop name column
spoken_lang_df = spoken_lang_df.drop(columns='name')

# Rename iso column
spoken_lang_df.columns = ['name', 'title']

# Define unique entries
unique_language = spoken_lang_df['name'].unique()
len(unique_language)

87

In [21]:
unique_language

array(['en', 'es', 'fr', 'it', 'de', 'tr', 'el', 'zh', 'th', 'is', 'ru',
       'sv', 'ro', 'ja', 'la', 'hi', 'pt', 'bo', 'fa', 'ur', 'ar', 'sa',
       'gd', 'cs', 'cn', 'ko', 'no', 'ta', 'nv', 'he', 'da', 'nl', 'af',
       'ga', 'so', 'fi', 'sw', 'bg', 'yi', 'vi', 'hu', 'uk', 'eo', 'am',
       'km', 'ce', 'pl', 'co', 'pa', 'et', 'sq', 'sr', 'bs', 'hr', 'tl',
       'sh', 'sk', 'kk', 'ml', 'te', 'cy', 'hy', 'iu', 'wo', 'xh', 'ny',
       'st', 'zu', 'kw', 'si', 'ne', 'ps', 'mn', 'gl', 'xx', 'ka', 'bn',
       'ku', 'mi', 'to', 'ca', 'br', 'dz', 'ky', 'id', 'bm', 'sl'],
      dtype=object)

#### Part 2: Create feature variables based on Column trends of each Movie Title

In [35]:
# Make copy of movie_df
movie_df_modified = movie_df.copy()

# Drop original columns that have been modified with pd.Dummies
movie_df_modified = movie_df_modified.drop(columns=['genres', 'keywords', 'production_companies',
                                  'production_countries', 'spoken_languages'])
print(movie_df_modified.shape)
movie_df_modified.columns

(4651, 10)


Index(['title', 'tagline', 'revenue', 'budget', 'id', 'original_language',
       'overview', 'runtime', 'status', 'release_date_dt'],
      dtype='object')

In [36]:
# Create combined lists for value prefixes, unique lists, dataframes, and dict_titles
value_prefixes = ['genre_', 'key_', 'company_', 'country_', 'lang_']
unique_list = [unique_genres, unique_keys, unique_companies, unique_countries, unique_language]
dataframes = [genres_df, keywords_df, prod_companies_df, prod_countries_df, spoken_lang_df]
dict_title = [{}, {}, {}, {}, {}]

In [37]:
# Remove spaces with underscores in all unique_list values
for ulist in unique_list:
    for idx, val in enumerate(ulist):
        ulist[idx] = val.replace(" ", "_")

In [38]:
# Test output
unique_list[0], unique_list[1]

(array(['Action', 'Adventure', 'Fantasy', 'Science_Fiction', 'Crime',
        'Drama', 'Thriller', 'Animation', 'Family', 'Western', 'Comedy',
        'Romance', 'Horror', 'Mystery', 'History', 'War', 'Music',
        'Documentary', 'Foreign', 'TV_Movie'], dtype=object),
 array(['culture_clash', 'future', 'space_war', ..., 'paper_knife',
        'guitar_case', 'postal_worker'], dtype=object))

Add Feature Columns for Each Unique Value per Categorical Column -- **WARNING: SLOW FUNCTION**

In [39]:
# Add dataframe columns for each value in lists
for idx in range(0, 5):
    
    ulist = unique_list[idx]
    prefix = value_prefixes[idx]
    
    # Iterate over each unique value within a categorical columns
    for entry in ulist:
        col_name = prefix + entry 
        movie_df_modified[col_name] = 0 #set default to 0

Create Title:List_Value Key:Value Pairs -- **WARNING: SLOW FUNCTION**

In [40]:
# Add dataframe columns for each value in lists
for idx in range(0, 5):
    
    dfc = dataframes[idx]
    dict_t = dict_title[idx]

    # Convert dataframe rows to title:valuelist key pairs
    for t in dfc['title']:
        title_df = dfc.loc[(dfc['title'] == t)]
        name_list = list(title_df['name'])
        dict_t[t] = name_list

Replaces column contents with new values in specific rows/movie instances -- **WARNING: SLOW FUNCTION**

In [41]:
for idx in range(0, 5):
    
    prefix = value_prefixes[idx]
    dict_t = dict_title[idx]
    
    for t, val in dict_t.items():
        t_index = movie_df_modified.loc[movie_df_modified['title'] == t].index.astype(int)[0] # Define title index
        
        for i,v in enumerate(val):
            val[i] = v.replace(" ", "_")
        
        for feature in val:
            column_name = prefix + feature # Define column name
        
            if column_name in movie_df_modified.columns:
                # Replace 0 with 1 if present
                movie_df_modified.at[t_index, column_name] = 1

In [29]:
movie_df_modified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4651 entries, 0 to 4802
Columns: 14731 entries, title to lang_sl
dtypes: datetime64[ns](1), float64(1), int64(14724), object(5)
memory usage: 522.9+ MB


In [30]:
movie_df_modified.describe()

Unnamed: 0,revenue,budget,id,runtime,genre_Action,genre_Adventure,genre_Fantasy,genre_Science_Fiction,genre_Crime,genre_Drama,...,lang_ku,lang_mi,lang_to,lang_ca,lang_br,lang_dz,lang_ky,lang_id,lang_bm,lang_sl
count,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,...,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0,4651.0
mean,84177290.0,29876640.0,57286.919802,106.826059,0.243604,0.165771,0.090088,0.113094,0.14685,0.475167,...,0.000215,0.00043,0.000215,0.000215,0.000215,0.000215,0.000215,0.00043,0.000215,0.000215
std,164880400.0,41088980.0,88208.072246,21.115017,0.429302,0.371915,0.286339,0.316742,0.353995,0.499437,...,0.014663,0.020735,0.014663,0.014663,0.014663,0.014663,0.014663,0.020735,0.014663,0.014663
min,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,900000.0,9095.5,94.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,20211390.0,15000000.0,14635.0,103.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,95711670.0,40000000.0,59961.5,117.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2787965000.0,380000000.0,447027.0,338.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


*Observations*: This snippet of code shows that, now, the maximum in each engineered feature column is 1 while the minimum is 0. This is exactly the goal of this section; thus, feature engineering will be considered a success, then verified via visualization after a Random Forest model is used to predict the tagline column.

In [31]:
movie_df_modified.head()

Unnamed: 0,title,tagline,revenue,budget,id,original_language,overview,runtime,status,release_date_dt,...,lang_ku,lang_mi,lang_to,lang_ca,lang_br,lang_dz,lang_ky,lang_id,lang_bm,lang_sl
0,Avatar,Enter the World of Pandora.,2787965087,237000000,19995,en,"In the 22nd century, a paraplegic Marine is di...",162.0,Released,2009-12-10,...,0,0,0,0,0,0,0,0,0,0
1,Pirates of the Caribbean: At World's End,"At the end of the world, the adventure begins.",961000000,300000000,285,en,"Captain Barbossa, long believed to be dead, ha...",169.0,Released,2007-05-19,...,0,0,0,0,0,0,0,0,0,0
2,Spectre,A Plan No One Escapes,880674609,245000000,206647,en,A cryptic message from Bond’s past sends him o...,148.0,Released,2015-10-26,...,0,0,0,0,0,0,0,0,0,0
3,The Dark Knight Rises,The Legend Ends,1084939099,250000000,49026,en,Following the death of District Attorney Harve...,165.0,Released,2012-07-16,...,0,0,0,0,0,0,0,0,0,0
4,John Carter,"Lost in our world, found in another.",284139100,260000000,49529,en,"John Carter is a war-weary, former military ca...",132.0,Released,2012-03-07,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Make a copy for random forest
movie_df_rfc = movie_df_modified.copy()

# Convert copy for rcf release_date_dt column to numbers
movie_df_rfc['release_date_dt'] = movie_df_rfc['release_date_dt'].astype('datetime64').astype(int).astype(float)

In [43]:
# Drop tagline column
movie_df_modified = movie_df_modified.drop(columns='tagline')

# Store dataframe globally
%store movie_df_modified

Stored 'movie_df_modified' (DataFrame)


#### Part 4: Apply Random Forest for *tagline* column predictions

In [44]:
# Import necessary modules
from sklearn.ensemble import RandomForestRegressor

In [60]:
# Separate rows with NaN from rows with taglines
train_with_tagline = movie_df_rfc[pd.isnull(movie_df_rfc['tagline']) == False]
print(train_with_tagline.shape)
test_with_nan = movie_df_rfc[pd.isnull(movie_df_rfc['tagline'])]
print(test_with_nan.shape)

KeyError: 'tagline'

Create function to convert column value strings to numeric values

In [46]:
def string_to_numeric(data):
    """Convert Column value strings and dates to a numeric value"""
    
    for i, column in enumerate(list([str(d) for d in data.dtypes])):
        if column == "object":
            data.iloc[:,i] = data.iloc[:,i].astype("category").cat.codes
        if column == "<M8[ns]":
            data.iloc[:,i] = data.iloc[:,i].astype("category").cat.codes
    return data

In [47]:
# Training data
string_to_numeric(train_with_tagline)
train_with_tagline.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,title,tagline,revenue,budget,id,original_language,overview,runtime,status,release_date_dt,...,lang_ku,lang_mi,lang_to,lang_ca,lang_br,lang_dz,lang_ky,lang_id,lang_bm,lang_sl
0,301,717,2787965087,237000000,19995,7,2020,162.0,1,1.260403e+18,...,0,0,0,0,0,0,0,0,0,0
1,2117,402,961000000,300000000,285,7,1322,169.0,1,1.179533e+18,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Testing Data
string_to_numeric(test_with_nan)
test_with_nan.head(2)

Unnamed: 0,title,tagline,revenue,budget,id,original_language,overview,runtime,status,release_date_dt,...,lang_ku,lang_mi,lang_to,lang_ca,lang_br,lang_dz,lang_ky,lang_id,lang_bm,lang_sl
10,563,-1,391081192,270000000,1452,5,593,154.0,1,1.151453e+18,...,0,0,0,0,0,0,0,0,0,0
56,547,-1,343471816,185000000,188927,5,626,122.0,1,1.46785e+18,...,0,0,0,0,0,0,0,0,0,0


Start Prediction with Random Forest Regressor

In [49]:
# Define independent and dependent variables in dataset
# Train
X_train = train_with_tagline.drop('tagline', axis=1)
y_train = train_with_tagline['tagline']

# Test
X_test = test_with_nan.drop('tagline', axis=1)
y_test = test_with_nan['tagline']

In [50]:
# Create a RFR model instance with half default number of estimators, due to size of dataset
rfr_tagline = RandomForestRegressor(n_estimators=50)

# Fit to model
rfr_tagline.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [51]:
# Generate predicted tagline values
generated_taglines = rfr_tagline.predict(X_test)

In [52]:
# Replace column contents
test_with_nan.loc[:, 'tagline'] = generated_taglines.astype(int)

# Create new movie dataframe with generated taglines
movie_generated_taglines = train_with_tagline.append(test_with_nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,index,title,tagline,revenue,budget,id,original_language,overview,runtime,status,...,lang_ku,lang_mi,lang_to,lang_ca,lang_br,lang_dz,lang_ky,lang_id,lang_bm,lang_sl
0,0,301,717,2787965087,237000000,19995,7,2020,162.0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,2117,402,961000000,300000000,285,7,1322,169.0,1,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Drop index column
movie_generated_taglines.drop('index',inplace=True,axis=1)
movie_generated_taglines.head(2)

Unnamed: 0,title,tagline,revenue,budget,id,original_language,overview,runtime,status,release_date_dt,...,lang_ku,lang_mi,lang_to,lang_ca,lang_br,lang_dz,lang_ky,lang_id,lang_bm,lang_sl
0,301,717,2787965087,237000000,19995,7,2020,162.0,1,1.260403e+18,...,0,0,0,0,0,0,0,0,0,0
1,2117,402,961000000,300000000,285,7,1322,169.0,1,1.179533e+18,...,0,0,0,0,0,0,0,0,0,0


In [56]:
movie_generated_taglines.head()

Unnamed: 0,title,tagline,revenue,budget,id,original_language,overview,runtime,status,release_date_dt,...,lang_ku,lang_mi,lang_to,lang_ca,lang_br,lang_dz,lang_ky,lang_id,lang_bm,lang_sl
0,301,717,2787965087,237000000,19995,7,2020,162.0,1,1.260403e+18,...,0,0,0,0,0,0,0,0,0,0
1,2117,402,961000000,300000000,285,7,1322,169.0,1,1.179533e+18,...,0,0,0,0,0,0,0,0,0,0
2,2536,93,880674609,245000000,206647,7,153,148.0,1,1.445818e+18,...,0,0,0,0,0,0,0,0,0,0
3,2886,2623,1084939099,250000000,49026,7,1655,165.0,1,1.342397e+18,...,0,0,0,0,0,0,0,0,0,0
4,1531,1779,284139100,260000000,49529,7,2240,132.0,1,1.331078e+18,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Store dataframe globally
%store movie_generated_taglines

Stored 'movie_generated_taglines' (DataFrame)
