# Luther - Data Cleaning and Merging Dataframes

This is part 2 of the Luther Project. In part 1, I created 6 different dataframes that are merged and cleaned in this notebook. The final merged dataframe, saved as "data/merged_2018.pkl", will be used in the last notebook, "03 - Luther - Linear Regression", to develop a final linear regression model for predicting movie ticket sales in opening week (as well as opening gross adjusted to ticket price).

In [22]:
import pickle
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import seaborn as sns
import datetime

In [23]:
# Load the pickled dataframes, one per data source.
# NOTE: read_pickle can execute arbitrary code; only load files you trust.
#unemp_df = pd.read_pickle("data/unemp_df.pkl")
pickle_paths = (
    'data/mojog_2018.pkl',
    'data/youtube_2018.pkl',
    'data/omdb_2018.pkl',
    'data/director_df.pkl',
    'data/actor_df.pkl',
)
mojog_df, youtube_df, omdb_df, director_df, actor_df = map(pd.read_pickle, pickle_paths)

In [24]:
# Clean mojo df: the money and theater-count columns are strings, so strip the
# "$" / "," formatting and surrounding whitespace before parsing them as numbers.
# The pattern is a raw string; the previous "\$" literal is an invalid escape
# sequence that newer Python versions warn about.
for money_col in ["opening", "tot_gross"]:
    mojog_df[money_col] = pd.to_numeric(
        mojog_df[money_col].replace(r"[$,]", "", regex=True).str.strip()
    )
mojog_df["theaters"] = pd.to_numeric(
    mojog_df["theaters"].replace(",", "", regex=True).str.strip()
)
mojog_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 0 to 1
Data columns (total 9 columns):
movie_name    2 non-null object
movie         2 non-null object
opening       2 non-null int64
date_time     2 non-null datetime64[ns]
theaters      2 non-null int64
tot_gross     2 non-null int64
actors        2 non-null object
directors     2 non-null object
gtrend        2 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 160.0+ bytes


In [25]:
# Clean youtube df
# Keep the title plus the four engagement counters; nulls are replaced with 0.
count_cols = ["viewCount", "commentCount", "dislikeCount", "likeCount"]
youtube_df = youtube_df[["movie_name"] + count_cols].fillna(0)

# The counters are numeric strings; parse each one into a number.
for count_col in count_cols:
    youtube_df[count_col] = pd.to_numeric(youtube_df[count_col])

# Shorter, Y-prefixed names for the YouTube columns.
short_names = {"viewCount": "Yviews", "commentCount": "Ycomments",
               "dislikeCount": "Ydislikes", "likeCount": "Ylikes"}
youtube_df = youtube_df.rename(columns=short_names)
youtube_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
movie_name    2 non-null object
Yviews        2 non-null int64
Ycomments     2 non-null int64
Ydislikes     2 non-null int64
Ylikes        2 non-null int64
dtypes: int64(4), object(1)
memory usage: 160.0+ bytes


In [26]:
# Display the OMDb frame as loaded.
omdb_df

Unnamed: 0,imdb,metacritic,movie,rated,rotten_tomatoes,runtime,year
0,7.7,88,Black Panther,PG-13,96,134,2018
1,9.0,68,Avengers: Infinity War,PG-13,84,149,2018


In [27]:
# Clean omdb data
# The string-to-number conversions below are left disabled. The info() summary
# already reports numeric dtypes for these columns, so presumably the pickle
# was cleaned before it was saved (TODO confirm).
# omdb_df.imdb = pd.to_numeric(omdb_df.imdb.replace("/10","", regex = True).str.strip())
# omdb_df.metacritic = pd.to_numeric(omdb_df.metacritic.replace("/100","", regex = True).str.strip())
# omdb_df.rotten_tomatoes = pd.to_numeric(omdb_df.rotten_tomatoes.replace("%","", regex = True).str.strip())
# omdb_df.runtime = pd.to_numeric(omdb_df.runtime.replace("min","", regex = True).str.strip())
# omdb_df.year = pd.to_numeric(omdb_df.year)
omdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 7 columns):
imdb               2 non-null float64
metacritic         2 non-null int64
movie              2 non-null object
rated              2 non-null object
rotten_tomatoes    2 non-null int64
runtime            2 non-null int64
year               2 non-null int64
dtypes: float64(1), int64(4), object(2)
memory usage: 192.0+ bytes


In [28]:
# Clean directors data
# Convert str to numeric. Both gross columns have "$" and "," removed before
# parsing. dir_agross previously kept its commas, unlike the other money
# columns in this notebook, so a value containing "," would fail to parse.
# The pattern is a raw string to avoid the invalid "\$" escape sequence.
for gross_col in ["dir_agross", "dir_gross"]:
    director_df[gross_col] = pd.to_numeric(
        director_df[gross_col].replace(r"[$,]", "", regex=True).str.strip()
    )
director_df["dir_nmovies"] = pd.to_numeric(director_df["dir_nmovies"])
director_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
dir_agross     500 non-null float64
dir_gross      500 non-null float64
dir_nmovies    500 non-null int64
director       500 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 15.7+ KB


In [29]:
# Clean actors data
# Convert str to numeric: remove "$" and "," from the gross columns, trim
# whitespace, then parse. The pattern is a raw string to avoid the invalid
# "\$" escape sequence that newer Python versions warn about.
for gross_col in ["act_agross", "act_gross"]:
    actor_df[gross_col] = pd.to_numeric(
        actor_df[gross_col].replace(r"[$,]", "", regex=True).str.strip()
    )
actor_df["act_nmovies"] = pd.to_numeric(actor_df["act_nmovies"])
actor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
act_agross     500 non-null float64
act_gross      500 non-null float64
act_nmovies    500 non-null int64
actor          500 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 15.7+ KB


In [30]:
# Preview the first rows of the cleaned actor frame.
actor_df.head()

Unnamed: 0,act_agross,act_gross,act_nmovies,actor
0,70.5,5149.1,73,Samuel L. Jackson
1,118.2,4963.8,42,Harrison Ford
2,96.0,4605.6,48,Tom Hanks
3,71.8,4522.2,63,Morgan Freeman
4,166.7,4333.5,26,Andy Serkis


In [31]:
# # Split actor & director strings into lists
# # for index in range(len(omdb_df)):
# #     omdb_df.actors[index] = [x.strip() for x in omdb_df.actors[index].split(',')]
# omdb_df.actors = omdb_df.actors.apply(lambda x :[x.strip() for x in x.split(',')])    
# omdb_df.director = omdb_df.director.apply(lambda x :[x.strip() for x in x.split(',')])
# omdb_df.head()

In [32]:
# Display the cleaned mojo frame.
mojog_df

Unnamed: 0,movie_name,movie,opening,date_time,theaters,tot_gross,actors,directors,gtrend
0,Black Panther,Black Panther,202003951,2018-02-16,4020,689626132,"[Chadwick Boseman, Lupita Nyong'o, Michael B. ...",[Ryan Coogler],0.458
1,Avengers: Infinity War,Avengers: Infinity War,257698183,2018-04-27,4474,322831270,"[Robert Downey, Jr., Chris Hemsworth, Mark Ruf...","[Joe Russo, Anthony Russo]",0.905


In [33]:

# Unemployment rate noted for April 2018 (4.1), attached as a constant column:
# every movie gets the same value regardless of its release date.
mojog_df = mojog_df.assign(unemp_rate=4.1)
mojog_df.head()

Unnamed: 0,movie_name,movie,opening,date_time,theaters,tot_gross,actors,directors,gtrend,unemp_rate
0,Black Panther,Black Panther,202003951,2018-02-16,4020,689626132,"[Chadwick Boseman, Lupita Nyong'o, Michael B. ...",[Ryan Coogler],0.458,4.1
1,Avengers: Infinity War,Avengers: Infinity War,257698183,2018-04-27,4474,322831270,"[Robert Downey, Jr., Chris Hemsworth, Mark Ruf...","[Joe Russo, Anthony Russo]",0.905,4.1


# Merge the dataframes

In [34]:
# Merge mojo and youtube
merged = pd.DataFrame.merge(mojog_df, youtube_df,on='movie_name', how = 'inner')
merged.head()

Unnamed: 0,movie_name,movie,opening,date_time,theaters,tot_gross,actors,directors,gtrend,unemp_rate,Yviews,Ycomments,Ydislikes,Ylikes
0,Black Panther,Black Panther,202003951,2018-02-16,4020,689626132,"[Chadwick Boseman, Lupita Nyong'o, Michael B. ...",[Ryan Coogler],0.458,4.1,36956315,58997,14891,407336
1,Avengers: Infinity War,Avengers: Infinity War,257698183,2018-04-27,4474,322831270,"[Robert Downey, Jr., Chris Hemsworth, Mark Ruf...","[Joe Russo, Anthony Russo]",0.905,4.1,74357037,217017,39549,1713010


In [35]:
# Merge omdb
merged2 = pd.DataFrame.merge(merged,omdb_df,on="movie",how="inner")
merged2.head()

Unnamed: 0,movie_name,movie,opening,date_time,theaters,tot_gross,actors,directors,gtrend,unemp_rate,Yviews,Ycomments,Ydislikes,Ylikes,imdb,metacritic,rated,rotten_tomatoes,runtime,year
0,Black Panther,Black Panther,202003951,2018-02-16,4020,689626132,"[Chadwick Boseman, Lupita Nyong'o, Michael B. ...",[Ryan Coogler],0.458,4.1,36956315,58997,14891,407336,7.7,88,PG-13,96,134,2018
1,Avengers: Infinity War,Avengers: Infinity War,257698183,2018-04-27,4474,322831270,"[Robert Downey, Jr., Chris Hemsworth, Mark Ruf...","[Joe Russo, Anthony Russo]",0.905,4.1,74357037,217017,39549,1713010,9.0,68,PG-13,84,149,2018


In [36]:
# Adjust the youtube stats based on google trends data: scale each counter by
# the movie's gtrend value and round to a whole number.
adjusted_names = {"Yviews": "Yviews_adj", "Ylikes": "Ylikes_adj",
                  "Ydislikes": "Ydis_adj", "Ycomments": "Ycom_adj"}
for raw_col, adj_col in adjusted_names.items():
    merged2[adj_col] = (merged2[raw_col] * merged2["gtrend"]).round()

# Drop the raw counters and the scaling factor now that the adjusted columns
# exist. `columns=` is used because passing the axis positionally
# (`drop([...], 1)`) raises TypeError on pandas 2.0 and later.
merged2 = merged2.drop(columns=["gtrend", "Yviews", "Ycomments", "Ylikes", "Ydislikes"])
merged2

Unnamed: 0,movie_name,movie,opening,date_time,theaters,tot_gross,actors,directors,unemp_rate,imdb,metacritic,rated,rotten_tomatoes,runtime,year,Yviews_adj,Ylikes_adj,Ydis_adj,Ycom_adj
0,Black Panther,Black Panther,202003951,2018-02-16,4020,689626132,"[Chadwick Boseman, Lupita Nyong'o, Michael B. ...",[Ryan Coogler],4.1,7.7,88,PG-13,96,134,2018,16925992.0,186560.0,6820.0,27021.0
1,Avengers: Infinity War,Avengers: Infinity War,257698183,2018-04-27,4474,322831270,"[Robert Downey, Jr., Chris Hemsworth, Mark Ruf...","[Joe Russo, Anthony Russo]",4.1,9.0,68,PG-13,84,149,2018,67293118.0,1550274.0,35792.0,196400.0


In [37]:
# Apply one flat ticket price (9.16) to every movie.
merged2["tick"] = 9.16
# Estimated tickets sold: opening gross floor-divided by the ticket price,
# stored as int64 and inserted at column position 3.
est_tick = merged2["opening"].floordiv(merged2["tick"]).astype("int64")
merged2.insert(3, 'est_tick', est_tick)
# From here on the opening gross column is called "op_gross".
merged2 = merged2.rename(columns={"opening": "op_gross"})

In [38]:
# Replace every remaining missing value with 0. The actor/director metric cells
# further down test the actors/directors columns against this 0 sentinel.
merged2 = merged2.fillna(0)

In [39]:
# List the columns of the merged frame before the actor/director metrics are added.
merged2.columns

Index(['movie_name', 'movie', 'op_gross', 'est_tick', 'date_time', 'theaters',
       'tot_gross', 'actors', 'directors', 'unemp_rate', 'imdb', 'metacritic',
       'rated', 'rotten_tomatoes', 'runtime', 'year', 'Yviews_adj',
       'Ylikes_adj', 'Ydis_adj', 'Ycom_adj', 'tick'],
      dtype='object')

# Generate a metric for actors and directors

#### actors
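For each movie, I look up every credited actor (and, below, every director) and keep the maximum total gross and the maximum number of movies among them. The movie's own total gross is subtracted from each person's total gross first, so the movie being predicted does not count toward the metric.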

In [40]:
import numpy as np

# Build one record per movie holding the best (maximum) total gross and movie
# count among the movie's actors that appear in actor_df.
act_list = []
for i in range(len(merged2)):
    cast = merged2.actors.iloc[i]
    gross_list = []
    nmovies_list = []
    # fillna(0) leaves 0 where a movie has no cast list; an empty list is
    # treated the same way. In both cases there is nothing to look up.
    if not ((cast == 0) or (cast == [])):
        # tot_gross is divided by 1e6 to match actor_df's gross scale
        # (presumably millions of dollars; verify against the scraper).
        movie_gross = merged2.tot_gross.iloc[i] / (1e6)
        for actoriter in cast:
            # Filter actor_df once per actor and reuse the result.
            actor_rows = actor_df[actor_df.actor == actoriter]
            if actor_rows.empty:
                continue
            # Subtract the movie's own total gross (the movie being predicted)
            # from the actor's total gross.
            gross_list.append(actor_rows.act_gross.iloc[0] - movie_gross)
            nmovies_list.append(actor_rows.act_nmovies.iloc[0])
    # Decide on the collected values themselves. The previous separate counter
    # was never set for movies without a cast, so it raised NameError on the
    # first row or reused the value left over from the previous movie.
    if gross_list:
        act_gross = round(max(gross_list), 1)
        act_nmovies = round(max(nmovies_list), 1)
    else:
        act_gross, act_nmovies = 0, 0
    act_list.append({'movie': merged2.movie.iloc[i], 'act_gross': act_gross,
                     'act_nmovies': act_nmovies})
actor_metric = pd.DataFrame(act_list)

#### directors

In [41]:
import numpy as np

# Build one record per movie holding the best (maximum) total gross and movie
# count among the movie's directors that appear in director_df.
dir_list = []
for i in range(len(merged2)):
    crew = merged2.directors.iloc[i]
    gross_list = []
    nmovies_list = []
    # fillna(0) leaves 0 where a movie has no director list; an empty list is
    # treated the same way. In both cases there is nothing to look up.
    if not ((crew == 0) or (crew == [])):
        # tot_gross is divided by 1e6 to match director_df's gross scale
        # (presumably millions of dollars; verify against the scraper).
        movie_gross = merged2.tot_gross.iloc[i] / (1e6)
        for directoriter in crew:
            # Filter director_df once per director and reuse the result.
            director_rows = director_df[director_df.director == directoriter]
            if director_rows.empty:
                continue
            # Subtract the movie's own total gross (the movie being predicted)
            # from the director's total gross.
            gross_list.append(director_rows.dir_gross.iloc[0] - movie_gross)
            nmovies_list.append(director_rows.dir_nmovies.iloc[0])
    # Decide on the collected values themselves. The previous separate counter
    # was never set for movies without directors, so it raised NameError on the
    # first row or reused the value left over from the previous movie.
    if gross_list:
        dir_gross = round(max(gross_list), 1)
        dir_nmovies = round(max(nmovies_list), 1)
    else:
        dir_gross, dir_nmovies = 0, 0
    dir_list.append({'movie': merged2.movie.iloc[i], 'dir_gross': dir_gross,
                     'dir_nmovies': dir_nmovies})
director_metric = pd.DataFrame(dir_list)

# Merge the director & actor metrics to df

In [42]:
# Attach the per-movie actor and director metrics (inner joins on "movie"),
# then drop the raw name lists.
merged2 = merged2.merge(actor_metric, on="movie", how="inner")
merged2 = merged2.merge(director_metric, on="movie", how="inner")
# `columns=` is used because passing the axis positionally (`drop([...], 1)`)
# raises TypeError on pandas 2.0 and later.
merged2 = merged2.drop(columns=["actors", "directors"])
merged2.head()

Unnamed: 0,movie_name,movie,op_gross,est_tick,date_time,theaters,tot_gross,unemp_rate,imdb,metacritic,...,year,Yviews_adj,Ylikes_adj,Ydis_adj,Ycom_adj,tick,act_gross,act_nmovies,dir_gross,dir_nmovies
0,Black Panther,Black Panther,202003951,22052833,2018-02-16,4020,689626132,4.1,7.7,88,...,2018,16925992.0,186560.0,6820.0,27021.0,9.16,3643.9,56,112.2,3
1,Avengers: Infinity War,Avengers: Infinity War,257698183,28132989,2018-04-27,4474,322831270,4.1,9.0,68,...,2018,67293118.0,1550274.0,35792.0,196400.0,9.16,3372.6,41,421.0,4


### Save the merged dataframe

In [43]:
# Persist the final merged frame to disk as a pickle.
merged2.to_pickle("data/merged_2018.pkl")

In [None]:
# Sanity check: number of movies whose dir_nmovies metric is 0
# (presumably movies for which no director was matched).
len(merged2[(merged2.dir_nmovies == 0)])# | (merged2.act_gross == 0)])