In [1]:
import json
import pandas as pd
import numpy as np
import re

import time

In [2]:
# Read the numbers table from disk. With the 'unicode_escape' codec each dollar
# amount is displayed with a leading 'Â'; the patterns used later in this
# notebook match that prefix.
numbers = pd.read_csv(
    "Resources/numbers_sql.csv",
    encoding="unicode_escape",
)
numbers

Unnamed: 0,index,original_title,production_budget,worldwide_gross
0,0,Avengers: Endgame,"Â $400,000,000","Â $2,797,800,564"
1,1,Pirates of the Caribbean: On Stranger Tides,"Â $379,000,000","Â $1,045,713,802"
2,2,Avengers: Age of Ultron,"Â $365,000,000","Â $1,395,316,979"
3,3,Star Wars Ep. VII: The Force Awakens,"Â $306,000,000","Â $2,064,615,817"
4,4,Avengers: Infinity War,"Â $300,000,000","Â $2,048,359,754"
...,...,...,...,...
6290,6290,,,
6291,6291,,,
6292,6292,,,
6293,6293,,,


In [3]:
# Take the production_budget column and discard its null entries
production_budget = numbers["production_budget"].dropna()
production_budget

0       Â $400,000,000
1       Â $379,000,000
2       Â $365,000,000
3       Â $306,000,000
4       Â $300,000,000
             ...      
6278          Â $6,000
6279          Â $5,000
6280          Â $1,400
6281          Â $1,100
6282             Â $86
Name: production_budget, Length: 6283, dtype: object

In [4]:
def is_not_a_string(x):
    """Return True when ``x`` is not a string.

    ``isinstance`` is used instead of an exact ``type`` comparison so that
    subclasses of ``str`` (for example ``numpy.str_``) are still treated as
    strings rather than being reported as non-string values.
    """
    return not isinstance(x, str)

In [5]:
# Check for production_budget entries that are not strings.
# (Replaces a stray `git` token, which raises NameError on a fresh run; this
# expression mirrors the equivalent worldwide_gross check in this notebook.)
production_budget[production_budget.map(is_not_a_string)]

Series([], Name: production_budget, dtype: object)

In [6]:
# Pattern for comma-grouped dollar amounts (prefix char, whitespace, '$',
# 1-3 digits, then one or more ',ddd' groups); count how many values contain it
budget_form = r'\w\s\$\d{1,3}(?:,\d{3})+'
(
    production_budget.str
    .contains(budget_form, flags=re.IGNORECASE, na=False)
    .sum()
)

6282

In [7]:
# Boolean mask: True where a production_budget value contains budget_form
matches_budget_form = production_budget.str.contains(
    budget_form,
    flags=re.IGNORECASE,
    na=False,
)

In [8]:
# Show the values that budget_form does not cover
production_budget.loc[~matches_budget_form]

6282    Â $86
Name: production_budget, dtype: object

In [9]:
# Pattern for amounts with no thousands separator (1-3 digits up to the end
# of the string); count how many values contain it
small_budget_form = r'\w\s\$\d{1,3}$'
(
    production_budget.str
    .contains(small_budget_form, flags=re.IGNORECASE, na=False)
    .sum()
)

1

In [10]:
# Boolean mask: True where a production_budget value contains small_budget_form
matches_small_budget_form = production_budget.str.contains(
    small_budget_form,
    flags=re.IGNORECASE,
    na=False,
)

In [11]:
# Values covered by neither pattern (an empty result means every value is covered)
production_budget[~(matches_budget_form | matches_small_budget_form)]

Series([], Name: production_budget, dtype: object)

In [12]:
def parse_budget(s):
    """Convert a budget string such as ``'Â $400,000,000'`` into an ``int``.

    The accepted layout is: one word character, one whitespace character, a
    dollar sign, then either a bare 1-3 digit number or a comma-grouped
    number (``d{1,3}`` followed by ``,ddd`` groups). Trailing whitespace is
    tolerated.

    Parameters
    ----------
    s : object
        Raw cell value.

    Returns
    -------
    int or float
        The amount as an ``int``. ``np.nan`` is returned when ``s`` is not a
        string or does not fit the layout above.
    """
    # Non-text values (e.g. NaN) cannot be parsed
    if not isinstance(s, str):
        return np.nan

    # A single anchored pattern covers both the comma-grouped form and the
    # small (1-3 digit) form. Using fullmatch means a value with extra text
    # after the number yields NaN; with an unanchored re.match such a value
    # passed the check and then made int() raise ValueError.
    matched = re.fullmatch(r'\w\s\$(\d{1,3}(?:,\d{3})*)\s*', s)
    if matched is None:
        return np.nan

    # Strip the thousands separators and convert to int
    return int(matched.group(1).replace(',', ''))

In [13]:
# Run parse_budget over every production_budget entry
production_budget = production_budget.map(parse_budget)
production_budget

0       400000000
1       379000000
2       365000000
3       306000000
4       300000000
          ...    
6278         6000
6279         5000
6280         1400
6281         1100
6282           86
Name: production_budget, Length: 6283, dtype: int64

In [14]:
# Attach the parsed budgets to the numbers DataFrame as a new
# Production_Budget column, then preview the first ten rows
numbers = numbers.assign(Production_Budget=production_budget)
numbers.head(n=10)

Unnamed: 0,index,original_title,production_budget,worldwide_gross,Production_Budget
0,0,Avengers: Endgame,"Â $400,000,000","Â $2,797,800,564",400000000.0
1,1,Pirates of the Caribbean: On Stranger Tides,"Â $379,000,000","Â $1,045,713,802",379000000.0
2,2,Avengers: Age of Ultron,"Â $365,000,000","Â $1,395,316,979",365000000.0
3,3,Star Wars Ep. VII: The Force Awakens,"Â $306,000,000","Â $2,064,615,817",306000000.0
4,4,Avengers: Infinity War,"Â $300,000,000","Â $2,048,359,754",300000000.0
5,5,Pirates of the Caribbean: At WorldÂs End,"Â $300,000,000","Â $960,996,492",300000000.0
6,6,Justice League,"Â $300,000,000","Â $655,945,209",300000000.0
7,7,Spectre,"Â $300,000,000","Â $879,500,760",300000000.0
8,8,Mission: Impossible Dead Reckoning Part One,"Â $290,000,000",Â $0,290000000.0
9,9,Star Wars: The Rise of Skywalker,"Â $275,000,000","Â $1,072,848,487",275000000.0


In [15]:
# Remove the raw production_budget text column; Production_Budget holds the
# parsed values. Rebinding (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run is a no-op
# instead of a KeyError.
numbers = numbers.drop(columns='production_budget', errors='ignore')

In [16]:
# Inspect the current state of the numbers DataFrame
numbers

Unnamed: 0,index,original_title,worldwide_gross,Production_Budget
0,0,Avengers: Endgame,"Â $2,797,800,564",400000000.0
1,1,Pirates of the Caribbean: On Stranger Tides,"Â $1,045,713,802",379000000.0
2,2,Avengers: Age of Ultron,"Â $1,395,316,979",365000000.0
3,3,Star Wars Ep. VII: The Force Awakens,"Â $2,064,615,817",306000000.0
4,4,Avengers: Infinity War,"Â $2,048,359,754",300000000.0
...,...,...,...,...
6290,6290,,,
6291,6291,,,
6292,6292,,,
6293,6293,,,


In [17]:
# Take the worldwide_gross column and discard its null entries
worldwide_gross = numbers["worldwide_gross"].dropna()
worldwide_gross

0       Â $2,797,800,564
1       Â $1,045,713,802
2       Â $1,395,316,979
3       Â $2,064,615,817
4       Â $2,048,359,754
              ...       
6278          Â $240,495
6279            Â $1,338
6280                Â $0
6281          Â $181,041
6282                Â $0
Name: worldwide_gross, Length: 6283, dtype: object

In [18]:
# Check for worldwide_gross entries that are not strings
worldwide_gross.loc[worldwide_gross.map(is_not_a_string)]

Series([], Name: worldwide_gross, dtype: object)

In [19]:
# Count the worldwide_gross values that contain the pre-existing budget_form
(
    worldwide_gross.str
    .contains(budget_form, flags=re.IGNORECASE, na=False)
    .sum()
)

5853

In [20]:
# Boolean mask: True where a worldwide_gross value contains budget_form
matches_worldwide_gross_form = worldwide_gross.str.contains(
    budget_form,
    flags=re.IGNORECASE,
    na=False,
)

In [21]:
# Show the worldwide_gross entries that budget_form does not cover
worldwide_gross.loc[~matches_worldwide_gross_form]

8         Â $0
83        Â $0
234       Â $0
259       Â $0
346       Â $0
         ...  
6271      Â $0
6276    Â $900
6277      Â $0
6280      Â $0
6282      Â $0
Name: worldwide_gross, Length: 430, dtype: object

In [22]:
# Pattern for worldwide_gross amounts with no thousands separator (1-3 digits
# up to the end of the string); count how many values contain it
small_worldwide_gross_form = r'\w\s\$\d{1,3}$'
(
    worldwide_gross.str
    .contains(small_worldwide_gross_form, flags=re.IGNORECASE, na=False)
    .sum()
)

430

In [23]:
# Boolean mask: True where a worldwide_gross value contains the small form
matches_small_worldwide_gross_form = worldwide_gross.str.contains(
    small_worldwide_gross_form,
    flags=re.IGNORECASE,
    na=False,
)

In [24]:
# Values covered by neither pattern (an empty result means every value is covered)
worldwide_gross[~(matches_worldwide_gross_form | matches_small_worldwide_gross_form)]

Series([], Name: worldwide_gross, dtype: object)

In [25]:
def parse_gross(s):
    """Convert a gross string such as ``'Â $2,797,800,564'`` into an ``int``.

    The accepted layout is: one word character, one whitespace character, a
    dollar sign, then either a bare 1-3 digit number or a comma-grouped
    number (``d{1,3}`` followed by ``,ddd`` groups). Trailing whitespace is
    tolerated.

    Parameters
    ----------
    s : object
        Raw cell value.

    Returns
    -------
    int or float
        The amount as an ``int``. ``np.nan`` is returned when ``s`` is not a
        string or does not fit the layout above.
    """
    # Non-text values (e.g. NaN) cannot be parsed
    if not isinstance(s, str):
        return np.nan

    # A single anchored pattern covers both the comma-grouped form and the
    # small (1-3 digit) form. Using fullmatch means a value with extra text
    # after the number yields NaN; with an unanchored re.match such a value
    # passed the check and then made int() raise ValueError.
    matched = re.fullmatch(r'\w\s\$(\d{1,3}(?:,\d{3})*)\s*', s)
    if matched is None:
        return np.nan

    # Strip the thousands separators and convert to int
    return int(matched.group(1).replace(',', ''))

In [26]:
# Run parse_gross over every worldwide_gross entry
worldwide_gross = worldwide_gross.map(parse_gross)
worldwide_gross

0       2797800564
1       1045713802
2       1395316979
3       2064615817
4       2048359754
           ...    
6278        240495
6279          1338
6280             0
6281        181041
6282             0
Name: worldwide_gross, Length: 6283, dtype: int64

In [27]:
# Attach the parsed gross figures to the numbers DataFrame as a new
# Worldwide_Gross column, then preview the first ten rows
numbers = numbers.assign(Worldwide_Gross=worldwide_gross)
numbers.head(n=10)

Unnamed: 0,index,original_title,worldwide_gross,Production_Budget,Worldwide_Gross
0,0,Avengers: Endgame,"Â $2,797,800,564",400000000.0,2797801000.0
1,1,Pirates of the Caribbean: On Stranger Tides,"Â $1,045,713,802",379000000.0,1045714000.0
2,2,Avengers: Age of Ultron,"Â $1,395,316,979",365000000.0,1395317000.0
3,3,Star Wars Ep. VII: The Force Awakens,"Â $2,064,615,817",306000000.0,2064616000.0
4,4,Avengers: Infinity War,"Â $2,048,359,754",300000000.0,2048360000.0
5,5,Pirates of the Caribbean: At WorldÂs End,"Â $960,996,492",300000000.0,960996500.0
6,6,Justice League,"Â $655,945,209",300000000.0,655945200.0
7,7,Spectre,"Â $879,500,760",300000000.0,879500800.0
8,8,Mission: Impossible Dead Reckoning Part One,Â $0,290000000.0,0.0
9,9,Star Wars: The Rise of Skywalker,"Â $1,072,848,487",275000000.0,1072848000.0


In [28]:
# Remove the raw worldwide_gross text column; Worldwide_Gross holds the
# parsed values. Rebinding (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run is a no-op
# instead of a KeyError.
numbers = numbers.drop(columns='worldwide_gross', errors='ignore')

In [29]:
# Inspect the current state of the numbers DataFrame
numbers

Unnamed: 0,index,original_title,Production_Budget,Worldwide_Gross
0,0,Avengers: Endgame,400000000.0,2.797801e+09
1,1,Pirates of the Caribbean: On Stranger Tides,379000000.0,1.045714e+09
2,2,Avengers: Age of Ultron,365000000.0,1.395317e+09
3,3,Star Wars Ep. VII: The Force Awakens,306000000.0,2.064616e+09
4,4,Avengers: Infinity War,300000000.0,2.048360e+09
...,...,...,...,...
6290,6290,,,
6291,6291,,,
6292,6292,,,
6293,6293,,,


In [30]:
# Keep only the rows in which no column is null
numbers = numbers.dropna(axis=0, how="any")
numbers

Unnamed: 0,index,original_title,Production_Budget,Worldwide_Gross
0,0,Avengers: Endgame,400000000.0,2.797801e+09
1,1,Pirates of the Caribbean: On Stranger Tides,379000000.0,1.045714e+09
2,2,Avengers: Age of Ultron,365000000.0,1.395317e+09
3,3,Star Wars Ep. VII: The Force Awakens,306000000.0,2.064616e+09
4,4,Avengers: Infinity War,300000000.0,2.048360e+09
...,...,...,...,...
6278,6278,Following,6000.0,2.404950e+05
6279,6279,Return to the Land of Wonders,5000.0,1.338000e+03
6280,6280,A Plague So Pleasant,1400.0,0.000000e+00
6281,6281,My Date With Drew,1100.0,1.810410e+05


In [32]:
# Read the movies table; lines pandas cannot parse are skipped.
# NOTE(review): the rendered frame shows data sitting under 'Unnamed: N'
# headers with values shifted relative to the column names, and
# on_bad_lines='skip' drops malformed lines without reporting them -- confirm
# the CSV's quoting/delimiter before relying on this frame.
movies = pd.read_csv(
    "Resources/movies_sql.csv",
    on_bad_lines="skip",
    encoding="unicode_escape",
)
movies

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,index,imdb_id,kaggle_id,original_title,runtime,budget,revenue,release_date,popularity,vote_average,...,production_companies,production_countries,distributor,producers,director,starring,cinematography,editors,writers,composers
0,tt0098987,9548,The Adventures of Ford Fairlane,104,49000000,20423389,1990-07-11 00:00:00,3.834949,6.2,72,"[{''id'': 28, ''name'': ''Action''}, {''id'': ...",United States,[{''name'': ''Twentieth Century Fox Film Corpo...,"[{''iso_3166_1'': ''US'', ''name'': ''United S...",20th Century Fox,"{'Steve Perry'""","'""Joel Silver'""}""",Renny Harlin,...,"'""Lauren Holly'""","'""Morris Day'""","'""Robert Englund'""","'""Ed O''Neill'""}""",Oliver Wood,Michael Tronick,"{'David Arnott'""","'""James Cappe'""}""","{'Cliff Eidelman'""","Yello}"""
1,tt0098994,25501,"After Dark, My Sweet",114,6000000,2700000,1990-08-24 00:00:00,7.349189,6.5,17,"[{''id'': 80, ''name'': ''Crime''}, {''id'': 1...",United States,"[{''name'': ''Avenue Pictures Productions'', '...","[{''iso_3166_1'': ''US'', ''name'': ''United S...",Avenue Pictures,"{'Ric Kidney'""","'""Robert Redlin'""}""",James Foley,...,"'""George Dickerson'""}""",Mark Plummer,Howard E. Smith,"{'James Foley'""","'""Robert Redlin'""}""",Maurice Jarre,,,,
2,tt0099005,11856,Air America,112,35000000,33461269,1990-08-10 00:00:00,10.274376,5.3,146,"[{''id'': 28, ''name'': ''Action''}, {''id'': ...",United States,"[{''name'': ''IndieProd Company Productions'',...","[{''iso_3166_1'': ''US'', ''name'': ''United S...",TriStar Pictures,Daniel Melnick,Roger Spottiswoode,"{'Mel Gibson'""",...,"'""Lane Smith'""}""",Roger Deakins,"{'John Bloom'""","'""Lois Freeman-Fox'""}""","{'John Eskow'""","'""Richard Rush'""}""",Charles Gross,,,
3,tt0099012,8217,Alice,102,12000000,7331647,1990-12-25 00:00:00,7.196816,6.3,57,"[{''id'': 35, ''name'': ''Comedy''}, {''id'': ...",United States,"[{''name'': ''Orion Pictures'', ''id'': 41}]","[{''iso_3166_1'': ''US'', ''name'': ''United S...",Orion Pictures,Robert Greenhut,Woody Allen,"{'Alec Baldwin'""",...,"'""William Hurt'""","'""Keye Luke'""","'""Joe Mantegna'""","'""Bernadette Peters'""}""",Carlo Di Palma,Susan E. Morse,Woody Allen,,,
4,tt0099018,25943,Almost an Angel,95,25000000,6939946,1990-12-21 00:00:00,2.862209,5.6,23,"[{''id'': 14, ''name'': ''Fantasy''}, {''id'':...",US,"[{''name'': ''Paramount Pictures'', ''id'': 4}]","[{''iso_3166_1'': ''US'', ''name'': ''United S...",Paramount Pictures,John Cornell,John Cornell,"{'Paul Hogan'""",...,David Stiven,Paul Hogan,Maurice Jarre,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6066,tt5726616,398818,Call Me by Your Name,130,4696772,41900000,2017-10-27 00:00:00,4.300874,0,18,"[{''id'': 10749, ''name'': ''Romance''}, {''id...","{Italy,France,Brazil,'United States'""}""","[{''name'': ''Sony Pictures Classics'', ''id''...",''id'': 88564},{''name'': ''M.Y.R.A. Entertainment'',"''id'': 88565}]""","[{''iso_3166_1'': ''BR'', ''name'': ''Brazil''...","{'Sony Pictures Classics'""",...,"'""Memento Films International",,,,,,,,,
"(worldwide)'""}""","{'Peter Spears'""","'""Luca Guadagnino'""","'""Emilie Georges'""","'""Rodrigo Teixeira'""","'""Marco Morabito'""","'""James Ivory'""","'""Howard Rosenman'""}""",Luca Guadagnino,"{'Armie Hammer'""","'""TimothÃ©e Chalamet'""","'""Michael Stuhlbarg'""","'""Amira Casar'""","'""Esther Garrel'""","'""Victoire Du Bois'""}""",Sayombhu Mukdeeprom,Walter Fasano,James Ivory,,...,,,,,,,,,,
6069,tt3567666,348389,Stratton,94,,,2017-07-06 00:00:00,2.876994,4.8,26,"[{''id'': 28, ''name'': ''Action''}, {''id'': ...",United Kingdom,"[{''name'': ''Twickenham Studios'', ''id'': 23...","[{''iso_3166_1'': ''GB'', ''name'': ''United K...",,Matthew Jenkins,Simon West,"{'Dominic Cooper'""",...,"'""Tom Felton'""}""",Felix Wiedemann,Andrew MacRitchie,"{'Duncan Falconer'""","'""Warren Davis II'""}""",Nathaniel MÃ©chaly,,,,
6070,tt5639354,429191,Una mujer fantÃ¡stica,104,,3700000,2017-04-06 00:00:00,4.059329,7.2,13,"[{''id'': 18, ''name'': ''Drama''}]","{Chile,Germany,Spain,'United States'""","[2]}""","[{''name'': ''Komplizen Film'', ''id'': 1618},...","[{''iso_3166_1'': ''FR'', ''name'': ''France''...","{'Participant Media (Chile)'""","'""Piffl Medien (Germany)'""","'""Bteam Pictures (Spain)'""",...,"{'Juan de Dios LarraÃ­n'""","'""Pablo LarraÃ­n'""}""",SebastiÃ¡n Lelio,"{'Daniela Vega'""","'""Francisco Reyes'""}""",BenjamÃ­n Echazarreta,Soledad Salfate,"{'SebastiÃ¡n Lelio'""","'""Gonzalo Maza'""}""",Matthew Herbert
