#### **DATA CLEANING: THE NUMBERS (TNDB)**
The Numbers' production budget data is crucial for calculating true return on investment (ROI), which is often more relevant than gross revenue alone. This enables YE Studios to identify which genres and budget ranges offer the best profit margins, not just the highest revenue. The budget-to-revenue ratio analysis helps determine the minimum viable investment for different film categories.

In [1]:
#LOAD DATA:
import pandas as pd
import numpy as np
import gzip

# Read the gzip-compressed budget CSV into a DataFrame and preview the first rows.
budgets_path = "../data/zippedData/tn.movie_budgets.csv.gz"
tndb = pd.read_csv(budgets_path, compression="gzip")
tndb.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [2]:
# CLEAN DATA:
## CLEAN COLUMN NAMES: make column names lowercase, replace spaces with underscores, and remove parentheses.
# regex=False forces literal substring replacement, so "(" and ")" are never
# interpreted as regular-expression syntax whatever the pandas default for `regex` is.
tndb.columns = (
    tndb.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)

# Show the column dtypes: the money columns are still strings at this point.
tndb.dtypes

id                    int64
release_date         object
movie                object
production_budget    object
domestic_gross       object
worldwide_gross      object
dtype: object

In [3]:
# CLEAN DATES:
# Parse the release date strings into datetimes; values that cannot be parsed become NaT.
parsed_release = pd.to_datetime(tndb["release_date"], errors="coerce")
tndb["release_date"] = parsed_release

# Keep the calendar year as its own column for year-level analysis.
tndb["release_year"] = parsed_release.dt.year
tndb.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year
0,1,2009-12-18,Avatar,"$425,000,000","$760,507,625","$2,776,345,279",2009
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011
2,3,2019-06-07,Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",2019
3,4,2015-05-01,Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",2015
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747",2017


In [4]:
# CLEAN FINANCIAL DATA:
# Strip the "$" sign and thousands separators from each money column, then convert to numeric.
money_columns = ["production_budget", "domestic_gross", "worldwide_gross"]

for column in money_columns:
    stripped = (
        tndb[column]
        # Casting to str first lets this cell be re-run safely: once a column is
        # already numeric, the .str accessor would otherwise raise AttributeError.
        .astype(str)
        # regex=False: "$" must be removed literally, never read as an end-of-string anchor.
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
    )
    # Anything that still is not a number (including missing values) becomes NaN.
    tndb[column] = pd.to_numeric(stripped, errors="coerce")

tndb.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2009
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,2019
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,2017


In [5]:
# CREATE NEW COLUMNS:
# Profit: worldwide gross minus production budget.
tndb["profit"] = tndb["worldwide_gross"] - tndb["production_budget"]

# Return on investment (ROI), expressed as a percentage of the production budget.
# A zero budget is masked to NaN first so the division yields NaN instead of +/-inf,
# which would otherwise distort any later mean or ranking of the column.
nonzero_budget = tndb["production_budget"].where(tndb["production_budget"] != 0)
tndb["roi"] = tndb["profit"] / nonzero_budget * 100

# Foreign gross: worldwide gross minus domestic gross.
tndb["foreign_gross"] = tndb["worldwide_gross"] - tndb["domestic_gross"]

tndb.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year,profit,roi,foreign_gross
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2009,2351345279,553.257713,2015837654
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011,635063875,154.667286,804600000
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,2019,-200237650,-57.210757,107000000
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015,1072413963,324.384139,944008095
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,2017,999721747,315.369636,696540365


In [6]:
# HANDLE DUPLICATES AND REMOVE MISSING VALUES:
# 1. Order rows from highest to lowest worldwide gross.
# 2. Keep only the first (highest-grossing) row for each movie title.
# 3. Discard rows whose production budget or worldwide gross is missing.
# NOTE(review): de-duplicating on title alone also merges different films that
# share a name (e.g. remakes) - confirm this is intended.
tndb = (
    tndb
    .sort_values("worldwide_gross", ascending=False)
    .drop_duplicates(subset="movie", keep="first")
    .dropna(subset=["production_budget", "worldwide_gross"])
)
tndb.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year,profit,roi,foreign_gross
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2009,2351345279,553.257713,2015837654
42,43,1997-12-19,Titanic,200000000,659363944,2208208395,1997,2008208395,1004.104198,1548844451
5,6,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220,2015,1747311220,571.016739,1116648995
6,7,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,2018,1748134200,582.7114,1369318718
33,34,2015-06-12,Jurassic World,215000000,652270625,1648854864,2015,1433854864,666.909239,996584239


In [7]:
## FINAL CHECK OF CLEANED DATA:
# Preview the first five rows of the cleaned frame.
tndb.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year,profit,roi,foreign_gross
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2009,2351345279,553.257713,2015837654
42,43,1997-12-19,Titanic,200000000,659363944,2208208395,1997,2008208395,1004.104198,1548844451
5,6,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220,2015,1747311220,571.016739,1116648995
6,7,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,2018,1748134200,582.7114,1369318718
33,34,2015-06-12,Jurassic World,215000000,652270625,1648854864,2015,1433854864,666.909239,996584239


In [8]:
# Count the missing values remaining in each column.
tndb.isna().sum()

id                   0
release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
release_year         0
profit               0
roi                  0
foreign_gross        0
dtype: int64

In [9]:
## Save the cleaned data:
# Write the frame to CSV without the index column.
clean_output_path = 'tn_budgets_clean.csv'
tndb.to_csv(clean_output_path, index=False)