In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import math 
import sys

In [2]:
csv_files = glob("./zippedData/*.csv.gz")
csv_files

['./zippedData\\bom.movie_gross.csv.gz',
 './zippedData\\imdb.name.basics.csv.gz',
 './zippedData\\imdb.title.akas.csv.gz',
 './zippedData\\imdb.title.basics.csv.gz',
 './zippedData\\imdb.title.crew.csv.gz',
 './zippedData\\imdb.title.principals.csv.gz',
 './zippedData\\imdb.title.ratings.csv.gz',
 './zippedData\\tmdb.movies.csv.gz',
 './zippedData\\tn.movie_budgets.csv.gz']

In [3]:
csv_files_dict = {}
for filename in csv_files:
    filename_cleaned = os.path.basename(filename).replace(".csv", "").replace(".", "_").replace("_gz","") # cleaning the filenames
    filename_df = pd.read_csv(filename, index_col=0)
    csv_files_dict[filename_cleaned] = filename_df

In [4]:
csv_files_dict.keys()

dict_keys(['bom_movie_gross', 'imdb_name_basics', 'imdb_title_akas', 'imdb_title_basics', 'imdb_title_crew', 'imdb_title_principals', 'imdb_title_ratings', 'tmdb_movies', 'tn_movie_budgets'])

In [5]:
bom_movie_gross = csv_files_dict['bom_movie_gross']
imdb_name_basics = csv_files_dict['imdb_name_basics']
imdb_title_akas = csv_files_dict['imdb_title_akas']
imdb_title_basics = csv_files_dict['imdb_title_basics']
imdb_title_crew = csv_files_dict['imdb_title_crew']
imdb_title_principals = csv_files_dict['imdb_title_principals']
imdb_title_ratings = csv_files_dict['imdb_title_ratings']
tmdb_movies = csv_files_dict['tmdb_movies']
movie_budgets = csv_files_dict['tn_movie_budgets']

In [6]:
bom_movie_gross.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3387 entries, Toy Story 3 to An Actor Prepares
Data columns (total 4 columns):
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     2037 non-null object
year              3387 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 132.3+ KB


In [7]:
bom_movie_gross
#drop foreign_gross,use studio if need

Unnamed: 0_level_0,studio,domestic_gross,foreign_gross,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story 3,BV,415000000.0,652000000,2010
Alice in Wonderland (2010),BV,334200000.0,691300000,2010
Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
Inception,WB,292600000.0,535700000,2010
Shrek Forever After,P/DW,238700000.0,513900000,2010
...,...,...,...,...
The Quake,Magn.,6200.0,,2018
Edward II (2018 re-release),FM,4800.0,,2018
El Pacto,Sony,2500.0,,2018
The Swan,Synergetic,2400.0,,2018


In [8]:
print('Percentage of Null foreign_gross values:', len(bom_movie_gross[bom_movie_gross.foreign_gross.isna()])/ len(bom_movie_gross))

Percentage of Null foreign_gross values: 0.3985828166519043


We are going to remove the foreign_gross column as 40% of values are null

In [9]:
bom_movie_gross = bom_movie_gross.drop(['foreign_gross'], axis = 1)

In [10]:
bom_movie_gross.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3387 entries, Toy Story 3 to An Actor Prepares
Data columns (total 3 columns):
studio            3382 non-null object
domestic_gross    3359 non-null float64
year              3387 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 105.8+ KB


Still need to remove null values from other columns

In [11]:
bom_movie_gross = bom_movie_gross.dropna()
bom_movie_gross.isna().sum()

studio            0
domestic_gross    0
year              0
dtype: int64

In [12]:
bom_movie_gross.to_csv('bom_movie_gross_clean')

Null values removed! Data cleaned in bom_movie_gross_gz

In [13]:
movie_budgets.head() 

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [14]:
movie_budgets.sort_values(by='domestic_gross')

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
82,"Oct 26, 2012",Mientras duermes,"$5,000,000",$0,"$9,109,597"
9,"Nov 4, 2005",Wal-Mart: The High Cost of Low Price,"$1,500,000",$0,"$58,692"
8,"Jul 20, 2018",Teefa in Trouble,"$1,500,000",$0,"$98,806"
7,"Oct 17, 2014",Housebound,"$1,500,000",$0,"$236,863"
58,"Dec 31, 2008",Bathory,"$15,000,000",$0,"$3,436,763"
...,...,...,...,...,...
8,"Jun 13, 1997",Hercules,"$70,000,000","$99,112,101","$250,700,000"
74,"Jun 4, 1999",Desert Blue,"$5,000,000","$99,147","$99,147"
43,"Aug 3, 2018",Christopher Robin,"$75,000,000","$99,215,042","$197,504,758"
6,"Feb 11, 2011",Gnomeo and Juliet,"$36,000,000","$99,967,670","$193,737,977"


In [15]:
movie_budgets.worldwide_gross.value_counts(normalize= True)

$0              0.063473
$8,000,000      0.001557
$2,000,000      0.001038
$7,000,000      0.001038
$11,000,000     0.000692
                  ...   
$103,787,401    0.000173
$21,774,432     0.000173
$302,710,615    0.000173
$16,638,300     0.000173
$31,187,727     0.000173
Name: worldwide_gross, Length: 5356, dtype: float64

In [16]:
movie_budgets.production_budget.value_counts(normalize= True)

$20,000,000    0.039952
$10,000,000    0.036666
$30,000,000    0.030612
$15,000,000    0.029920
$25,000,000    0.029575
                 ...   
$379,000       0.000173
$245,000       0.000173
$98,000,000    0.000173
$640,000       0.000173
$19,100,000    0.000173
Name: production_budget, Length: 509, dtype: float64

In [17]:
movie_budgets.domestic_gross.value_counts(normalize= True)

$0             0.094777
$8,000,000     0.001557
$2,000,000     0.001211
$7,000,000     0.001211
$10,000,000    0.001038
                 ...   
$55,340,730    0.000173
$10,680,275    0.000173
$2,326,407     0.000173
$870,067       0.000173
$9,652,000     0.000173
Name: domestic_gross, Length: 5164, dtype: float64

In [18]:
movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 1 to 82
Data columns (total 5 columns):
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: object(5)
memory usage: 271.0+ KB


In [19]:
def convert_amt_to_int(df, col):
    df[col] = df[col].str.replace("$", "").str.replace(",", "").astype('int64')
    return df

In [20]:
money_cols = ['production_budget', 'domestic_gross', 'worldwide_gross']

for col in money_cols:
    movie_budgets = convert_amt_to_int(movie_budgets, col)

In [21]:
movie_budgets = movie_budgets[movie_budgets['domestic_gross'] > 0]

In [22]:
movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5234 entries, 1 to 82
Data columns (total 5 columns):
release_date         5234 non-null object
movie                5234 non-null object
production_budget    5234 non-null int64
domestic_gross       5234 non-null int64
worldwide_gross      5234 non-null int64
dtypes: int64(3), object(2)
memory usage: 245.3+ KB


In [23]:
movie_budgets.to_csv('movie_budgets_clean.csv')

In [24]:
imdb_name_basics
#contains actors, directors etc

Unnamed: 0_level_0,primary_name,birth_year,death_year,primary_profession,known_for_titles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"
...,...,...,...,...,...
nm9990381,Susan Grobes,,,actress,
nm9990690,Joo Yeon So,,,actress,"tt9090932,tt8737130"
nm9991320,Madeline Smith,,,actress,"tt8734436,tt9615610"
nm9991786,Michelle Modigliani,,,producer,


In [80]:
line_1 =imdb_name_basics.iloc[0]

In [83]:
def print_line(row):
    list_of_rows=[]
    for title_id in row['known_for_titles']:
        list_of_rows.append((row[['nconst', 'primary_profession', 'known_for_title']]))
        


KeyError: "['known_for_title', 'nconst'] not in index"

In [25]:
imdb_name_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 606648 entries, nm0061671 to nm9993380
Data columns (total 5 columns):
primary_name          606648 non-null object
birth_year            82736 non-null float64
death_year            6783 non-null float64
primary_profession    555308 non-null object
known_for_titles      576444 non-null object
dtypes: float64(2), object(3)
memory usage: 27.8+ MB


In [26]:
print('Percentage of Null birth_year Values:', len(imdb_name_basics[imdb_name_basics.birth_year.isna()])/ len(imdb_name_basics))

Percentage of Null birth_year Values: 0.8636177816460286


In [27]:
print('Percentage of Null death_year Values:', len(imdb_name_basics[imdb_name_basics.death_year.isna()])/ len(imdb_name_basics))

Percentage of Null death_year Values: 0.9888188867349764


In [28]:
imdb_name_basics = imdb_name_basics.drop('birth_year', axis = 1)

In [29]:
imdb_name_basics = imdb_name_basics.drop('death_year', axis = 1)

In [30]:
imdb_name_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 606648 entries, nm0061671 to nm9993380
Data columns (total 3 columns):
primary_name          606648 non-null object
primary_profession    555308 non-null object
known_for_titles      576444 non-null object
dtypes: object(3)
memory usage: 18.5+ MB


In [31]:
imdb_name_basics = imdb_name_basics.dropna()
imdb_name_basics.isna().sum()

primary_name          0
primary_profession    0
known_for_titles      0
dtype: int64

In [32]:
imdb_name_basics.to_csv('imdb_name_basics_clean.csv')

In [33]:
imdb_title_akas.sort_values(by='region')

Unnamed: 0_level_0,ordering,title,region,language,types,attributes,is_original_title
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt3156000,3,Nick,AD,,,,0.0
tt6079382,1,Impacto,AD,,,,0.0
tt5162282,1,73',AD,,,,0.0
tt1811329,2,"Teta, Alf Marra",AE,,,,0.0
tt7480896,1,Desert Dream,AE,,,,0.0
...,...,...,...,...,...,...,...
tt9723084,2,Anderswo. Allein in Afrika,,,original,,1.0
tt9726638,2,Monkey King: The Volcano,,,original,,1.0
tt9755806,3,Big Shark,,,original,,1.0
tt9827784,2,Sayonara kuchibiru,,,original,,1.0


In [34]:
imdb_title_akas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 331703 entries, tt0369610 to tt9880178
Data columns (total 7 columns):
ordering             331703 non-null int64
title                331703 non-null object
region               278410 non-null object
language             41715 non-null object
types                168447 non-null object
attributes           14925 non-null object
is_original_title    331678 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 20.2+ MB


In [35]:
print('Percentage of Null death_year Values:', len(imdb_title_akas[imdb_title_akas.language.isna()])/ len(imdb_title_akas))

Percentage of Null death_year Values: 0.87423990738703


In [36]:
print('Percentage of Null death_year Values:', len(imdb_title_akas[imdb_title_akas.attributes.isna()])/ len(imdb_title_akas))

Percentage of Null death_year Values: 0.9550049291082685


In [37]:
print('Percentage of Null death_year Values:', len(imdb_title_akas[imdb_title_akas.types.isna()])/ len(imdb_title_akas))

Percentage of Null death_year Values: 0.49217522904526034


In [38]:
imdb_title_akas = imdb_title_akas.drop('types', axis = 1)

In [39]:
imdb_title_akas = imdb_title_akas.drop('attributes', axis = 1)

In [40]:
imdb_title_akas = imdb_title_akas.drop('language', axis = 1)

In [41]:
imdb_title_akas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 331703 entries, tt0369610 to tt9880178
Data columns (total 4 columns):
ordering             331703 non-null int64
title                331703 non-null object
region               278410 non-null object
is_original_title    331678 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 12.7+ MB


In [42]:
imdb_title_akas['is_original_title'] = imdb_title_akas['is_original_title'].fillna(value = 0)

In [43]:
imdb_title_akas['region'] = imdb_title_akas['region'].fillna(value= 'no region')

In [44]:
imdb_title_akas.to_csv('imdb_title_akas_clean.csv')

In [45]:
imdb_title_crew

Unnamed: 0_level_0,directors,writers
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0285252,nm0899854,nm0899854
tt0438973,,"nm0175726,nm1802864"
tt0462036,nm1940585,nm1940585
tt0835418,nm0151540,"nm0310087,nm0841532"
tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943
...,...,...
tt8999974,nm10122357,nm10122357
tt9001390,nm6711477,nm6711477
tt9001494,"nm10123242,nm10123248",
tt9004986,nm4993825,nm4993825


In [46]:
imdb_title_crew.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146144 entries, tt0285252 to tt9010172
Data columns (total 2 columns):
directors    140417 non-null object
writers      110261 non-null object
dtypes: object(2)
memory usage: 3.3+ MB


In [47]:
imdb_title_crew.isna().sum()

directors     5727
writers      35883
dtype: int64

In [48]:
imdb_title_crew['writers'] = imdb_title_crew['writers'].fillna(value= 'unknown')
imdb_title_crew['directors'] = imdb_title_crew['directors'].fillna(value= 'unknown')

In [49]:
imdb_title_crew.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146144 entries, tt0285252 to tt9010172
Data columns (total 2 columns):
directors    146144 non-null object
writers      146144 non-null object
dtypes: object(2)
memory usage: 3.3+ MB


In [50]:
imdb_title_crew.to_csv('imdb_title_crew_clean.csv')

In [51]:
imdb_title_principals.head()

Unnamed: 0_level_0,ordering,nconst,category,job,characters
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0111414,1,nm0246005,actor,,"[""The Man""]"
tt0111414,2,nm0398271,director,,
tt0111414,3,nm3739909,producer,producer,
tt0323808,10,nm0059247,editor,,
tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [52]:
imdb_title_principals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1028186 entries, tt0111414 to tt9692684
Data columns (total 5 columns):
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
job           177684 non-null object
characters    393360 non-null object
dtypes: int64(1), object(4)
memory usage: 47.1+ MB


In [53]:
imdb_title_principals = imdb_title_principals.drop('job', axis = 1)
# Dropped job because it was redundent of catagory

In [54]:
imdb_title_principals['characters'] = imdb_title_principals['characters'].fillna(value= 'not applicable')

In [55]:
imdb_title_principals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1028186 entries, tt0111414 to tt9692684
Data columns (total 4 columns):
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
characters    1028186 non-null object
dtypes: int64(1), object(3)
memory usage: 39.2+ MB


In [56]:
imdb_title_principals.to_csv('imdb_title_principals_clean.csv')

In [57]:
imdb_title_ratings

Unnamed: 0_level_0,averagerating,numvotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt10356526,8.3,31
tt10384606,8.9,559
tt1042974,6.4,20
tt1043726,4.2,50352
tt1060240,6.5,21
...,...,...
tt9805820,8.1,25
tt9844256,7.5,24
tt9851050,4.7,14
tt9886934,7.0,5


In [58]:
imdb_title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73856 entries, tt10356526 to tt9894098
Data columns (total 2 columns):
averagerating    73856 non-null float64
numvotes         73856 non-null int64
dtypes: float64(1), int64(1)
memory usage: 1.7+ MB


In [59]:
imdb_title_ratings.averagerating.value_counts(normalize= True)

7.0     0.030627
6.6     0.030478
7.2     0.030451
6.8     0.030316
6.5     0.030072
          ...   
9.6     0.000244
10.0    0.000217
9.8     0.000203
9.7     0.000162
9.9     0.000068
Name: averagerating, Length: 91, dtype: float64

In [60]:
imdb_title_ratings.numvotes.value_counts(normalize= True)

6        0.038927
5        0.036544
7        0.033525
8        0.029341
9        0.026118
           ...   
18286    0.000014
16289    0.000014
1958     0.000014
4007     0.000014
4098     0.000014
Name: numvotes, Length: 7349, dtype: float64

In [61]:
imdb_title_ratings.to_csv('imdb_title_ratings_clean.csv')

In [62]:
tmdb_movies

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...
26512,"[27, 18]",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,"[14, 28, 12]",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [63]:
tmdb_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26517 entries, 0 to 26516
Data columns (total 9 columns):
genre_ids            26517 non-null object
id                   26517 non-null int64
original_language    26517 non-null object
original_title       26517 non-null object
popularity           26517 non-null float64
release_date         26517 non-null object
title                26517 non-null object
vote_average         26517 non-null float64
vote_count           26517 non-null int64
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


In [64]:
list_of_values= []
def value_count(df):
    for header in df:
         list_of_values.append(df[header].value_counts(normalize= True))
    

In [65]:
value_count(tmdb_movies)
list_of_values

[[99]                        0.139533
 []                          0.093487
 [18]                        0.085530
 [35]                        0.062601
 [27]                        0.043180
                               ...   
 [53, 35, 28, 18, 80]        0.000038
 [12, 99, 10751]             0.000038
 [28, 12, 16, 878, 10752]    0.000038
 [9648, 18, 12, 53, 878]     0.000038
 [53, 80, 37, 28]            0.000038
 Name: genre_ids, Length: 2477, dtype: float64, 292086    0.000113
 463839    0.000113
 11976     0.000113
 391872    0.000113
 416572    0.000113
             ...   
 356987    0.000038
 350846    0.000038
 479871    0.000038
 500353    0.000038
 524288    0.000038
 Name: id, Length: 25497, dtype: float64, en    0.878342
 fr    0.019120
 es    0.017159
 ru    0.011238
 ja    0.009994
         ...   
 kk    0.000038
 mi    0.000038
 dz    0.000038
 bo    0.000038
 yi    0.000038
 Name: original_language, Length: 76, dtype: float64, Eden                              0.000264
 

In [66]:
tmdb_movies = tmdb_movies[tmdb_movies['vote_count'] > 10]

In [67]:
tmdb_movies.to_csv('tmdb_movies_clean.csv')

In [68]:
rt_movie = pd.read_csv('zippedData/rt.movie_info.tsv.gz', delimiter = '\t', encoding = 'unicode_escape')

In [69]:
rt_movie.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [70]:
rt_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
id              1560 non-null int64
synopsis        1498 non-null object
rating          1557 non-null object
genre           1552 non-null object
director        1361 non-null object
writer          1111 non-null object
theater_date    1201 non-null object
dvd_date        1201 non-null object
currency        340 non-null object
box_office      340 non-null object
runtime         1530 non-null object
studio          494 non-null object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [71]:
rt_movie = rt_movie.drop(['currency'], axis = 1)

In [72]:
rt_movie = rt_movie.drop(['box_office'], axis = 1)

In [73]:
rt_movie.to_csv('rt_movie_clean.csv')

In [74]:
movie_budgets.head() 

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [75]:
movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5234 entries, 1 to 82
Data columns (total 5 columns):
release_date         5234 non-null object
movie                5234 non-null object
production_budget    5234 non-null int64
domestic_gross       5234 non-null int64
worldwide_gross      5234 non-null int64
dtypes: int64(3), object(2)
memory usage: 245.3+ KB


In [76]:
franchises = pd.read_csv('Franchises.csv')

FileNotFoundError: [Errno 2] File b'Franchises.csv' does not exist: b'Franchises.csv'

In [None]:
franchises

In [None]:
franchises.info()

In [None]:
franchises = franchises.set_index('Rank')

In [None]:
franchises.to_csv('franchises_clean.csv')

In [None]:
extfranchises = pd.read_csv('extfranchises.csv', encoding='cp1252')

In [None]:
extfranchises

In [None]:
extfranchises.info()

In [None]:
extfranchises.to_csv('extfranchises_clean.csv')