In [1]:
## Pandas
import pandas as pd
## Numpy
import numpy as np

import json, os, time

import tmdbsimple as tmdb

from tqdm.notebook import tqdm_notebook

import gzip

In [2]:
# Load the yearly TMDB extracts (2000 and 2001), one DataFrame per file
tmdb_2000, tmdb_2001 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/final_tmdb_data_2000.csv.gz",
        "Data/final_tmdb_data_2001.csv.gz",
    )
)

In [3]:
# Stack the yearly frames row-wise into a single DataFrame.
# Each frame keeps its own row labels, so labels repeat in the result.
Files = [tmdb_2000, tmdb_2001]
combined = pd.concat(objs=Files, axis=0)

In [4]:
# Summarize the combined frame: entry count, columns, non-null counts and dtypes
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2568 entries, 0 to 1330
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2568 non-null   object 
 1   adult                  2566 non-null   float64
 2   backdrop_path          1397 non-null   object 
 3   belongs_to_collection  205 non-null    object 
 4   budget                 2566 non-null   float64
 5   genres                 2566 non-null   object 
 6   homepage               173 non-null    object 
 7   id                     2566 non-null   float64
 8   original_language      2566 non-null   object 
 9   original_title         2566 non-null   object 
 10  overview               2516 non-null   object 
 11  popularity             2566 non-null   float64
 12  poster_path            2308 non-null   object 
 13  production_companies   2566 non-null   object 
 14  production_countries   2566 non-null   object 
 15  rele

In [5]:
# Count the rows that are exact copies of an earlier row
duplicate_rows = combined.duplicated()
duplicate_rows.sum()

1

In [6]:
# Remove exact duplicate rows, keeping the first occurrence of each,
# then re-count to confirm that none remain.
combined = combined.drop_duplicates(keep="first")
remaining_duplicates = combined.duplicated().sum()
remaining_duplicates

0

In [7]:
# Flag movies that report a positive budget, a positive revenue, or both
has_budget = combined['budget'] > 0
has_revenue = combined['revenue'] > 0
financials_filtered = has_budget | has_revenue

# True = usable financial data, False = neither value is positive
financials_filtered.value_counts()

False    1930
True      637
dtype: int64

In [8]:
# Keep only the rows flagged by the budget/revenue mask
combined = combined.loc[financials_filtered]

## How many movies had at least some valid financial information?

In [9]:
# Re-inspect the frame after keeping only rows that passed the budget/revenue mask
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 637 entries, 1 to 1309
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                637 non-null    object 
 1   adult                  637 non-null    float64
 2   backdrop_path          540 non-null    object 
 3   belongs_to_collection  109 non-null    object 
 4   budget                 637 non-null    float64
 5   genres                 637 non-null    object 
 6   homepage               76 non-null     object 
 7   id                     637 non-null    float64
 8   original_language      637 non-null    object 
 9   original_title         637 non-null    object 
 10  overview               635 non-null    object 
 11  popularity             637 non-null    float64
 12  poster_path            618 non-null    object 
 13  production_companies   637 non-null    object 
 14  production_countries   637 non-null    object 
 15  relea

Exactly 637 movies had at least some valid financial information (a budget or revenue greater than 0).

## How many movies are there in each of the certification categories (G/PG/PG-13/R)?

In [10]:
# Number of movies per certification label. Rows with a missing certification
# are not counted (value_counts() drops NaN by default), so the totals can sum
# to fewer rows than the frame holds.
combined['certification'].value_counts()

R        232
PG-13    131
PG        35
NR        17
G         15
Name: certification, dtype: int64

## What is the average revenue per certification category?

In [11]:
# Mean revenue within each certification group
revenue_by_certification = combined.groupby('certification')['revenue']
Revenue_avg = revenue_by_certification.mean()
Revenue_avg

certification
G        1.173648e+08
NR       9.588674e+06
PG       1.106405e+08
PG-13    9.928786e+07
R        3.242712e+07
Name: revenue, dtype: float64

## What is the average budget per certification category?

In [12]:
# Mean budget within each certification group
budget_by_certification = combined.groupby('certification')['budget']
Budget_avg = budget_by_certification.mean()
Budget_avg

certification
G        3.813333e+07
NR       6.302358e+06
PG       4.482849e+07
PG-13    4.299357e+07
R        1.948407e+07
Name: budget, dtype: float64

In [13]:
# Saving combined data
#
# Write the de-duplicated, financially filtered frame as a gzip-compressed CSV
# without the index column. The directory must be spelled "Data/" (capital D),
# matching the paths the inputs were read from: on case-sensitive filesystems
# a lowercase "data/" is a different directory, and the write fails if it does
# not exist.

combined.to_csv("Data/tmdb_results_combined.csv.gz", compression='gzip', index=False)