# **Movie Predictions Part 4**

**Name:** **Derek Overton**

**Date:** **3/15/2023**

**Project: Movie Predictions Part 4**

# **Imports**

In [56]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import pymysql
pymysql.install_as_MySQLdb()
import scipy.stats as stats

# Additional Imports
import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from sqlalchemy.types import *
from urllib.parse import quote_plus
from scipy import stats
import glob

# **Loading Data**

In [2]:
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'akas.csv.gz',
 'basics.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'final_tmdb_data_2002.csv.gz',
 'final_tmdb_data_2003.csv.gz',
 'final_tmdb_data_2004.csv.gz',
 'final_tmdb_data_2005.csv.gz',
 'final_tmdb_data_2006.csv.gz',
 'final_tmdb_data_2007.csv.gz',
 'final_tmdb_data_2008.csv.gz',
 'final_tmdb_data_2009.csv.gz',
 'final_tmdb_data_2010.csv.gz',
 'final_tmdb_data_2011.csv.gz',
 'final_tmdb_data_2012.csv.gz',
 'final_tmdb_data_2013.csv.gz',
 'final_tmdb_data_2014.csv.gz',
 'final_tmdb_data_2015.csv.gz',
 'final_tmdb_data_2016.csv.gz',
 'final_tmdb_data_2017.csv.gz',
 'final_tmdb_data_2018.csv.gz',
 'final_tmdb_data_2019.csv.gz',
 'final_tmdb_data_2020.csv.gz',
 'ratings.csv.gz',
 'tmdb_api_results_2000.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_api_results_2002.json',
 'tmdb_api_results_2003.json',
 'tmdb_api_results_2004.json',
 'tmdb_api_results_2005.json',
 'tmdb_api_results_2006.json',
 '

In [3]:
# Import Basics
basics = pd.read_csv('Data/basics.csv.gz')

In [4]:
# Import Ratings
ratings = pd.read_csv('Data/ratings.csv.gz')

In [5]:
# Import Akas
akas = pd.read_csv('Data/akas.csv.gz')

In [6]:
# Import Tmdb
tmdb = pd.read_csv('Data/tmdb_results_combined.csv.gz')

# **Create Custom Functions**

In [7]:
def df_to_sql(df,primary=None):
    sql_schema = {key: None for key in df.columns}
    #Create schema to convert col.dtype to sql-types
    for col in df.columns:
       # print (f"{col} is type:{basics[col].dtype}")
        if df[col].dtype == "int64":
            sql_schema[col]=Integer()
        elif df[col].dtype == "float64":
            sql_schema[col]=Float()
        elif df[col].dtype == "object":
            sql_schema[col]=Text(df[col].fillna('').map(len).max()+1)
    if primary != None:
        #Change the primary key to type String(length=...)
        sql_schema[primary] = String(df[primary].fillna('').map(len).max()+1)
    return sql_schema

# **Creating MYSQL Database**

In [8]:
# Create connection string using credentials following this format
# connection = "dialect+driver://username:password@host:port/database"
connection_str = "mysql+pymysql://root:Biggykrs11!!@localhost/movie"

In [9]:
# Create the engine:
engine = create_engine(connection_str)

In [10]:
# Check if the database exists. If not, create it.
if database_exists(connection_str) == False:
  create_database(connection_str)
else:
  print('The database already exists')

The database already exists


In [11]:
# Check for database existance:
database_exists(connection_str)

True

# **Making Dataframe tables in MySQL Database**

## **Basics**

In [12]:
#Check the dtypes of your dataframe: (df.dtypes).
basics.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult             int64
startYear           int64
endYear           float64
runtimeMinutes      int64
genres             object
dtype: object

In [13]:
#Use custom function to convert to sql-ready
basics_schema = df_to_sql(basics,"tconst")
basics_schema

{'tconst': String(length=11),
 'titleType': Text(length=6),
 'primaryTitle': Text(length=243),
 'originalTitle': Text(length=243),
 'isAdult': Integer(),
 'startYear': Integer(),
 'endYear': Float(),
 'runtimeMinutes': Integer(),
 'genres': Text(length=30)}

In [14]:
# Save to sql with dtype and index=False
basics.to_sql('title_basics',engine,dtype=basics_schema,if_exists='replace',
              index=False)

86012

In [15]:
#Run the query to ADD PRIMARY KEY
engine.execute('ALTER TABLE title_basics ADD PRIMARY KEY (`tconst`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x18612f72e50>

In [16]:
#Query the table and show first 5 rows
q = '''
SELECT *
FROM title_basics
Limit 5;
'''
pd.read_sql_query(q, engine)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


## **Ratings**

In [17]:
ratings.columns

Index(['tconst', 'averageRating', 'numVotes'], dtype='object')

In [18]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1959
1,tt0000002,5.8,263
2,tt0000005,6.2,2596
3,tt0000006,5.1,177
4,tt0000007,5.4,815


In [19]:
ratings['tconst'].unique()

array(['tt0000001', 'tt0000002', 'tt0000005', ..., 'tt9916348',
       'tt9916362', 'tt9916428'], dtype=object)

In [20]:
#Create a ratings_id map by pairing the unique ratings with an incrementing integer
ratings_id = range(len(ratings['tconst'].unique()))
ratings_map = dict(zip(ratings['tconst'].unique(), ratings_id))
#Add ratings_id primary key column
ratings["id"] = ratings["tconst"].map(ratings_map)

In [21]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes,id
0,tt0000001,5.7,1959,0
1,tt0000002,5.8,263,1
2,tt0000005,6.2,2596,2
3,tt0000006,5.1,177,3
4,tt0000007,5.4,815,4


In [22]:
ratings_schema = df_to_sql(ratings)
ratings_schema

{'tconst': Text(length=11),
 'averageRating': Float(),
 'numVotes': Integer(),
 'id': Integer()}

In [23]:
# Save to sql with dtype and index=False
ratings.to_sql('title_ratings',engine,dtype=ratings_schema,if_exists='replace',
              index=False)

492147

In [24]:
#Run the query to ADD PRIMARY KEY
engine.execute('ALTER TABLE title_ratings ADD PRIMARY KEY (`id`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x18612fc28e0>

In [25]:
#Query the table and show first 5 rows
q = '''
SELECT *
FROM title_ratings
Limit 5;
'''
pd.read_sql_query(q, engine)

Unnamed: 0,tconst,averageRating,numVotes,id
0,tt0000001,5.7,1959,0
1,tt0000002,5.8,263,1
2,tt0000005,6.2,2596,2
3,tt0000006,5.1,177,3
4,tt0000007,5.4,815,4


## **Title_Genres**

In [26]:
## create a col with a list of genres
basics['genres_split'] = basics['genres'].str.split(',')
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance","[Comedy, Fantasy, Romance]"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama,[Drama]
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama,[Drama]
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi","[Comedy, Horror, Sci-Fi]"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama,[Drama]
...,...,...,...,...,...,...,...,...,...,...
86007,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama,[Drama]
86008,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy","[Comedy, Drama, Fantasy]"
86009,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama,[Drama]
86010,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller","[Action, Adventure, Thriller]"


In [27]:
exploded_genres = basics.explode('genres_split')
exploded_genres


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance",Comedy
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance",Fantasy
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance",Romance
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama,Drama
...,...,...,...,...,...,...,...,...,...,...
86010,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller",Action
86010,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller",Adventure
86010,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller",Thriller
86011,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History",Drama


In [28]:
unique_genres = sorted(exploded_genres['genres_split'].unique())

In [29]:
## Save just tconst and genres_split as new df
title_genres = exploded_genres[['tconst', 'genres_split']].copy()
title_genres.head()

Unnamed: 0,tconst,genres_split
0,tt0035423,Comedy
0,tt0035423,Fantasy
0,tt0035423,Romance
1,tt0062336,Drama
2,tt0069049,Drama


In [30]:
## Making the genre mapper dictionary
genre_ints = range(len(unique_genres))
genre_map = dict(zip(unique_genres, genre_ints))
genre_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Short': 20,
 'Sport': 21,
 'Talk-Show': 22,
 'Thriller': 23,
 'War': 24,
 'Western': 25}

In [31]:
## Make a dictionary with list of unique genres as the key and the new iteger id as values
genre_id_map = dict(zip(unique_genres, range(len(unique_genres))))
genre_id_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Short': 20,
 'Sport': 21,
 'Talk-Show': 22,
 'Thriller': 23,
 'War': 24,
 'Western': 25}

In [32]:
basics['genres_split'] = basics['genres_split'].apply(lambda x: tuple(x))

In [33]:
## make new integer genre_id and drop string genres
basics['genre_id'] = basics['genres_split'].map(genre_map)
basics = basics.drop(columns='genres_split')

In [34]:
## Manaully make dataframe with named cols from the .keyd and .values
genre_lookup = pd.DataFrame ({'Genre_Name': genre_id_map.keys(),
                             'genre_ID':genre_id_map.values()})
genre_lookup.head()

Unnamed: 0,Genre_Name,genre_ID
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4


In [35]:
basics['int_index'] = range(len(basics))

In [36]:
## get max string length
max_str_len = basics['genres'].fillna('').map(len).max()

In [37]:
## Calculate max string lengths for object columns
key_len = basics['tconst'].fillna('').map(len).max()
title_len = basics['primaryTitle'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
df_schema = {
    "tconst": String(key_len+1), 
    "primaryTitle": Text(title_len+1),
    'startYear':Float(),
    'endYear':Float(),
    'runtimeMinutes':Integer()}



In [38]:
# Save to sql with dtype and index=False
basics.to_sql('title_basics',engine,dtype=df_schema,if_exists='replace',index=False)


86012

In [39]:
engine.execute('ALTER TABLE title_basics ADD PRIMARY KEY (`tconst`);')


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x18614b58d00>

In [40]:
#Check the dtypes of your dataframe: (df.dtypes).
title_genres.columns

Index(['tconst', 'genres_split'], dtype='object')

In [41]:
#Use custom function to convert to sql-ready
title_genres_schema = df_to_sql(title_genres)
title_genres_schema

{'tconst': Text(length=11), 'genres_split': Text(length=11)}

In [42]:
# Save to sql with dtype and index=False
title_genres.to_sql('title_genres',engine,dtype=title_genres_schema,if_exists='replace',
              index=False)

160621

In [43]:
#Query the table and show first 5 rows
q = '''
SELECT *
FROM title_genres
Limit 5;
'''
pd.read_sql_query(q, engine)

Unnamed: 0,tconst,genres_split
0,tt0035423,Comedy
1,tt0035423,Fantasy
2,tt0035423,Romance
3,tt0062336,Drama
4,tt0069049,Drama


## **Genres**

In [44]:
genres = pd.DataFrame(basics)

In [45]:
genres.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres', 'genre_id',
       'int_index'],
      dtype='object')

In [46]:
#Use custom function to convert to sql-ready
genres_schema = df_to_sql(genres)
genres_schema

{'tconst': Text(length=11),
 'titleType': Text(length=6),
 'primaryTitle': Text(length=243),
 'originalTitle': Text(length=243),
 'isAdult': Integer(),
 'startYear': Integer(),
 'endYear': Float(),
 'runtimeMinutes': Integer(),
 'genres': Text(length=30),
 'genre_id': Float(),
 'int_index': Integer()}

In [47]:
# Save to sql with dtype and index=False
genres.to_sql('genres',engine,dtype=genres_schema,if_exists='replace',
              index=False)

86012

In [48]:
#Query the table and show first 5 rows
q = '''
SELECT *
FROM genres
Limit 5;
'''
pd.read_sql_query(q, engine)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genre_id,int_index
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance",,0
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama,,1
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama,,2
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi",,3
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama,,4


## **TMBD Data**

In [49]:
tmdb.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.115,2133.0,PG
1,tt0120467,0.0,/knok3mNReKqPTplnnqz7E4dd7mD.jpg,,120000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",,19085.0,en,Vulgar,...,14904.0,87.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everyone loves a clown... some more than others.,Vulgar,0.0,5.5,44.0,R
2,tt0120630,0.0,/t7aJ7hqsMLEoqBJk7yAXxcAVvcz.jpg,"{'id': 718551, 'name': 'Chicken Run Collection...",45000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,7443.0,en,Chicken Run,...,224834564.0,84.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,This ain't no chick flick. It's poultry in mot...,Chicken Run,0.0,6.74,4293.0,G
3,tt0120753,0.0,/3vUkQpnwbsFEQlMh9EORpvKJfpo.jpg,,8000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,318.0,en,The Million Dollar Hotel,...,105983.0,122.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everyone has something to hide.,The Million Dollar Hotel,0.0,5.9,270.0,R
4,tt0120755,0.0,/24DZfupDlhXeTchmcOkoGRhP5Vg.jpg,"{'id': 87359, 'name': 'Mission: Impossible Col...",125000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",,955.0,en,Mission: Impossible II,...,546388105.0,123.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Expect the impossible again.,Mission: Impossible II,0.0,6.112,5657.0,PG-13


In [50]:
tmdb.columns

Index(['imdb_id', 'adult', 'backdrop_path', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'certification'],
      dtype='object')

In [51]:
#You only need to keep the imdb_id, revenue, budget, and certification columns
tmdb_req = tmdb[["imdb_id","revenue","budget","certification"]]

In [52]:
#Use custom function to convert to sql-ready
tmdb_schema = df_to_sql(tmdb_req,"imdb_id")
tmdb_schema

{'imdb_id': String(length=11),
 'revenue': Float(),
 'budget': Float(),
 'certification': Text(length=6)}

In [53]:
# Save to sql with dtype and index=False
tmdb_req.to_sql('tmdb_data',engine,dtype=tmdb_schema,if_exists='replace',
              index=False)

1810

In [54]:
#Run the query to ADD PRIMARY KEY
engine.execute('ALTER TABLE tmdb_data ADD PRIMARY KEY (`imdb_id`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x18635fead00>

In [55]:
#Query the table and show first 5 rows
q = '''
SELECT *
FROM tmdb_data
Limit 5;
'''
pd.read_sql_query(q, engine)

Unnamed: 0,imdb_id,revenue,budget,certification
0,tt0035423,76019000.0,48000000.0,PG-13
1,tt0118589,5271670.0,22000000.0,PG-13
2,tt0118694,12855000.0,150000.0,PG
3,tt0120467,14904.0,120000.0,R
4,tt0120630,224835000.0,45000000.0,G


# **Movie Project Part Four**

## **Hypothesis Testing**

**The stakeholder's first question is: does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?**

- **Null Hypothesis:** The rating of a movie does not have a significant affect on how much revenue the movie generates. 

- **Alternate Hypothesis:** The rating of a movie does have a significant affect on how much revenue the movie generates. 


In [64]:
q = "Data/tmdb_results_combined.csv.gz"
tmdb_results_combined = glob.glob(q)
# Showing the first 5
tmdb_results_combined[:5]

['Data/tmdb_results_combined.csv.gz']

In [65]:
## Loading all files as df and appending to a list
df_list = []
for file in tmdb_results_combined:
    tmdb_df = pd.read_csv(file, index_col=0)
    df_list.append(tmdb_df)
    
## Concatenating the list of dfs into 1 combined
df_combined = pd.concat(df_list)
df_combined

Unnamed: 0_level_0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,"Hong Kong, 1962: Chow Mo-Wan and Su Li-Zhen mo...",...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.115,2133.0,PG
tt0120467,0.0,/knok3mNReKqPTplnnqz7E4dd7mD.jpg,,120000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",,19085.0,en,Vulgar,Vulgar is about a man who is a children's clow...,...,14904.0,87.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everyone loves a clown... some more than others.,Vulgar,0.0,5.500,44.0,R
tt0120630,0.0,/t7aJ7hqsMLEoqBJk7yAXxcAVvcz.jpg,"{'id': 718551, 'name': 'Chicken Run Collection...",45000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,7443.0,en,Chicken Run,Having been hopelessly repressed and facing ev...,...,224834564.0,84.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,This ain't no chick flick. It's poultry in mot...,Chicken Run,0.0,6.740,4293.0,G
tt0120753,0.0,/3vUkQpnwbsFEQlMh9EORpvKJfpo.jpg,,8000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,318.0,en,The Million Dollar Hotel,The Million Dollar Hotel starts with a jump fr...,...,105983.0,122.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everyone has something to hide.,The Million Dollar Hotel,0.0,5.900,270.0,R
tt0120755,0.0,/24DZfupDlhXeTchmcOkoGRhP5Vg.jpg,"{'id': 87359, 'name': 'Mission: Impossible Col...",125000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",,955.0,en,Mission: Impossible II,With computer genius Luther Stickell at his si...,...,546388105.0,123.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Expect the impossible again.,Mission: Impossible II,0.0,6.112,5657.0,PG-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9735462,0.0,/18diCYWCxODOpeJNV89hoPyobxj.jpg,,25900000.0,"[{'id': 10402, 'name': 'Music'}, {'id': 35, 'n...",,498402.0,fr,Aline,"A fictionalized biopic of Aline Dieu, a multit...",...,11131254.0,126.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,A fiction freely inspired by the life of Célin...,Aline,0.0,6.827,269.0,PG-13
tt9770150,0.0,/563sRDK3rZS31TXCdTY4lfcwrNK.jpg,,5000000.0,"[{'id': 18, 'name': 'Drama'}]",https://www.searchlightpictures.com/nomadland,581734.0,en,Nomadland,A woman in her sixties embarks on a journey th...,...,14784114.0,108.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,See you down the road.,Nomadland,0.0,7.261,2570.0,R
tt9839038,0.0,/gq9FvopuQeGs6TKljGghefC14uD.jpg,"{'id': 723457, 'name': 'Jim Button Collection'...",20000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",,616180.0,de,Jim Knopf und die Wilde 13,A year has gone by since Jim Button and his be...,...,6700000.0,110.0,"[{'english_name': 'German', 'iso_639_1': 'de',...",Released,,Jim Button and the Wild 13,0.0,6.200,30.0,
tt9877170,0.0,/3e46k8KPCs1N7luAqNDFZFzrbbE.jpg,,7907534.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 28, ...",,661043.0,hi,मलंग,"Advait visits Goa where he meets Sara, a free-...",...,11136444.0,135.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,Unleash The Madness,Malang,0.0,7.256,39.0,


In [66]:
## Loading and Concatenating the list of dfs with 1 line
df_combined = pd.concat([pd.read_csv(file, index_col=0) for file in tmdb_results_combined])
df_combined

Unnamed: 0_level_0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,"Hong Kong, 1962: Chow Mo-Wan and Su Li-Zhen mo...",...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.115,2133.0,PG
tt0120467,0.0,/knok3mNReKqPTplnnqz7E4dd7mD.jpg,,120000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",,19085.0,en,Vulgar,Vulgar is about a man who is a children's clow...,...,14904.0,87.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everyone loves a clown... some more than others.,Vulgar,0.0,5.500,44.0,R
tt0120630,0.0,/t7aJ7hqsMLEoqBJk7yAXxcAVvcz.jpg,"{'id': 718551, 'name': 'Chicken Run Collection...",45000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,7443.0,en,Chicken Run,Having been hopelessly repressed and facing ev...,...,224834564.0,84.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,This ain't no chick flick. It's poultry in mot...,Chicken Run,0.0,6.740,4293.0,G
tt0120753,0.0,/3vUkQpnwbsFEQlMh9EORpvKJfpo.jpg,,8000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,318.0,en,The Million Dollar Hotel,The Million Dollar Hotel starts with a jump fr...,...,105983.0,122.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everyone has something to hide.,The Million Dollar Hotel,0.0,5.900,270.0,R
tt0120755,0.0,/24DZfupDlhXeTchmcOkoGRhP5Vg.jpg,"{'id': 87359, 'name': 'Mission: Impossible Col...",125000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",,955.0,en,Mission: Impossible II,With computer genius Luther Stickell at his si...,...,546388105.0,123.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Expect the impossible again.,Mission: Impossible II,0.0,6.112,5657.0,PG-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9735462,0.0,/18diCYWCxODOpeJNV89hoPyobxj.jpg,,25900000.0,"[{'id': 10402, 'name': 'Music'}, {'id': 35, 'n...",,498402.0,fr,Aline,"A fictionalized biopic of Aline Dieu, a multit...",...,11131254.0,126.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,A fiction freely inspired by the life of Célin...,Aline,0.0,6.827,269.0,PG-13
tt9770150,0.0,/563sRDK3rZS31TXCdTY4lfcwrNK.jpg,,5000000.0,"[{'id': 18, 'name': 'Drama'}]",https://www.searchlightpictures.com/nomadland,581734.0,en,Nomadland,A woman in her sixties embarks on a journey th...,...,14784114.0,108.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,See you down the road.,Nomadland,0.0,7.261,2570.0,R
tt9839038,0.0,/gq9FvopuQeGs6TKljGghefC14uD.jpg,"{'id': 723457, 'name': 'Jim Button Collection'...",20000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",,616180.0,de,Jim Knopf und die Wilde 13,A year has gone by since Jim Button and his be...,...,6700000.0,110.0,"[{'english_name': 'German', 'iso_639_1': 'de',...",Released,,Jim Button and the Wild 13,0.0,6.200,30.0,
tt9877170,0.0,/3e46k8KPCs1N7luAqNDFZFzrbbE.jpg,,7907534.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 28, ...",,661043.0,hi,मलंग,"Advait visits Goa where he meets Sara, a free-...",...,11136444.0,135.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,Unleash The Madness,Malang,0.0,7.256,39.0,


In [67]:
## Saving the final combined dataframe
final_fname ='Data/tmdb_results_combined.csv.gz'
df_combined.to_csv(final_fname, compression='gzip', index=False)

In [68]:
df_combined = pd.read_csv(final_fname)
df_combined

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,"Hong Kong, 1962: Chow Mo-Wan and Su Li-Zhen mo...",...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.115,2133.0,PG
1,0.0,/knok3mNReKqPTplnnqz7E4dd7mD.jpg,,120000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",,19085.0,en,Vulgar,Vulgar is about a man who is a children's clow...,...,14904.0,87.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everyone loves a clown... some more than others.,Vulgar,0.0,5.500,44.0,R
2,0.0,/t7aJ7hqsMLEoqBJk7yAXxcAVvcz.jpg,"{'id': 718551, 'name': 'Chicken Run Collection...",45000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,7443.0,en,Chicken Run,Having been hopelessly repressed and facing ev...,...,224834564.0,84.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,This ain't no chick flick. It's poultry in mot...,Chicken Run,0.0,6.740,4293.0,G
3,0.0,/3vUkQpnwbsFEQlMh9EORpvKJfpo.jpg,,8000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,318.0,en,The Million Dollar Hotel,The Million Dollar Hotel starts with a jump fr...,...,105983.0,122.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Everyone has something to hide.,The Million Dollar Hotel,0.0,5.900,270.0,R
4,0.0,/24DZfupDlhXeTchmcOkoGRhP5Vg.jpg,"{'id': 87359, 'name': 'Mission: Impossible Col...",125000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",,955.0,en,Mission: Impossible II,With computer genius Luther Stickell at his si...,...,546388105.0,123.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Expect the impossible again.,Mission: Impossible II,0.0,6.112,5657.0,PG-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1805,0.0,/18diCYWCxODOpeJNV89hoPyobxj.jpg,,25900000.0,"[{'id': 10402, 'name': 'Music'}, {'id': 35, 'n...",,498402.0,fr,Aline,"A fictionalized biopic of Aline Dieu, a multit...",...,11131254.0,126.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,A fiction freely inspired by the life of Célin...,Aline,0.0,6.827,269.0,PG-13
1806,0.0,/563sRDK3rZS31TXCdTY4lfcwrNK.jpg,,5000000.0,"[{'id': 18, 'name': 'Drama'}]",https://www.searchlightpictures.com/nomadland,581734.0,en,Nomadland,A woman in her sixties embarks on a journey th...,...,14784114.0,108.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,See you down the road.,Nomadland,0.0,7.261,2570.0,R
1807,0.0,/gq9FvopuQeGs6TKljGghefC14uD.jpg,"{'id': 723457, 'name': 'Jim Button Collection'...",20000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",,616180.0,de,Jim Knopf und die Wilde 13,A year has gone by since Jim Button and his be...,...,6700000.0,110.0,"[{'english_name': 'German', 'iso_639_1': 'de',...",Released,,Jim Button and the Wild 13,0.0,6.200,30.0,
1808,0.0,/3e46k8KPCs1N7luAqNDFZFzrbbE.jpg,,7907534.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 28, ...",,661043.0,hi,मलंग,"Advait visits Goa where he meets Sara, a free-...",...,11136444.0,135.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,Unleash The Madness,Malang,0.0,7.256,39.0,
