# **Movie Predictions Part 3**

**Name:** **Derek Overton**

**Date:** **3/12/2023**

**Project: Movie Predictions**

# **Imports**

In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import pymysql
pymysql.install_as_MySQLdb()


# Additional Imports
import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus
from scipy import stats

# **Loading Data**

In [2]:
with open('/Users/talen/.secret/tmbd_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key'])

In [3]:
tmdb.API_KEY =  login['api-key']

In [4]:
# Load JSON file into pandas dataframe
with open('Data/tmdb_api_results_2000.json', 'r') as file:
    data = json.load(file)

df = pd.json_normalize(data)

# Save dataframe as compressed CSV file
with gzip.open('Data/tmdb_api_results_2000.csv.gz', 'wt', encoding='utf8') as file:
    df.to_csv(file, index=False)

In [5]:
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'basics.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'ratings.csv.gz',
 'tmdb_api_results_2000.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [6]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv('Data/basics.csv.gz')

In [7]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [8]:
ratings = pd.read_csv('Data/ratings.csv.gz')

In [9]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1953
1,tt0000002,5.8,263
2,tt0000005,6.2,2589
3,tt0000006,5.1,177
4,tt0000007,5.4,812


# **Creating MYSQL Database**

In [10]:
# Create connection string using credentials following this format
# connection = "dialect+driver://username:password@host:port/database"
connection_str = "mysql+pymysql://root:Password@localhost/Movie"

In [11]:
# Create the engine:
engine = create_engine(connection_str)

In [12]:
# Check if the database exists. If not, create it.
if database_exists(connection_str) == False:
  create_database(connection_str)
else:
  print('The database already exists')

The database already exists


In [13]:
# Check for database existance:
database_exists(connection_str)

True

# **Making Dataframe tables in MySQL Database**

In [16]:
## Saving dataframes to database
basics.to_sql('title_basics', engine, index=False, if_exists = 'replace')
basics.to_sql('title_ratings', engine, index=False, if_exists = 'replace')
basics.to_sql('title_genres', engine, index=False, if_exists = 'replace')
basics.to_sql('genres',engine, index=False, if_exists = 'replace')
basics.to_sql('tmdb_data',engine, index=False, if_exists = 'replace')

85575

In [18]:
## Checking if tables created
q= '''SHOW TABLES;'''
pd.read_sql(q,engine)

Unnamed: 0,Tables_in_movie
0,genres
1,title_basics
2,title_genres
3,title_ratings
4,tmdb_data
