## 0. Database Connection

In [1]:
import configparser
import os
from urllib.parse import quote

from sqlalchemy import create_engine

# Read connection settings from mysql.cfg (must be in the working directory).
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so check that list and fail with a clear message
# instead of a confusing KeyError on 'mysql' further down.
mysqlcfg = configparser.ConfigParser()
if not mysqlcfg.read("mysql.cfg"):
    raise FileNotFoundError("mysql.cfg not found or unreadable in the working directory")

user = mysqlcfg['mysql']['user']
passwd = mysqlcfg['mysql']['passwd']
host = mysqlcfg['mysql']['host']
# Optional 'port' key in the [mysql] section; falls back to the MySQL default
# that used to be hard-coded here.
port = mysqlcfg['mysql'].get('port', '3306')

# Build the connection string.
# The user name and password are percent-encoded so characters such as
# '@', ':', '/' or a space inside the credentials cannot be mistaken for URL
# delimiters. Credentials without such characters produce the same URL as before.
dburl = f"mysql+pymysql://{quote(user, safe='')}:{quote(passwd, safe='')}@{host}:{port}"
engine = create_engine(dburl, connect_args={"local_infile": 1})


# Connect to the database.
# The error is still printed for readability, but it is re-raised so that
# "Run All" stops here instead of failing later in cells that need the database.
try:
    connection = engine.connect()
except Exception as e:
    print("Connection failed:", e)
    raise
else:
    print("Successfully connected to the RDS MySQL instance!")

Successfully connected to the RDS MySQL instance!


In [2]:
# (Re)load the `sql` IPython extension so the %sql / %%sql magics used in the
# following cells are available.
%reload_ext sql

# Publish the connection string through the DATABASE_URL environment variable.
# Presumably the sql magic falls back to this variable when no connection is
# passed explicitly — TODO confirm against the extension's documentation.
# NOTE(review): dburl contains the plain-text password, so it becomes readable
# by anything that can inspect this process's environment.
os.environ['DATABASE_URL'] = dburl

# Smoke test: ask the server for its version to confirm the magic can connect.
%sql SELECT version();

1 rows affected.


version()
8.0.40


In [3]:
%sql SHOW DATABASES;

 * mysql+pymysql://admin:***@imdb-db.ccz8a4s84tqy.us-east-1.rds.amazonaws.com:3306
7 rows affected.


Database
imdb
information_schema
mysql
new_schema
performance_schema
rds-title-principals
sys


In [4]:
%%sql
USE imdb;
SHOW TABLES;

 * mysql+pymysql://admin:***@imdb-db.ccz8a4s84tqy.us-east-1.rds.amazonaws.com:3306
0 rows affected.
5 rows affected.


Tables_in_imdb
box_office
box_office_tconst
title_basics
title_principals
title_ratings


## 1. Create box_office table for imported data

Create Table for csv import

```

DROP TABLE IF EXISTS box_office;

CREATE TABLE IF NOT EXISTS 
    
    box_office (
        
        Year INT,
        
        Title VARCHAR(255),
        
        Gross VARCHAR(18),
        
        PRIMARY KEY (Title,Year)
    
    );
```

In [14]:
%%sql
DESCRIBE box_office;

 * mysql+pymysql://admin:***@imdb-db.ccz8a4s84tqy.us-east-1.rds.amazonaws.com:3306
3 rows affected.


Field,Type,Null,Key,Default,Extra
Year,int,NO,PRI,,
Title,varchar(255),NO,PRI,,
Gross,varchar(18),YES,,,


## 2. Clean the CSV dataset imported from Kaggle

Cleaning CSV

sample record:
Year | Title | Gross

1999 | A Title with, a comma | "$100,100,000"

After cleaning:

1999 | A Title with, a comma | 100100000

Convert to a TSV file to accommodate commas in titles

In [29]:
import csv

def _is_currency_amount(text):
    """Return True when ``text`` is a dollar amount such as ``$1,234`` or ``$9.50``.

    Only a leading ``$`` followed by digits, thousands separators and at most
    one decimal point qualifies, so a title that merely contains a ``$``
    (e.g. ``Ca$h, Inc.``) is not treated as money.
    """
    stripped = text.strip()
    if not stripped.startswith('$'):
        return False
    digits = stripped[1:].replace(',', '')
    # Allow a single decimal point; everything else must be a digit.
    return digits.replace('.', '', 1).isdigit()


def clean_csv_to_tsv(input_file, output_file):
    """Convert a comma-separated file into a tab-separated file for LOAD DATA.

    Every field that is a dollar amount (``"$100,100,000"``) is reduced to its
    bare digits (``100100000``); all other fields, including the header row and
    titles containing commas, quotes or ``$``, are copied through unchanged.
    Rows without any non-empty field (e.g. blank lines) are skipped.

    The output is shaped for the ``LOAD DATA ... FIELDS TERMINATED BY '\\t'
    LINES TERMINATED BY '\\n'`` statement used in this notebook:

    * lines end with ``\\n`` (csv.writer's default is ``\\r\\n``, which would
      leave a trailing carriage return in the last column);
    * fields are never wrapped in quotes, because that statement declares no
      ``ENCLOSED BY`` character and would load the quotes literally;
    * a tab, newline or backslash inside a field is backslash-escaped, which
      matches MySQL's default ``ESCAPED BY '\\\\'``.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the TSV file to create (overwritten if present).

    Raises:
        OSError: If either file cannot be opened.
        csv.Error: If the input is not parseable as CSV.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(
            outfile,
            delimiter='\t',
            lineterminator='\n',
            quoting=csv.QUOTE_NONE,
            quotechar=None,
            escapechar='\\',
        )

        for row in reader:
            # Blank lines come back as [] (or all-empty fields); writing them
            # would only produce junk records on import.
            if not any(row):
                continue
            cleaned_row = [
                col.replace('$', '').replace(',', '') if _is_currency_amount(col) else col
                for col in row
            ]
            writer.writerow(cleaned_row)

# Convert the raw box-office CSV export into the cleaned TSV used for the import
clean_csv_to_tsv(
    input_file='/home/jovyan/imdb-data-mining-pjt/boxoffice_data_2024.csv',
    output_file='/home/jovyan/imdb-data-mining-pjt/boxoffice_data_2024_cleaned.tsv',
)

## 3. Import the TSV file into a table

The code below was run on my local machine with the appropriate permissions.

```sql
%%sql
LOAD DATA LOCAL INFILE '/Users/nathanjh/Desktop/boxoffice_data_2024_cleaned.tsv'

INTO TABLE box_office

FIELDS TERMINATED BY '\t'

LINES TERMINATED BY '\n'

IGNORE 1 LINES;
```

In [11]:
%%sql
SELECT *
FROM box_office
ORDER BY CAST(Gross AS UNSIGNED) DESC
LIMIT 5;

 * mysql+pymysql://admin:***@imdb-db.ccz8a4s84tqy.us-east-1.rds.amazonaws.com:3306
5 rows affected.


Year,Title,Gross
2019,Avengers: Endgame,2799439100
2009,Avatar,2743577587
2022,Avatar: The Way of Water,2320250281
2015,Star Wars: Episode VII - The Force Awakens,2068223624
2018,Avengers: Infinity War,2048359754


## 4. Create a new table from the rows that match between box_office and title_basics

```
CREATE TABLE box_office_tconst AS

SELECT 
    
    box_office.Year,
    
    box_office.Title,
    
    box_office.Gross,
    
    title_basics.tconst

FROM box_office

INNER JOIN title_basics
    
    ON box_office.Title = title_basics.primaryTitle
    
    AND (title_basics.startYear = box_office.Year OR title_basics.endYear = box_office.Year)

WHERE title_basics.titleType = 'movie'

ORDER BY CAST(box_office.Gross AS UNSIGNED) DESC;
```

In [12]:
%%sql
SELECT *
FROM box_office_tconst
ORDER BY CAST(Gross AS UNSIGNED) DESC
LIMIT 5;

 * mysql+pymysql://admin:***@imdb-db.ccz8a4s84tqy.us-east-1.rds.amazonaws.com:3306
5 rows affected.


Year,Title,Gross,tconst
2009,Avatar,2743577587,tt0499549
2021,Spider-Man: No Way Home,1912233593,tt10872600
1997,Titanic,1843373318,tt0120338
2015,Jurassic World,1670400637,tt0369610
2012,The Avengers,1518812988,tt0848228
