In [2]:
import pandas as pd
import json
import sqlite3

# --- STEP 1: LOAD JSON DATA ---
# The file is read line by line; each non-empty line is expected to hold one
# JSON value, possibly wrapped in array punctuation (a leading '[' and/or a
# trailing ',' or ']'), which is stripped before parsing.
items = []
skipped_lines = []  # 1-based numbers of lines that could not be parsed
with open('IMDB_output.json', 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        # Handling Scrapy's potential formatting quirks (extra brackets/commas)
        clean_line = line.strip().lstrip('[').rstrip(',').rstrip(']')
        if not clean_line:
            continue
        try:
            items.append(json.loads(clean_line))
        except json.JSONDecodeError:
            # Still best-effort (the bad line is dropped), but remember it so
            # the loss is reported instead of passing unnoticed.
            skipped_lines.append(line_no)

if skipped_lines:
    print(
        f"Warning: skipped {len(skipped_lines)} unparseable line(s) in IMDB_output.json "
        f"(first few line numbers: {skipped_lines[:5]})"
    )

df = pd.DataFrame(items)

# --- STEP 2: DATA CLEANING (Fixing the "None" and List Issues) ---
# Clean strings to ensure perfect matches during joins
def clean_val(val):
    """Normalise a raw cell value so names compare equal during joins.

    Parameters
    ----------
    val : list or scalar
        A list of values, a single value, or a missing value (None/NaN).

    Returns
    -------
    list of str, str, or None
        * list input  -> list of stripped strings; falsy entries and entries
          that are blank after stripping are dropped.
        * scalar input -> the stripped string, or None when the value is
          missing or blank after stripping.
    """
    if isinstance(val, list):
        # Strip first, then filter: filtering on the raw value alone would let
        # whitespace-only entries through as empty strings.
        stripped = (str(item).strip() for item in val if item)
        return [text for text in stripped if text]
    if pd.isnull(val):
        return None
    text = str(val).strip()
    return text or None

# Run the shared cleaner over each people column (director, stars, writer)
for people_col in ('director', 'stars', 'writer'):
    df[people_col] = df[people_col].apply(clean_val)

# Flatten genres lists into comma-separated text so the column can be written
# to SQLite; anything that is not a list is passed through unchanged
df['genres'] = df['genres'].apply(
    lambda genres: genres if not isinstance(genres, list) else ", ".join(genres)
)

# --- STEP 3: CREATE TABLES MATCHING YOUR ERD ---
# 1. Master Person Table (Unique list of all humans)
# Every role column is exploded so a list-valued cell contributes one name per
# row (explode leaves scalar cells as they are). dropna() removes None as well
# as the NaN that explode() produces for empty lists -- NaN is truthy, so the
# `if name` test below would otherwise let it through as a person.
all_names = pd.concat(
    [df['stars'].explode(), df['director'].explode(), df['writer'].explode()]
).dropna().unique()
# Also exclude empty strings and the literal text "none" (any casing)
filtered_names = [name for name in all_names if name and str(name).lower() != 'none']

# person_id is a 1-based surrogate key assigned in first-seen order
df_person = pd.DataFrame({
    'FullName': filtered_names,
    'person_id': range(1, len(filtered_names) + 1)
})

# 2. Junction Table Logic (Linking People to Movies)
def create_junction(df, source_col, person_df):
    """Build a movie-to-person link table for one role column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'movie_id' and `source_col`. Cells of `source_col` may be
        a list of names, a single name, or missing.
    source_col : str
        Column holding the names for this role (e.g. 'stars').
    person_df : pandas.DataFrame
        Lookup table with 'FullName' and 'person_id' columns.

    Returns
    -------
    pandas.DataFrame
        Columns ['movie_id', 'person_id'], one row per distinct pair. Names
        with no match in `person_df` are dropped (inner merge).
    """
    # One row per (movie, name); missing names and empty lists are discarded
    exploded = df[['movie_id', source_col]].explode(source_col).dropna(subset=[source_col])
    links = exploded.merge(person_df, left_on=source_col, right_on='FullName')[['movie_id', 'person_id']]
    # Repeated movie rows or a name listed twice would otherwise yield duplicate
    # links, which a junction table should not contain.
    return links.drop_duplicates().reset_index(drop=True)

# One link table per role column, built in the order stars, director, writer
df_movie_cast, df_directors, df_writers = [
    create_junction(df, role_col, df_person)
    for role_col in ('stars', 'director', 'writer')
]

# 3. Core Movie & Financial Tables
# Both tables are taken from rows that have a title, keeping the first row per movie_id
has_title = df['title'].notnull()
movie_cols = ['movie_id', 'title', 'release_date', 'rating', 'duration', 'genres']
sales_cols = ['movie_id', 'budget', 'grossworldwide', 'openingweekend']
df_movie = df.loc[has_title, movie_cols].drop_duplicates('movie_id')
df_sales = df.loc[has_title, sales_cols].drop_duplicates('movie_id')

# 4. Review Tables (Renamed to match ERD)
# Review rows are the ones that have a reviewer name
review_cols = ['movie_id', 'reviewer_name', 'review_date', 'review_score', 'review_text']
erd_column_names = {'review_score': 'User_score', 'review_date': 'DateP', 'review_text': 'summary'}
df_reviews = df.loc[df['reviewer_name'].notnull(), review_cols].rename(columns=erd_column_names)

# --- STEP 4: EXPORT TO SQLITE ---
# Table name -> DataFrame, written in this order. Existing tables are replaced,
# so re-running the cell rebuilds the database contents.
tables_to_write = {
    'Movie': df_movie,
    'Sales': df_sales,
    'Person': df_person,
    'MovieCast': df_movie_cast,
    'Director': df_directors,
    'Writer': df_writers,
    'UserReview': df_reviews,
}

conn = sqlite3.connect('imdb_project.db')
try:
    for table_name, table_df in tables_to_write.items():
        table_df.to_sql(table_name, conn, if_exists='replace', index=False)
finally:
    # Close the connection even when a write fails, so the database file
    # handle is not left open in the kernel.
    conn.close()

print("Relational database 'imdb_project.db' created successfully with 7 tables!")

Relational database 'imdb_project.db' created successfully with 7 tables!


In [None]:
# Connect to your database
conn = sqlite3.connect('imdb_project.db')

# try/finally guarantees the connection is closed even if a query or display fails
try:
    # 1. See all tables in the database
    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    print("Database Tables:\n", tables)

    # 2. View the first 5 rows of the Movie table
    df_movie_check = pd.read_sql("SELECT * FROM Movie LIMIT 5", conn)
    display(df_movie_check)

    # 3. Test a JOIN between Movie and Director
    # Resolves each Director link to the person's name via the Person table
    query = """
SELECT m.title, p.FullName as Director
FROM Movie m
JOIN Director d ON m.movie_id = d.movie_id
JOIN Person p ON d.person_id = p.person_id
LIMIT 10;
"""
    df_join_check = pd.read_sql(query, conn)
    display(df_join_check)
finally:
    conn.close()

Database Tables:
          name
0       Movie
1       Sales
2      Person
3   MovieCast
4    Director
5      Writer
6  UserReview


Unnamed: 0,movie_id,title,release_date,rating,duration,genres
0,tt32642706,The Rip,"January 16, 2026",6.9,1h 53m,"Action, Crime, Drama"
1,tt32141377,28 Years Later: The Bone Temple,"January 16, 2026",7.8,1h 49m,"Horror, Sci-Fi"
2,tt27543632,The Housemaid,"December 19, 2025",7.0,2h 11m,"Drama, Thriller"
3,tt30144839,One Battle After Another,"September 26, 2025",7.8,2h 41m,"Action, Crime, Drama"
4,tt22740896,People We Meet on Vacation,"January 9, 2026",6.7,1h 57m,Romance


Unnamed: 0,title,Director
0,The Rip,Joe Carnahan
1,28 Years Later: The Bone Temple,Nia DaCosta
2,The Housemaid,Paul Feig
3,One Battle After Another,Paul Thomas Anderson
4,People We Meet on Vacation,Brett Haley
5,Marty Supreme,Josh Safdie
6,Hamnet,Chloé Zhao
7,Avatar: Fire and Ash,James Cameron
8,Predator: Badlands,Dan Trachtenberg
9,Zootopia 2,Jared Bush


In [4]:
import sqlite3
import pandas as pd

# Connect to the finalized database
conn = sqlite3.connect('imdb_project.db')

# List of all tables from your ERD
tables = ['Movie', 'Sales', 'Person', 'Director', 'Writer', 'MovieCast', 'UserReview']

print("--- FULL DATABASE VERIFICATION ---")
# try/finally guarantees the connection is closed even if a table is missing
# or a display call fails part-way through the loop
try:
    for table in tables:
        print(f"\nTable: {table}")
        # Read the first 3 rows of the current table.
        # The table name comes from the fixed list above, not from user input.
        df_check = pd.read_sql(f"SELECT * FROM {table} LIMIT 3", conn)
        display(df_check)
finally:
    conn.close()

--- FULL DATABASE VERIFICATION ---

Table: Movie


Unnamed: 0,movie_id,title,release_date,rating,duration,genres
0,tt32642706,The Rip,"January 16, 2026",6.9,1h 53m,"Action, Crime, Drama"
1,tt32141377,28 Years Later: The Bone Temple,"January 16, 2026",7.8,1h 49m,"Horror, Sci-Fi"
2,tt27543632,The Housemaid,"December 19, 2025",7.0,2h 11m,"Drama, Thriller"



Table: Sales


Unnamed: 0,movie_id,budget,grossworldwide,openingweekend
0,tt32642706,,,
1,tt32141377,"$63,000,000 (estimated)","$31,200,000","$13,000,000"
2,tt27543632,"$35,000,000 (estimated)","$177,509,730","$19,010,430"



Table: Person


Unnamed: 0,FullName,person_id
0,Kyle Chandler,1
1,Jayson Merrill,2
2,Angel Rosario Jr.,3



Table: Director


Unnamed: 0,movie_id,person_id
0,tt32642706,766
1,tt32141377,767
2,tt27543632,768



Table: Writer


Unnamed: 0,movie_id,person_id
0,tt32642706,766
1,tt32642706,797
2,tt32141377,798



Table: MovieCast


Unnamed: 0,movie_id,person_id
0,tt32642706,1
1,tt32642706,2
2,tt32642706,3



Table: UserReview


Unnamed: 0,movie_id,reviewer_name,DateP,User_score,summary
0,tt32642706,AshleyO-699,"Jan 15, 2026",7,"I almost didn't want to watch this picture, su..."
1,tt32642706,adamcomito,"Jan 17, 2026",6,No doubt Ben and Matt had fun making this film...
2,tt32642706,Fredolow,"Jan 15, 2026",6,"The Rip has the bones of an interesting story,..."


In [5]:
# Check total unique names from your original data.
# Count only real names: explode every role column, drop missing values
# (including the NaN that explode() yields for empty lists, which is truthy and
# would otherwise be counted), and apply the same empty / "none" exclusion that
# the Person table's name filter uses.
raw_names = pd.concat(
    [df['stars'].explode(), df['director'].explode(), df['writer'].explode()]
).dropna().unique()
raw_person_count = sum(1 for n in raw_names if n and str(n).lower() != 'none')
print(f"Total unique people found in raw data: {raw_person_count}")

# Check total rows in your new Person table
print(f"Total people stored in Person table: {len(df_person)}")

Total unique people found in raw data: 826
Total people stored in Person table: 826
