# Movie Database - Metacritic + IMDB Integration

Building database tables from Metacritic (main source) + IMDB data.  
All MovieIDs use format: `slug-year` (e.g., `polite-society-2023`)

# STEP 1: IMPORT LIBRARIES

In [1]:
import pandas as pd
import json
import re


# STEP 2: LOAD DATA FILES

In [2]:
# Load Metacritic CSV files
mc_movies = pd.read_csv('movies_dataMC.csv')
mc_expert_reviews = pd.read_csv('expert_reviews_dataMC.csv')
mc_user_reviews = pd.read_csv('user_reviews_dataMC.csv')

# Remove header row if present
mc_movies = mc_movies[mc_movies['movie_id'] != 'movie']

print(f"Metacritic: {len(mc_movies)} movies, {len(mc_expert_reviews)} expert reviews, {len(mc_user_reviews)} user reviews")


Metacritic: 1411 movies, 17869 expert reviews, 16000 user reviews


# STEP 3: LOAD IMDB JSON FILE

In [3]:
# Load IMDB JSON file
with open('output3.json', 'r') as file:
    content = file.read()
    
# Parse JSON (handles both array format and line-by-line format)
imdb_records = []
try:
    imdb_records = json.loads(content)
except:
    for line in content.split('\n'):
        line = line.strip()
        if line and line not in ['[', ']', ',']:
            try:
                if line.endswith(','):
                    line = line[:-1]
                imdb_records.append(json.loads(line))
            except:
                pass

imdb_df = pd.DataFrame(imdb_records)

# Split movies and reviews (both are in same JSON file)
imdb_movies = imdb_df[imdb_df['title'].notna()].copy()
imdb_reviews = imdb_df[imdb_df['reviewer_name'].notna()].copy()

print(f"IMDB: {len(imdb_movies)} movies, {len(imdb_reviews)} user reviews")

IMDB: 1031 movies, 6423 user reviews


# STEP 4: HELPER FUNCTIONS

In [4]:
def normalize_title(title):
    """Clean title for matching (lowercase, no special chars, no articles)"""
    if pd.isna(title):
        return ""
    title = str(title).lower().strip()
    title = re.sub(r'\s*\(\d{4}\)\s*$', '', title)  # Remove (year)
    title = re.sub(r'^(the|a|an)\s+', '', title)  # Remove articles
    title = re.sub(r'[^\w\s]', '', title)  # Remove special chars
    return ' '.join(title.split())

def create_slug(title):
    """Convert title to URL slug (polite-society)"""
    if pd.isna(title):
        return ""
    slug = str(title).lower()
    slug = re.sub(r'[^a-z0-9]+', '-', slug)
    return slug.strip('-')

def extract_year(date_string):
    """Extract 4-digit year from date string"""
    if pd.isna(date_string):
        return None
    match = re.search(r'\b(19|20)\d{2}\b', str(date_string))
    return int(match.group()) if match else None

def parse_duration(duration_str):
    """Convert '2 h 28 m' to minutes (148)"""
    if pd.isna(duration_str):
        return None
    duration_str = str(duration_str)
    minutes = 0
    hours_match = re.search(r'(\d+)\s*h', duration_str)
    if hours_match:
        minutes += int(hours_match.group(1)) * 60
    mins_match = re.search(r'(\d+)\s*m', duration_str)
    if mins_match:
        minutes += int(mins_match.group(1))
    return minutes if minutes > 0 else None



# STEP 5: MATCH IMDB WITH METACRITIC

In [5]:
# Add normalized titles and years for matching
mc_movies['normalized_title'] = mc_movies['title'].apply(normalize_title)
mc_movies['year'] = mc_movies['release_date'].apply(extract_year)

imdb_movies['normalized_title'] = imdb_movies['title'].apply(normalize_title)
imdb_movies['year'] = imdb_movies['release_date'].apply(extract_year)

# Match by title + year
matches_year = pd.merge(
    imdb_movies[['movie_id', 'title', 'normalized_title', 'year']],
    mc_movies[['movie_id', 'title', 'normalized_title', 'year']],
    on=['normalized_title', 'year'],
    how='inner',
    suffixes=('_imdb', '_mc')
)

# Match by title only (for unmatched)
unmatched_imdb = imdb_movies[~imdb_movies['movie_id'].isin(matches_year['movie_id_imdb'])]
unmatched_mc = mc_movies[~mc_movies['movie_id'].isin(matches_year['movie_id_mc'])]

matches_title = pd.merge(
    unmatched_imdb[['movie_id', 'title', 'normalized_title']],
    unmatched_mc[['movie_id', 'title', 'normalized_title']],
    on='normalized_title',
    how='inner',
    suffixes=('_imdb', '_mc')
)

all_matches = pd.concat([matches_year, matches_title], ignore_index=True)

print(f"Matched: {len(all_matches)} movies (title+year: {len(matches_year)}, title only: {len(matches_title)})")
print(f"Unmatched IMDB: {len(imdb_movies) - len(all_matches)}")



Matched: 104 movies (title+year: 84, title only: 20)
Unmatched IMDB: 927


# STEP 6: CREATE MOVIE TABLE (slug-year format)

In [6]:
# Process Metacritic Movies
mc_movies['MovieID'] = mc_movies.apply(
    lambda row: f"{row['movie_id']}-{row['year']}" if pd.notna(row['year']) else row['movie_id'],
    axis=1
)
mc_movies['original_id'] = mc_movies['movie_id'].copy()

Movie_MC = mc_movies[[
    'MovieID', 'original_id', 'title', 'release_date', 'duration', 'rating', 'genres',
    'production_company', 'tagline', 'website', 'awards'
]].copy()
Movie_MC['duration'] = Movie_MC['duration'].apply(parse_duration)
Movie_MC['source'] = 'metacritic'

# Process Unmatched IMDB Movies
unmatched_imdb_ids = set(imdb_movies['movie_id']) - set(all_matches['movie_id_imdb'])
unmatched_imdb_movies = imdb_movies[imdb_movies['movie_id'].isin(unmatched_imdb_ids)].copy()

unmatched_imdb_movies['slug'] = unmatched_imdb_movies['title'].apply(create_slug)
unmatched_imdb_movies['MovieID'] = unmatched_imdb_movies.apply(
    lambda row: f"{row['slug']}-{row['year']}" if pd.notna(row['year']) else row['slug'],
    axis=1
)
unmatched_imdb_movies['original_id'] = unmatched_imdb_movies['movie_id'].copy()
unmatched_imdb_movies['duration'] = unmatched_imdb_movies['duration'].apply(parse_duration)

Movie_IMDB = pd.DataFrame({
    'MovieID': unmatched_imdb_movies['MovieID'],
    'original_id': unmatched_imdb_movies['original_id'],
    'title': unmatched_imdb_movies['title'],
    'release_date': unmatched_imdb_movies['release_date'],
    'duration': unmatched_imdb_movies['duration'],
    'rating': unmatched_imdb_movies['rating'],
    'genres': unmatched_imdb_movies['genres'],
    'production_company': unmatched_imdb_movies['production_company'],
    'tagline': None,
    'website': None,
    'awards': None,
    'source': 'imdb'
})

# Combine
Movie = pd.concat([Movie_MC, Movie_IMDB], ignore_index=True)

# *** CRITICAL: Create mapping dictionaries for later use ***
mc_to_new_id = dict(zip(Movie_MC['original_id'], Movie_MC['MovieID']))
imdb_to_new_id = dict(zip(Movie_IMDB['original_id'], Movie_IMDB['MovieID']))

print(f"Movie table: {len(Movie)} total ({len(Movie_MC)} MC + {len(Movie_IMDB)} IMDB)")
print(f"Mappings: mc_to_new_id ({len(mc_to_new_id)}), imdb_to_new_id ({len(imdb_to_new_id)})")
Movie[['MovieID', 'title', 'release_date', 'source']].head(10)


Movie table: 2339 total (1411 MC + 928 IMDB)
Mappings: mc_to_new_id (1411), imdb_to_new_id (928)


Unnamed: 0,MovieID,title,release_date,source
0,furiosa-a-mad-max-saga-2024,Furiosa: A Mad Max Saga,"May 24, 2024",metacritic
1,beyond-utopia-2023,Beyond Utopia,"Oct 23, 2023",metacritic
2,chicken-for-linda!-2024,Chicken for Linda!,"Apr 5, 2024",metacritic
3,youth-hard-times-2024,Youth (Hard Times),"Nov 1, 2024",metacritic
4,viet-and-nam-2025,Viet and Nam,"Mar 28, 2025",metacritic
5,the-girl-with-the-needle-2024,The Girl with the Needle,"Dec 6, 2024",metacritic
6,the-league-2023,The League,"Jul 7, 2023",metacritic
7,godland-2023,Godland,"Feb 3, 2023",metacritic
8,mountains-2024,Mountains,"Aug 16, 2024",metacritic
9,last-summer-2023-2024,Last Summer,"Jun 28, 2024",metacritic


# STEP 7: CREATE SALES TABLE

In [7]:
Sales = imdb_movies[['movie_id', 'budget', 'grossworldwide', 'openingweekend']].copy()

# Map using the dictionary from Step 6
Sales['MovieID'] = Sales['movie_id'].map(imdb_to_new_id)
Sales = Sales.dropna(subset=['MovieID'])

# Clean currency
for col in ['budget', 'grossworldwide', 'openingweekend']:
    Sales[col] = Sales[col].astype(str).str.replace('[\$,]', '', regex=True)
    Sales[col] = pd.to_numeric(Sales[col], errors='coerce')

Sales = Sales.rename(columns={
    'budget': 'Budget',
    'grossworldwide': 'GrossWorldwide', 
    'openingweekend': 'OpeningWeekend'
})
Sales = Sales[['MovieID', 'Budget', 'GrossWorldwide', 'OpeningWeekend']]

# Verify no orphans
Sales = Sales[Sales['MovieID'].isin(Movie['MovieID'])]

print(f"Sales table: {len(Sales)} records")
Sales.head()


Sales table: 928 records


Unnamed: 0,MovieID,Budget,GrossWorldwide,OpeningWeekend
2,little-dixie-2023,,,
23,outlast-2023,,,
40,duk-sit-dai-jong-2023,,14911562.0,
49,karate-ghost-2023,,,
56,pertsa-kilu-faaraon-sormus-2023,,,


# STEP 8: CREATE PERSON TABLE

In [8]:
# Combine all people from directors and writers
directors = mc_movies['director'].dropna().str.split(',').explode().str.strip()
writers = mc_movies['writer'].dropna().str.split(',').explode().str.strip()

Person = pd.DataFrame({'name': pd.concat([directors, writers]).unique()})
Person['PersonID'] = range(1, len(Person) + 1)
Person = Person[['PersonID', 'name']]

print(f"Person table: {len(Person)} people")
Person.head()



Person table: 2953 people


Unnamed: 0,PersonID,name
0,1,George Miller
1,2,Madeleine Gavin
2,3,Sébastien Laudenbach
3,4,Chiara Malta
4,5,Wang Bing


# STEP 9: CREATE DIRECTOR TABLE

In [9]:
directors_expanded = mc_movies[['MovieID', 'director']].copy()
directors_expanded = directors_expanded[directors_expanded['director'].notna()]
directors_expanded = directors_expanded.assign(
    director=directors_expanded['director'].str.split(',')
).explode('director')
directors_expanded['director'] = directors_expanded['director'].str.strip()

person_map = dict(zip(Person['name'], Person['PersonID']))
directors_expanded['PersonID'] = directors_expanded['director'].map(person_map)

Director = directors_expanded[['MovieID', 'PersonID']].dropna()
Director = Director.drop_duplicates()

print(f"Director table: {len(Director)} relationships")
Director.head()



Director table: 1577 relationships


Unnamed: 0,MovieID,PersonID
1,furiosa-a-mad-max-saga-2024,1
2,beyond-utopia-2023,2
3,chicken-for-linda!-2024,3
3,chicken-for-linda!-2024,4
4,youth-hard-times-2024,5


# STEP 10: CREATE WRITER TABLE

In [10]:
writers_expanded = mc_movies[['MovieID', 'writer']].copy()
writers_expanded = writers_expanded[writers_expanded['writer'].notna()]
writers_expanded = writers_expanded.assign(
    writer=writers_expanded['writer'].str.split(',')
).explode('writer')
writers_expanded['writer'] = writers_expanded['writer'].str.strip()

writers_expanded['PersonID'] = writers_expanded['writer'].map(person_map)

Writer = writers_expanded[['MovieID', 'PersonID']].dropna()
Writer = Writer.drop_duplicates()

print(f"Writer table: {len(Writer)} relationships")
Writer.head()


Writer table: 2758 relationships


Unnamed: 0,MovieID,PersonID
1,furiosa-a-mad-max-saga-2024,1
1,furiosa-a-mad-max-saga-2024,1425
2,beyond-utopia-2023,2
3,chicken-for-linda!-2024,4
3,chicken-for-linda!-2024,3


# STEP 11: CREATE EXPERT & EXPERTREVIEW TABLES

In [11]:
# Expert table
Expert = mc_expert_reviews[['expert_name']].drop_duplicates().copy()
Expert['ExpertID'] = range(1, len(Expert) + 1)
Expert = Expert.rename(columns={'expert_name': 'name'})
Expert = Expert[['ExpertID', 'name']]

print(f"Expert table: {len(Expert)} experts")

# ExpertReview table
ExpertReview = mc_expert_reviews.copy()
ExpertReview['MovieID'] = ExpertReview['movie_id'].map(mc_to_new_id)

expert_map = dict(zip(Expert['name'], Expert['ExpertID']))
ExpertReview['ExpertID'] = ExpertReview['expert_name'].map(expert_map)
ExpertReview['ReviewID'] = ['expert_' + str(i) for i in range(len(ExpertReview))]

ExpertReview = ExpertReview[['ReviewID', 'MovieID', 'ExpertID', 'expert_score', 'review_date', 'review_text']]
ExpertReview = ExpertReview.rename(columns={
    'expert_score': 'meta_score',
    'review_date': 'DateP',
    'review_text': 'summary'
})
ExpertReview = ExpertReview[ExpertReview['MovieID'].notna()]

print(f"ExpertReview table: {len(ExpertReview)} reviews")
ExpertReview.head()

Expert table: 649 experts
ExpertReview table: 17869 reviews


Unnamed: 0,ReviewID,MovieID,ExpertID,meta_score,DateP,summary
0,expert_0,king-coal-2023,1,91,,"Sheldon is a coal miner’s daughter, and her br..."
1,expert_1,king-coal-2023,2,90,"Aug 14, 2023","In this melancholic, thoughtfully attuned cine..."
2,expert_2,king-coal-2023,3,83,,It offers no easy answers while spinning an ev...
3,expert_3,king-coal-2023,4,80,"Aug 10, 2023","Filmmaker Elaine McMillion Sheldon, a native o..."
4,expert_4,king-coal-2023,5,75,,King Coal goes deeper into the cultural roots ...


# STEP 12: CREATE USER & USERREVIEW TABLES

In [12]:
# Part 1: Create User Table
mc_users = mc_user_reviews[['user_name']].drop_duplicates().rename(columns={'user_name': 'Reviewer'})
imdb_users = imdb_reviews[['reviewer_name']].drop_duplicates().rename(columns={'reviewer_name': 'Reviewer'})
User = pd.concat([mc_users, imdb_users], ignore_index=True).drop_duplicates()
User['UserID'] = range(1, len(User) + 1)
User = User[['UserID', 'Reviewer']]

print(f"User table created: {len(User)} users")

# Part 2: Process Metacritic User Reviews
UserReview_MC = mc_user_reviews.copy()
UserReview_MC = UserReview_MC[UserReview_MC['movie_id'].notna()]
UserReview_MC['MovieID'] = UserReview_MC['movie_id'].map(mc_to_new_id)
UserReview_MC = UserReview_MC.merge(User, left_on='user_name', right_on='Reviewer', how='left')
UserReview_MC['ReviewID'] = ['mc_user_' + str(i) for i in range(len(UserReview_MC))]
UserReview_MC = UserReview_MC[['ReviewID', 'MovieID', 'UserID', 'user_score', 'review_date', 'review_text']]
UserReview_MC = UserReview_MC.rename(columns={'review_date': 'DateP', 'review_text': 'summary'})
UserReview_MC = UserReview_MC[UserReview_MC['MovieID'].notna()]

print(f"Metacritic user reviews: {len(UserReview_MC)}")

# Part 3: Process IMDB User Reviews
UserReview_IMDB = imdb_reviews.copy()
UserReview_IMDB = UserReview_IMDB[UserReview_IMDB['movie_id'].notna()]
UserReview_IMDB['MovieID'] = UserReview_IMDB['movie_id'].map(imdb_to_new_id)
UserReview_IMDB = UserReview_IMDB.merge(User, left_on='reviewer_name', right_on='Reviewer', how='left')
UserReview_IMDB['ReviewID'] = ['imdb_user_' + str(i) for i in range(len(UserReview_IMDB))]
UserReview_IMDB = UserReview_IMDB[['ReviewID', 'MovieID', 'UserID', 'review_score', 'review_date', 'review_text']]
UserReview_IMDB = UserReview_IMDB.rename(columns={'review_score': 'user_score', 'review_date': 'DateP', 'review_text': 'summary'})
UserReview_IMDB = UserReview_IMDB[UserReview_IMDB['MovieID'].notna()]

print(f"IMDB user reviews: {len(UserReview_IMDB)}")

# Part 4: Combine All Reviews
UserReview = pd.concat([UserReview_MC, UserReview_IMDB], ignore_index=True)

print(f"\n{'='*60}")
print(f"UserReview table: {len(UserReview)} total reviews")
print(f"  - MC reviews: {len(UserReview_MC)}")
print(f"  - IMDB reviews: {len(UserReview_IMDB)}")
print(f"{'='*60}\n")

UserReview.head(10)

User table created: 8922 users
Metacritic user reviews: 16000
IMDB user reviews: 4304

UserReview table: 20304 total reviews
  - MC reviews: 16000
  - IMDB reviews: 4304



Unnamed: 0,ReviewID,MovieID,UserID,user_score,DateP,summary
0,mc_user_0,the-rip-2026,1,7,"Jan 22, 2026",There are films that don’t try to reinvent any...
1,mc_user_1,the-rip-2026,2,5,"Jan 21, 2026","The definition of mid, ok, ""meh"". It wasn't ba..."
2,mc_user_2,the-rip-2026,3,4,"Jan 21, 2026","If you like one shootout after another, with s..."
3,mc_user_3,the-rip-2026,4,6,"Jan 21, 2026",Too many unnecessary F bombs. At best a B movi...
4,mc_user_4,the-rip-2026,5,6,"Jan 20, 2026","The Rip is a film that makes clear, from the v..."
5,mc_user_5,the-rip-2026,6,4,"Jan 20, 2026",Another 4/10 Netflix movie noone asked for and...
6,mc_user_6,the-rip-2026,7,8,"Jan 17, 2026",Damon and Affleck's little Netflix thriller is...
7,mc_user_7,the-rip-2026,8,1,"Jan 19, 2026",Worst third act I have seen in some time. The ...
8,mc_user_8,the-rip-2026,9,8,"Jan 19, 2026","No decepciona, buena trama no se descubre hast..."
9,mc_user_9,the-rip-2026,10,0,"Jan 19, 2026",**** number of times the screenwriters typed t...


# STEP 13: SUMMARY

In [13]:
print("="*60)
print("DATABASE SUMMARY")
print("="*60)
print(f"Movie:        {len(Movie):5} rows")
print(f"Sales:        {len(Sales):5} rows")
print(f"Person:       {len(Person):5} rows")
print(f"Director:     {len(Director):5} rows")
print(f"Writer:       {len(Writer):5} rows")
print(f"Expert:       {len(Expert):5} rows")
print(f"ExpertReview: {len(ExpertReview):5} rows")
print(f"User:         {len(User):5} rows")
print(f"UserReview:   {len(UserReview):5} rows")
print("="*60)

DATABASE SUMMARY
Movie:         2339 rows
Sales:          928 rows
Person:        2953 rows
Director:      1577 rows
Writer:        2758 rows
Expert:         649 rows
ExpertReview: 17869 rows
User:          8922 rows
UserReview:   20304 rows


# STEP 14: EXPORT TO CSV

In [14]:
import os

output_dir = 'database_tables'
os.makedirs(output_dir, exist_ok=True)

tables = [
    ('Movie.csv', Movie),
    ('Sales.csv', Sales),
    ('Person.csv', Person),
    ('Director.csv', Director),
    ('Writer.csv', Writer),
    ('Expert.csv', Expert),
    ('ExpertReview.csv', ExpertReview),
    ('User.csv', User),
    ('UserReview.csv', UserReview)
]

for filename, df in tables:
    filepath = os.path.join(output_dir, filename)
    df.to_csv(filepath, index=False)
    print(f"✓ {filename}")

print(f"\nFiles saved to {output_dir}/")

✓ Movie.csv
✓ Sales.csv
✓ Person.csv
✓ Director.csv
✓ Writer.csv
✓ Expert.csv
✓ ExpertReview.csv
✓ User.csv
✓ UserReview.csv

Files saved to database_tables/


# STEP 15: DATA QUALITY CHECK

In [15]:
# Check foreign key integrity
checks = [
    ('Sales → Movie', Sales[~Sales['MovieID'].isin(Movie['MovieID'])]),
    ('Director → Movie', Director[~Director['MovieID'].isin(Movie['MovieID'])]),
    ('Director → Person', Director[~Director['PersonID'].isin(Person['PersonID'])]),
    ('Writer → Movie', Writer[~Writer['MovieID'].isin(Movie['MovieID'])]),
    ('Writer → Person', Writer[~Writer['PersonID'].isin(Person['PersonID'])]),
    ('ExpertReview → Movie', ExpertReview[~ExpertReview['MovieID'].isin(Movie['MovieID'])]),
    ('ExpertReview → Expert', ExpertReview[~ExpertReview['ExpertID'].isin(Expert['ExpertID'])]),
    ('UserReview → Movie', UserReview[~UserReview['MovieID'].isin(Movie['MovieID'])]),
    ('UserReview → User', UserReview[~UserReview['UserID'].isin(User['UserID'])])
]

print("Foreign Key Checks:")
print("-" * 40)
for name, invalid in checks:
    status = "success" if len(invalid) == 0 else f" {len(invalid)} errors"
    print(f"{name:25} {status}")

total_errors = sum(len(invalid) for _, invalid in checks)
print("-" * 40)
if total_errors == 0:
    print("All checks passed!")
else:
    print(f" {total_errors} issues found")

Foreign Key Checks:
----------------------------------------
Sales → Movie             success
Director → Movie          success
Director → Person         success
Writer → Movie            success
Writer → Person           success
ExpertReview → Movie      success
ExpertReview → Expert     success
UserReview → Movie        success
UserReview → User         success
----------------------------------------
All checks passed!


In [17]:
# ==============================================================================
# STEP 16: LOAD DATA INTO SQLITE DATABASE (FIXED VERSION)
# ==============================================================================

import sqlite3
import json

# Connect to database
conn = sqlite3.connect('OnlineDM_project.db')
cursor = conn.cursor()

# Enable Foreign Key support
cursor.execute("PRAGMA foreign_keys = ON;")

print("Loading data into SQLite database...")
print("="*60)

# ==============================================================================
# 1. PREPARE MOVIE TABLE
# ==============================================================================

Movie_DB = Movie.copy()

# Map columns to match your database schema
Movie_DB = Movie_DB.rename(columns={'MovieID': 'movie_id'})

# Add missing columns (set to None for now)
Movie_DB['overall_meta_score'] = None
Movie_DB['overall_user_score'] = None

# *** FIX: Convert list/array columns to strings ***
for col in ['genres', 'production_company', 'awards']:
    if col in Movie_DB.columns:
        Movie_DB[col] = Movie_DB[col].apply(
            lambda x: ', '.join(x) if isinstance(x, list) else str(x) if pd.notna(x) else None
        )

# Select only columns that exist in your schema
Movie_DB = Movie_DB[[
    'movie_id', 'release_date', 'title', 'production_company', 
    'duration', 'rating', 'genres', 'tagline', 'website', 'awards',
    'overall_meta_score', 'overall_user_score'
]].copy()  # Add .copy() to avoid issues

# Load into database
Movie_DB.to_sql('Movie', conn, if_exists='replace', index=False)
print(f"✓ Movie: {len(Movie_DB)} rows loaded")


# ==============================================================================
# 2. PREPARE SALES TABLE
# ==============================================================================

Sales_DB = Sales.copy()

# Map columns to match your database schema
Sales_DB = Sales_DB.rename(columns={
    'MovieID': 'movie_id',
    'Budget': 'budget',
    'GrossWorldwide': 'gross_worldwide',
    'OpeningWeekend': 'opening_weekend'
})

# Add missing columns
Sales_DB['gross_domestic'] = None
Sales_DB['gross_international'] = None

# Select columns in correct order
Sales_DB = Sales_DB[[
    'movie_id', 'budget', 'gross_domestic', 'gross_international',
    'gross_worldwide', 'opening_weekend'
]].copy()

# Load into database
Sales_DB.to_sql('Sales', conn, if_exists='replace', index=False)
print(f"✓ Sales: {len(Sales_DB)} rows loaded")


# ==============================================================================
# 3. PREPARE USER TABLE
# ==============================================================================

User_DB = User.copy()

# Map columns to match your database schema
User_DB = User_DB.rename(columns={
    'UserID': 'user_id',
    'Reviewer': 'reviewer'
})

# Add total_reviews column (count reviews per user)
user_review_counts = UserReview.groupby('UserID').size().reset_index(name='total_reviews')
User_DB = User_DB.merge(user_review_counts, left_on='user_id', right_on='UserID', how='left')
User_DB['total_reviews'] = User_DB['total_reviews'].fillna(0).astype(int)
User_DB = User_DB.drop('UserID', axis=1, errors='ignore')

# Select columns
User_DB = User_DB[['user_id', 'reviewer', 'total_reviews']].copy()

# Load into database
User_DB.to_sql('User', conn, if_exists='replace', index=False)
print(f"✓ User: {len(User_DB)} rows loaded")


# ==============================================================================
# 4. PREPARE USERREVIEW TABLE
# ==============================================================================

UserReview_DB = UserReview.copy()

# Map columns to match your database schema
UserReview_DB = UserReview_DB.rename(columns={
    'ReviewID': 'review_id',
    'MovieID': 'movie_id',
    'UserID': 'user_id',
    'DateP': 'DateP'  # Already correct
})

# Ensure user_score is integer
UserReview_DB['user_score'] = pd.to_numeric(UserReview_DB['user_score'], errors='coerce').fillna(0).astype(int)

# Select columns
UserReview_DB = UserReview_DB[['review_id', 'movie_id', 'user_id', 'user_score', 'DateP', 'summary']].copy()

# Load into database
UserReview_DB.to_sql('UserReview', conn, if_exists='replace', index=False)
print(f"✓ UserReview: {len(UserReview_DB)} rows loaded")


# ==============================================================================
# 5. PREPARE EXPERT TABLE
# ==============================================================================

Expert_DB = Expert.copy()

# Map columns to match your database schema
Expert_DB = Expert_DB.rename(columns={
    'ExpertID': 'expert_id',
    'name': 'reviewer'
})

# Add total_reviews column (count reviews per expert)
expert_review_counts = ExpertReview.groupby('ExpertID').size().reset_index(name='total_reviews')
Expert_DB = Expert_DB.merge(expert_review_counts, left_on='expert_id', right_on='ExpertID', how='left')
Expert_DB['total_reviews'] = Expert_DB['total_reviews'].fillna(0).astype(int)
Expert_DB = Expert_DB.drop('ExpertID', axis=1, errors='ignore')

# Select columns
Expert_DB = Expert_DB[['expert_id', 'reviewer', 'total_reviews']].copy()

# Load into database
Expert_DB.to_sql('Expert', conn, if_exists='replace', index=False)
print(f"✓ Expert: {len(Expert_DB)} rows loaded")


# ==============================================================================
# 6. PREPARE EXPERTREVIEW TABLE
# ==============================================================================

ExpertReview_DB = ExpertReview.copy()

# Map columns to match your database schema
ExpertReview_DB = ExpertReview_DB.rename(columns={
    'ReviewID': 'review_id',
    'MovieID': 'movie_id',
    'ExpertID': 'expert_id',
    'DateP': 'DateP'  # Already correct
})

# Ensure meta_score is integer
ExpertReview_DB['meta_score'] = pd.to_numeric(ExpertReview_DB['meta_score'], errors='coerce').fillna(0).astype(int)

# Select columns
ExpertReview_DB = ExpertReview_DB[['review_id', 'movie_id', 'expert_id', 'meta_score', 'DateP', 'summary']].copy()

# Load into database
ExpertReview_DB.to_sql('ExpertReview', conn, if_exists='replace', index=False)
print(f"✓ ExpertReview: {len(ExpertReview_DB)} rows loaded")


# ==============================================================================
# 7. PREPARE FILM_CREW TABLE (renamed from Person)
# ==============================================================================

FilmCrew_DB = Person.copy()

# Map columns to match your database schema
FilmCrew_DB = FilmCrew_DB.rename(columns={
    'PersonID': 'person_id',
    'name': 'FullName'
})

# Select columns
FilmCrew_DB = FilmCrew_DB[['person_id', 'FullName']].copy()

# Load into database
FilmCrew_DB.to_sql('Film_Crew', conn, if_exists='replace', index=False)
print(f"✓ Film_Crew: {len(FilmCrew_DB)} rows loaded")


# ==============================================================================
# 8. PREPARE DIRECTOR TABLE
# ==============================================================================

Director_DB = Director.copy()

# Map columns to match your database schema
Director_DB = Director_DB.rename(columns={
    'MovieID': 'movie_id',
    'PersonID': 'person_id'
})

# Select columns
Director_DB = Director_DB[['movie_id', 'person_id']].copy()

# Load into database
Director_DB.to_sql('Director', conn, if_exists='replace', index=False)
print(f"✓ Director: {len(Director_DB)} rows loaded")


# ==============================================================================
# 9. PREPARE WRITER TABLE
# ==============================================================================

Writer_DB = Writer.copy()

# Map columns to match your database schema
Writer_DB = Writer_DB.rename(columns={
    'MovieID': 'movie_id',
    'PersonID': 'person_id'
})

# Select columns
Writer_DB = Writer_DB[['movie_id', 'person_id']].copy()

# Load into database
Writer_DB.to_sql('Writer', conn, if_exists='replace', index=False)
print(f"✓ Writer: {len(Writer_DB)} rows loaded")


# ==============================================================================
# 10. NOTE: MovieCast table is empty (no cast data in source files)
# ==============================================================================

# Create empty MovieCast table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS MovieCast (
        movie_id TEXT,
        person_id INTEGER,
        CharacterName TEXT,
        PRIMARY KEY (movie_id, person_id),
        FOREIGN KEY (movie_id) REFERENCES Movie (movie_id),
        FOREIGN KEY (person_id) REFERENCES Film_crew (person_id)
    )
''')
print("✓ MovieCast: 0 rows (no cast data available)")


# ==============================================================================
# SUMMARY
# ==============================================================================

print("="*60)
print("DATABASE LOAD COMPLETE")
print("="*60)

# Verify counts
tables = ['Movie', 'Sales', 'User', 'UserReview', 'Expert', 'ExpertReview', 
          'Film_Crew', 'Director', 'Writer', 'MovieCast']

for table in tables:
    cursor.execute(f"SELECT COUNT(*) FROM {table}")
    count = cursor.fetchone()[0]
    print(f"{table:15} {count:6} rows")

print("="*60)

# Commit and close
conn.commit()
conn.close()

print("\nDatabase saved successfully!")


Loading data into SQLite database...
✓ Movie: 2339 rows loaded
✓ Sales: 928 rows loaded
✓ User: 8922 rows loaded
✓ UserReview: 20304 rows loaded
✓ Expert: 649 rows loaded
✓ ExpertReview: 17869 rows loaded
✓ Film_Crew: 2953 rows loaded
✓ Director: 1577 rows loaded
✓ Writer: 2758 rows loaded
✓ MovieCast: 0 rows (no cast data available)
DATABASE LOAD COMPLETE
Movie             2339 rows
Sales              928 rows
User              8922 rows
UserReview       20304 rows
Expert             649 rows
ExpertReview     17869 rows
Film_Crew         2953 rows
Director          1577 rows
Writer            2758 rows
MovieCast            0 rows

Database saved successfully!
