# Movie Database - Metacritic + IMDB Integration

Building database tables from Metacritic (main source) + IMDB data.  
All MovieIDs use format: `slug-year` (e.g., `polite-society-2023`)

## Step 1: Import Libraries

In [12]:
import pandas as pd
import json
import re

## Step 2: Load Data Files

In [13]:
# Load Metacritic CSV files
mc_movies = pd.read_csv('movies_dataMC.csv')
mc_expert_reviews = pd.read_csv('expert_reviews_dataMC.csv')
mc_user_reviews = pd.read_csv('user_reviews_dataMC.csv')

# Remove header row if present
mc_movies = mc_movies[mc_movies['movie_id'] != 'movie']

print(f"Metacritic: {len(mc_movies)} movies, {len(mc_expert_reviews)} expert reviews, {len(mc_user_reviews)} user reviews")

Metacritic: 1411 movies, 17869 expert reviews, 16000 user reviews


In [14]:
# Load IMDB JSON file
with open('IMDBscrapingoutput.json', 'r') as file:
    content = file.read()
    
# Parse JSON (handles both array format and line-by-line format)
imdb_records = []
try:
    imdb_records = json.loads(content)
except:
    for line in content.split('\n'):
        line = line.strip()
        if line and line not in ['[', ']', ',']:
            try:
                if line.endswith(','):
                    line = line[:-1]
                imdb_records.append(json.loads(line))
            except:
                pass

imdb_df = pd.DataFrame(imdb_records)

# Split movies and reviews (both are in same JSON file)
imdb_movies = imdb_df[imdb_df['title'].notna()].copy()
imdb_reviews = imdb_df[imdb_df['reviewer_name'].notna()].copy()

print(f"IMDB: {len(imdb_movies)} movies, {len(imdb_reviews)} user reviews")

IMDB: 1031 movies, 6423 user reviews


## Step 3: Helper Functions

In [15]:
def normalize_title(title):
    """Clean title for matching (lowercase, no special chars, no articles)"""
    if pd.isna(title):
        return ""
    title = str(title).lower().strip()
    title = re.sub(r'\s*\(\d{4}\)\s*$', '', title)  # Remove (year)
    title = re.sub(r'^(the|a|an)\s+', '', title)  # Remove articles
    title = re.sub(r'[^\w\s]', '', title)  # Remove special chars
    return ' '.join(title.split())

def create_slug(title):
    """Convert title to URL slug (polite-society)"""
    if pd.isna(title):
        return ""
    slug = str(title).lower()
    slug = re.sub(r'[^a-z0-9]+', '-', slug)
    return slug.strip('-')

def extract_year(date_string):
    """Extract 4-digit year from date string"""
    if pd.isna(date_string):
        return None
    match = re.search(r'\b(19|20)\d{2}\b', str(date_string))
    return int(match.group()) if match else None

def parse_duration(duration_str):
    """Convert '2 h 28 m' to minutes (148)"""
    if pd.isna(duration_str):
        return None
    duration_str = str(duration_str)
    minutes = 0
    hours_match = re.search(r'(\d+)\s*h', duration_str)
    if hours_match:
        minutes += int(hours_match.group(1)) * 60
    mins_match = re.search(r'(\d+)\s*m', duration_str)
    if mins_match:
        minutes += int(mins_match.group(1))
    return minutes if minutes > 0 else None

## Step 4: Match IMDB with Metacritic

In [16]:
# Add normalized titles and years for matching
mc_movies['normalized_title'] = mc_movies['title'].apply(normalize_title)
mc_movies['year'] = mc_movies['release_date'].apply(extract_year)

imdb_movies['normalized_title'] = imdb_movies['title'].apply(normalize_title)
imdb_movies['year'] = imdb_movies['release_date'].apply(extract_year)

# Match by title + year
matches_year = pd.merge(
    imdb_movies[['movie_id', 'title', 'normalized_title', 'year']],
    mc_movies[['movie_id', 'title', 'normalized_title', 'year']],
    on=['normalized_title', 'year'],
    how='inner',
    suffixes=('_imdb', '_mc')
)

# Match by title only (for unmatched)
unmatched_imdb = imdb_movies[~imdb_movies['movie_id'].isin(matches_year['movie_id_imdb'])]
unmatched_mc = mc_movies[~mc_movies['movie_id'].isin(matches_year['movie_id_mc'])]

matches_title = pd.merge(
    unmatched_imdb[['movie_id', 'title', 'normalized_title']],
    unmatched_mc[['movie_id', 'title', 'normalized_title']],
    on='normalized_title',
    how='inner',
    suffixes=('_imdb', '_mc')
)

all_matches = pd.concat([matches_year, matches_title], ignore_index=True)

print(f"Matched: {len(all_matches)} movies (title+year: {len(matches_year)}, title only: {len(matches_title)})")
print(f"Unmatched IMDB: {len(imdb_movies) - len(all_matches)}")

Matched: 104 movies (title+year: 84, title only: 20)
Unmatched IMDB: 927


## Step 5: Create Movie Table (slug-year format)

In [17]:
# Metacritic movies: use existing slug + year
mc_movies['MovieID'] = mc_movies.apply(
    lambda row: f"{row['movie_id']}-{row['year']}" if pd.notna(row['year']) else row['movie_id'],
    axis=1
)

# Select columns from Metacritic
Movie = mc_movies[[
    'MovieID', 'title', 'release_date', 'duration', 'rating', 'genres',
    'production_company', 'tagline', 'website', 'awards'
]].copy()

# Convert duration to minutes
Movie['duration'] = Movie['duration'].apply(parse_duration)

# Add unmatched IMDB movies (create slug + year)
unmatched_imdb_ids = set(imdb_movies['movie_id']) - set(all_matches['movie_id_imdb'])
unmatched_imdb_movies = imdb_movies[imdb_movies['movie_id'].isin(unmatched_imdb_ids)].copy()

unmatched_imdb_movies['slug'] = unmatched_imdb_movies['title'].apply(create_slug)
unmatched_imdb_movies['MovieID'] = unmatched_imdb_movies.apply(
    lambda row: f"{row['slug']}-{row['year']}" if pd.notna(row['year']) else row['slug'],
    axis=1
)

imdb_to_add = pd.DataFrame({
    'MovieID': unmatched_imdb_movies['MovieID'],
    'title': unmatched_imdb_movies['title'],
    'release_date': unmatched_imdb_movies['release_date'],
    'duration': unmatched_imdb_movies['duration'],
    'rating': unmatched_imdb_movies['rating'],
    'genres': unmatched_imdb_movies['genres'],
    'production_company': unmatched_imdb_movies['production_company'],
    'tagline': None,
    'website': None,
    'awards': None
})

Movie = pd.concat([Movie, imdb_to_add], ignore_index=True)

print(f"Movie table: {len(Movie)} total ({len(mc_movies)} MC + {len(unmatched_imdb_movies)} IMDB)")
Movie[['MovieID', 'title', 'release_date']].head()

Movie table: 2339 total (1411 MC + 928 IMDB)


Unnamed: 0,MovieID,title,release_date
0,furiosa-a-mad-max-saga-2024,Furiosa: A Mad Max Saga,"May 24, 2024"
1,beyond-utopia-2023,Beyond Utopia,"Oct 23, 2023"
2,chicken-for-linda!-2024,Chicken for Linda!,"Apr 5, 2024"
3,youth-hard-times-2024,Youth (Hard Times),"Nov 1, 2024"
4,viet-and-nam-2025,Viet and Nam,"Mar 28, 2025"


## Step 6: Create Sales Table

In [18]:
# Map IMDB movie_id to new slug-year MovieID
imdb_movies['slug'] = imdb_movies['title'].apply(create_slug)
imdb_movies['MovieID'] = imdb_movies.apply(
    lambda row: f"{row['slug']}-{row['year']}" if pd.notna(row['year']) else row['slug'],
    axis=1
)

Sales = imdb_movies[['MovieID', 'budget', 'grossworldwide', 'openingweekend']].copy()
Sales = Sales.rename(columns={
    'budget': 'Budget',
    'grossworldwide': 'GrossWorldwide',
    'openingweekend': 'OpeningWeekend'
})

# Clean currency values
for col in ['Budget', 'GrossWorldwide', 'OpeningWeekend']:
    Sales[col] = Sales[col].replace('[\$,]', '', regex=True)
    Sales[col] = pd.to_numeric(Sales[col], errors='coerce')

print(f"Sales table: {len(Sales)} entries")
Sales.head()

Sales table: 1031 entries


Unnamed: 0,MovieID,Budget,GrossWorldwide,OpeningWeekend
0,polite-society-2023,,2680713.0,817740.0
1,stephen-curry-underrated-2023,,,
2,little-dixie-2023,,,
4,the-disappearance-of-shere-hite-2024,,53189.0,15837.0
23,outlast-2023,,,


## Step 7: Create Person Table

In [19]:
# Collect all unique people from directors and writers
all_people = set()

# From Metacritic
for directors in mc_movies['director'].dropna():
    for director in str(directors).split(','):
        if director.strip():
            all_people.add(director.strip())

for writers in mc_movies['writer'].dropna():
    for writer in str(writers).split(','):
        if writer.strip():
            all_people.add(writer.strip())

# From IMDB
for directors in imdb_movies['director'].dropna():
    for director in str(directors).split(','):
        if director.strip():
            all_people.add(director.strip())

for writers_list in imdb_movies['writer'].dropna():
    if isinstance(writers_list, list):
        for writer in writers_list:
            if str(writer).strip():
                all_people.add(str(writer).strip())
    else:
        for writer in str(writers_list).split(','):
            if writer.strip():
                all_people.add(writer.strip())

Person = pd.DataFrame({
    'PersonID': range(1, len(all_people) + 1),
    'FullName': sorted(list(all_people)),
    'CareerScore': None
})

print(f"Person table: {len(Person)} people")
Person.head()

Person table: 4544 people


Unnamed: 0,PersonID,FullName,CareerScore
0,1,2023 (France),
1,2,2023 (United States),
2,3,2023 (Vietnam),
3,4,A.F. Harrold,
4,5,A.J. Bermudez,


## Step 8: Create Director Table

In [20]:
director_list = []

# Metacritic directors
for idx, row in mc_movies.iterrows():
    if pd.notna(row['director']):
        for director in str(row['director']).split(','):
            director = director.strip()
            if director:
                person_id = Person[Person['FullName'] == director]['PersonID'].values
                if len(person_id) > 0:
                    director_list.append({'MovieID': row['MovieID'], 'PersonID': person_id[0]})

# IMDB directors
for idx, row in unmatched_imdb_movies.iterrows():
    if pd.notna(row['director']):
        for director in str(row['director']).split(','):
            director = director.strip()
            if director:
                person_id = Person[Person['FullName'] == director]['PersonID'].values
                if len(person_id) > 0:
                    director_list.append({'MovieID': row['MovieID'], 'PersonID': person_id[0]})

Director = pd.DataFrame(director_list).drop_duplicates()

print(f"Director table: {len(Director)} relationships")
Director.head()

Director table: 2509 relationships


Unnamed: 0,MovieID,PersonID
0,furiosa-a-mad-max-saga-2024,1483
1,beyond-utopia-2023,2639
2,chicken-for-linda!-2024,4111
3,chicken-for-linda!-2024,742
4,youth-hard-times-2024,4394


## Step 9: Create Writer Table

In [22]:
writer_list = []

# Metacritic writers
for idx, row in mc_movies.iterrows():
    if pd.notna(row['writer']):
        for writer in str(row['writer']).split(','):
            writer = writer.strip()
            if writer:
                person_id = Person[Person['FullName'] == writer]['PersonID'].values
                if len(person_id) > 0:
                    writer_list.append({'MovieID': row['MovieID'], 'PersonID': person_id[0]})

# IMDB writers
for idx, row in unmatched_imdb_movies.iterrows():
    writer_value = row['writer']
    if pd.notna(writer_value) if not isinstance(writer_value, list) else len(writer_value) > 0:
        writers = writer_value if isinstance(writer_value, list) else str(writer_value).split(',')
        for writer in writers:
            writer = str(writer).strip()
            if writer:
                person_id = Person[Person['FullName'] == writer]['PersonID'].values
                if len(person_id) > 0:
                    writer_list.append({'MovieID': row['MovieID'], 'PersonID': person_id[0]})

Writer = pd.DataFrame(writer_list).drop_duplicates()

print(f"Writer table: {len(Writer)} relationships")
Writer.head()

Writer table: 4091 relationships


Unnamed: 0,MovieID,PersonID
0,furiosa-a-mad-max-saga-2024,1483
1,furiosa-a-mad-max-saga-2024,3169
2,beyond-utopia-2023,2639
3,chicken-for-linda!-2024,742
4,chicken-for-linda!-2024,4111


## Step 10: Create Expert & ExpertReview Tables

In [32]:
# Expert table (critics)
Expert = mc_expert_reviews[['expert_name']].drop_duplicates().copy()
Expert = Expert.rename(columns={'expert_name': 'Reviewer'})
Expert['ExpertID'] = range(1, len(Expert) + 1)
Expert['TotalReviews'] = None
Expert = Expert[['ExpertID', 'Reviewer', 'TotalReviews']]

print(f"Expert table: {len(Expert)} critics")
Expert.head()

Expert table: 649 critics


Unnamed: 0,ExpertID,Reviewer,TotalReviews
0,1,Marya E. Gates,
1,2,Robert Daniels,
2,3,John Fink,
3,4,Kyle Smith,
4,5,Katie Rife,


In [34]:
# Create mapping from Metacritic movie_id to slug-year MovieID (if not already created)
mc_id_map = dict(zip(mc_movies['movie_id'], mc_movies['MovieID']))

# ExpertReview table
ExpertReview = mc_expert_reviews.copy()
ExpertReview = ExpertReview.merge(Expert[['ExpertID', 'Reviewer']], left_on='expert_name', right_on='Reviewer', how='left')
ExpertReview['MovieID'] = ExpertReview['movie_id'].map(mc_id_map)
ExpertReview['ReviewID'] = ['expert_' + str(i) for i in range(len(ExpertReview))]
ExpertReview = ExpertReview[['ReviewID', 'MovieID', 'ExpertID', 'expert_score', 'review_date', 'review_text']]
ExpertReview = ExpertReview.rename(columns={'expert_score': 'meta_score', 'review_date': 'DateP', 'review_text': 'summary'})
ExpertReview = ExpertReview[ExpertReview['MovieID'].notna()]

print(f"ExpertReview table: {len(ExpertReview)} reviews")
ExpertReview.head()

ExpertReview table: 17869 reviews


Unnamed: 0,ReviewID,MovieID,ExpertID,meta_score,DateP,summary
0,expert_0,king-coal-2023,1,91,,"Sheldon is a coal miner’s daughter, and her br..."
1,expert_1,king-coal-2023,2,90,"Aug 14, 2023","In this melancholic, thoughtfully attuned cine..."
2,expert_2,king-coal-2023,3,83,,It offers no easy answers while spinning an ev...
3,expert_3,king-coal-2023,4,80,"Aug 10, 2023","Filmmaker Elaine McMillion Sheldon, a native o..."
4,expert_4,king-coal-2023,5,75,,King Coal goes deeper into the cultural roots ...


## Step 11: Create User & UserReview Tables

In [24]:
# User table (from both Metacritic and IMDB)
mc_users = mc_user_reviews[['user_name']].drop_duplicates().rename(columns={'user_name': 'Reviewer'})
imdb_users = imdb_reviews[['reviewer_name']].drop_duplicates().rename(columns={'reviewer_name': 'Reviewer'})

User = pd.concat([mc_users, imdb_users], ignore_index=True).drop_duplicates()
User['UserID'] = range(1, len(User) + 1)
User['TotalReviews'] = None
User = User[['UserID', 'Reviewer', 'TotalReviews']]

print(f"User table: {len(User)} users (MC: {len(mc_users)}, IMDB: {len(imdb_users)})")
User.head()

User table: 8922 users (MC: 4023, IMDB: 4903)


Unnamed: 0,UserID,Reviewer,TotalReviews
0,1,decatur555,
1,2,Sandandy58,
2,3,everett,
3,4,RonaB,
4,5,Nerdcall,


In [26]:
# Create mapping from Metacritic movie_id to slug-year MovieID
mc_id_map = dict(zip(mc_movies['movie_id'], mc_movies['MovieID']))

# UserReview table - Metacritic reviews
UserReview_MC = mc_user_reviews.copy()
UserReview_MC = UserReview_MC.merge(User[['UserID', 'Reviewer']], left_on='user_name', right_on='Reviewer', how='left')
UserReview_MC['MovieID'] = UserReview_MC['movie_id'].map(mc_id_map)
UserReview_MC['ReviewID'] = ['mc_user_' + str(i) for i in range(len(UserReview_MC))]
UserReview_MC = UserReview_MC[['ReviewID', 'MovieID', 'UserID', 'user_score', 'review_date', 'review_text']]
UserReview_MC = UserReview_MC.rename(columns={'review_date': 'DateP', 'review_text': 'summary'})
UserReview_MC = UserReview_MC[UserReview_MC['MovieID'].notna()]

print(f"Metacritic user reviews: {len(UserReview_MC)}")

Metacritic user reviews: 16000


In [27]:
# UserReview table - IMDB reviews (convert to slug-year MovieID)
UserReview_IMDB = imdb_reviews.copy()
UserReview_IMDB = UserReview_IMDB.merge(User[['UserID', 'Reviewer']], left_on='reviewer_name', right_on='Reviewer', how='left')

# Map old IMDB movie_id to new slug-year MovieID
imdb_id_map = dict(zip(imdb_movies['movie_id'], imdb_movies['MovieID']))
UserReview_IMDB['MovieID'] = UserReview_IMDB['movie_id'].map(imdb_id_map)

UserReview_IMDB['ReviewID'] = ['imdb_user_' + str(i) for i in range(len(UserReview_IMDB))]
UserReview_IMDB = UserReview_IMDB[['ReviewID', 'MovieID', 'UserID', 'review_score', 'review_date', 'review_text']]
UserReview_IMDB = UserReview_IMDB.rename(columns={'review_score': 'user_score', 'review_date': 'DateP', 'review_text': 'summary'})

print(f"IMDB user reviews: {len(UserReview_IMDB)}")

IMDB user reviews: 6423


In [28]:
# Combine all user reviews
UserReview = pd.concat([UserReview_MC, UserReview_IMDB], ignore_index=True)

print(f"UserReview table: {len(UserReview)} total reviews")
UserReview.head()

UserReview table: 22423 total reviews


Unnamed: 0,ReviewID,MovieID,UserID,user_score,DateP,summary
0,mc_user_0,the-rip-2026,1,7,"Jan 22, 2026",There are films that don’t try to reinvent any...
1,mc_user_1,the-rip-2026,2,5,"Jan 21, 2026","The definition of mid, ok, ""meh"". It wasn't ba..."
2,mc_user_2,the-rip-2026,3,4,"Jan 21, 2026","If you like one shootout after another, with s..."
3,mc_user_3,the-rip-2026,4,6,"Jan 21, 2026",Too many unnecessary F bombs. At best a B movi...
4,mc_user_4,the-rip-2026,5,6,"Jan 20, 2026","The Rip is a film that makes clear, from the v..."


## Step 12: Summary

In [35]:
print("="*60)
print("DATABASE SUMMARY")
print("="*60)
print(f"Movie:        {len(Movie):5} rows")
print(f"Sales:        {len(Sales):5} rows")
print(f"Person:       {len(Person):5} rows")
print(f"Director:     {len(Director):5} rows")
print(f"Writer:       {len(Writer):5} rows")
print(f"Expert:       {len(Expert):5} rows")
print(f"ExpertReview: {len(ExpertReview):5} rows")
print(f"User:         {len(User):5} rows")
print(f"UserReview:   {len(UserReview):5} rows")
print("="*60)

DATABASE SUMMARY
Movie:         2339 rows
Sales:         1031 rows
Person:        4544 rows
Director:      2509 rows
Writer:        4091 rows
Expert:         649 rows
ExpertReview: 17869 rows
User:          8922 rows
UserReview:   22423 rows


## Step 13: Export to CSV

In [36]:
import os

output_dir = 'database_tables'
os.makedirs(output_dir, exist_ok=True)

tables = [
    ('Movie.csv', Movie),
    ('Sales.csv', Sales),
    ('Person.csv', Person),
    ('Director.csv', Director),
    ('Writer.csv', Writer),
    ('Expert.csv', Expert),
    ('ExpertReview.csv', ExpertReview),
    ('User.csv', User),
    ('UserReview.csv', UserReview)
]

for filename, df in tables:
    filepath = os.path.join(output_dir, filename)
    df.to_csv(filepath, index=False)
    print(f"✓ {filename}")

print(f"\nFiles saved to {output_dir}/")

✓ Movie.csv
✓ Sales.csv
✓ Person.csv
✓ Director.csv
✓ Writer.csv
✓ Expert.csv
✓ ExpertReview.csv
✓ User.csv
✓ UserReview.csv

Files saved to database_tables/


## Step 14: Data Quality Check

In [37]:
# Check foreign key integrity
checks = [
    ('Sales → Movie', Sales[~Sales['MovieID'].isin(Movie['MovieID'])]),
    ('Director → Movie', Director[~Director['MovieID'].isin(Movie['MovieID'])]),
    ('Director → Person', Director[~Director['PersonID'].isin(Person['PersonID'])]),
    ('Writer → Movie', Writer[~Writer['MovieID'].isin(Movie['MovieID'])]),
    ('Writer → Person', Writer[~Writer['PersonID'].isin(Person['PersonID'])]),
    ('ExpertReview → Movie', ExpertReview[~ExpertReview['MovieID'].isin(Movie['MovieID'])]),
    ('ExpertReview → Expert', ExpertReview[~ExpertReview['ExpertID'].isin(Expert['ExpertID'])]),
    ('UserReview → Movie', UserReview[~UserReview['MovieID'].isin(Movie['MovieID'])]),
    ('UserReview → User', UserReview[~UserReview['UserID'].isin(User['UserID'])])
]

print("Foreign Key Checks:")
print("-" * 40)
for name, invalid in checks:
    status = "✓" if len(invalid) == 0 else f"❌ {len(invalid)} errors"
    print(f"{name:25} {status}")

total_errors = sum(len(invalid) for _, invalid in checks)
print("-" * 40)
if total_errors == 0:
    print("✓ All checks passed!")
else:
    print(f"⚠ {total_errors} issues found")

Foreign Key Checks:
----------------------------------------
Sales → Movie             ❌ 35 errors
Director → Movie          ✓
Director → Person         ✓
Writer → Movie            ✓
Writer → Person           ✓
ExpertReview → Movie      ✓
ExpertReview → Expert     ✓
UserReview → Movie        ❌ 777 errors
UserReview → User         ✓
----------------------------------------
⚠ 812 issues found
