# Data Collection (OMDb + Box Office)

We'll add text and stuff later.

## 1. Setup

In [33]:
import os
import json
import pandas as pd
import requests
import time
import math
import re
import warnings

# ignores annoying warnings
warnings.filterwarnings(
    "ignore",
    message=".*apply operated on the grouping columns.*",
    category=DeprecationWarning
)

base_dir = os.getcwd()
# obdb paths
json_path = os.path.join(base_dir, "..", "data_collection", "omdb_data.json")
csv_output_path = os.path.join(base_dir, "..", "data_collection", "omdb_cleaned.csv")
# kaggle metadata path
metadata_path = os.path.join(base_dir, "..", "data_collection", "Kaggle The Movies Dataset", "movies_metadata.csv")
# master movie list parth
movie_list_path = os.path.join(base_dir, "..", "data_collection", "movie_list.txt")

## 2. Building Master Movie List

description of this section, our idea and an advantage of this approach

### 2.1 Load, clean & derive

what we do

In [34]:
df = pd.read_csv(metadata_path, low_memory=False)

df = df.dropna(subset=["budget", "release_date"])
df["budget"] = pd.to_numeric(df["budget"], errors="coerce")
df["year"]   = pd.to_datetime(df["release_date"], errors="coerce").dt.year
df = df[df["year"].between(1974, 2024)]  # 50 years span

### 2.2 Budget tiers

why we make tiers

In [35]:
df["decade"]  = (df["year"] // 10) * 10
df["budget_m"] = df["budget"] / 1000000

budget_bins  = [0, 5, 20, 100, df["budget_m"].max()+1]
budget_labels= ["<5M", "5–20M", "20–100M", ">100M"]
df["budget_tier"] = pd.cut(df["budget_m"], bins=budget_bins, labels=budget_labels)

n_decades = df["decade"].nunique()
n_tiers   = df["budget_tier"].nunique()
total_cells = n_decades * n_tiers

N_PER_CELL = math.ceil(5000 / total_cells)
print(f"{n_decades=} × {n_tiers=} = {total_cells} cells")
print(f"If we want to reach ~5000 movies, then N_PER_CELL = {N_PER_CELL}")

n_decades=6 × n_tiers=4 = 24 cells
If we want to reach ~5000 movies, then N_PER_CELL = 209


### 2.3 Stratified sampling

what is stratified sampling

how this is gonna help us?

In [36]:
sampled = (
    df
    .groupby(["decade", "budget_tier"], observed=True, group_keys=False)
    .apply(
        lambda grp: grp.sample(min(len(grp), N_PER_CELL), random_state=42),
        include_groups=True)
)
print(f"Total sampled: {len(sampled)}")
print(f"Sampled {len(sampled)} movies across "
      f"{sampled['decade'].nunique()} decades × "
      f"{sampled['budget_tier'].nunique()} budget tiers")

Total sampled: 2971
Sampled 2971 movies across 5 decades × 4 budget tiers


### 2.4 Random draw to reach 5000 movies

we end up with 2971 movies but we want more!

so now we are going to randomly choose the rest from the original Kaggle dataset to reach 5k

In [37]:
target = 5000
current = len(sampled)
if current < target:
    residual = target - current
    remaining = df.loc[~df.index.isin(sampled.index)] # all except already chosen
    extra = remaining.sample(residual, random_state=42)    # randomly xhoosing
    sampled = pd.concat([sampled, extra]) #merge
print(f"Final stratified + random sample: {len(sampled)} movies")

Final stratified + random sample: 5000 movies


### 2.5 Final movie list
done

In [38]:
sampled["title"].to_csv(movie_list_path, index=False, header=False)
print(f"Master list of {len(sampled)} titles is saved to {movie_list_path}")

Master list of 5000 titles is saved to c:\PythonProjects\Movie-Team\jupyter notebooks\..\data_collection\movie_list.txt


## 3. OMDb Data Fetching


Refer to `omdb_fetch.py` script

## 4. Load & Explore JSON

text

json to dataframe

In [39]:
# into dataframe
with open(json_path, "r") as f:
    data = json.load(f)

df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,totalSeasons
0,Good Guys Wear Black,1978,PG,01 Jun 1978,95 min,Action,Ted Post,"Bruce Cohn, Mark Medoff, Joseph Fraley","Chuck Norris, Anne Archer, James Franciscus",An ex-US Army commando must find the reason wh...,...,5.1,3865,tt0079227,movie,,,,,True,
1,Grizzly,1976,PG,21 May 1976,91 min,"Adventure, Horror, Thriller",William Girdler,"Harvey Flaxman, David Sheldon, Andrew Prine","Christopher George, Andrew Prine, Richard Jaeckel",An eighteen-foot-tall grizzly bear terrorizes ...,...,5.2,6166,tt0074593,movie,,,,,True,
2,Midnight Express,1978,R,27 Oct 1978,121 min,"Biography, Crime, Drama",Alan Parker,"Oliver Stone, Billy Hayes, William Hoffer","Brad Davis, Irene Miracle, Bo Hopkins","Billy Hayes, an American college student, is c...",...,7.5,90240,tt0077928,movie,,"$35,000,000",,,True,


## 5. Cleaning OMDb Data

text

### 5.1 Selecting relevant columns

In [40]:
columns_to_keep = [
    "Title", "imdbID", "Year", "Genre", "Director", "Actors", "Language",
    "Country", "Runtime", "BoxOffice", "imdbRating", "imdbVotes", "Awards", "Type"
]

df = df[columns_to_keep]
df.head(3)

Unnamed: 0,Title,imdbID,Year,Genre,Director,Actors,Language,Country,Runtime,BoxOffice,imdbRating,imdbVotes,Awards,Type
0,Good Guys Wear Black,tt0079227,1978,Action,Ted Post,"Chuck Norris, Anne Archer, James Franciscus",English,United States,95 min,,5.1,3865,,movie
1,Grizzly,tt0074593,1976,"Adventure, Horror, Thriller",William Girdler,"Christopher George, Andrew Prine, Richard Jaeckel",English,United States,91 min,,5.2,6166,,movie
2,Midnight Express,tt0077928,1978,"Biography, Crime, Drama",Alan Parker,"Brad Davis, Irene Miracle, Bo Hopkins","English, Turkish, French","United Kingdom, United States",121 min,"$35,000,000",7.5,90240,Won 2 Oscars. 17 wins & 14 nominations total,movie


### 5.2 Cleaning numeric fields

In [41]:
#BoxOffice to integer
df["BoxOffice"] = df["BoxOffice"].replace("N/A", pd.NA)
df["BoxOffice"] = df["BoxOffice"].dropna().apply(
    lambda x: int(re.sub(r"[\$,]", "", x)) if isinstance(x, str) else pd.NA
)

#imdbRating to float
df["imdbRating"] = pd.to_numeric(df["imdbRating"], errors="coerce")

#imdbVotes to integer
df["imdbVotes"] = df["imdbVotes"].replace("N/A", pd.NA)
df["imdbVotes"] = df["imdbVotes"].dropna().apply(
    lambda x: int(x.replace(",", "")) if isinstance(x, str) else pd.NA
)

# saving cleaned data to csv
df.to_csv(csv_output_path, index=False)

## 6. Box Office Missing Values

After cleaning, we check how many movies do not contain valid box office revenue data.

In [42]:
boxoffice_miss = df[df["BoxOffice"].isna()]
print(f"{len(boxoffice_miss)} out of {len(df)} movies are missing BoxOffice data.")

1919 out of 4764 movies are missing BoxOffice data.


### 6.1 Merging IMDb and TMDb IDs

1 load raw json and locate imdbID

2 join with links.csv from Kaggle to get tmdb 



TMDB
API KEY - 1d779d5d4246f2809fc00d7729449f09
API Read Access Token - eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIxZDc3OWQ1ZDQyNDZmMjgwOWZjMDBkNzcyOTQ0OWYwOSIsIm5iZiI6MTc0MDA1NzI3OC45OTMsInN1YiI6IjY3YjcyYWJlMGM2MTAyNWZhZmM0MGE2YSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.pXVcufNeWf-U64bdoFfjCLPPM013DFZRvkFUDTJILnc



In [43]:
# I load json as dataframe
with open(json_path, "r", encoding="utf-8") as f:
    omdb_raw = json.load(f)
omdb_raw_df = pd.DataFrame(omdb_raw)

# links.csv dataframe from Kaggle(imdbId keyword)
links_path = os.path.join(base_dir, "..", "data_collection", "Kaggle The Movies Dataset", "links.csv")
links_df = pd.read_csv(links_path, usecols=["imdbId", "tmdbId"])

missing = df[df["BoxOffice"].isna()].copy() # titles still missing Boxofice

# taking out numbers from imdbID field tt0123456 to 123456
missing["imdbId_numeric"] = (missing["imdbID"]
    .str.replace("^tt", "", regex=True)
    .astype("Int64")
)

missing = missing.merge( # and now merge on that numeric field
    links_df, left_on="imdbId_numeric", right_on="imdbId", how="left")

## 6.2 Save enhanced missing list

a

In [44]:
enhanced_cols = ["Title","imdbID","tmdbId"]
enhanced_path = os.path.join(
    base_dir, "..", "data_collection", "missing_boxoffice_enhanced.csv"
)
missing[enhanced_cols].to_csv(enhanced_path, index=False)
print(f"Enhanced missing list ({len(missing)}) was saved to {enhanced_path}")

Enhanced missing list (1919) was saved to c:\PythonProjects\Movie-Team\jupyter notebooks\..\data_collection\missing_boxoffice_enhanced.csv


## 6.3 TMDb API call

There we call TMDb to fill in `revenue` where boxoffice is EMPTY

refer to `tmdb_fetch.py` script to replicate

1657 out of 1919 missing revenue movies have been successfully fetched!

## 6.4 Merging TMDb revenues with `df`


text

In [46]:
tmdb_rev = pd.read_csv(
    os.path.join(base_dir, "..", "data_collection", "tmdb_revenues.csv")
)

#merging into df by title
df = df.merge(tmdb_rev[["Title","BoxOffice"]],
    on="Title", how="left", suffixes=("", "_tmdb")
)

#wherever BoxOffice is null = fill from the tmdb column
df["BoxOffice"] = df["BoxOffice"].fillna(df["BoxOffice_tmdb"])
df = df.drop(columns=["BoxOffice_tmdb"])

still_missing = df["BoxOffice"].isna().sum()
total_movies  = len(df)
print(f"{still_missing} out of {total_movies} movies still have no BoxOffice after TMDb API calling.")


262 out of 4830 movies still have no BoxOffice after TMDb API calling.


## 6.5 Building the fallback list

here we are going to find still missing revenue 262 movies

THose 262 titles fall into 2 possible categories:

- no tmdb ID at all (hence we couldn't call TMDb API)
- title has tmdb id but API returned zero because it may not track this title

That being said, we will use "The-Numbers" in order to scrape them again!


In [47]:
# 1) Get the titles still missing after TMDb merge
still_missing_titles = df[df["BoxOffice"].isna()]["Title"]

# 2) Filter your enriched-missing DataFrame (that has tmdbId) by those titles
fallback = missing[missing["Title"].isin(still_missing_titles)][
    ["Title","imdbID","tmdbId"]
]

fallback_path = os.path.join(
    base_dir, "..", "data_collection", "fallback_boxoffice.csv"
)
fallback.to_csv(fallback_path, index=False)

print(f"{len(fallback)} titles to scrape are saved to {fallback_path}")


262 titles to scrape are saved to c:\PythonProjects\Movie-Team\jupyter notebooks\..\data_collection\fallback_boxoffice.csv


## 7 Final Data

