# Step 1: Data Scraping
- Reads TMDB_API_KEY from Kaggle Secrets
- Saves raw JSON pages and a consolidated CSV under /kaggle/working
- Filters movies between 2015 and 2025 with vote_average >= 6 and vote_count >= 50
- Uses UTF-8 for all file writes

In [1]:
import time
import requests
import csv
import json
from pathlib import Path
from kaggle_secrets import UserSecretsClient

# === Get API key from Kaggle Secrets ===
# The key is read from the notebook's secret store so it never appears in the
# notebook source or its saved output.
user_secrets = UserSecretsClient()
API_KEY = user_secrets.get_secret("TMDB_API_KEY")
if not API_KEY:
    raise RuntimeError("TMDB_API_KEY not found in Kaggle Secrets. Add it in Notebook Settings.")

# === Output paths inside Kaggle working directory ===
# Raw per-page JSON and the consolidated CSV both live under OUT_DIR.
OUT_DIR = Path("/kaggle/working/data/raw/tmdb")
OUT_DIR.mkdir(parents=True, exist_ok=True)
CSV_PATH = OUT_DIR / "movies_2015_2025.csv"

# === TMDB discover endpoint and base parameters ===
BASE_URL = "https://api.themoviedb.org/3/discover/movie"
params = {
    "api_key": API_KEY,
    "language": "en-US",
    "sort_by": "popularity.desc",
    "page": 1,  # overwritten by the fetch loop before every request
    "include_adult": False,
    "include_video": False,
    "primary_release_date.gte": "2015-01-01",
    "primary_release_date.lte": "2025-12-31",
    # Quality filters
    "vote_average.gte": 6.0,
    "vote_count.gte": 50,
}

# Safety / limits
# TMDB only serves pages 1..500 of a paged listing; asking for page 501 is
# rejected, so a higher cap only produces failed requests and retries.
MAX_PAGES = 500
REQUEST_DELAY = 0.35  # seconds between requests; increase if you see rate limits

# State shared with the fetch loop: accumulated rows and the next page to request.
collected = []
page = 1

# Simple retry wrapper for robustness
def safe_get(url, params, max_retries=3, timeout=30):
    """GET ``url`` and return the response, retrying transient failures.

    Args:
        url: Endpoint to request.
        params: Query parameters passed to ``requests.get``.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``requests.Response`` when the server answers HTTP 200, otherwise
        ``None`` (attempts exhausted or a non-retryable client error).
    """
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request exception: {e}. Attempt {attempt}/{max_retries}.")
        else:
            if resp.status_code == 200:
                return resp
            print(f"Request returned {resp.status_code}. Attempt {attempt}/{max_retries}.")
            # Client errors (bad key, invalid page, ...) give the same answer on
            # every attempt; only timeouts and rate limiting are worth retrying.
            if 400 <= resp.status_code < 500 and resp.status_code not in (408, 429):
                return None
        # Linear back-off between attempts; no point waiting after the last one.
        if attempt < max_retries:
            time.sleep(1.0 * attempt)
    return None

print("Starting TMDB fetch. Output directory:", OUT_DIR)

# Ids already collected. The listing is ordered by a live popularity score, so
# paging is not guaranteed to be stable between requests and a movie may show
# up on more than one page; each id is kept only once.
seen_ids = set()

while True:
    params["page"] = page
    resp = safe_get(BASE_URL, params)
    if resp is None:
        print("Failed to fetch page", page, "- stopping.")
        break

    try:
        data = resp.json()
    except ValueError as e:
        print("Failed to parse JSON for page", page, e)
        break

    # Save raw JSON page as UTF-8 (kept verbatim, including any repeated movies)
    raw_path = OUT_DIR / f"discover_page_{page}.json"
    raw_text = json.dumps(data, ensure_ascii=False)
    raw_path.write_text(raw_text, encoding="utf-8")

    results = data.get("results", [])
    if not results:
        print("No results on page", page, "-> stopping.")
        break

    for m in results:
        movie_id = m.get("id")
        if movie_id is not None:
            if movie_id in seen_ids:
                continue
            seen_ids.add(movie_id)
        release_date = m.get("release_date")
        collected.append({
            "tmdb_id": movie_id,
            "title": m.get("title") or m.get("original_title"),
            "release_date": release_date,
            # First four characters of the release date; empty when it is missing.
            "year": (release_date or "")[:4],
            "overview": m.get("overview"),
            "vote_average": m.get("vote_average"),
            "vote_count": m.get("vote_count"),
            "popularity": m.get("popularity"),
            "poster_path": m.get("poster_path"),
        })

    print(f"Saved page {page}, items {len(results)} (total collected: {len(collected)})")

    # Advance, then stop once past the reported last page or the safety cap.
    page += 1
    total_pages = data.get("total_pages", 0) or 0
    if page > total_pages or page > MAX_PAGES:
        print(f"Reached last page or max pages (page {page}, total_pages {total_pages}).")
        break

    time.sleep(REQUEST_DELAY)

# Write consolidated CSV (UTF-8)
fieldnames = ["tmdb_id","title","release_date","year","overview","vote_average","vote_count","popularity","poster_path"]
with CSV_PATH.open("w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(collected)

print("Done. Total movies collected:", len(collected))
print("CSV saved to", CSV_PATH)

Starting TMDB fetch. Output directory: /kaggle/working/data/raw/tmdb
Saved page 1, items 20 (total collected: 20)
Saved page 2, items 20 (total collected: 40)
Saved page 3, items 20 (total collected: 60)
Saved page 4, items 20 (total collected: 80)
Saved page 5, items 20 (total collected: 100)
Saved page 6, items 20 (total collected: 120)
Saved page 7, items 20 (total collected: 140)
Saved page 8, items 20 (total collected: 160)
Saved page 9, items 20 (total collected: 180)
Saved page 10, items 20 (total collected: 200)
Saved page 11, items 20 (total collected: 220)
Saved page 12, items 20 (total collected: 240)
Saved page 13, items 20 (total collected: 260)
Saved page 14, items 20 (total collected: 280)
Saved page 15, items 20 (total collected: 300)
Saved page 16, items 20 (total collected: 320)
Saved page 17, items 20 (total collected: 340)
Saved page 18, items 20 (total collected: 360)
Saved page 19, items 20 (total collected: 380)
Saved page 20, items 20 (total collected: 400)
Save

# Step 2: TMDB enrichment

In [7]:
# tmdb_details_kaggle.py
# Reads /kaggle/working/data/raw/tmdb/movies_2015_2025.csv
# Fetches per-movie details: genres, credits (top 3 cast), keywords, external_ids (imdb_id), poster_url
# Writes output to /kaggle/working/data/processed/movies_tmdb_enriched.csv

import time
import requests
import csv
import json
from pathlib import Path
from kaggle_secrets import UserSecretsClient

# TMDB API key comes from the notebook's secret store; abort early without it.
user_secrets = UserSecretsClient()
API_KEY = user_secrets.get_secret("TMDB_API_KEY")
if not API_KEY:
    raise RuntimeError("TMDB_API_KEY not found in Kaggle Secrets.")

# Input: the consolidated CSV written by the discover step.
IN_PATH = Path("/kaggle/working/data/raw/tmdb/movies_2015_2025.csv")
# Output: the enriched CSV, written under the processed-data directory.
OUT_DIR = Path("/kaggle/working/data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_CSV = OUT_DIR / "movies_tmdb_enriched.csv"

# Root of the TMDB v3 API; per-movie endpoint paths are appended to it.
BASE = "https://api.themoviedb.org/3"
# Sent with every request made by safe_get.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; tmdb-enricher/1.0)"}

def safe_get(url, params=None, retries=3):
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    Args:
        url: Endpoint to request.
        params: Optional query parameters.
        retries: Maximum number of attempts.

    Returns:
        The parsed JSON payload on HTTP 200, otherwise ``None`` (attempts
        exhausted, undecodable body, or a non-retryable client error).
    """
    for i in range(retries):
        try:
            r = requests.get(url, params=params, timeout=30, headers=HEADERS)
            if r.status_code == 200:
                return r.json()
            print(f"TMDB request {r.status_code} for {url}")
            # A missing movie or a rejected key answers the same way every
            # time; only timeouts and rate limiting are worth another attempt.
            if 400 <= r.status_code < 500 and r.status_code not in (408, 429):
                return None
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print("Request error:", e)
        # Growing pause between attempts; skipped after the final one.
        if i < retries - 1:
            time.sleep(1 + i)
    return None

# Enrich every movie from the discover CSV with genres, top-billed cast,
# keywords, IMDb id, poster URL, runtime and homepage.
rows = []
with IN_PATH.open("r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        tmdb_id = row.get("tmdb_id")
        if not tmdb_id:
            continue

        # One request per movie: append_to_response embeds the credits,
        # keywords and external_ids payloads inside the details response,
        # instead of calling each sub-endpoint separately.
        details = safe_get(
            f"{BASE}/movie/{tmdb_id}",
            params={
                "api_key": API_KEY,
                "language": "en-US",
                "append_to_response": "credits,keywords,external_ids",
            },
        ) or {}
        credits_data = details.get("credits") or {}
        keywords_data = details.get("keywords") or {}
        external_ids = details.get("external_ids") or {}

        # Entries without a name are dropped so the "|".join calls below never
        # receive None.
        genres_list = [g["name"] for g in (details.get("genres") or []) if g.get("name")]
        cast_top3 = [c["name"] for c in (credits_data.get("cast") or [])[:3] if c.get("name")]
        # Movie keywords arrive under "keywords"; "results" is accepted as a fallback.
        kw = keywords_data.get("keywords") or keywords_data.get("results") or []
        keyword_list = [k["name"] for k in kw if k.get("name")]

        imdb_id = external_ids.get("imdb_id") or details.get("imdb_id") or ""
        # Prefer the poster recorded by the discover step, then the details payload.
        poster_path = row.get("poster_path") or details.get("poster_path")
        poster_url = f"https://image.tmdb.org/t/p/w500{poster_path}" if poster_path else ""

        rows.append({
            "tmdb_id": tmdb_id,
            "title": row.get("title"),
            "release_date": row.get("release_date"),
            "year": row.get("year"),
            "overview": row.get("overview") or "",
            "vote_average": row.get("vote_average"),
            "vote_count": row.get("vote_count"),
            "popularity": row.get("popularity"),
            "genres": "|".join(genres_list),
            "cast_top3": "|".join(cast_top3),
            "keywords_tmdb": "|".join(keyword_list),
            "imdb_id": imdb_id,
            "poster_url": poster_url,
            "runtime": details.get("runtime"),
            "homepage": details.get("homepage") or "",
        })
        if (i + 1) % 50 == 0:
            print(f"Processed {i+1} movies")
        time.sleep(0.25)  # be polite

# Save CSV
# The column list is fixed so the header is complete even when no row was produced.
fieldnames = [
    "tmdb_id", "title", "release_date", "year", "overview",
    "vote_average", "vote_count", "popularity",
    "genres", "cast_top3", "keywords_tmdb",
    "imdb_id", "poster_url", "runtime", "homepage",
]
with OUT_CSV.open("w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print("TMDB enrichment complete. Saved to", OUT_CSV)

Processed 50 movies
Processed 100 movies
Processed 150 movies
Processed 200 movies
Processed 250 movies
Processed 300 movies
Processed 350 movies
Processed 400 movies
Processed 450 movies
Processed 500 movies
Processed 550 movies
Processed 600 movies
Processed 650 movies
Processed 700 movies
Processed 750 movies
Processed 800 movies
Processed 850 movies
Processed 900 movies
Processed 950 movies
Processed 1000 movies
Processed 1050 movies
Processed 1100 movies
Processed 1150 movies
Processed 1200 movies
Processed 1250 movies
Processed 1300 movies
Processed 1350 movies
Processed 1400 movies
Processed 1450 movies
Processed 1500 movies
Processed 1550 movies
Processed 1600 movies
Processed 1650 movies
Processed 1700 movies
Processed 1750 movies
Processed 1800 movies
Processed 1850 movies
Processed 1900 movies
Processed 1950 movies
Processed 2000 movies
Processed 2050 movies
Processed 2100 movies
Processed 2150 movies
Processed 2200 movies
Processed 2250 movies
Processed 2300 movies
Processe

# imdb_reviews_scraper

In [8]:
# imdb_reviews_scraper_kaggle.py
# Reads movies_tmdb_enriched.csv, scrapes top 3 reviews from imdb for movies with imdb_id
# Writes imdb_reviews.csv with columns: tmdb_id, imdb_id, title, review_idx, review_title, review_rating, review_text

import time
import csv
from pathlib import Path
from bs4 import BeautifulSoup
import requests

# Input: the enriched movie list; only rows carrying an imdb_id are scraped.
IN_CSV = Path("/kaggle/working/data/processed/movies_tmdb_enriched.csv")
# Output: one CSV holding every scraped review, in its own sub-directory.
OUT_DIR = Path("/kaggle/working/data/processed/imdb_scraped")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_REV = OUT_DIR / "imdb_reviews.csv"

# Sent with every request made by scrape_reviews.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; imdb-scraper/1.0)"}

def scrape_reviews(imdb_id, top_k=3):
    """Download the reviews page of one title and extract its first reviews.

    Args:
        imdb_id: Title id used to build the reviews URL.
        top_k: Number of review containers to read.

    Returns:
        A list of dicts with the keys ``title``, ``rating`` and ``content``;
        an empty list when the page does not answer HTTP 200.
    """
    page = requests.get(
        f"https://www.imdb.com/title/{imdb_id}/reviews",
        headers=HEADERS,
        timeout=20,
    )
    if page.status_code != 200:
        return []
    soup = BeautifulSoup(page.text, "html.parser")

    def text_of(node):
        # Stripped text of a matched element, or "" when nothing matched.
        return node.get_text(strip=True) if node else ""

    # Two container layouts are recognised; the first one that matches wins.
    boxes = soup.select(".review-container") or soup.select(".lister-item.mode-detail")
    reviews = []
    for box in boxes[:top_k]:
        heading = box.select_one(".title")
        body = box.select_one(".text.show-more__control")
        if not body:
            body = box.select_one(".content .text")
        score = box.select_one(".rating-other-user-rating span")
        reviews.append({
            "title": text_of(heading),
            "rating": text_of(score),
            "content": text_of(body),
        })

    if reviews:
        return reviews

    # Fallback when no container matched: take bare text blocks, without
    # title or rating, stopping once top_k have been gathered.
    for block in soup.select("div.text"):
        snippet = block.get_text(strip=True)
        if not snippet:
            continue
        reviews.append({"title": "", "rating": "", "content": snippet})
        if len(reviews) >= top_k:
            break
    return reviews

# Walk the enriched movie list and append up to three reviews per title to
# the output CSV; a failure on one title is reported and skipped.
with IN_CSV.open("r", encoding="utf-8") as f_in, OUT_REV.open("w", newline="", encoding="utf-8") as f_out:
    reader = csv.DictReader(f_in)
    writer = csv.writer(f_out)
    writer.writerow(["tmdb_id","imdb_id","title","review_idx","review_title","review_rating","review_text"])
    for i, row in enumerate(reader):
        imdb_id = row.get("imdb_id") or ""
        tmdb_id, title = row.get("tmdb_id"), row.get("title")
        if not imdb_id:
            continue
        try:
            revs = scrape_reviews(imdb_id, top_k=3)
            for position, rv in enumerate(revs, start=1):
                writer.writerow([
                    tmdb_id,
                    imdb_id,
                    title,
                    position,
                    rv.get("title", ""),
                    rv.get("rating", ""),
                    rv.get("content", ""),
                ])
        except Exception as e:
            print("Error scraping", imdb_id, e)
        # polite delay; every tenth input row gets a longer pause
        row_number = i + 1
        time.sleep(2.0 if row_number % 10 == 0 else 0.6)
        if row_number % 50 == 0:
            print("Scraped reviews for", row_number, "movies")

print("IMDb reviews scraping finished. Saved to", OUT_REV)

Scraped reviews for 50 movies
Scraped reviews for 100 movies
Scraped reviews for 150 movies
Scraped reviews for 200 movies
Scraped reviews for 250 movies
Scraped reviews for 300 movies
Scraped reviews for 350 movies
Scraped reviews for 400 movies
Scraped reviews for 450 movies
Scraped reviews for 500 movies
Scraped reviews for 550 movies
Scraped reviews for 600 movies
Scraped reviews for 650 movies
Scraped reviews for 700 movies
Scraped reviews for 750 movies
Scraped reviews for 800 movies
Scraped reviews for 850 movies
Scraped reviews for 900 movies
Scraped reviews for 950 movies
Scraped reviews for 1000 movies
Scraped reviews for 1050 movies
Scraped reviews for 1100 movies
Scraped reviews for 1150 movies
Scraped reviews for 1200 movies
Scraped reviews for 1250 movies
Scraped reviews for 1300 movies
Scraped reviews for 1350 movies
Scraped reviews for 1400 movies
Scraped reviews for 1450 movies
Scraped reviews for 1500 movies
Scraped reviews for 1550 movies
Scraped reviews for 1600 mov

# movielens_fetch_map

In [9]:
# movielens_fetch_map_kaggle.py
# Downloads MovieLens ml-latest-small (or 20m) and maps to TMDB ids via links.csv.
# Produces ratings_for_cf.csv with the MovieLens rating columns (userId, movieId, rating, timestamp) plus the matching tmdb_id

import os
import zipfile
import requests
import pandas as pd
from pathlib import Path

OUT_DIR = Path("/kaggle/working/data/ml")
OUT_DIR.mkdir(parents=True, exist_ok=True)
# choose dataset: 'ml-latest-small' (small) or 'ml-20m' (large if you want)
ML_NAME = "ml-latest-small"   # change to "ml-20m" if you need more
ML_URL = f"https://files.grouplens.org/datasets/movielens/{ML_NAME}.zip"
ZIP_PATH = OUT_DIR / f"{ML_NAME}.zip"

print("Downloading MovieLens:", ML_URL)
# Stream the archive to disk. The status is checked before anything is written
# so an error page is never saved as the zip, the timeout keeps a stalled
# connection from hanging the cell, and the context manager releases the
# connection when the download ends.
with requests.get(ML_URL, stream=True, timeout=60) as r:
    r.raise_for_status()
    with ZIP_PATH.open("wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
print("Downloaded to", ZIP_PATH)

with zipfile.ZipFile(ZIP_PATH, 'r') as z:
    z.extractall(OUT_DIR)
print("Extracted to", OUT_DIR)

# Read ratings and links
ratings = pd.read_csv(OUT_DIR / ML_NAME / "ratings.csv")
links = pd.read_csv(OUT_DIR / ML_NAME / "links.csv")  # contains movieId, imdbId, tmdbId (may be NaN)

# Read our processed TMDB enriched movie list to know which tmdb_ids exist
movies_tmdb = pd.read_csv("/kaggle/working/data/processed/movies_tmdb_enriched.csv", dtype={"tmdb_id": str})
tmdb_set = set(movies_tmdb["tmdb_id"].astype(str).tolist())

# Map MovieLens movieId -> tmdbId using links.csv, filter only those that have tmdbId and exist in our set
# tmdbId is turned into a plain integer string ("" when missing) so it compares
# equal to the string ids held in tmdb_set.
links['tmdbId'] = links['tmdbId'].astype(pd.Int64Dtype()).astype(object)
links['tmdbId'] = links['tmdbId'].apply(lambda x: str(int(x)) if pd.notna(x) else "")
links_filtered = links[links['tmdbId'] != ""].copy()
links_filtered = links_filtered[links_filtered['tmdbId'].isin(tmdb_set)]

print("MovieLens movies mapped to our TMDB list:", links_filtered.shape[0])

# Filter ratings to only these movieIds
movie_ids_to_keep = set(links_filtered['movieId'].tolist())
ratings_filtered = ratings[ratings['movieId'].isin(movie_ids_to_keep)].copy()

# Merge to add tmdbId
ratings_filtered = ratings_filtered.merge(links_filtered[['movieId','tmdbId']], on='movieId', how='left')
ratings_filtered = ratings_filtered.rename(columns={'tmdbId': 'tmdb_id'})

# Save ratings for collaborative filtering
OUT_RAT = Path("/kaggle/working/data/processed/ratings_for_cf.csv")
ratings_filtered.to_csv(OUT_RAT, index=False)
print("Saved filtered ratings to", OUT_RAT)
print("Ratings count:", len(ratings_filtered))

Downloading MovieLens: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Downloaded to /kaggle/working/data/ml/ml-latest-small.zip
Extracted to /kaggle/working/data/ml
MovieLens movies mapped to our TMDB list: 478
Saved filtered ratings to /kaggle/working/data/processed/ratings_for_cf.csv
Ratings count: 2060


# merge_master_dataset

In [1]:
# merge_master_dataset_fixed.py
# Robust merge script that handles NaNs and flexible paths (Kaggle).
# Produces movies_master.csv with 'combined_text' column.

import pandas as pd
from pathlib import Path
import json

# Helper: find a file by name under possible base dirs
def find_file(filename, candidates=None):
    """Locate ``filename`` under a list of base directories.

    Every base directory is first checked for a direct child with that name;
    only if none has one are the existing directories searched recursively,
    in the same order.

    Args:
        filename: Name of the file to look for.
        candidates: Base directories to search; defaults to the Kaggle
            working and input directories followed by the current directory.

    Returns:
        The path of the first match, or ``None`` when nothing is found.
    """
    if candidates is None:
        candidates = [
            Path("/kaggle/working/data/processed"),
            Path("/kaggle/working/data/raw/tmdb"),
            Path("/kaggle/input/high-rated-movies-dataset-20152025"),
            Path("/kaggle/input"),
            Path("."),
        ]

    # Pass 1: direct children only.
    direct_paths = (root / filename for root in candidates)
    direct_hit = next((path for path in direct_paths if path.exists()), None)
    if direct_hit is not None:
        return direct_hit

    # Pass 2: recursive search, taking the first match of the first directory that has one.
    for root in candidates:
        if not root.exists():
            continue
        nested_hit = next(root.rglob(filename), None)
        if nested_hit is not None:
            return nested_hit
    return None

# locate TMDB enriched file
# Locate the movie table: the enriched CSV when available, else the raw discover CSV.
tmdb_path = find_file("movies_tmdb_enriched.csv")
if tmdb_path is None:
    tmdb_path = find_file("movies_2015_2025.csv")
    if tmdb_path is None:
        raise FileNotFoundError("Could not find movies_tmdb_enriched.csv or movies_2015_2025.csv. Place it under /kaggle/working/data/processed or /kaggle/input.")
    print("Found raw TMDB CSV at", tmdb_path, "- it may lack genres/keywords/credits.")

print("Loading movies from:", tmdb_path)
# Every column is loaded as text and missing cells become "".
movies_tmdb = pd.read_csv(tmdb_path, dtype=str).fillna("")


def _concat_nonblank(texts):
    # Join the non-blank review texts of one movie with single spaces.
    return " ".join([t for t in texts if str(t).strip() != ""])


# Scraped reviews are optional; without them review_text is simply empty.
rev_path = find_file(
    "imdb_reviews.csv",
    candidates=[
        Path("/kaggle/working/data/processed/imdb_scraped"),
        Path("/kaggle/working/data/processed"),
        Path("/kaggle/input"),
    ],
)
if rev_path is not None and rev_path.exists():
    print("Loading scraped reviews from:", rev_path)
    reviews = pd.read_csv(rev_path, dtype=str).fillna("")
    # One row per movie holding all of its review texts.
    reviews_grouped = reviews.groupby('tmdb_id')['review_text'].apply(_concat_nonblank).reset_index()
    movies = movies_tmdb.merge(reviews_grouped, on='tmdb_id', how='left')
    movies['review_text'] = movies['review_text'].fillna("")
else:
    print("No imdb_reviews.csv found. Continuing without review_text.")
    movies = movies_tmdb.copy()
    movies['review_text'] = ""

# Create robust combined_text from multiple columns; skip empty strings
def safe_join(parts):
    """Join the usable entries of ``parts`` with ``" . "``.

    Each entry is converted to text and stripped. ``None`` entries, entries
    that are blank after stripping, and entries equal to "nan" in any letter
    case are left out.

    Args:
        parts: Iterable of values to combine.

    Returns:
        The joined string; empty when no entry is usable.
    """
    stripped = (str(part).strip() for part in parts if part is not None)
    return " . ".join(text for text in stripped if text and text.lower() != "nan")

# Normalize some columns if they exist
# Make sure every text column used below exists, adding empty ones as needed
for col in ['genres','keywords_tmdb','cast_top3','overview','title','review_text']:
    if col in movies.columns:
        continue
    movies[col] = ""


def _pipes_to_spaces(value):
    # "a|b|c" -> "a b c"; anything that is not a string is converted to text as is.
    return value.replace("|"," ") if isinstance(value, str) else str(value)


def _row_text(r):
    # Title, overview, genres, keywords, cast and reviews of one movie, in that order.
    return safe_join([
        r.get('title',''),
        r.get('overview',''),
        _pipes_to_spaces(r.get('genres','')),
        _pipes_to_spaces(r.get('keywords_tmdb','')),
        _pipes_to_spaces(r.get('cast_top3','')),
        r.get('review_text',''),
    ])


# Build combined_text
movies['combined_text'] = movies.apply(_row_text, axis=1)

# optional cleanup: year becomes numeric, unparseable values become NaN
if 'year' in movies.columns:
    movies['year'] = pd.to_numeric(movies['year'], errors='coerce')

OUT_DIR = Path("/kaggle/working/data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_MOV = OUT_DIR / "movies_master.csv"
movies.to_csv(OUT_MOV, index=False, encoding="utf-8")
print("Saved movies_master to", OUT_MOV)
print("movies_master shape:", movies.shape)

# quick sanity print: first 3 combined_text samples, at most 800 characters each
for sample_no, txt in enumerate(movies['combined_text'].head(3), start=1):
    print(f"\n--- sample {sample_no} ---\n{txt[:800]}\n")

Found raw TMDB CSV at /kaggle/input/high-rated-movies-dataset-20152025/movies_2015_2025.csv - it may lack genres/keywords/credits.
Loading movies from: /kaggle/input/high-rated-movies-dataset-20152025/movies_2015_2025.csv
No imdb_reviews.csv found. Continuing without review_text.
Saved movies_master to /kaggle/working/data/processed/movies_master.csv
movies_master shape: (8239, 14)

--- sample 1 ---
Zootopia 2 . After cracking the biggest case in Zootopia's history, rookie cops Judy Hopps and Nick Wilde find themselves on the twisting trail of a great mystery when Gary De’Snake arrives and turns the animal metropolis upside down. To crack the case, Judy and Nick must go undercover to unexpected new parts of town, where their growing partnership is tested like never before.


--- sample 2 ---
TRON: Ares . A highly sophisticated Program called Ares is sent from the digital world into the real world on a dangerous mission, marking humankind's first encounter with A.I. beings.


--- samp