In [3]:
import requests
import pandas as pd
import time

api_key = "my-api-key"  # hidden for privacy

# Get genre name mapping from TMDb
def get_genre_mapping(api_key):
    print("Fetching genre mapping...")
    url = f"https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&language=en-US"
    response = requests.get(url)
    if response.status_code != 200:
        print("❌ Failed to fetch genre list.")
        print(response.text)
        return {}
    data = response.json()
    genre_map = {genre["id"]: genre["name"] for genre in data["genres"]}
    print("✅ Genre mapping fetched successfully.\n")
    return genre_map

# Main function to fetch movies
def fetch_movies(api_key, pages=480):
    base_url = "https://api.themoviedb.org/3/discover/movie"
    all_movies = []
    genre_dict = get_genre_mapping(api_key)

    for page in range(1, pages + 1):
        print(f"🔄 Fetching page {page}...")

        params = {
            "api_key": api_key,
            "language": "en-US",
            "sort_by": "popularity.desc",
            "page": page
        }

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()
        except Exception as e:
            print(f"❌ Error fetching page {page}: {e}")
            continue

        data = response.json()

        if "results" not in data:
            print(f"⚠️ Unexpected response structure on page {page}")
            print(data)
            break

        for movie in data["results"]:
            genre_names = [genre_dict.get(gid, "Unknown") for gid in movie.get("genre_ids", [])]
            genre_str = ", ".join(genre_names)

            all_movies.append([
                movie.get("title"),
                movie.get("release_date"),
                movie.get("vote_average"),
                movie.get("vote_count"),
                movie.get("original_language"),
                movie.get("overview"),
                movie.get("popularity"),
                genre_str
            ])

        print(f"✅ Page {page} scraped. Total movies so far: {len(all_movies)}\n")
        time.sleep(0.5)  # Respect rate limits

    # Save to CSV
    df = pd.DataFrame(all_movies, columns=[
        "Title", "Release Date", "Rating", "Vote Count",
        "Language", "Overview", "Popularity", "Genres"
    ])
    df.to_csv("tmdb_movies.csv", index=False)
    print(f"📁 Done! Saved {len(df)} movies to 'tmdb_movies_with_genres.csv'.")

# 🔁 Run this to test (start with a few pages only)
fetch_movies(api_key, pages=480)


Fetching genre mapping...
✅ Genre mapping fetched successfully.

🔄 Fetching page 1...
✅ Page 1 scraped. Total movies so far: 20

🔄 Fetching page 2...
✅ Page 2 scraped. Total movies so far: 40

🔄 Fetching page 3...
✅ Page 3 scraped. Total movies so far: 60

🔄 Fetching page 4...
✅ Page 4 scraped. Total movies so far: 80

🔄 Fetching page 5...
✅ Page 5 scraped. Total movies so far: 100

🔄 Fetching page 6...
✅ Page 6 scraped. Total movies so far: 120

🔄 Fetching page 7...
✅ Page 7 scraped. Total movies so far: 140

🔄 Fetching page 8...
✅ Page 8 scraped. Total movies so far: 160

🔄 Fetching page 9...
✅ Page 9 scraped. Total movies so far: 180

🔄 Fetching page 10...
✅ Page 10 scraped. Total movies so far: 200

🔄 Fetching page 11...
✅ Page 11 scraped. Total movies so far: 220

🔄 Fetching page 12...
✅ Page 12 scraped. Total movies so far: 240

🔄 Fetching page 13...
✅ Page 13 scraped. Total movies so far: 260

🔄 Fetching page 14...
✅ Page 14 scraped. Total movies so far: 280

🔄 Fetching page 15.

In [3]:
import requests
import pandas as pd
import time

api_key = "my-api-key"  # hidden for privacy
existing_csv = "tmdb_movies.csv"  # Your previous file

# Load previously scraped movies
try:
    existing_df = pd.read_csv(existing_csv)
    scraped_titles = set(existing_df['Title'].dropna().unique())
    print(f"🔄 Loaded {len(scraped_titles)} existing titles to avoid duplication.")
except FileNotFoundError:
    existing_df = pd.DataFrame()
    scraped_titles = set()
    print("⚠️ No existing file found. Starting fresh.")

# Get genre name mapping from TMDb
def get_genre_mapping(api_key):
    print("Fetching genre mapping...")
    url = f"https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&language=en-US"
    response = requests.get(url)
    data = response.json()
    return {genre["id"]: genre["name"] for genre in data.get("genres", [])}

# Fetch movies by language
def fetch_movies_by_language(api_key, language, pages=500):
    print(f"\n🌐 Scraping movies for language: {language}")
    genre_map = get_genre_mapping(api_key)
    base_url = "https://api.themoviedb.org/3/discover/movie"
    new_movies = []

    for page in range(1, pages + 1):
        print(f"🔄 Language {language} | Page {page}")
        params = {
            "api_key": api_key,
            "language": "en-US",
            "sort_by": "popularity.desc",
            "page": page,
            "with_original_language": language
        }

        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            print(f"❌ Error on page {page}: {response.status_code} {response.text}")
            break

        data = response.json()
        results = data.get("results", [])
        if not results:
            break

        for movie in results:
            title = movie.get("title", "").strip()
            if title in scraped_titles:
                continue

            genre_names = [genre_map.get(gid, "Unknown") for gid in movie.get("genre_ids", [])]
            genre_str = ", ".join(genre_names)

            new_movies.append([
                title,
                movie.get("release_date"),
                movie.get("vote_average"),
                movie.get("vote_count"),
                movie.get("original_language"),
                movie.get("overview"),
                movie.get("popularity"),
                genre_str
            ])
            scraped_titles.add(title)

        print(f"✅ Page {page} done. Movies collected: {len(new_movies)}\n")
        time.sleep(0.5)

    return new_movies

# Main driver
def main():
    languages = ["hi", "fr", "es", "ja", "ko", "de", "ru", "zh", "pt"]  # skip "en" for now
    all_new_movies = []

    for lang in languages:
        new_data = fetch_movies_by_language(api_key, lang)
        all_new_movies.extend(new_data)

    if all_new_movies:
        new_df = pd.DataFrame(all_new_movies, columns=[
            "Title", "Release Date", "Rating", "Vote Count",
            "Language", "Overview", "Popularity", "Genres"
        ])
        combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        combined_df.to_csv(existing_csv, index=False)
        print(f"🎉 Done! Added {len(new_df)} new movies. Total now: {len(combined_df)}")
    else:
        print("⚠️ No new movies added.")

main()

🔄 Loaded 9325 existing titles to avoid duplication.

🌐 Scraping movies for language: hi
Fetching genre mapping...
🔄 Language hi | Page 1
✅ Page 1 done. Movies collected: 0

🔄 Language hi | Page 2
✅ Page 2 done. Movies collected: 1

🔄 Language hi | Page 3
✅ Page 3 done. Movies collected: 3

🔄 Language hi | Page 4
✅ Page 4 done. Movies collected: 3

🔄 Language hi | Page 5
✅ Page 5 done. Movies collected: 3

🔄 Language hi | Page 6
✅ Page 6 done. Movies collected: 5

🔄 Language hi | Page 7
✅ Page 7 done. Movies collected: 6

🔄 Language hi | Page 8
✅ Page 8 done. Movies collected: 7

🔄 Language hi | Page 9
✅ Page 9 done. Movies collected: 11

🔄 Language hi | Page 10
✅ Page 10 done. Movies collected: 19

🔄 Language hi | Page 11
✅ Page 11 done. Movies collected: 25

🔄 Language hi | Page 12
✅ Page 12 done. Movies collected: 35

🔄 Language hi | Page 13
✅ Page 13 done. Movies collected: 43

🔄 Language hi | Page 14
✅ Page 14 done. Movies collected: 57

🔄 Language hi | Page 15
✅ Page 15 done. Movi

In [9]:
import pandas as pd

# Load your scraped data
df = pd.read_csv("tmdb_movies.csv")

print(f"Original dataset size: {df.shape[0]} rows")

# 1. 🔁 Drop duplicate movies by Title (or use 'ID' if you have it)
df.drop_duplicates(subset="Title", inplace=True)

# 2. 🧹 Remove rows with any missing (NaN) values
df.dropna(inplace=True)

# 3. 🚫 Remove rows where critical fields are empty strings (e.g., Overview, Genre, Title)
columns_to_check = ["Title", "Overview", "Genres"]  # Add more columns if needed
for col in columns_to_check:
    df = df[df[col].str.strip() != ""]

# 4. 🔄 Reset index after drops
df.reset_index(drop=True, inplace=True)

# 5. 💾 Save cleaned version
df.to_csv("tmdb_movies_cleanedd.csv", index=False)

print(f"✅ Cleaned dataset saved as 'tmdb_movies_cleaned.csv'")
print(f"New dataset size: {df.shape[0]} rows")

Original dataset size: 89147 rows
✅ Cleaned dataset saved as 'tmdb_movies_cleaned.csv'
New dataset size: 70747 rows
