In [None]:
%cd ../BackEnd

import json
import pandas as pd
from sqlalchemy.orm import Session
from database import SessionLocal
from models.credit import Credit
from models.person import Person
from models.cast_member import CastMember
from models.crew_member import CrewMember

In [1]:
import json
import pandas as pd
import csv
import os

# CSV file names (these are the same files used for movie credits).
# Each file is first read to seed a de-duplication cache and is then opened in
# append mode, so rows produced by this cell land after any existing data.
PERSONS_CSV = "persons_temp.csv"  # rows written here: person_id, name, profile_path
CREDITS_CSV = "credits_temp.csv"  # rows written here: credit_id, content_id, "tv"
CAST_CSV    = "cast_temp.csv"     # rows written here: credit_id, person_id, character, order
CREW_CSV    = "crew_temp.csv"     # rows written here: credit_id, person_id, department, job

# Utility functions to load existing keys/pairs from a CSV file
def load_existing_keys(filename, key_index=0):
    """Collect the distinct values of one column of an existing CSV file.

    The first line of the file is treated as a header and ignored. Rows that
    are empty or too short to contain ``key_index`` are skipped.

    Args:
        filename: Path of the CSV file to scan.
        key_index: Zero-based position of the column holding the key.

    Returns:
        A set of the key strings found; empty when the file does not exist.
    """
    if not os.path.exists(filename):
        return set()

    with open(filename, "r", newline="", encoding="utf-8") as handle:
        records = csv.reader(handle)
        next(records, None)  # discard the header line, if there is one
        return {
            record[key_index]
            for record in records
            if record and len(record) > key_index
        }

def load_existing_pairs(filename, key_index1=0, key_index2=1):
    """Collect the distinct (column, column) pairs of an existing CSV file.

    The first line of the file is treated as a header and ignored. Rows that
    are empty or too short to contain *both* requested columns are skipped.

    Args:
        filename: Path of the CSV file to scan.
        key_index1: Zero-based position of the first element of each pair.
        key_index2: Zero-based position of the second element of each pair.

    Returns:
        A set of ``(row[key_index1], row[key_index2])`` string tuples; empty
        when the file does not exist.
    """
    pairs = set()
    if not os.path.exists(filename):
        return pairs

    # Guard on the larger index: checking only key_index2 would raise
    # IndexError on short rows whenever key_index1 > key_index2.
    highest_index = max(key_index1, key_index2)
    with open(filename, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header
        for row in reader:
            if row and len(row) > highest_index:
                pairs.add((row[key_index1], row[key_index2]))
    return pairs

# Seed the de-duplication caches from whatever is already in the CSV files.
# Persons and credits are keyed on their first column; cast and crew rows are
# keyed on their first two columns, i.e. (credit_id, person_id).
global_persons_cache, global_credits_cache = (
    load_existing_keys(csv_path) for csv_path in (PERSONS_CSV, CREDITS_CSV)
)
global_cast_cache, global_crew_cache = (
    load_existing_pairs(csv_path) for csv_path in (CAST_CSV, CREW_CSV)
)

print(
    f"Loaded {len(global_persons_cache)} persons, {len(global_credits_cache)} credits, "
    f"{len(global_cast_cache)} cast entries, and {len(global_crew_cache)} crew entries from existing CSVs."
)

def _parse_credit_list(raw):
    """Decode one ``cast``/``crew`` cell into a list of member records.

    Args:
        raw: The cell value as read by pandas (a JSON string, or NaN/None
            when the cell is empty or the column is absent).

    Returns:
        A ``(members, ok)`` tuple. ``members`` is the decoded list (empty when
        there is nothing usable); ``ok`` is False only when the cell held a
        value that could not be decoded into a JSON list.
    """
    if not pd.notna(raw):
        return [], True
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):  # JSONDecodeError is a ValueError
        return [], False
    if not isinstance(decoded, list):
        return [], False
    return decoded, True


# Open CSV files in append mode (so that new TV data gets added after movie data).
# A single ``with`` guarantees all four handles are flushed and closed even if
# processing fails part-way through.
# (Assumes the CSVs already have header rows from the movie import.)
with open(PERSONS_CSV, "a", newline="", encoding="utf-8") as persons_file, \
        open(CREDITS_CSV, "a", newline="", encoding="utf-8") as credits_file, \
        open(CAST_CSV, "a", newline="", encoding="utf-8") as cast_file, \
        open(CREW_CSV, "a", newline="", encoding="utf-8") as crew_file:
    persons_writer = csv.writer(persons_file)
    credits_writer = csv.writer(credits_file)
    cast_writer    = csv.writer(cast_file)
    crew_writer    = csv.writer(crew_file)

    # Cast and crew rows are handled identically apart from the source column,
    # the output file/cache, and the two role-specific fields that get written.
    member_targets = (
        ("cast", cast_writer, global_cast_cache, ("character", "order")),
        ("crew", crew_writer, global_crew_cache, ("department", "job")),
    )

    # Process the TV credits CSV file in chunks.
    chunk_iter = pd.read_csv(
        "tv_credits.csv",
        chunksize=100000,
        sep=",",
        encoding="utf-8",
        quotechar='"'
    )

    chunk_count = 0
    unparsable_cells = 0  # cast/crew cells that held something other than a JSON list
    for chunk_df in chunk_iter:
        chunk_count += 1
        print(f"Processing TV chunk #{chunk_count} with {len(chunk_df)} rows...")
        # Local caches for this chunk; merged into the global caches below.
        local_persons_cache = set()
        local_credits_cache = set()

        for _, row in chunk_df.iterrows():
            content_id = row["id"]  # TV TMDB ID

            for column, link_writer, link_cache, detail_keys in member_targets:
                raw_cell = row[column] if column in row else None
                members, parsed_ok = _parse_credit_list(raw_cell)
                if not parsed_ok:
                    # Best-effort import: skip the cell, but keep count so the
                    # problem is reported instead of silently ignored.
                    unparsable_cells += 1
                    continue

                for member in members:
                    if not isinstance(member, dict):
                        continue  # malformed entry; nothing to look up
                    credit_id = member.get("credit_id")
                    person_id = member.get("id")
                    if not credit_id or not person_id:
                        continue
                    # Convert IDs to strings so they compare equal to the
                    # keys loaded from the existing CSV files.
                    credit_id = str(credit_id)
                    person_id = str(person_id)

                    # Add person if new (check both global and local caches)
                    if (person_id not in global_persons_cache
                            and person_id not in local_persons_cache):
                        local_persons_cache.add(person_id)
                        persons_writer.writerow([
                            person_id,
                            member.get("name", ""),
                            member.get("profile_path", ""),
                        ])

                    # Add credit if new
                    if (credit_id not in global_credits_cache
                            and credit_id not in local_credits_cache):
                        local_credits_cache.add(credit_id)
                        credits_writer.writerow([credit_id, content_id, "tv"])

                    # Add cast/crew entry if new, using (credit_id, person_id)
                    # as the unique key.
                    link_pair = (credit_id, person_id)
                    if link_pair not in link_cache:
                        link_cache.add(link_pair)
                        link_writer.writerow([
                            credit_id,
                            person_id,
                            member.get(detail_keys[0], ""),
                            member.get(detail_keys[1], ""),
                        ])

        # Update global caches with new keys found in this chunk.
        global_persons_cache.update(local_persons_cache)
        global_credits_cache.update(local_credits_cache)

if unparsable_cells:
    print(f"Warning: skipped {unparsable_cells} cast/crew cells that were not valid JSON lists.")

print("TV credits CSV files appended with unique data.")


Loaded 3553674 persons, 14123007 credits, 7317625 cast entries, and 6805382 crew entries from existing CSVs.
Processing TV chunk #1 with 100000 rows...
Processing TV chunk #2 with 96692 rows...
TV credits CSV files appended with unique data.
