In [2]:
%cd ../BackEnd

import json
import pandas as pd
from sqlalchemy.orm import Session
from models.credit import Credit
from models.person import Person
from models.cast_member import CastMember
from models.crew_member import CrewMember

c:\Users\Korisnik\Documents\GitHub\CineSense\BackEnd


In [1]:
%cd ../DB

import json
import pandas as pd
import csv

# Output CSV file names (adjust paths as needed)
PERSONS_CSV = "tv_persons_temp.csv"
CREDITS_CSV = "tv_credits_temp.csv"
CAST_CSV    = "tv_cast_temp.csv"
CREW_CSV    = "tv_crew_temp.csv"

# Input file, rows read per chunk, and the content_type label stored with
# every credit. The label lives in ONE place so cast and crew credits cannot
# drift apart (cast credits were previously written as "movie").
SOURCE_CSV   = "tv_credits.csv"
CHUNK_SIZE   = 100000
CONTENT_TYPE = "tv"


def _load_members(raw_json):
    """Decode one `cast`/`crew` cell into a list of member dicts.

    Args:
        raw_json: The raw cell value: a JSON string, or None/NaN when the
            cell is empty or the column is missing.

    Returns:
        A `(members, malformed)` tuple. `members` is a list of dicts (empty
        when the cell is empty or unusable); `malformed` is True when the cell
        held a value that could not be decoded into a JSON array.
    """
    if pd.isna(raw_json):
        return [], False
    try:
        decoded = json.loads(raw_json)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string cells.
        return [], True
    if not isinstance(decoded, list):
        return [], True
    # Ignore stray non-object entries so `.get` below can never fail.
    return [entry for entry in decoded if isinstance(entry, dict)], False


def _write_members(members, content_id, detail_fields, detail_writer,
                   persons_writer, credits_writer, persons_seen, credits_seen):
    """Write person, credit and cast/crew rows for one show's member list.

    Args:
        members: List of member dicts decoded from the `cast` or `crew` cell.
        content_id: The `id` value of the row the members belong to.
        detail_fields: The two member keys written after credit_id/person_id
            (e.g. ("character", "order") for cast).
        detail_writer: csv writer for the cast or crew file.
        persons_writer: csv writer for the persons file.
        credits_writer: csv writer for the credits file.
        persons_seen: Set of person ids already written (mutated in place).
        credits_seen: Set of credit ids already written (mutated in place).
    """
    for member in members:
        credit_id = member.get("credit_id")
        person_id = member.get("id")
        # A row without both keys cannot be linked, so it is skipped.
        if not credit_id or not person_id:
            continue

        # Write person record if not already added.
        if person_id not in persons_seen:
            persons_seen.add(person_id)
            persons_writer.writerow([
                person_id,
                member.get("name", ""),
                member.get("profile_path", "")
            ])

        # Write credit record if not already added.
        if credit_id not in credits_seen:
            credits_seen.add(credit_id)
            credits_writer.writerow([credit_id, content_id, CONTENT_TYPE])

        # Write the cast/crew detail record (always, even for a repeated credit).
        detail_writer.writerow(
            [credit_id, person_id] + [member.get(field, "") for field in detail_fields]
        )


# Open all CSV files for writing (using newline="" to avoid extra blank lines on Windows)
with open(PERSONS_CSV, "w", newline="", encoding="utf-8") as persons_file, \
     open(CREDITS_CSV, "w", newline="", encoding="utf-8") as credits_file, \
     open(CAST_CSV,    "w", newline="", encoding="utf-8") as cast_file, \
     open(CREW_CSV,    "w", newline="", encoding="utf-8") as crew_file:

    # Create CSV writers for each file
    persons_writer = csv.writer(persons_file)
    credits_writer = csv.writer(credits_file)
    cast_writer    = csv.writer(cast_file)
    crew_writer    = csv.writer(crew_file)

    # Write header rows so that COPY can skip the header if desired.
    persons_writer.writerow(["person_id", "name", "profile_path"])
    credits_writer.writerow(["credit_id", "content_id", "content_type"])
    cast_writer.writerow(["credit_id", "person_id", "character", "order"])
    crew_writer.writerow(["credit_id", "person_id", "department", "job"])

    # Global caches to avoid duplicates across all chunks.
    global_persons_cache = set()
    global_credits_cache = set()

    # Number of cast/crew cells that held undecodable JSON (reported at the end
    # instead of being silently swallowed).
    malformed_cells = 0

    # Read the TV credits CSV in chunks.
    chunk_iter = pd.read_csv(
        SOURCE_CSV,
        chunksize=CHUNK_SIZE,
        sep=",",
        encoding="utf-8",
        quotechar='"'
    )

    chunk_count = 0
    for chunk_df in chunk_iter:
        chunk_count += 1
        print(f"Processing chunk #{chunk_count} with {len(chunk_df)} rows...")

        for _, row in chunk_df.iterrows():
            content_id = row["id"]  # The `id` column of the TV credits row

            # (column name, detail writer, extra member fields) for each list.
            for column, detail_writer, detail_fields in (
                ("cast", cast_writer, ("character", "order")),
                ("crew", crew_writer, ("department", "job")),
            ):
                # `row.get` yields None when the column is absent from the CSV.
                members, malformed = _load_members(row.get(column))
                if malformed:
                    malformed_cells += 1
                _write_members(
                    members, content_id, detail_fields, detail_writer,
                    persons_writer, credits_writer,
                    global_persons_cache, global_credits_cache,
                )

    if malformed_cells:
        print(f"Skipped {malformed_cells} malformed cast/crew JSON cell(s).")

    print("All movie credits processed; CSV files created.")


c:\Users\Korisnik\Documents\GitHub\CineSense\DB
Processing chunk #1 with 100000 rows...
Processing chunk #2 with 96692 rows...
All movie credits processed; CSV files created.
