In [3]:
import pandas as pd
import json
import ast
import re

# --- 1) Load the tab-separated title/Oscar table ---
# dtype=str keeps every column as text; numeric fields are converted
# explicitly, one by one, when the graph nodes are built.
csv_path = "/home/lenovo/Documentos/repos/final-project-cinedive/static/data/title_oscar.tsv"
df = pd.read_table(csv_path, dtype=str)

# --- 2) Containers for the graph's nodes and links ---
# nodes_dict maps a node id (movie tconst or person id) to its node payload, so
# an id that appears on several rows is stored once; the first occurrence wins.
nodes_dict = {}
links = []

# Captures the text inside each "(...)" group of the "person" column.
# NOTE(review): a ")" inside a group (e.g. within a character name) ends the
# match early; the truncated character text then fails to parse and falls back
# to []. Confirm against the data before relying on "characters".
_PERSON_GROUP_RE = re.compile(r"\(([^)]+)\)")


def _split_field(value):
    """Return the comma-separated items of *value*, or [] when it is missing."""
    return value.split(",") if pd.notna(value) else []


def _convert_or_default(value, convert, default):
    """Return ``convert(value)``, or *default* when *value* is missing (NaN/None)."""
    return convert(value) if pd.notna(value) else default


def _parse_year(value):
    """Return *value* as an int, or None when it is missing or not an integer."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Covers NaN, None and non-numeric text without hiding
        # KeyboardInterrupt/SystemExit the way a bare ``except`` would.
        return None


def _parse_characters(text):
    """Return the Python literal written in *text*, or [] when it cannot be parsed."""
    try:
        # strip(): before Python 3.10 literal_eval rejects leading whitespace.
        return ast.literal_eval(text.strip())
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []


# --- 3) Walk the rows: one "movie" node per title, plus its credited people ---
for _, row in df.iterrows():
    tconst = row["tconst"]
    year = _parse_year(row["startYear"])

    # --- 3.1) Movie node ---
    movie_node = {
        "id": tconst,
        "type": "movie",
        # A missing title must become None: a float NaN would be written by
        # json.dump as the bare token NaN, which is not valid JSON.
        "title": _convert_or_default(row["primaryTitle"], str, None),
        "year": year,
        "genres": _split_field(row["genres"]),
        "averageRating": _convert_or_default(row["averageRating"], float, None),
        "numVotes": _convert_or_default(row["numVotes"], int, None),
        "directors": _split_field(row["directors"]),
        "writers": _split_field(row["writers"]),
        "oscarNominations": _convert_or_default(row["oscarNominations"], int, 0),
        "oscarWins": _convert_or_default(row["oscarWins"], int, 0),
    }
    # Keep the node from the first row seen for this title.
    nodes_dict.setdefault(tconst, movie_node)

    # --- 3.2) Parse the "person" field into (person id, role, characters) ---
    persons_str = row["person"]
    if pd.isna(persons_str) or persons_str.strip() == "":
        continue

    for group in _PERSON_GROUP_RE.findall(persons_str):
        # Split on the first two commas only: id, role, and the remainder,
        # which holds the character list and may itself contain commas.
        parts = group.strip().split(",", 2)
        if len(parts) < 3:
            # Fewer than two commas: not a complete credit, skip it.
            continue
        person_id, role, chars_text = parts

        # 3.2.1) Create the person node the first time this id is seen.
        nodes_dict.setdefault(person_id, {"id": person_id, "type": "person"})

        # 3.2.2) One link per credit, from the person to the movie.
        links.append(
            {
                "source": person_id,
                "target": tconst,
                "role": role,
                "characters": _parse_characters(chars_text),
                "year": year,
            }
        )

# --- 4) Flatten the id -> node mapping into a plain list of nodes ---
nodes_list = [*nodes_dict.values()]

# --- 5) Write the {"nodes": ..., "links": ...} document for use in D3 ---
out_path = "/home/lenovo/Documentos/repos/final-project-cinedive/static/data/graph_for_project.json"
output = dict(nodes=nodes_list, links=links)
with open(out_path, mode="w", encoding="utf-8") as json_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(output, json_file, indent=2, ensure_ascii=False)

# Report where the file went and how large the graph is.
for message in (
    f"JSON guardado en: {out_path}",
    f"Total nodos: {len(nodes_list)}",
    f"Total enlaces: {len(links)}",
):
    print(message)


JSON guardado en: /home/lenovo/Documentos/repos/final-project-cinedive/static/data/graph_for_project.json
Total nodos: 25650
Total enlaces: 40692
