In [2]:
import pandas as pd
import json
import ast
import re

# --- 0) Input file locations ---
oscar_tsv   = "/home/lenovo/Documentos/repos/final-project-cinedive/static/data/title_oscar.tsv"
world_csv   = "/home/lenovo/Documentos/repos/final-project-cinedive/static/data/world_imdb_movies_top_movies_per_year.csv"
name_basics = "/home/lenovo/Documentos/repos/final-project-cinedive/static/data/name.basics.tsv"

# --- 1) Load the inputs (every column is read as a string) ---
df_oscar = pd.read_csv(oscar_tsv, sep="\t", dtype=str)
df_world = pd.read_csv(world_csv, dtype=str)

# name.basics is optional: when the file is absent, fall back to an empty
# frame with the same layout (index "nconst", column "primaryName") so later
# lookups by nconst simply find nothing.
try:
    df_names = pd.read_csv(
        name_basics,
        sep="\t",
        dtype=str,
        usecols=["nconst", "primaryName"],
    ).set_index("nconst")
except FileNotFoundError:
    print(f"Advertencia: no se encontró '{name_basics}'. Los nodos 'person' no tendrán campo 'name'.")
    df_names = pd.DataFrame(columns=["primaryName"])
    df_names.index.name = "nconst"

# Columns of interest in the world CSV:
movie_id_col = "id"             # corresponds to tconst
country_col  = "country_origin" # actual column name in the file

# --- 2) Containers for nodes and links ---
nodes_dict = {}  # node id (tconst or person id) -> node dict; keeps nodes unique
links = []       # one dict per (person, movie) credit parsed from the "person" column


def _none_if_missing(value):
    """Return None for a pandas missing value (NaN/NA/None), else the value itself.

    json.dump serializes NaN as the bare token ``NaN``, which is not valid
    JSON for strict parsers, so missing cells must become None (``null``).
    """
    return None if pd.isna(value) else value


def _to_int(value, default=None):
    """Parse a cell as int; return ``default`` when it is missing or not an integer."""
    if pd.isna(value):
        return default
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _to_float(value):
    """Parse a cell as float; return None when it is missing or not numeric."""
    if pd.isna(value):
        return None
    try:
        return _none_if_missing(float(value))
    except (TypeError, ValueError):
        return None


def _split_csv(value):
    """Split a comma-separated cell into a list; a missing cell gives []."""
    return value.split(",") if pd.notna(value) else []


# Build the movie-id -> country lookup once instead of scanning df_world for
# every Oscar row. Rows without an id are dropped, and keep="first" preserves
# the "first matching row wins" behaviour of a positional lookup.
_world_first_rows = (
    df_world
    .dropna(subset=[movie_id_col])
    .drop_duplicates(subset=movie_id_col, keep="first")
)
country_by_movie = dict(
    zip(_world_first_rows[movie_id_col], _world_first_rows[country_col])
)

# Each credit in the "person" cell is one parenthesised group.
# NOTE(review): the pattern stops at the first ")", so a group whose text
# itself contains ")" would be cut short — confirm against the data format.
person_group_re = re.compile(r"\(([^)]+)\)")

# --- 3) Walk every row of df_oscar ---
for _, row in df_oscar.iterrows():
    tconst = row["tconst"]
    year = _to_int(row["startYear"])

    # 3.1) Create the "movie" node once, adding country_origin when the
    #      movie is present in df_world.
    if tconst not in nodes_dict:
        nodes_dict[tconst] = {
            "id": tconst,
            "type": "movie",
            "title": _none_if_missing(row["primaryTitle"]),
            "year": year,
            "genres": _split_csv(row["genres"]),
            "averageRating": _to_float(row["averageRating"]),
            "numVotes": _to_int(row["numVotes"]),
            "directors": _split_csv(row["directors"]),
            "writers": _split_csv(row["writers"]),
            "oscarNominations": _to_int(row["oscarNominations"], default=0),
            "oscarWins": _to_int(row["oscarWins"], default=0),
            "country_origin": _none_if_missing(country_by_movie.get(tconst)),
        }

    # 3.2) Parse the "person" cell into (person id, role, characters) credits.
    persons_str = row["person"]
    if pd.isna(persons_str) or not persons_str.strip():
        continue

    for group in person_group_re.findall(persons_str):
        # At most three fields: everything after the second comma is the
        # characters literal, which may itself contain commas.
        parts = group.strip().split(",", 2)
        if len(parts) < 3:
            continue
        person_id, role, chars_text = parts

        # The characters field is a Python literal in text form; anything
        # that cannot be parsed is treated as "no characters".
        try:
            characters = ast.literal_eval(chars_text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            characters = []

        # 3.2.1) Create the "person" node once, with its name when df_names
        #        has an entry for this id.
        if person_id not in nodes_dict:
            name = None
            if person_id in df_names.index:
                name = _none_if_missing(df_names.at[person_id, "primaryName"])
            nodes_dict[person_id] = {
                "id": person_id,
                "type": "person",
                "name": name,
            }

        # 3.2.2) Link the person to the movie.
        links.append({
            "source": person_id,
            "target": tconst,
            "role": role,
            "characters": characters,
            "year": year,
        })

# --- 4) Flatten the node dictionary into a list ---
nodes_list = [*nodes_dict.values()]

# --- 5) Write the graph to disk as JSON ---
out_path = "/home/lenovo/Documentos/repos/final-project-cinedive/static/data/graph_for_project.json"
output = dict(nodes=nodes_list, links=links)
with open(out_path, mode="w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(output, f, indent=2, ensure_ascii=False)

# Report where the file was written and how large the graph is.
print(
    f"JSON guardado en: {out_path}",
    f"Total nodos: {len(nodes_list)}",
    f"Total enlaces: {len(links)}",
    sep="\n",
)


JSON guardado en: /home/lenovo/Documentos/repos/final-project-cinedive/static/data/graph_for_project.json
Total nodos: 25650
Total enlaces: 40692
