In [1]:
import json

# 1. Input/output paths: adjust as needed.
# NOTE(review): both are absolute paths on one specific machine, so anyone
# else running this cell has to edit them first.
INPUT_PATH  = "/home/lenovo/Documentos/repos/final-project-cinedive/static/data/graph_for_project.json"
OUTPUT_PATH = "/home/lenovo/Documentos/repos/final-project-cinedive/static/data/graph_for_project_sin_country.json"

def procesar_json(input_path, output_path):
    """Copy a graph JSON file, dropping ``country_origin`` from every node.

    Args:
        input_path: Path of the JSON file to read. The top level is expected
            to be an object; nodes are taken from its ``"nodes"`` key, and a
            missing key is treated as an empty list.
        output_path: Path the cleaned document is written to (UTF-8, 2-space
            indent, non-ASCII characters written as-is). Everything other
            than the removed field is written back unchanged.

    Returns:
        None. A one-line summary with the number of nodes seen and the number
        of fields actually removed is printed to stdout.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    # 1. Read the original JSON.
    with open(input_path, "r", encoding="utf-8") as f:
        grafo = json.load(f)

    nodos = grafo.get("nodes", [])

    # 2. Remove the country field from each node. Only the exact key
    #    "country_origin" is removed; other keys that merely contain the word
    #    "country" are left alone. Removals are counted so the summary below
    #    reports what really happened instead of assuming something was removed.
    eliminados = 0
    for nodo in nodos:
        if "country_origin" in nodo:
            del nodo["country_origin"]
            eliminados += 1

    # 3. Save the new JSON without the country field.
    with open(output_path, "w", encoding="utf-8") as f_out:
        json.dump(grafo, f_out, ensure_ascii=False, indent=2)

    print(f"Proceso finalizado. Se procesaron {len(nodos)} nodos y se eliminaron {eliminados} campos de country.")

# Run the clean-up with the paths configured above when this code is executed
# directly rather than imported.
if __name__ == "__main__":
    procesar_json(INPUT_PATH, OUTPUT_PATH)


Proceso finalizado. Se procesaron 25650 nodos y se eliminaron campos de country.


In [5]:
import csv
import json
import ast
from collections import defaultdict

def parse_list_field(raw):
    """Turn a text cell into a list, whatever form the cell takes.

    Args:
        raw: Cell text, or None. Callers reading rows with csv.DictReader can
            receive None for a column that is missing from a short row.

    Returns:
        list: ``[]`` for None, empty or whitespace-only input. If the stripped
        text is a Python list or tuple literal, its elements as a list. If it
        is any other Python literal, a one-element list holding ``str(value)``.
        If it is not a literal at all, a one-element list holding the stripped
        text itself.
    """
    if raw is None:
        return []
    raw = raw.strip()
    if not raw:
        return []

    try:
        value = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (for example a bare word): keep the text as the
        # single element. literal_eval is documented to raise any of these
        # exception types on malformed input.
        return [raw]

    if isinstance(value, (list, tuple)):
        # A tuple literal is a sequence of items too; stringifying it whole
        # would yield one element that looks like "('a', 'b')".
        return list(value)
    return [str(value)]

def parse_person_field(field):
    """Parse the serialized person cell into a list of person dicts.

    The first and last characters of the stripped cell are discarded, the
    remainder is cut at every ``),(`` and each piece is split on its first two
    commas into an id, a role and a names part. The names part must be a
    Python literal: either a sequence, whose first element becomes the name,
    or a plain string, which is used as the name directly. Pieces that do not
    have three parts, and pieces whose names part is the backslash-N
    placeholder, are skipped silently. Pieces whose names part cannot be
    parsed are reported on stdout and skipped.

    Args:
        field: Cell text, or None. Callers reading rows with csv.DictReader
            can receive None for a column that is missing from a short row.

    Returns:
        list[dict]: One ``{"id", "name", "type"}`` dict per usable entry, in
        input order; ``[]`` for None or blank input.
    """
    people = []
    if field is None or not field.strip():
        return people

    raw_items = field.strip()[1:-1].split('),(')
    for item in raw_items:
        item = item.strip('()')
        parts = item.split(',', 2)
        if len(parts) != 3:
            continue  # not an (id, role, names) triple
        pid, role_type, name_part = (part.strip() for part in parts)
        if name_part == r'\N':
            continue  # skip records without a name
        try:
            value = ast.literal_eval(name_part)
            # A quoted string is already the name; indexing it would keep only
            # its first character. An empty string still falls through to
            # value[0] so that it is reported and skipped like an empty list.
            name = value if isinstance(value, str) and value else value[0]
        except Exception as e:
            # Best effort: report the bad entry and carry on with the rest.
            print(f"Erro ao processar pessoa: {item} -> {e}")
            continue
        people.append({
            "id": pid,
            "name": name,
            "type": role_type
        })
    return people

def _split_ids(raw):
    """Split a comma-separated id cell into stripped, non-empty ids."""
    # raw is None when csv.DictReader pads a short row. The \N placeholder is
    # already treated as "no value" when parsing the person column; it is
    # dropped here as well so it can never turn into a graph node.
    tokens = (tok.strip() for tok in (raw or '').split(','))
    return [tok for tok in tokens if tok and tok != r'\N']


def process_tsv_to_json(tsv_file, output_json):
    """Build a co-credit graph from a tab-separated file and save it as JSON.

    Every row is one film. Each id found in the ``directors`` and ``writers``
    columns and each entry parsed from the ``person`` column becomes a node;
    every pair of distinct ids credited on the same film becomes (or extends)
    a link.

    Nodes are keyed by id and the first occurrence wins, so an id seen first
    as a director keeps type ``"director"`` and uses the id itself as its
    name even if a later person entry carries a real name.

    Links are undirected: ``source``/``target`` are the two ids in sorted
    order. ``filmes`` collects the title of every row in which the pair
    appears; ``year`` and ``country`` hold the values of the last such row.

    Args:
        tsv_file: Path of the input file. It must have ``primaryTitle`` and
            ``startYear`` columns; ``country_origin``, ``directors``,
            ``writers`` and ``person`` are optional.
        output_json: Path the ``{"nodes": [...], "links": [...]}`` document is
            written to (UTF-8, 2-space indent, non-ASCII kept as-is).

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If a required column is missing from the header.
    """
    nodes = {}
    links_dict = defaultdict(lambda: {"filmes": [], "year": None, "country": None})

    with open(tsv_file, encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            # "or ''" guards against the None that DictReader stores for
            # columns missing from a short row.
            movie = (row['primaryTitle'] or '').strip()
            year = (row['startYear'] or '').strip()

            country_list = parse_list_field(row.get('country_origin') or '')
            country = country_list[0] if country_list else None

            directors = _split_ids(row.get('directors'))
            writers = _split_ids(row.get('writers'))
            people = parse_person_field(row.get('person') or '')

            for d in directors:
                if d not in nodes:
                    nodes[d] = {"id": d, "name": d, "type": "director"}
            for w in writers:
                if w not in nodes:
                    nodes[w] = {"id": w, "name": w, "type": "writer"}
            for p in people:
                if p['id'] not in nodes:
                    nodes[p['id']] = {
                        "id": p['id'],
                        "name": p['name'],
                        "type": p['type']
                    }

            # The same id can show up in several roles for one film. Without
            # de-duplication it would be paired with itself (a self-link) and
            # the film would be appended more than once to its other links.
            # dict.fromkeys keeps the first-seen order.
            all_ids = list(dict.fromkeys(
                directors + writers + [p['id'] for p in people]
            ))
            for i, first in enumerate(all_ids):
                for second in all_ids[i + 1:]:
                    key = tuple(sorted((first, second)))
                    link = links_dict[key]
                    link["filmes"].append(movie)
                    link["year"] = year
                    link["country"] = country

    result = {
        "nodes": list(nodes.values()),
        "links": [
            {
                "source": source,
                "target": target,
                "filmes": data["filmes"],
                "year": data["year"],
                "country": data["country"]
            }
            for (source, target), data in links_dict.items()
        ]
    }

    with open(output_json, 'w', encoding='utf-8') as f_out:
        json.dump(result, f_out, indent=2, ensure_ascii=False)

# 🔧 Example usage: build the graph from the TSV and write it to
# graphic_person.json (a relative path, so it lands in the current working directory).
process_tsv_to_json("../static/data/title_oscar_con_country.tsv", "graphic_person.json")
