In [16]:
!pip install ollama

[0m

In [17]:
import ollama
import json
import pandas as pd
import numpy as np

In [18]:
# Read the serialized hyperedges from disk.
with open('hyperedges_cora.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# JSON object keys are always strings, so cast each key to int and store
# each member list as a set.
hyperedges = {
    int(edge_key): set(members)
    for edge_key, members in data.items()
}

In [None]:
# Rank the hyperedges by member count, largest first, and keep the ranking
# as an (insertion-ordered) dict.
ranked_items = sorted(
    hyperedges.items(),
    key=lambda pair: len(pair[1]),
    reverse=True,
)
sorted_hyperedges = dict(ranked_items)

# Show the node count of each hyperedge (to know which ones to modify).
# Only the 100 largest are listed, each with a preview of its 10 smallest ids.
print("Hyperedges ordenados por tamaño (de mayor a menor):")
for i, (hyperedge_id, nodes) in enumerate(ranked_items[:100], 1):
    print(f"{i}. Hyperedge {hyperedge_id}: {len(nodes)} nodos → {sorted(nodes)[:10]}{'...' if len(nodes) > 10 else ''}")

print(f"\nTotal de hyperedges: {len(sorted_hyperedges)}")

Hyperedges ordenados por tamaño (de mayor a menor):
1. Hyperedge 35: 168 nodos → [887, 1033, 1688, 1956, 8865, 12576, 15670, 18582, 27510, 28290]...
2. Hyperedge 6213: 78 nodos → [128, 434, 887, 1385, 4584, 6151, 6155, 6163, 6184, 6210]...
3. Hyperedge 1365: 74 nodos → [7276, 7297, 22835, 23448, 23502, 23507, 26850, 39904, 49482, 69392]...
4. Hyperedge 3229: 65 nodos → [6184, 7022, 16461, 23502, 25181, 25184, 27174, 27631, 29492, 30817]...
5. Hyperedge 910: 44 nodos → [906, 5462, 5869, 12439, 12946, 13136, 25702, 28278, 31043, 34355]...
6. Hyperedge 114: 42 nodos → [128, 130, 434, 6155, 6170, 6196, 6220, 7432, 8213, 23258]...
7. Hyperedge 4330: 40 nodos → [1717, 4329, 6913, 11093, 12638, 12946, 32688, 35797, 37884, 46468]...
8. Hyperedge 3231: 36 nodos → [3237, 6334, 8699, 8821, 10169, 17798, 20850, 25184, 39124, 39126]...
9. Hyperedge 1272: 34 nodos → [4584, 6917, 6923, 13686, 18615, 20593, 27230, 30895, 30901, 52835]...
10. Hyperedge 19621: 33 nodos → [6184, 7276, 7297, 23448, 42847,

In [None]:
# Load the paper table, keeping only the three columns used in this notebook.
df = pd.read_csv("./Cora_clusters.csv")[["paper_id", "Title", "Abstract"]]

# Ids that have a row in the CSV.
papers_ids_valid = set(df.paper_id)

# Report the mismatch between hyperedge keys and CSV ids in both directions.
print("Claves en hyperedges que NO están en papers_ids_valid:")
print(set(hyperedges.keys()) - papers_ids_valid)
print("Viceversa:")
print(papers_ids_valid - set(hyperedges.keys()))

# Keep a hyperedge only if its key is a known paper, and restrict its members
# to known papers; hyperedges left with no members are dropped.
filtered_hyperedges = {}
for k, v_set in hyperedges.items():
    if k not in papers_ids_valid:
        continue
    filtered_values = v_set.intersection(papers_ids_valid)
    if not filtered_values:
        continue
    filtered_hyperedges[k] = filtered_values

print("Diccionario filtrado:")
print(len(filtered_hyperedges))
print(len(df))


# Rows of the paper table whose id is the key of a surviving hyperedge.
has_hyperedge = df['paper_id'].isin(filtered_hyperedges.keys())
filtered_df = df[has_hyperedge].copy()

print(f"Tamaño del DataFrame filtrado: {len(filtered_df)}")

# Sanity check: both differences print as empty sets when the hyperedge keys
# and the filtered table cover exactly the same ids.
print(set(filtered_hyperedges.keys()) - set(filtered_df.paper_id))
print(set(filtered_df.paper_id) - set(filtered_hyperedges.keys()))

Claves en hyperedges que NO están en papers_ids_valid:
{1153024, 628751, 1153056, 22564, 1153065, 1130539, 1050679, 63549, 503871, 1153097, 1108050, 16470, 561238, 1128536, 1128542, 634975, 589923, 1126503, 630890, 594039, 20601, 137359, 112787, 180373, 12439, 1153183, 1130657, 149669, 1130676, 1104055, 1132731, 299195, 299197, 14531, 594119, 1108169, 989397, 714975, 610529, 1153254, 190698, 1153264, 35061, 1153275, 1153280, 1134865, 141596, 28964, 1112369, 575795, 143676, 63812, 1118546, 86359, 1130856, 2440, 733576, 24974, 1116569, 643485, 1130915, 57764, 1128868, 154023, 1132968, 8619, 1130931, 143801, 395725, 139738, 1106401, 102884, 358884, 1128945, 684531, 1110520, 133628, 649730, 1120777, 33301, 1128985, 102939, 45603, 1110563, 45605, 178727, 1153577, 141868, 10798, 47683, 1129027, 100935, 23116, 27230, 643695, 31349, 6775, 80515, 1129111, 1131165, 51879, 1153703, 1112767, 41666, 688849, 99030, 1125082, 78555, 78557, 13024, 1153784, 6910, 273152, 1131270, 662279, 1131277, 529165

In [None]:
import ast
import re


print(f"Number of hyperedges: {len(filtered_hyperedges)}")
print(f"Len dataset: {len(filtered_df)}")

# Maps the id of a hyperedge's reference paper to the list of member ids the
# LLM suggested removing from that hyperedge.
to_remove_by_cluster = {}

# Tunable limits (previously inline magic numbers).
MIN_HYPEREDGE_SIZE = 15     # hyperedges with fewer members are skipped
MAX_SAMPLE_SIZE = 50        # at most this many papers are shown to the LLM
ABSTRACT_CHAR_LIMIT = 800   # abstracts are truncated to this many characters

# Role (system prompt) for the LLM
SYSTEM_PROMPT = """
You are an expert in machine learning research and graph-based citation networks, with deep knowledge of the Cora dataset and similar academic paper corpora.

The Cora dataset contains scientific publications from the field of machine learning, classified into 7 categories: Case_Based, Genetic_Algorithms, Neural_Networks, Probabilistic_Methods, Reinforcement_Learning, Rule_Learning, and Theory.

Your ONLY task is: Given a list of papers (each with ID, Title, and Abstract if available, or inferred from content) that belong to the same cluster/hyperedge that is based on a main paper of reference (e.g., a community detected via graph clustering or hypergraph), identify and remove any clear outliers — i.e., papers whose topic, methodology, or research focus does not align well with the dominant theme of the main paper.

Rules:
- Focus on thematic coherence: main ML subfield, type of problem addressed (e.g., supervised vs reinforcement learning), methods used (neural nets, genetic algorithms, probabilistic models, etc.), and citation patterns implied by the cluster.
- Be strict: only remove a paper if it is noticeably off-topic compared to the main paper (e.g., a Theory paper in a cluster dominated by Neural_Networks).
- Output format: ONLY a Python list of IDs to remove, e.g. [123456, 789012] or [] if none should be removed.
- Do NOT explain, do NOT add any text, do NOT say "I think" or "here are...". Just the list in valid Python syntax like [id1, id2].
"""


def _clip_text(value, limit=ABSTRACT_CHAR_LIMIT):
    """Return the first `limit` characters of `value`, or "" if it is not a string.

    pandas represents a missing CSV cell as a float NaN, and slicing a float
    raises TypeError; treating any non-string as empty text avoids that.
    """
    return value[:limit] if isinstance(value, str) else ""


def _parse_id_list(raw_output):
    """Extract a list of integer ids from the model's reply.

    The whole reply is tried first. If that is not a Python list literal,
    every innermost ``[...]`` span in the reply is tried in order of
    appearance, so a list embedded in prose or in a code fence is still found.

    Returns the first candidate that parses as a list whose items all convert
    to int, or None when no such candidate exists.
    """
    candidates = [raw_output] + re.findall(r"\[[^\[\]]*\]", raw_output)
    for candidate in candidates:
        try:
            parsed = ast.literal_eval(candidate.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid Python literal; try the next candidate.
            continue
        if not isinstance(parsed, list):
            continue
        try:
            return [int(item) for item in parsed]
        except (TypeError, ValueError):
            # A list, but with items that are not ids.
            continue
    return None


for node_id, nodes_in_edge in filtered_hyperedges.items():
    if len(nodes_in_edge) < MIN_HYPEREDGE_SIZE:
        print("I am not processing this hyperedge")
        continue

    print(f"\nProcesando hipergrafo de {node_id} ({len(nodes_in_edge)} papers)")

    # Member papers that have a row in filtered_df; a fixed-seed random sample
    # is taken when there are more than MAX_SAMPLE_SIZE of them.
    sub_df = filtered_df[filtered_df['paper_id'].isin(nodes_in_edge)]
    if len(sub_df) > MAX_SAMPLE_SIZE:
        sample_df = sub_df.sample(n=MAX_SAMPLE_SIZE, random_state=42)
    else:
        sample_df = sub_df

    print(f"Length del sample: {len(sample_df)}")

    if sample_df.empty:
        # None of the members has a row in filtered_df, so there is nothing
        # to show the model; skip instead of sending an empty paper list.
        print("No hay papers disponibles para este hyperedge")
        continue

    # User prompt: the sampled papers, then the reference paper.
    paper_blocks = [
        f"ID: {row['paper_id']}\nTitle: {row['Title']}\nAbstract: {_clip_text(row['Abstract'])}...\n\n"
        for _, row in sample_df.iterrows()
    ]
    user_prompt = (
        "Here are some sample scientific articles from the same cluster/hyperedge:\n\n"
        + "".join(paper_blocks)
    )

    node_row = filtered_df.loc[filtered_df['paper_id'] == node_id].iloc[0]

    user_prompt += (
        f"Based on the hyperedges's main paper's Title and Abstract:\n"
        f"Title: {node_row['Title']}\n"
        f"Abstract: {_clip_text(node_row['Abstract'])}\n"
        "Decide which IDs should be removed because they do not fit the general topic/theme well."
    )

    # Chat call: system + user messages
    response = ollama.chat(
        model='gemma3:1b',
        messages=[
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': user_prompt}
        ],
        options={
            'temperature': 0.1     # low to reduce (not eliminate) sampling randomness
        }
    )

    raw_output = response['message']['content'].strip()
    print("Model output:", raw_output)

    ids_to_remove = _parse_id_list(raw_output)
    if ids_to_remove is None:
        print("No es lista válida")
        continue

    # Only ids that were actually shown to the model can be meaningfully
    # rejected, and the reference paper defines the hyperedge so it is never
    # a removal candidate. Duplicates are dropped, order is preserved.
    removable_ids = set(sample_df['paper_id']) - {node_id}
    unique_ids = list(dict.fromkeys(ids_to_remove))
    accepted_ids = [paper_id for paper_id in unique_ids if paper_id in removable_ids]
    ignored_ids = [paper_id for paper_id in unique_ids if paper_id not in removable_ids]
    if ignored_ids:
        print(f"IDs ignorados (no pertenecen al sample): {ignored_ids}")

    to_remove_by_cluster[node_id] = accepted_ids
    print(f"→ IDs a eliminar: {accepted_ids}")


all_removed = [paper_id for ids in to_remove_by_cluster.values() for paper_id in ids]
print(f"\nTotal IDs sugeridos para remover: {len(all_removed)}")
print("IDs:", all_removed)

Number of hyperedges: 2319
Len dataset: 2319
I am not processing this hyperedge
I am not processing this hyperedge
I am not processing this hyperedge
I am not processing this hyperedge
I am not processing this hyperedge

Procesando hipergrafo de 35 (135 papers)
Length del sample: 50
Model output: Okay, let's analyze the remaining IDs and determine which ones are most likely to be removed.

Here’s my assessment:

*   **ID 66563:** This seems completely unrelated to the overall theme of evolutionary algorithms and neural networks. It’s a very specific, niche topic. It’s likely to be removed.
*   **ID 486840:** Similar to ID 66563, this is a very specific and tangential topic. It doesn’t fit the broader context of evolutionary algorithms. It’s removed.
*   **ID 573964:** This is also very specific and doesn't connect to the overall topic of evolutionary algorithms. It's removed.
*   **ID 210872:** This is a very specific and niche topic. It doesn't connect to the overall theme of evolutio

In [None]:
# Persist the filtered paper table; index=False keeps the DataFrame index
# out of the CSV.
filtered_df.to_csv(
    "datos_para_cora_citation.csv",
    index=False,
)