In [2]:
!pip install ollama

[0m

In [None]:
import ast
import random
import re

import ollama
import pandas as pd

# Read the clustered Cora export, then keep only the columns used below.
df = pd.read_csv("./Cora_clusters.csv")
df = df[["paper_id", "Title", "Abstract", "cluster_hypergraph"]]

# Every distinct cluster label counts as one hyperedge.
n_hyperedges = len(set(df["cluster_hypergraph"]))
print(f"Number of hyperedges: {n_hyperedges}")

# Papers bucketed by their hyperedge label.
grouped = df.groupby("cluster_hypergraph")

# Maps a cluster label to the paper IDs the model suggests removing from it.
to_remove_by_cluster = {}

# System prompt (expert role): frames the model as a Cora / citation-network
# expert and restricts its reply to a bare Python list of paper IDs to remove.
# It is passed as the 'system' message of the ollama.chat call; the text is
# runtime input to the model, so edit it deliberately.
SYSTEM_PROMPT = """
You are an expert in machine learning research and graph-based citation networks, with deep knowledge of the Cora dataset and similar academic paper corpora.

The Cora dataset contains scientific publications from the field of machine learning, classified into 7 categories: Case_Based, Genetic_Algorithms, Neural_Networks, Probabilistic_Methods, Reinforcement_Learning, Rule_Learning, and Theory.

Your ONLY task is: Given a list of papers (each with ID, Title, and Abstract if available, or inferred from content) that belong to the same cluster/hyperedge (e.g., a community detected via graph clustering or hypergraph), identify and remove any clear outliers — i.e., papers whose topic, methodology, or research focus does not align well with the dominant theme of the group.

Rules:
- Focus on thematic coherence: main ML subfield, type of problem addressed (e.g., supervised vs reinforcement learning), methods used (neural nets, genetic algorithms, probabilistic models, etc.), and citation patterns implied by the cluster.
- Be strict: only remove a paper if it is noticeably off-topic compared to the majority (e.g., a Theory paper in a cluster dominated by Neural_Networks).
- Output format: ONLY a Python list of IDs to remove, e.g. [123456, 789012] or [] if none should be removed.
- Do NOT explain, do NOT add any text, do NOT say "I think" or "here are...". Just the list in valid Python syntax like [id1, id2].
"""

# Cluster-size policy for the outlier pass.
MIN_CLUSTER_SIZE = 10       # clusters with fewer papers are skipped
SAMPLE_THRESHOLD = 500      # clusters larger than this are subsampled
SAMPLE_SIZE = 50            # papers drawn from an oversized cluster
ABSTRACT_CHAR_LIMIT = 800   # abstract characters included per paper
# NOTE(review): clusters with 10-500 papers are sent whole while larger ones
# are cut to 50; confirm the 500 threshold is intended and that a prompt of
# that size fits the model's context window.

# Matches a flat bracketed span such as "[1, 2]" or "[]" anywhere in a reply.
_ID_LIST_PATTERN = re.compile(r"\[[^\[\]]*\]")


def _format_paper(row):
    """Render one paper as the ID/Title/Abstract stanza used in the user prompt."""
    abstract = row['Abstract']
    if isinstance(abstract, str):
        # Truncate long abstracts to keep the prompt size bounded.
        abstract_text = f"{abstract[:ABSTRACT_CHAR_LIMIT]}..."
    else:
        # A missing abstract is read from the CSV as NaN (a float); slicing it
        # would raise TypeError, so substitute a placeholder instead.
        abstract_text = "(not available)"
    return f"ID: {row['paper_id']}\nTitle: {row['Title']}\nAbstract: {abstract_text}\n\n"


def _extract_id_list(raw_output):
    """Return the first list literal embedded in the model reply, or None.

    The model does not always answer with a bare list (it may add prose or
    markdown fences), so the reply is scanned for bracketed spans rather than
    being evaluated as a whole.
    """
    for candidate in _ID_LIST_PATTERN.findall(raw_output):
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError):
            continue
        if isinstance(parsed, list):
            return parsed
    return None


def _coerce_ids(candidates):
    """Convert list entries to int, reporting and skipping any that cannot be converted."""
    ids = []
    for candidate in candidates:
        try:
            ids.append(int(candidate))
        except (TypeError, ValueError):
            print(f"ID no numérico ignorado: {candidate!r}")
    return ids


for cluster_id, group in grouped:
    if len(group) < MIN_CLUSTER_SIZE:
        continue

    print(f"\nProcesando cluster {cluster_id} ({len(group)} papers)")

    # Fixed seed so the same papers are sampled on every run.
    if len(group) > SAMPLE_THRESHOLD:
        sample_df = group.sample(n=SAMPLE_SIZE, random_state=42)
    else:
        sample_df = group

    print(f"Length del sample: {len(sample_df)}")

    papers_text = "".join(_format_paper(row) for _, row in sample_df.iterrows())
    user_prompt = (
        "Here are some sample scientific articles from the same cluster/hyperedge:\n\n"
        + papers_text
        + "Based on these papers (all from the same hyperedge), "
        "decide which IDs should be removed because they do not fit the general topic/theme well."
    )

    # Chat call with a system message (role/instructions) and a user message (papers).
    response = ollama.chat(
        model='gemma3:1b',
        messages=[
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': user_prompt}
        ],
        options={
            'temperature': 0.1     # low temperature for near-deterministic output
        }
    )

    raw_output = response['message']['content'].strip()
    print("Model output:", raw_output)

    suggested = _extract_id_list(raw_output)
    if suggested is None:
        print(f"Error parseando: no se encontró una lista válida → output crudo: {raw_output}")
        continue

    # Only IDs that were actually shown to the model can be removed; anything
    # else is an ID the model made up or mangled.
    shown_ids = set(sample_df['paper_id'].tolist())
    ids_to_remove = []
    unknown_ids = []
    for paper_id in _coerce_ids(suggested):
        if paper_id in shown_ids:
            ids_to_remove.append(paper_id)
        else:
            unknown_ids.append(paper_id)

    if unknown_ids:
        print(f"IDs desconocidos ignorados: {unknown_ids}")

    to_remove_by_cluster[cluster_id] = ids_to_remove
    print(f"→ IDs a eliminar: {ids_to_remove}")

# Flatten the per-cluster suggestions into one list, in dict insertion order.
all_removed = []
for cluster_ids in to_remove_by_cluster.values():
    all_removed.extend(cluster_ids)

print(f"\nTotal IDs sugeridos para remover: {len(all_removed)}")
print("IDs:", all_removed)

Number of hyperedges: 160

Procesando cluster -1 (940 papers)
Length del sample: 50
Model output: Based on the provided papers, the following IDs should be removed because they don't fit the general topic/theme:

*   **1131300:** This paper focuses on Bayesian noninformative mixture modeling, which is a distinct area from mobile robot navigation and case-based reasoning.

*   **1105810:** This paper focuses on mobile robot navigation, which is a specific application of case-based reasoning.

The remaining IDs are:

*   **19231:** This paper focuses on query by committee, which is a different area of AI research.
*   **195361:** This paper focuses on interpreting complex scenes using Bayesian networks, which is a different area of AI research.
*   **458439:** This paper focuses on PFSA modelling of behavioural sequences by evolutionary programming rockhampton, Queensland. (1994) "PFSA Modelling of Behavioural
*   **1119742:** This paper focuses on interpretation of complex scenes using 