In [1]:
!pip install ollama

[0m

In [None]:
import ast
import random
import re

import ollama
import pandas as pd

# ---------------------------------------------------------------------------
# LLM-assisted outlier screening of paper clusters (hyperedges).
#
# For every cluster larger than MIN_CLUSTER_SIZE, a random sample of papers is
# shown to a local Ollama model, which is asked to name the PMIDs that do not
# fit the cluster's dominant topic. Suggested removals are collected per
# cluster in `to_remove_by_cluster`.
# ---------------------------------------------------------------------------

# Configuration: every value a reader may want to tune.
CSV_PATH = "./PubMded_clusterwise.csv"
MODEL_NAME = 'gemma3:1b'
MIN_CLUSTER_SIZE = 100      # clusters with this many papers or fewer are skipped
SAMPLE_FRACTION = 0.2       # share of each cluster drawn for the prompt
MAX_SAMPLE_SIZE = 40        # hard cap on the number of papers per prompt
ABSTRACT_CHAR_LIMIT = 800   # abstracts are cut to this many characters
SEED = 42                   # fixed seed so re-runs sample the same papers
MISSING_TEXT = "(not available)"  # shown in the prompt for empty Title/Abstract

df = pd.read_csv(CSV_PATH)[["PMID", "Title", "Abstract", "cluster_hypergraph"]]
print(f"Number of hyperedges: {len(set(df.cluster_hypergraph))}")

grouped = df.groupby('cluster_hypergraph')

# cluster id -> list of PMIDs (ints) the model suggested removing
to_remove_by_cluster = {}

# Expert role given to the model.
SYSTEM_PROMPT = """
You are an expert biomedical researcher specializing in diabetes, metabolism, retinal complications, mitochondrial function, and related topics.
Your ONLY task is: Given a list of scientific papers (each with ID, Title, and Abstract) that belong to the same cluster/hyperedge, identify and remove any that are clear outliers — i.e., papers that do not fit the dominant topic/theme of the group.

Rules:
- Focus on thematic coherence (main disease model, organ/system studied, experimental approach, etc.).
- Be strict: only remove if the paper is noticeably off-topic compared to the majority.
- Output format: ONLY a Python list of IDs to remove, e.g. [12187484, 2344352] or [] if none should be removed.
- Do NOT explain, do NOT add any text. Just the list like [id1, id2].
"""


def _build_user_prompt(sample_df):
    """Build the user message listing ID, title and truncated abstract per paper.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        Rows to show to the model; must have 'PMID', 'Title', 'Abstract'.

    Returns
    -------
    str
        The complete user prompt.
    """
    parts = ["Here are some sample scientific articles from the same cluster/hyperedge:\n\n"]
    for _, row in sample_df.iterrows():
        title = row['Title']
        abstract = row['Abstract']
        # Empty CSV cells arrive as float NaN: slicing NaN raises TypeError and
        # formatting it would put the literal word "nan" into the prompt.
        title_text = MISSING_TEXT if pd.isna(title) else title
        if isinstance(abstract, str):
            abstract_text = f"{abstract[:ABSTRACT_CHAR_LIMIT]}..."
        else:
            abstract_text = MISSING_TEXT
        parts.append(f"ID: {row['PMID']}\nTitle: {title_text}\nAbstract: {abstract_text}\n\n")
    parts.append(
        "Based on these papers (all from the same hyperedge), "
        "decide which IDs should be removed because they do not fit the general topic/theme well."
    )
    return "".join(parts)


def _parse_id_list(raw_output):
    """Extract a list of integer IDs from the model reply.

    The whole reply is tried first; if that is not a Python list literal, each
    flat ``[...]`` span inside the reply (e.g. a list wrapped in a code fence
    or surrounded by text) is tried in order. The first span that evaluates to
    a list wins. Items that cannot be converted to ``int`` are skipped and
    duplicates are dropped, keeping first-seen order.

    Parameters
    ----------
    raw_output : str
        Text returned by the model.

    Returns
    -------
    list[int]
        The IDs found (possibly empty).

    Raises
    ------
    ValueError
        If no span of the reply evaluates to a Python list.
    """
    candidates = [raw_output] + re.findall(r"\[[^\[\]]*\]", raw_output)
    for candidate in candidates:
        try:
            parsed = ast.literal_eval(candidate.strip())
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a literal (prose, markdown link text, ...): try the next span.
            continue
        if not isinstance(parsed, list):
            continue
        found_ids = []
        for item in parsed:
            try:
                paper_id = int(item)
            except (ValueError, TypeError, OverflowError):
                continue
            if paper_id not in found_ids:
                found_ids.append(paper_id)
        return found_ids
    raise ValueError("no Python list of IDs found in the model reply")


for cluster_id, group in grouped:
    # NOTE(review): the recorded output shows a cluster labelled -1 with 2860
    # papers being processed. If -1 marks unassigned/noise papers (a common
    # convention), outlier screening is probably meaningless for it - confirm
    # and skip it here if so.
    if len(group) <= MIN_CLUSTER_SIZE:
        continue

    print(f"\nProcesando cluster {cluster_id} ({len(group)} papers)")

    # Two-stage draw kept exactly as before so the fixed seed keeps selecting
    # the same papers as in earlier runs.
    sample_size = int(len(group) * SAMPLE_FRACTION)
    sample_df = group.sample(n=sample_size, random_state=SEED)
    if len(sample_df) > MAX_SAMPLE_SIZE:
        sample_df = sample_df.sample(n=MAX_SAMPLE_SIZE, random_state=SEED)

    print(f"Length del sample: {len(sample_df)}")

    # Prompt with the title and abstract of the selected nodes (papers).
    user_prompt = _build_user_prompt(sample_df)

    # NOTE(review): the recorded run shows the model answering with prose
    # instead of a bare list. If the installed Ollama version supports
    # constraining the reply format, that would be more reliable than parsing
    # free text - TODO confirm against the Ollama documentation.
    response = ollama.chat(
        model=MODEL_NAME,
        messages=[
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': user_prompt}
        ],
        options={
            'temperature': 0.1     # low, to keep the output close to deterministic
        }
    )

    raw_output = response['message']['content'].strip()
    print("Model output:", raw_output)

    try:
        parsed_ids = _parse_id_list(raw_output)
    except ValueError as e:
        print(f"Error parseando: {e} → output crudo: {raw_output}")
        continue

    # Only IDs that were actually shown to the model can be meaningful; this
    # drops invented IDs and echoes of the example list in SYSTEM_PROMPT.
    sample_ids = {int(pmid) for pmid in sample_df['PMID'] if pd.notna(pmid)}
    ids_to_remove = [paper_id for paper_id in parsed_ids if paper_id in sample_ids]
    unknown_ids = [paper_id for paper_id in parsed_ids if paper_id not in sample_ids]
    if unknown_ids:
        print(f"IDs ignorados (no están en el sample): {unknown_ids}")

    to_remove_by_cluster[cluster_id] = ids_to_remove
    print(f"→ IDs a eliminar: {ids_to_remove}")

# Flatten the per-cluster suggestions into a single list for the summary.
all_removed = [paper_id for cluster_ids in to_remove_by_cluster.values() for paper_id in cluster_ids]
print(f"\nTotal IDs sugeridos para remover: {len(all_removed)}")
print("IDs:", all_removed)

Number of hyperedges: 50

Procesando cluster -1 (2860 papers)
Length del sample: 40
Model output: Based on the provided papers, here's my assessment of which IDs should be removed:

**IDs to Remove:**

1.  **ID: 3277013** – This ID seems highly tangential to the core theme of diabetes and atherosclerosis. It focuses on the *mechanism* of action of the study (mitochondrial GSH depletion) and doesn't directly relate to the clinical question or the overall goal of the research. It's a bit of a diversion.

2.  **ID: 17720018** – This ID seems to be a bit too broad. While it discusses the risk factors for diabetes, it doesn't directly address the *mechanism* of the study – the impact of myocardial infarction on diabetes risk. It's a bit of a rehash of the initial findings.

**Why these are the most appropriate removals:**

*   **Focus & Scope:** The remaining IDs are all more tightly focused on the core research question – investigating the relationship between myocardial infarction and dia