In [None]:
import csv
import ollama
import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

# Model name passed to ollama.generate() for every batch.
MODEL = "mistral:instruct"
# Maximum number of CSV rows read by pipeline().
LIMIT = 30
# Number of articles placed into a single prompt.
BATCH_SIZE = 2
# Worker count for the ThreadPoolExecutor that runs the batches.
THREADS = 4
# Upper bound on the number of tags kept per article.
MAX_TAGS = 10

def extract_json_array(raw):
    """Return the bracketed span of ``raw`` that should hold a JSON array.

    The match is greedy and spans newlines, so it runs from the first ``[``
    to the last ``]`` in the text. When no such span exists the literal
    string ``"[]"`` is returned, so the caller always receives something
    that looks like a JSON array.
    """
    found = re.search(r"\[.*\]", raw, flags=re.DOTALL)
    if found is None:
        return "[]"
    return found.group(0)

def build_batch_prompt(batch):
    """Build the tagging prompt for one batch of articles.

    Args:
        batch: Iterable of ``(article_id, text)`` pairs.

    Returns:
        str: The instruction prompt. It ends with the articles, each
        introduced by ``ARTICLE <article_id>:``, and contains a JSON output
        template with one ``{"article_id": ..., "tags": []}`` entry per
        article in the batch.
    """
    # The batch is walked twice (articles and template), so pin it down
    # in case the caller handed in a one-shot iterator.
    batch = list(batch)

    formatted_articles = "\n\n".join(
        f"ARTICLE {article_id}:\n{text}" for article_id, text in batch
    )

    # The template must show the ids this batch really carries. A fixed
    # 0/1 example makes the model answer with ids that do not belong to
    # the batch (and with the wrong number of entries for batches that
    # are not exactly two articles long).
    template_entries = ",\n".join(
        f'{{\n    "article_id": {json.dumps(article_id)},\n    "tags": []\n}}'
        for article_id, _ in batch
    )
    output_template = f"[\n{template_entries}\n]"

    prompt = f"""
You are a semantic information extractor.

Your job is to create *ONE unified list of tags* for each article.

Each tag must be a *high-level conceptual theme* derived from:
- the meaning of the article,
- the emotions implicitly expressed,
- the type of locations involved.

STRICT RULES:
- NEVER output labels like "Emotion: X" or "Location: X".
- NEVER output standalone emotion words.
- NEVER output standalone location category words.
- ALL emotions must be embedded into conceptual tags.
- ALL locations must be converted into actual location.
- DO NOT use any exact words from the article.
- DO NOT output city names, country names, or real entities seperately
- DO NOT output categories like "emotion", "location", "state", "country".

Example good tags:
[
    "news","opinion","analysis","feature","interview","guide","how-to","tips","trends",
    "United States","United Kingdom","Canada","Australia","India","South Africa",
    "Germany","France","Brazil","Japan","New York","London","Toronto","Sydney",
    "Mumbai","Cape Town","Berlin","Paris","São Paulo","Tokyo","North America",
    "Europe","Southeast Asia","Middle East","Sub-Saharan Africa","Latin America",
    "Silicon Valley","The Caribbean","The Arctic","West Coast","Midwest",
    "inspirational","emotional","uplifting","hopeful","tense","dramatic",
    "heartbreaking","motivating","empowering","humorous","reflective",
    "optimistic","appreciative","nostalgic","shocking","urgent","Google",
    "Apple","Microsoft","Meta","Amazon","Tesla","Samsung","OpenAI","Red Cross",
    "UNICEF","WWF","Amnesty International","Doctors Without Borders","NASA",
    "NHS","CDC","EPA","European Union","United Nations","Harvard University",
    "MIT","Oxford University","Stanford University"
]

OUTPUT ONLY JSON:

{output_template}

ARTICLES:
{formatted_articles}
"""
    return prompt

def process_batch(batch):
    """Tag one batch of articles with the model and normalise its reply.

    Args:
        batch: Sequence of ``(article_id, text)`` pairs.

    Returns:
        list[dict]: One ``{"article_id": ..., "tags": [...]}`` entry per
        article in ``batch``, in batch order. An article keeps an empty tag
        list when the reply is not parseable JSON or contains no usable
        entry for it. Tags are stripped strings, never blank, never
        containing "placeholder", and at most ``MAX_TAGS`` per article.

    Raises:
        Errors raised by ``ollama.generate`` are not caught here and
        propagate to the caller.
    """
    prompt = build_batch_prompt(batch)
    resp = ollama.generate(model=MODEL, prompt=prompt, stream=False)
    raw = resp.get("response", "") or ""

    # Best effort: an unparseable reply yields empty tags for the batch.
    # Only parse failures are swallowed; KeyboardInterrupt and friends
    # are no longer caught.
    try:
        data = json.loads(extract_json_array(raw))
    except (TypeError, ValueError):
        data = []

    fixed = {aid: {"article_id": aid, "tags": []} for aid, _ in batch}
    # Models sometimes quote numeric ids ("3" instead of 3); map the text
    # form back to the id the batch actually uses.
    ids_by_text = {str(aid): aid for aid in fixed}

    for item in data:
        # The reply is model-generated, so every level is validated
        # instead of trusted.
        if not isinstance(item, dict):
            continue

        aid = item.get("article_id")
        if isinstance(aid, str):
            aid = ids_by_text.get(aid.strip())
        elif isinstance(aid, (list, dict)):
            # Unhashable ids would raise on the membership test below.
            continue
        if aid not in fixed:
            continue

        tags = item.get("tags", [])
        if not isinstance(tags, list):
            # A bare string would otherwise be split into one-letter tags.
            continue

        cleaned = [
            t.strip()
            for t in tags
            if isinstance(t, str) and t.strip() and "placeholder" not in t.lower()
        ]
        fixed[aid]["tags"] = cleaned[:MAX_TAGS]

    return list(fixed.values())

def pipeline(csv_path):
    """Read articles from a CSV file and tag them in concurrent batches.

    Args:
        csv_path: Path to a UTF-8 CSV file with an ``articleBody`` column.

    Returns:
        list[dict]: The ``process_batch`` entry for every row that was
        read, in row order. At most ``LIMIT`` rows are read, and the list
        is shorter (possibly empty) when the file holds fewer rows.

    Raises:
        KeyError: If a row has no ``articleBody`` column.
        Any exception raised inside ``process_batch`` is re-raised here by
        ``future.result()``.
    """
    contents = []
    # newline="" is what the csv module requires so that line breaks inside
    # quoted fields are interpreted by the csv reader itself.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for idx, row in enumerate(reader):
            if idx >= LIMIT:
                break
            contents.append((idx, row["articleBody"]))

    batches = [
        contents[start:start + BATCH_SIZE]
        for start in range(0, len(contents), BATCH_SIZE)
    ]

    results = {}
    with ThreadPoolExecutor(max_workers=THREADS) as ex:
        futures = [ex.submit(process_batch, batch) for batch in batches]
        for future in as_completed(futures):
            for item in future.result():
                results[item["article_id"]] = item

    # Index by the rows actually read, not by LIMIT: a file with fewer
    # than LIMIT rows must not fail with KeyError on the missing ids.
    return [results[idx] for idx, _ in contents]

# Run the tagging pipeline on the article dump.
output = pipeline("foxnews_articless.csv")

# Serialise the result first, then write it to disk in one go.
with open("semantic_tags.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(output, indent=4))

print("Saved as semantic_tags.json")
