In [2]:
from transformers import pipeline

In [3]:
# Instantiate the zero-shot-classification pipeline with the
# facebook/bart-large-mnli model; classify_query below calls this object.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Device set to use cuda:0


In [None]:
# Candidate categories that each query is scored against.
labels = [
    "finance",
    "politics",
    "culture",
]

def classify_query(query, candidate_labels=None):
    """Classify ``query`` with the zero-shot ``classifier`` and summarize the result.

    Args:
        query: Text to classify (presumably a single string; the result is
            indexed as one dict, so a batch of queries is not supported here).
        candidate_labels: Optional iterable of label names to score against.
            When omitted or ``None``, the module-level ``labels`` list is
            used, which is the original behavior.

    Returns:
        dict with three keys:
            "query":     the input text, unchanged.
            "top_label": the first entry of the classifier's ``"labels"`` list
                         (the pipeline is expected to return labels ranked by
                         score -- see the transformers documentation).
            "scores":    mapping of each returned label to its score.
    """
    # Fall back to the notebook-wide label set unless the caller overrides it.
    chosen_labels = labels if candidate_labels is None else list(candidate_labels)
    result = classifier(query, candidate_labels=chosen_labels)
    return {
        "query": query,
        "top_label": result["labels"][0],
        "scores": dict(zip(result["labels"], result["scores"]))
    }

# Example: classify one sample question and print the resulting dict
example_query = "What's the Fed doing about inflation?"
print(classify_query(example_query))

{'query': "What's the Fed doing about inflation?", 'top_label': 'finance', 'scores': {'finance': 0.900808572769165, 'politics': 0.049987439066171646, 'culture': 0.049204062670469284}}


In [None]:
import json
import random
import re

# CONFIG
# Input articles file -- save your JSON above under this name.
INPUT_FILE = "finance_news.json"
# Output file for the generated pairs.
OUTPUT_FILE = "finance_pairs.jsonl"
# Number of pairs to generate (one per article); adjust as needed.
NUM_PAIRS = 10

# Very simple "query synthesizer" from titles
def make_query_from_title(title: str, rng=None) -> str:
    """Turn an article title into a synthetic user query.

    The title is stripped of surrounding whitespace and of any trailing run of
    sentence punctuation (``.``, ``!``, ``?``) together with whitespace mixed
    into that run, then dropped into one of five fixed templates chosen at
    random. Four templates use the lower-cased title; "Can you summarize"
    keeps the original casing.

    Args:
        title: The article title.
        rng: Optional ``random.Random``-like object exposing ``choice``. Pass
            a seeded instance for reproducible output. When ``None`` the
            module-level ``random`` generator is used, as before.

    Returns:
        The generated query string.
    """
    # Also consume whitespace inside the trailing run so "Title !" does not
    # leave a dangling space before the template's own punctuation.
    title = re.sub(r"[\s\.\!\?]+$", "", title.strip())
    patterns = [
        f"What happened regarding {title.lower()}?",
        f"Can you summarize: {title}?",
        f"Explain the news about {title.lower()}",
        f"Give me an update on {title.lower()}",
        f"Details on {title.lower()}?"
    ]
    chooser = random if rng is None else rng
    return chooser.choice(patterns)

def main():
    """Generate (query, positive) pairs from the articles file as JSON Lines.

    Loads the JSON document at ``INPUT_FILE``, shuffles the articles in place,
    and walks them in shuffled order until ``NUM_PAIRS`` pairs are collected.
    Each pair holds a query synthesized from the title by
    ``make_query_from_title`` and the stripped summary as the positive.
    Pairs are written to ``OUTPUT_FILE``, one JSON object per line, and a
    summary line is printed.

    Articles that are not dicts, or whose ``title`` / ``summary`` is missing,
    not a string, or blank, are skipped instead of aborting the run or
    emitting an empty pair. For fully well-formed input the output is the
    same as taking the first ``NUM_PAIRS`` shuffled articles.

    Raises:
        ValueError: if the top-level JSON value is not a list.
    """
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        articles = json.load(f)

    if not isinstance(articles, list):
        raise ValueError(
            f"{INPUT_FILE} must contain a JSON list of articles, "
            f"got {type(articles).__name__}"
        )

    random.shuffle(articles)
    out = []
    skipped = 0

    for art in articles:
        if len(out) >= NUM_PAIRS:
            break

        title = art.get("title") if isinstance(art, dict) else None
        summary = art.get("summary") if isinstance(art, dict) else None
        usable = (
            isinstance(title, str) and title.strip()
            and isinstance(summary, str) and summary.strip()
        )
        if not usable:
            skipped += 1
            continue

        q = make_query_from_title(title)
        out.append({"query": q, "positive": summary.strip()})

    # Collect everything first, then write, so a failure while building pairs
    # does not leave a truncated output file behind.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for ex in out:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    if skipped:
        print(f"Skipped {skipped} article(s) with missing or empty title/summary")
    print(f"Wrote {len(out)} pairs to {OUTPUT_FILE}")

# Run the pair-generation step when this code executes as the main module.
if __name__ == "__main__":
    main()
