In [1]:
from pipeline import ClaimifyPipeline
from llm_client import LLMClient
import polars as pl
import json
import os

classification_data_path = os.path.join(os.getcwd(), "../Data-Analysis-Project-28/data_for_git/llm_classification_results_with_ids_10k.jsonl")

articles = []
with open(classification_data_path, "r", encoding="utf-8") as f:
    for line in f:
        articles.append(json.loads(line))

article_data_path = os.path.join(os.getcwd(), "../Data-Analysis-Project-28/data/article_texts_with_metadata.parquet")

article_data = pl.read_parquet(article_data_path)
print(articles[0]["result"])

kategorier = set([article["result"]["huvudkategori"] for article in articles])

kategori = "Svenska strategin"
selected_articles = []
for article in articles:
    if article["result"]["huvudkategori"] == kategori:
        if article["result"].get("personer") is not None:
            if "Anders Tegnell" in article["result"].get("personer", []):
                selected_articles.append(article)

print(len(selected_articles))

combined_data = []
for article in selected_articles:
    combined = {"classification": article["result"], "article": article_data.filter(pl.col("id") == article["id"]).to_dicts()[0]}
    combined_data.append(combined)

print(len(articles))

{'resonemang': 'Artikeln handlar om att Sverige öppnar upp men pandemin fortsätter, med fokus på att befolkningen har byggt upp immunitet genom vaccinationer och att den mer smittsamma delta-varianten sprids. Detta berör både immunitet, smittspridning och den svenska strategin för att leva med viruset.', 'kategorier': ['Immunitet', 'Smittspridning', 'Svenska strategin'], 'huvudkategori': 'Immunitet', 'källor': None, 'personer': ['Therese Bergstedt'], 'länder': ['Sverige']}
204
9986


In [None]:
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from pipeline import UnifiedSentenceProgress

os.environ["OPENAI_API_KEY"] = ""
model = "openai/gpt-oss-120b"
llm_client = LLMClient(model)
question = """
Vilka påståenden nämns om Anders Tegnell?
"""

pipeline = ClaimifyPipeline(llm_client, question=question)
results = {}
result_lock = threading.Lock()

# One shared progress bar across all articles + all sentence threads.
# Total starts unknown (0) and grows as sentence-splitting discovers work.
sentence_progress = UnifiedSentenceProgress(total=0, desc="Processing sentences", unit="sent")

save_path = os.path.join(os.getcwd(), "../Data-Analysis-Project-28/data_for_git/extracted_svenska_modellen_10k.json")
# save results

def run_pipeline(sample, result_lock, sentence_progress):
    article = sample["article"]
    prompt = article["title"] + "\n" + article["actual_lead_text"] + "\n" + article["body_text"]

    result = pipeline.run(prompt, progress=sentence_progress)

    # Thread-safe: store result and immediately save to file
    with result_lock:
        results[article["id"]] = result
        # Save incrementally to avoid losing work if process hangs
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False)


MAX_WORKERS = 250
futures = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    for sample in combined_data:
        futures.append(executor.submit(run_pipeline, sample, result_lock, sentence_progress))

    for future in as_completed(futures):
        future.result()

sentence_progress.close()

# Results are already saved incrementally as they come in

# save result to json
notebook_dir = os.getcwd()
dir_path = os.path.join(notebook_dir, "claims_results")
print(dir_path)
if not os.path.exists(dir_path):
    os.makedirs(dir_path)
# find the current highest run number
if not os.path.exists(os.path.join(dir_path, "current_run.json")):
    with open(os.path.join(dir_path, "current_run.json"), "w") as f:
        json.dump({model: 0}, f)
with open(os.path.join(dir_path, "current_run.json"), "r") as f:
    runs_dict = json.load(f)
    if runs_dict.get(model) is None:
        run_number = 0
        runs_dict[model] = 0
    else:
        run_number = runs_dict[model]



file_path = os.path.join(dir_path, f"{model.split('/')[-1]}_{run_number}.json")
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False)

runs_dict[model] = run_number + 1
with open(os.path.join(dir_path, "current_run.json"), "w") as f:
    json.dump(runs_dict, f)


Processing sentences: 0sent [00:00, ?sent/s]

Error validating response, retrying: 1 validation error for UrvalsSvar
  Invalid JSON: EOF while parsing an object at line 3494 column 0 [type=json_invalid, input_value='{"språk":"sv","mening":...\n\n\n\n\n \n\n \n\n \n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid
Error validating response, retrying: 1 validation error for UrvalsSvar
  Invalid JSON: EOF while parsing an object at line 3655 column 0 [type=json_invalid, input_value='{"språk":"sv","mening":...n\n\n\n\n\n\n\n\n\n\n\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid
Error validating response, retrying: 1 validation error for UrvalsSvar
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```jsoncjsonc5e9d71ff-7b...  \n  \n  \n  \n  \n  }', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid
Error validating response, retrying: 1 validation error

In [3]:
print(len(results))
for id, result in results.items():
    print(id)
    print(len(result))



50
1480959720953417730
0
1487364520829980678
2
1403788063126560772
1
1386636992944001028
7
1324492047340965888
57
1272742065621938181
0
1484408352411168770
0
1254473741523976202
0
1242099449289211905
3
1301509054527930369
0
1267733644447227904
1
1456895101003833346
5
1233374419818549248
40
1268767808827121664
5
1300407285324427265
1
1266366721146761217
0
1514172405001953287
2
1354094918793392128
0
1443212502104432645
31
1234487970226491392
0
1318514843394125832
1
1293066845629698049
0
1267862142960766977
5
1265660044823183360
0
1493625412047495169
1
1286984970469343232
0
1370348240709767168
17
1287984006588444673
4
1453705550424911882
0
1276174247656816646
89
1282379639185383424
1
1265388057596964869
14
1241797228722827265
0
1343566388489809922
18
1265237706084438016
17
1363900933030240267
6
1277969939454779395
0
1338880629191168001
1
1330620701208932358
13
1269705932528930816
4
1255463868727537670
22
1242062455066542081
0
1333757913433780231
178
1250359103186182144
84
1294151527402287

In [4]:
save_path = os.path.join(os.getcwd(), "../Data-Analysis-Project-28/data_for_git/extracted_svenska_modellen.json")
# save results
with open(save_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False)

In [5]:
load_path = os.path.join(os.getcwd(), "../Data-Analysis-Project-28/data_for_git/extracted_svenska_modellen.json")
# load results
with open(load_path, "r", encoding="utf-8") as f:
    results = json.load(f)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI


Client = OpenAI(base_url="http://localhost:4000/v1", api_key="EMPTY")
def get_embeddings(prompts):
    response = Client.embeddings.create(
        model="Qwen/Qwen3-Embedding-8B",
        input=prompts
    )
    return [item.embedding for item in response.data]

print("Embedding claims...")
all_article_claims = []
for id, text_claims in results.items():
    if len(text_claims) == 0:
        continue
    article_claims = []
    embeddings = get_embeddings(text_claims)
    for claim, embedding in zip(text_claims, embeddings):
        article_claims.append({"claim": claim, "id": id, "embedding": embedding})
    all_article_claims.append(article_claims)
print("Done embedding claims")

Embedding claims...
Done embedding claims


In [7]:
print(len(all_article_claims[0]))

2


In [8]:
all_articles_embedding_groups = []
for article_claims in all_article_claims:
    embedding_groups = []
    for claim in article_claims:
        embedding = claim["embedding"]
        if len(embedding_groups) == 0:
            embedding_groups.append({"representative": claim, "members": [claim]})
        else:
            most_similar_group = max(embedding_groups, key=lambda group: cosine_similarity([embedding], [group["representative"]["embedding"]])[0])
            if most_similar_group is not None and cosine_similarity([embedding], [most_similar_group["representative"]["embedding"]])[0] > 0.65:
                most_similar_group["members"].append(claim)
            else:
                embedding_groups.append({"representative": claim, "members": [claim]})
    all_articles_embedding_groups.append(embedding_groups)
final_claims = []
for article in all_articles_embedding_groups:
    for article_embedding_group in article:
        representative = article_embedding_group["representative"]
        representative["count"] = len(article_embedding_group["members"])
        final_claims.append(representative)


In [9]:
for i in range(len(all_articles_embedding_groups)):
    print(len(all_article_claims[i]))
    print(len(all_articles_embedding_groups[i]))
    print()


2
1

1
1

7
4

57
14

3
1

1
1

5
3

40
24

5
3

1
1

2
1

31
10

1
1

5
2

1
1

17
6

4
4

89
27

1
1

14
9

18
17

17
8

6
3

1
1

13
11

4
3

22
8

178
51

84
30

129
37

1
1

72
24

3
1

59
14



In [10]:
for article in all_articles_embedding_groups:
    article.sort(key=lambda x: len(x["members"]), reverse=True)
    for article_embedding_group in article:
        # print(article_embedding_group["representative"]["claim"])
        print(len(article_embedding_group["members"]))
        for member in article_embedding_group["members"]:
            print(member["claim"])
        print()


2
Inga påståenden om Anders Tegnell finns i utdraget.
Ingen av de medföljande meningarna nämner Anders Tegnell alls.

1
Inga påståenden om Anders Tegnell finns i det givna utdraget.

3
Inga påståenden om Anders Tegnell förekommer i utdraget.
Ingen av de citerade meningarna nämner Anders Tegnell – alla uttalanden i utdraget tillskrivs Stefan Löfven eller KU.
Ingen av de citerade påståendena i texten berör Anders Tegnell – han nämns inte alls i utdraget.

2
Vi ser inget tydligt mönster mellan lockdown och succé.
Det enda påståendet som förekommer i utdraget om Anders Tegnell är att han har sagt: “Vi ser inget tydligt mönster mellan lockdown och succé.”

1
False

1
Kort sagt: Anders Tegnell framställs i utdraget som den som betonar myndigheternas mandat, påpekar att brist på resurser hindrade tidig smittspårning, avvisar idén om flockimmunitet och menar att det vore fel att ignorera Folkhälsomyndighetens expertis.

9
När Tüll gick i pension var det Anders Tegnell som fick hans jobb, om än

In [63]:
data_result_path = os.path.join(os.getcwd(), "../Data-Analysis-Project-28/data/extracted_claims.json")
with open(data_result_path, "w", encoding="utf-8") as f:
    json.dump(final_claims, f, ensure_ascii=False)

In [12]:
import faiss
import numpy as np
search_claim = final_claims[303]
search_term = "Folkhälsomyndigheten var för passiv"
search_claim = {"claim": search_term, "embedding": get_embeddings([search_term])[0]}

embedding_index = faiss.IndexFlatL2(len(claim["embedding"]))
for claim in final_claims:
    embedding_index.add(np.array(claim["embedding"]).reshape(1, -1))


_, I = embedding_index.search(np.array(search_claim["embedding"]).reshape(1, -1), 10)

print("Original claim: ", search_claim["claim"])
for i in I[0][1:]:
    print(i)
    print(final_claims[i]["claim"])






Original claim:  Folkhälsomyndigheten var för passiv
294
Expertmyndighet på smittspridning
224
Folkhälsomyndigheten använder osäkra prognoser enligt kritiken.
218
Anders Tegnell är statsepidemiolog
230
Folkhälsomyndigheten har en genomarbetad strategi
252
Folkhälsomyndigheten har lämnat ut en del av mejlen men flera av de begärda saknades.
180
Uppmanar till fortsatt följsamhet av myndighetens råd
26
Risk för fullskalig pandemi
96
Allvarliga fall minskar i Sverige
184
Berömmer myndighetens personal för hårt arbete
