Imports:

In [1]:
import os
import time
import json
import csv

import numpy as np
import pandas as pd
import torch
import random
import re

from dotenv import load_dotenv
from openai import OpenAI, AzureOpenAI
from openai._exceptions import OpenAIError, RateLimitError
from tqdm import tqdm

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

import faiss

from sentence_transformers import SentenceTransformer, util

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, as_completed

from collections import Counter
from torch.nn import CrossEntropyLoss

  from .autonotebook import tqdm as notebook_tqdm


Create dataFrame for the question assessment

In [3]:
# Specify the path to the text file
file_path = '2024-question-assessment.txt'

array = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.split("\t")
        if (line[5] == "4" or line[5] == "3" or line[5] == "1"):
            array.append(line)
            
dataFrame = pd.DataFrame(array)
dataFrame.columns = ["AID", "Article", "RID", "Model", "Order", "Assessment", "Useless", "Question"]
dataFrame.drop(columns=["AID", "Model", "RID", "Useless", "Order"], inplace=True)

dataFrame.head()

Unnamed: 0,Article,Assessment,Question
0,clueweb22-en0030-87-05450,1,"Who is Helen Russell, the author of the articl..."
1,clueweb22-en0030-87-05450,1,Has the show John Dillermand received support ...
2,clueweb22-en0030-87-05450,1,What is the professional credentials of Christ...
3,clueweb22-en0030-87-05450,1,"Who is Erla Heinesen Højsted, cited in the art..."
4,clueweb22-en0030-87-05450,1,Does The Guardian clearly described how the sh...


Prompt for labeling our data into categories, outputs as an csv file named output.csv

In [None]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Adjust with your data loading method
qArray = [row[7] for row in array]

categories = [
    "Author Bias",
    "Claim Verification",
    "Source Background & Credibility",
    "Publication Reputation",
    "Public Reaction",
    "Bad Question"
]

def format_prompt(qs):
    header = (
        f"Classify each of the following questions into one of these categories per line: {', '.join(categories)}.\n\n"
        "These are the criteria for a bad question:\n"
        "- The question assumes prior knowledge of the article or refers vaguely to \"the claim,\" \"the event,\" or \"this person\" without explanation.\n"
        "- If the question contains \"in the article\" it is a bad question.\n"
        "- The question can’t be answered from a single source or needs synthesis from multiple unrelated documents.\n"
        "- The question contains multiple parts, asks more than one thing at once, or is vague in its focus.\n"
        "- The question asks for interpretations, motivations, or hypothetical outcomes instead of concrete, verifiable facts."
        '\nReturn the result as plain CSV with no extra text.'
        ' Format like this:\n\n'
        '"Example question here?","Author Bias"\n'
    )
    body = "\n".join([f"\"{q.strip()}\"" for q in qs])
    return header + body

def process_batch(index, chunk, retries=3):
    prompt = format_prompt(chunk)
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=4096,
            )
            return index, response.choices[0].message.content.strip().splitlines()
        except (OpenAIError, RateLimitError) as e:
            wait = 2 ** attempt
            time.sleep(wait)
    return index, []

batch_size = 50
batches = [(i, qArray[i:i + batch_size]) for i in range(0, len(qArray), batch_size)]

results = {}
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_batch, i, chunk) for i, chunk in batches]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        idx, lines = future.result()
        if lines:
            results[idx] = lines
        else:
            print(f"Batch {idx // batch_size} failed.")

# Sort results by original batch order
ordered = []
for i in sorted(results):
    ordered.extend(results[i])

print(f"✅ Total processed: {len(ordered)}")

# Save to CSV
with open("output.csv", "w", encoding="utf-8") as f:
    f.write('"question","category"\n')
    for line in ordered:
        if "," in line:
            f.write(line.strip() + "\n")

Processing:  47%|████▋     | 53/113 [02:51<04:12,  4.20s/it]

Batch 53 failed.


Processing:  85%|████████▍ | 96/113 [05:44<01:03,  3.75s/it]

Batch 95 failed.


Processing:  89%|████████▉ | 101/113 [06:04<00:47,  3.96s/it]

Batch 100 failed.


Processing: 100%|██████████| 113/113 [06:49<00:00,  3.62s/it]

✅ Total processed: 5698





Split Data into train/test split

In [33]:
def splitData(Question, Category):
    Q_train, Q_test, c_train, c_test = train_test_split(Question, Category, test_size=0.3, random_state=42)
    return Q_train, Q_test, c_train, c_test

Load all data into train_test_split and add it to a BERT readable dataset

In [49]:
df1 = pd.read_csv("output.csv", header=None, names=["Question", "Category"])

col1 = df1.iloc[:, 0].to_numpy()
col2 = df1.iloc[:, 1].to_numpy()

Q_train, Q_test, c_train, c_test = splitData(col1, col2)

# Load tokenizer and model with built-in classification head
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)


le = LabelEncoder()

class TextDataset(Dataset):
    def __init__(self, questions, categories):
        self.encodings = tokenizer(questions, truncation=True, padding=True)
        self.labels = torch.tensor(le.fit_transform(categories))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Prepare dataset
dataset = TextDataset(list(Q_train), list(c_train))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Count label frequencies
label_counts = Counter([dataset[i]["labels"].item() for i in range(len(dataset))])

# Make sure the order matches label encoding
num_labels = len(label_counts)
total = sum(label_counts.values())
weights = [total / (num_labels * label_counts[i]) for i in range(num_labels)]
class_weights = torch.tensor(weights, dtype=torch.float)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_fn = CrossEntropyLoss(weight=class_weights.to(device))

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


Defines hyperparameters for the model and trains it.

In [None]:
# Define training arguments
training_args = TrainingArguments(
    num_train_epochs=3,
    auto_find_batch_size = True,
    save_strategy="no",
    output_dir="fine_tuned_model"
)

# Set up Trainer API to handle training loop
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()


Train BERT using KFOLD validation

In [None]:
# Load data
df1 = pd.read_csv("output.csv", header=None, names=["Question", "Category"])
col1 = df1.iloc[:, 0].to_numpy()
col2 = df1.iloc[:, 1].to_numpy()
# Initialize label encoder and tokenizer
le = LabelEncoder()
le.fit(col2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Change to your model if needed
# Custom Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
        self.labels = torch.tensor(le.transform(labels))
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item
# Cross-validation setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []
for fold, (train_indices, val_indices) in enumerate(skf.split(col1, col2)):
    print(f"\nTraining Fold {fold+1}/5")
    # Convert from NumPy array to list[str]
    train_texts = col1[train_indices].tolist()
    train_labels = col2[train_indices].tolist()
    val_texts = col1[val_indices].tolist()
    val_labels = col2[val_indices].tolist()
    # Prepare datasets
    train_dataset = TextDataset(train_texts, train_labels)
    val_dataset = TextDataset(val_texts, val_labels)
    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)
    # Reinitialize model per fold
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(le.classes_))
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Training loop
    model.train()
    for epoch in range(3):  # Adjust as needed
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
    # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels_encoded = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted.cpu().numpy())
            val_labels_encoded.extend(labels.cpu().numpy())
    # Decode labels
    predicted_labels = le.inverse_transform(val_predictions)
    real_labels = le.inverse_transform(val_labels_encoded)
    # Report accuracy
    accuracy = accuracy_score(val_labels_encoded, val_predictions)
    fold_accuracies.append(accuracy)
    print(f"Fold {fold+1} Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(real_labels, predicted_labels))
# Average score
average_accuracy = np.mean(fold_accuracies)
print(f"\nAverage Cross-Validation Accuracy: {average_accuracy:.4f}")

Load trained model from disk

In [55]:
model = BertForSequenceClassification.from_pretrained("fine_tuned_model")

Function for making predictions with BERT

In [36]:
def predict(Test_Questions: list, batch_size: int):
    predictions = []
    for i in tqdm(range(0, len(Test_Questions), batch_size), desc="Running inference"):
        batch = Test_Questions[i:i+batch_size]
        test_encodings = tokenizer(batch, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**test_encodings)
            batch_preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(batch_preds)

    # Convert predicted tensor to list of integers
    predicted_labels = [label.item() for label in predictions]

    predicted_labels = le.inverse_transform(predicted_labels)

    # Print results
    for i in range(len(Test_Questions)):
        print(Test_Questions[i] + "----->" + predicted_labels[i])
    return predictions

In [37]:
def data_report(predictions: list, actual_labels: list):
    # Convert predicted tensor to list of integers
    predicted_labels = [label.item() for label in predictions]

    predicted_labels = le.inverse_transform(predicted_labels)
    # Accuracy
    print("Accuracy:", accuracy_score(actual_labels, predicted_labels))

    # Classification report
    print("\nClassification Report:")
    print(classification_report(actual_labels, predicted_labels))

    # Confusion matrix
    cm = confusion_matrix(actual_labels, predicted_labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=le.classes_,
                yticklabels=le.classes_)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

Prints out a classifcation report for the test data output

In [None]:
Q_test_list = Q_test.astype(str).tolist()
predictions = predict(Q_test_list, 64)

c_test_list = c_test.astype(str).tolist()
data_report(predictions, c_test_list)

Running inference: 100%|██████████| 1/1 [00:00<00:00, 23.23it/s]

Is the sky blue?----->Claim Verification





Creates a Pandas dataFrame from the clueweb database

In [13]:
# Specify the path to the text file
file_path = 'trec-2024-lateral-reading-task2-baseline-documents.jsonl'


with open(file_path, 'r', encoding="utf8") as f:
    data = [json.loads(line) for line in f]

clue_df = pd.DataFrame(data)

print(len(clue_df))
clue_df.drop(columns=["URL-hash", "Language"], inplace=True)
clue_df.head()

38131


Unnamed: 0,URL,ClueWeb22-ID,Clean-Text
0,https://www.dailymail.co.uk/news/article-10130...,clueweb22-en0032-91-05114,"Japan's Princess Mako marries commoner, loses ..."
1,https://www.nytimes.com/2021/08/26/opinion/afg...,clueweb22-en0027-70-17775,Opinion | Let’s Not Pretend That the Way We Wi...
2,https://www.politicshome.com/thehouse/article/...,clueweb22-en0032-18-01494,No longer can China’s atrocities against the U...
3,https://medicaladvise.org/clinical-trials-rese...,clueweb22-en0012-53-13803,How does molnupiravir work? - Medical Advise\n...
4,https://www.euronews.com/2021/12/10/mexico-tru...,clueweb22-en0002-69-11564,Mexico truck crash: Dozens killed after lorry ...


Create FAISS index and save it to disk

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Good balance of speed/accuracy

# Your text database
texts = clue_df["Clean-Text"].tolist()

# Create embeddings
embeddings = model.encode(texts,batch_size=32, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

# Initialize FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Use IndexFlatIP with normalized vectors for cosine similarity

np.save("embedding_matrix.npy", embeddings)
# Add embeddings to index
index.add(embeddings)
faiss.write_index(index, "semantic_index.faiss")

Load FAISS index

In [15]:
index = faiss.read_index("semantic_index.faiss")

In [None]:
vector = index.reconstruct(20)
query = "Newsweek Gets DESTROYED For Fearmongering on Kids & Vaccine"
query_embedding = model.encode(query, convert_to_numpy=True, normalize_embeddings=True).astype('float32')
query_embedding = np.expand_dims(query_embedding, axis=0)
scores, indices = index.search(query_embedding, 5)

count = 0

with open("trec-2024-lateral-reading-task2-baseline-documents.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if count == 19000:
            data = json.loads(line)
            print(data)  # do whatever with one object at a time
            break
        count += 1


{'URL': 'https://poorrichardsnews.com/ana-navarro-is-a-moron/\n', 'URL-hash': 'D06D4FDF2BF042D631B9C28D2A57C4D8', 'Language': 'en', 'ClueWeb22-ID': 'clueweb22-en0000-61-05623', 'Clean-Text': 'Ana Navarro is a moron – PoorRichardsNews.com\nAna Navarro is a moron\nHome /Illegal Aliens, International/Ana Navarro is a moron\nIllegal Aliens,International|\nOctober 23, 2019\nNavarro denies Venezuela’s Maduro is a socialist\nOn the view Ana Navarro cannot control her mouth.\nThe View remains such an awful example example of how moronic Hollywood and commentators are who don’t know the facts.\n“Maduro is NOT a socialist, he is a corrupt Murderer… blah, blah blah….”\nAnna Navarro files her nails on TV\nNavarro files her nails during border wall debate\nWatch on\n“Hypocrisy needs to be called out in American Politics, and the absurd has reached the point that it is insufferable”\n“Democrat Congressman Ted Lieu and CNN’s resident RINO Ana Navarro both fell for a hoax tweet posted by leftist Time 

Query the index

In [17]:
# Data prep
texts = clue_df["Clean-Text"].tolist()
IDs = clue_df["ClueWeb22-ID"].tolist()

# Query
query = "How does the glue gunner work in BloonsTD6?"
query_tag = "clueweb22-en0030-87-05450"
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode query
query_embedding = model.encode(query, convert_to_numpy=True, normalize_embeddings=True).astype('float32')
query_embedding = np.expand_dims(query_embedding, axis=0)  # shape (1, dim)

# Load FAISS index (assumed to be cosine similarity = inner product of normalized vectors)
index = faiss.read_index("semantic_index.faiss")

# Search
k = 10  # search more than 5 in case we exclude the query itself
scores, indices = index.search(query_embedding, k)

# Filter out the query itself and build results
results = []
for score, idx in zip(scores[0], indices[0]):
    if IDs[idx] == query_tag:
        continue  # skip the query origin

    text = texts[idx]
    snippet = text[:200].replace('\n', ' ')
    if len(text) > 200:
        snippet += '...'

    results.append({
        "rank": len(results) + 1,
        "score": round(float(score), 4),
        "snippet": snippet,
        "id": IDs[idx],
    })

    if len(results) == 5:
        break  # stop once we have top 5 (excluding query)

# Print results
for r in results:
    print(f"Rank: {r['rank']}, Score: {r['score']}, ID: {r['id']}, Snippet: {r['snippet']}")

Rank: 1, Score: 0.7362, ID: clueweb22-en0016-63-04220, Snippet: Super Glue | Bloons Wiki | Fandom Bloons Wiki 4,424 pages Explore Bloons Games Game Mechanics Other Pages Wiki-Related Info in: Glue Gunner, Bloons TD 6, Upgrades, and 5 more Super Glue Edit BTD6 BTDB...
Rank: 2, Score: 0.4847, ID: clueweb22-en0022-56-01572, Snippet: Shattering Shells | Bloons Wiki | Fandom Bloons Wiki 4,428 pages Explore Bloons Games Game Mechanics Other Pages Wiki-Related Info in: Articles in construction with a WIP, Mortar Monkey, Bloons TD 6, ...
Rank: 3, Score: 0.4604, ID: clueweb22-en0002-59-07035, Snippet: Sticky Bomb | Bloons Wiki | Fandom Bloons Wiki 4,428 pages Explore Bloons Games Game Mechanics Other Pages Wiki-Related Info in: Ninja Monkey, Bloons TD 6, Upgrades, and 4 more Sticky Bomb Edit BTD6 B...
Rank: 4, Score: 0.4592, ID: clueweb22-en0011-30-13287, Snippet: Fortified Bloon | Bloons Wiki | Fandom Bloons Wiki 4,434 Explore Bloons Games Game Mechanics Other Pages Wiki-Related Info in: Bloon

Load in marco database in batches into dataframe and save to disk

In [9]:
# Specify the path to the text file
file_path = 'msmarco_v2.1_doc_00.json'


with open(file_path, 'r', encoding="utf8") as f:
    data = [json.loads(line) for line in f]

ms_df = pd.DataFrame(data)
ms_df.drop(columns=["url", "title", "headings"], inplace=True)
print(len(ms_df))
ms_df.head()

193732


Unnamed: 0,body,docid
0,0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile T...,msmarco_v2.1_doc_00_0
1,Ethel Percy Andrus Gerontology Center [WorldCa...,msmarco_v2.1_doc_00_4810
2,Museo Nacional de Bellas Artes (Cuba) [WorldCa...,msmarco_v2.1_doc_00_18906
3,File extension 00000 is used by operating syst...,msmarco_v2.1_doc_00_32860
4,Open 00001 File\n\nOpen 00001 File\nTo open 00...,msmarco_v2.1_doc_00_37424


Code for vectorizing marcoDB

In [None]:
json_input_dir = ""
output_dir = "output/"
model = SentenceTransformer('all-MiniLM-L6-v2')


# Specify the path to the text file
for entry in os.scandir(json_input_dir):
    if entry.is_file():  # check if it's a file
        print("Starting: ", entry.name)
        with open(entry, 'r', encoding="utf8") as f:
            data = [json.loads(line) for line in f]
        marco_df = pd.DataFrame(data)
        marco_df.drop(columns=["url", "title", "headings"], inplace=True)


        # Your text database
        texts = marco_df["body"].tolist()

        # Create embeddings
        embeddings = model.encode(texts,batch_size=200, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

        # Initialize FAISS index
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatIP(dimension)  # Use IndexFlatIP with normalized vectors for cosine similarity

        # Add embeddings to index
        index.add(embeddings)
        faiss.write_index(index, "semantic_index_" + entry.name[17] + entry.name[18] + ".faiss")
        data.clear()

Simple Question Generation

In [2]:
file_path = 'trec-2024-lateral-reading-task2-baseline-documents.jsonl'
with open(file_path, 'r', encoding="utf8") as f:
    data = [json.loads(line) for line in f]

clueweb_df = pd.DataFrame(data)
clueweb_df.drop(columns=["URL-hash", "Language"], inplace=True)

# Azure OpenAI setup
endpoint = "https://at23s-mb8dlh6z-eastus2.cognitiveservices.azure.com/"
model_name = "gpt-4.1"
deployment = "gpt-4.1"
subscription_key = "650DCTqhZvPkvgkoUBIzKQogLDUINt8xfEnP0PIC0Y0mqWYryxBDJQQJ99BEACHYHv6XJ3w3AAAAACOGYEve"
api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# Prompt setup
prompt1 = (
    "You are a professional fact-checker and media literacy expert. Your task is to generate exactly 10 concise and actionable questions "
    "that a thoughtful reader should ask when evaluating the trustworthiness of a news article.\n"
    "Each question should:\n"
    "- Be under 300 characters.\n"
    "- Address only one idea (no compound or multi-part questions).\n"
    "- Require external research (not answerable directly from the article).\n"
    "- Be specific to the article’s content.\n"
    "- Be clear and actionable, avoiding vagueness or academic jargon.\n"
    "Types of Questions to Include:\n"
    "1. Source History: e.g., What is [source name]'s history of reporting on this issue?\n"
    "2. Source Bias: e.g., What are the known biases of [source name]?\n"
    "3. Expert Credentials: e.g., What is [expert name]’s academic or professional background?\n"
    "4. Evidence Scrutiny: e.g., Are there independent studies supporting the claim about X?\n"
    "5. Perspective Balance: e.g., Are key opposing viewpoints on [issue] missing?\n"
    "6. Contextualization: e.g., Has this event occurred before, and how was it covered then?\n"
    "Examples of Ideal Output:\n"
    "- What is DR's history in producing children's television shows?\n"
    "- What is Erla Heinesen Højsted's professional background in children's psychology?\n"
    "- Is Christian Groes' view on patriarchal societies representative of the academic consensus?\n"
    "- Academic studies on using humor to teach children responsibility and accountability?\n"
    "- Male genitalia in kids' TV: aligned with international children's health recommendations?\n"
    "- Is filtering and diluting water enough to make it safe for ocean discharge?\n"
    "Final Instruction: Only return the 10 questions with no commentary. Each question must be under 300 characters, focused, "
    "and answerable through research outside the article."
)

# Prepare data
article_ids = clueweb_df["ClueWeb22-ID"].tolist()
article_texts = clueweb_df["Clean-Text"].tolist()
if len(article_ids) != len(article_texts):
    raise ValueError("article_ids and article_texts must be the same length")

max_articles = 2000
sample_indices = random.sample(range(len(article_ids)), min(len(article_ids), max_articles))
output_csv_path = "CSVs/article_questions.csv"

# Clean question utility
def clean_question(text):
    return re.sub(r"^\s*(\d{1,2}[.)]|[-*])\s*", "", text).strip()

max_article_chars = 30000  # Maximum characters per article

# Worker function for a single article
def process_article(idx):
    article_id = article_ids[idx]
    article_text = article_texts[idx]

    if len(article_text) > max_article_chars:
        return []  # Skip articles that are too long

    messages = [
        {"role": "system", "content": "You are a professional fact-checker and media literacy expert."},
        {"role": "user", "content": f"{prompt1}\n\nArticle:\n{article_text}\n\nGenerate the 10 questions now."}
    ]

    try:
        response = client.chat.completions.create(
            model=deployment,
            messages=messages,
            max_tokens=800,
            temperature=0.7,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )

        questions_text = response.choices[0].message.content.strip()
        lines = [line.strip() for line in questions_text.split('\n') if line.strip()]
        question_list = [clean_question(line) for line in lines]
        return [(article_id, q) for q in question_list[:10]]

    except Exception as e:
        return []


# Run multithreaded processing
results = []
with ThreadPoolExecutor(max_workers=2) as executor:  # Adjust `max_workers` as needed
    futures = [executor.submit(process_article, idx) for idx in sample_indices]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing articles"):
        result = future.result()
        if result:
            results.extend(result)

# Save to CSV
with open(output_csv_path, mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(results)

print(f"Saved questions for {len(results)//10} articles to {output_csv_path}")

Processing articles: 100%|██████████| 2000/2000 [1:31:47<00:00,  2.75s/it]  

Saved questions for 1444 articles to CSVs/article_questions.csv





Bad Question Filtering and Question Labeling

In [None]:
questions = pd.read_csv("CSVs/article_questions.csv", header=None)
questions = questions[[1]].values.tolist()

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Adjust with your data loading method

def format_prompt(qs):
    header = (
        f"""
            You will review a list of questions and classify each one based on the rules below.

            A question is a "Bad Question" if it violates **any** of the following:
            - It assumes prior knowledge ("the claim", "the author", etc.) without clarity.
            - It refers vaguely to the text or says "in the article" .
            - It asks for multiple things at once (compound question).
            - It is vague, broad, or asks for interpretation or hypotheticals.
            - It cannot be answered using a single source/document.
            - It includes multiple subjects or multiple objects.

            If a question violates **any** of these rules, label it as "Bad Question".
            Otherwise, label it as "OK".

            Return plain CSV only. Format:
            "Question here","Bad Question" or "OK"

            Example:
            "What is the author's intent and how do readers feel about it?","Bad Question"
            "What is the background of journalist Jane Doe?","OK"
            """ 
        )

    body = "\n".join([f"\"{q[0].strip()}\"" for q in qs])
    return header + body

def process_batch(index, chunk, retries=3):
    prompt = format_prompt(chunk)
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=4096,
            )
            return index, response.choices[0].message.content.strip().splitlines()
        except (OpenAIError, RateLimitError) as e:
            wait = 2 ** attempt
            time.sleep(wait)
    return index, []

batch_size = 50
batches = [(i, questions[i:i + batch_size]) for i in range(0, len(questions), batch_size)]

results = {}
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_batch, i, chunk) for i, chunk in batches]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        idx, lines = future.result()
        if lines:
            results[idx] = lines
        else:
            print(f"Batch {idx // batch_size} failed.")

# Sort results by original batch order
ordered = []
for i in sorted(results):
    ordered.extend(results[i])

print(f"Total processed: {len(ordered)}")

# Save to CSV
with open("CSVs/bad_questions.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)

    # Filter out code block markers if present
    filtered_lines = [line for line in ordered if line.strip() not in ("```", "```plaintext")]

    for line in filtered_lines:
        line = line.strip().strip('"')
        if not line:
            continue
        # Split on the last comma
        if "," in line:
            split_idx = line.rfind(",")
            question = line[:split_idx].strip().strip('"')
            category = line[split_idx + 1:].strip().strip('"')
            writer.writerow([question, category])

with open("CSVs/bad_questions.csv", "r", encoding="utf-8") as f:
    ok_rows = []
    reader = csv.reader(f)
    for row in reader:
        if len(row) >= 2 and row[1].strip() == "OK":
            ok_rows.append([row[0]])

    with open("CSVs/ok_questions.csv", "w", newline='', encoding="utf-8") as out_f:
        writer = csv.writer(out_f, quoting=csv.QUOTE_ALL)
        for row in ok_rows:
            writer.writerow(row)


questions2 = pd.read_csv("CSVs/ok_questions.csv", header=None)
questions2 = questions2[[0]].values.tolist()

def format_prompt2(qs):

    header = f"""
        You will classify each of the following questions into exactly one of the categories below.
        Pick the **single best** category that fits the main intent of the question.

        Categories:
        - Author Bias
        - Claim Verification
        - Source Background & Credibility
        - Publication Reputation
        - Public Reaction

        Return plain CSV only. Format:
        "<Question>","<Category>"

        Example:
        "What is the background of journalist Jane Doe?","Source Background & Credibility"
        """

    body = "\n".join([f"\"{q[0].strip()}\"" for q in qs])
    return header + body

def process_batch2(index, chunk, retries=3):
    prompt = format_prompt2(chunk)
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=4096,
            )
            return index, response.choices[0].message.content.strip().splitlines()
        except (OpenAIError, RateLimitError) as e:
            wait = 2 ** attempt
            time.sleep(wait)
    return index, []
batch_size = 50
batches = [(i, questions2[i:i + batch_size]) for i in range(0, len(questions2), batch_size)]

results = {}
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_batch2, i, chunk) for i, chunk in batches]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        idx, lines = future.result()
        if lines:
            results[idx] = lines
        else:
            print(f"Batch {idx // batch_size} failed.")

# Sort results by original batch order
ordered = []
for i in sorted(results):
    ordered.extend(results[i])

print(f"Total processed: {len(ordered)}")

# Save to CSV
with open("output1.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)

    # Filter out code block markers if present
    filtered_lines = [line for line in ordered if line.strip() not in ("```", "```plaintext")]

    for line in filtered_lines:
        line = line.strip().strip('"')
        if not line:
            continue
        # Split on the last comma
        if "," in line:
            split_idx = line.rfind(",")
            question = line[:split_idx].strip().strip('"')
            category = line[split_idx + 1:].strip().strip('"')
            writer.writerow([question, category])

Processing:  13%|█▎        | 39/289 [02:01<16:54,  4.06s/it] 

Batch 38 failed.


Processing:  15%|█▍        | 42/289 [02:09<12:01,  2.92s/it]

Batch 40 failed.


Processing:  19%|█▊        | 54/289 [03:03<17:45,  4.54s/it]

Batch 54 failed.


Processing:  25%|██▍       | 71/289 [04:11<11:20,  3.12s/it]

Batch 70 failed.


Processing:  28%|██▊       | 81/289 [04:45<10:12,  2.95s/it]

Batch 78 failed.


Processing:  31%|███       | 90/289 [05:26<12:08,  3.66s/it]

Batch 87 failed.


Processing:  33%|███▎      | 94/289 [05:47<16:59,  5.23s/it]

Batch 92 failed.


Processing:  38%|███▊      | 109/289 [06:41<09:03,  3.02s/it]

Batch 107 failed.


Processing:  44%|████▎     | 126/289 [08:00<12:28,  4.59s/it]

Batch 124 failed.


Processing:  46%|████▌     | 132/289 [08:25<10:55,  4.18s/it]

Batch 131 failed.


Processing:  50%|████▉     | 144/289 [09:12<09:10,  3.80s/it]

Batch 142 failed.


Processing:  52%|█████▏    | 149/289 [09:33<11:35,  4.97s/it]

Batch 148 failed.


Processing:  54%|█████▍    | 157/289 [09:58<08:11,  3.73s/it]

Batch 154 failed.


Processing:  56%|█████▌    | 161/289 [10:14<09:20,  4.38s/it]

Batch 159 failed.


Processing:  58%|█████▊    | 169/289 [10:44<07:48,  3.91s/it]

Batch 167 failed.


Processing:  60%|█████▉    | 173/289 [10:59<08:09,  4.22s/it]

Batch 171 failed.


Processing:  61%|██████    | 176/289 [11:07<06:02,  3.21s/it]

Batch 172 failed.


Processing:  67%|██████▋   | 194/289 [12:27<08:01,  5.07s/it]

Batch 191 failed.


Processing:  72%|███████▏  | 207/289 [13:14<05:05,  3.72s/it]

Batch 205 failed.


Processing:  73%|███████▎  | 210/289 [13:27<05:07,  3.89s/it]

Batch 207 failed.


Processing:  76%|███████▌  | 220/289 [14:00<03:20,  2.90s/it]

Batch 216 failed.


Processing:  80%|███████▉  | 230/289 [14:43<03:11,  3.25s/it]

Batch 228 failed.


Processing:  82%|████████▏ | 238/289 [15:19<04:09,  4.89s/it]

Batch 236 failed.


Processing:  87%|████████▋ | 250/289 [16:12<03:14,  4.99s/it]

Batch 249 failed.


Processing:  90%|█████████ | 261/289 [16:50<01:30,  3.24s/it]

Batch 258 failed.


Processing:  91%|█████████ | 262/289 [16:53<01:27,  3.22s/it]

Batch 260 failed.


Processing:  93%|█████████▎| 269/289 [17:20<01:11,  3.56s/it]

Batch 267 failed.


Processing:  95%|█████████▍| 274/289 [17:36<00:51,  3.45s/it]

Batch 271 failed.


Processing:  98%|█████████▊| 283/289 [18:10<00:20,  3.45s/it]

Batch 279 failed.


Processing:  99%|█████████▉| 286/289 [18:21<00:10,  3.50s/it]

Batch 282 failed.


Processing: 100%|██████████| 289/289 [18:26<00:00,  3.83s/it]


Total processed: 13232


Processing:   9%|▉         | 17/190 [01:20<11:39,  4.05s/it]

Batch 15 failed.
Batch 16 failed.


Processing:  15%|█▍        | 28/190 [01:52<06:02,  2.24s/it]

Batch 25 failed.


Processing:  20%|██        | 38/190 [02:37<08:49,  3.48s/it]

Batch 35 failed.


Processing:  23%|██▎       | 43/190 [02:56<09:29,  3.88s/it]

Batch 41 failed.


Processing:  32%|███▏      | 60/190 [04:03<08:57,  4.13s/it]

Batch 57 failed.


Processing:  35%|███▍      | 66/190 [04:27<08:35,  4.16s/it]

Batch 65 failed.


Processing:  35%|███▌      | 67/190 [04:33<10:04,  4.92s/it]

Batch 66 failed.


Processing:  41%|████      | 77/190 [05:02<05:11,  2.76s/it]

Batch 74 failed.


Processing:  43%|████▎     | 82/190 [05:27<09:33,  5.31s/it]

Batch 81 failed.


Processing:  49%|████▉     | 93/190 [06:05<05:53,  3.64s/it]

Batch 91 failed.


Processing:  50%|█████     | 95/190 [06:11<04:59,  3.15s/it]

Batch 92 failed.


Processing:  54%|█████▍    | 103/190 [06:38<04:03,  2.80s/it]

Batch 100 failed.


Processing:  58%|█████▊    | 110/190 [07:09<05:19,  4.00s/it]

Batch 109 failed.


Processing:  62%|██████▏   | 118/190 [07:37<04:01,  3.36s/it]

Batch 116 failed.


Processing:  64%|██████▍   | 122/190 [07:53<04:42,  4.16s/it]

Batch 120 failed.


Processing:  71%|███████   | 134/190 [08:38<02:53,  3.10s/it]

Batch 132 failed.


Processing:  74%|███████▎  | 140/190 [09:09<05:10,  6.22s/it]

Batch 139 failed.


Processing:  81%|████████  | 153/190 [09:49<01:47,  2.89s/it]

Batch 151 failed.


Processing:  83%|████████▎ | 158/190 [10:09<02:08,  4.00s/it]

Batch 156 failed.


Processing:  87%|████████▋ | 166/190 [10:39<01:24,  3.53s/it]

Batch 164 failed.


Processing:  96%|█████████▌| 182/190 [11:45<00:30,  3.81s/it]

Batch 181 failed.


Processing: 100%|██████████| 190/190 [12:25<00:00,  3.92s/it]

Total processed: 8609



