Plan of Action:
    1. Use starter pack to generate questions to see how good it is at doing it.
    2. If it is bad at it, we will come back and write our own question generator in the future, but we will use it for now.
    3. Start the report generation task
    4. Sample ideas include: using NLP models to "average out" repetitive questions to reduce API usage, and having predetermined categories for types of questions, each of which must be answered in the report. Examples of categories include, but are not limited to: author bias, fact cross-checking, author reputation, and general questions related to the topic at hand.

This block of code reads the 2024 question assessment document, keeps only the questions rated '3' or '4', and loads the article ID, rating, and question text into a DataFrame.

In [15]:
import pandas as pd
import numpy as np
# Specify the path to the tab-separated assessment file
file_path = '2024-question-assessment.txt'

# Column layout of each row; "Assessment" is index 5 and "Question" is index 7.
column_names = ["AID", "Article", "RID", "Model", "Order", "Assessment", "Useless", "Question"]

# Rows whose assessment is "3" or "4"; kept as raw field lists because a later
# cell reads the question text from index 7 of each row.
array = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Drop the line terminator first so it does not end up inside the
        # last field (the question text).
        fields = line.rstrip("\n").split("\t")
        # Skip blank or truncated lines instead of failing with IndexError.
        if len(fields) < len(column_names):
            continue
        if fields[5] in ("3", "4"):
            array.append(fields)

# Passing the names at construction time also works when no row matched
# (assigning .columns afterwards raises on an empty, zero-column frame).
dataFrame = pd.DataFrame(array, columns=column_names)
dataFrame = dataFrame.drop(columns=["AID", "Model", "RID", "Useless", "Order"])

dataFrame.head()

Unnamed: 0,Article,Assessment,Question
0,clueweb22-en0030-87-05450,3,What are the qualifications of the creators of...
1,clueweb22-en0030-87-05450,3,What is Anne Lise Marstrand-Jørgensen's expert...
2,clueweb22-en0030-87-05450,3,Does the Danish public service broadcaster DR ...
3,clueweb22-en0030-87-05450,3,What evidence supports the claim that the show...
4,clueweb22-en0030-87-05450,3,What is the broader context of Danish children...


In [3]:
import pandas as pd
import json 

# Specify the path to the JSON file holding the baseline documents
file_path = '../trec-2024-lateral-reading-task2-baseline-documents.json'

# Open inside a context manager so the handle is closed once parsing is done,
# and decode explicitly as UTF-8 rather than the platform default (the article
# text contains non-ASCII characters).
with open(file_path, encoding='utf-8') as f:
    data = json.load(f)

articles_df = pd.DataFrame.from_dict(data)
articles_df.head()



Unnamed: 0,URL,URL-hash,Language,ClueWeb22-ID,Clean-Text
0,https://www.dailymail.co.uk/news/article-10130...,1F1D55AC16DCD50B1560DE585165466D,en,clueweb22-en0032-91-05114,"Japan's Princess Mako marries commoner, loses ..."
1,https://www.nytimes.com/2021/08/26/opinion/afg...,4CC82FB7D4CB6DE296C887E0F7F82C57,en,clueweb22-en0027-70-17775,Opinion | Let’s Not Pretend That the Way We Wi...
2,https://www.politicshome.com/thehouse/article/...,357F722430ABDA02F9757BD8E4DF0CAA,en,clueweb22-en0032-18-01494,No longer can China’s atrocities against the U...
3,https://medicaladvise.org/clinical-trials-rese...,D0823CAF7F01DCC0D8112D527D936B86,en,clueweb22-en0012-53-13803,How does molnupiravir work? - Medical Advise\n...
4,https://www.euronews.com/2021/12/10/mexico-tru...,C623D58493D0F372CC5E56F59BD20611,en,clueweb22-en0002-69-11564,Mexico truck crash: Dozens killed after lorry ...


In [28]:
from openai import OpenAI
import os
from dotenv import load_dotenv
from tqdm import tqdm
import time

# Let python-dotenv populate the environment, then look the key up there.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Build the API client with the key read above.
client = OpenAI(api_key=api_key)

# Gather the element at index 7 of every row kept in `array`
# (adjust the index if the row layout changes).
qArray = []
for row in array:
    qArray.append(row[7])

def format_prompt(qs):
    """Build the classification prompt for one batch of questions.

    The prompt is made of three parts: an instruction line naming the four
    allowed categories, the questions as a 1-based numbered list (each
    question stripped of surrounding whitespace), and a closing instruction
    asking for a two-column CSV reply.

    Args:
        qs: Iterable of question strings.

    Returns:
        The full prompt as a single string.
    """
    allowed = ", ".join((
        "Author Bias",
        "Factual Query",
        "Author Reputation",
        "Publication Reputation",
    ))
    header = f"Classify each of the following questions into one of these categories per line: {allowed}.\n\n"

    # One "<n>. <question>" line per question, numbered from 1.
    numbered = "".join(
        f"{number}. {question.strip()}\n"
        for number, question in enumerate(qs, start=1)
    )

    footer = (
        '\nReturn the result as plain CSV with no extra text. Each row should have two columns: '
        '"Question" and "Category". Format like this:\n\n'
        '"Question","Category"\n'
        '"Example question here?","Author Bias"\n'
    )
    return header + numbered + footer

# Batch the questions to avoid exceeding token/context limits
batch_size = 20  # Adjust as needed for your model/context
prompts = [format_prompt(qArray[i:i+batch_size]) for i in range(0, len(qArray), batch_size)]
outputs = []

for prompt_text in tqdm(prompts, desc="Processing batches"):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt_text}],
        temperature=0
    )
    # The message content is optional in the API response; store "" so the
    # writer below never has to deal with None.
    outputs.append(response.choices[0].message.content or "")
    time.sleep(0.2)  # brief pause between consecutive requests


def _is_csv_header(line):
    """Return True when `line` is the "Question","Category" header row."""
    return line.replace('"', "").replace(" ", "").lower() == "question,category"


# Save all outputs to a single CSV file
with open("output.csv", "w", encoding="utf-8") as f:
    # Write one known-good header up front instead of trusting the first line
    # of the first reply to be a header.
    f.write('"Question","Category"\n')
    for output in outputs:
        for line in output.splitlines():
            row = line.strip()
            # Drop blank lines and markdown code fences the model may add
            # around its answer despite the "no extra text" instruction.
            if not row or row.startswith("```"):
                continue
            # Drop only real header rows; a reply that omitted the header
            # must not lose its first data row.
            if _is_csv_header(row):
                continue
            f.write(row + "\n")

Processing batches: 100%|██████████| 236/236 [23:14<00:00,  5.91s/it]


In [93]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch

# Define label names for readability; predictions are mapped back to text
# by indexing into this list.
label_names = ["Author Bias",
        "Factual Query",
        "Author Reputation",
        "Publication Reputation"]

labels = list(range(len(label_names)))  # integer ids 0..3, one per label name

# output.csv is written with a "Question","Category" header row.  header=0
# consumes that row (while `names` still sets the column names); with
# header=None it would be read as a data row whose category is "Category".
df1 = pd.read_csv("output.csv", header=0, names=["Question", "Category"])

# Keep only complete rows whose category is one of the known labels; the
# classifier head has exactly len(label_names) outputs, so any other value
# cannot be trained on.
df1 = df1.dropna(subset=["Question", "Category"])
df1["Question"] = df1["Question"].astype(str)
df1["Category"] = df1["Category"].astype(str).str.strip()
df1 = df1[df1["Category"].isin(label_names)]

col1 = df1.iloc[:, 0].to_numpy()
col2 = df1.iloc[:, 1].to_numpy()


# 70/30 train/test split with a fixed seed for reproducibility
Q_train, Q_test, c_train, c_test = train_test_split(col1, col2, test_size=0.3, random_state=42)


# Load tokenizer and model with built-in classification head
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_names))

# Define custom Dataset class
class TextDataset(Dataset):
    """Tokenized texts paired with integer class labels.

    Each item is a dict holding one row of every tokenizer output tensor
    plus a "labels" entry with the class id of that row.

    Args:
        Q_train: List of text strings to tokenize (uses the module-level
            `tokenizer`).
        c_train: Category name for each text, in the same order.
        label_order: Optional list of category names; a category's class id
            is its position in this list.  Defaults to the module-level
            `label_names`, so the ids agree with the list used to turn
            predicted indices back into names.

    Raises:
        ValueError: If `c_train` contains a category missing from
            `label_order`.
    """

    def __init__(self, Q_train, c_train, label_order=None):
        if label_order is None:
            label_order = label_names
        self.encodings = tokenizer(Q_train, truncation=True, padding=True, return_tensors="pt")
        # Explicit name -> id mapping.  A LabelEncoder fitted here would
        # number the classes alphabetically (and only those present in this
        # split), which does not match the order of `label_names`.
        self.labels = torch.tensor(self._encode_labels(c_train, label_order), dtype=torch.long)

    @staticmethod
    def _encode_labels(categories, label_order):
        """Map category names to their positions in `label_order`."""
        index_of = {name: idx for idx, name in enumerate(label_order)}
        unknown = sorted({str(c) for c in categories if c not in index_of})
        if unknown:
            raise ValueError(f"Unknown categories (not in label order): {unknown}")
        return [index_of[c] for c in categories]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice row `idx` out of every encoding tensor and attach its label.
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Prepare dataset
dataset = TextDataset(list(Q_train), list(c_train))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no"
)

# Set up Trainer API to handle training loop
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

# Fine-tune the model
trainer.train()

# Predict on the held-out test questions
model.eval()  # disable dropout so predictions are deterministic
# Q_test is a NumPy array; the tokenizer expects a str or a list of str.
test_texts = [str(q) for q in Q_test]
eval_batch_size = 32  # forward the test set in chunks to bound memory use
batch_predictions = []
with torch.no_grad():  # inference only: do not build the autograd graph
    for start in range(0, len(test_texts), eval_batch_size):
        batch = tokenizer(
            test_texts[start:start + eval_batch_size],
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        # Training may have moved the model to another device; the inputs
        # must live on the same one.
        batch = {key: val.to(model.device) for key, val in batch.items()}
        logits = model(**batch).logits
        batch_predictions.append(torch.argmax(logits, dim=1).cpu())

if batch_predictions:
    predictions = torch.cat(batch_predictions)
else:
    predictions = torch.empty(0, dtype=torch.long)

for text, label_idx in zip(test_texts, predictions.tolist()):
    print(f"{text} -> {label_names[label_idx]}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`