# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement your program in an Object Oriented Programming style, please put that code at the bottom of this ipynb file.*

In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as func
from torch.utils.data import DataLoader,Dataset
import transformers
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sentence_transformers import SentenceTransformer, InputExample, losses
import faiss

import colbert

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Indexer, Searcher


from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt


In [2]:
import os
# Enter the project directory. The path is relative, so only change directory
# when we are not already inside it; otherwise re-running this cell would raise
# FileNotFoundError.
if os.path.basename(os.getcwd()) != 'Automated-Fact-checking-System':
    os.chdir('Automated-Fact-checking-System')
print("Working space:", os.getcwd())


Working space: /home/lhg45/ColBERT/Automated-Fact-checking-System


# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [3]:
class ClaimEvidenceDataset(Dataset):
    """Dataset of (claim, concatenated evidence) pairs for label classification.

    Args:
        claims: Mapping of claim id -> dict with a ``'claim_text'`` entry, an
            optional ``'evidences'`` list of evidence ids and, when
            ``if_train`` is true, a ``'claim_label'`` entry.
        evidence: Mapping of evidence id -> evidence text. Evidence ids that
            are not present in this mapping are skipped.
        tokenizer: Callable invoked as ``tokenizer(claim, evidence, padding=...,
            truncation=..., max_length=..., return_tensors='pt')`` and expected
            to return ``'input_ids'`` and ``'attention_mask'`` tensors with a
            leading batch dimension of 1.
        max_len: Sequence length passed to the tokenizer as ``max_length``.
        if_train: When true, labels are read from the claims and returned by
            ``__getitem__``.
    """

    def __init__(self, claims, evidence, tokenizer, max_len=512, if_train=True):
        self.claim_texts = []
        self.evidence_texts = []
        self.tokenizer = tokenizer
        self.max_length = max_len
        self.if_train = if_train
        self.claim_ids = []
        if if_train:
            self.labels = []

        self.label2id = {
            "SUPPORTS": 0,
            "REFUTES": 1,
            "NOT_ENOUGH_INFO": 2,
            "DISPUTED": 3
        }

        for claim_idx, claim_info in claims.items():
            self.claim_ids.append(claim_idx)
            self.claim_texts.append(claim_info['claim_text'])
            # Claims without an 'evidences' entry (e.g. unlabelled claims that
            # have not gone through retrieval yet) get an empty evidence text
            # instead of raising KeyError.
            passages = [
                evidence[evid]
                for evid in claim_info.get('evidences', ())
                if evid in evidence
            ]
            self.evidence_texts.append(" ".join(passages).strip())
            if if_train:
                self.labels.append(self.label2id[claim_info['claim_label']])

    def __len__(self):
        """Return the number of claims."""
        return len(self.claim_texts)

    def __getitem__(self, idx):
        """Tokenize claim ``idx`` with its evidence and return model inputs.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (batch
            dimension squeezed away), ``'labels'`` (only when ``if_train``)
            and ``'claim_id'``.
        """
        encoded = self.tokenizer(
            self.claim_texts[idx],
            self.evidence_texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }
        if self.if_train:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        item['claim_id'] = self.claim_ids[idx]
        return item


In [None]:
def _load_json(path):
    """Read a JSON file as UTF-8 and return the parsed object."""
    # Explicit encoding: the platform default is locale dependent and may not
    # be UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


train_claims = _load_json("data/train-claims.json")
test_claims = _load_json("data/test-claims-unlabelled.json")
evidence = _load_json("data/evidence.json")

# Parallel lists: position i in the retrieval index maps back to evidence_ids[i].
evidence_ids = list(evidence.keys())
evidence_texts = list(evidence.values())

dev_data_baseline = _load_json("data/dev-claims-baseline.json")
dev_data = _load_json("data/dev-claims.json")

# Disabled one-off export of the evidence collection to a TSV file
# (one "<row index>\t<text>" line per evidence, tabs in the text replaced by spaces):
#
# with open('data/evidence_colbert.tsv', 'w', encoding='utf-8') as f_out:
#     for idx, text in enumerate(evidence.values()):
#         text = text.replace("\t", " ")
#         f_out.write(f"{idx}\t{text.strip()}\n")


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
# DEPRECATED: this ColBERT indexing step is no longer used; do not run it.
# The snippet below is disabled (it is wrapped in a string literal). It only
# needs to run once and has already been run locally; to rebuild an existing
# index, use the commented-out call that passes overwrite=True.

# ColBERT indexing (tokenization and embedding) of data/evidence_colbert.tsv
'''
checkpoint = "colbert-ir/colbertv2.0"
index_name = "evidence_index"
doc_maxlen = 180

with Run().context(RunConfig(nranks=1, experiment="colab_run", root="./colbert")):
    config = ColBERTConfig(nbits=2, root="./colbert")
    indexer = Indexer(checkpoint=checkpoint, config=config)
    #indexer.index(name=index_name, collection="data/evidence_colbert.tsv",overwrite=True)
    indexer.index(name=index_name, collection="data/evidence_colbert.tsv")
'''

In [5]:
# !!! Run this cell only once, unless you want to fine-tune the model again !!!
# Fine-tunes intfloat/e5-base-v2 on (claim, evidence) pairs from the training claims.

# One positive pair per evidence id that exists in the evidence collection.
train_pairs = [
    (claim_info["claim_text"], evidence[evid_id])
    for claim_info in train_claims.values()
    for evid_id in claim_info["evidences"]
    if evid_id in evidence
]

# The texts are prefixed with "query: " / "passage: " before being fed to the model.
train_examples = []
for claim_text, evidence_text in train_pairs:
    train_examples.append(
        InputExample(texts=["query: " + claim_text, "passage: " + evidence_text])
    )

e5_base_model = SentenceTransformer("intfloat/e5-base-v2")

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(e5_base_model)

# output_path: the fine-tuned model is written to the "e5_finetuned_co2"
# folder in the current working directory.
e5_base_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    warmup_steps=100,
    show_progress_bar=True,
    output_path="e5_finetuned_co2",
)

Step,Training Loss
500,0.4511
1000,0.0791
1500,0.0387
2000,0.0263
2500,0.023
3000,0.0223
3500,0.0186
4000,0.0175
4500,0.0151
5000,0.0152


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [8]:
# Load the fine-tuned E5 model from the "e5_finetuned_co2" folder
# (the output_path used by the fine-tuning cell).
E5_model = SentenceTransformer("e5_finetuned_co2")

# The string literal below is a disabled one-off snippet (it is not executed):
# it encodes every evidence text with the "passage: " prefix, adds the
# embeddings to a faiss.IndexFlatL2 and writes the index to E5/e5_index.faiss.
'''
# Create Indexing system for evidence texts(only run once)
evidence_texts_with_prefix = ["passage: " + text for text in evidence_texts]

evidence_embeddings = E5_model.encode(evidence_texts_with_prefix, convert_to_tensor=False, show_progress_bar=True)

embedding_dim = evidence_embeddings[0].shape[0]
index = faiss.IndexFlatL2(embedding_dim)

index.add(np.array(evidence_embeddings))

faiss.write_index(index, "E5/e5_index.faiss")

'''
# Load the prebuilt index from disk.
# NOTE(review): if the file was produced by the snippet above, this is an L2
# index, i.e. search results are distances where smaller means closer.
e5_index = faiss.read_index("E5/e5_index.faiss")

In [44]:
# Classifier setup: XLM-RoBERTa (large) with a 4-way classification head.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


#model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4).to(device)
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=4).to(device)

epochs = 24

batch_size = 16

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scaler = torch.cuda.amp.GradScaler()#accelerate

train_dataset = ClaimEvidenceDataset(train_claims, evidence, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)

# The scheduler is stepped once per batch, so the schedule length must be the
# real number of batches (the last, partial batch included) times the number of
# epochs. int(len(train_claims) * epochs / batch_size) undercounts whenever the
# dataset size is not a multiple of batch_size, which drives the learning rate
# to zero before training has finished.
train_steps = len(train_dataloader) * epochs
num_steps = int(train_steps * 0.1)  # warm up over the first 10% of the steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

#valid_dataset = ClaimEvidenceDataset(test_claims, evidence, tokenizer, if_train = False)
#valid_dataloader = DataLoader(valid_dataset, batch_size, shuffle=True)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Freeze the whole model, then re-enable gradients only for the last 3 encoder
# layers and the classification head (everything else stays frozen).
trainable_modules = list(model.roberta.encoder.layer[-3:]) + [model.classifier]

for param in model.parameters():
    param.requires_grad = False

for module in trainable_modules:
    for param in module.parameters():
        param.requires_grad = True


In [21]:
def training(train_dataloader, model, optimizer, scheduler, scaler, device):
    """Run one training epoch and return ``(average loss, accuracy)``.

    Args:
        train_dataloader: Iterable of batches providing ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        model: Model called as ``model(ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        optimizer: Optimizer stepped through ``scaler`` once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        scaler: Gradient scaler used for the mixed-precision backward pass.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple of the mean batch loss and the fraction of correctly predicted
        samples; ``(0.0, 0.0)`` when the dataloader yields no batch.
    """
    model.train()
    correct_predictions = 0
    total_loss = 0.0
    num_batches = 0
    total_samples = 0

    for batch in tqdm(train_dataloader):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Only the forward pass runs under autocast.
        with torch.amp.autocast('cuda'):
            outputs = model(ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # Bookkeeping only; detach so no graph is kept alive for the metrics.
        preds = torch.argmax(logits.detach(), dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)
        total_loss += loss.item()
        num_batches += 1

    # An empty dataloader would otherwise end in a ZeroDivisionError.
    if num_batches == 0 or total_samples == 0:
        return 0.0, 0.0

    avg_epoch_loss = total_loss / num_batches
    accuracy = correct_predictions / total_samples

    return avg_epoch_loss, accuracy


def validating(val_dataloader, model, device):
    """Predict a label id for every sample produced by ``val_dataloader``.

    The model is switched to eval mode and run without gradient tracking.

    Returns:
        ``(claim_ids, predictions)``: the ``'claim_id'`` entries of all batches
        and the argmax label index of each sample, in iteration order.
    """
    model.eval()
    predicted_labels = []
    seen_claim_ids = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits

            batch_predictions = logits.argmax(dim=1).cpu().tolist()
            predicted_labels.extend(batch_predictions)
            seen_claim_ids.extend(batch["claim_id"])

    return seen_claim_ids, predicted_labels
    


In [None]:
# Training: run `epochs` epochs, log loss/accuracy per epoch, then plot both curves.
train_losses_list = []
train_accuracies = []

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    # training() shows its own tqdm bar, so no extra bar is created here.
    train_loss, accuracy = training(train_dataloader, model, optimizer, scheduler, scaler, device)

    # Plain prints: the previous pbar.set_postfix({print(...), print(...)})
    # passed the set {None} to tqdm and raised TypeError after the first epoch.
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Train Accuracy: {accuracy:.4f}')
    train_losses_list.append(train_loss)
    train_accuracies.append(accuracy)

epoch_list = range(1, len(train_accuracies) + 1)

# 'o-' carries no colour, so it no longer conflicts with the color= keyword.
plt.figure(figsize=(8, 5))
plt.plot(epoch_list, train_losses_list, 'o-', label="Training Loss", color='blue')
plt.xlabel("Epoch")  # one point per epoch, not per optimizer step
plt.ylabel("Loss")
plt.title("Loss During Training")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

plt.figure(figsize=(8, 5))
plt.plot(epoch_list, train_accuracies, 'o-', label='Training Accuracy', color='green')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training Accuracy over Epochs')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


NameError: name 'epochs' is not defined

In [49]:
# Save the final classifier together with the optimizer/scaler/scheduler state.
save_path = "model_checkpoints/roberta_final.pt"

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scaler_state_dict': scaler.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'epoch': epochs,      # configured number of epochs
    'loss': train_loss    # training loss of the last completed epoch
}, save_path)

print(f"Final model saved to {save_path}")

Final model saved to model_checkpoints/roberta_final.pt


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
# (!!!Deprecated!!!) ColBERT retrieval.
# Opens a searcher on the "evidence_index" index stored under
# colbert/colab_run/indexes, using the colbert-ir/colbertv2.0 checkpoint.
checkpoint = "colbert-ir/colbertv2.0"
index_name = "evidence_index"
doc_maxlen = 180  # not passed to the Searcher below
searcher = Searcher(index=index_name, checkpoint=checkpoint, index_root="colbert/colab_run/indexes")

def retrieve(claim, k=10, threshould=2.0):
    """Retrieve evidence text for ``claim`` with the ColBERT ``searcher``.

    Searches the top ``k`` documents and keeps those whose score is within
    ``threshould`` of the first (top) score.

    Args:
        claim: Query text.
        k: Number of documents requested from the searcher.
        threshould: Maximum allowed score gap to the top result.

    Returns:
        The kept documents joined by single spaces (newlines replaced by
        spaces), or ``""`` when the search returns nothing or no document
        passes the filter.
    """
    with torch.no_grad():
        doc_ids, _ranks, scores = searcher.search(claim, k=k)

    print("scores: " , scores)
    # An empty result would otherwise fail on scores[0] with IndexError.
    if len(scores) == 0:
        return ""

    Top_score = scores[0]
    print(doc_ids)

    filtered = [
        (doc_id, score)
        for doc_id, score in zip(doc_ids, scores)
        if score >= Top_score - threshould
    ]

    if not filtered:
        return ""

    retrieved_text = " ".join(
        searcher.collection[doc_id].replace("\n", " ").strip()
        for doc_id, _score in filtered
    )
    return retrieved_text

# Smoke test of the (deprecated) ColBERT retrieval on a single example claim.
claim = "Weather Channel co-founder John Coleman provided evidence that convincingly refutes the concept of anthropogenic global warming."
retrieved_evidence = retrieve(claim)

# Show the concatenated evidence text returned for the claim.
print(retrieved_evidence)

In [None]:
# E5 retrieval: search the FAISS index and keep every hit whose score is within
# `epsilon` of the best hit.
def E5_retrieve(claim, E5_model, index, epsilon=0.15, top_k=6, use_inner_product=True):
    """Retrieve evidence ids for every claim with the E5 model and a FAISS index.

    Args:
        claim: Mapping of claim id -> dict containing ``'claim_text'``.
        E5_model: Encoder exposing ``encode(text)``; each claim is encoded with
            the ``"query: "`` prefix.
        index: FAISS index searched with the claim embedding.
        epsilon: Maximum allowed score gap to the best hit.
        top_k: Number of neighbours requested from the index.
        use_inner_product: Fallback for how to read the search output when the
            index does not report its metric: ``True`` treats the values as
            similarities (higher is better), ``False`` as distances (lower is
            better). When ``index.metric_type`` is available it takes
            precedence, because it is the index metric that decides what the
            returned values mean.

    Returns:
        Dict mapping each claim id to ``{"claim_text": ..., "evidences": [...]}``
        where ``evidences`` holds the ids, taken from the module-level
        ``evidence_ids`` list, of the hits that passed the threshold.
    """
    # An L2 index returns distances (smaller = closer). Treating them as
    # similarities would keep the *farthest* of the top_k hits, so the
    # direction is derived from the index itself whenever possible.
    metric = getattr(index, "metric_type", None)
    if metric == faiss.METRIC_INNER_PRODUCT:
        higher_is_better = True
    elif metric == faiss.METRIC_L2:
        higher_is_better = False
    else:
        higher_is_better = use_inner_product

    result = {}

    for claim_id, info in claim.items():
        claim_text = info["claim_text"]
        claim_emb = E5_model.encode(f"query: {claim_text}")

        D, I = index.search(np.array([claim_emb]), k=top_k)

        # Normalise to "higher is better" so a single threshold rule applies.
        scores = D[0] if higher_is_better else -D[0]

        # FAISS pads missing neighbours with id -1; drop them so they cannot be
        # mistaken for evidence_ids[-1].
        hits = [(i, s) for i, s in zip(I[0], scores) if i >= 0]

        top_evid_ids = []
        if hits:
            relative_threshold = max(s for _, s in hits) - epsilon
            top_evid_ids = [evidence_ids[i] for i, s in hits if s >= relative_threshold]

        result[claim_id] = {
            "claim_text": claim_text,
            "evidences": top_evid_ids,
        }

    return result

# Disabled usage example (string literal, not executed): runs E5_retrieve on a
# single hand-written claim and prints the result.
'''
claim = {"claim-2967": {"claim_text": "Tree-ring proxy reconstructions are reliable before 1960, tracking closely with the instrumental record and other independent proxies."}}
claim_e5 = E5_retrieve(claim, E5_model,e5_index)
print(claim_e5)
'''


'\nclaim = {"claim-2967": {"claim_text": "Tree-ring proxy reconstructions are reliable before 1960, tracking closely with the instrumental record and other independent proxies."}}\nclaim_e5 = E5_retrieve(claim, E5_model,e5_index)\nprint(claim_e5)\n'

In [None]:
# Restore the fine-tuned classifier. Use the notebook's `device` instead of a
# hard-coded 'cuda' so the cell also works on CPU-only machines and always
# matches the device the model lives on.
checkpoint = torch.load("model_checkpoints/roberta_final.pt", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

print("Model loaded successfully")

model.eval()

batch_size = 16

# Step 1: retrieve evidence ids for every dev claim.
val_data = E5_retrieve(dev_data, E5_model,e5_index)
print("retrieved evidences finished")

# Step 2: classify each claim together with its retrieved evidence.
valid_dataset = ClaimEvidenceDataset(val_data, evidence, tokenizer, if_train = False)
valid_dataloader = DataLoader(valid_dataset, batch_size)

# Inverse of the label2id mapping used by ClaimEvidenceDataset.
id2label = {
    0: "SUPPORTS",
    1: "REFUTES",
    2: "NOT_ENOUGH_INFO",
    3: "DISPUTED"
}

claim_ids, predictions = validating(valid_dataloader, model, device)
print("validating finished")

label_names = [id2label[p] for p in predictions]

# Step 3: attach the predicted label to each claim's retrieval result.
for cid, label in zip(claim_ids, label_names):
    val_data[cid]["claim_label"] = label





In [23]:
# Show the predictions, then write them to data/dev-claims-test.json as JSON.
print(val_data)
serialized_predictions = json.dumps(val_data)
with open("data/dev-claims-test.json", "w") as out_file:
    out_file.write(serialized_predictions)



## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*