# Bert Sequence To Sequence for claim verification

In [1]:
from transformers import DataCollatorWithPadding,Trainer, AutoTokenizer, TFTrainingArguments, TFTrainer, TFBertModel, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
import pandas as pd
from datasets import Dataset
import evaluate
import numpy as np
from huggingface_hub import notebook_login
from transformers import create_optimizer, TFAutoModelForSequenceClassification, AutoModelForSequenceClassification, TrainingArguments, Trainer
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback



# Hugging face setup

In [2]:
access_token = "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"
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Data processing

A class to retrieve a fact verfication dataset from an excel file of either csv or xscl

In [3]:

def dataframe_from_excel(filepath,sep=None):
    claims = []
    label = []
    evidence_accumulator = []
    
    if sep == None:
        # Load claims
        raw_fact_ver_dataframe = pd.read_excel(filepath)
    else:
        raw_fact_ver_dataframe = pd.read_csv(filepath,sep=sep)


    # Iterate through the rows
    for index, row in raw_fact_ver_dataframe.iterrows():
        # Append the claim every 6 rows
        if index % 6 == 0:
            # Combine all pieces of evidence into one long string
            combined_evidence = "".join(str(evidence) for evidence in evidence_accumulator)

            # Combine claim and evidence pieces
            claim = row['Claim_text'] + "".join(combined_evidence)

            # Append the claim and combined evidence to their respective lists
            claims.append(claim)
            label.append(row['Label'])

            # Reset the evidence accumulator
            evidence_accumulator = []

        # Append the current evidence to the accumulator
        evidence_accumulator.append(row['Evidence_text'])

    # If there are any remaining rows with evidence, add them as the last entry
    if evidence_accumulator:
        combined_evidence = "".join(evidence_accumulator)
        claims.append(raw_fact_ver_dataframe.iloc[-1]['Claim_text'])
        # evidence.append(combined_evidence)
        label.append(row['Label'])
    
    # Create a DataFrame
    fact_ver_dataframe = pd.DataFrame({
        'label': label,
        'text': claims
    })
    
    # Create a mapping from labels to numerical values
    label_mapping = {'F': 0, 'N': 1, 'T': 2}

    # Replace the labels in the 'Label' column with numerical values
    fact_ver_dataframe['label'] = fact_ver_dataframe['label'].map(label_mapping)

    return fact_ver_dataframe


## generate the fact verification dataframe

Call the dataframe class to generate a pandas dataframe containing claims,labels, text etc

In [5]:
fact_ver_dataframe = dataframe_from_excel("Claims.xlsx")
text = fact_ver_dataframe.iloc[1]['text']

count =0
for char in text:
    if char == ' ':
        count += 1

# Print the count of spaces
print("Number of spaces:", count)

Number of spaces: 204


# Dataset Train Test Split

Set up a hugging face Dataset class with a 80/20 train test split.

In [8]:
# Create a Dataset from pandas DataFrame
dataset = Dataset.from_pandas(fact_ver_dataframe)

dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
dataset.push_to_hub("Train_Test")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/587 [00:00<?, ?B/s]

# Tokenizor 
Sets up a tokenizing function using bert base base uncasssed with truncation and padding enabled to ensure uniform size.

In [9]:
# Set up tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", tokenizer_options={"truncation": True , "padding": True})

# Tokenization function
def preprocess_function(token):
    return tokenizer(token["text"], truncation=True, padding=True)

# Tokenize dataset

Tokenizes the dataset using tokenizor function and sets up a data collator for use in training

In [10]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

Map:   0%|          | 0/122 [00:00<?, ? examples/s]

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

# Evaluation Metrics

Sets up an evaluation metric for use in anaylsis of the model

In [11]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


# Label Encoding

Sets up label encoding for use in model

In [12]:
id2label = {0: "FALSE", 1: "NOT_ENOUGH_INFO", 2: "TRUE"}
label2id = {"FALSE": 0, "NOT_ENOUGH_INFO": 1, "TRUE": 2}


# Model

Use Bert Sequence classification model with 3 classes

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id, token=access_token
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Trainer

Sets up training for model

In [12]:
training_args = TrainingArguments(
    output_dir="training_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs = 4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    report_to="tensorboard",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.push_to_hub()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.917704,0.354839
2,No log,0.891521,0.354839
3,No log,0.893651,0.290323
4,No log,0.896307,0.290323


'https://huggingface.co/Brecon/training_model/tree/main/'

# Optimizer

In [None]:

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)


# Model

In [None]:

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

# Validation

In [None]:

tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)


model.compile(optimizer=optimizer)  # No loss argument!


# Modle send

In [None]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

push_to_hub_callback = PushToHubCallback(
    output_dir="validation_model",
    tokenizer=tokenizer,
)

callbacks = [metric_callback, push_to_hub_callback]

model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)

In [None]:
fact_ver_dataframe = dataframe_from_excel("Master_Claims.csv",sep='|')

In [None]:
# Create a Dataset from pandas DataFrame
dataset = Dataset.from_pandas(fact_ver_dataframe)

dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)


In [None]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)



In [None]:
id2label = {0: "FALSE", 1: "NOT_ENOUGH_INFO", 2: "TRUE"}
label2id = {"FALSE": 0, "NOT_ENOUGH_INFO": 1, "TRUE": 2}


In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id, token=access_token
)

In [None]:
training_args = TrainingArguments(
    output_dir="training_master_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.push_to_hub()

In [None]:
batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)


In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

In [None]:

tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)


model.compile(optimizer=optimizer)  # No loss argument!


In [None]:

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

push_to_hub_callback = PushToHubCallback(
    output_dir="master_validation_model",
    tokenizer=tokenizer,
)

callbacks = [metric_callback, push_to_hub_callback]

model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)