## 2.1 Dataset Description

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("CLAN_data.csv")

# First split: 70% training and 30% temporary set
train_data, temp_data = train_test_split(data, test_size=0.30, random_state=42)

# Second split: split the temporary set equally into validation and test sets (15% each of the original dataset)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print("Training set samples:", train_data.shape[0])
print("Validation set samples:", val_data.shape[0])
print("Test set samples:", test_data.shape[0])

train_data.to_csv("train.csv", index=False)
val_data.to_csv("validation.csv", index=False)
test_data.to_csv("test.csv", index=False)


Training set samples: 1967
Validation set samples: 422
Test set samples: 422


## 2.2 Preprocessing

In [3]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split

CONTRACTION_MAP = {
    "he‚Äôll": "he will",
    "she‚Äôs": "she is",
    "gov.": "governor",
    "feb.": "february",
    "vp": "vice president",
    "eta": "estimated time of arrival",
    "i'm": "i am",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "it's": "it is",
    "that's": "that is",
    "there's": "there is",
    "what's": "what is",
    "could've": "could have",
    "should've": "should have",
    "would've": "would have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "let's": "let us",
    "c'mon": "come on",
    "gotta": "got to",
    "wanna": "want to",
    "gonna": "going to",
    "ain't": "is not",
    "u": "you",
    "r": "are",
    "pls": "please",
    "thx": "thanks",
    "kinda": "kind of",
    "sorta": "sort of",
    "idk": "i do not know",
    "btw": "by the way",
    "imho": "in my humble opinion"
}

def expand_contractions(text, contraction_map=CONTRACTION_MAP):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contraction_map.keys()) + r')\b', flags=re.IGNORECASE)

    def replace(match):
        return contraction_map[match.group(0).lower()]

    return pattern.sub(replace, text)

def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def preprocess_text(text):
    text = text.lower()
    text = expand_contractions(text)
    text = clean_text(text)
    return text

data = pd.read_csv("CLAN_data.csv")

data['Processed_Post'] = data['Social Media Post'].apply(preprocess_text)

train_data, temp_data = train_test_split(data, test_size=0.30, random_state=42)

val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print("Training set samples:", train_data.shape[0])
print("Validation set samples:", val_data.shape[0])
print("Test set samples:", test_data.shape[0])

train_data.to_csv("train.csv", index=False)
val_data.to_csv("validation.csv", index=False)
test_data.to_csv("test.csv", index=False)


Training set samples: 1967
Validation set samples: 422
Test set samples: 422


In [4]:
!pip install evaluate
!pip install rouge_score
!pip install bert_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [5]:
import re
import pandas as pd
import torch
import gc
from torch.utils.data import Dataset
from transformers import (
    BartTokenizer, BartForConditionalGeneration,
    T5Tokenizer, T5ForConditionalGeneration,
    Trainer, TrainingArguments, TrainerCallback
)
import evaluate

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## 2.3 Model Training

In [7]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    BartTokenizer, BartForConditionalGeneration,
    T5Tokenizer, T5ForConditionalGeneration,
    Trainer, TrainingArguments
)
import h5py

class ClaimNormalizationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_max_length=512, target_max_length=128, model_type="bart"):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        self.model_type = model_type

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = self.data.iloc[idx]['Processed_Post']
        target_text = self.data.iloc[idx]['Normalized Claim']

        if self.model_type == "t5":
            source_text = "normalize: " + source_text

        source = self.tokenizer.encode_plus(
            source_text,
            max_length=self.source_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        target = self.tokenizer.encode_plus(
            target_text,
            max_length=self.target_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': target['input_ids'].squeeze(),
        }

def fine_tune_model(model_name, model_type="bart", train_df=None, val_df=None, output_dir="./results", num_train_epochs=3):
    if model_type == "bart":
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)
    elif model_type == "t5":
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
    else:
        raise ValueError("Unsupported model type. Choose 'bart' or 't5'.")

    train_dataset = ClaimNormalizationDataset(train_df, tokenizer, model_type=model_type)
    val_dataset = ClaimNormalizationDataset(val_df, tokenizer, model_type=model_type)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=100,
        learning_rate=3e-5,
        weight_decay=0.01,
        fp16=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    h5_filename = output_dir + "_model.h5"
    state_dict = model.state_dict()
    with h5py.File(h5_filename, 'w') as hf:
        for key, tensor in state_dict.items():
            hf.create_dataset(key, data=tensor.cpu().numpy())
    print(f"Model weights saved in H5 format at: {h5_filename}")

    tokenizer.save_pretrained(output_dir + "_tokenizer")

    return model, tokenizer, trainer

if __name__ == "__main__":
    train_df = pd.read_csv("train.csv")
    val_df = pd.read_csv("validation.csv")
    print("Fine-tuning BART model...")
    bart_model_name = "facebook/bart-base"
    fine_tune_model(
        model_name=bart_model_name,
        model_type="bart",
        train_df=train_df,
        val_df=val_df,
        output_dir="./bart_results",
        num_train_epochs=15
    )

    print("Fine-tuning T5 model...")
    t5_model_name = "t5-base"
    fine_tune_model(
        model_name=t5_model_name,
        model_type="t5",
        train_df=train_df,
        val_df=val_df,
        output_dir="./t5_results",
        num_train_epochs=15
    )


Fine-tuning BART model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maaditya23006[0m ([33maaditya23006-indraprastha-institute-of-information-techn[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.5561,0.488229
2,0.4538,0.461334
3,0.3584,0.462885
4,0.3024,0.461901
5,0.2365,0.472606
6,0.2297,0.48206
7,0.1838,0.494246
8,0.167,0.502168
9,0.1338,0.5132
10,0.1234,0.514722




Model weights saved in H5 format at: ./bart_results_model.h5
Fine-tuning T5 model...


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.5682,0.512522
2,0.5165,0.485241
3,0.4604,0.473311
4,0.4417,0.466291
5,0.4018,0.462945
6,0.4191,0.458371
7,0.3851,0.453217
8,0.4026,0.448579
9,0.3776,0.447423
10,0.387,0.455739


Model weights saved in H5 format at: ./t5_results_model.h5


In [10]:
from google.colab import drive
drive.mount('/content/drive')
!cp bart_results_model.h5 /content/drive/MyDrive/


Mounted at /content/drive


In [11]:
from google.colab import drive
!cp t5_results_model.h5 /content/drive/MyDrive/


In [12]:
!pip install evaluate




## 2.4 Evaluation

In [13]:
import torch
from torch.utils.data import DataLoader
import evaluate
import pandas as pd
import h5py

def load_model_weights_from_h5(model, h5_filepath):
    state_dict = {}
    with h5py.File(h5_filepath, 'r') as hf:
        for key in hf.keys():
            state_dict[key] = torch.tensor(hf[key][:])
    model.load_state_dict(state_dict)
    return model

class ClaimNormalizationDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, source_max_length=512, target_max_length=128, model_type="bart"):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        self.model_type = model_type

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = self.data.iloc[idx]['Processed_Post']
        target_text = self.data.iloc[idx]['Normalized Claim']

        if self.model_type == "t5":
            source_text = "normalize: " + source_text

        source = self.tokenizer.encode_plus(
            source_text,
            max_length=self.source_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target = self.tokenizer.encode_plus(
            target_text,
            max_length=self.target_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': target['input_ids'].squeeze(),
        }

def evaluate_model(model, tokenizer, test_df, model_type="bart", batch_size=4, device="cuda"):
    test_dataset = ClaimNormalizationDataset(test_df, tokenizer, model_type=model_type)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model.to(device)
    model.eval()

    all_predictions = []
    all_references = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )

            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            decoded_refs = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

            all_predictions.extend(decoded_preds)
            all_references.extend(decoded_refs)

    rouge_metric = evaluate.load("rouge")
    bleu_metric = evaluate.load("bleu")
    bertscore_metric = evaluate.load("bertscore")

    rouge_results = rouge_metric.compute(predictions=all_predictions, references=all_references, use_stemmer=True)

    bleu_results = bleu_metric.compute(
        predictions=all_predictions,
        references=[[ref] for ref in all_references]
    )

    bertscore_results = bertscore_metric.compute(predictions=all_predictions, references=all_references, lang="en")
    avg_bertscore_f1 = sum(bertscore_results["f1"]) / len(bertscore_results["f1"])

    print("ROUGE Results:")
    print(rouge_results)
    print("\nBLEU Results:")
    print(bleu_results)
    print("\nBERTScore (average F1):")
    print(avg_bertscore_f1)

    return all_predictions, all_references, rouge_results, bleu_results, bertscore_results

if __name__ == "__main__":
    test_df = pd.read_csv("test.csv")
    from transformers import BartTokenizer, BartForConditionalGeneration
    bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    bart_h5_path = "./bart_results_model.h5"
    bart_model = load_model_weights_from_h5(bart_model, bart_h5_path)

    print("Evaluating BART model on test data:")
    bart_preds, bart_refs, bart_rouge, bart_bleu, bart_bertscore = evaluate_model(
        bart_model, bart_tokenizer, test_df, model_type="bart", batch_size=4, device="cuda" if torch.cuda.is_available() else "cpu"
    )
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
    t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
    t5_h5_path = "./t5_results_model.h5"
    t5_model = load_model_weights_from_h5(t5_model, t5_h5_path)

    print("\nEvaluating T5 model on test data:")
    t5_preds, t5_refs, t5_rouge, t5_bleu, t5_bertscore = evaluate_model(
        t5_model, t5_tokenizer, test_df, model_type="t5", batch_size=4, device="cuda" if torch.cuda.is_available() else "cpu"
    )

    best_model_name = None
    if bart_bleu["bleu"] > t5_bleu["bleu"]:
        best_model = bart_model
        best_tokenizer = bart_tokenizer
        best_model_name = "BART"
    else:
        best_model = t5_model
        best_tokenizer = t5_tokenizer
        best_model_name = "T5"

    print(f"\nBest model based on BLEU score: {best_model_name}")
    best_model.save_pretrained("best_model")
    best_tokenizer.save_pretrained("best_model")
    print("Best model saved in the directory 'best_model'.")



Evaluating BART model on test data:


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE Results:
{'rouge1': np.float64(0.41313154845136735), 'rouge2': np.float64(0.2820969403705932), 'rougeL': np.float64(0.3819289745530024), 'rougeLsum': np.float64(0.383040035030908)}

BLEU Results:
{'bleu': 0.20414310328058877, 'precisions': [0.36661384046487056, 0.22657342657342658, 0.17969678953626636, 0.15080875356803045], 'brevity_penalty': 0.9372135048154641, 'length_ratio': 0.9391045516557113, 'translation_length': 7572, 'reference_length': 8063}

BERTScore (average F1):
0.8890135937957402

Evaluating T5 model on test data:


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE Results:
{'rouge1': np.float64(0.3486721482798906), 'rouge2': np.float64(0.21755575324811913), 'rougeL': np.float64(0.32087188149016604), 'rougeLsum': np.float64(0.32173402663488215)}

BLEU Results:
{'bleu': 0.13905448072174112, 'precisions': [0.2976416373000813, 0.16259344450833813, 0.11798010711553175, 0.09519136408243375], 'brevity_penalty': 0.9107186376013938, 'length_ratio': 0.9144769459593456, 'translation_length': 7378, 'reference_length': 8068}

BERTScore (average F1):
0.8747888701786927

Best model based on BLEU score: BART
Best model saved in the directory 'best_model'.


In [14]:
import h5py

best_model_name = None
if bart_bleu["bleu"] > t5_bleu["bleu"]:
    best_model = bart_model
    best_tokenizer = bart_tokenizer
    best_model_name = "BART"
else:
    best_model = t5_model
    best_tokenizer = t5_tokenizer
    best_model_name = "T5"

print(f"\nBest model based on BLEU score: {best_model_name}")

best_h5_path = "best_model.h5"
state_dict = best_model.state_dict()
with h5py.File(best_h5_path, 'w') as hf:
    for key, tensor in state_dict.items():
        hf.create_dataset(key, data=tensor.cpu().numpy())
print(f"Best model weights saved in H5 format at: ./content/{best_h5_path}")



Best model based on BLEU score: BART
Best model weights saved in H5 format at: ./content/best_model.h5


In [15]:
from google.colab import drive
!cp best_model.h5 /content/drive/MyDrive/

## 2.5 Testing and Model Inference


In [16]:
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import evaluate
import h5py
from transformers import BartTokenizer, BartForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration

def load_model_weights_from_h5(model, h5_filepath):
    state_dict = {}
    with h5py.File(h5_filepath, 'r') as hf:
        for key in hf.keys():
            state_dict[key] = torch.tensor(hf[key][:])
    model.load_state_dict(state_dict)
    return model

class ClaimNormalizationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_max_length=512, target_max_length=128, model_type="bart"):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        self.model_type = model_type

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = self.data.iloc[idx]['Processed_Post']
        target_text = self.data.iloc[idx]['Normalized Claim']

        if self.model_type == "t5":
            source_text = "normalize: " + source_text

        source = self.tokenizer.encode_plus(
            source_text,
            max_length=self.source_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        target = self.tokenizer.encode_plus(
            target_text,
            max_length=self.target_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': target['input_ids'].squeeze(),
        }

def test_and_infer(model_type="bart", test_csv="test.csv", batch_size=4,
                   device="cuda" if torch.cuda.is_available() else "cpu", h5_filepath="best_model.h5"):
    if model_type == "bart":
        tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
        model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    elif model_type == "t5":
        tokenizer = T5Tokenizer.from_pretrained("t5-base")
        model = T5ForConditionalGeneration.from_pretrained("t5-base")
    else:
        raise ValueError("Unsupported model type. Choose 'bart' or 't5'.")

    model = load_model_weights_from_h5(model, h5_filepath)

    test_df = pd.read_csv(test_csv)

    test_dataset = ClaimNormalizationDataset(test_df, tokenizer, model_type=model_type)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model.to(device)
    model.eval()

    all_predictions = []
    all_references = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )

            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            decoded_refs = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

            all_predictions.extend(decoded_preds)
            all_references.extend(decoded_refs)

    rouge_metric = evaluate.load("rouge")
    bleu_metric = evaluate.load("bleu")
    bertscore_metric = evaluate.load("bertscore")

    rouge_results = rouge_metric.compute(predictions=all_predictions, references=all_references, use_stemmer=True)

    bleu_results = bleu_metric.compute(
        predictions=all_predictions,
        references=[[ref] for ref in all_references]
    )

    bertscore_results = bertscore_metric.compute(predictions=all_predictions, references=all_references, lang="en")
    avg_bertscore_f1 = sum(bertscore_results["f1"]) / len(bertscore_results["f1"])

    print("Evaluation Metrics:")
    print("ROUGE Results:", rouge_results)
    print("BLEU Results:", bleu_results)
    print("BERTScore (average F1):", avg_bertscore_f1)

    test_df["Predicted_Normalized_Claim"] = all_predictions
    test_df.to_csv("test_with_predictions.csv", index=False)
    print("Inference completed. Predictions saved to 'test_with_predictions.csv'.")

    return all_predictions, all_references, rouge_results, bleu_results, bertscore_results

if __name__ == "__main__":
    predictions, references, rouge_res, bleu_res, bertscore_res = test_and_infer(
        model_type="bart",
        test_csv="test.csv",
        batch_size=4,
        h5_filepath="/content/drive/MyDrive/best_model.h5"
    )


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
ROUGE Results: {'rouge1': np.float64(0.41313154845136735), 'rouge2': np.float64(0.2820969403705932), 'rougeL': np.float64(0.3819289745530024), 'rougeLsum': np.float64(0.383040035030908)}
BLEU Results: {'bleu': 0.20414310328058877, 'precisions': [0.36661384046487056, 0.22657342657342658, 0.17969678953626636, 0.15080875356803045], 'brevity_penalty': 0.9372135048154641, 'length_ratio': 0.9391045516557113, 'translation_length': 7572, 'reference_length': 8063}
BERTScore (average F1): 0.8890135937957402
Inference completed. Predictions saved to 'test_with_predictions.csv'.
