In [1]:
# !pip install -U  torch transformers torchsummary  
# apex jupyter ipywidgets

In [2]:
import copy
import math
import random
import torch
import warnings
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
from transformers.file_utils import (
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from transformers.modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.models.bart.modeling_bart import (
    BartLearnedPositionalEmbedding,
    BartDecoderLayer,
    BartPreTrainedModel,
)
from transformers.utils import logging
from typing import List, Optional, Tuple, Union
from transformers.modeling_attn_mask_utils import (
    _prepare_4d_attention_mask,
    _prepare_4d_causal_attention_mask,
)
from transformers.activations import ACT2FN
from tqdm import tqdm

from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import get_scheduler

# BART Model Set Up

## Helper code

In [3]:
class VectorizedFeatureSpecificMultiHeadAttention(torch.nn.Module):
    def __init__(self, num_heads, d_model, num_features):
        """
        Initializes the VectorizedFeatureSpecificMultiHeadAttention module.

        Parameters:
        num_heads (int): Number of attention heads.
        d_model (int): Dimensionality of the input feature space.
        num_features (int): Number of distinct features / experts.
        """
        super(VectorizedFeatureSpecificMultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.num_features = num_features

        assert d_model % self.num_heads == 0, "d_model must be divisible by num_heads"

        self.depth = d_model // self.num_heads

        self.wq = torch.nn.Linear(d_model, d_model)
        self.wk = torch.nn.Linear(
            d_model, d_model
        )  # Same as original since key is already feature-specific

    def split_heads(self, x, batch_size, seq_len):
        """
        Splits the last dimension of x into (num_heads, depth) and reshapes.

        Parameters:
        x (torch.Tensor): Input tensor.
        seq_len (int): Sequence length for the reshaping process.

        Returns:
        torch.Tensor: Reshaped tensor.
        """
        new_shape = x.size()[:-1] + (self.num_heads, self.depth)
        x = x.reshape(*new_shape).permute(
            0, 1, 3, 2, 4
        )  # (num_features, batch_size, num_heads, seq_len, depth)
        return x

    def forward(self, query, key):
        """
        Forward pass for the VectorizedFeatureSpecificMultiHeadAttention.

        Parameters:
        query (torch.Tensor): Query tensor of shape (batch_size, seq_len_q, d_model).
        key (torch.Tensor): Key tensor of shape (num_features, batch_size, seq_len_k, d_model).

        Returns:
        torch.Tensor: Concatenated attention weights across all heads and features.
        """
        batch_size = query.size(0)
        seq_len_q = query.size(1)
        seq_len_k = key.size(2)

        # Prepare query and key
        query = self.split_heads(
            self.wq(query), batch_size, seq_len_q
        )  # (batch_size, num_heads, seq_len_q, depth)
        key = self.split_heads(
            self.wk(key), batch_size, seq_len_k
        )  # (num_features, batch_size, num_heads, seq_len_k, depth)

        # Compute attention
        attention_weights = self.compute_attention(query, key, seq_len_k)

        return attention_weights

    def compute_attention(self, query, key, seq_len_k):
        """
        Computes the scaled dot-product attention.

        Parameters:
        query (torch.Tensor): Query tensor.
        key (torch.Tensor): Key tensor.
        seq_len_k (int): Key sequence length.

        Returns:
        torch.Tensor: Attention weights.
        """
        # Expand query to match key's features
        # query = query.unsqueeze(0).expand(
        #     self.num_features, -1, -1, -1, -1
        # )  # (num_features, batch_size, num_heads, seq_len_q, depth)

        # Perform batch matrix multiplication
        matmul_qk = torch.matmul(
            query, key.transpose(-2, -1)
        )  # (num_features, batch_size, num_heads, seq_len_q, seq_len_k)

        # Scale
        scale_factor = 1 / torch.sqrt(torch.tensor(self.depth, dtype=torch.float32))
        attention_weights = matmul_qk * scale_factor

        # # Apply softmax along the dimension of key's sequence length
        # attention_weights = F.softmax(scaled_attention_logits, dim=-1)

        # Reshape to combine features and batch
        # _, _, num_heads, _, _ = scaled_attention_logits.shape
        # attention_weights = scaled_attention_logits.permute(
        #     1, 2, 0, 3, 4
        # ).contiguous()  # (batch_size, num_heads, num_features, seq_len_q, seq_len_k)

        return attention_weights

## Decoder Layer

In [4]:
# BartDecoderLayer Modification
class CustomBartDecoderLayer(nn.Module):
    def __init__(self, config: BartConfig, layer):
        super().__init__()
        self.layer = layer
        self.num_replicas = num_replicas
        self.embed_dim = config.d_model  # Assuming embed_dim is d_model
        self.fc1 = nn.ModuleList(
            [
                nn.Linear(self.embed_dim, self.embed_dim, bias=False)
                for _ in range(num_replicas)
            ]
        )
        # print(self.embed_dim, config.decoder_ffn_dim)
        # self.fc2 = nn.ModuleList(
        #     [nn.Linear(self.embed_dim, self.embed_dim, bias=False) for _ in range(num_replicas)]
        # )
        self.q1 = nn.Linear(self.embed_dim, self.embed_dim)
        self.k1 = nn.Linear(self.embed_dim, self.embed_dim)

        self.attention_weight_bias = nn.Parameter(torch.zeros(num_replicas, 1, 1, 1))

        # self.q2 = nn.Linear(self.embed_dim, self.embed_dim)
        # self.k2 = nn.Linear(self.embed_dim, self.embed_dim)

        # print(self.embed_dim, config.decoder_ffn_dim, config.decoder_attention_heads)

        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.mh_attn = VectorizedFeatureSpecificMultiHeadAttention(
            num_heads=config.decoder_attention_heads,
            d_model=self.embed_dim,
            num_features=self.num_replicas,
        )
        # config.decoder_attention_heads
        # for i, fc1 in enumerate(self.fc1):
        #     # Initialize with a simple pattern, e.g., all elements in the weight matrix are set to the index of the layer
        #     nn.init.constant_(fc1.weight, 17 * i)

    def forward(self, x, *args, **kwargs):
        outputs = self.layer(x, *args, **kwargs)
        hidden_states = outputs[0]
        residual = hidden_states

        batch_size = hidden_states.size(0)
        seq_len = hidden_states.size(1)

        p1 = torch.stack([fc1.weight for fc1 in self.fc1], dim=0)
        # .transpose(0, 1)
        p1 = p1.unsqueeze(1).expand(-1, batch_size, -1, -1)

        hidden_states_reshaped = hidden_states.unsqueeze(0).expand(
            self.num_replicas, -1, -1, -1
        )
        fc1_concat = torch.matmul(hidden_states_reshaped, p1)
        # fc1_concat1 = torch.stack([fc1(hidden_states) for fc1 in self.fc1], dim=0)

        # print("p1", p1.shape)
        # print("hidden_states_reshaped",hidden_states_reshaped.shape)
        # print("fc1_concat1",fc1_concat1.shape)
        # print("fc1_concat", fc1_concat.shape)
        # torch.set_printoptions(sci_mode=False)
        # print("Same:", torch.allclose(fc1_concat, fc1_concat1, rtol = 0.001))
        # print(torch.abs(fc1_concat - fc1_concat1).mean())
        # print(torch.abs(fc1_concat - fc1_concat1).max())
        # print()

        # q1 = self.q1(hidden_states).unsqueeze(0).expand(self.num_replicas, -1, -1, -1)
        # k1 = self.k1(p1)
        # attention_weight_1 = scaled_dot_product_attention(q1, k1, self.embed_dim)
        # attention_weight_1 = attention_weight_1 + self.attention_weight_bias

        # print("q1", q1.shape)
        # print("k1", k1.shape)
        mh_attn_weight = self.mh_attn(
            query=hidden_states_reshaped, key=p1
        )  # shape: (num_experts, batch_size, num_heads, seq_len, embed_dim)
        # print("attention_weight_1", attention_weight_1.shape)
        # print("mh_attn_weight", mh_attn_weight.shape)
        # print("Same:", torch.allclose(attention_weight_1, mh_attn_weight, rtol=0.001))
        # print(torch.abs(attention_weight_1 - mh_attn_weight).mean())
        # print(torch.abs(attention_weight_1 - mh_attn_weight).max())
        # print()

        # attention_weight_1_norm_expert = nn.functional.softmax(
        #     mh_attn_weight, dim=0
        # ).mean(dim = 2)
        # attention_weight_1_norm_feature = nn.functional.softmax(
        #     mh_attn_weight, dim=-1
        # ).mean(dim = 2)
        # mh_attn_weight = (
        #     3 * attention_weight_1_norm_expert + attention_weight_1_norm_feature
        # )
        mh_attn_weight = mh_attn_weight.mean(dim=2)
        # print(self.num_replicas)
        # print("q1", q1.shape)
        # print("k1", k1.shape)
        # print("fc1_concat", fc1_concat.shape)
        # print("attention_weight_1", attention_weight_1.shape)
        fc1_weighted = fc1_concat * mh_attn_weight
        hidden_states = fc1_weighted.mean(dim=0)
        # self.activation_fn
        # print((attention_weight_1.sum(dim=(1,2,3))))
        # hidden_states = nn.functional.dropout(
        #     hidden_states, p=self.activation_dropout, training=self.training
        # )

        # # Vectorized operation for fc2 layers
        # q2 = self.q2(hidden_states)
        # fc2_concat = torch.stack([fc2(hidden_states) for fc2 in self.fc2], dim=0)
        # p2 = torch.stack([fc2.weight.data for fc2 in self.fc2], dim=0)
        # p2 = p2.unsqueeze(2).expand(-1, -1, batch_size, -1)
        # k2 = self.k2(p2)
        # attention_weight_2 = scaled_dot_product_attention(q2, k2, self.embed_dim)
        # fc2_weighted = fc2_concat * attention_weight_2
        # hidden_states = fc2_weighted.mean(dim=0)

        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )

        hidden_states = hidden_states + residual
        hidden_states = self.final_layer_norm(hidden_states)

        return (hidden_states,) + outputs[1:]

## Load Model

In [5]:
# Load the pretrained BART model
model_name = "facebook/bart-large-cnn"
config = BartConfig.from_pretrained(model_name)
# print(config)
model = BartForConditionalGeneration.from_pretrained(model_name, config=config)
tokenizer = BartTokenizer.from_pretrained(model_name)

# model_save_path = "./BART_model"
# config = BartConfig.from_pretrained(model_save_path)
# # print(config)
# model = BartForConditionalGeneration.from_pretrained(model_save_path, config=config)
# tokenizer = BartTokenizer.from_pretrained(model_save_path)


# Check if CUDA GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

num_replicas = 3

# Replace all customized layers
for i, layer in enumerate(model.model.decoder.layers):    
    if i == len(model.model.decoder.layers) - 1:
        model.model.decoder.layers[i] = CustomBartDecoderLayer(model.config, layer)

model_weights_path = r"./model_weights/facebook/bart-large-cnn.pth"
#model_weights_path = r"./model_weights/facebook/bart-large-cnn2.pth"
# # Load model weights to the device
if torch.cuda.is_available():
    model.load_state_dict(torch.load(model_weights_path, map_location="cuda"))
else:
    model.load_state_dict(torch.load(model_weights_path, map_location="cpu"))

# Move the model to the specified device
model.to(device)

Using device: cuda


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [6]:
#Code Warehouse
# model_weigts_path = r"./model_weights/facebook/bart-large-cnn2.pth"
# model = BartForConditionalGeneration(config=config)

# for i, layer in enumerate(model.model.encoder.layers):
#     model.model.encoder.layers[i] = CustomBartEncoderLayer(model.config, layer)


# #save pretrained model weights
# torch.save(model.state_dict(), model_weigts_path)

In [7]:
total_params = sum(p.numel() for p in model.parameters())
print("Total number of parameters:", total_params)

Total number of parameters: 413636611


In [8]:
# # Freeze pretrained weight

# Step 1: Freeze all pretrained weights
for param in model.parameters():
    param.requires_grad = False
# Step 2: Unfreeze the weights in custom layers
for i in range(len(model.model.decoder.layers)):
    layer = model.model.decoder.layers[i]
    if i == len(model.model.decoder.layers) - 1 and isinstance(layer, CustomBartDecoderLayer):
#     if isinstance(layer, CustomBartDecoderLayer):
        for param in layer.parameters():
            param.requires_grad = True

    if isinstance(layer, BartDecoderLayer):
        for param in layer.parameters():
            param.requires_grad = False
            
#     if i == len(model.model.decoder.layers) - 1:
#         for param in layer.parameters():
#             param.requires_grad = True

## Inference / Generate Text

In [9]:
# # # Sample text to summarize
# text = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
# A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
# Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
# In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
# Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
# 2010 marriage license application, according to court documents.
# Prosecutors said the marriages were part of an immigration scam.
# On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
# After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
# Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
# All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
# Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
# Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
# The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
# Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
# Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
# If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18."""

# # Encode the text into tokens
# inputs = tokenizer([text], return_tensors="pt")
# # , max_length=1024

# # Move the input tensors to the same device as the model
# inputs = inputs.to(device)

# # Generate a summary of the encoded text
# summary_ids = model.generate(
#     inputs["input_ids"],
#     num_beams=4,
#     # max_length=51,
#     # early_stopping=True
# )

# # Decode the summary
# summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# print(summary)

In [10]:
# Prepare the batched input
input_texts = [
    "Who is the president of China?",
    "Who is the president of the US?",
    "Who is the president of Russia?",
    # """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.""",
    # """Your model's primary bottlenecks appear to be matrix multiplication and linear layer operations, both in terms of computation and possibly memory usage. Focusing your optimization efforts on these areas, along with minimizing unnecessary memory operations, could lead to significant improvements in performance. Remember, optimizations can sometimes affect model accuracy, so it's important to validate your model's performance after making any changes.""",
    # """Which technology was developed most recently? Options:
    # A. television
    # B. refrigerator
    # C. cellular telephone
    # D. airplane
    # Which is the correct answer?""",
]

inputs = tokenizer(
    input_texts,
    return_tensors="pt",
    padding=True,
    # truncation=True
)

# Move the input tensors to the same device as the model
inputs = inputs.to(device)

# Generate the output
model.eval()
with torch.no_grad():
    output_tokens = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=4,
        # max_length=20,  # Optionally set a max length if desired
    )

# Decode the generated tokens for each input in the batch
output_texts = [
    tokenizer.decode(token, skip_special_tokens=True) for token in output_tokens
]

# Print each output
for output in output_texts:
    print(output)

Who is the president of China? China is home to one third of the world’s Aussen Aussen Ministerial staff as it is also home to 40% of the nation’’. China is also seat of one of the largest military bases in the world.
Who is the president of the US? The correct choice is George W. Bush as it is conflicting with C which is who is really in charge of US foreign policy. The war is raging as both sides are competing for control of the affairs of the U.S.
Who is the president of Russia? The correct choice is C which is Russian president is Vladimir Putin as it is conflicting with C. Dmitry Medvedev is also president of Ukraine as he is heir to Russian empire as well as president of the Soviet Union. Putin is also head of Russia's Communist Party.


In [11]:
# from torch.profiler import profile, record_function, ProfilerActivity

# with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
#              record_shapes=True) as prof:
#     with record_function("model_inference"):
#         # Your model inference code here
#         outputs = model.generate(
#         inputs["input_ids"],
#         attention_mask=inputs["attention_mask"],
#         num_beams=4,
#         # max_length=50,  # Optionally set a max length if desired
#         )
#         # model(input_data)

# print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Training Model

## Load Data

### Pretrain Data

In [12]:
# import pandas as pd
# import random
# from torch.utils.data import Dataset

# class PretrainedDataset(Dataset):
#     def __init__(self, csv_file_paths, tokenizer, mask_probability=0.2, delete_probability=0.2):
#         if not isinstance(csv_file_paths, list):
#             raise TypeError("csv_file_paths should be a list of file paths.")

#         # Combine CSV files into one dataframe
#         self.data  = pd.concat(
#             [pd.read_csv(file_path, header=None) for file_path in csv_file_paths], 
#             ignore_index=True
#         )
#         self.data = self.data.values.squeeze(1)
        
# #         self.data = pd.read_csv(csv_file_path, header=None).iloc(:, 0).values.squeeze(1)
# #         [:100]
#         self.tokenizer = tokenizer
#         self.mask_probability = mask_probability
#         self.delete_probability = delete_probability

#     def __len__(self):
#         return len(self.data)

#     def mask_text(self, text):
#         tokens = text.split()
#         return ' '.join([token if random.random() > self.mask_probability else '[MASK]' for token in tokens])

#     def delete_end_text(self, text):
#         tokens = text.split()
#         cut_off = int(len(tokens) * random.uniform(0.7, 0.9))
#         return ' '.join(tokens[:cut_off])

#     def shuffle_text(self, text):
#         tokens = text.split()
#         random.shuffle(tokens)
#         return ' '.join(tokens)

#     def delete_random_text(self, text):
#         tokens = text.split()
#         return ' '.join([token if random.random() > self.delete_probability else '' for token in tokens])

#     def corrupt_text(self, text):
#         corruption_methods = [self.mask_text, self.delete_end_text, self.shuffle_text, self.delete_random_text]
#         corruption_method = random.choice(corruption_methods)
#         return corruption_method(text)

#     def __getitem__(self, idx):
#         text = self.data[idx]
#         corrupted_text = "Please reconstruct the original text: " + self.corrupt_text(text)

#         # Tokenize both original and corrupted text
#         encoding = self.tokenizer(text, truncation=True, padding='max_length', return_tensors='pt')
#         corrupted_encoding = self.tokenizer(corrupted_text, truncation=True, padding='max_length', return_tensors='pt')

#         return {
#             'input_ids': corrupted_encoding['input_ids'].squeeze(0),
#             'attention_mask': corrupted_encoding['attention_mask'].squeeze(0),
#             'labels': encoding['input_ids'].squeeze(0)
#         }
    
# train_paths = [
#     r"../data/1k ARC Corpus.csv",
#     r"../data/fillered2.csv",     
#     r"../data/1.5k_data1.csv",    
#     r"../data/1.5k_data2.csv",   
#     r"../data/wiki_1.csv"
# ]
# val_paths = [  
#     r"../data/1.5k_data3.csv",    
# ]
# train_ds = PretrainedDataset(train_paths, tokenizer)
# val_ds = PretrainedDataset(val_paths, tokenizer)

### Finetune Data

In [13]:
class FinetuneDataset(Dataset):
    def __init__(self, csv_file_paths, tokenizer):
        # Ensure csv_file_paths is a list
        if not isinstance(csv_file_paths, list):
            raise TypeError("csv_file_paths should be a list of file paths.")

        # Combine CSV files into one dataframe
        self.dataframe = pd.concat(
            [pd.read_csv(file_path) for file_path in csv_file_paths], 
            ignore_index=True
        )
#         .iloc[:10, :]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        question_choices = self.dataframe.iloc[idx, 0]
        answer = self.dataframe.iloc[idx, 1]

        # Tokenize without padding
        encoding = self.tokenizer(
            question_choices,
            truncation=True,
            return_tensors="pt",
            add_special_tokens=True,
        )
        label_encoding = self.tokenizer(
            answer, truncation=True, return_tensors="pt", add_special_tokens=True
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label_encoding["input_ids"].squeeze(0),
        }

csv_file_paths = [
    r"../data/train_df-arc_challenge.csv",
    r"../data/train_df-ARC-Easy.csv",
    r"../data/train_df_common.csv",
    r"../data/train_df-TRIP.csv",
]

val_csv_file_paths = [
     r"../data/valid_df-CE.csv",
     r"../data/valid_df-TRIP.csv",
     r"../data/valid_df-arc_challenge.csv",
     r"../data/valid_df-ARC-Easy.csv"
 ]

train_ds = FinetuneDataset(csv_file_paths, tokenizer)

val_ds = FinetuneDataset(val_csv_file_paths, tokenizer)

In [14]:
test_csv_file_paths = [
#      r"../data/valid_df-CE.csv",
#      r"../data/valid_df-TRIP.csv",
     r"../data/valid_df-arc_challenge.csv",
#      r"../data/valid_df-ARC-Easy.csv"
 ]

test_ds =FinetuneDataset(test_csv_file_paths, tokenizer)

## Configure DataLoader

In [15]:
def dynamic_padding_collate_fn(batch):
    max_length = max(
        max([len(sample["input_ids"]) for sample in batch]),
        max([len(sample["labels"]) for sample in batch]),
    )

    padded_batch = {
        "input_ids": torch.stack(
            [
                torch.cat(
                    [
                        sample["input_ids"],
                        torch.zeros(
                            max_length - len(sample["input_ids"]), dtype=torch.long
                        ),
                    ]
                )
                for sample in batch
            ]
        ),
        "attention_mask": torch.stack(
            [
                torch.cat(
                    [
                        sample["attention_mask"],
                        torch.zeros(
                            max_length - len(sample["attention_mask"]), dtype=torch.long
                        ),
                    ]
                )
                for sample in batch
            ]
        ),
        "labels": torch.stack(
            [
                torch.cat(
                    [
                        sample["labels"],
                        torch.zeros(
                            max_length - len(sample["labels"]), dtype=torch.long
                        ),
                    ]
                )
                for sample in batch
            ]
        ),
    }

    return padded_batch


bs = 8 # finetune dataset
# bs = 6 # finetune dataset
# bs = 3 # pretrain dataset
train_dataloader = DataLoader(
    train_ds, batch_size=bs, shuffle=True, collate_fn=dynamic_padding_collate_fn
)
val_dataloader = DataLoader(
    val_ds, batch_size=bs, shuffle=True, collate_fn=dynamic_padding_collate_fn
)
test_dataloader = DataLoader( test_ds, batch_size=bs, shuffle=True, collate_fn=dynamic_padding_collate_fn
)

### Verify Dataset

In [16]:
# Load a single batch from the DataLoader
data_iter = iter(train_dataloader)
batch = next(data_iter)

# Extracting data from the batch
input_ids = batch["input_ids"]
attention_masks = batch["attention_mask"]
labels = batch["labels"]

# Decode the token IDs back to text for a human-readable format
decoded_inputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]

# Display the information
print("Batch Content:\n")
for i in range(len(decoded_inputs)):
    print(f"Sample {i+1}:")
    print("Input:", decoded_inputs[i])
    print("Label:", decoded_labels[i])
    # print("Input IDs:", input_ids[i])
    # print("Attention Mask:", attention_masks[i])
    # print("Label IDs:", labels[i])
    print("\n" + "-" * 50 + "\n")

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [17]:
import subprocess
import re

try:
    nvidia_smi_output = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.total,memory.used,memory.free', '--format=csv,nounits,noheader']).decode('utf-8')
    
    # Parsing the output
    memory_info = nvidia_smi_output.strip().split('\n')[0].split(',')
    total_memory, used_memory, free_memory = [float(x) for x in memory_info]

    print(f"Total GPU Memory: {total_memory} MB")
    print(f"Used GPU Memory: {used_memory} MB")
    print(f"Free GPU Memory: {free_memory} MB")

except Exception as e:
    print("Failed to run nvidia-smi:", e)


if torch.cuda.is_available():
#     torch.cuda.empty_cache()  # Clear cache for a better measure of free memory
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 2)
    free_memory = torch.cuda.mem_get_info()[0] / (1024 ** 2)  # Returns (free, total) memory
    used_memory = total_memory - free_memory

    print(f"Total GPU Memory: {total_memory:.2f} MB")
    print(f"Used GPU Memory: {used_memory:.2f} MB")
    print(f"Free GPU Memory: {free_memory:.2f} MB")
else:
    print("CUDA is not available.")


Total GPU Memory: 46068.0 MB
Used GPU Memory: 2602.0 MB
Free GPU Memory: 42810.0 MB
Total GPU Memory: 45413.12 MB
Used GPU Memory: 2602.88 MB
Free GPU Memory: 42810.25 MB


In [18]:
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
# #optimizer = torch.optim.AdamW(model.parameters(), lr=7e-4)
# # optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
# num_epochs = 10

# lr_scheduler = get_scheduler(
#     name="linear",
#     optimizer=optimizer,
#     num_warmup_steps=0.07*num_epochs * len(train_dataloader),
#     num_training_steps=num_epochs * len(train_dataloader)
# )

# model.train()
# min_epoch_loss = 1
# # tqdm
# for epoch in (range(num_epochs)):  # Define num_epochs
#     epoch_loss = 0
#     epoch_loss_val=0
#     progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
#     for batch in progress_bar:
#         optimizer.zero_grad()
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)

#         # # Debugging: Print shapes
#         # print(f"Input IDs shape: {input_ids.shape}")
#         # print(f"Attention mask shape: {attention_mask.shape}")

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        
#         # # Convert model outputs to token IDs
#         # pred_token_ids = torch.argmax(outputs.logits, dim=-1)
#         # # Decode labels and outputs
#         # decoded_labels = [
#         #     tokenizer.decode(ids, skip_special_tokens=True) for ids in labels
#         # ]
#         # decoded_outputs = [
#         #     tokenizer.decode(ids, skip_special_tokens=True) for ids in pred_token_ids
#         # ]
#         # Debugging: Print decoded texts for verification
# #         for i in range(
# #             min(len(decoded_labels), 5)
# #         ):  # Adjust number of examples to print
# #             print(f"Example {i+1}")
# #             print("Label:", decoded_labels[i])
# #             print("Output:", decoded_outputs[i])
# #             print("\n" + "-" * 50 + "\n")

#         loss = outputs.loss
#         epoch_loss += loss.item()
#         loss.backward()
#         optimizer.step()
#         lr_scheduler.step() 
#         progress_bar.set_postfix(batch_loss=f"{(loss.item()/bs):.4f}")
            
#     progress_bar_val = tqdm(val_dataloader, desc=f"Epoch {epoch+1}")
#     for batch in progress_bar_val:
#         optimizer.zero_grad()
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)


#         loss_val = outputs.loss
#         epoch_loss_val += loss.item()
#         loss_val.backward()
#         optimizer.step()
#         lr_scheduler.step() 
#         progress_bar_val.set_postfix(batch_loss=f"{(loss.item()/bs):.4f}")       
            
            
            
            
            
            
#     epoch_loss_val /= len(val_dataloader)        
#     epoch_loss /= len(train_dataloader)
#     model_saving = ""
# #     if epoch_loss < min_epoch_loss:
# #         min_epoch_loss = epoch_loss
# #         torch.save(model.state_dict(), model_weights_path)
# #         model_saving = "Model has been saved"    
#     if epoch_loss_val < min_epoch_loss:
#         min_epoch_loss = epoch_loss_val
#         torch.save(model.state_dict(), model_weights_path)
#         model_saving = "Model has been saved" 
#     print(f"train Epoch {epoch+1} loss: {epoch_loss:.4f}. {model_saving}")
#     print(f"validation Epoch {epoch+1} loss: {epoch_loss_val:.4f}. {model_saving}")

# # 06:08
# # 0.1781
# # 0.0317

# # 0.1813
# # 0.1742
# # 0.1227
# # 12:00
# # 0.0118

# #pretrain:0.3316
# #finetune:0.0010

In [None]:
model.eval()

# Disable gradient calculations
with torch.no_grad():
    for batch in test_dataloader:  # Consider using a validation dataloader
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                num_beams=3,  # Beam search with 2 beams
#                 min_length=1,  # Set a min length if desired
#                 max_length=2,  # Set a max length if desired
                # Add more generation parameters if needed
            )
    
        input_ques = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
        decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]
        decoded_outputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]

        # Display decoded texts for verification (for the first example in the batch)
        for i in range(bs):
            print("Input: ", input_ques[i])
            print("Label: ", decoded_labels[i])
            print("Output: ", decoded_outputs[i])
            print()
        break
# 
#  

In [None]:
# # save pretrained model weights
# torch.save(model.state_dict(), model_weights_path)
# print(f"Model and tokenizer have been saved")

In [19]:
test_csv_file_paths = [
     r"../data/valid_df-CE.csv",
#        r"../data/valid_df-TRIP.csv",
#     r"../data/valid_df-arc_challenge.csv",
#       r"../data/valid_df-ARC-Easy.csv"
 ]

test_ds =FinetuneDataset(test_csv_file_paths, tokenizer)
test_dataloader = DataLoader( test_ds, batch_size=bs, shuffle=True, collate_fn=dynamic_padding_collate_fn
)

In [None]:
import evaluate
metric_acc = evaluate.load("accuracy")
f1_metric_ma = evaluate.load("f1")
f1_metric_mi = evaluate.load("f1")
f1_metric_no = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load('recall')

for batch in test_dataloader:  # Consider using a validation dataloader
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                num_beams=3,  # Beam search with 2 beams
#                 min_length=1,  # Set a min length if desired
#                 max_length=2,  # Set a max length if desired
                # Add more generation parameters if needed
            )
    
    input_ques = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
    decoded_labels = [ord(tokenizer.decode(ids, skip_special_tokens=True).split()[4])-65 for ids in labels]

    
    decoded_outputs = [ord(tokenizer.decode(ids, skip_special_tokens=True).split()[4])-65 for ids in outputs]
    
    references = decoded_labels
    predictions = decoded_outputs
    metric_acc.add_batch(predictions=predictions, references=references)
    f1_metric_ma.add_batch(predictions=predictions, references=references)
    f1_metric_mi.add_batch(predictions=predictions, references=references)
    f1_metric_no.add_batch(predictions=predictions, references=references)
    precision_metric.add_batch(predictions=predictions, references=references)
    recall_metric.add_batch(predictions=predictions, references=references)
    
    
    
score_acc = metric_acc.compute()
print(score_acc)
f1_macro = f1_metric_ma.compute(average="macro")
print(f1_macro)

f1_micro = f1_metric_mi.compute(average="micro")
print(f1_macro)

f1_None = f1_metric_no.compute(average=None)
print(f1_macro)

precision = precision_metric.compute(average="macro")
print(precision)


recall = recall_metric.compute(average="macro")
print(recall)


In [None]:
from datasets import load_metric 

metric_b = evaluate.load("bleu")
for batch in test_dataloader:  # Consider using a validation dataloader
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                num_beams=3,  # Beam search with 2 beams
#                 min_length=1,  # Set a min length if desired
#                 max_length=2,  # Set a max length if desired
                # Add more generation parameters if needed
            )
    
    input_ques = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
    decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]
    decoded_outputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
    
    references = decoded_labels
    predictions = decoded_outputs
    metric_b.add_batch(predictions=predictions, references=references)
    
        # Display decoded texts for verification (for the first example in the batch)
#     for i in range(bs):
#         print("Input: ", input_ques[i])
#         print("Label: ", decoded_labels[i])
#         print("Output: ", decoded_outputs[i])
#         print()
#     break

score_b = metric_b.compute()
print(score_b)

In [None]:
!pip install evaluate

In [None]:
!pip install rouge_score

In [21]:
from datasets import load_metric 
import evaluate
metric = evaluate.load('rouge')
for batch in test_dataloader:  # Consider using a validation dataloader
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                num_beams=3,  # Beam search with 2 beams
#                 min_length=1,  # Set a min length if desired
#                 max_length=2,  # Set a max length if desired
                # Add more generation parameters if needed
            )
    
    input_ques = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
    decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]
    decoded_outputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
    
    references = decoded_labels
    predictions = decoded_outputs
    metric.add_batch(predictions=predictions, references=references)
    
        # Display decoded texts for verification (for the first example in the batch)
#     for i in range(bs):
#         print("Input: ", input_ques[i])
#         print("Label: ", decoded_labels[i])
#         print("Output: ", decoded_outputs[i])
#         print()
#     break

score = metric.compute()
print(score)

{'rouge1': 0.14927265620147687, 'rouge2': 0.008849277894255509, 'rougeL': 0.14889841463764086, 'rougeLsum': 0.14915796423343156}
