## Importing libraries and tokenizer

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5ForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import gc

In [None]:
# Load the tokenizer published with the google/flan-t5-large checkpoint.
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path="google/flan-t5-large",
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Pick the best available accelerator and fall back to the CPU.
# PyTorch's device type for NVIDIA GPUs is "cuda"; "gpu" is not a valid device
# type and torch.device("gpu") raises a RuntimeError.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

## Cleaning Dataset and EDA

In [None]:
# class: prepare dataset
# method: clean_dataset (involves balancing the classes and removing NaNs)
# method: preparing tensors from the cleaned dataset
# method: splitting the data
# method: preparing the data loaders

class prepare_data:
    """Turn the labelled tweet CSV into balanced, tokenized train/test tensors.

    Typical flow: ``clean_dataset`` -> ``prepare_tensors`` -> ``split_data`` ->
    ``load_tensors`` -> ``load_loader``.

    Args:
        tokenizer: Callable tokenizer; ``tokenizer(text).input_ids`` must be a
            list of token ids, and it must expose ``model_max_length`` and
            ``eos_token_id``.
        train_size: Fraction of rows intended for the training split.
        path: Path of the CSV file with ``tweet`` and ``label_majority`` columns.
        random_state: Seed used when down-sampling so the balanced dataset is
            reproducible across runs.
    """

    def __init__(self, tokenizer, train_size, path, random_state=42):
        self.tokenizer = tokenizer
        self.train_size = train_size
        self.path = path
        self.random_state = random_state

    def clean_dataset(self):
        """Read the CSV, drop incomplete rows and balance the three classes.

        The two larger classes are down-sampled to the size of the
        ``neutral-or-unclear`` class.

        Returns:
            DataFrame with only the ``tweet`` and ``label_majority`` columns.
        """
        df = pd.read_csv(self.path).dropna(subset=['tweet', 'label_majority'])
        df_infavor = df[df['label_majority'] == 'in-favor']
        df_against = df[df['label_majority'] == 'against']
        df_neutral = df[df['label_majority'] == 'neutral-or-unclear']

        target_size = len(df_neutral)
        df_infavor = df_infavor.sample(n=target_size, random_state=self.random_state)
        df_against = df_against.sample(n=target_size, random_state=self.random_state)

        df_finetuning = pd.concat([df_infavor, df_against, df_neutral])
        return df_finetuning[['tweet', 'label_majority']]

    @staticmethod
    def _build_prompt(tweet):
        """Wrap one tweet in the instruction prompt used for fine-tuning."""
        # The whitespace inside the literal is part of the prompt text.
        return f""" What is the stance of the following tweet with respect to COVID-19 vaccine?
                                        Here is the tweet. "{tweet}"
                                        Please use exactly one word from the following 3 categories to label it: "in-favor", "against", "neutral-or-unclear" """

    def prepare_tensors(self, df):
        """Tokenize prompts and labels from ``df`` and pad them to equal length.

        Args:
            df: DataFrame with ``tweet`` and ``label_majority`` columns
                (for example the output of ``clean_dataset``).

        Returns:
            Tuple ``(X, y, dic)``: ``X`` is a list of padded prompt token-id
            lists, ``y`` the padded label token-id list for each row, and
            ``dic`` maps each label string to its padded token ids.
        """
        list_tweets = list(df['tweet'])
        list_labels = list(df['label_majority'])

        prompts = [self._build_prompt(tweet) for tweet in list_tweets]

        # Truncate to the longest sequence the tokenizer reports as supported.
        token_limit = self.tokenizer.model_max_length
        list_tweets_tokenized = [
            self.tokenizer(prompt).input_ids[:token_limit] for prompt in prompts
        ]

        max_length = max((len(ids) for ids in list_tweets_tokenized), default=0)
        # NOTE(review): padding uses the EOS id, as the original code did;
        # confirm this is intended rather than tokenizer.pad_token_id.
        pad_token_id = self.tokenizer.eos_token_id
        X = [
            ids + [pad_token_id] * (max_length - len(ids))
            for ids in list_tweets_tokenized
        ]

        # Sorted so the mapping order does not depend on set iteration order.
        dic = {
            label: list(self.tokenizer(label).input_ids)
            for label in sorted(set(list_labels))
        }
        max_len = max((len(ids) for ids in dic.values()), default=0)
        for label in dic:
            dic[label] = dic[label] + [pad_token_id] * (max_len - len(dic[label]))

        y = [dic[label] for label in list_labels]

        return X, y, dic

    @staticmethod
    def split_data(feature_arr, label_arr, train_size):
        """Shuffle and split features/labels with a fixed seed.

        Returns:
            ``X_train, X_test, y_train, y_test``.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            feature_arr, label_arr, train_size=train_size, random_state=42, shuffle=True
        )
        return X_train, X_test, y_train, y_test

    @staticmethod
    def load_tensors(X_train, y_train, X_test, y_test):
        """Convert the four splits to ``torch.Tensor`` objects, order preserved."""
        X_train = torch.tensor(X_train)
        y_train = torch.tensor(y_train)
        X_test = torch.tensor(X_test)
        y_test = torch.tensor(y_test)
        return X_train, y_train, X_test, y_test

    @staticmethod
    def load_loader(X, y, batch_size, shuffle=True):
        """Wrap feature/label tensors in a ``DataLoader``."""
        dataset = TensorDataset(X, y)
        # Use the dataset built above, not a notebook-level global.
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
        return loader


In [5]:
# Read the labelled tweets from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='Q2_20230202_majority 1.csv')

In [6]:
# Count rows per label_majority value to inspect the class balance.
df['label_majority'].value_counts()

label_majority
in-favor              2907
against               1804
neutral-or-unclear    1040
Name: count, dtype: int64

In [None]:
# One sub-frame per stance label.
df_infavor = df.loc[df['label_majority'].eq('in-favor')]
df_against = df.loc[df['label_majority'].eq('against')]
df_neutral = df.loc[df['label_majority'].eq('neutral-or-unclear')]

In [8]:
# Down-sample the two larger classes to the size of the neutral class.
# A fixed seed keeps the balanced dataset identical across kernel restarts;
# 42 matches the seed already used by split_data.
df_infavor = df_infavor.sample(n=len(df_neutral), random_state=42)
df_against = df_against.sample(n=len(df_neutral), random_state=42)

In [9]:
# Stack the balanced per-class frames and keep only the text and label columns.
df_finetuning = pd.concat(
    [df_infavor, df_against, df_neutral]
).loc[:, ['tweet', 'label_majority']]

In [10]:
# Plain Python lists of the tweet texts and their labels, in row order.
list_tweets = df_finetuning['tweet'].tolist()
list_labels = df_finetuning['label_majority'].tolist()

In [11]:
# Encode each label string as an integer id and wrap every id in its own list.
le = LabelEncoder()
y = [[class_id] for class_id in le.fit_transform(list_labels)]

In [12]:
# Tokenize every tweet and cut each id list at the tokenizer's model_max_length.
max_token_length = tokenizer.model_max_length
list_tweets_tokenized = list(
    map(lambda text: tokenizer(text).input_ids[:max_token_length], list_tweets)
)

In [13]:
# Right-pad every id list to the length of the longest one, using the EOS id as filler.
max_token_length = max(map(len, list_tweets_tokenized))
pad_token_id = tokenizer.eos_token_id
X = [
    token_ids + [pad_token_id] * (max_token_length - len(token_ids))
    for token_ids in list_tweets_tokenized
]

## splitting training and testing set

In [14]:
def split_data(feature_arr, label_arr, train_size):
  """Shuffle and split features and labels with a fixed seed (42).

  Returns the tuple ``(X_train, X_test, y_train, y_test)``.
  """
  parts = train_test_split(
      feature_arr,
      label_arr,
      train_size=train_size,
      random_state=42,
      shuffle=True,
  )
  return tuple(parts)

In [15]:
# Keep 80% of the rows for training; the remainder becomes the test split.
train_size = 0.8
X_train, X_test, y_train, y_test = split_data(
    feature_arr=X, label_arr=y, train_size=train_size
)

## Loading the final tensor dataset 

In [16]:
# Convert each split to a torch tensor, keeping the same variable names.
X_train, y_train, X_test, y_test = (
    torch.tensor(split) for split in (X_train, y_train, X_test, y_test)
)

In [17]:
# Pair features with labels and serve them in shuffled batches of 8.
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)
# test_dataset = TensorDataset(X_test,y_test)
# test_loader = DataLoader(test_dataset, batch_size=8,shuffle=True)

## Loading the model, attaching a classification head, and replacing Linear layers with LoRA-wrapped Linear layers

In [18]:
# Number of stance classes in the dataset.
num_classes = 3

# Load the google/flan-t5-large checkpoint; device_map="auto" is forwarded to from_pretrained.
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="google/flan-t5-large",
    device_map="auto",
)

In [19]:
import math

class LoRALayer(torch.nn.Module):
    """Low-rank adapter computing ``alpha * (x @ A @ B)``.

    Args:
        in_dim: Size of the last dimension of the input.
        out_dim: Size of the last dimension of the output.
        rank: Inner dimension shared by ``A`` and ``B``.
        alpha: Scalar multiplier applied to the low-rank product.

    ``B`` starts at zero, so a freshly created layer outputs zeros and leaves
    the wrapped model's behaviour unchanged until training updates it.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.A = torch.nn.Parameter(torch.zeros(in_dim, rank))
        torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))  # similar to standard weight initialization
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        return self.alpha * (x @ self.A @ self.B)

    def delta_weight(self):
        """Return the dense ``(in_dim, out_dim)`` update ``alpha * (A @ B)``.

        This used to be exposed by overriding ``parameters()``, which broke the
        ``nn.Module.parameters()`` contract (an iterator over Parameters).
        """
        return self.alpha * (self.A @ self.B)

    def extra_repr(self):
        # Makes print(model) show the adapter shape instead of a bare "LoRALayer()".
        return f"in_dim={self.A.shape[0]}, out_dim={self.B.shape[1]}, rank={self.A.shape[1]}, alpha={self.alpha}"


class LinearWithLoRA(torch.nn.Module):
    """Wrap an existing ``nn.Linear`` and add a trainable LoRA branch to its output.

    Args:
        linear: The linear layer to wrap; it is used as-is.
        rank: LoRA rank.
        alpha: LoRA scaling factor.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    @property
    def weight(self):
        """Weight of the wrapped linear layer.

        Kept so code that inspects ``layer.weight`` (for example its dtype or
        device) still works on the wrapper. A property always reflects the live
        parameter, unlike a tensor captured once at construction time.
        """
        return self.linear.weight

    def forward(self, x):
        """Return the base linear output plus the LoRA correction."""
        return self.linear(x) + self.lora(x)

In [21]:
def replace_linear_with_lora(model, rank, alpha):
    """Recursively replace every ``nn.Linear`` under ``model`` with ``LinearWithLoRA``.

    The replacement happens in place on ``model``. Modules that are already
    ``LinearWithLoRA`` are skipped, so running this twice (for example by
    re-executing the cell) does not stack a second adapter on the inner linear.

    Args:
        model: Root module to patch.
        rank: LoRA rank passed to each wrapper.
        alpha: LoRA scaling factor passed to each wrapper.
    """
    # Snapshot the children first because setattr mutates the module registry.
    for name, module in list(model.named_children()):
        if isinstance(module, LinearWithLoRA):
            # Already wrapped; do not descend into its inner linear layer.
            continue
        if isinstance(module, torch.nn.Linear):
            # Replace the Linear layer with LinearWithLoRA
            setattr(model, name, LinearWithLoRA(module, rank, alpha))
        else:
            # Recursively apply the same function to child modules
            replace_linear_with_lora(module, rank, alpha)

In [22]:

# Report the trainable parameter count, freeze every parameter, then report again.
total_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(f"Total trainable parameters before: {total_params:,}")

# Module.requires_grad_ flips the flag on every parameter of the model.
model.requires_grad_(False)

total_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(f"Total trainable parameters after: {total_params:,}")

Total trainable parameters before: 783,150,080
Total trainable parameters after: 0


In [23]:
# Wrap every nn.Linear in the (already frozen) model with a LoRA adapter.
replace_linear_with_lora(model, rank=16, alpha=16)

In [None]:
# model.lm_head = torch.nn.Linear(in_features=1024, out_features=num_classes,bias=False)
# After wrapping, the only parameters with requires_grad=True are the LoRA matrices.
trainable = (weight for weight in model.parameters() if weight.requires_grad)
total_params = sum(weight.numel() for weight in trainable)
print(f"Total trainable LoRA parameters: {total_params:,}")

Total trainable LoRA parameters: 18,287,616


In [25]:
# Move the model (including the newly added LoRA parameters) to the selected device.
model = model.to(device)

In [26]:
# Display the module tree; the output shows the LinearWithLoRA wrappers.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): LinearWithLoRA(
                (linear): Linear(in_features=1024, out_features=1024, bias=False)
                (lora): LoRALayer()
              )
              (k): LinearWithLoRA(
                (linear): Linear(in_features=1024, out_features=1024, bias=False)
                (lora): LoRALayer()
              )
              (v): LinearWithLoRA(
                (linear): Linear(in_features=1024, out_features=1024, bias=False)
                (lora): LoRALayer()
              )
              (o): LinearWithLoRA(
                (linear): Linear(in_features=1024, out_features=1024, bias=False)
                (lora): LoRALayer()
              )
              (relative_attention_bias): Em

## Fine-tuning loop

In [None]:
# class: fine-tuning
# method: fine-tune the model
# attributes: loss, accuracy, number of epochs (passed as an argument), etc.

class tuning_and_inference:
    """Fine-tune a seq2seq model and generate stance labels with it.

    Args:
        optimizer: Optimizer already bound to the model's parameters.
        loss_fn: Callable ``loss_fn(logits_2d, targets_1d)`` returning a scalar
            loss (for example ``torch.nn.CrossEntropyLoss()``).
        no_of_epochs: Number of passes over ``train_loader``.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        model: Model to train and query. Defaults to the notebook-level ``model``.
        train_loader: Iterable of ``(features, labels)`` batches. Defaults to
            the notebook-level ``train_loader``.
        device: Device batches are moved to. Defaults to the notebook-level
            ``device`` (or ``"cpu"`` when none is defined).

    Attributes:
        epoch_losses: Mean training loss of each epoch from the last
            ``fine_tune`` call.
    """

    # Labels accepted when the caller does not supply its own mapping.
    DEFAULT_LABELS = ("in-favor", "against", "neutral-or-unclear")

    def __init__(self, optimizer, loss_fn, no_of_epochs, tokenizer,
                 model=None, train_loader=None, device=None):
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.no_of_epochs = no_of_epochs
        self.tokenizer = tokenizer
        # The original code read these from notebook globals; keep that as the
        # fallback while allowing them to be passed explicitly.
        notebook_globals = globals()
        self.model = model if model is not None else notebook_globals.get("model")
        self.train_loader = (
            train_loader if train_loader is not None else notebook_globals.get("train_loader")
        )
        self.device = device if device is not None else notebook_globals.get("device", "cpu")
        self.epoch_losses = []

    def fine_tune(self):
        """Run the training loop for ``no_of_epochs`` epochs.

        Each step does zero_grad -> forward -> loss -> backward -> step.

        Returns:
            List with the mean training loss of every epoch.
        """
        self.epoch_losses = []
        for epoch in range(self.no_of_epochs):
            self.model.train()
            running_loss = 0.0
            iteration_number = 0
            for features, labels in self.train_loader:
                features = features.to(self.device)
                labels = labels.to(self.device)

                self.optimizer.zero_grad()
                outputs = self.model(input_ids=features, labels=labels)
                logits = outputs.logits
                # Flatten (batch, target_len, vocab) -> (batch * target_len, vocab).
                loss = self.loss_fn(
                    logits.reshape(-1, logits.size(-1)), labels.reshape(-1)
                )
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()
                iteration_number += 1

            mean_loss = running_loss / max(iteration_number, 1)
            self.epoch_losses.append(mean_loss)
            print(f"epoch {epoch + 1}/{self.no_of_epochs} - mean loss: {mean_loss:.4f}")
            # Release cached GPU blocks once per epoch rather than on every step.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return self.epoch_losses

    def generate_category(self, tweet, token_mapper=None):
        """Generate the label text for one prompt.

        Args:
            tweet: Prompt text to classify.
            token_mapper: Container of valid label strings (a dict keyed by
                label works). Defaults to ``DEFAULT_LABELS``.

        Returns:
            The decoded label, or ``""`` when the generation is not a valid
            label (the raw generation is printed in that case).
        """
        valid_labels = self.DEFAULT_LABELS if token_mapper is None else token_mapper
        encoded = self.tokenizer(
            tweet,
            return_tensors="pt",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
        )
        input_ids = encoded.input_ids.to(self.device)

        self.model.eval()
        with torch.no_grad():
            output_tokens = self.model.generate(input_ids)

        # skip_special_tokens drops start/pad/EOS markers without assuming
        # where they sit in the sequence.
        predicted_label = self.tokenizer.decode(
            output_tokens[0], skip_special_tokens=True
        ).strip()
        if predicted_label not in valid_labels:
            print(predicted_label)
            return ""
        return predicted_label

    def generate_predicted_labels(self, df, token_mapper=None):
        """Generate a label for every row of ``df``.

        Args:
            df: DataFrame with a ``tweet`` column.
            token_mapper: Forwarded to ``generate_category``.

        Returns:
            List of predicted labels in row order.
        """
        label_preds = []
        for tweet in df['tweet']:
            prompt = f""" What is the stance of the following tweet with respect to COVID-19 vaccine?
                        Here is the tweet. "{tweet}"
                        Please use exactly one word from the following 3 categories to label it: 
                        - "in-favor", 
                        - "against", 
                        - "neutral-or-unclear" 
                        Only output the **exact** category name and nothing else. 
                        """
            label_preds.append(self.generate_category(prompt, token_mapper))
        return label_preds


In [28]:
# Training hyperparameters: two epochs, cross-entropy loss, Adam with lr = 1e-4.
no_of_epochs = 2
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

## Inference

In [None]:
# class: inference
# method: run inference on a list of inputs and return the outputs
# method: evaluate accuracy between the predicted and real labels

In [51]:
def generate_category(tweet,max_tokens,mapper):
  """Predict the stance label of one tweet.

  Assumes the model was fine-tuned with ``mapper``'s integer class ids as
  single-token targets, as the training cells in this notebook do
  (``y = [[class_id]]``). The prediction is therefore read from the first
  decoding step, restricted to the ids of the known classes.

  Args:
    tweet: Text to classify.
    max_tokens: Maximum number of input tokens kept.
    mapper: Fitted label encoder exposing ``classes_`` and ``inverse_transform``.

  Returns:
    The predicted label string.
  """
  # input_ids is (batch, tokens): truncate along the token axis, not the batch axis.
  input_ids = tokenizer(tweet, return_tensors="pt").input_ids[:, :max_tokens].to(model.device)
  # A seq2seq forward pass needs decoder inputs; feed only the start token.
  decoder_input_ids = torch.full(
      (input_ids.size(0), 1),
      model.config.decoder_start_token_id,
      dtype=torch.long,
      device=input_ids.device,
  )
  model.eval()
  with torch.no_grad():
    logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
  # Only class ids are valid answers, so ignore the rest of the vocabulary.
  class_id = int(logits[0, 0, :len(mapper.classes_)].argmax())
  return mapper.inverse_transform([class_id])[0]

In [None]:
# Predict a stance label for every tweet in the dataset.
# Collect each prediction itself; the earlier loop appended the list to itself.
label_preds = [
    generate_category(tweet, tokenizer.model_max_length, le)
    for tweet in df['tweet']
]

In [None]:
# Attach the predictions as a new column (this mutates df in place).
df["label_pred"] = label_preds

In [None]:
# Write the frame, including the predictions column, to disk.
df.to_csv('final_output.csv')

## Saving the model files

In [None]:
# Save the weights on their own, then the whole pickled model object.
torch.save(obj=model.state_dict(), f='model_weights.pth')
torch.save(obj=model, f='model.pth')

In [None]:
# freeze the weights
# replace the final layer with a classification layer
# initialize LoRA layers and don't freeze their weights (they are trainable by default)
# model.train()
# do the typical training loop with optimizer.zero_grad(), loss.backward(), optimizer.step()

In [None]:
# training loop

# set to training
# initialise optimizer
# no of epochs 
# define loss function 

# in each iteration
# optimizer.zero_grad()
# forward pass
# loss.backward 
# optimizer.step 

# set to evaluation
# evaluate on test data

In [None]:
# Learnings: 

# model.eval()
# 	• Disables dropout and sets batch norm to inference mode.
# 	• Does NOT disable gradient tracking.

# torch.no_grad()
# 	Disables gradient tracking (no .grad, saves memory, speeds up inference).

# model.train()
# 	• Enables dropout and sets batch norm to training mode.

# optimizer.zero_grad()
# 	Resets all accumulated gradients to 0 before the next backward pass.

# importance of handling hardware and removing intermediate tensors
# importance of researching the fine-tuning script beforehand


In [None]:
# make it into an OOP structure

# do we need a validation set, and why? What are the different hyperparameters? Epochs and batch size; for LoRA specifically, rank and alpha
# the validation set in the code is actually a test set

In [190]:
# hyperparameters - learning rate, weight decay
# model structure - rank, alpha