### Importing libraries

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5ForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import gc

In [3]:
# Load the tokenizer that belongs to the google/flan-t5-large checkpoint.
# (Only the tokenizer is created here; the model itself is loaded in a later cell.)
# It is used below to tokenize both the prompts and the label strings.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Choose the compute device: CUDA when present, otherwise Apple MPS, otherwise
# the CPU. `device` ends up as a torch.device instance.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

### Preparing data


In [5]:
# Read the annotated tweets from a CSV located relative to the working directory.
# Later cells read the 'tweet' and 'label_majority' columns of this frame.
df = pd.read_csv('Q2_20230202_majority 1.csv')

In [6]:
# Show how many rows carry each value of `label_majority` (class balance check).
df['label_majority'].value_counts()

label_majority
in-favor              2907
against               1804
neutral-or-unclear    1040
Name: count, dtype: int64

In [7]:
# One frame per stance label; rows keep their original order and columns.
df_infavor = df.loc[df['label_majority'].eq('in-favor')]
df_against = df.loc[df['label_majority'].eq('against')]
df_neutral = df.loc[df['label_majority'].eq('neutral-or-unclear')]

In [8]:
# Downsample the two larger classes to the size of the 'neutral-or-unclear'
# class so the fine-tuning set is balanced.
# A fixed random_state makes the chosen subset reproducible across
# "Restart & Run All"; without it every run trains and evaluates on different
# rows even though the later train/test split is seeded with random_state=42.
df_infavor = df_infavor.sample(n=len(df_neutral), random_state=42)
df_against = df_against.sample(n=len(df_neutral), random_state=42)

In [9]:
# Stack the three class subsets and keep only the tweet text and its label.
df_finetuning = pd.concat(
    [df_infavor, df_against, df_neutral]
).loc[:, ['tweet', 'label_majority']]

In [10]:
# Plain Python lists of the raw tweet texts and their labels, in frame order.
list_tweets = df_finetuning['tweet'].tolist()
list_labels = df_finetuning['label_majority'].tolist()

In [11]:
def build_prompt(tweet):
    """Wrap a raw tweet in the stance-classification instruction prompt.

    Args:
        tweet: The tweet text to classify.

    Returns:
        The full prompt string. The template text, including its leading and
        trailing whitespace, is exactly the one this notebook has always used.
    """
    return f"""
        What is the stance of the following tweet with respect to COVID-19 vaccine?
        Here is the tweet. "{tweet}"

        Please use exactly one word from the following 3 categories to label it:
        - "in-favor",
        - "against",
        - "neutral-or-unclear"

        Only output the **exact** category name and nothing else.
        """


# Build the prompts from the source column rather than rewriting `list_tweets`
# in place: the in-place version wrapped already-wrapped prompts a second time
# whenever this cell was executed again.
list_tweets = [build_prompt(tweet) for tweet in df_finetuning['tweet']]

In [12]:
# Tokenize every prompt exactly once and derive the masks and ids from that
# single pass (previously each prompt was run through the tokenizer three times).
max_token_length = tokenizer.model_max_length
list_tweets_sequences = [tokenizer(tweet) for tweet in list_tweets]
# Ids and masks are cut to the same length so they always line up position by
# position; slicing also yields fresh lists independent of the encodings.
list_tweets_masks = [encoding.attention_mask[:max_token_length] for encoding in list_tweets_sequences]
list_tweets_tokenized = [encoding.input_ids[:max_token_length] for encoding in list_tweets_sequences]

In [13]:
# Right-pad every id list with the pad token up to the longest prompt.
# Note: `max_token_length` is rebound here from the model limit to the longest
# tokenized prompt in the dataset.
pad_token_id = tokenizer.pad_token_id
max_token_length = max(map(len, list_tweets_tokenized))
X = [
    token_ids + [pad_token_id] * (max_token_length - len(token_ids))
    for token_ids in list_tweets_tokenized
]

In [14]:
# Map each distinct label string to its token ids, right-padded with the pad
# token so that all label sequences share one length (`max_len`).
unique_values = list(set(list_labels))

dic = {}
for val in unique_values:
    # input_ids has a leading batch axis of size 1; take that single row.
    dic[val] = tokenizer(val, return_tensors="pt").input_ids[0].tolist()

max_len = max((len(token_ids) for token_ids in dic.values()), default=0)

for val in dic:
    missing = max_len - len(dic[val])
    dic[val] = dic[val] + [pad_token_id] * missing

In [15]:
# Target sequence (padded label token ids) for every example, in the same
# order as the prompts.
y = list(map(dic.__getitem__, list_labels))

### Splitting training and testing 

In [16]:
def split_data(feature_arr, label_arr, train_size):
    """Shuffle and split features and labels into train and test partitions.

    Args:
        feature_arr: Sequence of feature rows.
        label_arr: Sequence of labels aligned with ``feature_arr``.
        train_size: Passed through to ``train_test_split`` as ``train_size``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The split is seeded
        with ``random_state=42``.
    """
    partitions = train_test_split(
        feature_arr,
        label_arr,
        train_size=train_size,
        shuffle=True,
        random_state=42,
    )
    features_train, features_test, labels_train, labels_test = partitions
    return features_train, features_test, labels_train, labels_test

In [17]:
# Keep 95% of the balanced examples for training and hold out the rest.
train_size = 0.95
X_train, X_test, y_train, y_test = split_data(
    feature_arr=X,
    label_arr=y,
    train_size=train_size,
)

In [18]:
# Convert the four list-of-lists partitions into tensors (same names rebound).
X_train, y_train, X_test, y_test = (
    torch.tensor(partition)
    for partition in (X_train, y_train, X_test, y_test)
)

In [19]:
# Pair inputs with targets, then batch them (4 examples per batch, shuffled).
train_dataset, test_dataset = (
    TensorDataset(X_train, y_train),
    TensorDataset(X_test, y_test),
)
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=True)

In [20]:
# Number of stance categories used in this notebook.
num_classes = 3

# Pretrained FLAN-T5 (large) sequence-to-sequence model; device placement is
# delegated to `device_map="auto"` at load time.
model = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-large",
    device_map="auto",
)

In [21]:
import math

class LoRALayer(torch.nn.Module):
    """Low-rank update ``alpha * (x @ A @ B)``.

    ``A`` has shape ``(in_dim, rank)`` and is initialised with Kaiming-uniform
    noise; ``B`` has shape ``(rank, out_dim)`` and starts at zero, so a fresh
    layer outputs zeros.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        down_projection = torch.empty(in_dim, rank)
        # Same scheme as the default weight init of torch.nn.Linear.
        torch.nn.init.kaiming_uniform_(down_projection, a=math.sqrt(5))
        self.A = torch.nn.Parameter(down_projection)
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        projected = torch.matmul(x, self.A)
        return self.alpha * torch.matmul(projected, self.B)

    def parameters(self):
        """Return the dense update matrix ``alpha * (A @ B)``.

        NOTE(review): this shadows ``torch.nn.Module.parameters`` and returns a
        tensor of shape ``(in_dim, out_dim)`` instead of an iterator over
        parameters. Code that calls ``.parameters()`` on this layer directly
        (for example to build an optimizer) gets that tensor.
        """
        return self.alpha * torch.matmul(self.A, self.B)

In [22]:
class LinearWithLoRA(torch.nn.Module):
    """Wrap a linear layer and add a trainable low-rank (LoRA) term to its output.

    Args:
        linear: The existing linear layer to wrap; it is kept as ``self.linear``.
        rank: Rank of the low-rank update.
        alpha: Scaling factor applied to the low-rank update.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    @property
    def weight(self):
        """Weight of the wrapped linear layer.

        Exposed so that code treating this module like a plain ``nn.Linear``
        (for example reading ``.weight.dtype`` or ``.weight.device``) keeps
        working. Previously ``weight`` was a dense ``in x out`` snapshot of the
        LoRA update taken at construction time: it was all zeros (``B`` starts
        at zero), was never refreshed during training, was not moved by
        ``.to(device)``, and cost one extra full-size matrix per wrapped layer.
        """
        return self.linear.weight

    def forward(self, x):
        """Return the wrapped layer's output plus the low-rank correction."""
        return self.linear(x) + self.lora(x)

In [23]:
def replace_linear_with_lora(model, rank, alpha):
    """Recursively replace every ``torch.nn.Linear`` child with ``LinearWithLoRA``.

    The module tree is modified in place. Layers that are already wrapped are
    skipped, so calling this twice (e.g. re-running the notebook cell) does not
    wrap the inner linear layer of an existing wrapper a second time.

    Args:
        model: Root module whose descendants are rewritten.
        rank: LoRA rank passed to each ``LinearWithLoRA``.
        alpha: LoRA scaling factor passed to each ``LinearWithLoRA``.
    """
    # Iterate over a snapshot because children are replaced inside the loop.
    for name, module in list(model.named_children()):
        if isinstance(module, LinearWithLoRA):
            # Already adapted: descending would find its `.linear` child and
            # nest a second wrapper around it.
            continue
        if isinstance(module, torch.nn.Linear):
            # Replace the Linear layer with LinearWithLoRA
            setattr(model, name, LinearWithLoRA(module, rank, alpha))
        else:
            # Recursively apply the same function to child modules
            replace_linear_with_lora(module, rank, alpha)

In [24]:
def _count_trainable(module):
    """Return the number of scalar parameters in ``module`` that require gradients."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad)


total_params = _count_trainable(model)
print(f"Total trainable parameters before: {total_params:,}")

# Freeze every existing weight of the pretrained model.
for param in model.parameters():
    param.requires_grad_(False)

total_params = _count_trainable(model)
print(f"Total trainable parameters after: {total_params:,}")

Total trainable parameters before: 783,150,080
Total trainable parameters after: 0


In [25]:
# Wrap every torch.nn.Linear inside the model with LinearWithLoRA
# (rank 16, alpha 16). The model is modified in place.
replace_linear_with_lora(model, rank=16, alpha=16)

In [26]:
# model.lm_head = torch.nn.Linear(in_features=1024, out_features=num_classes,bias=False)
# Count the parameters that still require gradients after the LoRA wrapping.
total_params = 0
for p in model.parameters():
    if p.requires_grad:
        total_params += p.numel()
print(f"Total trainable LoRA parameters: {total_params:,}")

Total trainable LoRA parameters: 18,814,976


In [27]:
# Move the model, including the newly created LoRA parameters, to the
# selected device.
model = model.to(device)

In [28]:
def generate_category(tweet, max_tokens, mapper, device):
    """Generate a stance label for one prompt with the global ``model``.

    Args:
        tweet: The full prompt text to feed to the model.
        max_tokens: Maximum number of input tokens kept; longer prompts are
            truncated along the sequence axis.
        mapper: Container of valid label strings (membership is tested with
            ``in``).
        device: Device (string or ``torch.device``) the inputs are moved to;
            it must be the device the model currently lives on.

    Returns:
        The decoded label if it is contained in ``mapper``, otherwise ``""``.
    """
    encoded = tokenizer(tweet, return_tensors="pt")
    # input_ids has shape (1, seq_len): truncate the sequence axis. Slicing
    # with `[:max_tokens]` would cut the batch axis and truncate nothing.
    input_ids = encoded.input_ids[:, :max_tokens].to(device)
    attention_mask = encoded.attention_mask[:, :max_tokens].to(device)
    output_tokens = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    del input_ids, attention_mask

    # Release cached blocks only on the backend that is really in use; the MPS
    # call is not usable on machines without an MPS backend.
    device_type = torch.device(device).type
    if device_type == "cuda":
        torch.cuda.empty_cache()
    elif device_type == "mps":
        torch.mps.empty_cache()

    # Drop special tokens by id instead of blindly cutting the first and last
    # position, which removed a real token whenever no EOS was generated.
    predicted_label = tokenizer.decode(output_tokens[0], skip_special_tokens=True).strip()
    print(predicted_label)
    if predicted_label not in mapper:
        print("issue!!!")
        return ""
    return predicted_label

In [29]:
# Training hyper-parameters and objects.
no_of_epochs = 4
# Cross-entropy that ignores positions whose target is the pad token id.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
# Adam over all model parameters with learning rate 0.0003.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0003)

In [30]:
# Flat 1-D tensor holding every token id that occurs in any of the three label
# strings (as produced by the tokenizer), placed on the training device.
labels_list = ["in-favor", "against", "neutral-or-unclear"]
tokenizer_labels_list = []
for label_text in labels_list:
    tokenizer_labels_list.append(tokenizer(label_text, return_tensors="pt")['input_ids'])
allowed_ids = torch.cat(tokenizer_labels_list, dim=-1).reshape(-1).to(device)


In [32]:
# Fine-tuning loop with periodic evaluation on the held-out set.
#
# Fixes relative to the earlier version of this cell:
# * The encoder attention mask now marks every non-pad token. It used to be
#   `torch.isin(features, allowed_ids)`, which hid almost the entire prompt and
#   tweet from the model.
# * Pad positions in the targets are set to -100 so they are ignored by the
#   loss computed inside the model.
# * Evaluation is triggered every EVAL_EVERY_N_EXAMPLES examples. The old test
#   `(iteration_number + 1) % 500 == 0` never fired because the counter grew
#   in steps of the batch size (4), making `counter + 1` always odd.
# * Evaluation generates directly from the already-prompted test inputs on the
#   device the model lives on (no second prompt wrapped around decoded,
#   padded text; no hard-coded "cuda").
# * A prediction counts as correct only on an exact match; previously an empty
#   prediction always matched because `"" in text` is True.
# * The allocator cache is released on the backend actually in use.

EVAL_EVERY_N_EXAMPLES = 500


def _release_cached_memory(active_device):
    """Free cached allocator blocks on the accelerator in use (no-op on CPU)."""
    if active_device.type == "cuda":
        torch.cuda.empty_cache()
    elif active_device.type == "mps":
        torch.mps.empty_cache()


def _evaluate_accuracy(eval_loader):
    """Return the exact-match accuracy of generated labels over ``eval_loader``."""
    model.eval()
    no_of_correct = 0
    no_of_total = 0
    with torch.no_grad():
        for features_test, labels_test in eval_loader:
            features_test = features_test.to(device)
            test_mask = (features_test != pad_token_id).long()
            generated = model.generate(input_ids=features_test, attention_mask=test_mask)
            predicted_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
            expected_texts = tokenizer.batch_decode(labels_test, skip_special_tokens=True)
            for predicted, expected in zip(predicted_texts, expected_texts):
                if predicted.strip() == expected.strip():
                    no_of_correct += 1
                no_of_total += 1
            del features_test, test_mask, generated
            _release_cached_memory(device)
    # Guard against an empty evaluation loader.
    return no_of_correct / float(no_of_total) if no_of_total else 0.0


test_accuracy_arr = []
for epoch in range(no_of_epochs):
    examples_seen = 0
    cumulative_loss = 0
    for features, labels in train_loader:
        # Evaluation switches the model to eval mode, so re-enable training
        # mode at the start of every step.
        model.train()
        features = features.to(device)
        labels = labels.to(device)
        attention_mask = (features != pad_token_id).long()
        labels = labels.masked_fill(labels == pad_token_id, -100)

        optimizer.zero_grad()
        outputs = model(input_ids=features, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        cumulative_loss += loss.item()
        loss.backward()
        optimizer.step()

        examples_before = examples_seen
        examples_seen += len(features)
        del features, labels, attention_mask, outputs, loss
        _release_cached_memory(device)

        # Evaluate whenever another multiple of EVAL_EVERY_N_EXAMPLES has been
        # crossed; this works for any batch size.
        if examples_seen // EVAL_EVERY_N_EXAMPLES > examples_before // EVAL_EVERY_N_EXAMPLES:
            test_accuracy = _evaluate_accuracy(test_loader)
            test_accuracy_arr.append(test_accuracy)
            print(test_accuracy)

    print(cumulative_loss)


KeyboardInterrupt: 

In [None]:
# Move the model to the CPU; the inference cell below passes "cpu" as the
# device for its inputs. As the last expression of the cell, the returned
# module is also displayed.
model.to("cpu")

In [None]:
# Predict a label for the leading rows of the full dataframe. Iteration stops
# at the first row whose index value is greater than 100.
label_preds = []
for index, row in df.iterrows():
    if index > 100:
        break
    print(index)
    tweet_text = row['tweet']
    prompt = f""" What is the stance of the following tweet with respect to COVID-19 vaccine?
                  Here is the tweet. "{tweet_text}"

                  Please use exactly one word from the following 3 categories to label it:
                  - "in-favor",
                  - "against",
                  - "neutral-or-unclear"

                  Only output the **exact** category name and nothing else.
                  """
    # generate_category returns "" when the output is not a known label.
    label_pred = generate_category(
        prompt,
        tokenizer.model_max_length,
        dic,
        "cpu",
    )
    label_preds.append(label_pred)

In [None]:
# Ground-truth labels for the first 101 rows (positional slice).
true_label = df["label_majority"].iloc[:101]

In [None]:
# Display the collected predictions (one string per processed row; "" marks
# an output that was not one of the known labels).
label_preds

In [None]:
# Number of positions where the prediction equals the ground-truth label
# (element-wise comparison of the list against the Series).
(label_preds == true_label).sum()

In [None]:
# Write the dataframe to 'final_output.csv' in the working directory.
# NOTE(review): no visible cell adds `label_preds` to `df`, so this file
# appears to contain only the input data — confirm whether the predictions
# were meant to be saved as a column.
df.to_csv('final_output.csv')

In [None]:
# Persist the fine-tuned model in two forms:
#   1. only the state dict (parameter tensors),
#   2. the whole module object.
# NOTE(review): the whole-module file is a pickle of the model object, which
# includes the LoRALayer / LinearWithLoRA classes defined in this notebook;
# loading it presumably requires those definitions to be available — confirm.
torch.save(model.state_dict(), 'model_weights.pth')
torch.save(model, 'model.pth')

In [None]:
def create_padding_mask(seq, pad_token=0):
    """Return a boolean mask that is True wherever ``seq`` equals ``pad_token``.

    Two singleton axes are inserted after the first axis, so a
    ``(batch_size, seq_len)`` input yields ``(batch_size, 1, 1, seq_len)``.
    """
    is_padding = seq.eq(pad_token)
    return is_padding[:, None, None]  # (batch_size, 1, 1, seq_len)

# Example usage
seq = torch.tensor([[7, 6, 0, 0], [1, 2, 3, 0]])
padding_mask = create_padding_mask(seq)
print(padding_mask)

In [None]:
# The way I am fine-tuning it is not token-wise; the whole batch is processed together.
# Maybe I am fine-tuning it so much in that way that it has drifted into generating those random tokens and the question prompt has lost its meaning.
# I have to try to fine-tune it for one epoch and then look at the results.
# I think fine-tuning on just those tokens is going to mess up the weights for the other tokens.

In [None]:
# Number of epochs, learning rate, number of examples
# The input_ids generate those logits

In [None]:
# logits = logits.view(-1, logits.size(-1))
# labels = labels.view(-1)
# # print(logits.shape)
# # print(labels.shape)
# loss = loss_fn(logits,labels)

In [None]:
# Prompt ≠ supervision: Keep the prompt for context, but compute loss only on the label span (set non-label positions to -100 so CE ignores them).
# Batch loss math: CE is per token; the batch loss is the average over unmasked tokens only.

In [None]:
# attention-masking them (zero their attention weights)
# logits-masking disallowed vocab at output
# ignoring their losses during cross-entropy loss computation