In [1]:
# Set Metal/MPS-related environment variables for this kernel session.
# NOTE(review): USE_MPS and USE_PYTORCH_METAL look like PyTorch build-time switches;
# confirm they have any effect on an already-installed torch package.
%env USE_MPS=1

%env USE_PYTORCH_METAL=1

# Disabled: appears to be a shell command pasted from another project; kept for reference only.
# %env PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.7 ./webui.sh --precision full --no-half

env: USE_MPS=1
env: USE_PYTORCH_METAL=1


In [2]:
# Record which Python interpreter version is on the PATH for this run.
!python3 --version

Python 3.12.3


In [3]:
# !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

In [4]:
# Select the compute device: Apple's MPS backend when present, CPU otherwise.
import torch

# Check availability *before* changing the default device, so a machine
# without MPS falls back to CPU instead of failing on the first tensor op.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    mps_device = device  # original name, kept for backward compatibility
    torch.set_default_device(device)
    # Smoke test: allocating a tensor proves the backend is usable.
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    device = torch.device("cpu")
    torch.set_default_device(device)
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [5]:
# Load the "subtask5.english" configuration of sem_eval_2018_task_1 and have
# rows returned as torch tensors.
from transformers import BertModel
import datasets
from datasets import load_dataset
# model = BertModel.from_pretrained("bert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa")
dataset = load_dataset(
    "sem_eval_2018_task_1",
    "subtask5.english",
)
dataset = dataset.with_format("torch")
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [6]:
# Shared imports for the rest of the notebook.
# `np` is used by the preprocessing, metrics and scratch-check cells below;
# `torch` by the metrics and training code.
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [7]:
# Show the first raw training row: ID, tweet text and one boolean tensor per emotion.
# (Moving the raw dataset to MPS with set_format("torch", device="mps") was tried
# here and left disabled.)
dataset["train"][0]

{'ID': '2017-En-21441',
 'Tweet': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'anger': tensor(False, device='mps:0'),
 'anticipation': tensor(True, device='mps:0'),
 'disgust': tensor(False, device='mps:0'),
 'fear': tensor(False, device='mps:0'),
 'joy': tensor(False, device='mps:0'),
 'love': tensor(False, device='mps:0'),
 'optimism': tensor(True, device='mps:0'),
 'pessimism': tensor(False, device='mps:0'),
 'sadness': tensor(False, device='mps:0'),
 'surprise': tensor(False, device='mps:0'),
 'trust': tensor(True, device='mps:0')}

In [8]:
# Every column except the identifier and the tweet text is an emotion label.
labels = [name for name in dataset["train"].features if name not in ("ID", "Tweet")]
# Index <-> name lookups in column order, as expected by the model config.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

Now we are getting ready to preprocess the data using the BERT tokenizer. This includes encoding the text and mapping the labels to floating-point values arranged in a matrix of size batch_size x num_labels. The labels should be floats, as PyTorch expects.

In [9]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    """Tokenize a batch of tweets and attach its label matrix.

    Args:
        examples: a batch mapping column name -> column values. It must
            contain "Tweet" and every column named in the global ``labels``.

    Returns:
        The tokenizer encoding for the batch (padded/truncated to
        max_length=128) with an added "labels" entry: one list of floats
        per tweet, ordered like the global ``labels`` list.
    """
    tweets = examples["Tweet"]
    encoding = tokenizer(tweets, max_length=128, truncation=True, padding="max_length")

    # Float matrix with one row per tweet and one column per label.
    labels_matrix = np.zeros((len(tweets), len(labels)))
    for column, label_name in enumerate(labels):
        labels_matrix[:, column] = examples[label_name]

    encoding["labels"] = labels_matrix.tolist()
    return encoding

In [10]:
# Length of one raw tweet in characters (not tokens).
text = dataset["train"][5]["Tweet"]
len(text)
# encoded_dataset['train']['labels']

94

In [12]:
# Run preprocess_data over every split in batches; the raw columns are dropped
# so only the tokenizer outputs and the "labels" matrix remain.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 886
    })
})

In [None]:
# A single call on the DatasetDict applies the torch/MPS format to every split,
# which covers the train and validation splits as well.
encoded_dataset.set_format("torch", device="mps")
# Display the token ids of the training split as the cell output.
encoded_dataset["train"]["input_ids"]

In [None]:
# Pull one encoded training example and list the fields it carries.
example = encoded_dataset["train"][0]
print(example.keys())

# Evaluated but not displayed: only the cell's last expression is shown.
len(example["input_ids"])
# Label vector of the example; this is the cell's output.
example["labels"]

In [None]:
# tokenizer.decode(example['input_ids'])
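#CLS = classification token; it is placed at the beginning of the input
#SEP = separator token marking the end of a sequence (also separates sentence pairs, e.g. for next sentence prediction)
#PAD = padding token used to pad the input to max_length (128 tokens, not characters)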

In [None]:
# Display the label vector of `example`.
example['labels']

In [None]:
# Names of the labels whose entry equals 1.0 for this example.
[id2label[position] for position, flag in enumerate(example["labels"]) if flag == 1.0]

Set up the model. `problem_type="multi_label_classification"` tells the model which kind of problem this is, so it uses `BCEWithLogitsLoss` (a sigmoid layer combined with binary cross-entropy loss).

In [None]:
from transformers import AutoModelForSequenceClassification

# Load bert-base-uncased as a sequence classifier with one output per label,
# configured for multi-label classification, and move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels),
    problem_type="multi_label_classification",
    label2id=label2id,
    id2label=id2label,
).to(device)
model.device

In [None]:
# Per-device batch size passed to TrainingArguments.
# NOTE(review): `batch_size` is reassigned to 16 in the DataLoader cell further down,
# so the value TrainingArguments sees depends on cell execution order.
batch_size = 8
# Metric used to pick the best checkpoint; matches the 'f1' key returned by multi_label_metrics.
metric_name = "f1"
from transformers import TrainingArguments, Trainer

In [None]:
# Fine-tuning hyper-parameters for the Hugging Face Trainer.
# Evaluation and checkpointing both run once per epoch so the best checkpoint
# (ranked by `metric_name`) can be reloaded when training ends.
# NOTE(review): `evaluation_strategy` and `use_mps_device` appear to be deprecated in
# recent transformers releases; confirm against the installed version.
args = TrainingArguments(
    output_dir="bert-finetuned-sem_eval-english",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    use_mps_device=True,
    #push_to_hub=True,
)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-F1, micro ROC-AUC and accuracy for multi-label logits.

    Args:
        predictions: raw logits of shape (batch_size, num_labels); a numpy
            array or a torch tensor on any device.
        labels: ground-truth indicator matrix with the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Work on the CPU explicitly: scikit-learn needs numpy arrays, and the
    # notebook's default device may be MPS. torch.sigmoid is used because
    # torch.nn.Sigmoid() accepts no constructor arguments.
    logits = torch.as_tensor(predictions, dtype=torch.float32, device="cpu")
    probs = torch.sigmoid(logits).detach().numpy()

    # Turn probabilities into 0/1 predictions (kept as floats, like the labels).
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # NOTE(review): ROC-AUC is computed from the thresholded predictions, as in
    # the original code; passing `probs` would make it threshold-independent.
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)

    # Local name differs from the `metrics` module imported from sklearn.
    scores = {'f1': f1_micro_average,
              'roc_auc': roc_auc,
              'accuracy': accuracy}
    return scores

def compute_metrics(p: EvalPrediction):
    """Adapter between the Trainer's EvalPrediction and multi_label_metrics.

    When the predictions come as a tuple, only its first element is scored.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        logits = raw_predictions[0]
    else:
        logits = raw_predictions
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
# Inspect the type of the first training example's label vector.
# NOTE(review): assumes the split is already in torch format so `.type()` exists; confirm.
encoded_dataset['train'][0]['labels'].type()

In [None]:
# Display the encoded training split (features and row count).
encoded_dataset["train"]

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Bundle the three training columns into a TensorDataset so each batch unpacks
# as (input_ids, attention_mask, labels).
train_dataset = TensorDataset(
    *(encoded_dataset["train"][column] for column in ("input_ids", "attention_mask", "labels"))
)
# NOTE(review): this rebinds the `batch_size` that the TrainingArguments cell also reads.
batch_size = 16  # Adjust based on your GPU memory

# Both loaders iterate in dataset order (no shuffling).
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)

# The evaluation loader wraps the encoded validation split directly.
eval_loader = DataLoader(dataset=encoded_dataset["validation"], batch_size=batch_size, shuffle=False)



# train_loader.to(device)

# encoded_dataset['train'].set_format("torch", device="mps")
# encoded_dataset['validation'].set_format("torch", device="mps")
#dataset['train']

In [None]:
# outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), 
#                 labels=encoded_dataset['train'][0]['labels'].unsqueeze(0),
#                 attention_mask=encoded_dataset['train'][0]['attention_mask'].unsqueeze(0))

# Switch the training split (in place) to torch tensors created on the MPS device,
# then display its label column.
encoded_dataset['train'].set_format("torch", device="mps")
encoded_dataset['train']["labels"]


In [None]:
# Wire the model, training arguments, encoded splits, tokenizer and metric
# callback into a Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Scratch check: np.unique is applied to the CPU copy of the training labels,
# the result is wrapped back into a tensor and its elements are collected into a set.
# NOTE(review): tensors presumably hash by identity, so this set is not
# de-duplicated by value; confirm before relying on it.
one = set(torch.tensor(np.unique(encoded_dataset['train']["labels"].cpu())).cpu())

# one = set(np.unique(encoded_dataset['train']["labels"].cpu()))
# one 
one 

In [None]:
# trainer.train()
# Scratch check: collect the rows of the CPU copy of the training labels into a set.
# NOTE(review): as with `one`, set membership here presumably follows tensor identity; confirm.
two = set(encoded_dataset['train']["labels"].cpu())
two
# two = set(encoded_dataset['train']["labels"].cpu())
# two

In [None]:
# two - one

# if (two - one):
#     print("hello")
# else: 
#     print("helloooo")

In [None]:
# from sklearn.utils.class_weight import compute_class_weight

# #compute the class weights
# encoded_dataset['train']["labels"]
# torch.tensor(encoded_dataset['train']["labels"]).tolist()
# class_wts = compute_class_weight(class_weight = 'balanced', 
#                                  classes = np.unique(encoded_dataset['train']["labels"].cpu()),
#                                  y = encoded_dataset['train']["labels"].cpu())

# # print(class_wts)
# # weights= torch.tensor(class_wts,dtype=torch.float)
# # weights = weights.to(device)

# # print(weights)

In [None]:
import torch.nn as nn

# Multi-label targets are independent 0/1 floats per emotion, so the loss must be
# sigmoid + binary cross-entropy applied to raw logits. NLLLoss expects
# log-probabilities and integer class indices, and rejects a float label matrix.
# The name `cross_entropy` is kept so existing references keep working.
cross_entropy = nn.BCEWithLogitsLoss()


In [None]:
def train(loader=None, optimizer=None, loss_fn=None):
    """Run one training epoch of the global ``model`` and report loss and F1.

    Args:
        loader: iterable of (input_ids, attention_mask, labels) batches.
            Defaults to the notebook-level ``train_loader``.
        optimizer: optimizer over ``model.parameters()``. When omitted, an
            AdamW optimizer is created with the learning rate and weight decay
            used in the TrainingArguments cell. Pass a persistent optimizer
            when calling this once per epoch, so its state carries over.
        loss_fn: loss taking (logits, float labels). Defaults to
            BCEWithLogitsLoss, the multi-label loss for raw logits.

    Returns:
        (avg_loss, f1): mean batch loss and weighted F1 over the epoch.

    Raises:
        ValueError: if the loader yields no batches.
    """
    if loader is None:
        loader = train_loader
    if loss_fn is None:
        loss_fn = torch.nn.BCEWithLogitsLoss()
    if optimizer is None:
        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

    model.train()

    total_loss = 0.0
    num_batches = 0
    # Per-batch 0/1 prediction and label arrays, concatenated after the loop.
    total_preds = []
    total_labels = []

    for step, batch in enumerate(loader):
        # Progress update after every 100 batches.
        if step % 100 == 0 and step != 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(loader)))

        # Push the batch to the training device.
        sent_id, mask, batch_labels = [r.to(device) for r in batch]

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # The model returns an output object; the raw scores live in `.logits`.
        logits = model(input_ids=sent_id, attention_mask=mask).logits

        loss = loss_fn(logits, batch_labels.float())
        total_loss += loss.item()
        num_batches += 1

        loss.backward()

        # Clip gradients to norm 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Multi-label decision: each label is predicted independently at 0.5
        # probability (argmax would pick a single class per sample).
        batch_preds = (torch.sigmoid(logits.detach()) >= 0.5).int()
        total_preds.append(batch_preds.cpu().numpy())
        total_labels.append(batch_labels.detach().int().cpu().numpy())

    if num_batches == 0:
        raise ValueError("train() received an empty loader")

    avg_loss = total_loss / num_batches

    # Stack the per-batch arrays into (number of samples, number of labels).
    y_true = np.concatenate(total_labels, axis=0)
    y_pred = np.concatenate(total_preds, axis=0)
    f1 = f1_score(y_true, y_pred, average='weighted')

    return avg_loss, f1

In [None]:
# train() returns the pair (avg_loss, f1), so `train_loss` holds that whole tuple,
# not just the loss value.
train_loss = train()
train_loss