In [2]:
!nvidia-smi

Fri Apr 12 22:24:22 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.52                 Driver Version: 551.52         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1660 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| 44%   29C    P8             13W /  125W |     718MiB /   6144MiB |     15%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import json
import re
from pprint import pprint
import numpy as np
import pandas as pd
#import bitsandbytes

# %load_ext cudf.pandas  # pandas operations now use the GPU!
import torch
from huggingface_hub import notebook_login
from transformers import(
    GPT2Tokenizer,
    GPT2ForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
    GPT2Config
)
from tqdm import tqdm
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)

DEVICE =  "cuda: 0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "gpt2"
DEVICE

'cuda: 0'

In [4]:
from datasets import load_dataset, Dataset, Value, ClassLabel, Features

my_dataset = load_dataset("./Emotion_Dataset", sep=",")
# Creating a ClassLabel Object
df = my_dataset["train"].to_pandas()
labels = ['sadness','joy','love','anger', 'fear', 'surprise']
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)

# Mapping Labels to IDs
def map_label2id(example):
    example['label'] = ClassLabels.str2int(example['label'])
    return example

my_dataset= my_dataset.map(map_label2id, batched=True)

# Casting label column to ClassLabel Object
my_dataset = my_dataset.cast_column('label', ClassLabels)

my_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
dataset_df = my_dataset["train"].to_pandas()
dataset_df

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [6]:
features = my_dataset["train"].features
features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [7]:
id2label = {idx:features["label"].int2str(idx) for idx in range(6)}
id2label

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

In [8]:
label2id = {v:k for k,v in id2label.items()}
label2id

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

In [9]:
dataset_df["label"].value_counts(normalize=True).sort_index()

label
0    0.291625
1    0.335125
2    0.081500
3    0.134937
4    0.121063
5    0.035750
Name: proportion, dtype: float64

In [48]:
def evaluate1(y_true, y_pred):

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels output: {0,1,2,3,4,5}

    # nested for loops
    for label in unique_labels:
        # will output a list of the index of one emotion at a time
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        # will output the list of one emotion
        label_y_true = [y_true[i] for i in label_indices]
        # label_y_true = [label for i in range(len(y_true))]
        # will output list of the predicted emotion in the same order as label_y_true
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2, 3, 4, 5, 6])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [12]:
# instantiate the configuration for your model, this can be imported from transformers
configuration = GPT2Config()
# set up your tokenizer, just like you described, and set the pad token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# instantiate the model
model = GPT2ForSequenceClassification(configuration).from_pretrained("gpt2",
        num_labels=6,
        id2label=id2label,
        label2id=label2id).cuda()
# set the pad token of the model's configuration
model.config.pad_token_id = model.config.eos_token_id

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = my_dataset.map(tokenize_function, batched=True, batch_size=1)

In [17]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [44]:
test = my_dataset["test"].to_pandas()
y_true = test.label
y_true

0       0
1       0
2       0
3       1
4       0
       ..
1995    3
1996    3
1997    1
1998    1
1999    4
Name: label, Length: 2000, dtype: int64

In [35]:
prompt = tokenized_datasets["test"][12]["text"]
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model(**inputs)
prediction = outputs.logits.argmax().item()
prediction

4

In [36]:
def predict(model, tokenizer):
    y_pred = []
    none_pred = []
    for i in tqdm(range(len(tokenized_datasets["test"]))):
        # prompt = X_test.iloc[i]["text"].to(DEVICE)
        prompt = tokenized_datasets["test"][i]["text"]
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        with torch.no_grad():
            logits = model(**inputs).logits
            
        outputs = model(**inputs)
        prediction = outputs.logits.argmax().item()

        y_pred.append(prediction)

    return y_pred

In [40]:
tokenized_datasets["test"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
y_pred = predict(model, tokenizer)


In [49]:
evaluate1(y_true, y_pred)

Accuracy: 0.933
Accuracy for label 0: 0.972
Accuracy for label 1: 0.958
Accuracy for label 2: 0.824
Accuracy for label 3: 0.924
Accuracy for label 4: 0.915
Accuracy for label 5: 0.697

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       581
           1       0.95      0.96      0.95       695
           2       0.88      0.82      0.85       159
           3       0.93      0.92      0.93       275
           4       0.90      0.92      0.91       224
           5       0.79      0.70      0.74        66

    accuracy                           0.93      2000
   macro avg       0.90      0.88      0.89      2000
weighted avg       0.93      0.93      0.93      2000


Confusion Matrix:
[[565   3   0  10   3   0   0]
 [  4 666  18   1   1   5   0]
 [  1  27 131   0   0   0   0]
 [ 10   4   0 254   7   0   0]
 [  6   0   0   6 205   7   0]
 [  2   4   0   1  13  46   0]
 [  0   0   0   0   0   0   0]]


In [18]:
output_dir = "./GPT_output"

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=2,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    per_device_eval_batch_size=1,
    # gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    # gradient_checkpointing=True,              # use gradient checkpointing to save memory
    # optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 10 steps
    learning_rate=2e-5,                       # learning rate, based on QLoRA paper
    weight_decay=0.01,
    # max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    # max_steps=-1,
    # warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    # group_by_length=True,
    # report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch",              # save checkpoint every epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [19]:
# Train model
trainer.train()

# Save trained model and tokenizer
trainer.save_model()
tokenizer.save_pretrained(output_dir)

  0%|          | 0/32000 [00:00<?, ?it/s]

{'loss': 5.8493, 'grad_norm': 196.4993133544922, 'learning_rate': 1.9984375000000003e-05, 'epoch': 0.0}
{'loss': 2.7695, 'grad_norm': 224.649169921875, 'learning_rate': 1.996875e-05, 'epoch': 0.0}
{'loss': 2.1698, 'grad_norm': 59.69400405883789, 'learning_rate': 1.9953125000000002e-05, 'epoch': 0.0}
{'loss': 1.9788, 'grad_norm': 73.99597930908203, 'learning_rate': 1.99375e-05, 'epoch': 0.01}
{'loss': 1.9803, 'grad_norm': 99.36345672607422, 'learning_rate': 1.9921875e-05, 'epoch': 0.01}
{'loss': 2.4645, 'grad_norm': 233.8226776123047, 'learning_rate': 1.9906250000000003e-05, 'epoch': 0.01}
{'loss': 2.1111, 'grad_norm': 80.32600402832031, 'learning_rate': 1.9890625e-05, 'epoch': 0.01}
{'loss': 2.3998, 'grad_norm': 62.693603515625, 'learning_rate': 1.9875000000000002e-05, 'epoch': 0.01}
{'loss': 1.4889, 'grad_norm': 43.38834762573242, 'learning_rate': 1.9859375e-05, 'epoch': 0.01}
{'loss': 1.7593, 'grad_norm': 0.31435427069664, 'learning_rate': 1.984375e-05, 'epoch': 0.02}
{'loss': 2.6776

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.28856348991394043, 'eval_accuracy': 0.9355, 'eval_runtime': 185.4301, 'eval_samples_per_second': 10.786, 'eval_steps_per_second': 10.786, 'epoch': 1.0}
{'loss': 0.2171, 'grad_norm': 0.013789176009595394, 'learning_rate': 9.984375e-06, 'epoch': 1.0}
{'loss': 0.3906, 'grad_norm': 0.04311097785830498, 'learning_rate': 9.96875e-06, 'epoch': 1.0}
{'loss': 0.5816, 'grad_norm': 0.026556340977549553, 'learning_rate': 9.953125000000001e-06, 'epoch': 1.0}
{'loss': 0.0388, 'grad_norm': 0.00099616264924407, 'learning_rate': 9.937500000000001e-06, 'epoch': 1.01}
{'loss': 0.0317, 'grad_norm': 0.0021745162084698677, 'learning_rate': 9.921875e-06, 'epoch': 1.01}
{'loss': 0.0844, 'grad_norm': 0.002076284494251013, 'learning_rate': 9.90625e-06, 'epoch': 1.01}
{'loss': 0.3931, 'grad_norm': 0.004194988869130611, 'learning_rate': 9.890625e-06, 'epoch': 1.01}
{'loss': 0.5633, 'grad_norm': 0.005054446868598461, 'learning_rate': 9.875000000000001e-06, 'epoch': 1.01}
{'loss': 0.054, 'grad_norm'

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.23366403579711914, 'eval_accuracy': 0.943, 'eval_runtime': 189.5796, 'eval_samples_per_second': 10.55, 'eval_steps_per_second': 10.55, 'epoch': 2.0}
{'train_runtime': 11058.6107, 'train_samples_per_second': 2.894, 'train_steps_per_second': 2.894, 'train_loss': 0.49744738424008117, 'epoch': 2.0}


('./GPT_output\\tokenizer_config.json',
 './GPT_output\\special_tokens_map.json',
 './GPT_output\\vocab.json',
 './GPT_output\\merges.txt',
 './GPT_output\\added_tokens.json')

In [20]:
model.eval()
trainer.evaluate()

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.23366403579711914,
 'eval_accuracy': 0.943,
 'eval_runtime': 185.9571,
 'eval_samples_per_second': 10.755,
 'eval_steps_per_second': 10.755,
 'epoch': 2.0}

In [21]:
trainer.predict(tokenized_datasets["test"])

  0%|          | 0/2000 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[12.127413  , -2.6804461 , -2.6463747 , -0.31706494, -1.9975392 ,
        -4.074513  ],
       [12.929155  , -2.7230878 , -1.9403381 ,  0.01938811, -2.334735  ,
        -4.0696855 ],
       [13.37953   , -2.6628666 , -1.095957  , -0.03152221, -2.9360116 ,
        -3.6829991 ],
       ...,
       [-2.3158886 , 15.004853  , -1.6367898 , -3.0036368 , -2.0701993 ,
        -2.2375994 ],
       [-3.798887  , 14.445511  , -2.3101883 , -3.8726668 ,  0.0178639 ,
        -2.6284823 ],
       [-2.5699635 , -2.0546322 , -3.915491  , -2.8241003 ,  4.582789  ,
         2.6164312 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 4], dtype=int64), metrics={'test_loss': 0.29301774501800537, 'test_accuracy': 0.9335, 'test_runtime': 187.2366, 'test_samples_per_second': 10.682, 'test_steps_per_second': 10.682})

In [None]:
y_pred = predict(model, tokenizer)
evaluate(y_true, y_pred)