## Imports

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
from peft import LoraConfig, get_peft_model, PeftModel
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          RobertaForSequenceClassification, TrainingArguments,
                          DistilBertForSequenceClassification, pipeline, Trainer,
                          DataCollatorWithPadding)

## Calling Env

In [2]:
# Locate the nearest .env file and load its variables into the process environment.
_ = load_dotenv(find_dotenv())

# Hugging Face access token taken from the HF_TOKEN variable (None when it is not set).
HF_API_KEY  = os.getenv('HF_TOKEN')

## Transformers

### Distilbert

#### Initial parameters

In [10]:
# Checkpoint used for the DistilBERT baseline in this section.
model_name = "distilbert/distilbert-base-cased"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the three constants below are not referenced anywhere else in
# the visible notebook — confirm whether they are still needed.
NUM_VIRTUAL_TOKENS = 50
NUM_EPOCHS_PROMPT = 20
NUM_EPOCHS_CLASSIFIER = 20

#### Tokenizing

In [5]:
# Lookup tables between class index and sentiment name (both directions);
# they are handed to the model config so predictions carry readable labels.
id2label = dict(enumerate(("NEGATIVE", "NEUTRAL", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

In [6]:
# Load the pretrained DistilBERT encoder with a sequence-classification head.
# The head size is derived from the label mapping so the two cannot drift apart.
# The classifier weights are not part of the checkpoint (reported as MISSING in
# the load report), so they start out newly initialized.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    token=HF_API_KEY,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)
model = model.to(device)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_API_KEY)

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert/distilbert-base-cased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [7]:
# Wrap the in-memory model and tokenizer in a text-classification pipeline on `device`.
clf=pipeline("text-classification",
             model=model,
             tokenizer=tokenizer,
             device=device
             )

# Smoke test on a single sentence; top_k=None returns a score for every label.
clf("This was a bad product, I don't know how someone could do something like that", top_k=None)

[{'label': 'NEGATIVE', 'score': 0.36534935235977173},
 {'label': 'NEUTRAL', 'score': 0.3413798213005066},
 {'label': 'POSITIVE', 'score': 0.29327085614204407}]

#### Save the model

In [8]:

# Persist the (not yet fine-tuned) DistilBERT classifier and its tokenizer side by side.
clf.model.save_pretrained("./models/classifier_distilbert/")
clf.tokenizer.save_pretrained("./models/classifier_distilbert/")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./models/classifier_distilbert/tokenizer_config.json',
 './models/classifier_distilbert/tokenizer.json')

OK, the model can produce a classification, but its scores are close to a random guess because the classification head is newly initialized. Let's try another model and then fine-tune it.

### Roberta

#### Initial parameters

In [18]:
# Checkpoint used for the RoBERTa classifier in this section.
model_name = "roberta-base"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the three constants below are not referenced anywhere else in
# the visible notebook — confirm whether they are still needed.
NUM_VIRTUAL_TOKENS = 50
NUM_EPOCHS_PROMPT = 20
NUM_EPOCHS_CLASSIFIER = 20

#### Tokenizing

In [19]:
# Class index -> sentiment name, and the inverse table, for the model config.
id2label = dict(enumerate(("NEGATIVE", "NEUTRAL", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

In [20]:
# Load the pretrained RoBERTa encoder with a sequence-classification head.
# The head size is derived from the label mapping so the two cannot drift apart.
# The classifier weights are not part of the checkpoint (reported as MISSING in
# the load report), so they start out newly initialized.
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    token=HF_API_KEY,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)
model = model.to(device)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_API_KEY)

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [21]:
# Wrap the in-memory RoBERTa model and tokenizer in a text-classification pipeline.
clf=pipeline("text-classification",
             model=model,
             tokenizer=tokenizer,
             device=device
             )

# Same smoke-test sentence as for DistilBERT; top_k=None returns a score for every label.
clf("This was a bad product, I don't know how someone could do something like that", top_k=None)

[{'label': 'NEGATIVE', 'score': 0.3685745596885681},
 {'label': 'NEUTRAL', 'score': 0.3475748598575592},
 {'label': 'POSITIVE', 'score': 0.2838505804538727}]

#### Save the model

In [13]:

# Persist the (not yet fine-tuned) RoBERTa classifier and its tokenizer side by side;
# later cells reload both from this directory.
clf.model.save_pretrained("./models/classifier_roberta/")
clf.tokenizer.save_pretrained("./models/classifier_roberta/")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./models/classifier_roberta/tokenizer_config.json',
 './models/classifier_roberta/tokenizer.json')

OK, the model can classify, but it doesn't show much difference from the DistilBERT model. Let's try to fine-tune it too.

### Fine tune classifier

#### Yelp dataset: It is relatively big and clean.

In [3]:
# Load the Yelp review dataset by its Hugging Face Hub id (train and test splits are used below).
ds = load_dataset("Yelp/yelp_review_full")

In [4]:
def map_labels(example):
    """Collapse the example's original ``label`` into one of three sentiment ids.

    Labels ``<= 1`` become 0 (NEG), label ``2`` becomes 1 (NEU) and every other
    value becomes 2 (POS). The mapping is written back into ``example`` in
    place and the same object is returned.
    """
    original = example["label"]
    if original <= 1:
        sentiment = 0  # NEG
    elif original == 2:
        sentiment = 1  # NEU
    else:
        sentiment = 2  # POS
    example["label"] = sentiment
    return example

# Apply the 3-class relabelling to every example of every split.
dsm = ds.map(map_labels)

In [5]:
# Reload the tokenizer that was saved next to the RoBERTa classifier.
tokenizer = AutoTokenizer.from_pretrained("./models/classifier_roberta/")

def token_maker(dataset):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 256 tokens; the
    tokenizer's output is returned unchanged.
    """
    return tokenizer(
        dataset["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    
# Tokenize all splits in batches (token_maker receives a batch of texts per call).
tokenized_ds = dsm.map(token_maker, batched=True)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# Keep only the tokenizer outputs and the target column: drop the raw text,
# rename the target to "labels", and have the dataset return torch tensors.
# NOTE(review): this cell reassigns `tokenized_ds`, so it is not idempotent — re-running it
# without re-running the tokenising cell will fail on the missing "text"/"label" columns.
tokenized_ds = tokenized_ds.remove_columns(["text"])
tokenized_ds = tokenized_ds.rename_column("label", "labels")
tokenized_ds.set_format("torch")

#### Train and Test split

In [7]:
# Use the dataset's own train/test splits. Despite the `df_` prefix these are
# the tokenized dataset splits, not pandas DataFrames.
df_train = tokenized_ds["train"]
df_test = tokenized_ds["test"]

In [8]:
# Sanity check: (rows, columns) of each split.
print(df_train.shape, df_test.shape)

(650000, 3) (50000, 3)


#### Loading the model and tokens

In [None]:
# Rebuild the pipeline from the RoBERTa model and tokenizer saved on disk.
# NOTE(review): `device` comes from an earlier configuration cell; on a fresh
# kernel that cell has to be run before this one.
clf = pipeline("text-classification",
               model="./models/classifier_roberta/",
               tokenizer="./models/classifier_roberta/",
               device=device
)

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [12]:
# Quick check of the reloaded pipeline before fine-tuning (only the top label is returned here).
clf("This product is terrible.")

[{'label': 'NEGATIVE', 'score': 0.3849872350692749}]

In [29]:
def metrics(eval_pred):
    """Compute evaluation metrics for the Trainer's ``compute_metrics`` hook.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each row
            is the argmax of ``logits`` over the last axis.

    Returns:
        dict with ``accuracy`` plus precision, recall and F1, each in a
        macro-averaged and a weighted-averaged variant. Undefined ratios are
        reported as 0 (``zero_division=0``).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    results = {"accuracy": accuracy_score(labels, preds)}

    # Same three scores for both averaging modes; keys follow "<score>_<average>".
    for average in ("macro", "weighted"):
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average=average, zero_division=0
        )
        results[f"precision_{average}"] = precision
        results[f"recall_{average}"] = recall
        results[f"f1_{average}"] = f1

    return results


In [31]:
# Fine-tuning configuration for the 3-class Yelp RoBERTa classifier.
training_args = TrainingArguments(
    output_dir="./models/yelp_roberta_3class",  # checkpoints are written here
    # Evaluate and checkpoint on the same 500-step schedule so that every
    # evaluation has a matching checkpoint for load_best_model_at_end.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=50,
    learning_rate=1e-5,
    
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,          
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",  # no external experiment tracker
    # After training, restore the checkpoint with the best "accuracy" value
    # returned by the compute_metrics callback.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    dataloader_num_workers=2,
    dataloader_pin_memory=True,)

In [32]:
# Batch collator. The examples were already padded to 256 tokens by token_maker,
# so the collator has no additional padding to add.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Fixed-seed random subsets (8,000 train / 2,000 test examples) instead of the full splits.
small_train = df_train.shuffle(seed=42).select(range(8000))
small_test  = df_test.shuffle(seed=42).select(range(2000))

In [33]:
# Fine-tune on the sampled subsets and evaluate with the `metrics` callback.
# NOTE(review): `model` is the notebook-level model variable created in an
# earlier cell, not `clf.model` reloaded from disk — confirm this is intended.
trainer = Trainer(model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
    data_collator=data_collator,
    compute_metrics=metrics,
)

# Long-running cell: the recorded run took 5000 steps (see train_runtime in the output).
trainer.train()


Step,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Weighted,Recall Weighted,F1 Weighted
500,0.376159,0.589618,0.814,0.76687,0.760981,0.761006,0.815792,0.814,0.811594
1000,0.494076,0.454607,0.817,0.77548,0.781678,0.776697,0.8281,0.817,0.820677
1500,0.279443,0.586853,0.8255,0.780465,0.779722,0.779985,0.825404,0.8255,0.82532
2000,0.426375,0.527685,0.8135,0.766645,0.76349,0.763889,0.815695,0.8135,0.813181
2500,0.279805,0.737902,0.811,0.777034,0.788565,0.778028,0.833868,0.811,0.819207
3000,0.317506,0.7468,0.814,0.76862,0.771527,0.769903,0.817437,0.814,0.815594
3500,0.297549,0.851502,0.829,0.78518,0.784604,0.784887,0.828539,0.829,0.828765
4000,0.192661,0.910953,0.8185,0.772132,0.771778,0.77163,0.819921,0.8185,0.818835
4500,0.173938,0.978962,0.815,0.769551,0.773068,0.770772,0.819964,0.815,0.816925
5000,0.122154,0.986658,0.818,0.773589,0.777557,0.775216,0.82272,0.818,0.820062


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

TrainOutput(global_step=5000, training_loss=0.30534337911605836, metrics={'train_runtime': 17680.8253, 'train_samples_per_second': 2.262, 'train_steps_per_second': 0.283, 'total_flos': 5262268354560000.0, 'train_loss': 0.30534337911605836, 'epoch': 5.0})

Benchmark [500/500 40:11, Epoch 1/1]
TrainOutput(
    global_step=500,
    training_loss=0.5430054550170899,
    metrics={
        'train_runtime': 2414.7412,
        'train_samples_per_second': 3.313, 
        'train_steps_per_second': 0.207, 
        'total_flos': 1052453670912000.0, 
        'train_loss': 0.5430054550170899, 
        'epoch': 1.0}
        )

Second Roberta [5000/5000 4:54:20, Epoch 5/5]
TrainOutput(
    global_step=5000,
    training_loss=0.30534337911605836,
    metrics={'train_runtime': 17680.8253,
             'train_samples_per_second': 2.262,
             'train_steps_per_second': 0.283, 
             'total_flos': 5262268354560000.0, 
             'train_loss': 0.30534337911605836, 
             'epoch': 5.0})
Step	Training Loss	Validation Loss	Accuracy	Precision Macro	Recall Macro	F1 Macro	Precision Weighted	Recall Weighted	F1 Weighted
500	0.376159	0.589618	0.814000	0.766870	0.760981	0.761006	0.815792	0.814000	0.811594
1000	0.494076	0.454607	0.817000	0.775480	0.781678	0.776697	0.828100	0.817000	0.820677
1500	0.279443	0.586853	0.825500	0.780465	0.779722	0.779985	0.825404	0.825500	0.825320
2000	0.426375	0.527685	0.813500	0.766645	0.763490	0.763889	0.815695	0.813500	0.813181
2500	0.279805	0.737902	0.811000	0.777034	0.788565	0.778028	0.833868	0.811000	0.819207
3000	0.317506	0.746800	0.814000	0.768620	0.771527	0.769903	0.817437	0.814000	0.815594
3500	0.297549	0.851502	0.829000	0.785180	0.784604	0.784887	0.828539	0.829000	0.828765
4000	0.192661	0.910953	0.818500	0.772132	0.771778	0.771630	0.819921	0.818500	0.818835
4500	0.173938	0.978962	0.815000	0.769551	0.773068	0.770772	0.819964	0.815000	0.816925
5000	0.122154	0.986658	0.818000	0.773589	0.777557	0.775216	0.822720	0.818000	0.820062

{'eval_loss': 0.851728618144989, 'eval_accuracy': 0.8285, 'eval_precision_macro': 0.7845923090109136, 'eval_recall_macro': 0.7841719546676816, 'eval_f1_macro': 0.7843786009486031, 'eval_precision_weighted': 0.8281781955316839, 'eval_recall_weighted': 0.8285, 'eval_f1_weighted': 0.8283356588105139, 'eval_runtime': 156.2952, 'eval_samples_per_second': 12.796, 'eval_steps_per_second': 0.8, 'epoch': 5.0}

In [34]:
# Evaluate the model held by the trainer on the evaluation subset.
# The result is stored under its own name so that the `metrics` callback
# function passed to the Trainer is not overwritten; otherwise re-running the
# Trainer cell would pass a dict as `compute_metrics`.
eval_metrics = trainer.evaluate()
print(eval_metrics)

{'eval_loss': 0.851728618144989, 'eval_accuracy': 0.8285, 'eval_precision_macro': 0.7845923090109136, 'eval_recall_macro': 0.7841719546676816, 'eval_f1_macro': 0.7843786009486031, 'eval_precision_weighted': 0.8281781955316839, 'eval_recall_weighted': 0.8285, 'eval_f1_weighted': 0.8283356588105139, 'eval_runtime': 156.2952, 'eval_samples_per_second': 12.796, 'eval_steps_per_second': 0.8, 'epoch': 5.0}


In [36]:
# Per-class evaluation on the held-out subset.
pred = trainer.predict(small_test)
y_true = pred.label_ids
y_pred = np.argmax(pred.predictions, axis=-1)  # predicted class = highest logit

# Class names in index order, taken from the model config, for the report rows.
labels = [trainer.model.config.id2label[i] for i in range(trainer.model.config.num_labels)]
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred, target_names=labels, digits=4))

[[767  67  22]
 [ 78 218  76]
 [ 15  85 672]]
              precision    recall  f1-score   support

    NEGATIVE     0.8919    0.8960    0.8939       856
     NEUTRAL     0.5892    0.5860    0.5876       372
    POSITIVE     0.8727    0.8705    0.8716       772

    accuracy                         0.8285      2000
   macro avg     0.7846    0.7842    0.7844      2000
weighted avg     0.8282    0.8285    0.8283      2000



## Using the data from `data_cleaned.csv` to classify

In [2]:
# Reviews to classify; later cells read the "reviews.text" and "sentiment" columns.
dc = pd.read_csv("data_cleaned.csv")

In [3]:
# Use the GPU when one is available and fall back to the CPU otherwise,
# matching the device selection used in the model sections of this notebook.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Checkpoint written at step 1000 of the fine-tuning run. In the training log
# this step has the lowest validation loss (0.4546), while the highest accuracy
# (0.829) was logged at step 3500.
# NOTE(review): confirm this is the checkpoint intended for inference.
MODEL_PATH="./models/yelp_roberta_3class/checkpoint-1000"

In [5]:
# Inference pipeline built from the fine-tuned checkpoint.
# top_k=None makes every call return the scores of all labels, not just the best one.
clf = pipeline("text-classification", 
               model=MODEL_PATH,
               tokenizer=MODEL_PATH,
               device=device,
               top_k=None,)

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [7]:
# Review texts as plain Python strings; missing reviews become empty strings
# so that the pipeline receives one string per row.
texts = dc["reviews.text"].fillna("").astype(str).tolist()
# Show only a short preview instead of dumping every review into the notebook output.
texts[:5]

['I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.',
 'Bulk is always the less expensive way to go for products like these',
 'Well they are not Duracell but for the price i am happy.',
 'Seem to work as well as name brand batteries at a much better price',
 'These batteries are very long lasting the price is great.',
 "Bought a lot of batteries for Christmas and the AmazonBasics Cell have been good. I haven't noticed a difference between the brand name batteries and the Amazon Basic brand. Just a lot easier to purchase and have arrive at the house and have on hand. Will buy again.",
 'ive not had any problame with these batteries have ordered them in the past been very pleased.',
 'Well if you are looking for cheap non-rechargeable batteries that last quite a while then these are perfect. Nothing more to say.',
 'These do not hold the amount of high power juice like energizer or duracell, but th

In [8]:
# Classify all reviews in batches of 64, truncating each input to 256 tokens
# (the same length used for fine-tuning). `out` holds one list of label/score dicts per text.
out = clf(texts, batch_size=64, truncation=True, max_length=256)

In [13]:
# Highest-scoring entry per text, then split into parallel label / score lists.
best_per_text = [max(item, key=lambda entry: entry["score"]) for item in out]
labels = [best["label"] for best in best_per_text]
scores = [best["score"] for best in best_per_text]

In [14]:
def score_for(item, label):
    """Return the score of the first entry in ``item`` whose ``"label"`` equals ``label``.

    ``item`` is an iterable of dicts with ``"label"`` and ``"score"`` keys.
    The comparison is exact (case-sensitive); ``None`` is returned when no
    entry matches.
    """
    return next((entry["score"] for entry in item if entry["label"] == label), None)

In [16]:
# Attach the top prediction and its score to each review row (adds columns to `dc` in place).
dc["predicted_label"] = labels
dc["predicted_score"] = scores

In [23]:
# Per-class score columns. The pipeline emits the model's upper-case label
# names ("NEGATIVE", "NEUTRAL", "POSITIVE") and score_for matches labels
# case-sensitively, so the lookups must use the upper-case names; looking up
# "Negative" etc. would fill every column with None. The column names
# themselves are kept unchanged.
dc["Negative"] = [score_for(item, "NEGATIVE") for item in out]
dc["Neutral"] = [score_for(item, "NEUTRAL") for item in out]
dc["Positive"] = [score_for(item, "POSITIVE") for item in out]

In [24]:
# Write the reviews together with all prediction columns added so far; the index is not saved.
dc.to_csv("predictions.csv", index=False)

In [25]:
# Compare the label vocabularies of ground truth and predictions
# (they differ in letter case, which is normalised in a later cell).
print(dc["sentiment"].unique())
print(dc["predicted_label"].unique())

<ArrowStringArray>
['Neutral', 'Positive', 'Negative']
Length: 3, dtype: str
<ArrowStringArray>
['NEGATIVE', 'POSITIVE', 'NEUTRAL']
Length: 3, dtype: str


In [29]:
# Count every (true sentiment, predicted label) combination, most frequent first.
dc[["sentiment","predicted_label"]].value_counts()

sentiment  predicted_label
Positive   POSITIVE           22799
           NEUTRAL             2189
Negative   NEGATIVE            1366
Positive   NEGATIVE             557
Neutral    NEUTRAL              548
           NEGATIVE             407
           POSITIVE             251
Negative   NEUTRAL              133
           POSITIVE              82
Name: count, dtype: int64

In [30]:
# Normalise both label columns to upper case without surrounding whitespace
# so that ground truth and predictions use the same vocabulary.
dc["true_label"] = dc["sentiment"].str.upper().str.strip()
dc["pred_label"] = dc["predicted_label"].str.upper().str.strip()

# Both columns should now show the same three values.
print(dc["true_label"].unique())
print(dc["pred_label"].unique())

<ArrowStringArray>
['NEUTRAL', 'POSITIVE', 'NEGATIVE']
Length: 3, dtype: str
<ArrowStringArray>
['NEGATIVE', 'POSITIVE', 'NEUTRAL']
Length: 3, dtype: str


In [32]:
# Score the fine-tuned model against the dataset's own sentiment labels.
# classification_report and confusion_matrix are already imported in the
# imports cell at the top of the notebook, so no import is repeated here.
y_true = dc["true_label"]
y_pred = dc["pred_label"]

print("\nDetailed classification report:\n")
print(classification_report(y_true, y_pred, digits=4))



Detailed classification report:

              precision    recall  f1-score   support

    NEGATIVE     0.5863    0.8640    0.6985      1581
     NEUTRAL     0.1909    0.4544    0.2689      1206
    POSITIVE     0.9856    0.8925    0.9367     25545

    accuracy                         0.8723     28332
   macro avg     0.5876    0.7370    0.6347     28332
weighted avg     0.9295    0.8723    0.8950     28332



In [33]:
# Fix the row/column order of the matrix explicitly: rows are true labels,
# columns are predicted labels, both in the order listed here.
labels = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
cm = confusion_matrix(y_true, y_pred, labels=labels)

print("\nConfusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[ 1366   133    82]
 [  407   548   251]
 [  557  2189 22799]]


In [35]:
# Share of each predicted class across all reviews (proportions sum to 1).
dc["pred_label"].value_counts(normalize=True)

pred_label
POSITIVE    0.816462
NEUTRAL     0.101299
NEGATIVE    0.082239
Name: proportion, dtype: float64