In [8]:
!pip install -q transformers datasets accelerate scikit-learn torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Imports

In [9]:
import pandas as pd
import sklearn
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

### Set up dataset

In [10]:
DATA_PATH = "track-a.csv"
label_cols = ["anger", "fear", "joy", "sadness", "surprise"]
df = pd.read_csv(DATA_PATH)
train_df, test_df = sklearn.model_selection.train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

### HuggingFace Dataset & tokenisation

In [11]:
model_name = "distilbert-base-uncased"
tokenizer   = AutoTokenizer.from_pretrained(model_name)

train_ds = datasets.Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds  = datasets.Dataset.from_pandas(test_df .reset_index(drop=True))
ds       = datasets.DatasetDict({"train": train_ds, "test": test_ds})

# attach list-of-ints label field expected by HF
def add_labels(example):
    example["labels"] = [float(example[c]) for c in label_cols]
    return example

ds = ds.map(lambda x: tokenizer(x['text']), batched=True, remove_columns=["text", "id"])
ds = ds.map(add_labels)
ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 2214/2214 [00:00<00:00, 55664.59 examples/s]
Map: 100%|██████████| 554/554 [00:00<00:00, 63713.86 examples/s]
Map: 100%|██████████| 2214/2214 [00:00<00:00, 25674.23 examples/s]
Map: 100%|██████████| 554/554 [00:00<00:00, 23239.23 examples/s]


### 4 Model & Trainer

In [12]:
num_labels = len(label_cols)
model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification")

def sigmoid(x): return 1 / (1 + np.exp(-x))

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = sigmoid(logits)
    preds = (probs > 0.5).astype(int)
    return {
        "micro_f1": sklearn.metrics.f1_score(labels, preds, average="micro"),
        "macro_f1": sklearn.metrics.f1_score(labels, preds, average="macro"),
    }

args = TrainingArguments(
    output_dir="sentiment_model",
    logging_steps=25,
    dataloader_pin_memory=False, # otherwise, this warning on MacBook M1: "UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used."

)

trainer = Trainer(model=model, args=args,
                  train_dataset=ds["train"],
                  eval_dataset=ds["test"],
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

trainer.train()
metrics = trainer.evaluate()

print("\nHeld-out metrics:", metrics)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
25,0.5884
50,0.5473
75,0.5345
100,0.4899
125,0.4801
150,0.4669
175,0.4721
200,0.447
225,0.434
250,0.4241



Held-out metrics: {'eval_loss': 0.41554585099220276, 'eval_micro_f1': 0.7070101857399641, 'eval_macro_f1': 0.6443928307047027, 'eval_runtime': 3.1458, 'eval_samples_per_second': 176.108, 'eval_steps_per_second': 22.252, 'epoch': 3.0}


### 5: Report

In [13]:
pred_logits = trainer.predict(ds["test"]).predictions
y_pred = (sigmoid(pred_logits) > 0.5).astype(int)
y_true = np.vstack(ds["test"]["labels"])

print("\nClassification report")
print(sklearn.metrics.classification_report(y_true, y_pred,
                            target_names=label_cols))


Classification report
              precision    recall  f1-score   support

       anger       0.70      0.32      0.44        72
        fear       0.77      0.83      0.80       330
         joy       0.69      0.57      0.63       115
     sadness       0.66      0.63      0.65       167
    surprise       0.76      0.68      0.71       179

   micro avg       0.73      0.68      0.71       863
   macro avg       0.71      0.61      0.64       863
weighted avg       0.73      0.68      0.70       863
 samples avg       0.65      0.63      0.61       863



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
print(f"Equal fields: {np.mean(y_pred == y_true):.2f}")
print(f"Equal rows: {np.mean((y_pred == y_true).all(axis=1)):.2f}")

Equal fields: 0.82
Equal rows: 0.41
