<a href="https://colab.research.google.com/github/ElenJ/NLP_demo/blob/main/NLP_processing_wTranformers_Tunstall_Ch2_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoModel  # to load weights from pretrained model
from transformers import AutoTokenizer  # automatically retrieves model's configuration, weights, vocab
from umap import UMAP

In [None]:
# Load the "emotion" dataset through the `datasets` library. Later cells index the
# result by split name ("train", "validation").
emotions = load_dataset("emotion")
# Bare expression: display a summary of the loaded object
emotions

In [None]:
# Select the training split and display its summary
train_ds = emotions["train"]
train_ds

In [None]:
# Switch the dataset's output format to pandas so that slicing a split yields a DataFrame.
# (The format is switched back with `emotions.reset_format()` once the exploration is done.)
emotions.set_format(type="pandas")
df = emotions["train"][:]


def label_int2str(row):
    """Translate an integer class id into its label name.

    The mapping comes from the ``label`` feature of the training split.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)


# Add a human-readable label column next to the integer ids
df["label_name"] = df["label"].apply(label_int2str)
df.head()

In [None]:
# Class frequencies, sorted ascending so the most frequent label ends up as the top bar
label_counts = df["label_name"].value_counts(ascending=True)
label_counts.plot.barh(title="Label distribution")
plt.show()

The dataset is imbalanced, so the classes should be balanced (or the imbalance otherwise accounted for) in order to train a proper classifier. **TODO** for later.

In [None]:
# Preview the first rows of the training DataFrame (including the `label_name` column)
df.head()

In [None]:
# Look at the length of the texts: this matters for choosing a transformer, because every
# model has a maximum context size. Whitespace-separated words are used as a rough proxy.
words_per_tweet = df["text"].str.split().apply(len)
df["Words Per Tweet"] = words_per_tweet
df.boxplot(
    column="Words Per Tweet",
    by="label_name",
    grid=False,
    showfliers=False,
    color="black",
)
# Clear the automatic "Boxplot grouped by ..." super-title and the x-axis label
plt.suptitle("")
plt.xlabel("")
plt.show()

The average tweet length of ~15 words is well below the maximum context size of transformer models, so we can proceed without worrying about truncating text and losing information.

In [None]:
# Undo the earlier `set_format(type="pandas")`: the pandas output format is no longer needed
emotions.reset_format()

# Tokenization

In [None]:
# Checkpoint name; the same name is used to load the model further down
model_ckpt = "distilbert-base-uncased"
# Load the tokenizer that belongs to this checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Display the maximum sequence length reported by the tokenizer
tokenizer.model_max_length

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch.

    ``padding=True`` pads every sequence to the longest one in the batch and
    ``truncation=True`` cuts sequences that exceed the model's maximum length.
    Returns whatever the tokenizer returns for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded


# demo on the first two training examples
demo_batch = emotions["train"][:2]
print(tokenize(demo_batch))

In [None]:
# Tokenize every split. With batched=True and batch_size=None each split is handed to
# `tokenize` as one single batch, so padding is computed over the whole split rather
# than per mini-batch (all rows of a split end up with the same length).
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

## Training a text classifier

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained weights for the checkpoint and move the model to the chosen device
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)


In [None]:
# Dummy example: encode a single sentence and return PyTorch tensors
text = "this is a test"
inputs = tokenizer(text, return_tensors="pt")
input_shape = inputs["input_ids"].size()
print(f"input tensor shape: {input_shape}")
# torch.Size([1, 6]) --> [batch size, tokens]

In [None]:
# Move every input tensor to the same device as the model
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
# Inference only: no_grad() switches off gradient tracking, which reduces memory use
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

In [None]:
# Display the shape of the last hidden state
outputs.last_hidden_state.size()
# [batch, tokens, hidden_dim]: a 768-dim vector is returned for each of the 6 input tokens.
# For classification, it is common practice to use the hidden state associated with the
# [CLS] token as the input feature; [CLS] is at the start of each sequence.


In [None]:
# Index 0 along the token axis selects the first token of every sequence; display its shape
outputs.last_hidden_state[:,0].size()

In [None]:
# retrieve last hidden state for my emotions dataset
def extract_hidden_states(batch):
    """Return the first-token ([CLS]) hidden state for every example in ``batch``.

    Only the columns the tokenizer lists in ``model_input_names`` are passed to the
    model, after being moved to ``device``. The result is a dict with a single key,
    ``"hidden_state"``, holding a NumPy array: when used with batched inputs, ``map``
    needs the function to return Python or NumPy objects.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        hidden = model(**model_inputs).last_hidden_state

    # Position 0 along the token axis is the [CLS] token
    cls_vectors = hidden[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [None]:
# The model expects tensors as input, so expose input_ids, attention_mask and label
# in "torch" format.
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# Now the hidden states can be extracted for every split.
# batched=True without an explicit batch_size uses the `map` default of 1000 rows per call.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True) # default batch size 1000 used

In [None]:
# Sanity check: `extract_hidden_states` should have added a "hidden_state" column
emotions_hidden["train"].column_names

In [None]:
# Build the feature matrices (hidden states) and target vectors (labels) used to train
# and validate the classifier
train_split = emotions_hidden["train"]
valid_split = emotions_hidden["validation"]

X_train = np.array(train_split["hidden_state"])
y_train = np.array(train_split["label"])
X_valid = np.array(valid_split["hidden_state"])
y_valid = np.array(valid_split["label"])

# Display the shapes as a quick consistency check
X_train.shape, y_train.shape

In [None]:
# Recheck the input by visualizing it: project the hidden states down to 2-D with UMAP.
# (UMAP and MinMaxScaler are imported in the notebook's top import cell.)

# Scale every feature to the [0, 1] range
X_scaled = MinMaxScaler().fit_transform(X_train)

# Reduce dimensionality. UMAP is stochastic; fixing random_state makes the embedding
# (and the plots derived from it) reproducible across runs.
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)

# Create a DataFrame of the 2-D embeddings together with the class label of each point
db_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
db_emb["label"] = y_train
db_emb.head()

In [None]:
# @title X vs Y

# Scatter plot of the 2-D UMAP embedding, all classes together.
# `plt` is already imported in the notebook's import cell, so it is not re-imported here;
# the Axes returned by the plot call is used directly instead of plt.gca().
ax = db_emb.plot(
    kind="scatter",
    x="X",
    y="Y",
    s=32,
    alpha=0.8,
    title="UMAP projection of the [CLS] hidden states",
)
# Hide the top and right frame lines for a cleaner look
ax.spines[["top", "right"]].set_visible(False)

In [None]:
# One hexbin density panel per emotion label, laid out on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(7, 5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
emotions_labels = emotions["train"].features["label"].names

for i, (ax, label, cmap) in enumerate(zip(axes, emotions_labels, cmaps)):
    # Keep only the embedded points whose integer label equals this panel's class id
    db_emb_filtered = db_emb[db_emb["label"] == i]
    ax.hexbin(
        db_emb_filtered["X"],
        db_emb_filtered["Y"],
        cmap=cmap,
        gridsize=20,
        linewidths=(0,),
    )
    ax.set_title(label)
    # Tick values of the embedding axes are not informative, so hide them
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# Train a logistic-regression classifier on the extracted hidden states.
# max_iter is raised well above scikit-learn's default, presumably to give the solver
# enough iterations to converge on these features.
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)
# Display the score on the validation split
lr_clf.score(X_valid, y_valid)

In [None]:
# Compare this to a baseline classifier that always predicts the most frequent
# training label, regardless of the input features
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
# Display the baseline's score on the validation split
dummy_clf.score(X_valid, y_valid)

In [None]:
def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a confusion matrix normalized over the true labels.

    Args:
        y_preds: Predicted class ids.
        y_true: Ground-truth class ids.
        labels: Display names for the classes, used as tick labels.
    """
    # normalize="true" divides each row by its total, so rows sum to 1
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm, display_labels=labels
    ).plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()


# Evaluate the logistic-regression classifier on the validation split
y_preds = lr_clf.predict(X_valid)
label_names = emotions["train"].features["label"].names
plot_confusion_matrix(y_preds, y_valid, label_names)

## Fine-Tuning