In [1]:
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sentencepiece

In [None]:
!pip install datasets

In [None]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis")

In [None]:
text = """Dear Amazon, last week I ordered an Optimus Prime action figure \
from your online store in Germany. Unfortunately, when I opened the package, \
I discovered to my horror that I had been sent an action figure of Megatron \
instead! As a lifelong enemy of the Decepticons, I hope you can understand my \
dilemma. To resolve the issue, I demand an exchange of Megatron for the \
Optimus Prime figure I ordered. Enclosed are copies of my records concerning \
this purchase. I expect to hear from you soon. Sincerely, Bumblebee."""

In [None]:
import pandas as pd

outputs = classifier(text)
pd.DataFrame.from_records(outputs)

Named Entity Recognition (NER)

In [None]:

ner_tagger = pipeline("ner", aggregation_strategy="simple")
outputs = ner_tagger(text)

In [None]:
pd.DataFrame.from_records(outputs)

In [None]:
reader = pipeline("question-answering")
question = "What does the customer want?"
outputs = reader(question=question, context=text)
pd.DataFrame.from_records([outputs])

In [None]:
# Summarize the customer complaint with the default summarization pipeline.
summarizer = pipeline("summarization")
outputs = summarizer(text, max_length=65, clean_up_tokenization_spaces=True)
# Print the summary string itself; unpacking it with `*` would print every
# character as a separate argument, i.e. separated by spaces.
print(outputs[0]['summary_text'])

In [None]:
translator = pipeline("translation_en_to_de", model="t5-small")

In [None]:
text = "Hello world, this is just a demo text"

In [None]:
outputs = translator(text, clean_up_tokenization_spaces=True, min_length=1)

In [None]:
print(outputs)

In [None]:
from datasets import list_datasets

In [None]:
all_datasets = list_datasets()
print(f"currently there are {len(all_datasets)} in the datasets")
print(f"the top ten datsets are {all_datasets[:10]}")

In [None]:
print(*all_datasets[23:50],sep="\t\n")

In [None]:
from datasets import load_dataset
emotions = load_dataset("emotion")

In [None]:
emotions.get('train')

In [None]:
train_ds=  emotions["train"]

In [None]:
train_ds[1]

In [None]:
!pip install pyarrow

In [None]:
import pyarrow as pa

In [None]:
import pandas as pd
df  = pd.DataFrame(train_ds[:10
                      ])

In [None]:
tables = pa.Table.from_pandas(df)

Clearly pyarrow tables are much more memory efficient

In [None]:
import sys
sys.getsizeof(tables),sys.getsizeof(df)

In [None]:
print(train_ds.features, sep='\n')

In [None]:
print(train_ds[:5].items())

In [None]:
dataset_url = "https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?"
!wget{dataset_url}

In [None]:
import pandas as pd

emotions.set_format(type = 'pandas')
df = emotions["train"][:]

df.head()

In [None]:
def label_int2str(row):
  """Convert an integer class id to its label name using the train split's `label` feature."""
  label_feature = emotions["train"].features["label"]
  return label_feature.int2str(row)

In [None]:
df["label_name"] = df["label"].apply(label_int2str)
df.head()

In [None]:
import matplotlib.pyplot as plt



In [None]:
df["label_name"].value_counts(ascending=True).plot.barh()
plt.title("Frequency of Classes")
plt.show()

How long are our tweets?

In [None]:
df["Words Per Tweet"] = df["text"].str.split().apply(len)

In [None]:
df.boxplot("Words Per Tweet", by="label_name", grid=False,showfliers=False,color="black")
plt.suptitle("")
plt.xlabel("")
plt.show()

Using pretrained tokenizers

In [None]:
from transformers import AutoTokenizer

model_name = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
tokenizer

In [None]:
# We can examine a few attributes of the tokenizer, such as the vocab size

tokenizer.vocab_size

In [None]:
tokenizer.special_tokens_map

In [None]:
tokenizer.model_max_length

In [None]:
#check the encoding and decoding

encoded_str =  tokenizer.encode("this is a complicatedtest")
encoded_str

In [None]:
for token in encoded_str:
  print(token,tokenizer.decode([token]))

## Training a Text Classifier

In [None]:
import torch

In [None]:
!pip install einops

In [None]:

#Using Pretrained Models

from transformers import AutoModel

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## Since the model is really large,
## we shard its checkpoint to avoid overloading RAM

In [None]:
import gc
import os
import tempfile

# Load the checkpoint once, write it back out as small shards, then reload it
# from those shards and move it to the target device.
model = AutoModel.from_pretrained('facebook/bart-large-mnli')
with tempfile.TemporaryDirectory() as tmp_dir:
  model.save_pretrained(tmp_dir, max_shard_size= "124MB")
  # Release the unsharded copy before reloading; keeping it referenced would
  # hold two full copies of the weights in memory at the same time.
  del model
  gc.collect()
  new_model = AutoModel.from_pretrained(tmp_dir).to(device)

In [None]:
#Extracting the last hidden states

text = "this is a test"
text_tensor = tokenizer.encode(text, return_tensors="pt").to(device)

In [None]:
text_tensor

# This output is the hidden state and not any prediction

In [None]:
output = new_model(text_tensor)

In [None]:
output.last_hidden_state.shape

#([batch_size, n_tokens, hidden_dim])

Tokenizing the whole dataset

In [None]:
def tokenize(batch):
  """Tokenize the `text` entries of a batch with padding and truncation enabled."""
  texts = batch["text"]
  return tokenizer(texts, padding=True, truncation=True)

In [None]:
#we set the output format of the dataset to pandas so that the accessed data is returned 
## as a DF. we dont need that now


emotions.reset_format()

In [None]:
tokenize(emotions["train"][:3])

In [None]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size = None)

In [None]:
emotions_encoded["train"].features['input_ids']

In [None]:
import numpy as np


def forward_pass(batch):
  """Run `new_model` on a tokenized batch and attach mean-pooled hidden states.

  Reads `batch["input_ids"]` and `batch["attention_mask"]`, computes the model's
  last hidden state without tracking gradients, averages it over axis 1 while
  ignoring positions whose attention mask is 0, and stores the result under
  `batch["hidden_state"]`.

  Returns:
    The same `batch` object, mutated to include the `hidden_state` entry.
  """
  ids_t = torch.tensor(batch["input_ids"]).to(device)
  attn_t = torch.tensor(batch['attention_mask']).to(device)

  with torch.no_grad():
    model_out = new_model(ids_t, attn_t)
  hidden = model_out.last_hidden_state.cpu().numpy()

  # np.ma hides entries whose mask is True, so flag the positions where
  # attention_mask == 0 and copy that flag across the hidden dimension.
  hidden_dim = hidden.shape[-1]
  padding = ~np.array(batch["attention_mask"]).astype(bool)
  padding = np.repeat(padding[:, :, np.newaxis], hidden_dim, axis=2)

  pooled = np.ma.array(hidden, mask=padding).mean(axis=1)
  batch["hidden_state"] = pooled.data
  return batch
emotions_encoded = emotions_encoded.map(forward_pass, batched=True,
batch_size=16)

In [None]:
emotions_encoded["train"].features

Create a feature matrix

In [None]:
import numpy as np

X_train = np.array(emotions_encoded["train"]["hidden_state"])
X_valid = np.array(emotions_encoded["validation"]['hidden_state'])

y_train = np.array(emotions_encoded['train']['label'])
y_valid = np.array(emotions_encoded['validation']['label'])


In [None]:
X_train.shape, X_valid.shape

In [None]:
!pip install umap-learn

In [None]:
import pandas as pd

In [None]:
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X_train)
mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
df_emb = pd.DataFrame(mapper.embedding_, columns=['X', 'Y'])
df_emb['label'] = y_train
df_emb.head()

In [None]:
import matplotlib.pyplot as plt


In [None]:
# One 2-D density panel per emotion class, each drawn with its own colormap.
fig, axes = plt.subplots(2, 3, figsize=(7,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
labels = emotions["train"].features["label"].names

for i, (label, cmap) in enumerate(zip(labels, cmaps)):
  df_emb_sub = df_emb.query(f"label=={i}")
  # Plot the embedding coordinates as they are: raising X to a fractional power
  # warps the horizontal axis and turns any negative coordinate into NaN.
  axes[i].hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap, gridsize=30, linewidths=(0,))
  axes[i].set_title(label)
  # The projected axes carry no interpretable units, so hide the ticks.
  axes[i].set_xticks([])
  axes[i].set_yticks([])
plt.tight_layout()
plt.show()



#Clearly a strong pattern is visible among the negative as well as the positive emotions

Let's train a simple classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

lr_clf  = LogisticRegression(max_iter=3000,verbose=1,n_jobs=10)
lr_clf.fit(X_train, y_train)
lr_clf.score(X_valid, y_valid)

In [None]:
lr_clf.classes_

In [None]:
# Although 68% is quite low, let's compare it with a dummy classifier that always
# chooses the most frequent class, which yields an accuracy of about 35%


from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
def plot_confusion_matrix(y_preds, y_true, labels):
  """Draw a confusion matrix normalized over the true labels.

  Args:
    y_preds: predicted class ids.
    y_true: ground-truth class ids.
    labels: class names used as the display labels.
  """
  normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
  fig, ax = plt.subplots(figsize=(6, 6))
  ConfusionMatrixDisplay(
      confusion_matrix=normalized_cm, display_labels=labels
  ).plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
  plt.title("Normalized confusion matrix")
  plt.show()
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds, y_valid, labels)

Fine-Tuning Transformers

In [None]:
#Loading a pretrained model

from transformers import AutoModelForSequenceClassification
import torch

In [None]:
torch.device

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()
num_labels = 6
model_ckpt = "facebook/bart-large-mnli"
model = (AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = num_labels,ignore_mismatched_sizes=True).to(device))

In [None]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
  """Return accuracy and weighted F1 for a prediction object.

  `pred` must expose `label_ids` (true labels) and `predictions` (per-class
  scores); the predicted class is the argmax over the last axis.
  """
  y_true = pred.label_ids
  y_hat = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(y_true, y_hat),
      "f1": f1_score(y_true, y_hat, average="weighted"),
  }

In [None]:
from huggingface_hub import notebook_login

In [None]:
notebook_login()

In [None]:
#example of TrainingArguments in all its glory

model_ckpt = "facebook/bart_large_mnli"

from transformers import Trainer, TrainingArguments

batch_size = 2
logging_steps = len(emotions_encoded["train"]) //batch_size

model_name = f"{model_ckpt}_finetuned_emotion"
training_args = TrainingArguments(output_dir = model_name,
                                  num_train_epochs=2,
                                  learning_rate = 2e-5,
                                  per_device_train_batch_size = batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay = 0.01,
                                  evaluation_strategy = "epoch",
                                  disable_tqdm = False,
                                  logging_steps = logging_steps,
                                  push_to_hub = True,
                                  log_level = "error"
                                  )

In [None]:
torch.cuda.is_available()


In [None]:
# NOTE(review): `!` runs this command in a separate shell process, so the
# exported variable presumably never reaches the notebook kernel (or torch).
# Setting it via `%env` / `os.environ` before the first CUDA allocation is the
# usual approach. TODO confirm, and check that the value 2 is accepted.
!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:2'

In [None]:
from importlib import reload
reload(torch)

In [None]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics = compute_metrics,
                  train_dataset = emotions_encoded["train"],
                  eval_dataset = emotions_encoded["validation"],
                  tokenizer = tokenizer)
import gc
gc.collect()
trainer.train()