In [1]:
import os
import torch as tc
from datasets import load_dataset
from transformers import (
AutoModelForSequenceClassification, 
Trainer, TrainingArguments, AutoTokenizer
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
device = tc.device("mps") if tc.backends.mps.is_available() else tc.device("cpu")

test_model = "distilbert-base-uncased"
text = "Let's work on a coding problem!"

# AutoTokenizer helps stream-line the proces by using the 
# correct tokenizer for the model being loaded.
# It also ensures the text is pre-processed the same way
# the model was trained with
tokenizer = AutoTokenizer.from_pretrained(test_model)
tokens = tokenizer(
    text,
    padding="longest",
    truncation="longest_first",
    return_tensors="pt"
)
print(tokens)

{'input_ids': tensor([[  101,  2292,  1005,  1055,  2147,  2006,  1037, 16861,  3291,   999,
           102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [3]:
tokenizer.decode(tc.squeeze(tokens["input_ids"]), skip_special_tokens=True)

"let ' s work on a coding problem!"

In [4]:
model = AutoModelForSequenceClassification.from_pretrained(
    test_model,
    output_attentions=True
)

# Move model and inputs to same device
model.to(device)
tokens = {k: v.to(device) for k, v in tokens.items()}

with tc.no_grad():
    outputs = model(**tokens)
    outputs_base = model.base_model(**tokens)
    
print(f"Last hidden state shape: {outputs_base.last_hidden_state.shape}")
print(f"Attention len: {len(outputs.attentions)}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Last hidden state shape: torch.Size([1, 11, 768])
Attention len: 6


In [5]:
data = "imdb"
dataset = load_dataset(data)

# Applies the tokenizer to all samples
imdb_tokenized = dataset.map(
    lambda x: tokenizer(x["text"], truncation=True, padding="longest"), 
    batched=True
)
print(imdb_tokenized)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [6]:
# Loads DistilBERT with a classification head on top
model = AutoModelForSequenceClassification.from_pretrained(
    test_model, 
    num_labels=2
)

# Arguments that will be passed into training
# Controls how training is done
training_args = TrainingArguments(
    output_dir="../results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=1
)

# High training wrapper for Hugging Face
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=imdb_tokenized["train"].shuffle(seed=42).select(range(2000)),
    eval_dataset=imdb_tokenized["test"].shuffle(seed=42).select(range(1000)),
    processing_class=tokenizer
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Trains model; logs everything to "../results"
trainer.train()

preds = trainer.predict(imdb_tokenized["test"].select(range(50)))
print(preds.metrics)

Epoch,Training Loss,Validation Loss
1,No log,0.291588


{'test_loss': 0.30576276779174805, 'test_runtime': 3.0964, 'test_samples_per_second': 16.148, 'test_steps_per_second': 2.261}


In [8]:
model_path = "../results/checkpoint-250"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

new_text = [
    "That movie was absolutely fantastic!",
    "This movie sucks",
    "I will never watch this again!",
    "Highly recommend to never watch again"
]
inputs = tokenizer(new_text, return_tensors="pt", truncation=True, padding=True)

model.eval()
with tc.no_grad():
    outputs = model(**inputs)
    
probs = tc.nn.functional.softmax(outputs.logits, dim=-1)
predicted_classes = tc.argmax(probs, dim=-1)
labels = ["negative", "positive"]

for text, pred_idx, prob in zip(new_text, predicted_classes, probs):
    print(f"Text: '{text}'")
    print(f"Probabilities: {prob}")
    print(f"Predicted sentiment: {labels[pred_idx]} ({prob[pred_idx].item():.2f} confidence)\n")

Text: That movie was absolutely fantastic!
Probabilities: tensor([0.0195, 0.9805])
Predicted sentiment: positive (0.98 confidence)

Text: This movie sucks
Probabilities: tensor([0.9403, 0.0597])
Predicted sentiment: negative (0.94 confidence)

Text: I will never watch this again!
Probabilities: tensor([0.8549, 0.1451])
Predicted sentiment: negative (0.85 confidence)

Text: Highly recommend to never watch again
Probabilities: tensor([0.2091, 0.7909])
Predicted sentiment: positive (0.79 confidence)

