In [1]:
import os
import torch as tc
from datasets import load_dataset
from transformers import (
AutoModel, AutoModelForSequenceClassification, 
Trainer, TrainingArguments, AutoTokenizer
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
test_model = "distilbert-base-uncased"
text = "Let's work on a coding problem!"

# AutoTokenizer helps stream-line the proces by using the 
# correct tokenizer for the model being loaded.
# It also ensures the text is pre-processed the same way
# the model was trained with
tokenizer = AutoTokenizer.from_pretrained(test_model)
tokens = tokenizer(
    text,
    padding="longest",
    truncation="longest_first",
    return_tensors="pt"
)
print(tokens)

{'input_ids': tensor([[  101,  2292,  1005,  1055,  2147,  2006,  1037, 16861,  3291,   999,
           102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [3]:
tokenizer.decode(tc.squeeze(tokens["input_ids"]), skip_special_tokens=True)

"let ' s work on a coding problem!"

In [4]:
model = AutoModel.from_pretrained(test_model)
outputs = model(**tokens)

# (batch_size, seq_len, hidden_dim)
embeddings = outputs.last_hidden_state
print(embeddings.shape)

torch.Size([1, 11, 768])


In [5]:
data = "imdb"
dataset = load_dataset(data)

# Applies the tokenizer to all samples
imdb_tokenized = dataset.map(
    lambda x: tokenizer(x["text"], truncation=True, padding="longest"), 
    batched=True
)
print(imdb_tokenized)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [6]:
# Loads DistilBERT with a classification head on top
model = AutoModelForSequenceClassification.from_pretrained(test_model, num_labels=2)

# Arguments that will be passed into training
# Controls how training is done
training_args = TrainingArguments(
    output_dir="../results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=1
)

# High training wrapper for Hugging Face
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=imdb_tokenized["train"].shuffle(seed=42).select(range(2000)),
    eval_dataset=imdb_tokenized["test"].shuffle(seed=42).select(range(1000)),
    processing_class=tokenizer
)

# Trains model; logs everything to "../results"
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.276476


TrainOutput(global_step=250, training_loss=0.4108782043457031, metrics={'train_runtime': 767.4768, 'train_samples_per_second': 2.606, 'train_steps_per_second': 0.326, 'total_flos': 264934797312000.0, 'train_loss': 0.4108782043457031, 'epoch': 1.0})

In [7]:
preds = trainer.predict(imdb_tokenized["test"].select(range(50)))
print(preds.metrics)

{'test_loss': 0.3424946665763855, 'test_runtime': 3.0782, 'test_samples_per_second': 16.243, 'test_steps_per_second': 2.274}


In [22]:
model_path = "../results/checkpoint-250"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

new_text = "That movie was absolutely fantastic!"
inputs = tokenizer(new_text, return_tensors="pt", truncation=True, padding=True)

model.eval()
with tc.no_grad():
    outputs = model(**inputs)
    
probs = tc.nn.functional.softmax(outputs.logits, dim=-1)
predicted_class = tc.argmax(probs, dim=-1).item()
labels = ["negative", "positive"]

print(f"Logits: {outputs.logits}")
print(f"Probabilities: {probs}")
print(f"Predicted sentiment: {labels[predicted_class]} ({probs[0][predicted_class]:.2f} confidence)")

Logits: tensor([[-1.8832,  1.2845]])
Probabilities: tensor([[0.0404, 0.9596]])
Predicted sentiment: positive (0.96 confidence)
