In [1]:
import os
import json
import torch as tc
from datasets import load_dataset
from transformers import (
AutoModelForSequenceClassification, 
Trainer, TrainingArguments, AutoTokenizer
)
from nlp_model.utils import to_device, get_device, set_seed, generate_namespace
from nlp_model.model import ModelAndTokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
cfg = generate_namespace(path=f"../config.yaml")
print(json.dumps(vars(cfg), indent=2))

set_seed(cfg.seed)
device = get_device()

{
  "model_name": "distilbert-base-uncased",
  "text": "Let's work on a coding problem!",
  "seed": 42,
  "dpi": 400,
  "fig_path": "../outputs/",
  "res_path": "../results/"
}


In [3]:
inst = ModelAndTokenizer(cfg.model_name)
model = inst.load_classification_model(attentions=True)
inputs = inst.load_tokenizer(cfg.text)
print(inputs)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[  101,  2292,  1005,  1055,  2147,  2006,  1037, 16861,  3291,   999,
           102]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0')}


In [4]:
inst.decode_tokens(inputs["input_ids"], skip_special_tokens=True)[0]

"let ' s work on a coding problem!"

In [5]:
with tc.no_grad():
    outputs = model(**inputs)
    outputs_base = model.base_model(**inputs)
    
print(f"Last hidden state shape: {outputs_base.last_hidden_state.shape}")
print(f"Attention len: {len(outputs.attentions)}")

Last hidden state shape: torch.Size([1, 11, 768])
Attention len: 6


In [6]:
data = "imdb"
dataset = load_dataset(data)

# Applies the tokenizer to all samples
imdb_tokenized = inst.apply_tokenizer(dataset)
print(imdb_tokenized)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [7]:
# Loads DistilBERT with a classification head on top
model = inst.load_classification_model()

# Arguments that will be passed into training
# Controls how training is done
training_args = TrainingArguments(
    output_dir=cfg.res_path,
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=1
)

# High training wrapper for Hugging Face
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=imdb_tokenized["train"].shuffle(seed=42).select(range(2000)),
    eval_dataset=imdb_tokenized["test"].shuffle(seed=42).select(range(1000)),
    processing_class=inst.tokenizer
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Trains model; logs everything to "../results"
trainer.train()
preds = trainer.predict(imdb_tokenized["test"].select(range(50)))
print(preds.metrics)

Epoch,Training Loss,Validation Loss
1,No log,0.29947


{'test_loss': 0.4278165102005005, 'test_runtime': 3.4375, 'test_samples_per_second': 14.545, 'test_steps_per_second': 2.036}


In [11]:
new_inst = ModelAndTokenizer(cfg.res_path + "checkpoint-250")
new_model = new_inst.load_classification_model()

new_text = [
    "That movie was absolutely fantastic!",
    "This movie sucks",
    "I will never watch this again!",
    "Highly recommend to never watch again"
]
inputs = new_inst.load_tokenizer(new_text)

new_model.eval()
with tc.no_grad():
    outputs = new_model(**inputs)
    
probs = tc.nn.functional.softmax(outputs.logits.to("cpu"), dim=-1)
predicted_classes = tc.argmax(probs, dim=-1)
labels = ["negative", "positive"]

for text, pred_idx, prob in zip(new_text, predicted_classes, probs):
    print(f"Text: '{text}'")
    print(f"Probabilities: {prob}")
    print(f"Predicted sentiment: {labels[pred_idx]} ({prob[pred_idx].item():.2f} confidence)\n")

Text: 'That movie was absolutely fantastic!'
Probabilities: tensor([0.0242, 0.9758])
Predicted sentiment: positive (0.98 confidence)

Text: 'This movie sucks'
Probabilities: tensor([0.9261, 0.0739])
Predicted sentiment: negative (0.93 confidence)

Text: 'I will never watch this again!'
Probabilities: tensor([0.8690, 0.1310])
Predicted sentiment: negative (0.87 confidence)

Text: 'Highly recommend to never watch again'
Probabilities: tensor([0.0658, 0.9342])
Predicted sentiment: positive (0.93 confidence)

