In [3]:
from transformers import BertForMaskedLM, BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

input_text = f"The cat sat on the {tokenizer.mask_token} and looked out the window."
inputs = tokenizer(input_text, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
mask_token_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

top_k = 5
top_k_ids = torch.topk(logits[0, mask_token_index], k=top_k, dim=1).indices[0].tolist()
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_ids)

print("Top predictions:")
for i, token in enumerate(top_k_tokens, 1):
    print(f"{i}: {token}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top predictions:
1: floor
2: bed
3: couch
4: sofa
5: ground


In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable W&B logging

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# 1. Load dataset
dataset = load_dataset("imdb")
small_train = dataset["train"].select(range(500))
small_test = dataset["test"].select(range(100))

# 2. Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

small_train = small_train.map(tokenize, batched=True)
small_test = small_test.map(tokenize, batched=True)

small_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
small_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 3. Model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# 4. Training arguments (compatible with old version)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    logging_dir="./logs"
)

# 5. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test
)

# 6. Train
trainer.train()

# 7. Manual evaluation
print("\nEvaluation results:")
results = trainer.evaluate()
print(results)

# 8. Predict function
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label = "Positive ðŸ˜Š" if probs[0][1] > probs[0][0] else "Negative ðŸ˜ž"
    confidence = round(probs.max().item(), 2)
    return label, confidence

# 9. Test predictions
examples = [
    "This movie was amazing! I loved every moment.",
    "Worst movie ever. Total waste of time.",
    "It was okay, not great but not terrible either."
]

for example in examples:
    sentiment, conf = predict_sentiment(example)
    print(f"\nInput: {example}\nPrediction: {sentiment} (Confidence: {conf})")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss



Evaluation results:


{'eval_loss': 0.0063028051517903805, 'eval_runtime': 20.6497, 'eval_samples_per_second': 4.843, 'eval_steps_per_second': 0.339, 'epoch': 1.0}

Input: This movie was amazing! I loved every moment.
Prediction: Negative ðŸ˜ž (Confidence: 0.99)

Input: Worst movie ever. Total waste of time.
Prediction: Negative ðŸ˜ž (Confidence: 0.99)

Input: It was okay, not great but not terrible either.
Prediction: Negative ðŸ˜ž (Confidence: 0.98)
