[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/1U-JydW1QtjyvnRkH7gx6SR-P4byU-Rt-/view?usp=share_link)

# Simple Transformer Tutorial
This notebook demonstrates how to use pretrained Transformer models (not large LLMs) for tasks like text classification, masked language modeling, and feature extraction using the Hugging Face Transformers library.

## 1. Install and Import Dependencies

In [1]:
!pip install --quiet transformers torch datasets

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## 2. Text Classification with a Pretrained Transformer

In [2]:
# Load a sentiment-analysis pipeline using DistilBERT fine-tuned on SST-2
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Test on example sentences
sentences = [
    "I love this tutorial! It's very helpful.",
    "This is the worst movie I've ever seen."
]
results = classifier(sentences)
print(results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.99985671043396}, {'label': 'NEGATIVE', 'score': 0.9997653365135193}]


## 3. Masked Language Modeling

In [3]:
# Load a fill-mask pipeline with BERT
fill_mask = pipeline('fill-mask', model='bert-base-uncased')
# [MASK] token for BERT
masked_sentence = "Transformers are amazing, they [MASK] state-of-the-art performance."

# Get predictions
mlm_results = fill_mask(masked_sentence)
for res in mlm_results:
    print(res)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cuda:0


{'score': 0.24512915313243866, 'token': 2031, 'token_str': 'have', 'sequence': 'transformers are amazing, they have state - of - the - art performance.'}
{'score': 0.13516122102737427, 'token': 2079, 'token_str': 'do', 'sequence': 'transformers are amazing, they do state - of - the - art performance.'}
{'score': 0.10090019553899765, 'token': 3749, 'token_str': 'offer', 'sequence': 'transformers are amazing, they offer state - of - the - art performance.'}
{'score': 0.09457143396139145, 'token': 3073, 'token_str': 'provide', 'sequence': 'transformers are amazing, they provide state - of - the - art performance.'}
{'score': 0.09336068481206894, 'token': 2024, 'token_str': 'are', 'sequence': 'transformers are amazing, they are state - of - the - art performance.'}


## 4. Feature Extraction

In [6]:
# Use a transformer as a feature extractor
feature_extractor = pipeline('feature-extraction', model='distilbert-base-uncased')
text = "Transformers provide contextual embeddings"
features = feature_extractor(text)
# features is a nested list: [batch_size, sequence_length, hidden_size]
print(f"Shape: {len(features)}, {len(features[0])}, {len(features[0][0])}")


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cuda:0


Shape: 1, 10, 768


## 5. (Optional) Fine-Tuning on SST-2

In [13]:
# 5. Fine-Tuning on a Tiny Dummy Dataset (no external downloads)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# 5.1 Prepare a tiny in-memory dataset
texts  = ["I enjoy this.", "I dislike that.", "This is great!", "Terrible experience."]
labels = [1, 0, 1, 0]

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.5, random_state=42
)

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model     = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2
)

# 5.2 Tokenize
train_enc = tokenizer(train_texts, padding=True, truncation=True, return_tensors='pt')
val_enc   = tokenizer(val_texts,   padding=True, truncation=True, return_tensors='pt')

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels    = labels
    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = SimpleDataset(train_enc, train_labels)
val_dataset   = SimpleDataset(val_enc,   val_labels)

# 5.3 Set up Trainer (without evaluation_strategy)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=1,
    report_to=['none'],          # ← disable all logging integrations (includes WandB)
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # still pass it in so evaluate() works
)

# 5.4 Train and then explicitly evaluate
trainer.train()
metrics = trainer.evaluate()
print(metrics)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
1,0.7133
2,0.62
3,0.5847


{'eval_loss': 0.8734979629516602, 'eval_runtime': 0.0242, 'eval_samples_per_second': 82.521, 'eval_steps_per_second': 41.261, 'epoch': 3.0}


## 6. Conclusion
You’ve seen how to leverage pretrained Transformer models for inference and even fine-tuning on your own data. Explore other models and tasks using the Hugging Face Model Hub!