In [2]:
# Step 1: Setup and Dependencies

!pip install --upgrade pip
!pip install torch transformers==4.33.0 datasets==2.14.5

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting transformers==4.33.0
  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
Collecting datasets==2.14.5
  Downloading datasets-2.14.5-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.33.0)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.5)
 

In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset

print("Dependencies installed and loaded.")

# Step 2: Load and Inspect Dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train[:5%]')

# Show sample text
print("\nSample text from dataset:")
print(dataset[0]['text'][:500])

# Step 3: Tokenize Text for Self-Supervised Learning
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenized_texts = [tokenizer(t['text'], truncation=False, padding=False, add_special_tokens=True) for t in dataset]

input_ids = []
for item in tokenized_texts:
    input_ids.extend(item['input_ids'])

input_ids = torch.tensor(input_ids)
print(f"\nTotal tokens available: {len(input_ids)}")

# Step 4: Create Input-Label Pairs (Self-Supervised Learning)
window_size = 16
sequences = [input_ids[i:i+window_size+1] for i in range(0, len(input_ids)-window_size-1)]

inputs = torch.stack([seq[:-1] for seq in sequences])
labels = torch.stack([seq[1:] for seq in sequences])

print(f"\nInput shape: {inputs.shape}")
print(f"Label shape: {labels.shape}")

# Step 5: Initialize Model and Training Loop
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = DataLoader(TensorDataset(inputs[:1000], labels[:1000]), batch_size=8, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
print("\nStarting small-scale training loop...")
for epoch in range(1):
    for batch in train_loader:
        b_input_ids, b_labels = batch
        b_input_ids, b_labels = b_input_ids.to(device), b_labels.to(device)

        outputs = model(input_ids=b_input_ids, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1} Loss: {loss.item():.4f}")

# Step 6: Inference and Masked Token Prediction
model.eval()

text = "[CLS] Machine learning is fascinating because it allows computers to learn from data [SEP]"
tokenized_input = tokenizer(text, return_tensors='pt').to(device)

input_ids = tokenized_input['input_ids'].clone()
input_ids[0, 3] = tokenizer.mask_token_id  # Manually mask "learning"

with torch.no_grad():
    output = model(input_ids=input_ids).logits

predicted_token_id = output[0, 3].argmax(dim=-1)
predicted_word = tokenizer.decode(predicted_token_id)

print(f"\nPredicted word: '{predicted_word}' | Actual word: 'learning'")

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


Dependencies installed and loaded.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]


Sample text from dataset:





vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (647 > 512). Running this sequence through the model will result in indexing errors



Total tokens available: 121443

Input shape: torch.Size([121426, 16])
Label shape: torch.Size([121426, 16])


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Starting small-scale training loop...
Epoch 1 Loss: 1.0224

Predicted word: 'l e a r n i n g' | Actual word: 'learning'
