In [1]:
import torch
from transformers import BertTokenizer, BertModel, BertConfig

# Load a pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define BERT configuration
config = BertConfig(
    hidden_size=768,  # Size of the hidden layers
    num_hidden_layers=12,  # Number of transformer blocks
    num_attention_heads=12,  # Number of attention heads
    intermediate_size=3072,  # Size of the feed-forward layer
)

# Construct BERT model from config
model = BertModel(config)

# Display model architecture
print(model)

# Example text input
text = "BERT is a powerful transformer model."
inputs = tokenizer(text, return_tensors="pt")

# Forward pass through BERT
outputs = model(**inputs)

# Extract hidden states
hidden_states = outputs.last_hidden_state
print("Hidden States Shape:", hidden_states.shape)




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

The hidden states tensor extracted from BERT in the previous example can be used in several practical applications. Below are some key use cases along with code snippets demonstrating how to leverage this tensor.

## Using BERT Hidden States in Practical Applications

The **hidden states tensor** extracted from BERT can be used in several practical applications. Below are some key use cases along with **code snippets** demonstrating how to leverage this tensor.

## **1. Text Classification**
The hidden states from BERT can be used as **features** for a classification task.

Applications:
- Sentiment analysis (positive/negative review classification)
- Spam detection
- Document categorization

In [2]:
### **Example: Sentiment Analysis**

import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Encode input text
text = "BERT is an amazing transformer model!"
inputs = tokenizer(text, return_tensors="pt")

# Get hidden states
with torch.no_grad():
    outputs = model(**inputs)

hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_length, hidden_size]

# Use the [CLS] token representation for classification
cls_embedding = hidden_states[:, 0, :]  # Extract first token embedding

# Example: Pass through a classification head
classifier = nn.Linear(768, 2)  # Binary classification (e.g., positive/negative)
logits = classifier(cls_embedding)
print(logits)  # Output classification scores

tensor([[ 0.3253, -0.3554]], grad_fn=<AddmmBackward0>)


## **2. Named Entity Recognition (NER)**
BERT’s hidden states can be used for **token-level classification** tasks like identifying names, dates, or organizations.

Applications:
- Extracting named entities (e.g., names, locations, organizations)
- Legal and financial document analysis

In [5]:
import torch.nn.functional as F

# Simulating a classifier for each token
ner_classifier = nn.Linear(768, 5)  # Assume 5 entity classes (Person, Org, Date, etc.)

# Pass hidden states through classifier
token_logits = ner_classifier(hidden_states)
token_probs = F.softmax(token_logits, dim=-1)

print(token_probs.shape)  # Shape: [batch_size, seq_length, num_classes]

torch.Size([1, 10, 5])


## **3. Semantic Similarity / Sentence Embeddings**
BERT embeddings can be used to compare how **similar two sentences are**.

Applications:
- Document similarity search
- Duplicate question detection (e.g., Quora, StackOverflow)
- Recommendation systems

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

text1 = "BERT is a transformer model."
text2 = "BERT is used in NLP tasks."

inputs1 = tokenizer(text1, return_tensors="pt")
inputs2 = tokenizer(text2, return_tensors="pt")

# Get embeddings
with torch.no_grad():
    output1 = model(**inputs1).last_hidden_state[:, 0, :]
    output2 = model(**inputs2).last_hidden_state[:, 0, :]

# Compute similarity
similarity = cosine_similarity(output1.numpy(), output2.numpy())
print("Similarity Score:", similarity[0][0])

Similarity Score: 0.9290674


---

## **4. Question Answering**
BERT’s hidden states can help **extract answers from text**.

Applications:
- Chatbots and virtual assistants
- Automated document Q&A

In [4]:
from transformers import BertForQuestionAnswering

qa_model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

question = "What is BERT used for?"
context = "BERT is a deep learning model used in natural language processing."

# Encode question and context
inputs = tokenizer(question, context, return_tensors="pt")

# Get predictions
with torch.no_grad():
    outputs = qa_model(**inputs)

start_logits, end_logits = outputs.start_logits, outputs.end_logits

# Get the most likely start and end positions
start_idx = torch.argmax(start_logits)
end_idx = torch.argmax(end_logits) + 1

# Decode answer
answer = tokenizer.decode(inputs["input_ids"][0][start_idx:end_idx])
print("Answer:", answer)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Answer: 


## **Summary**
| Application | How Hidden States Are Used |
|-------------|---------------------------|
| **Text Classification** | Use `[CLS]` token embedding as a feature for classification |
| **Named Entity Recognition** | Use per-token hidden states to classify words into categories |
| **Semantic Similarity** | Compare sentence embeddings using cosine similarity |
| **Question Answering** | Identify answer spans using start/end token logits |
