In [7]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

🔤 What is Embedding?


Embedding is a technique used to convert high-dimensional or symbolic data (like words, images, or items) into a low-dimensional vector (a list of numbers). These vectors capture semantic or structural meaning of the original data, and they are used as input to machine learning models, especially in Natural Language Processing (NLP) and Recommendation Systems

💡 Why Use Embeddings?


Because:

Machines don’t understand text or images directly.

They need numerical representations.

Embeddings help capture relationships and similarities.

# Step 1: Input text

In [18]:
from transformers import AutoTokenizer, AutoModel
import torch


text = "I am Ashvinkumar bari working on AI Eng Position"




# Step 2: Load tokenizer and model (we'll use BERT as an example)

In [19]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")


# Step 3: Tokenization

In [20]:


tokens = tokenizer.tokenize(text)
print("🔹 Tokens:", tokens)


🔹 Tokens: ['i', 'am', 'ash', '##vin', '##kumar', 'bari', 'working', 'on', 'ai', 'eng', 'position']



# Step 4: Convert tokens to token IDs

In [21]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("🔹 Token IDs:", token_ids)


🔹 Token IDs: [1045, 2572, 6683, 6371, 18494, 22466, 2551, 2006, 9932, 25540, 2597]



# Step 5: Convert token IDs to input tensor

In [22]:
inputs = tokenizer(text, return_tensors="pt")  # automatically adds special tokens like [CLS] and [SEP]
print(inputs)

{'input_ids': tensor([[  101,  1045,  2572,  6683,  6371, 18494, 22466,  2551,  2006,  9932,
         25540,  2597,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


# Step 6: Generate embeddings from the model

In [23]:
with torch.no_grad():
    outputs = model(**inputs)

#print(outputs)

# Step 7: Extract embeddings (from the last hidden state)

In [24]:
#Shape: [batch_size, sequence_length, embedding_dimension]
embeddings = outputs.last_hidden_state

print("🔹 Embeddings shape:", embeddings.shape)

🔹 Embeddings shape: torch.Size([1, 13, 768])



# Optional: View embeddings of each token

In [25]:

for token, embedding_vector in zip(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]), embeddings[0]):
    print(f"{token:10s} → Embedding vector shape: {embeddings.shape}")


[CLS]      → Embedding vector shape: torch.Size([1, 13, 768])
i          → Embedding vector shape: torch.Size([1, 13, 768])
am         → Embedding vector shape: torch.Size([1, 13, 768])
ash        → Embedding vector shape: torch.Size([1, 13, 768])
##vin      → Embedding vector shape: torch.Size([1, 13, 768])
##kumar    → Embedding vector shape: torch.Size([1, 13, 768])
bari       → Embedding vector shape: torch.Size([1, 13, 768])
working    → Embedding vector shape: torch.Size([1, 13, 768])
on         → Embedding vector shape: torch.Size([1, 13, 768])
ai         → Embedding vector shape: torch.Size([1, 13, 768])
eng        → Embedding vector shape: torch.Size([1, 13, 768])
position   → Embedding vector shape: torch.Size([1, 13, 768])
[SEP]      → Embedding vector shape: torch.Size([1, 13, 768])


# Optional: View embeddings of each token

In [26]:

for token, embedding_vector in zip(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]), embeddings[0]):
    print(f"{token:10s} → Embedding vector shape: {embeddings.shape}")

[CLS]      → Embedding vector shape: torch.Size([1, 13, 768])
i          → Embedding vector shape: torch.Size([1, 13, 768])
am         → Embedding vector shape: torch.Size([1, 13, 768])
ash        → Embedding vector shape: torch.Size([1, 13, 768])
##vin      → Embedding vector shape: torch.Size([1, 13, 768])
##kumar    → Embedding vector shape: torch.Size([1, 13, 768])
bari       → Embedding vector shape: torch.Size([1, 13, 768])
working    → Embedding vector shape: torch.Size([1, 13, 768])
on         → Embedding vector shape: torch.Size([1, 13, 768])
ai         → Embedding vector shape: torch.Size([1, 13, 768])
eng        → Embedding vector shape: torch.Size([1, 13, 768])
position   → Embedding vector shape: torch.Size([1, 13, 768])
[SEP]      → Embedding vector shape: torch.Size([1, 13, 768])


In [28]:

for token, embedding_vector in zip(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]), embeddings[0]):
    print(f"{token:10s} → Embedding vector shape: {embeddings}")

[CLS]      → Embedding vector shape: tensor([[[-0.3441,  0.2321,  0.1651,  ..., -0.3527,  0.4396,  0.2655],
         [ 0.6251, -0.3805, -0.5655,  ..., -0.1628,  0.9383,  0.1508],
         [ 0.2569,  0.1752,  0.3612,  ..., -0.0098,  0.5865,  0.4507],
         ...,
         [-0.2569,  0.0353,  0.4537,  ..., -0.7675,  0.2623, -0.4840],
         [-0.2360, -0.2788,  0.0153,  ...,  0.3330,  0.2637, -0.3080],
         [ 0.7282, -0.0205, -0.1759,  ...,  0.2141, -0.5070, -0.3118]]])
i          → Embedding vector shape: tensor([[[-0.3441,  0.2321,  0.1651,  ..., -0.3527,  0.4396,  0.2655],
         [ 0.6251, -0.3805, -0.5655,  ..., -0.1628,  0.9383,  0.1508],
         [ 0.2569,  0.1752,  0.3612,  ..., -0.0098,  0.5865,  0.4507],
         ...,
         [-0.2569,  0.0353,  0.4537,  ..., -0.7675,  0.2623, -0.4840],
         [-0.2360, -0.2788,  0.0153,  ...,  0.3330,  0.2637, -0.3080],
         [ 0.7282, -0.0205, -0.1759,  ...,  0.2141, -0.5070, -0.3118]]])
am         → Embedding vector shape: tenso

In [31]:

# Input text
text = "Ashvin  is a Eng"

# Tokenize and convert to tensor
inputs = tokenizer(text, return_tensors="pt")

# Forward pass to get embeddings
with torch.no_grad():
    outputs = model(**inputs)

# Extract embeddings (last hidden state)
# Shape: [1, sequence_length, embedding_dim]
embeddings = outputs.last_hidden_state

# Optional: Convert to one sentence vector by averaging token embeddings (excluding [CLS] and [SEP])
sentence_embedding = embeddings[0][1:-1].mean(dim=0)

print("🔹 Sentence embedding shape:", sentence_embedding.shape)
print("🔹 Sentence embedding (first 5 values):", sentence_embedding[:20])


🔹 Sentence embedding shape: torch.Size([768])
🔹 Sentence embedding (first 5 values): tensor([ 0.0643, -0.0413, -0.1851, -0.4175,  0.3276, -0.4492,  0.1968,  0.3278,
        -0.2401,  0.2177,  0.0584, -0.0746, -0.1477,  0.7629,  0.7641, -0.3194,
         0.2162,  0.1506, -0.1022,  0.3476])
