**Tokenizers**

In [None]:
# whitespace_tokenizer.py
# Reads one line from the user and prints each whitespace-delimited token.

text = input("Enter a line: ")

# str.split() with no separator breaks on any run of whitespace
# (space, tab, newline) and never produces empty strings.
tokens = text.split()

print("\nTokens:")
if tokens:
    # One token per output line; nothing is printed for blank input.
    print("\n".join(tokens))


Enter a line: i am studing in third year of btech

Tokens:
i
am
studing
in
third
year
of
btech


In [None]:
# Sentence and Word Tokenization
text = "Tokenization is important. It is used in NLP. This is a simple example."

# Sentence tokenization (split by . ? !)
# Characters are accumulated until a terminator is seen; the accumulated
# text (terminator included) then becomes one sentence.
sentences = []
current = ""

for ch in text:
    current += ch
    if ch in ".!?":
        sentences.append(current.strip())
        current = ""

# Word tokenization
# Sentence terminators are removed first so they do not stick to the words.
words = text.replace(".", "").replace("!", "").replace("?", "").split()

print("Original Text:")
print(text)

print("\nSentence Tokens:")
for s in sentences:
    print(s)

print("\nWord Tokens:")
for w in words:
    print(w)


SyntaxError: invalid syntax (ipython-input-2666857653.py, line 7)

In [None]:
# Character-level tokenization: every character of the input is its own token.
text = input("Enter text: ")

print("\nCharacters as tokens:")
if text:
    # Joining the characters with newlines prints one character per line.
    print("\n".join(text))


Enter text: learning LLM is very intresting and exciting 

Characters as tokens:
l
e
a
r
n
i
n
g
 
L
L
M
 
i
s
 
v
e
r
y
 
i
n
t
r
e
s
t
i
n
g
 
a
n
d
 
e
x
c
i
t
i
n
g
 


In [None]:
#subword
def subword_tokenize(word, size=3):
    """Split ``word`` into consecutive, non-overlapping chunks.

    Args:
        word: The string to split.
        size: Maximum number of characters per chunk (default 3). The last
            chunk may be shorter when ``len(word)`` is not a multiple of
            ``size``.

    Returns:
        A list of chunks in left-to-right order; an empty list for an
        empty ``word``.

    Raises:
        ValueError: If ``size`` is smaller than 1. Without this check a
            negative size would silently yield no chunks at all.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [word[i:i+size] for i in range(0, len(word), size)]

text = input("Enter text: ")

# Chunk every whitespace-separated word and flatten the chunks into one list.
tokens = [piece for word in text.split() for piece in subword_tokenize(word)]

print("\nSubword tokens:")
if tokens:
    # One chunk per output line.
    print("\n".join(tokens))


Enter text: Tokenization improves understanding

Subword tokens:
Tok
eni
zat
ion
imp
rov
es
und
ers
tan
din
g


In [None]:
# Tokens with Stop-word Removal
# Lower-cases a fixed sentence, splits it on whitespace and drops every
# token that appears in a small hand-written stop-word set.

sentence = "This is a simple example sentence for tokenization and stop word removal"

stop_words = set("is a an the and or for to of in on this".split())

# Tokenization using whitespace (lower-cased so "This" matches "this")
tokens = sentence.lower().split()

# Stop-word removal, keeping the original token order
filtered_tokens = []
for candidate in tokens:
    if candidate in stop_words:
        continue
    filtered_tokens.append(candidate)

print("Sentence:")
print(sentence)

print("\nTokens after stop-word removal:")
if filtered_tokens:
    print("\n".join(filtered_tokens))


Sentence:
This is a simple example sentence for tokenization and stop word removal

Tokens after stop-word removal:
simple
example
sentence
tokenization
stop
word
removal


In [None]:
# Sentence and Word Tokenization

def split_sentences(text, terminators=".!?"):
    """Split ``text`` into sentences at terminator characters.

    Args:
        text: The text to split.
        terminators: Characters that end a sentence (default ``".!?"``).

    Returns:
        A list of sentences with surrounding whitespace stripped. Each
        sentence keeps its terminator. Text after the last terminator is
        returned as a final sentence instead of being dropped.
    """
    found = []
    current = ""
    for ch in text:
        current += ch
        if ch in terminators:
            found.append(current.strip())
            current = ""
    # Flush whatever follows the last terminator (e.g. "Hello. World").
    tail = current.strip()
    if tail:
        found.append(tail)
    return found


text = "Tokenization is important. It is used in NLP. This is a simple example."

# Sentence tokenization (split by . ? !)
sentences = split_sentences(text)

# Word tokenization
# Sentence terminators are removed first so they do not stick to the words.
words = text.replace(".", "").replace("!", "").replace("?", "").split()

print("Original Text:")
print(text)

print("\nSentence Tokens:")
for s in sentences:
    print(s)

print("\nWord Tokens:")
for w in words:
    print(w)


Original Text:
Tokenization is important. It is used in NLP. This is a simple example.

Sentence Tokens:
Tokenization is important.
It is used in NLP.
This is a simple example.

Word Tokens:
Tokenization
is
important
It
is
used
in
NLP
This
is
a
simple
example


In [None]:
#tokenization using nlp library spacy
import spacy

# Load the "en_core_web_sm" pipeline and run it over a fixed sentence.
nlp = spacy.load("en_core_web_sm")

doc = nlp("Balaji is studying well")

# Collect the text of every token the pipeline produced, in order.
tokens = []
for tok in doc:
    tokens.append(tok.text)
print(tokens)


['Balaji', 'is', 'studying', 'well']


**Embeddings**


In [None]:
# One-Hot Encoding example (Python)
# Each distinct word gets a vector with a single 1 at the word's position
# in the sorted vocabulary and 0 everywhere else.

sentence = "I love NLP"

# Word-level tokenization on whitespace
words = sentence.split()

# Sorted vocabulary of distinct words
vocab = sorted(set(words))

# Position of every vocabulary word
word_to_index = dict(zip(vocab, range(len(vocab))))

# Map each word (in sentence order) to its one-hot vector
one_hot_vectors = {
    word: [int(position == word_to_index[word]) for position in range(len(vocab))]
    for word in words
}

print("Sentence:", sentence)
print("Vocabulary:", vocab)

print("\nOne-Hot Vectors:")
for token, encoding in one_hot_vectors.items():
    print(token, "->", encoding)


Sentence: I love NLP
Vocabulary: ['I', 'NLP', 'love']

One-Hot Vectors:
I -> [1, 0, 0]
love -> [0, 0, 1]
NLP -> [0, 1, 0]


In [None]:
# Simple Bag of Words embedding
# Counts how often each vocabulary word occurs in the sentence.

sentence = "I like NLP"

# Lower-cased whitespace tokens
words = sentence.lower().split()

# Sort the distinct words so the vocabulary order (and therefore the vector
# layout) is the same on every run; iterating a bare set of strings gives an
# order that can change between interpreter sessions.
vocab = sorted(set(words))

# vector[k] is the number of occurrences of vocab[k] in the sentence
vector = [words.count(term) for term in vocab]

print("Sentence:", sentence)
print("Vocabulary:", vocab)
print("BoW Vector:", vector)



Sentence: I like NLP
Vocabulary: ['like', 'nlp', 'i']
BoW Vector: [1, 1, 1]


In [None]:
# GloVe embedding (no download, runs in Colab)

# Tiny hand-written lookup table of 3-dimensional vectors, used in place of
# downloaded GloVe vectors.
glove = {
    "i":    [0.1, 0.3, 0.5],
    "love": [0.8, 0.2, 0.6],
    "nlp":  [0.9, 0.7, 0.4],
    "ai":   [0.6, 0.9, 0.3]
}

sentence = "I love NLP"

# Lower-case before splitting because the table keys are lower-case
words = sentence.lower().split()

print("Sentence:", sentence)
print("\nGloVe Embeddings:\n")

# Show the vector for each word, or a placeholder when the word is unknown
for token in words:
    print(token, "->", glove.get(token, "Not found"))


Sentence: I love NLP

GloVe Embeddings:

i -> [0.1, 0.3, 0.5]
love -> [0.8, 0.2, 0.6]
nlp -> [0.9, 0.7, 0.4]


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
import numpy as np

# Demonstrates a Keras Embedding layer: tokenizes three sentences, pads them
# to a common length, pushes them through a small model once and prints the
# embedding matrix row for one word.
# NOTE(review): the model is never fitted in this cell, so the printed vector
# is presumably just the layer's initial weights — confirm this is intended.

# Sample sentences
sentences = [
    "I love machine learning",
    "Deep learning is powerful",
    "I love AI"
]

# Tokenization: build the word -> integer id mapping from the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Replace every word with its integer id
sequences = tokenizer.texts_to_sequences(sentences)
# One more than the number of known words; the printed vocabulary starts its
# ids at 1, so presumably id 0 is left for the padding value — TODO confirm.
vocab_size = len(tokenizer.word_index) + 1

# Padding: bring every sequence to the length of the longest one,
# appending the padding at the end ("post")
max_len = max(len(seq) for seq in sequences)
padded = pad_sequences(sequences, maxlen=max_len, padding="post")

# Build model (no input_length)
# Embedding produces an 8-dimensional vector per token id; Flatten and Dense
# reduce that to a single sigmoid output.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=8),
    Flatten(),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy")

# The result is discarded; this call appears to exist only so the layers
# have weights before get_weights() is used below — keep it ahead of that line.
model.predict(padded)

# Now get embedding weights safely
# First weight array of the first layer (the Embedding layer).
embedding_weights = model.layers[0].get_weights()[0]

print("Vocabulary:", tokenizer.word_index)
print("\nEmbedding matrix shape:", embedding_weights.shape)

# Print embedding for a word: its integer id indexes a row of the matrix
word = "love"
word_id = tokenizer.word_index[word]
print(f"\nEmbedding vector for '{word}':")
print(embedding_weights[word_id])



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
Vocabulary: {'i': 1, 'love': 2, 'learning': 3, 'machine': 4, 'deep': 5, 'is': 6, 'powerful': 7, 'ai': 8}

Embedding matrix shape: (9, 8)

Embedding vector for 'love':
[-0.02397566 -0.00282352 -0.04449521  0.00805445 -0.03857427 -0.02186471
 -0.00043943 -0.00260722]


In [None]:

# BERT Embeddings Extraction

!pip install -q transformers torch

import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained BERT
model_name = "bert-base-uncased"

# Tokenizer and model are loaded from the same checkpoint name
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
# Switch the model to evaluation mode
model.eval()

print("Model loaded on:", device)


# Sentence Embedding Function

def get_sentence_embedding(sentence):
    """Return the model's last hidden state at sequence position 0.

    The sentence is encoded with the file-level ``tokenizer`` (padding and
    truncation enabled), moved to ``device`` and passed through the
    file-level ``model`` with gradient tracking disabled.

    Args:
        sentence: Text to embed.

    Returns:
        The slice ``last_hidden_state[:, 0, :]`` of the model output, i.e.
        the vector at the first position (the CLS token), which this
        notebook uses as the representation of the whole sentence.
    """
    encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

    # Move every encoded tensor to the device the model lives on
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        hidden = model(**on_device).last_hidden_state

    # CLS token embedding represents the whole sentence
    return hidden[:, 0, :]

# Word Embedding Function


def get_word_embeddings(sentence):
    """Return the tokens of ``sentence`` and the model's per-position vectors.

    Args:
        sentence: Text to embed.

    Returns:
        A ``(tokens, embeddings)`` pair. ``tokens`` is the output of
        ``tokenizer.tokenize(sentence)``; ``embeddings`` is the model's
        ``last_hidden_state`` with the leading dimension squeezed away.
        NOTE(review): the two are produced by separate tokenizer calls and
        the caller in this notebook skips the first embedding row when
        pairing them, so they presumably differ in length because of
        special tokens — confirm before relying on a 1:1 alignment.
    """
    pieces = tokenizer.tokenize(sentence)

    encoded = tokenizer(sentence, return_tensors="pt")
    # Move every encoded tensor to the device the model lives on
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        result = model(**on_device)

    # Drop the leading dimension so rows correspond to sequence positions
    per_position = result.last_hidden_state.squeeze(0)

    return pieces, per_position

# Example

sentence = "I love machine learning"

# Sentence embedding
sent_emb = get_sentence_embedding(sentence)
print("Sentence embedding shape:", sent_emb.shape)

# Word embeddings
tokens, word_embs = get_word_embeddings(sentence)

print("\nWord embeddings:")
# Skip embedding row 0 and take one row per token, then show the first five
# values of each row next to its token.
aligned_rows = word_embs[1:len(tokens)+1]
for idx in range(min(len(tokens), len(aligned_rows))):
    print(f"{tokens[idx]:10} -> {aligned_rows[idx][:5].cpu().numpy()} ...")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model loaded on: cpu
Sentence embedding shape: torch.Size([1, 768])

Word embeddings:
i          -> [ 0.5847864   0.26324508 -0.12469552 -0.14526315  0.3732389 ] ...
love       -> [1.3391263  1.0639459  0.6155992  0.05781863 0.8699149 ] ...
machine    -> [ 0.366308   -0.16808899  0.2151252  -0.11699688  1.3801515 ] ...
learning   -> [-0.34998518 -0.24101868 -0.45664328  0.35860792  0.38634405] ...
