In [None]:
# 1. Whitespace tokenization
# Sentence that will be broken into word-level tokens
text = "Natural language processing makes computers understand text"

# str.split with no separator cuts on every run of whitespace,
# so consecutive spaces never yield empty tokens
tokens = str.split(text)

# Report the input next to the resulting token list
for label, value in (("Original Text:", text), ("Tokens:", tokens)):
    print(label, value)

Original Text: Natural language processing makes computers understand text
Tokens: ['Natural', 'language', 'processing', 'makes', 'computers', 'understand', 'text']


In [None]:
# 2. Character-level tokenization
# Sentence to break into individual characters
text = "NLP is powerful"
# Iterating over a string yields one character at a time;
# the spaces between words are kept as tokens of their own
tokens = [char for char in text]
# Report the input next to the resulting character list
print("Original Text:", text)
print("Character Tokens:", tokens)


Original Text: NLP is powerful
Character Tokens: ['N', 'L', 'P', ' ', 'i', 's', ' ', 'p', 'o', 'w', 'e', 'r', 'f', 'u', 'l']


In [None]:
# 3. Subword tokenization
from transformers import AutoTokenizer

# Fetch the tokenizer published with the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Sentence to split into subword pieces
text = "Tokenization is important in NLP"

# tokenize() returns the string pieces themselves, not vocabulary ids
tokens = tokenizer.tokenize(text)

# Report the input next to the resulting pieces
report = (("Original Text:", text), ("Subword Tokens:", tokens))
for label, value in report:
    print(label, value)


Original Text: Tokenization is important in NLP
Subword Tokens: ['token', '##ization', 'is', 'important', 'in', 'nl', '##p']


In [None]:
# 4. Stop-word removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data this cell relies on: the Punkt tokenizer data, the
# stop-word lists, and punkt_tab (added to resolve a LookupError for punkt_tab)
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Sentence to filter
text = "Natural Language Processing is very important in data science"

# Break the sentence into word tokens
tokens = word_tokenize(text)

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Keep a token only when its lowercase form is not a stop word;
# the surviving tokens keep their original casing
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)

# Report the input next to the filtered tokens
print("Original Text:", text)
print("Tokens after Stop Word Removal:", filtered_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original Text: Natural Language Processing is very important in data science
Tokens after Stop Word Removal: ['Natural', 'Language', 'Processing', 'important', 'data', 'science']


In [None]:
# 5. Tokenization using an NLP library (NLTK)
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tokenizer data used by word_tokenize
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Sentence to tokenize
text = "Learning NLP is fun and useful"

# Split the sentence into word tokens
tokens = word_tokenize(text)

# Report the input next to the resulting tokens
report = (("Original Text:", text), ("Word Tokens:", tokens))
for label, value in report:
    print(label, value)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original Text: Learning NLP is fun and useful
Word Tokens: ['Learning', 'NLP', 'is', 'fun', 'and', 'useful']


In [None]:
# 6. Sentence and word tokenization using NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the tokenizer data. 'punkt_tab' is fetched alongside 'punkt' so that
# this cell also runs on its own in a fresh kernel; the other NLTK cells in this
# notebook download it to resolve a LookupError for punkt_tab.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Input text: two sentences, so both tokenizers have something to show
text = "Natural Language Processing is interesting. It helps machines understand human language."

# Sentence tokenization: one string per sentence
sentences = sent_tokenize(text)

# Word tokenization: runs over the whole text, not sentence by sentence
words = word_tokenize(text)

# Display results
print("Original Text:")
print(text)

print("\nSentence Tokens:")
for s in sentences:
    print(s)

print("\nWord Tokens:")
print(words)



Original Text:
Natural Language Processing is interesting. It helps machines understand human language.

Sentence Tokens:
Natural Language Processing is interesting.
It helps machines understand human language.

Word Tokens:
['Natural', 'Language', 'Processing', 'is', 'interesting', '.', 'It', 'helps', 'machines', 'understand', 'human', 'language', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# 1. Embedding using Bag of Words (BoW)
from sklearn.feature_extraction.text import CountVectorizer

# Corpus: each string is one document
documents = [
    "I love machine learning",
    "Machine learning is fun",
    "I love data science"
]

# Vectorizer with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary and count the terms of each document in one call;
# the result has one row per document and one column per vocabulary term
bow_vectors = vectorizer.fit_transform(documents)

# Terms in column order
vocabulary = vectorizer.get_feature_names_out()

# Show the vocabulary, then the dense count matrix
print("Vocabulary:", vocabulary, sep="\n")
print("\nBag of Words Matrix:", bow_vectors.toarray(), sep="\n")


Vocabulary:
['data' 'fun' 'is' 'learning' 'love' 'machine' 'science']

Bag of Words Matrix:
[[0 0 0 1 1 1 0]
 [0 1 1 1 0 1 0]
 [1 0 0 0 1 0 1]]


In [None]:
#2.Embedding using Word2Vec
!pip install gensim nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

# Input sentences
sentences = [
    "I love machine learning",
    "Machine learning is very interesting",
    "I love data science"
]

# Tokenize sentences into words
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Train Word2Vec model
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

# Get word embedding
word = "learning"
vector = model.wv[word]

# Display result
print("Word:", word)
print("Vector length:", len(vector))
print("Word2Vec Embedding:\n", vector)

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Word: learning
Vector length: 100
Word2Vec Embedding:
 [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.333768

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#3.Embedding using GloVe
import numpy as np

# Download GloVe embeddings (if not already present)
!wget -P /content/ -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n /content/glove.6B.zip -d /content/

# Load GloVe embeddings file (example: glove.6B.50d.txt)
def load_glove_embeddings(file_path):
    """Read a GloVe-style text file into a word -> vector dictionary.

    Every non-blank line is split on whitespace: the first field is the
    word, the remaining fields are parsed as the vector components.
    Blank lines are skipped instead of raising IndexError. If a word
    occurs more than once, the last occurrence wins.

    Args:
        file_path: Path of the text file; it is opened as UTF-8.

    Returns:
        dict mapping each word (str) to a float32 NumPy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a component cannot be converted to a float.
    """
    embeddings = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse
                continue
            word, *components = values
            embeddings[word] = np.array(components, dtype="float32")
    return embeddings

# Read the GloVe file into a word -> vector dictionary
glove_path = "/content/glove.6B.50d.txt" # Update path to downloaded file
glove_embeddings = load_glove_embeddings(glove_path)

# Sentence to look up, lowercased and split on whitespace
sentence = "machine learning is powerful"
words = sentence.lower().split()

# Print the vector of every word, or a notice when the word has no entry
for word in words:
    if word not in glove_embeddings:
        print(f"\nWord: {word} not found in GloVe vocabulary")
        continue
    print(f"\nWord: {word}")
    print("Vector length:", len(glove_embeddings[word]))
    print("Embedding:", glove_embeddings[word])

--2026-01-22 10:28:02--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2026-01-22 10:28:02--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2026-01-22 10:28:02--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/content/glove.6B.z

In [None]:
# 4. Embedding using BERT
from transformers import BertTokenizer, BertModel
import torch

# Fetch the pre-trained "bert-base-uncased" tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Sentence to embed
sentence = "Artificial intelligence is changing the world"

# Encode the sentence; return_tensors="pt" asks for PyTorch tensors
inputs = tokenizer(sentence, return_tensors="pt")

# Forward pass with gradient tracking switched off
with torch.no_grad():
    outputs = model(**inputs)

# Use the hidden state at position 0 (the CLS token) as the sentence embedding
sentence_embedding = outputs.last_hidden_state[:, 0]

# Report the sentence, the embedding shape and the embedding itself
print("Sentence:", sentence)
print("Embedding shape:", sentence_embedding.shape)
print("Embedding vector:", sentence_embedding, sep="\n")




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Sentence: Artificial intelligence is changing the world
Embedding shape: torch.Size([1, 768])
Embedding vector:
tensor([[ 6.7895e-02,  3.7280e-01,  1.8939e-01, -1.6048e-02, -4.2173e-01,
         -6.3365e-01,  8.9762e-01,  1.0387e+00, -1.0698e-01, -4.9215e-01,
          6.4150e-01,  4.6076e-02, -7.0754e-01,  5.1899e-01,  2.1290e-01,
          3.4821e-02, -6.2564e-01,  3.4357e-01,  1.1718e-01,  2.6627e-01,
         -2.5938e-01, -3.3450e-01, -2.9126e-01,  5.1975e-01,  1.1488e-01,
         -3.5947e-01, -1.9824e-02,  4.1777e-02,  2.4926e-01,  1.2468e-01,
         -2.5859e-01,  1.5602e-01, -6.6202e-01, -5.0114e-01,  4.8910e-01,
         -1.0103e-01, -1.7038e-01, -4.9001e-01, -2.2280e-01,  7.1499e-01,
         -9.9041e-01,  2.2191e-01, -1.2625e-01,  3.1067e-01, -1.4600e-01,
         -4.1252e-01, -2.2756e+00, -5.4306e-02,  1.0077e-02, -3.5601e-01,
         -1.6676e-01, -7.6894e-01,  4.6206e-02,  3.7785e-01,  7.0861e-02,
          5.3669e-01, -3.3085e-01,  5.0643e-01, -4.5634e-01,  6.6271e-01,


In [None]:
# 5. Embedding using One-Hot Encoding
import numpy as np

# Input sentence
sentence = "deep learning is powerful"

# Tokenize sentence (word level)
words = sentence.lower().split()

# Create vocabulary: unique words in order of first appearance.
# dict.fromkeys removes duplicates while keeping insertion order, so the
# vocabulary (and therefore every vector) is the same on every run; a set
# would give an order that can change between interpreter launches.
vocab = list(dict.fromkeys(words))

# Create one-hot vectors: a zero vector of vocabulary length with a single 1
# at the word's position in the vocabulary
one_hot_vectors = {}

for index, word in enumerate(vocab):
    vector = np.zeros(len(vocab))
    vector[index] = 1
    one_hot_vectors[word] = vector

# Display results
print("Vocabulary:", vocab)

for word, vector in one_hot_vectors.items():
    print(f"\nWord: {word}")
    print("One-Hot Vector:", vector)


Vocabulary: ['deep', 'powerful', 'learning', 'is']

Word: deep
One-Hot Vector: [1. 0. 0. 0.]

Word: powerful
One-Hot Vector: [0. 1. 0. 0.]

Word: learning
One-Hot Vector: [0. 0. 1. 0.]

Word: is
One-Hot Vector: [0. 0. 0. 1.]
