In [44]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\amann\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


#### Word Embeddings

In [45]:
import gensim.downloader as api
import numpy as np

In [46]:
model = api.load('word2vec-google-news-300')

In [47]:
print(model['king']) # see vector
print(model.most_similar('king')) # analogies

[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

In [48]:
from gensim.models import KeyedVectors

print("Most similar to 'dog' : ")
print(model.most_similar('dog'))

# Word analogy: king - man + woman = ?? ---> queen
print("Word analogy: king - man + woman = ?")
print(model.most_similar(positive=['king','woman'],negative=['man'],topn=1))

# Word Similarity
print("Similarity between coffee and tea :")
print(model.similarity('coffee','tea'))

# Check if a word exists in vocabulary
print("Is 'dragon' in the vocabulary ?")
print('dragon' in model)

Most similar to 'dog' : 
[('dogs', 0.8680490851402283), ('puppy', 0.8106428384780884), ('pit_bull', 0.780396044254303), ('pooch', 0.7627375721931458), ('cat', 0.7609457969665527), ('golden_retriever', 0.7500901222229004), ('German_shepherd', 0.7465173006057739), ('Rottweiler', 0.7437615990638733), ('beagle', 0.7418619990348816), ('pup', 0.7406911253929138)]
Word analogy: king - man + woman = ?
[('queen', 0.7118191123008728)]
Similarity between coffee and tea :
0.56352925
Is 'dragon' in the vocabulary ?
True


#### Sentence Embeddings (Avg Word2Vec)

In [49]:
def avg_word2vec(sentence,model):
    words = sentence.lower().split()
    valid_vectors = [model[word] for word in words if word in model]

    if not valid_vectors:
        return np.zeros(model.vector_size)
    
    return np.mean(valid_vectors,axis=0)

sentence = "I love machine learning"
vector = avg_word2vec(sentence,model)

print("Sentence Vector (shape):", vector.shape)
print("First 5 dimensions:", vector[:5])

Sentence Vector (shape): (300,)
First 5 dimensions: [ 0.01123047 -0.01138306  0.02069092  0.14361572 -0.03967285]


### Traditional Approach

#### One Hot Encoding

In [50]:
def one_hot_encode(text):
    words = text.split()
    vocabulary = set(words)
    word_to_index = {word : i for i,word in enumerate(vocabulary)}
    one_hot_encoded = []
    for word in words:
        one_hot_vector = [0] * len(vocabulary)
        one_hot_vector[word_to_index[word]] = 1
        one_hot_encoded.append(one_hot_vector)
    return one_hot_encoded, word_to_index, vocabulary

example_text = "cat in the hat dog on the mat bird in the tree"

one_hot_encoded,word_to_index,vocabulary = one_hot_encode(example_text)

print("Vocabulary:", vocabulary)
print("Word to Index Mapping:", word_to_index)
print("One-Hot Encoded Matrix:")
for word, encoding in zip(example_text.split(), one_hot_encoded):
    print(f"{word}: {encoding}")

Vocabulary: {'the', 'hat', 'in', 'tree', 'dog', 'on', 'cat', 'mat', 'bird'}
Word to Index Mapping: {'the': 0, 'hat': 1, 'in': 2, 'tree': 3, 'dog': 4, 'on': 5, 'cat': 6, 'mat': 7, 'bird': 8}
One-Hot Encoded Matrix:
cat: [0, 0, 0, 0, 0, 0, 1, 0, 0]
in: [0, 0, 1, 0, 0, 0, 0, 0, 0]
the: [1, 0, 0, 0, 0, 0, 0, 0, 0]
hat: [0, 1, 0, 0, 0, 0, 0, 0, 0]
dog: [0, 0, 0, 0, 1, 0, 0, 0, 0]
on: [0, 0, 0, 0, 0, 1, 0, 0, 0]
the: [1, 0, 0, 0, 0, 0, 0, 0, 0]
mat: [0, 0, 0, 0, 0, 0, 0, 1, 0]
bird: [0, 0, 0, 0, 0, 0, 0, 0, 1]
in: [0, 0, 1, 0, 0, 0, 0, 0, 0]
the: [1, 0, 0, 0, 0, 0, 0, 0, 0]
tree: [0, 0, 0, 1, 0, 0, 0, 0, 0]


#### Bag of Word (BOW)

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
documents = ["This is the first document.", "This document is the second document.",
              "And this is the third one.", "Is this the first document?"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

print("Bag-of-Words Matrix:")
print(X.toarray())
print("Vocabulary (Feature Names):", feature_names)

Bag-of-Words Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocabulary (Feature Names): ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


#### TF-IDF

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [ "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step." ]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
tfidf_values = {}

for doc_index,doc in enumerate(documents):
    feature_index = tfidf_matrix[doc_index,:].nonzero()[1]
    tfidf_doc_values = zip(feature_index,[tfidf_matrix[doc_index,x] for x in feature_index])
    tfidf_values[doc_index] = {feature_names[i]: value for i,value in tfidf_doc_values}

for doc_index,values in tfidf_values.items():
    print(f"Document {doc_index+1}: ")
    for word,tfidf_value in values.items():
        print(f"{word}: {tfidf_value}")
    print("\n")

Document 1: 
the: 0.6030226891555273
quick: 0.30151134457776363
brown: 0.30151134457776363
fox: 0.30151134457776363
jumps: 0.30151134457776363
over: 0.30151134457776363
lazy: 0.30151134457776363
dog: 0.30151134457776363


Document 2: 
journey: 0.3535533905932738
of: 0.3535533905932738
thousand: 0.3535533905932738
miles: 0.3535533905932738
begins: 0.3535533905932738
with: 0.3535533905932738
single: 0.3535533905932738
step: 0.3535533905932738




### Neural Approach

#### Word2Vec

##### 1. Continuous Bag of Words (CBOW)

In [53]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define CBOW model
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOWModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear = nn.Linear(embed_size, vocab_size)

    def forward(self, context):
        context_embeds = self.embeddings(context).sum(dim=1)
        output = self.linear(context_embeds)
        return output

context_size = 2
raw_text = "word embeddings are awesome"
tokens = raw_text.split()
vocab = set(tokens)
word_to_index = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(tokens) - 2):
    context = [word_to_index[word] for word in tokens[i - 2:i] + tokens[i + 1:i + 3]]
    target = word_to_index[tokens[i]]
    data.append((torch.tensor(context), torch.tensor(target)))

# Hyperparameters
vocab_size = len(vocab)
embed_size = 10
learning_rate = 0.01
epochs = 100

# Initialize CBOW model
cbow_model = CBOWModel(vocab_size, embed_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(cbow_model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        optimizer.zero_grad()
        output = cbow_model(context)
        loss = criterion(output.unsqueeze(0), target.unsqueeze(0))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

# Example usage
word_to_lookup = "embeddings"
word_index = word_to_index[word_to_lookup]
embedding = cbow_model.embeddings(torch.tensor([word_index]))
print(f"Embedding for '{word_to_lookup}': {embedding.detach().numpy()}")

Embedding for 'embeddings': [[-1.711798   -0.8385683   0.3857726  -0.51692265 -0.5410009   0.1587451
  -0.3844856   1.8475201  -0.8461097   0.7197072 ]]


##### 2. Skip-Gram

In [54]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

sample = "Word embeddings are dense vector representations of words."

tokenized_corpus = word_tokenize(sample.lower())

skipgram_model = Word2Vec(sentences=[tokenized_corpus],
                          vector_size=100,
                          window=5,
                          sg=1,
                          min_count=1,
                          workers=4)

# Training 
skipgram_model.train([tokenized_corpus],total_examples=1,epochs=10)
skipgram_model.save("skipgram_model.model")
loaded_model = Word2Vec.load("skipgram_model.model")
vector_representation = loaded_model.wv['word']
print("Vector representation of 'word':",vector_representation)

Vector representation of 'word': [-9.5800208e-03  8.9437785e-03  4.1664648e-03  9.2367809e-03
  6.6457358e-03  2.9233587e-03  9.8055992e-03 -4.4231843e-03
 -6.8048164e-03  4.2256550e-03  3.7299085e-03 -5.6668529e-03
  9.7035142e-03 -3.5551414e-03  9.5499391e-03  8.3657773e-04
 -6.3355025e-03 -1.9741615e-03 -7.3781307e-03 -2.9811086e-03
  1.0425397e-03  9.4814906e-03  9.3598543e-03 -6.5986011e-03
  3.4773252e-03  2.2767992e-03 -2.4910474e-03 -9.2290826e-03
  1.0267317e-03 -8.1645092e-03  6.3240929e-03 -5.8001447e-03
  5.5353874e-03  9.8330071e-03 -1.5987856e-04  4.5296676e-03
 -1.8086446e-03  7.3613892e-03  3.9419360e-03 -9.0095028e-03
 -2.3953868e-03  3.6261671e-03 -1.0080514e-04 -1.2024897e-03
 -1.0558038e-03 -1.6681013e-03  6.0541567e-04  4.1633579e-03
 -4.2531900e-03 -3.8336846e-03 -5.0755290e-05  2.6549282e-04
 -1.7014991e-04 -4.7843382e-03  4.3120929e-03 -2.1710952e-03
  2.1056964e-03  6.6702347e-04  5.9686624e-03 -6.8418151e-03
 -6.8183104e-03 -4.4762432e-03  9.4359247e-03 -1.593

##### GloVe

In [55]:
from gensim.models import KeyedVectors
from gensim.downloader import load

glove_model = load('glove-wiki-gigaword-50')
word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

# Compute similarity for each pair of words
for pair in word_pairs:
    similarity = glove_model.similarity(pair[0], pair[1])
    print(f"Similarity between '{pair[0]}' and '{pair[1]}' using GloVe: {similarity:.3f}")

Similarity between 'learn' and 'learning' using GloVe: 0.802
Similarity between 'india' and 'indian' using GloVe: 0.865
Similarity between 'fame' and 'famous' using GloVe: 0.589


#### Fasttext

In [56]:
fasttext_model = api.load("fasttext-wiki-news-subwords-300")
word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

# Compute similarity for each pair of words

for pair in word_pairs:
    similarity = fasttext_model.similarity(pair[0], pair[1])
    print(f"Similarity between '{pair[0]}' and '{pair[1]}' using FastText: {similarity:.3f}")

Similarity between 'learn' and 'learning' using FastText: 0.642
Similarity between 'india' and 'indian' using FastText: 0.708
Similarity between 'fame' and 'famous' using FastText: 0.519


In [58]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   -- ------------------------------------- 0.5/10.5 MB 4.2 MB/s eta 0:00:03
   ------ --------------------------------- 1.6/10.5 MB 4.7 MB/s eta 0:00:02
   --------- ------------------------------ 2.4/10.5 MB 4.6 MB/s eta 0:00:02
   ------------- -------------------------- 3.4/10.5 MB 4.5 MB/s eta 0:00:02
   -------


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\amann\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [60]:
from transformers import BertTokenizer, BertModel
import torch

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

for pair in word_pairs:
    tokens = tokenizer(pair,return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokens)

    cls_embedding = outputs.last_hidden_state[:,0,:]

    similarity = torch.nn.functional.cosine_similarity(cls_embedding[0],cls_embedding[1],dim=0)
    print(f"Similarity between '{pair[0]}' and '{pair[1]}' using BERT: {similarity:.3f}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Similarity between 'learn' and 'learning' using BERT: 0.930
Similarity between 'india' and 'indian' using BERT: 0.957
Similarity between 'fame' and 'famous' using BERT: 0.956
