In [1]:
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [53]:
from nltk.corpus import wordnet as wn
import nltk
nltk.download('wordnet')

words = list(wn.words())
words

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['.22-caliber',
 '.22-calibre',
 '.22_caliber',
 '.22_calibre',
 '.38-caliber',
 '.38-calibre',
 '.38_caliber',
 '.38_calibre',
 '.45-caliber',
 '.45-calibre',
 '.45_caliber',
 '.45_calibre',
 '0',
 '1',
 '10',
 '10-membered',
 '100',
 '1000',
 '1000th',
 '100th',
 '101',
 '101st',
 '105',
 '105th',
 '10th',
 '11',
 '110',
 '110th',
 '115',
 '115th',
 '11th',
 '12',
 '120',
 '120th',
 '125',
 '125th',
 '12th',
 '13',
 '130',
 '130th',
 '135',
 '135th',
 '13th',
 '14',
 '140',
 '140th',
 '145',
 '145th',
 '14th',
 '15',
 '150',
 '150th',
 '155',
 '155th',
 '15th',
 '16',
 '160',
 '160th',
 '165',
 '165th',
 '16th',
 '17',
 '170',
 '170th',
 '175',
 '175th',
 '17th',
 '18',
 '180',
 '180th',
 '18th',
 '19',
 '190',
 '190th',
 '19th',
 '1st',
 '2',
 '2-dimensional',
 '20',
 '200',
 '200th',
 '20th',
 '21',
 '21st',
 '22',
 '22nd',
 '23',
 '23rd',
 '24',
 '24th',
 '25',
 '25th',
 '26',
 '26th',
 '27',
 '27th',
 '28',
 '28th',
 '29',
 '29th',
 '2d',
 '2nd',
 '3',
 '3-dimensional',
 '3-membe

In [54]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

vocab = words.copy()
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}

vocab_size = len(vocab)
embedding_dim = 100

embedding_matrix = torch.randn(vocab_size, embedding_dim, requires_grad=True)


In [55]:
embedding_matrix.shape, vocab_size, embedding_dim

(torch.Size([147306, 100]), 147306, 100)

In [56]:
class myembeddings(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(myembeddings, self).__init__()
        self.embedding_matrix = embedding_matrix
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_indices):
        embeddings = self.embedding_matrix[target_indices]
        output = self.output_layer(embeddings)
        return output


In [57]:
def generate_training_data(corpus, word_to_index, context_size=2):
    training_data = []
    for sentence in corpus:
        sentence_indices = [word_to_index[word] for word in sentence if word in word_to_index]
        for i, target in enumerate(sentence_indices):
            context = [sentence_indices[i + j] for j in range(-context_size, context_size + 1) if j != 0 and 0 <= i + j < len(sentence_indices)]
            for context_word in context:
                training_data.append((target, context_word))
    return training_data

corpus = [
    ['My', 'name', 'is','Ayush'],
    ['Sharma', 'vivo', 'acquantance'],
    ['the', 'cat', 'sat', 'on', 'the', 'mat.'],
    ['a', 'dog', 'barked', 'loudly', 'in', 'the', 'park.'],
    ['she', 'enjoys', 'reading', 'books', 'on', 'sunny', 'days.'],
    ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.'],
    ['he', 'writes', 'code', 'in', 'python', 'every', 'day.'],
    ['they', 'are', 'planning', 'a', 'trip', 'to', 'the', 'mountains.'],
    ['birds', 'are', 'singing', 'in', 'the', 'morning', 'sky.'],
    ['the', 'chef', 'cooked', 'a', 'delicious', 'meal', 'for', 'dinner.'],
    ['children', 'are', 'playing', 'in', 'the', 'playground.']
]

training_data = generate_training_data(corpus, word_to_index)


In [58]:
# for i,j in (training_data[:10]):
#     print((index_to_word[i],index_to_word[j]))

training_data[:5]

[(42248, 116086),
 (42248, 12775),
 (116086, 42248),
 (116086, 12775),
 (12775, 42248)]

In [59]:
convergence_plot=[]

In [None]:
def train_model(model, training_data, epochs=1000, learning_rate=0.03):
    optimizer = optim.SGD([embedding_matrix] + list(model.parameters()), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0
        for target, context in training_data:
            target_tensor = torch.tensor([target], dtype=torch.long)
            context_tensor = torch.tensor([context], dtype=torch.long)
            optimizer.zero_grad()
            output = model(target_tensor)
            loss = loss_function(output, context_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch + 1}, Loss: {total_loss:.4f}')
        convergence_plot.append(total_loss)


model = myembeddings(vocab_size, embedding_dim)

train_model(model, training_data)


Epoch 1, Loss: 1045.1455
Epoch 2, Loss: 835.7778
Epoch 3, Loss: 625.7438
Epoch 4, Loss: 452.2658
Epoch 5, Loss: 327.8895
Epoch 6, Loss: 256.7898
Epoch 7, Loss: 227.5696
Epoch 8, Loss: 217.6617
Epoch 9, Loss: 213.6895
Epoch 10, Loss: 211.7053
Epoch 11, Loss: 210.4903
Epoch 12, Loss: 209.6389
Epoch 13, Loss: 208.9879
Epoch 14, Loss: 208.4548
Epoch 15, Loss: 207.9937
Epoch 16, Loss: 207.5786
Epoch 17, Loss: 207.1937
Epoch 18, Loss: 206.8295
Epoch 19, Loss: 206.4796
Epoch 20, Loss: 206.1400
Epoch 21, Loss: 205.8079
Epoch 22, Loss: 205.4815
Epoch 23, Loss: 205.1593
Epoch 24, Loss: 204.8406
Epoch 25, Loss: 204.5245
Epoch 26, Loss: 204.2107
Epoch 27, Loss: 203.8988
Epoch 28, Loss: 203.5887
Epoch 29, Loss: 203.2800
Epoch 30, Loss: 202.9728
Epoch 31, Loss: 202.6669
Epoch 32, Loss: 202.3623
Epoch 33, Loss: 202.0589
Epoch 34, Loss: 201.7568
Epoch 35, Loss: 201.4560
Epoch 36, Loss: 201.1563
Epoch 37, Loss: 200.8579
Epoch 38, Loss: 200.5608
Epoch 39, Loss: 200.2650
Epoch 40, Loss: 199.9705
Epoch 41

In [64]:
# torch.save(f="model.pth", obj=model)

In [8]:

trained_embedding_matrix = model.embedding_matrix.detach()


In [12]:
trained_embedding_matrix

tensor([[ 0.5389,  0.9008,  2.0919, -0.2412, -0.4483, -2.7679,  0.0570, -0.7687,
         -0.5733, -0.0213],
        [ 0.3102, -0.7673, -0.5406, -1.5411, -0.2066, -0.0885,  0.8541, -0.7215,
          0.0326,  0.1048],
        [ 0.4902,  0.2057,  0.6303, -1.2802, -1.0341, -1.2794, -0.8239, -1.1425,
          0.3443,  1.7529],
        [-0.6403, -0.2694, -0.4505,  1.6121, -0.6620, -1.4094, -1.1139, -0.3102,
         -0.0135,  0.7385],
        [ 0.6618,  1.7445, -1.0322,  0.0384, -1.3838,  0.4387, -1.0931,  1.4958,
          0.9368,  0.1841]])

In [9]:
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
embedding_layer.weight = nn.Parameter(trained_embedding_matrix)


In [11]:
embedding_layer.weight

Parameter containing:
tensor([[ 0.5389,  0.9008,  2.0919, -0.2412, -0.4483, -2.7679,  0.0570, -0.7687,
         -0.5733, -0.0213],
        [ 0.3102, -0.7673, -0.5406, -1.5411, -0.2066, -0.0885,  0.8541, -0.7215,
          0.0326,  0.1048],
        [ 0.4902,  0.2057,  0.6303, -1.2802, -1.0341, -1.2794, -0.8239, -1.1425,
          0.3443,  1.7529],
        [-0.6403, -0.2694, -0.4505,  1.6121, -0.6620, -1.4094, -1.1139, -0.3102,
         -0.0135,  0.7385],
        [ 0.6618,  1.7445, -1.0322,  0.0384, -1.3838,  0.4387, -1.0931,  1.4958,
          0.9368,  0.1841]], requires_grad=True)

In [14]:
def word_to_vector(word, embedding_layer, word_to_index):
    word_index = word_to_index.get(word, None)

    if word_index is None:
        raise ValueError(f"The word '{word}' is not in the vocabulary.")
    word_index_tensor = torch.tensor([word_index], dtype=torch.long)
    word_vector = embedding_layer(word_index_tensor)

    return word_vector.squeeze(0)


In [15]:
word = 'cat'
vector = word_to_vector(word, embedding_layer, word_to_index)

print(f"Vector for '{word}':\n{vector}")


Vector for 'word5':
tensor([ 0.6618,  1.7445, -1.0322,  0.0384, -1.3838,  0.4387, -1.0931,  1.4958,
         0.9368,  0.1841], grad_fn=<SqueezeBackward1>)
