In [38]:
import torch.nn.functional as F
import torch

In [39]:
sentences = ["the glass of milk",
             "the glass of juice",
             "the cup of tea",
             "I am a good boy",
             "I am a good developer",
             "understand meaning of words",
             "video are crazy"]

tokenized_sentences = [sentence.split() for sentence in sentences]
tokenized_sentences

[['the', 'glass', 'of', 'milk'],
 ['the', 'glass', 'of', 'juice'],
 ['the', 'cup', 'of', 'tea'],
 ['I', 'am', 'a', 'good', 'boy'],
 ['I', 'am', 'a', 'good', 'developer'],
 ['understand', 'meaning', 'of', 'words'],
 ['video', 'are', 'crazy']]

In [40]:
unique_words = set(word for sentence in tokenized_sentences for word in sentence)
unique_words

{'I',
 'a',
 'am',
 'are',
 'boy',
 'crazy',
 'cup',
 'developer',
 'glass',
 'good',
 'juice',
 'meaning',
 'milk',
 'of',
 'tea',
 'the',
 'understand',
 'video',
 'words'}

In [41]:
vocab = {word: idx for idx, word in enumerate(unique_words)}
vocab

{'glass': 0,
 'boy': 1,
 'understand': 2,
 'crazy': 3,
 'developer': 4,
 'the': 5,
 'video': 6,
 'good': 7,
 'am': 8,
 'meaning': 9,
 'milk': 10,
 'cup': 11,
 'words': 12,
 'juice': 13,
 'a': 14,
 'of': 15,
 'are': 16,
 'I': 17,
 'tea': 18}

In [42]:
vocab_size = 10000

In [43]:
indexed_sentences = [torch.tensor([vocab[word] for word in sentence]) for sentence in tokenized_sentences]
indexed_sentences

[tensor([ 5,  0, 15, 10]),
 tensor([ 5,  0, 15, 13]),
 tensor([ 5, 11, 15, 18]),
 tensor([17,  8, 14,  7,  1]),
 tensor([17,  8, 14,  7,  4]),
 tensor([ 2,  9, 15, 12]),
 tensor([ 6, 16,  3])]

In [44]:
indexed_tensors = [torch.tensor(sentence) for sentence in indexed_sentences]
indexed_tensors

  indexed_tensors = [torch.tensor(sentence) for sentence in indexed_sentences]


[tensor([ 5,  0, 15, 10]),
 tensor([ 5,  0, 15, 13]),
 tensor([ 5, 11, 15, 18]),
 tensor([17,  8, 14,  7,  1]),
 tensor([17,  8, 14,  7,  4]),
 tensor([ 2,  9, 15, 12]),
 tensor([ 6, 16,  3])]

In [45]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

padded_sequences = pad_sequence(indexed_tensors, batch_first=True)  
padded_sequences

tensor([[ 5,  0, 15, 10,  0],
        [ 5,  0, 15, 13,  0],
        [ 5, 11, 15, 18,  0],
        [17,  8, 14,  7,  1],
        [17,  8, 14,  7,  4],
        [ 2,  9, 15, 12,  0],
        [ 6, 16,  3,  0,  0]])

In [46]:
vocab_size = 100  # Assume a vocabulary of 100 words
embedding_dim = 10  # Embedding size
embedding = torch.nn.Embedding(vocab_size, embedding_dim)
embedding


Embedding(100, 10)

In [37]:
embedded_sequences = embedding(padded_sequences)
embedded_sequences

tensor([[[ 0.1563,  0.3374, -1.2173,  0.5870,  0.2763, -1.3092, -1.4945,
           0.9227,  0.5266, -0.5911],
         [-0.3922,  1.0005,  0.1720,  2.7476, -0.0382,  0.6005,  0.4771,
           2.0407, -1.0554,  0.8217],
         [-0.3766,  0.9352, -1.7333,  0.7318,  1.0319, -1.6682,  0.0359,
           0.4202,  0.2748, -0.2647],
         [ 0.2375,  0.5546, -1.4421,  0.4368, -1.6717,  0.6977, -0.7021,
           1.0328,  0.1479, -1.4860],
         [-0.3922,  1.0005,  0.1720,  2.7476, -0.0382,  0.6005,  0.4771,
           2.0407, -1.0554,  0.8217]],

        [[ 0.1563,  0.3374, -1.2173,  0.5870,  0.2763, -1.3092, -1.4945,
           0.9227,  0.5266, -0.5911],
         [-0.3922,  1.0005,  0.1720,  2.7476, -0.0382,  0.6005,  0.4771,
           2.0407, -1.0554,  0.8217],
         [-0.3766,  0.9352, -1.7333,  0.7318,  1.0319, -1.6682,  0.0359,
           0.4202,  0.2748, -0.2647],
         [ 0.8734, -1.2594,  1.5226, -1.6074, -0.5007, -0.3161, -0.2373,
           0.8848, -2.0817, -0.6906],