In [2]:
from pathlib import Path

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
def get_all_sentences(ds, lang):
    """Lazily yield the `lang` side of every translation pair in `ds`.

    Each item of `ds` is indexed as ``item['translation'][lang]``.
    """
    yield from (example['translation'][lang] for example in ds)

def get_or_build_tokenizer(config, ds, lang):
    """Return a word-level tokenizer for `lang`, training and saving it if needed.

    Args:
        config: Mapping with a 'tokenizer_file' entry; that entry is a format
            string which receives `lang` to produce the tokenizer's file path.
        ds: Iterable of items indexed as ``item['translation'][lang]``; only
            consumed when the tokenizer has to be trained.
        lang: Language key used both for the file name and to pick sentences.

    Returns:
        The tokenizer loaded from the file when it exists, otherwise a newly
        trained tokenizer (which is also saved to that file).
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if tokenizer_path.exists():
        # Reuse the previously saved tokenizer instead of retraining.
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    # Make sure the target directory exists; otherwise saving would fail only
    # after the whole training pass has already been done.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_path))
    return tokenizer


In [4]:
# The dataset ships only a 'train' split, so we divide it ourselves.
ds_raw = load_dataset(path="opus_books", name="en-it", split="train")

In [10]:
# Peek at a single raw record to see how a translation pair is laid out.
sample_index = 4
print(ds_raw[sample_index])

{'id': '4', 'translation': {'en': 'There was no possibility of taking a walk that day.', 'it': 'I. In quel giorno era impossibile passeggiare.'}}


In [13]:
import os
from pathlib import Path

from tokenizers import Tokenizer

# Directory holding the pre-built tokenizer files. The TOKENIZER_DIR environment
# variable overrides it so the notebook can run on another machine; the fallback
# is the local checkout path this notebook was originally written against.
TOKENIZER_DIR = Path(os.environ.get(
    "TOKENIZER_DIR",
    r"D:\Machine Learning\Transformers\Transformer_From_Scratch\pytorch-transformer",
))

# Source (English) and target (Italian) tokenizers.
tokenizer_src = Tokenizer.from_file(str(TOKENIZER_DIR / "tokenizer_en.json"))
tokenizer_tgt = Tokenizer.from_file(str(TOKENIZER_DIR / "tokenizer_it.json"))

In [18]:
# Compare the ids both tokenizers assign to each special token.
for special in ("[SOS]", "[EOS]", "[PAD]"):
    print(tokenizer_src.token_to_id(special), " ", tokenizer_tgt.token_to_id(special))
# Probe the target tokenizer with the string "Arunim" next to the source [PAD] id.
print(tokenizer_src.token_to_id("[PAD]"), " ", tokenizer_tgt.token_to_id("Arunim"))

2   2
3   3
1   1
1   None


In [28]:
# Encode the English sample once and show both views of the same encoding.
encoded_en = tokenizer_src.encode('There was no possibility of taking a walk that day.')
print(encoded_en.ids)
print(encoded_en.tokens)

[237, 14, 67, 1693, 10, 442, 11, 703, 15, 132, 7]
['There', 'was', 'no', 'possibility', 'of', 'taking', 'a', 'walk', 'that', 'day', '.']


In [29]:
italian_sentence = 'I. In quel giorno era impossibile passeggiare.'
# Target tokenizer on the Italian sample: tokens first, then ids.
encoded_it = tokenizer_tgt.encode(italian_sentence)
print(encoded_it.tokens)
print(encoded_it.ids)
# The same sentence through the source (English) tokenizer, for comparison.
print(tokenizer_src.encode(italian_sentence).tokens)

['I', '.', 'In', 'quel', 'giorno', 'era', 'impossibile', 'passeggiare', '.']
[330, 5, 208, 76, 147, 22, 619, 3860, 5]
['I', '.', 'In', '[UNK]', '[UNK]', 'era', '[UNK]', '[UNK]', '.']


In [40]:
import torch

# Fixed length of the encoder input, including the [SOS] and [EOS] markers.
SEQ_LEN = 350
sentence = 'There was no possibility of taking a walk that day.'

enc_input_tokens = tokenizer_src.encode(sentence).ids
# Special-token ids are kept as one-element int64 tensors so they can be
# concatenated directly with the sentence ids.
sos_token = torch.tensor([tokenizer_src.token_to_id("[SOS]")], dtype=torch.int64)
eos_token = torch.tensor([tokenizer_src.token_to_id("[EOS]")], dtype=torch.int64)
pad_token = torch.tensor([tokenizer_src.token_to_id("[PAD]")], dtype=torch.int64)
print(sos_token, " ", eos_token, " ", pad_token)

# Two slots are reserved for [SOS] and [EOS]; the rest of the window is padding.
enc_num_padding_tokens = SEQ_LEN - len(enc_input_tokens) - 2
if enc_num_padding_tokens < 0:
    # Fail with a clear message when the sentence does not fit in the window.
    raise ValueError("Sentence is too long")
print(enc_num_padding_tokens)

encoder_input = torch.cat(
    [
        sos_token,
        torch.tensor(enc_input_tokens, dtype=torch.int64),
        eos_token,
        # Build the padding in a single call, filled with the [PAD] id.
        torch.full((enc_num_padding_tokens,), pad_token.item(), dtype=torch.int64),
    ],
    dim=0,
)
print(encoder_input)

tensor([2])   tensor([3])   tensor([1])
337
tensor([   2,  237,   14,   67, 1693,   10,  442,   11,  703,   15,  132,    7,
           3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,   

In [1]:
# range(5, 1) starts at 5, which is already past the stop value 1,
# so the range is empty and nothing is printed.
for value in range(5, 1):
    print(value)

In [5]:
import torch

# Random tensor with three axes of sizes 1, 2 and 32.
x = torch.randn(1, 2, 32)
# Taking the last index along the middle axis drops that axis.
print(x[:, -1, :].shape)

torch.Size([1, 32])


In [2]:
# Sort the sample values in ascending order (duplicates are kept).
numbers = [41, 50, 143, 386, 23, 41, 50, 710, 39, 41, 900]
numbers.sort()
print(numbers)

[23, 39, 41, 41, 41, 50, 50, 143, 386, 710, 900]


In [5]:
import numpy as np

# 20 x 10 matrix of uniform random samples; print each row rounded to 2 decimals.
x = np.random.rand(20, 10)
for row in x:
    print(row.round(2))

[0.71 0.3  0.85 0.85 0.31 0.24 0.39 0.15 0.99 0.36]
[0.58 0.12 0.69 0.72 0.23 0.92 0.02 0.65 0.91 0.25]
[0.85 0.29 0.73 0.78 0.07 0.34 0.86 0.37 0.3  0.33]
[0.9  0.76 0.69 0.98 0.4  0.08 0.16 0.75 0.75 0.71]
[0.61 0.42 0.62 0.9  0.29 0.25 0.06 0.58 0.25 0.55]
[0.91 0.57 0.36 0.29 0.67 0.49 1.   0.25 0.54 0.06]
[0.7  0.83 0.77 0.25 0.89 0.98 0.16 0.66 0.57 0.16]
[0.39 0.6  0.91 0.95 0.25 0.27 0.07 0.24 0.68 0.26]
[0.6  0.65 0.2  0.41 0.08 0.11 0.09 0.03 0.24 0.88]
[0.36 0.31 0.51 0.69 0.92 0.19 0.83 0.66 0.18 0.77]
[0.76 0.08 0.34 0.72 0.6  0.72 0.94 0.93 0.46 0.41]
[0.91 0.57 0.62 0.7  0.27 0.1  0.33 0.43 0.1  0.86]
[0.25 0.62 0.55 0.68 0.92 0.12 0.68 0.37 0.42 0.58]
[0.75 0.42 0.56 0.66 0.08 0.42 0.69 0.87 0.32 0.69]
[0.1  0.65 0.31 0.1  0.34 0.31 0.76 0.37 0.32 0.9 ]
[0.8  0.64 0.71 0.65 0.56 0.43 0.24 0.88 0.81 0.23]
[0.98 0.87 0.63 0.28 0.12 0.2  0.8  0.74 0.   0.82]
[0.17 0.06 0.8  0.21 0.66 0.8  0.   0.28 0.88 1.  ]
[0.65 0.55 0.61 0.07 0.28 0.   0.21 0.57 0.44 0.88]
[0.25 0.68 0

In [12]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# The sentences to encode (two paraphrases of the same scene)
sentences = [
    "The sky turned a brilliant shade of orange as the sun set over the horizon.",
    "As the sun dipped below the horizon, the sky was painted in vibrant hues of orange.",
]

# 2. Calculate embeddings by calling model.encode()
# normalize_embeddings is a boolean flag, so pass False explicitly rather than None.
embeddings = model.encode(sentences, normalize_embeddings=False)
print(embeddings.shape)
# (2, 384) -- one row per input sentence

# 3. Calculate the embedding similarities (left disabled)
# similarities = model.similarity(embeddings, embeddings)
# print(similarities)
# NOTE: for the two sentences above this would be a 2 x 2 matrix.



(2, 384)


In [13]:
# Length (L2 norm) of the first sentence embedding.
first_embedding = embeddings[0]
np.linalg.norm(first_embedding)

1.0

In [8]:
import numpy as np
from sentence_transformers import util

# Compare the library's cosine similarity with a plain dot product
# of the same two sentence embeddings.
emb_a = embeddings[0]
emb_b = embeddings[1]
print(util.cos_sim(emb_a, emb_b))
np.dot(emb_a, emb_b)

tensor([[0.8696]])


0.8695888