In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x1d2ee648b10>

In [79]:
word_to_ix = {"salam": 0, "necəsən": 1}
embeds = nn.Embedding(2, 5)  
lookup_tensor = torch.tensor([word_to_ix["necəsən"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[-0.1661, -1.5228,  0.3817, -1.0276, -0.5631]],
       grad_fn=<EmbeddingBackward0>)


In [118]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now
ngrams = [
    (
        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)],
        test_sentence[i]
    )
    for i in range(CONTEXT_SIZE, len(test_sentence))
]

print(ngrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

[(['forty', 'When'], 'winters'), (['winters', 'forty'], 'shall'), (['shall', 'winters'], 'besiege')]


In [119]:
word_to_ix['winters']

89

In [133]:
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [134]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [135]:
for epoch in range(10):
    total_loss = 0
    for context, target in ngrams:

      
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

     
        model.zero_grad()

      
        log_probs = model(context_idxs)

       
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

      
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  


print(model.embeddings.weight[word_to_ix["beauty"]])

[523.4939029216766, 521.1120574474335, 518.748562335968, 516.4032638072968, 514.0753750801086, 511.7617199420929, 509.4619884490967, 507.1765878200531, 504.9051568508148, 502.64693689346313]
tensor([ 0.0470,  0.6477,  0.1442,  0.1578, -0.9045,  0.8705,  0.9191,  1.1073,
         2.1022, -0.1647], grad_fn=<SelectBackward0>)


In [136]:
word_to_ix['cold.']

28

In [137]:
import pandas as pd

In [140]:
word = 'cold.'
word_arr = []
dist_arr = []
for w in vocab:
    # dist = torch.norm( - model.embeddings.weight[word_to_ix[w]])
    dist = torch.cosine_similarity(model.embeddings.weight[word_to_ix[word]].unsqueeze(0),model.embeddings.weight[word_to_ix[w]].unsqueeze(0))
    word_arr.append(w)
    dist_arr.append(torch.norm(dist).item())
df = pd.DataFrame({'word':word_arr,'similarity':dist_arr})

In [141]:
df.sort_values(by='similarity').head(10)

Unnamed: 0,word,similarity
70,small,0.005815
52,thine,0.006417
42,If,0.00798
20,"eyes,",0.017048
56,forty,0.020096
35,When,0.030605
64,couldst,0.031955
18,it,0.037172
0,held:,0.037192
13,art,0.037457


In [142]:
model.embeddings.weight[word_to_ix["When"]]

tensor([-0.4183, -1.1325,  1.3679, -0.1839,  0.2949,  0.3500,  0.2918,  2.5289,
        -2.2841, -1.2369], grad_fn=<SelectBackward0>)

In [143]:
CONTEXT_SIZE = 2  
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()


vocab = set(raw_text)
vocab_size = len(vocab)


In [144]:
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [145]:
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (
        [raw_text[i - j - 1] for j in range(CONTEXT_SIZE)]
        + [raw_text[i + j + 1] for j in range(CONTEXT_SIZE)]
    )
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['are', 'We', 'to', 'study'], 'about'), (['about', 'are', 'study', 'the'], 'to'), (['to', 'about', 'the', 'idea'], 'study'), (['study', 'to', 'idea', 'of'], 'the'), (['the', 'study', 'of', 'a'], 'idea')]


-------------------------------------

In [146]:
import unicodedata
import string

In [147]:
all_letters = string.ascii_letters + " .,;'"

In [148]:
def unicodeToAscii(s):
    s =s.lower().replace('ə','e')
    s =s.replace('ı','i')
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [149]:
print(unicodeToAscii('günöğəəğqeş'))

gunogeegqes


In [453]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

CONTEXT_SIZE = 2 
EMDEDDING_DIM = 100
file = open('nagil.txt','r',encoding="utf8").read()

raw_text = file.split()


for ix,word in enumerate(raw_text):
    raw_text[ix] = unicodeToAscii(word)
    
vocab = set(raw_text)

vocab_size = len(vocab)

word_to_ix = {word:ix for ix, word in enumerate(vocab)}
ix_to_word = {ix:word for ix, word in enumerate(vocab)}

data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))

In [454]:
vocab_size

1216

In [455]:
data[6]

(['texnologiyalari', 'insanlarin', 'suretle', 'daxil'], 'heyatina')

In [456]:
class CBOW(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

      
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
       
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = torch.tensor([word_to_ix[word]])
        return self.embeddings(word).view(1,-1)

In [457]:
model = CBOW(vocab_size, EMDEDDING_DIM)

In [458]:
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [459]:
for epoch in range(100):
    total_loss = 0

    for context, target in data:
        context_vector = make_context_vector(context, word_to_ix)  

        log_probs = model(context_vector)

        total_loss += loss_function(log_probs, torch.tensor([word_to_ix[target]]))

   
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
    print(f'epoch: {epoch} total loss: {total_loss}')


epoch: 0 total loss: 18115.966796875
epoch: 1 total loss: 17146.6015625
epoch: 2 total loss: 16413.990234375
epoch: 3 total loss: 16028.76953125
epoch: 4 total loss: 15451.2177734375
epoch: 5 total loss: 15487.77734375
epoch: 6 total loss: 14898.203125
epoch: 7 total loss: 14855.1162109375
epoch: 8 total loss: 14320.1552734375
epoch: 9 total loss: 14240.9375
epoch: 10 total loss: 13675.439453125
epoch: 11 total loss: 13644.5537109375
epoch: 12 total loss: 14299.654296875
epoch: 13 total loss: 13720.18359375
epoch: 14 total loss: 12671.6103515625
epoch: 15 total loss: 12051.185546875
epoch: 16 total loss: 12234.6552734375
epoch: 17 total loss: 11180.5029296875
epoch: 18 total loss: 10111.392578125
epoch: 19 total loss: 10763.447265625
epoch: 20 total loss: 10269.3466796875
epoch: 21 total loss: 10822.2880859375
epoch: 22 total loss: 10101.205078125
epoch: 23 total loss: 8901.2158203125
epoch: 24 total loss: 8685.5654296875
epoch: 25 total loss: 8386.7333984375
epoch: 26 total loss: 7030

In [460]:
data[1]

(['ki,', 'son', 'suni', 'intellekt'], 'zamanlar')

In [461]:
context = ['ki,', 'son', 'suni', 'intellekt']
for ix,word in enumerate(context):
    context[ix] = unicodeToAscii(word)
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

#Print result
print(f'Raw text: {" ".join(raw_text)}\n')
print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Raw text: melumdur ki, son zamanlar suni intellekt texnologiyalari insanlarin heyatina suretle daxil olur. yeni texnologiyalar dunya olkelerinin siyasetine tesir edir, dovletlerin strateji potensialini mueyyenlesdirir. suni intellekt texnologiyalarindan hem herbi, hem de mulki meqsedlerle genis istifade olunur. artiq beseriyyetin inkisafinin bu merhelesinde suni intellekt insanlarin aile ve cemiyyet, hakimiyyet ve vetendas munasibetlerine guclu tesir ederek yeni realliqlar yaratmaqda, dovrumuzun ideologiyasina cevrilmekdedir. elbette ki, bu tendensiyanin da musbet ve menfi terefleri vardir. suni intellekt nezeriyyesinin elminezeri esaslarina geldikde, demek olar ki, insanlarin butun dovrlerde suni intellektle bagli arzulari olub. hetta bu texnologiyalar meydana gelmemisden once ayzek azimov, karel capek kimi fantast yazicilar oz eserlerinde suni intellekt meselelerine toxunub, muxtelif mulahizeler ireli surubler. lakin suni intellektin reallasmasi oten esrin ci illerinden sonra, komput

In [462]:
data[10]

(['daxil', 'olur.', 'texnologiyalar', 'dunya'], 'yeni')

In [463]:
context = ['daxil', 'olur.', 'texnologiyalar', 'dunya']
for ix,word in enumerate(context):
    context[ix] = unicodeToAscii(word)
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

#Print result
print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Context: ['daxil', 'olur.', 'texnologiyalar', 'dunya']

Prediction: yeni


In [464]:
data[54]

(['realliqlar', 'yaratmaqda,', 'ideologiyasina', 'cevrilmekdedir.'],
 'dovrumuzun')

In [465]:
context = ['realliqlar', 'yaratmaqda,', 'ideologiyasina', 'cevrilmekdedir.']
for ix,word in enumerate(context):
    context[ix] = unicodeToAscii(word)
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

#Print result
print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Context: ['realliqlar', 'yaratmaqda,', 'ideologiyasina', 'cevrilmekdedir.']

Prediction: dovrumuzun


In [466]:
data[-250]

(['merhelesinde', 'ise', 'bascisi', 'ilham'], 'dovlet')

In [468]:
word = 'dovlet'
word_arr = []
dist_arr = []
for w in vocab:
    dist = torch.cosine_similarity(model.embeddings.weight[word_to_ix[word]].unsqueeze(0),model.embeddings.weight[word_to_ix[w]].unsqueeze(0))
    word_arr.append(w)
    dist_arr.append(torch.norm(dist).item())
df = pd.DataFrame({'word':word_arr,'similarity':dist_arr})
df.sort_values(by='similarity').head(10)

Unnamed: 0,word,similarity
180,usullari,0.000199
467,ugurla,0.000241
604,mutexessisler,0.000292
570,"cemiyyet,",0.000354
543,azad,0.000387
575,professor,0.000459
597,dayaqlarindan,0.000561
975,"qurgular,",0.000638
106,saglamligi,0.000688
738,xas,0.000724


In [469]:
data[100]

(['eserlerinde', 'suni', 'meselelerine', 'toxunub,'], 'intellekt')

In [470]:
word = 'suni'
word_arr = []
dist_arr = []
for w in vocab:
    dist = torch.cosine_similarity(model.embeddings.weight[word_to_ix[word]].unsqueeze(0),model.embeddings.weight[word_to_ix[w]].unsqueeze(0))
    word_arr.append(w)
    dist_arr.append(torch.norm(dist).item())
df = pd.DataFrame({'word':word_arr,'similarity':dist_arr})
df.sort_values(by='similarity').head(10)

Unnamed: 0,word,similarity
408,verilen,1.3e-05
317,oyrenme,0.00043
1198,fundamental,0.000449
513,ibaretdir.,0.000496
810,mukafatina,0.000553
852,"meqama,",0.000694
1142,cixarir.,0.000733
900,"institutlari,",0.0008
816,etdikleri,0.000818
879,ayriayri,0.001008


In [472]:
data[-1]

(['maraginin', 'artirilmasina', 'destek', 'verirler.'], 'boyuk')

In [473]:
word = 'destek'
word_arr = []
dist_arr = []
for w in vocab:
    dist = torch.cosine_similarity(model.embeddings.weight[word_to_ix[word]].unsqueeze(0),model.embeddings.weight[word_to_ix[w]].unsqueeze(0))
    word_arr.append(w)
    dist_arr.append(torch.norm(dist).item())
df = pd.DataFrame({'word':word_arr,'similarity':dist_arr})
df.sort_values(by='similarity').head(10)

Unnamed: 0,word,similarity
510,behs,0.000185
910,lutfi,0.000329
1114,inqilablarindan,0.000575
964,isler,0.000633
1017,malikdir.,0.000649
988,isteyirem.,0.000747
149,gelecekle,0.000765
1189,murekkeb,0.000901
229,intellekt,0.000958
350,yeni,0.00122


In [475]:
word = 'boyuk'
word_arr = []
dist_arr = []
for w in vocab:
    dist = torch.cosine_similarity(model.embeddings.weight[word_to_ix[word]].unsqueeze(0),model.embeddings.weight[word_to_ix[w]].unsqueeze(0))
    word_arr.append(w)
    dist_arr.append(torch.norm(dist).item())
df = pd.DataFrame({'word':word_arr,'similarity':dist_arr})
df.sort_values(by='similarity').head(10)

Unnamed: 0,word,similarity
902,meqalesi,0.000319
579,canlilarin,0.000487
328,genetik,0.000526
389,fantastik,0.000589
1205,aglasigmaz,0.000592
958,induksiya,0.000637
397,yox,0.000768
987,telebleri,0.000897
945,silahlar,0.001036
105,zade,0.001095


In [480]:
data[-20]

(['asilanmasi', 'layihesi', 'steam,', 'bu'], 'olan')

In [482]:
word = 'steam,'
word_arr = []
dist_arr = []
for w in vocab:
    dist = torch.cosine_similarity(model.embeddings.weight[word_to_ix[word]].unsqueeze(0),model.embeddings.weight[word_to_ix[w]].unsqueeze(0))
    word_arr.append(w)
    dist_arr.append(torch.norm(dist).item())
df = pd.DataFrame({'word':word_arr,'similarity':dist_arr})
df.sort_values(by='similarity').head(10)

Unnamed: 0,word,similarity
335,proqrami,6.4e-05
351,imkanlarla,0.000115
1006,silah,0.000232
1023,mualice,0.000468
1122,hedde,0.00048
673,ise,0.000505
505,dasidigi,0.000542
1210,alimlerimiz,0.000552
462,olmusdur.,0.000579
798,asanliqla,0.000626


In [484]:
data[-29]

(['olunan', 'sabahin', 'musabiqesi,', 'sagirdlere'], 'alimleri')

In [486]:
word = 'sabahin'
word_arr = []
dist_arr = []
for w in vocab:
    dist = torch.cosine_similarity(model.embeddings.weight[word_to_ix[word]].unsqueeze(0),model.embeddings.weight[word_to_ix[w]].unsqueeze(0))
    word_arr.append(w)
    dist_arr.append(torch.norm(dist).item())
df = pd.DataFrame({'word':word_arr,'similarity':dist_arr})
df.sort_values(by='similarity').head(10)

Unnamed: 0,word,similarity
759,peselerin,4e-05
598,riskler,0.000103
506,arasdirmalar,0.00014
527,texnologiyalarla,0.000186
1196,teskil,0.000253
366,birbiri,0.000322
828,imkanlardan,0.000334
276,yaxsi,0.000431
217,"basqa,",0.000655
214,maraginin,0.000668
