In [17]:
!pip install transformers



In [2]:
from transformers import AutoTokenizer, AutoModelForPreTraining
import numpy as np
import torch

# The tokenizer and the model are loaded from the same pretrained checkpoint,
# so the checkpoint id is spelled out once and reused for both.
PRETRAINED_CHECKPOINT = "dbmdz/bert-base-turkish-uncased"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# output_hidden_states=True makes the forward pass expose the per-layer hidden
# states, which the embedding code below reads via `outputs.hidden_states`.
model = AutoModelForPreTraining.from_pretrained(
    PRETRAINED_CHECKPOINT,
    output_hidden_states=True,
)

In [16]:
# Shape of a single embedding vector. Read from the loaded model's
# configuration instead of hard-coding 768, so the constant stays correct if a
# checkpoint with a different hidden size is loaded.
EMBEDDING_VECTOR_SHAPE = (model.config.hidden_size,)

# Put the model into evaluation mode before any forward pass is run.
model.eval()

# Reference: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
def _findPhraseStart(sentenceTokens, phraseTokens):
    """Return the start index of the last full occurrence of phraseTokens in sentenceTokens, or -1 if absent."""
    phraseLength = len(phraseTokens)
    # Scan right-to-left so the last occurrence wins, matching the original
    # loop, which kept overwriting the start index and never broke out.
    for startIndex in range(len(sentenceTokens) - phraseLength, -1, -1):
        if sentenceTokens[startIndex:startIndex + phraseLength] == phraseTokens:
            return startIndex
    return -1


def getPhraseEmbedding(sentence, phrase):
    """Return the mean contextual embedding of `phrase` as it appears in `sentence`.

    Both strings are split with the module-level `tokenizer`. The sentence
    tokens are fed through the module-level `model`; every token's embedding
    is the mean of its last four hidden-layer vectors, and the result is the
    mean of those token embeddings over the span covered by the phrase.

    Args:
        sentence: Text that provides the context and contains the phrase.
        phrase: Word or word group whose embedding is wanted. Its token
            sequence must occur contiguously in the sentence's token
            sequence; if it occurs more than once, the last occurrence is
            used.

    Returns:
        A one-dimensional float64 `numpy.ndarray` with one entry per hidden
        feature of the model.

    Raises:
        ValueError: If the phrase produces no tokens, or if its full token
            sequence is not found in the tokenized sentence.
    """
    sentenceTokens = tokenizer.tokenize(sentence)
    phraseTokens = tokenizer.tokenize(phrase)  # Tokenization of the requested word or word group

    # Locate the phrase before running the model, so bad input fails fast and
    # with a clear message instead of an UnboundLocalError / IndexError later.
    if not phraseTokens:
        raise ValueError("phrase produced no tokens: %r" % (phrase,))
    phraseStartIndex = _findPhraseStart(sentenceTokens, phraseTokens)
    if phraseStartIndex < 0:
        raise ValueError(
            "phrase %r (tokens %r) not found in sentence tokens %r"
            % (phrase, phraseTokens, sentenceTokens)
        )
    phraseEndIndex = phraseStartIndex + len(phraseTokens)

    # NOTE(review): tokenize() output is used as-is, so the model input carries
    # no [CLS]/[SEP] markers, unlike the referenced tutorial. Kept unchanged
    # here so existing results stay the same - confirm this is intended.
    inputIds = torch.tensor([tokenizer.convert_tokens_to_ids(sentenceTokens)])

    with torch.no_grad():
        # Arguments are passed by keyword. The original passed an all-ones
        # "segment ids" tensor as the second positional argument, which the
        # model's forward() receives as the attention mask; the all-ones mask
        # is kept so the computation is unchanged.
        outputs = model(input_ids=inputIds, attention_mask=torch.ones_like(inputIds))

        # Stack the last four hidden layers -> [layer, batch, token, feature],
        # average over the layer axis and drop the batch axis
        # -> [token, feature]: one embedding vector per sentence token.
        lastFourLayers = torch.stack(outputs.hidden_states[-4:], dim=0)
        tokenEmbeddings = lastFourLayers.mean(dim=0).squeeze(dim=0)

        # Token embeddings that belong to the phrase span.
        phraseTokenEmbeddings = tokenEmbeddings[phraseStartIndex:phraseEndIndex]

    # Average over the phrase tokens in float64, as the original NumPy
    # accumulator did, and hand back a plain NumPy vector.
    return phraseTokenEmbeddings.cpu().numpy().astype(np.float64).mean(axis=0)

def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Both arguments are converted to NumPy arrays; the result is their dot
    product divided by the product of their Euclidean norms.
    """
    first = np.asarray(vector1)
    second = np.asarray(vector2)

    dotProduct = np.dot(first, second)
    firstNorm = np.sqrt(np.square(first).sum())
    secondNorm = np.sqrt(np.square(second).sum())

    return dotProduct / (firstNorm * secondNorm)

# Embed each word using itself as the whole sentence, then print how similar
# the two embeddings are.
uzunEmbedding = getPhraseEmbedding("uzun", "uzun")
kisaEmbedding = getPhraseEmbedding("kısa", "kısa")
print(cosine_similarity(uzunEmbedding, kisaEmbedding))

0.845899656989336
