<a href="https://www.kaggle.com/code/aleksandrmorozov123/transformers?scriptVersionId=232563749" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/14-million-word-corpus-txt/corpus.txt


**Self-attention mechanizm is a foundational block of all transformer architectures**

In [2]:
import torch
from torch.nn.functional import softmax

In [3]:
# we start with 3 inputs, each with dimension 4
x = [
    [1, 0, 2, 0],
    [0, 3, 0, 3],
    [2, 2, 2, 2]
]

x = torch.tensor (x, dtype = torch.float32)
x

tensor([[1., 0., 2., 0.],
        [0., 3., 0., 3.],
        [2., 2., 2., 2.]])

In [4]:
# initialise weights
w_key = [
    [0, 1, 1],
    [2, 1, 0],
    [1, 0, 1],
    [3, 3, 1]
]
w_query = [
    [1, 0, 1],
    [2, 0, 3],
    [0, 2, 3],
    [0, 2, 2]
]
w_value = [
    [0, 2, 1],
    [0, 2, 0],
    [1, 3, 0],
    [2, 2, 0]
]

w_key = torch.tensor (w_key, dtype = torch.float32)
w_query = torch.tensor (w_query, dtype = torch.float32)
w_value = torch.tensor (w_value, dtype = torch.float32)

print ("Weights for key: \n", w_key)
print ("Weights for query: \n", w_query)
print ("Weights for value: \n", w_value)

Weights for key: 
 tensor([[0., 1., 1.],
        [2., 1., 0.],
        [1., 0., 1.],
        [3., 3., 1.]])
Weights for query: 
 tensor([[1., 0., 1.],
        [2., 0., 3.],
        [0., 2., 3.],
        [0., 2., 2.]])
Weights for value: 
 tensor([[0., 2., 1.],
        [0., 2., 0.],
        [1., 3., 0.],
        [2., 2., 0.]])


In [5]:
# adding the bias vector to the product of matrix multiplication
keys = x @ w_key
querys = x @ w_query
values = x @ w_value

print ("Keys: \n", keys)
print ("Querys: \n", querys)
print ("Values: \n", values)

Keys: 
 tensor([[ 2.,  1.,  3.],
        [15., 12.,  3.],
        [12., 10.,  6.]])
Querys: 
 tensor([[ 1.,  4.,  7.],
        [ 6.,  6., 15.],
        [ 6.,  8., 18.]])
Values: 
 tensor([[ 2.,  8.,  1.],
        [ 6., 12.,  0.],
        [ 6., 18.,  2.]])


In [6]:
# calculating attention scores
attn_scores = querys @ keys.T
print (attn_scores)

tensor([[ 27.,  84.,  94.],
        [ 63., 207., 222.],
        [ 74., 240., 260.]])


In [7]:
# calculate softmax
attn_scores_softmax = softmax (attn_scores, dim = -1)

print (attn_scores_softmax)

tensor([[7.9845e-30, 4.5398e-05, 9.9995e-01],
        [0.0000e+00, 3.0590e-07, 1.0000e+00],
        [0.0000e+00, 2.0612e-09, 1.0000e+00]])


In [8]:
attn_scores_softmax = [
    [0.0, 0.5, 0.5],
    [0.0, 1.0, 0.0],
    [0.0, 0.9, 0.1]
]
attn_scores_softmax = torch.tensor (attn_scores_softmax)

print (attn_scores_softmax)

tensor([[0.0000, 0.5000, 0.5000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 0.9000, 0.1000]])


In [9]:
# multiply scores with values
weighted_values = values[:, None] * attn_scores_softmax.T[:,:, None]
print (weighted_values)

tensor([[[ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000]],

        [[ 3.0000,  6.0000,  0.0000],
         [ 6.0000, 12.0000,  0.0000],
         [ 5.4000, 10.8000,  0.0000]],

        [[ 3.0000,  9.0000,  1.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.6000,  1.8000,  0.2000]]])


In [10]:
# multiply scores with values of input 2 and input 3
outputs = weighted_values.sum (dim = 0)
print (outputs)

tensor([[ 6.0000, 15.0000,  1.0000],
        [ 6.0000, 12.0000,  0.0000],
        [ 6.0000, 12.6000,  0.2000]])


**Tokenizer**

In [11]:
!pip install gensim
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
import math
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings (action = 'ignore')

In [20]:
# Word2Vec Tokenization
sample = open ('/kaggle/input/14-million-word-corpus-txt/corpus.txt', "r")
s = sample.read ()

# processing esacape characters
f = s.replace ("\n", " ")
data = []

# sentence parsing
for i in sent_tokenize(f):
    temp = []
    #tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append (j.lower())
    data.append (temp)

# create Skip Gram model
model2 = gensim.models.Word2Vec (data, min_count = 1, vector_size = 512, window = 5, sg = 1)
print = model2

KeyboardInterrupt: 

In [15]:
# function for calculating cosine similarity'
def similarity (word1, word2):
    cosine = False
    try:
        a = model2[word1]
        cosine = True
    except KeyError:
        print (word1, ":[unk] key not found in the dictionary")
    try:
        b = model2[word2]
    except KeyError:
        cosine = False
        print (word2, ":[unk] key not found in the dictionary")
    if (cosine == True):
        b = model2 [word2]
        # compute cosine similarity
        dot = np.dot (a, b)
        norma = np.linalg.norm (a)
        normb = np.linalg.norm (b)
        cos = dot / (norma * normb)
        aa = a.reshape  (1, 512)
        ba = b.reshape (1, 512)
        cos_lib = cosine_similarity (aa, ba)
    if (cosine == False):
        cos_lib = 0
        return cos_lib

In [19]:
# similarity words in the dataset and the dictionary
def similarity (word1, word2):
    cosine = False
    try:
        a = model2.wv[word1]
        cosine = True
    except KeyError:
        print ("The word ", word1, " does not exist in the dictionary")
    try:
        b = model2.wv[word2]
    except KetyError:
        print ("The word ", word2, " does not exist in the dictionary")
        cosine = False
    if cosine:
        return cosine_similarity ([a], [b])
    else:
        return 0

In [18]:
word1 = "sun"
word2 = "dream"
print ("Similarity between", word1, "and", word2, "is", similarity (word1, word2))

TypeError: 'Word2Vec' object is not callable