<a href="https://colab.research.google.com/github/dhruv2600/Capstone/blob/main/multi_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing the necessary libararies**

In [1]:
!pip install numpy
!pip install torch
!pip install sklearn
!pip install pytorch_transformers

Collecting pytorch_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 26.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 26.7MB/s 
Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/1b/55/e66b557bdbc266ab4f15249f382f5d7d165fee1caa7e12c96348c05ea53d/boto3-1.17.95-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 47.8MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |█

# **Loading the Pre-trained BERT model**

In [2]:
from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel

## Load pretrained model/tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True)


100%|██████████| 231508/231508 [00:00<00:00, 16715698.58B/s]
100%|██████████| 433/433 [00:00<00:00, 338830.90B/s]
100%|██████████| 440473133/440473133 [00:09<00:00, 46195230.31B/s]


# **Input Formatting (Tokenization)**

In [3]:
# Define a new example sentence with multiple meanings of the word "bank"
text = "Here is the sentence I want embeddings for."

# Add the special tokens.
marked_text = "[CLS] " + text + " [SEP]"

# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

[CLS]           101
here          2,182
is            2,003
the           1,996
sentence      6,251
i             1,045
want          2,215
em            7,861
##bed         8,270
##ding        4,667
##s           2,015
for           2,005
.             1,012
[SEP]           102


# **Running BERT on the text**

In [4]:
import torch

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [14]:
# Run the text through BERT, get the output and collect all of the hidden states produced
# from all 12 layers.
def getBertEncoding(text):
  # Define a new example sentence with multiple meanings of the word "bank"
 

# Add the special tokens.
  marked_text = "[CLS] " + text + " [SEP]"

# Split the sentence into tokens.
  tokenized_text = tokenizer.tokenize(marked_text)

# Map the token strings to their vocabulary indeces.
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
  for tup in zip(tokenized_text, indexed_tokens):
      print('{:<12} {:>6,}'.format(tup[0], tup[1]))
        
  with torch.no_grad():

      outputs = model(tokens_tensor)

    # can use last hidden state as word embeddings
      last_hidden_state = outputs[0]
      word_embed_1 = last_hidden_state

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
      hidden_states = outputs[2]

    # initial embeddings can be taken from 0th layer of hidden states
      word_embed_2 = hidden_states[0]

    # sum of all hidden states
      word_embed_3 = torch.stack(hidden_states).sum(0)

    # sum of second to last layer
      word_embed_4 = torch.stack(hidden_states[2:]).sum(0) 

    # sum of last four layer
      word_embed_5 = torch.stack(hidden_states[-4:]).sum(0) 

    #concat last four layers
      word_embed_6 = torch.cat([hidden_states[i] for i in [-1,-2,-3,-4]], dim=-1)

      return word_embed_5




In [18]:
student_ans = "Sky is red"
question = "What is the colour of the sky?"
reference_ans ="Sky appears blue"

In [19]:
K = getBertEncoding(student_ans)
Q = getBertEncoding(question)

[CLS]           101
sky           3,712
is            2,003
red           2,417
[SEP]           102
[CLS]           101
what          2,054
is            2,003
the           1,996
colour        6,120
of            1,997
the           1,996
sky           3,712
?             1,029
[SEP]           102


In [20]:
K_values = K[0][1]
Q_values = Q[0][1]

In [22]:
import numpy as np
K_values = np.array(K_values)
Q_values = np.array(Q_values)

In [23]:
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [24]:
K = torch.from_numpy(K_values)
Q = torch.from_numpy(Q_values)

In [26]:
K = getBertEncoding(student_ans)
Q = getBertEncoding(question)

[CLS]           101
sky           3,712
is            2,003
red           2,417
[SEP]           102
[CLS]           101
what          2,054
is            2,003
the           1,996
colour        6,120
of            1,997
the           1,996
sky           3,712
?             1,029
[SEP]           102


In [30]:
import math
from torch import nn
import torch.nn.functional as F
cross_attention = attention(Q,K,K)

In [32]:
multihead_attn = nn.MultiheadAttention(embed_dim = 768, num_heads=3)

In [33]:
multihead_attn(K,K,K)

(tensor([[[-1.1576,  1.4937,  0.1382,  ..., -0.6803,  0.0493,  0.3212],
          [ 1.4715, -0.7007, -0.4951,  ...,  0.5699, -0.8588, -1.3575],
          [-0.3468,  0.4541, -1.6388,  ..., -0.1949, -0.2089, -0.3386],
          ...,
          [-0.1864, -0.9092,  0.7743,  ..., -2.0339,  0.0060,  0.8453],
          [ 0.0273, -0.0093,  0.2529,  ...,  0.4722, -0.0395,  0.3065],
          [ 0.1637, -0.0177,  0.0786,  ...,  0.3874, -0.2167,  0.4599]]],
        grad_fn=<AddBackward0>), tensor([[[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]]], grad_fn=<DivBackward0>))

In [34]:
multihead_attn(K,Q,K)

(tensor([[[-1.1576,  1.4937,  0.1382,  ..., -0.6803,  0.0493,  0.3212],
          [ 1.4715, -0.7007, -0.4951,  ...,  0.5699, -0.8588, -1.3575],
          [-0.3468,  0.4541, -1.6388,  ..., -0.1949, -0.2089, -0.3386],
          ...,
          [-0.1864, -0.9092,  0.7743,  ..., -2.0339,  0.0060,  0.8453],
          [ 0.0273, -0.0093,  0.2529,  ...,  0.4722, -0.0395,  0.3065],
          [ 0.1637, -0.0177,  0.0786,  ...,  0.3874, -0.2167,  0.4599]]],
        grad_fn=<AddBackward0>), tensor([[[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]],
 
         [[1.]]], grad_fn=<DivBackward0>))

In [35]:
import torch
import torch.nn as nn

In [36]:
T = 5 #5 words sequence/Timesteps
E = 768 #word embedding

In [37]:
x = torch.randn(T,E) # (T,E)
x.shape

torch.Size([5, 768])

In [38]:
torch.Size([5, 768])

torch.Size([5, 768])

In [None]:
query = nn.Linear(E, E) 
key = nn.Linear(E, E)
val = nn.Linear(E, E)

q = query(x) # (T,E)
k = key(x) # (T,E)
v = val(x) # (T,E)

In [40]:
multihead_attn = nn.MultiheadAttention(E, num_heads=12)

In [None]:
attn_output, attn_output_weights = multihead_attn(q, k, v)