# Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fd53f20b410>

In [0]:

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate
drive = None
def authenticate():
    global drive
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

#Download files
def downloadFiles(fileIds):
    authenticate()
    for fileId in fileIds:    
        downloaded = drive.CreateFile({"id": fileId[1]})
        downloaded.GetContentFile(fileId[0])


In [0]:

# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [4]:
# Read file and do tokenization
def load_doc(link, file_name):

  fluff, id = link.split('=')
  print (id) # 
  downloaded = drive.CreateFile({'id':id}) 
  downloaded.GetContentFile(file_name)  

  f = open(file_name)
  documents = f.readlines()

  x = []
  y = []

  for i in documents:
    temp = i.rsplit(",", 1)
    x.append(temp[0])
    y.append(temp[1])

  x = x[1:]
  y = y[1:]

  x_split = []
  for i in x:
    i = i.lower()
    x_split.append(i.split(" "))

  y_split = []
  for i in y:
    i = i.replace('\n','')
    y_split.append(i.split(" "))

  return x_split, y_split

train_data, target_y_train = load_doc("https://drive.google.com/open?id=1X7FZzNUZMQpGuFurQiedUp-ShwpBssxT", "train.csv")
validation_data, target_y_validation = load_doc("https://drive.google.com/open?id=1cSGqqGnjpQSWL9G2nScMibCgASl37eLu", "val.csv")

1X7FZzNUZMQpGuFurQiedUp-ShwpBssxT
1cSGqqGnjpQSWL9G2nScMibCgASl37eLu


In [5]:
# Load test data
link = "https://drive.google.com/open?id=1-Fj0TGniAoZ7dB1zzmeWofD0wIQv_dYe"
file_name = "test.csv"

fluff, id = link.split('=')
print (id) # 
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile(file_name)  

f = open(file_name)
documents = f.readlines()

x = []
for i in documents:
  x.append(i)
x = x[1:]

x_split = []
for i in x:
  i = i.lower()
  i = i.replace('\n','')
  x_split.append(i.split(" "))

test_data = x_split

print (test_data[0:5])



1-Fj0TGniAoZ7dB1zzmeWofD0wIQv_dYe
[['-docstart-,'], ['"soccer', '-', 'japan', 'get', 'lucky', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.",'], ['nadim', 'ladki,'], ['"al-ain', ',', 'united', 'arab', 'emirates', '1996-12-06",'], ['japan', 'began', 'the', 'defence', 'of', 'their', 'asian', 'cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'syria', 'in', 'a', 'group', 'c', 'championship', 'match', 'on', 'friday', '.,']]


In [0]:

train_tokens = train_data
val_tokens = validation_data
test_tokens = test_data

In [0]:


# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '1X7FZzNUZMQpGuFurQiedUp-ShwpBssxT'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('train.csv')

id = '1cSGqqGnjpQSWL9G2nScMibCgASl37eLu'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('val.csv')

id = '1-Fj0TGniAoZ7dB1zzmeWofD0wIQv_dYe'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('test.csv')

import pandas as pd

def read_data(file_name):
  data = pd.read_csv(open(file_name))
  return data['Sentence'].values, data['NER'].values

# Load data in string format
train_data_b, train_label_b = read_data('train.csv')
val_data_b, val_label_b = read_data('val.csv')
test_data_b ,_ = read_data('test.csv')



# Input Embedding

## POS

In [8]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

# Generate pos tagging by tokenized data
# data: tokenized sentences
# return: a list of sentence tags
def posTag(data):
  sent_tags = []
  for sent in data:
    tags = []
    for (word, tag) in nltk.pos_tag(sent):
      tags.append(tag)
    sent_tags.append(tags)
  return sent_tags

tag_set = set(t for l in posTag(train_tokens+val_tokens+test_tokens) for t in l)
tag_dict = {tag:val for val,tag in enumerate(tag_set)}
print(tag_dict)

# convert tags from string to vector
# tags: lists of pos tags
# return: lists of tag vectors
def tagVector(tags):
  tag_vectors = []
  for sent in tags:
    tv = []
    for t in sent:
      tv.append(tag_dict[t])
    tag_vectors.append(tv)
  return tag_vectors

train_tv = tagVector(posTag(train_tokens))
val_tv = tagVector(posTag(val_tokens))
test_tv = tagVector(posTag(test_tokens))

train_data_pos = train_tv
val_data_pos = val_tv
test_data_pos = test_tv

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
{'WRB': 0, ',': 1, 'NN': 2, '.': 3, 'MD': 4, 'VBD': 5, 'POS': 6, 'VBG': 7, "''": 8, 'WP': 9, 'NNPS': 10, 'RP': 11, 'RBR': 12, 'NNP': 13, 'VB': 14, 'JJS': 15, 'IN': 16, 'VBZ': 17, 'WDT': 18, 'LS': 19, ':': 20, 'RB': 21, 'SYM': 22, 'EX': 23, 'FW': 24, 'JJ': 25, 'VBP': 26, 'VBN': 27, 'RBS': 28, 'CC': 29, 'TO': 30, 'UH': 31, 'NNS': 32, 'CD': 33, 'DT': 34, ')': 35, 'PRP': 36, '$': 37, 'JJR': 38, '(': 39, 'PRP$': 40, 'WP$': 41, 'PDT': 42}


In [9]:
#embedding for pos

vocab = []
for v in tag_dict:
  vocab.append(v)

def onHot(sequences, dimension=len(vocab)):
    sequences = np.array(sequences)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
      results[i, sequence] = 1
    return results
# Vocab is list of pos
print (vocab)

vocab = tagVector(posTag(vocab))

one_hot_postag_train = []
for i in vocab:
  temp = onHot(i)
  temp = temp.tolist()
  one_hot_postag_train.append(temp)

embedding_matrix_pos = np.identity(45, dtype = float) 

['WRB', ',', 'NN', '.', 'MD', 'VBD', 'POS', 'VBG', "''", 'WP', 'NNPS', 'RP', 'RBR', 'NNP', 'VB', 'JJS', 'IN', 'VBZ', 'WDT', 'LS', ':', 'RB', 'SYM', 'EX', 'FW', 'JJ', 'VBP', 'VBN', 'RBS', 'CC', 'TO', 'UH', 'NNS', 'CD', 'DT', ')', 'PRP', '$', 'JJR', '(', 'PRP$', 'WP$', 'PDT']


In [10]:
print (embedding_matrix_pos.shape)

(45, 45)


## TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import math

# generate ti-idf values
# tokensized_docs: lists of tokenized data
# return: lists of tf-idf values
def tfidf(tokensized_docs):

  DF = {}

  for tokensized_doc in tokensized_docs:
      # get each unique word in the doc - we need to know whether the word is appeared in the document
      for term in np.unique(tokensized_doc):
          try:
              DF[term] +=1
          except:
              DF[term] =1

  tf_idf = {}

  # total number of documents
  N = len(tokensized_docs)

  doc_id = 0
  # get each tokenised doc
  for tokensized_doc in tokensized_docs:
      # initialise counter for the doc
      counter = Counter(tokensized_doc)
      # calculate total number of words in the doc
      total_num_words = len(tokensized_doc)    

      # get each unique word in the doc
      for term in np.unique(tokensized_doc):

          #calculate Term Frequency 
          tf = counter[term]/total_num_words
          
          #calculate Document Frequency
          df = DF[term]

          # calculate Inverse Document Frequency
          idf = math.log(N/(df+1))+1

          # calculate TF-IDF
          tf_idf[doc_id, term] = tf*idf

      doc_id += 1
  return tf_idf

# extract tf-idf values
def tfidfVector(tokensized_docs, tfidf):
  ti_vectors = []
  for i,tokensized_doc in enumerate(tokensized_docs):
    ti_vector = []
    for word in tokensized_doc:
      ti_vector.append(tfidf[i,word])
    ti_vectors.append(ti_vector)
  return ti_vectors

#train data
train_tfidf = tfidf(train_tokens)
print(train_tfidf)
train_ti_vectors = tfidfVector(train_tokens, train_tfidf)
print(train_ti_vectors)

#val data
val_tfidf = tfidf(val_tokens)
print(val_tfidf)
val_ti_vectors = tfidfVector(val_tokens, val_tfidf)
print(val_ti_vectors)

#test data
test_tfidf = tfidf(test_tokens)
print(test_tfidf)
test_ti_vectors = tfidfVector(test_tokens, test_tfidf)
print(test_ti_vectors)


train_data_tfidf = train_ti_vectors
val_data_tfidf = val_ti_vectors
test_data_tfidf = test_ti_vectors

{(0, '-docstart-'): 3.7488721956224653, (1, '.'): 0.2645527999602545, (1, 'boycott'): 0.9236911541211444, (1, 'british'): 0.6345034112939908, (1, 'call'): 0.7696584473300456, (1, 'eu'): 0.7342746994279862, (1, 'german'): 0.6122066673537517, (1, 'lamb'): 0.8466748007255951, (1, 'rejects'): 0.9236911541211444, (1, 'to'): 0.31825890689587666, (2, 'blackburn'): 3.5302287092974667, (2, 'peter'): 4.15661019354515, (3, '1996-08-22'): 2.1809883342544367, (3, 'brussels'): 2.9576625571459654, (4, '.'): 0.07936583998807635, (4, 'advice'): 0.25400244021767854, (4, 'be'): 0.15059103993060358, (4, 'british'): 0.19035102338819726, (4, 'can'): 0.2173820305954082, (4, 'commission'): 0.23089753419901368, (4, 'consumers'): 0.23089753419901368, (4, 'cow'): 0.23089753419901368, (4, 'determine'): 0.25400244021767854, (4, 'disagreed'): 0.2635918426327379, (4, 'disease'): 0.20577180745313436, (4, 'european'): 0.19569577839070323, (4, 'german'): 0.18366200020612553, (4, 'it'): 0.14007821743056634, (4, 'lamb'):

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Dependency Parsing

In [12]:
import spacy
from prettytable import PrettyTable

nlp = spacy.load("en_core_web_sm")
parse = nlp(train_data_b[1])

x = PrettyTable()
x.add_row([0,"ROOT",0,"-"])
for token in parse:
  if token.dep_=="ROOT":
    x.add_row([token.i+1,token.text,"0",token.dep_])
  else:  
    x.add_row([token.i+1,token.text,token.head.i+1,token.dep_])
print(x)

# generate dependency parsing values
def dpVector(data):
  dp_vectors = []
  for sent in data:
    parse = nlp(sent)
    dp_vector = []
    for token in parse:
      if token.dep_=="ROOT":
        dp_vector.append(0)
      else:
        dp_vector.append(token.head.i+1)
    dp_vectors.append(dp_vector)
  return dp_vectors

train_dp_vectors = dpVector(train_data_b)
val_dp_vectors = dpVector(val_data_b)
test_dp_vectors = dpVector(test_data_b)
print(train_dp_vectors[0:10])


train_data_dp = train_dp_vectors
val_data_dp = val_dp_vectors
test_data_dp = test_dp_vectors

+---------+---------+---------+---------+
| Field 1 | Field 2 | Field 3 | Field 4 |
+---------+---------+---------+---------+
|    0    |   ROOT  |    0    |    -    |
|    1    |    eu   |    2    |  nsubj  |
|    2    | rejects |    0    |   ROOT  |
|    3    |  german |    4    |   amod  |
|    4    |   call  |    2    |   dobj  |
|    5    |    to   |    6    |   aux   |
|    6    | boycott |    4    |   acl   |
|    7    | british |    8    |   amod  |
|    8    |   lamb  |    6    |   dobj  |
|    9    |    .    |    2    |  punct  |
+---------+---------+---------+---------+
[[0], [2, 0, 4, 2, 6, 4, 8, 6, 2], [2, 0], [0, 1, 1, 6, 6, 1], [3, 3, 4, 0, 4, 5, 8, 4, 8, 11, 9, 8, 12, 15, 8, 17, 15, 20, 20, 15, 27, 24, 24, 27, 27, 27, 20, 27, 28, 4], [3, 1, 13, 3, 7, 7, 10, 7, 10, 4, 12, 13, 0, 13, 14, 18, 18, 13, 18, 18, 20, 21, 22, 23, 29, 28, 28, 29, 18, 29, 13], [29, 5, 5, 5, 29, 8, 8, 5, 13, 13, 13, 13, 5, 15, 13, 15, 16, 29, 29, 21, 24, 21, 24, 28, 27, 27, 28, 29, 0, 32, 32, 29, 2

## WORD EMBEDDING

In [0]:
# generate word to index dictionary
word_to_ix = {}
for sentence in train_data+validation_data+test_data:
    for word in sentence:
        word = word.lower()
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
word_list = list(word_to_ix.keys())

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in target_y_train+target_y_validation:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)


In [14]:
import gensim.downloader as api
word_emb_model = api.load("glove-twitter-100") 

EMBEDDING_DIM = 100

embedding_matrix = []
for word in word_list:
    try:
        embedding_matrix.append(word_emb_model.wv[word])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape





  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if __name__ == '__main__':


(15208, 100)

In [15]:
print (embedding_matrix.shape)

(15208, 100)


In [0]:
# convert word to index
def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

train_input_index =  to_index(train_data,word_to_ix)
train_output_index = to_index(target_y_train,tag_to_ix)
val_input_index = to_index(validation_data,word_to_ix)
val_output_index = to_index(target_y_validation,tag_to_ix)
test_input_index = to_index(test_data,word_to_ix)
#test_output_index = to_index(target_y_test,tag_to_ix)

# Model

## Multi-head Attention
We use the multi_head_attention_forward() method in pytorch 
https://github.com/pytorch/pytorch/blob/ada2652ca6610150cc3fd114c491e59860388649/torch/nn/functional.py

In [0]:
from torch.nn import MultiheadAttention
from torch.nn.parameter import Parameter
from torch.nn import Linear
from torch.nn.functional import linear, softmax, dropout
from torch.nn.init import constant_
from torch.nn.init import xavier_uniform_

def multi_head_attention_forward(query,                           # type: Tensor
                                 key,                             # type: Tensor
                                 value,                           # type: Tensor
                                 embed_dim_to_check,              # type: int
                                 num_heads,                       # type: int
                                 in_proj_weight,                  # type: Tensor
                                 in_proj_bias,                    # type: Tensor
                                 bias_k,                          # type: Optional[Tensor]
                                 bias_v,                          # type: Optional[Tensor]
                                 add_zero_attn,                   # type: bool
                                 dropout_p,                       # type: float
                                 out_proj_weight,                 # type: Tensor
                                 out_proj_bias,                   # type: Tensor
                                 training=True,                   # type: bool
                                 key_padding_mask=None,           # type: Optional[Tensor]
                                 need_weights=True,               # type: bool
                                 attn_mask=None,                  # type: Optional[Tensor]
                                 use_separate_proj_weight=False,  # type: bool
                                 q_proj_weight=None,              # type: Optional[Tensor]
                                 k_proj_weight=None,              # type: Optional[Tensor]
                                 v_proj_weight=None,              # type: Optional[Tensor]
                                 static_k=None,                   # type: Optional[Tensor]
                                 static_v=None                    # type: Optional[Tensor]
                                 ):
    # type: (...) -> Tuple[Tensor, Optional[Tensor]]
    r"""
    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        embed_dim_to_check: total dimension of the model.
        num_heads: parallel attention heads.
        in_proj_weight, in_proj_bias: input projection weight and bias.
        bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
        add_zero_attn: add a new batch of zeros to the key and
                       value sequences at dim=1.
        dropout_p: probability of an element to be zeroed.
        out_proj_weight, out_proj_bias: the output projection weight and bias.
        training: apply dropout if is ``True``.
        key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. This is an binary mask. When the value is True,
            the corresponding value on the attention layer will be filled with -inf.
        need_weights: output attn_output_weights.
        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
            the batches while a 3D mask allows to specify a different mask for the entries of each batch.
        use_separate_proj_weight: the function accept the proj. weights for query, key,
            and value in different forms. If false, in_proj_weight will be used, which is
            a combination of q_proj_weight, k_proj_weight, v_proj_weight.
        q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
        static_k, static_v: static key and value used for attention operators.
    Shape:
        Inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
          the embedding dimension.
        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
          the embedding dimension.
        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
          the embedding dimension.
        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
          If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
          will be unchanged. If a BoolTensor is provided, the positions with the
          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
          is provided, it will be added to the attention weight.
        - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
        - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
        Outputs:
        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
          E is the embedding dimension.
        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
          L is the target sequence length, S is the source sequence length.
    """

    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == embed_dim_to_check
    # allow MHA to have different sizes for the feature dimension
    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)

    head_dim = embed_dim // num_heads
    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
    scaling = float(head_dim) ** -0.5

    if not use_separate_proj_weight:
        if torch.equal(query, key) and torch.equal(key, value):
            # self-attention
            q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)

        elif torch.equal(key, value):
            # encoder-decoder attention
            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = 0
            _end = embed_dim
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            q = linear(query, _w, _b)

            if key is None:
                assert value is None
                k = None
                v = None
            else:

                # This is inline in_proj function with in_proj_weight and in_proj_bias
                _b = in_proj_bias
                _start = embed_dim
                _end = None
                _w = in_proj_weight[_start:, :]
                if _b is not None:
                    _b = _b[_start:]
                k, v = linear(key, _w, _b).chunk(2, dim=-1)

        else:
            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = 0
            _end = embed_dim
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            q = linear(query, _w, _b)

            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = embed_dim
            _end = embed_dim * 2
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            k = linear(key, _w, _b)

            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = embed_dim * 2
            _end = None
            _w = in_proj_weight[_start:, :]
            if _b is not None:
                _b = _b[_start:]
            v = linear(value, _w, _b)
    else:
        q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
        len1, len2 = q_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == query.size(-1)

        k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
        len1, len2 = k_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == key.size(-1)

        v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
        len1, len2 = v_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == value.size(-1)

        if in_proj_bias is not None:
            q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
            k = linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
            v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
        else:
            q = linear(query, q_proj_weight_non_opt, in_proj_bias)
            k = linear(key, k_proj_weight_non_opt, in_proj_bias)
            v = linear(value, v_proj_weight_non_opt, in_proj_bias)
    #####################################################
    # Scaled Dot-product
    q = q * scaling

    if attn_mask is not None:
        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
        if attn_mask.dtype == torch.uint8:
            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
            attn_mask = attn_mask.to(torch.bool)

        if attn_mask.dim() == 2:
            attn_mask = attn_mask.unsqueeze(0)
            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 2D attn_mask is not correct.')
        elif attn_mask.dim() == 3:
            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 3D attn_mask is not correct.')
        else:
            raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
        # attn_mask's dim is 3 now.

    # convert ByteTensor key_padding_mask to bool
    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
        key_padding_mask = key_padding_mask.to(torch.bool)

    if bias_k is not None and bias_v is not None:
        if static_k is None and static_v is None:
            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = pad(attn_mask, (0, 1))
            if key_padding_mask is not None:
                key_padding_mask = pad(key_padding_mask, (0, 1))
        else:
            assert static_k is None, "bias cannot be added to static key."
            assert static_v is None, "bias cannot be added to static value."
    else:
        assert bias_k is None
        assert bias_v is None

    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if k is not None:
        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
    if v is not None:
        v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)

    if static_k is not None:
        assert static_k.size(0) == bsz * num_heads
        assert static_k.size(2) == head_dim
        k = static_k

    if static_v is not None:
        assert static_v.size(0) == bsz * num_heads
        assert static_v.size(2) == head_dim
        v = static_v

    src_len = k.size(1)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if add_zero_attn:
        src_len += 1
        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

    # print(q)
    # print(k)
    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
    # print(attn_output_weights)
    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            attn_output_weights.masked_fill_(attn_mask, float('-inf'))
        else:
            attn_output_weights += attn_mask


    if key_padding_mask is not None:
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        attn_output_weights = attn_output_weights.masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),
            float('-inf'),
        )
        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)

    attn_output_weights = softmax(
        attn_output_weights, dim=-1)
    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)

    attn_output = torch.bmm(attn_output_weights, v)
    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None

## BiLSTM CRF

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torch.nn import LayerNorm, Dropout

torch.manual_seed(1)

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeds_pos = nn.Embedding(45, 45)


        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.word_embeds_pos.weight.data.copy_(torch.from_numpy(embedding_matrix_pos))



        self.lstm = nn.LSTM(embedding_dim + 45 + 1 + 1, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        
        #################################################################################################
        ## Attention initialize
        # input projection weights
        self.in_proj_weight = Parameter(torch.empty(3 * self.hidden_dim, self.hidden_dim))
        self.in_proj_bias = Parameter(torch.empty(3 * self.hidden_dim))
        # output projection weights
        self.out_proj = Linear(self.hidden_dim, self.hidden_dim, bias=True)

        xavier_uniform_(self.in_proj_weight)
        constant_(self.in_proj_bias, 0.)
        constant_(self.out_proj.bias, 0.)
        self.norm = LayerNorm(self.hidden_dim)
        # self.dropout = Dropout(0.7)
        ## Attention initialize
        #################################################################################################

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, pos, tfidf, dp):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        
        ## POS CONCAT
        embeds_pos = self.word_embeds_pos(pos).view(len(pos), 1, -1)
        concat = torch.cat([embeds, embeds_pos], dim=2)

        ## TF-IDF CONCAT
        tfidf = tfidf.unsqueeze(1).expand( -1, 1)
        tfidf = tfidf.unsqueeze(1).expand(-1, -1, 1)
        concat2 = torch.cat([concat, tfidf], dim=2)

        ## DP CONCAT
        dp = dp.unsqueeze(1).expand( -1, 1)
        dp = dp.unsqueeze(1).expand(-1, -1, 1)
        concat3 = torch.cat([concat2, dp], dim=2)
       
        ## LSTM forward
        lstm_out, self.hidden = self.lstm(concat3, self.hidden)

        #################################################################################################
        ## Attention forward
        att_output, att_weights = multi_head_attention_forward(
                query=lstm_out, key=lstm_out, value=lstm_out, 
                embed_dim_to_check=self.hidden_dim, 
                num_heads=1,
                in_proj_weight=self.in_proj_weight, 
                in_proj_bias=self.in_proj_bias,
                bias_k=None, 
                bias_v=None, 
                add_zero_attn=False,
                dropout_p=0., 
                out_proj_weight=self.out_proj.weight, 
                out_proj_bias=self.out_proj.bias,
                training=True,
                key_padding_mask=None, 
                need_weights=True,
                attn_mask=None
                )
        
        att_output += lstm_out
        att_output = self.norm(att_output)
        ## Attention forward
        #################################################################################################

        lstm_out = att_output.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags, pos, tfidf, dp):
        feats = self._get_lstm_features(sentence, pos, tfidf, dp)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, pos, tfidf, dp):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence, pos, tfidf, dp)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

# Train

In [0]:
import numpy as np
def cal_acc(model, input_index, output_index, pos, tfidf, dp):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device), 
                            torch.tensor(pos[i], dtype=torch.long).to(device), 
                            torch.tensor(tfidf[i], dtype=torch.float).to(device),
                            torch.tensor(dp[i][:len(idxs)], dtype=torch.float).to(device)
                             )
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return ground_truth, predicted, accuracy

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=1e-4)

In [27]:
"""Each epoch will take about 1-2 minutes"""

import datetime
from tqdm import tqdm

loss_log = []

for epoch in tqdm(range(30)):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        pos = torch.tensor(train_data_pos[i], dtype=torch.long).to(device)
        tfidf = torch.tensor(train_data_tfidf[i], dtype=torch.float).to(device)
        dp = torch.tensor(train_data_dp[i][:len(idxs)], dtype=torch.float).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets, pos, tfidf, dp)
        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index, train_data_pos, train_data_tfidf, train_data_dp)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index, val_data_pos, val_data_tfidf, val_data_dp)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        pos = torch.tensor(val_data_pos[i], dtype=torch.long).to(device)
        tfidf = torch.tensor(val_data_tfidf[i], dtype=torch.float).to(device)
        dp = torch.tensor(val_data_dp[i][:len(idxs)], dtype=torch.float).to(device)

        loss = model.neg_log_likelihood(sentence_in, targets, pos, tfidf, dp)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()
    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
    temp_loss_log = [epoch+1, train_loss, train_acc, val_loss, val_acc, (time2-time1).total_seconds()]
    loss_log.append (temp_loss_log)


  3%|▎         | 1/30 [02:39<1:17:09, 159.63s/it]

Epoch:1, Training loss: 17838.80, train acc: 0.8799, val loss: 2803.67, val acc: 0.8761, time: 159.63s


  7%|▋         | 2/30 [05:14<1:13:46, 158.08s/it]

Epoch:2, Training loss: 11303.01, train acc: 0.9023, val loss: 2346.69, val acc: 0.8988, time: 154.45s


 10%|█         | 3/30 [07:50<1:10:55, 157.60s/it]

Epoch:3, Training loss: 9666.77, train acc: 0.9088, val loss: 2031.82, val acc: 0.9085, time: 156.49s


 13%|█▎        | 4/30 [10:21<1:07:22, 155.49s/it]

Epoch:4, Training loss: 8845.08, train acc: 0.9195, val loss: 1791.71, val acc: 0.9174, time: 150.55s


 17%|█▋        | 5/30 [12:53<1:04:26, 154.67s/it]

Epoch:5, Training loss: 8118.81, train acc: 0.9246, val loss: 1721.12, val acc: 0.9234, time: 152.75s


 20%|██        | 6/30 [15:25<1:01:32, 153.84s/it]

Epoch:6, Training loss: 7630.20, train acc: 0.9302, val loss: 1592.10, val acc: 0.9259, time: 151.92s


 23%|██▎       | 7/30 [18:00<59:03, 154.08s/it]  

Epoch:7, Training loss: 7189.11, train acc: 0.9311, val loss: 1589.05, val acc: 0.9263, time: 154.62s


 27%|██▋       | 8/30 [20:33<56:24, 153.86s/it]

Epoch:8, Training loss: 6821.44, train acc: 0.9353, val loss: 1489.85, val acc: 0.9299, time: 153.34s


 30%|███       | 9/30 [23:05<53:39, 153.32s/it]

Epoch:9, Training loss: 6503.24, train acc: 0.9375, val loss: 1533.54, val acc: 0.9254, time: 152.07s


 33%|███▎      | 10/30 [25:36<50:51, 152.58s/it]

Epoch:10, Training loss: 6108.57, train acc: 0.9394, val loss: 1474.27, val acc: 0.9291, time: 150.85s


 37%|███▋      | 11/30 [28:09<48:20, 152.66s/it]

Epoch:11, Training loss: 5830.67, train acc: 0.9427, val loss: 1445.14, val acc: 0.9344, time: 152.83s


 40%|████      | 12/30 [30:36<45:16, 150.93s/it]

Epoch:12, Training loss: 5467.65, train acc: 0.9462, val loss: 1405.47, val acc: 0.9329, time: 146.90s


 43%|████▎     | 13/30 [33:04<42:29, 149.96s/it]

Epoch:13, Training loss: 5244.37, train acc: 0.9467, val loss: 1430.98, val acc: 0.9344, time: 147.70s


 47%|████▋     | 14/30 [35:30<39:43, 148.98s/it]

Epoch:14, Training loss: 5032.75, train acc: 0.9499, val loss: 1445.58, val acc: 0.9330, time: 146.69s


 50%|█████     | 15/30 [37:57<37:06, 148.44s/it]

Epoch:15, Training loss: 4685.42, train acc: 0.9527, val loss: 1387.90, val acc: 0.9352, time: 147.17s


 53%|█████▎    | 16/30 [40:24<34:30, 147.87s/it]

Epoch:16, Training loss: 4483.04, train acc: 0.9534, val loss: 1381.51, val acc: 0.9341, time: 146.56s


 57%|█████▋    | 17/30 [42:56<32:17, 149.04s/it]

Epoch:17, Training loss: 4335.37, train acc: 0.9536, val loss: 1380.44, val acc: 0.9345, time: 151.77s


 60%|██████    | 18/30 [45:27<29:56, 149.73s/it]

Epoch:18, Training loss: 4175.07, train acc: 0.9552, val loss: 1402.76, val acc: 0.9358, time: 151.33s


 63%|██████▎   | 19/30 [47:55<27:21, 149.21s/it]

Epoch:19, Training loss: 3991.25, train acc: 0.9564, val loss: 1362.27, val acc: 0.9326, time: 147.98s


 67%|██████▋   | 20/30 [50:24<24:51, 149.20s/it]

Epoch:20, Training loss: 3848.41, train acc: 0.9602, val loss: 1357.48, val acc: 0.9389, time: 149.18s


 70%|███████   | 21/30 [53:00<22:40, 151.16s/it]

Epoch:21, Training loss: 3677.17, train acc: 0.9596, val loss: 1349.78, val acc: 0.9399, time: 155.75s


 73%|███████▎  | 22/30 [55:37<20:23, 152.93s/it]

Epoch:22, Training loss: 3533.08, train acc: 0.9600, val loss: 1364.54, val acc: 0.9370, time: 157.05s


 77%|███████▋  | 23/30 [58:15<18:00, 154.40s/it]

Epoch:23, Training loss: 3429.45, train acc: 0.9623, val loss: 1376.29, val acc: 0.9407, time: 157.83s


 80%|████████  | 24/30 [1:00:50<15:27, 154.56s/it]

Epoch:24, Training loss: 3187.20, train acc: 0.9647, val loss: 1330.82, val acc: 0.9385, time: 154.93s


 83%|████████▎ | 25/30 [1:03:26<12:55, 155.16s/it]

Epoch:25, Training loss: 3314.95, train acc: 0.9645, val loss: 1422.68, val acc: 0.9363, time: 156.56s


 87%|████████▋ | 26/30 [1:06:02<10:20, 155.20s/it]

Epoch:26, Training loss: 3005.27, train acc: 0.9615, val loss: 1411.71, val acc: 0.9358, time: 155.29s


 90%|█████████ | 27/30 [1:08:42<07:49, 156.66s/it]

Epoch:27, Training loss: 2948.10, train acc: 0.9685, val loss: 1302.48, val acc: 0.9439, time: 160.08s


 93%|█████████▎| 28/30 [1:11:18<05:13, 156.58s/it]

Epoch:28, Training loss: 2734.55, train acc: 0.9674, val loss: 1350.95, val acc: 0.9455, time: 156.38s


 97%|█████████▋| 29/30 [1:13:54<02:36, 156.32s/it]

Epoch:29, Training loss: 2603.36, train acc: 0.9682, val loss: 1274.02, val acc: 0.9452, time: 155.71s


100%|██████████| 30/30 [1:16:28<00:00, 152.95s/it]

Epoch:30, Training loss: 2500.37, train acc: 0.9705, val loss: 1381.34, val acc: 0.9431, time: 154.07s





In [0]:
# Save losses in to csv file
import csv

with open('attention_head_1_norm_0.7_loss_log.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(loss_log)

# Predict

In [0]:
import numpy as np

# do prediction
def cal_pred(model, input_index, test_pos, test_tfidf, test_dp):
    
    predicted = []
    for i,idxs in enumerate(input_index):
        
        score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device), 
                            torch.tensor(test_pos[i], dtype=torch.long).to(device), 
                            torch.tensor(test_tfidf[i], dtype=torch.float).to(device),
                            torch.tensor(test_dp[i][:len(idxs)], dtype=torch.float).to(device))
        predicted += pred
    
    return predicted

In [0]:
y_pred = cal_pred(model,test_input_index, test_data_pos, test_data_tfidf, test_data_dp)

# decoding
def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

y_pred_decode = decode_output(y_pred)

# Save prediction

In [0]:
# Save prediction to csv file
to_csv = []

for i in range(len(y_pred_decode)):
  temp = []
  temp.append(i)
  temp.append(y_pred_decode[i])

  to_csv.append(temp)

to_csv.insert(0, ['Id', 'Predicted'])

In [0]:

import csv

with open('group119_200605-12pm.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(to_csv)


# Save and Load model

In [28]:
# save model
torch.save(model, 'LSTM_CRF.pth')

  "type " + obj.__name__ + ". It won't be checked "


In [0]:
# load model
model = torch.load('LSTM_CRF.pth')