# Window NER

Today we gonna work on simple window NER that no one uses....but it's a good starting point.

Later on, once you learned LSTM, I will teach a better one for NER.

## 1. Load data

Load the famous CoNLL-2002 Shared Task

In [1]:
# !pip install nltk  #or do it in your terminal

In [2]:
import nltk
nltk.__version__

'3.7'

In [3]:
# import os
# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'

nltk.download('conll2002')  #this will download the dataset, and put it somewhere in your pc

[nltk_data] Downloading package conll2002 to
[nltk_data]     /Users/atichetsurakul/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


True

In [4]:
corpus = nltk.corpus.conll2002.iob_sents()

In [5]:
data = []
for cor in corpus:
    #extract only the first and third guy
    #one list containing the first, and one list containing the third
    #so we can use it to train
    #['Sao', 'Paulo', '(', 'Brasil'.....)],  ['B-LOC', 'I-LOC', 'B-LOC', '0']
    sent, _, tag = list(zip(*cor))
    data.append([sent, tag])

In [6]:
data[9999]

[('ERC',
  'CONVOCA',
  'CONFERENCIA',
  'PRESENTARSE',
  'COMO',
  'PARTIDO',
  'DE',
  'GOBIERNO',
  'Barcelona',
  '.'),
 ('B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-LOC', 'O')]

In [7]:
len(data) #35k sentences.....

35651

## 2. Tokenization

If you look carefully, we don't need to!! Yay...things are already chopped.

## 3. Numericalization

In [8]:
flatten = lambda l: [item for sublist in l for item in sublist]

#I want to get all unique vocabs....
sents, tags = list(zip(*data))
vocab  = list(set(flatten(sents)))
tagset = list(set(flatten(tags)))

#why we don't combine vocab and tagset into one single list.....

In [9]:
vocab[89:98]

['conseguidos',
 'Revisión',
 'Dua',
 'Láinez',
 'sommige',
 'Bonte',
 'sectorgenoot',
 'ISO',
 'ese']

In [10]:
len(vocab)

65459

In [11]:
tagset

['B-MISC', 'B-LOC', 'B-PER', 'I-LOC', 'I-ORG', 'O', 'I-PER', 'B-ORG', 'I-MISC']

In [12]:
#create word2index library
word2index={'<UNK>': 0, '<DUMMY>': 1}  #DUMMY facilitates me moving the windows.....

#loop each vocab
for v in vocab:
    #if that vocab does not exist yet in the word2index
    if word2index.get(v) is None:
        #the index of this vocab is basically the current len of word2indx
        word2index[v] = len(word2index)
#create the index2word
index2word = {v:k for k, v in word2index.items()}

tag2index = {}
#do this the same for tagset
#loop each vocab
for t in tagset:
    #if that vocab does not exist yet in the word2index
    if tag2index.get(t) is None:
        #the index of this vocab is basically the current len of word2indx
        tag2index[t] = len(tag2index)
#create the index2word
index2tag = {v:k for k, v in tag2index.items()}

In [13]:
tag2index

{'B-MISC': 0,
 'B-LOC': 1,
 'B-PER': 2,
 'I-LOC': 3,
 'I-ORG': 4,
 'O': 5,
 'I-PER': 6,
 'B-ORG': 7,
 'I-MISC': 8}

In [14]:
index2tag

{0: 'B-MISC',
 1: 'B-LOC',
 2: 'B-PER',
 3: 'I-LOC',
 4: 'I-ORG',
 5: 'O',
 6: 'I-PER',
 7: 'B-ORG',
 8: 'I-MISC'}

## 4. Prepare window data

E.g., Chaky   is at AIT.   
       B-PER   0 0  B-LOC

Here I will four samples of data:

E.g., windows = [['<DUMMY>', '<DUMMY>', 'Chaky', 'is', 'at'], 'B-PER'], [], []]

In [15]:
for sample in data:
    print(sample[1])
    break

('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')


In [16]:
ws = 2
windows = []

for sample in data:
    dummy = ['<DUMMY>'] * ws
    text  = sample[0]
    fulltext = dummy + list(text) + dummy
    window = list(nltk.ngrams(fulltext, ws * 2 + 1))
    
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])   

In [17]:
windows[0]

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [18]:
len(windows)

678377

In [19]:
windows = windows[:50]

In [20]:
import random
random.shuffle(windows)

train = windows[:int(len(windows) * 0.9)]
test  = windows[int(len(windows) * 0.9):]

In [21]:
len(train), len(test)

(45, 5)

## 4. Model

<img src="../figures/ner_model.png" width="600">


In [45]:
import torch
import torch.nn as nn

class WinNER(nn.Module):
    
    def __init__(self, voc_size, emb_size, hid_size, ws, output_size):
        super(WinNER, self).__init__()
        self.embed   = nn.Embedding(voc_size, emb_size) #embedding the inputs
        self.h1      = nn.Linear((ws * 2 + 1) * emb_size, hid_size)
        self.h2      = nn.Linear(hid_size, output_size)
        self.relu    = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(0.5, inplace=True)
        self.ws = ws
        
    def forward(self, inputs):
        #inputs = (batch_size, window_size * 2 + 1)
        input_embed = self.embed(inputs)
        #input_embed = (batch_size, window_size * 2 + 1, emb_size)
        concats = input_embed.reshape(-1,input_embed.shape[-1]*input_embed.shape[-2])
        # concats  = 
        #concat everything ==> (batch_size, window_size * 2 + 1 * emb_size)  ===> 5d, e.g., 20
        h = self.relu(self.h1(concats))
        # after_h  = self.h1(concats)   #this h is basically a linear layer of some hidden size
        #after_h  = (batch_size, hidden_size), e.g., 8
    
        #apply relu
        #apply dropout
        
        #basically, h2 -> relu -> dropout where h2 project into one number
        
        return h

In [46]:
batch_size = 2
inputs = torch.randint(0, len(vocab), (batch_size, 5))
inputs
voc_size = len(vocab)
emb_size = 4
hid_size = 8
ws = 2
ws_       = ws * 2 + 1
output_size = len(tagset)
model = WinNER(voc_size, emb_size, hid_size, ws_, output_size)

In [47]:
something = model(inputs)
something.shape

RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x20 and 44x8)