In [None]:
!pip install -U 'scikit-learn<0.24'
!pip install sklearn-crfsuite

# YOU NEED TO RESTART THE RUNTIME!!!

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn<0.24
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 9.7 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.23.2 which is incompatible.
imbalanced-learn 0.8.1 requires scikit-learn>=0.24, but you have scikit-learn 0.23.2 which is incompatible.[0m
Successfully installed scikit-learn-0.23.2


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 7.8 MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6


In [None]:
# Run this cell to mount your drive to this notebook in order to read the datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import json
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

## Read Dataset

In [None]:
# Put the folder path where the datasets are located
PATH = "/content/drive/MyDrive/445Project2/"

In [None]:
# function to read data, return list of tuples each tuple represents a token contains word, pos tag, chunk tag, and ner tag
def read_data(filename) -> list:
  tuples = []
  currSent = []
  for l in open(filename,'r').readlines():
    if l.strip() == "":
      tuples.append(currSent)
      currSent = []
    else: 
      currSent.append(tuple(l.strip().split(" ")))
  if len(currSent) !=0:
    tuples.append(currSent)
  #tuples = [tuple(l.strip().split(" ")) for l in open(filename,'r').readlines() if l.strip() != "" ]
  return tuples

In [None]:
# read data with your custom function
train_data = read_data(PATH + "train.txt")
val_data = read_data(PATH + "valid.txt")
test_data = read_data(PATH + "test.txt")

In [None]:
print(len(train_data))
print(len(val_data))
print(len(test_data))

14987
3466
3684


# Create Gazetteer

In [None]:
# load wikipedia pages
W_PATH = PATH + "wikipedia_pages/"

count = 0
gazetteer = set()
for filename in os.listdir(W_PATH):
  f = open(W_PATH + filename, 'r')
  j = json.load(f)
  #print("Reading:",j["title"])
  matches = re.findall('a href=\"[A-Z][^\s]+\"&gt;(?P<link>[A-Z][\w ]*)&lt;',j["text"])
  gazetteer.update(matches)
  gazetteer.add(j["title"])
  count +=1
  #print(count,end=" ")
  #if count > 4: break


In [None]:
# print the size of your gazetteer
print(len(gazetteer))

244670


# Models

## Conditional Random Fields (CRF)

### Extract features for CRF

In [None]:
import nltk
from nltk.stem import PorterStemmer

nltk.download('stopwords')
from nltk.corpus import stopwords

import sklearn_crfsuite
from sklearn.metrics import make_scorer
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# create a function to extract features for each token

def token2features(sentence: list, idx: int) -> dict:
  word = sentence[idx][0]

  features = {
      'bias': 1.0,
      'stem': stem(word),
      'postag': sentence[idx][1],
      'chunktag': sentence[idx][2],
      'shortShape': shortShape(word),
      'containsNum': containsNum(word),
      'startUppercase': startUppercase(word),
      'wiShape': wiShape(word),
      'containsHyphen':containsHyphen(word),
      'upperHyphenDigit': upperHyphenDigit(word),
      'allUpper': allUpper(word),
      'isStopWord':isStopWord(word),
      'upper':upper(word),
      'inGazetteer': inGazetteer(word),
      'prefix': word[:3],
      'suffix': word[-3:]
  }
  if idx > 0:
    leftword = sentence[idx-1][0]
    features.update({
        'left':leftword,
        'left-postag': sentence[idx-1][1],
        'left-chunktag': sentence[idx-1][2],
        'left-wiShape':wiShape(leftword),
        'left-shortShape': shortShape(leftword),
        'beginning': False
    })
  else:
    features['beginning'] = True
  
  if idx == len(sentence)-1:
    features['end'] = True
  else:
    rightword = sentence[idx+1][0]
    features.update({
        'right':rightword,
        'right-postag': sentence[idx+1][1],
        'right-chunktag': sentence[idx+1][2],
        'right-wiShape':wiShape(rightword),
        'right-shortShape': shortShape(rightword),
        'end': False
    })

  return features


In [None]:
# Feature functions

ps=PorterStemmer()
def stem(word):
  return ps.stem(word)

def startUppercase(word):
  return 'A'< word[0] < 'Z'

def wiShape(word):
  regs = ["[A-Z]", "[a-z]", "[0-9]"]
  subs = ["X","x","d"]

  for i in range(len(regs)):
    word = re.sub(regs[i],subs[i],word)
  return word

def shortShape(word):
  regs = ["[A-Z]+", "[a-z]+", "[0-9]+"]
  subs = ["X","x","d"]

  for i in range(len(regs)):
    word = re.sub(regs[i],subs[i],word)
  return word

def containsNum(word):
  for i in range(len(word)):
    if word[i].isdigit(): return True
  return False

def containsHyphen(word):
  for i in range(len(word)):
    if word[i]=='-': return True
  return False

def allUpper(word):
  return word.isupper()

def upperHyphenDigit(word):
  return allUpper(word) and containsNum(word) and containsHyphen(word)

def isStopWord(word):
  return word in stopwords.words('english')

def upper(word):
  return word.upper()

def inGazetteer(word):
  global gazetteer
  return word in gazetteer

In [None]:
print(allUpper("ShdDDHD4D4"))

False


In [None]:
# define function to process each token given a sentence
def sent2features(sentence: list) -> list:
  return [token2features(sentence, i) for i in range(len(sentence))]

# get named entity labels from the sentence
def sent2labels(sentence: list) -> list:
  return [s[3] for s in sentence]

In [None]:
# prepare inputs and labels
train_sents = [sent2features(s) for s in train_data]
val_sents = [sent2features(s) for s in val_data]
test_sents = [sent2features(s) for s in test_data]

train_labels = [sent2labels(s) for s in train_data]
val_labels = [sent2labels(s) for s in val_data]
test_labels = [sent2labels(s) for s in test_data]

In [None]:
# calculate f1-score and classification report for test using sklearn_crfsuite.metrics class
train_sents[4][2]

{'allUpper': False,
 'beginning': False,
 'bias': 1.0,
 'chunktag': 'I-NP',
 'containsHyphen': False,
 'containsNum': False,
 'end': False,
 'isStopWord': False,
 'left': 'European',
 'left-chunktag': 'I-NP',
 'left-postag': 'NNP',
 'left-shortShape': 'Xx',
 'left-wiShape': 'Xxxxxxxx',
 'postag': 'NNP',
 'prefix': 'Com',
 'right': 'said',
 'right-chunktag': 'B-VP',
 'right-postag': 'VBD',
 'right-shortShape': 'x',
 'right-wiShape': 'xxxx',
 'shortShape': 'Xx',
 'startUppercase': True,
 'stem': 'commiss',
 'suffix': 'ion',
 'upper': 'COMMISSION',
 'upperHyphenDigit': False,
 'wiShape': 'Xxxxxxxxxx'}

In [None]:
# start from the stem of the token and add features one by one and train a new model with each feature that you add

keys = set(train_sents[0][0].keys())
keys.update(train_sents[4][2].keys())
keys = list(keys)

In [None]:
# CRF fit
import copy

crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

tf_copy = copy.deepcopy(train_sents)
for j in range(len(tf_copy)):
  for k in range(len(tf_copy[j])):
    tf_copy[j][k] = {}

results = {'Features': [], 'F1 Score':[], 'Precision':[],'Recall':[]}

for i in range(len(keys)):

  for j in range(len(tf_copy)):
    for k in range(len(tf_copy[j])):
      if keys[i] in train_sents[j][k].keys():
        tf_copy[j][k].update({keys[i]:train_sents[j][k][keys[i]]})

  crf.fit(tf_copy, train_labels)

  y_pred = crf.predict(val_sents)
  f1 = metrics.flat_f1_score(val_labels, y_pred,
                      average='weighted', labels=list(crf.classes_))
  pre =  metrics.flat_precision_score(val_labels, y_pred,
                      average='weighted', labels=list(crf.classes_))
  re = metrics.flat_recall_score(val_labels, y_pred,
                      average='weighted', labels=list(crf.classes_))
  
  results['Features'].append(keys[i])
  results['F1 Score'].append(f1)
  results['Precision'].append(pre)
  results['Recall'].append(re)


df = pd.DataFrame(results)
df

Unnamed: 0,Features,F1 Score,Precision,Recall
0,postag,0.872546,0.8768,0.880278
1,containsHyphen,0.872944,0.877237,0.881209
2,bias,0.872765,0.876855,0.88086
3,startUppercase,0.887337,0.891581,0.89164
4,inGazetteer,0.887271,0.886526,0.893501
5,right-shortShape,0.910537,0.911556,0.913102
6,allUpper,0.910644,0.911401,0.913432
7,right,0.942502,0.942844,0.943251
8,prefix,0.965754,0.965637,0.966226
9,containsNum,0.966018,0.965921,0.966478


In [None]:
# display the classification report for the best model


## Recurrent Neural Network (RNN)

In [None]:
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, Input, Sequential
from keras.layers import Dense, Flatten, Embedding, Input, Dropout, LSTM, TimeDistributed, Bidirectional
from tensorflow.keras.layers import add
from keras.callbacks import ModelCheckpoint

from gensim.models import Word2Vec
import gensim.downloader as api

In [None]:
!pip install seqeval
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=cc2be653eb3556721042b0edec6a9e3ac19e3c7addabe966036416df23050514
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# find unique labels and create dictionary to map each label to a unique integer value
all_labels = set()
for t in train_data:
  #print(t)
  for word in t:
    all_labels.add(word[3])

label_vals = {}
for i, l in enumerate(all_labels):
  label_vals[l] = i
label_vals["Other"] = len(label_vals)
print(label_vals)

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'B-LOC': 4, 'B-MISC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8, 'Other': 9}


In [None]:
# preprare your dataset for RNN classifier (you need to add padding to labels as well)
#Tokenize and pad words
tokenizer = Tokenizer()

train_words = [[j[0] for j in w] for w in train_data ]
val_words = [[j[0] for j in w] for w in val_data ]
print(train_words[:4])
tokenizer.fit_on_texts(train_words)
train_seq  = tokenizer.texts_to_sequences(train_words) 
val_seq = tokenizer.texts_to_sequences(val_words)

#padding to prepare sequences of same length
train_seq_pad  = pad_sequences(train_seq, maxlen=100,padding="post")
val_seq_pad = pad_sequences(val_seq, maxlen=100,padding="post")

word_vec_size = 100

word2ind = tokenizer.word_index
n_unique = len(word2ind)
#print(train_seq_pad[:4])

#padding the labels 
tokenizer = Tokenizer()

train_labels = [[j[3] for j in w] for w in train_data ]
val_labels = [[j[3] for j in w] for w in val_data ]
#print(train_labels[:4])
tokenizer.fit_on_texts(train_labels)
train_seq_l  = tokenizer.texts_to_sequences(train_labels) 
val_seq_l = tokenizer.texts_to_sequences(val_labels)

tag2ind = tokenizer.word_index
tag2ind["Other"] = len(tag2ind)
print(tag2ind)
print(train_seq_l[:4])
#padding to prepare sequences of same length
train_seq_pad_l = pad_sequences(train_seq_l, maxlen=100, value = tag2ind["Other"],padding="post")
val_seq_pad_l = pad_sequences(val_seq_l, maxlen=100,value = tag2ind["Other"],padding="post")

print(train_seq_pad_l[:4])


[['-DOCSTART-'], ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn'], ['BRUSSELS', '1996-08-22']]
{'o': 1, 'b-loc': 2, 'b-per': 3, 'b-org': 4, 'i-per': 5, 'i-org': 6, 'b-misc': 7, 'i-loc': 8, 'i-misc': 9, 'Other': 9}
[[1], [4, 1, 7, 1, 1, 1, 7, 1, 1], [3, 5], [2, 1]]
[[1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
 [4 1 7 1 1 1 7 1 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
 [3 5 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
 [2 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 

In [None]:
print(train_seq_pad_l.shape)

(14987, 100)


In [None]:
# Randomly create your own word embeddings from scratch
embedding_matrix = np.zeros((n_unique, word_vec_size))
for word, i in word2ind.items():
  if i < n_unique:
    embedding_vector = np.random.rand(word_vec_size)*2 - 1
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector
#example 
print(embedding_matrix[3])

# You can check https://radimrehurek.com/gensim/models/word2vec.html for training a word embeddings from scratch

# You can check https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html and https://github.com/RaRe-Technologies/gensim-data for loading pretrained word embeddings. 


[-9.75426105e-01  2.54423371e-01 -1.41648810e-01  4.86449649e-01
  7.77587153e-02 -4.98370452e-01 -3.57929296e-01 -6.42982539e-01
 -6.24497610e-01 -5.62508052e-01  5.04915298e-01 -9.44814697e-01
  4.00100495e-01  8.35028515e-01  3.77102820e-01  5.18118153e-01
 -2.85519517e-01  3.05508649e-01  5.98367895e-01  1.60999951e-01
 -3.02858364e-01  2.64808180e-01  2.31499062e-01  9.39954679e-01
 -9.12182723e-01 -8.48002698e-01 -9.57753531e-01  7.28504988e-01
 -8.08290129e-03 -1.93784489e-01 -5.82739659e-01  5.73652323e-01
 -6.77608804e-01  8.34352085e-01 -6.77871410e-01  6.45154003e-02
  5.30250055e-01  2.82064048e-01  8.46565204e-01  7.33081636e-01
  2.59414575e-03  2.82833198e-01 -1.94736596e-02 -6.16897768e-01
  3.32452359e-01 -7.13008939e-01  8.70144988e-01  8.62588188e-01
 -1.91539695e-01  6.70978574e-02  9.86743243e-01  1.06012802e-01
 -2.16013536e-01  6.18820933e-01  4.41052241e-01  9.88538058e-01
 -9.88583408e-02 -6.99816100e-01 -2.60182789e-01  4.61115128e-01
  1.15633884e-01  5.33081

In [None]:
# Word2Vec 
model_w2v = Word2Vec(train_words, size = word_vec_size, window = 10, workers = 10, min_count = 2)
vocs = list(model_w2v.wv.vocab)
print(vocs)
print(len(vocs))

num_words = len(vocs)
embedding_matrix_w2v = np.zeros((len(word2ind), word_vec_size))
for word, i in word2ind.items():
  if i < num_words:
    if word in model_w2v:
      embedding_vector = model_w2v[word]
      embedding_matrix_w2v[i] = embedding_vector
#example 
print(embedding_matrix_w2v[139])

11983
[ 5.53785741e-01  1.28151208e-01  2.29955409e-02 -1.16766788e-01
  2.02304915e-01 -1.38421580e-01 -3.81368816e-01 -4.70498025e-01
  5.46263099e-01  5.56608617e-01 -5.35573184e-01  1.86760351e-01
  4.58544672e-01 -7.12110937e-01  8.94886076e-01 -6.20765626e-01
  3.35331887e-01  3.20265144e-01  4.07946594e-02  4.77757931e-01
 -1.08508609e-01  2.98473179e-01 -9.88058150e-01 -2.07628742e-01
 -6.75156236e-01  6.05920732e-01 -2.16424823e-01  7.50604153e-01
 -2.54040901e-02 -3.14042985e-01 -1.15069933e-01  9.23015237e-01
  3.22835654e-01 -2.85683423e-01 -1.02153444e+00 -1.94113240e-01
 -5.24880886e-01  1.42225936e-01 -3.33735868e-02  1.57983154e-01
  1.96395833e-02 -4.44260299e-01  3.29163194e-01 -2.30912287e-02
 -6.25274539e-01  1.08872280e-01  2.89305657e-01 -7.32403100e-01
 -4.20051903e-01 -1.93699583e-01 -2.60769695e-01  4.44958478e-01
  1.76411331e-01  2.11974941e-02 -3.57262552e-01  3.44929159e-01
 -3.75901431e-01  9.65583473e-02 -6.93242610e-01 -3.10843363e-02
 -8.00368749e-03 -2

In [None]:
# Embedding matrix with gensim api 
import gensim.downloader as api
model_gensim = api.load("glove-wiki-gigaword-100")



In [None]:
embedding_matrix_gensim = np.zeros((len(word2ind), word_vec_size))
for word, i in word2ind.items():
  if i < len(word2ind):
    if word in model_gensim:
      embedding_matrix_gensim[i] = model_gensim[word]
print(embedding_matrix_gensim[4])

[-0.1529     -0.24279     0.89837003  0.16996001  0.53516001  0.48784
 -0.58825999 -0.17982    -1.35810006  0.42541     0.15377     0.24214999
  0.13474     0.41192999  0.67043    -0.56418002  0.42985001 -0.012183
 -0.11677     0.31781     0.054177   -0.054273    0.35516    -0.30241001
  0.31434    -0.33846     0.71714997 -0.26855001 -0.15837    -0.47466999
  0.051581   -0.33252001  0.15003    -0.12989999 -0.54617    -0.37843001
  0.64261001  0.82187003 -0.080006    0.078479   -0.96976    -0.57740998
  0.56490999 -0.39873001 -0.057099    0.19743     0.065706   -0.48091999
 -0.20125    -0.40834001  0.39456001 -0.02642    -0.11838     1.01199996
 -0.53171003 -2.74740005 -0.042981   -0.74848998  1.75740004  0.59085
  0.04885     0.78267002  0.38497001  0.42096999  0.67882001  0.10337
  0.63279998 -0.026595    0.58647001 -0.44332001  0.33057001 -0.12022
 -0.55645001  0.073611    0.20915     0.43395001 -0.012761    0.089874
 -1.79910004  0.084808    0.77112001  0.63104999 -0.90684998  0.603

In [None]:
# Create Embedding Matrices and Layers
# architecture from https://medium.com/analytics-vidhya/named-entity-recognition-using-deep-learning-elmo-embedding-bi-lstm-48295bc66cab
input_shape = Input(shape = (len(train_seq_pad[0]),))

max_len = max([len(x) for x in train_seq])
tag_len = len(tag2ind)
#embedding layer
embedding_layer = Embedding(
    len(word2ind),
    word_vec_size,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False
  ) (input_shape)
x = Bidirectional(LSTM(units=128, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding_layer)
#x_rnn = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.2, dropout=0.2))(x_rnn)
#x = add([x, x_rnn])  # residual connection to the first biLSTM
x = Dense(64,activation = "sigmoid")(x)

out = TimeDistributed(Dense(tag_len, activation="softmax"))(x)



In [None]:
embedding_matrix.shape

(21010, 100)

In [None]:
embedding_matrix_w2v.shape

(21010, 100)

In [None]:
embedding_matrix_gensim.shape

(21010, 100)

In [None]:
# ****** Hyperparameter Tuning ******
import itertools
matrices=[embedding_matrix, embedding_matrix_w2v, embedding_matrix_gensim]
matrix_inds=[0,1,2]
dense_ns = [64, 128]
lstm_ns = [128, 64]

a=[matrix_inds,dense_ns, lstm_ns]

accs = []
combinations = []
models = []
max_model = 0
max_acc = 0
count = 0

input_shape = Input(shape = (len(train_seq_pad[0]),))

max_len = max([len(x) for x in train_seq])
tag_len = len(tag2ind)

for comb in list(itertools.product(*a)):

  embedding_layer = Embedding(
      len(word2ind),
      word_vec_size,
      weights=[ matrices[comb[0]] ],
      input_length=max_len,
      trainable=False
    ) (input_shape)
  x = Bidirectional(LSTM(units=comb[2], return_sequences=True,
                        recurrent_dropout=0.2, dropout=0.2))(embedding_layer)
  #x_rnn = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.2, dropout=0.2))(x_rnn)
  #x = add([x, x_rnn])  # residual connection to the first biLSTM
  x = Dense(comb[1],activation = "sigmoid")(x)

  out = TimeDistributed(Dense(tag_len, activation="softmax"))(x)
  model = Model(input_shape, out)
  model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
  #print(model.summary())

  model_res = model.fit(train_seq_pad, train_seq_pad_l, validation_data = (val_seq_pad,val_seq_pad_l) , batch_size = 500, epochs = 4, verbose = 1)
  val_acc = model_res.history["val_accuracy"][-1]
  print("Validation accuracy for ",comb, " is: ", val_acc)
  accs.append(val_acc)
  models.append(model)
  combinations.append(comb)
  if val_acc > max_acc:
    max_model = count
  count+=1



Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (0, 64, 128)  is:  0.9595152735710144
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (0, 64, 64)  is:  0.9613358378410339
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (0, 128, 128)  is:  0.9609578847885132
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (0, 128, 64)  is:  0.9625735878944397
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (1, 64, 128)  is:  0.9640681147575378
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (1, 64, 64)  is:  0.9639440178871155
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (1, 128, 128)  is:  0.9641200304031372
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (1, 128, 64)  is:  0.9640998244285583
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (2, 64, 128)  is:  0.9643133282661438
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Validation accuracy for  (2, 64, 

In [None]:
print("Properties of the model with highest accuracy:\n")
m = ["Scratch","word2vec","gensim"]
print("Embedding matrix:",m[combinations[max_model][0]])
print("Dense Layer Size:",combinations[max_model][1])
print("LSTM size:",combinations[max_model][2])
print("Validaiton accuracy", accs[max_model])

Properties of the model with highest accuracy:

Embedding matrix: gensim
Dense Layer Size: 128
LSTM size: 64
Validaiton accuracy 0.9644172191619873


In [None]:
# Prepare test data
tokenizer = Tokenizer()

test_words = [[j[0] for j in w] for w in test_data ]
print(test_words[:4])
tokenizer.fit_on_texts(test_words)
test_seq  = tokenizer.texts_to_sequences(test_words) 

#padding to prepare sequences of same length
test_seq_pad  = pad_sequences(test_seq, maxlen=100,padding="post")

word_vec_size = 100

word2ind = tokenizer.word_index
n_unique = len(word2ind)
#print(train_seq_pad[:4])

#padding the labels 
tokenizer = Tokenizer()

test_labels = [[j[3] for j in w] for w in test_data ]
#print(train_labels[:4])
tokenizer.fit_on_texts(test_labels)
test_seq_l  = tokenizer.texts_to_sequences(test_labels) 

tag2ind = tokenizer.word_index
tag2ind["Other"] = len(tag2ind)
print(tag2ind)
print(test_seq_l[:4])
#padding to prepare sequences of same length
test_seq_pad_l = pad_sequences(test_seq_l, maxlen=100, value = tag2ind["Other"],padding="post")

print(test_seq_pad_l[:4])

[['-DOCSTART-'], ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.'], ['Nadim', 'Ladki'], ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06']]
{'o': 1, 'b-loc': 2, 'b-org': 3, 'b-per': 4, 'i-per': 5, 'i-org': 6, 'b-misc': 7, 'i-loc': 8, 'i-misc': 9, 'Other': 9}
[[1], [1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1], [4, 5], [2, 1, 2, 8, 8, 1]]
[[1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
 [1 1 2 1 1 1 1 4 1 1 1 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
 [4 5 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
  9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9

In [None]:
# Prediction

preds = models[max_model].predict(test_seq_pad)
preds

In [None]:
preds[1,0]

array([0.00380317, 0.84374505, 0.03193357, 0.02765813, 0.02440799,
       0.01627332, 0.01718599, 0.01341335, 0.00822308, 0.01335635],
      dtype=float32)

In [None]:
#preds[0][0]-> word ner prob
preds_maxed = np.zeros(preds.shape[0:2])
for iy, ix,iz in np.ndindex(preds.shape):
  preds_maxed[iy,ix] = preds[iy,ix].argmax()
print(preds_maxed[0,1])

9.0


In [None]:
# define a function to remove paddings and align labels and tokens
def align_predictions(predictions:np.array):
  pr = list(predictions)

  for j in range(len(pr)):
    for k in range(len(pr[x])):
      pr[j][k] = np.delete(predictions[j], np.where(predictions[j] == 9))
  for iy, ix in np.ndindex(predictions.shape):
    predictions[iy] = np.delete(predictions[iy], np.where(predictions[iy] == 9))
  return predictions

preds_aligned = align_predictions(preds_maxed)


ValueError: ignored

In [None]:
# Evaluate your models with functions of seqeval library









In [None]:
# ....

## My Report


In this project we had to perform named entity recognition based on two approaches, namely Conditional Random Fields and Recurrent Neural Networks. The data that we have utilized for this task is from Reuters and consists of various articles in English, that specifically contain various names of persons, organizations, locations, and miscellaneous entities.

All the data were presented as words with their tags in the format (word, postag, chunktag, ner tag). The training data consisted of 14987 words that were tagged in the same format, the validation had 3466 samples and the test set had 3684. 

Also to complement the tagging process, as part of the CRF stages, we have utilized a wide source of Wikipedia articles and fetched links that contained named entities. There were more than 25000 wikipedia articles and for each of them, using regular expression, the titles of pages that contained information regarding named entities were fetched and a gazetteer of size 244670 was created, to be used as a feature for the CRF tagging.

The CRF taggin part utilized the features that were identified by iterating over the sets and identifying values for all features. Some example features were: 
- Stem
- POS tag
- Chunk tag
- Start of the Sentence
- End of the Sentence
- Starts with an uppercase letter
- 𝑤𝑖’s shape
- 𝑤𝑖’s short word shape
- 𝑤𝑖 contains a number

Besides such examples, we have also repeated some features for the words that were to the left and right of the word that was being inspected.
To test the CRF model, we have one by one included the features, building up the list to its fullest. The top 3 scoring models were the last three additions, namely:

| Feature Name | F1 | Precision       | Recall |
|------------|-------------|--------------|----------|
| Stem     | 0.979061 | 0.979007    | 0.979255    | 
| Left-posttag     | 0.979535    | 0.979473 | 0.979720   | 
| Uppercase      | 0.979754 | 0.979678   | 0.979972   | 


For the recurrent neural models, we have utilized 3 different embedding matrices to fill our word embedding layer, to ease the identificaiton of relation between words. First one was randomly initialized with values between -1 and 1. The second one utilized a word corpus that was made from scratch using our own training data with the help of Word2Vec function. The last one was using pretrained models from the Gensim library to match and extract embedding vectors with our tokens in our corpus. These weights were then connected to an RNN before outputting.  
The architecture was a Bidirectional LSTM layer that operates on an embedding layer as previously mentioned, followed by a dense layer (fully connected) and then another dense layer that performed the label outputting but was wrapped in a Time Distributed layer, so that it updated itself at every processed word, which is a prominent feature of a recurrent neural network in essence.

The labels were preprocessed and we switched ner tags with index values, also creating a 9th label to put in place of the padded empty places. 

The hyperparameter tuning was performed using all of the embedding matrix types. Different hyperparameters tried were as follows: 

LSTM size = [128, 64],

dense layer node size = [64,128].

<br>

> **Properties of the model with highest accuracy:**
- Embedding matrix: gensim
- Dense Layer Size: 128
- LSTM size: 64
- Validaiton accuracy 0.9644172191619873


The best model was used to predict the labels and then accuracy metrics were performed, eliminating the previously created extra label. 
 