In [230]:
import pickle
import operator
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
import tensorflow as tf

from plot_keras_history import plot_history
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from keras_contrib.utils import save_load_utils

from keras import layers
from keras import optimizers

from keras.models import Model
from keras.models import Input

# from keras_contrib.layers import CRF
from keras_contrib import losses
from keras_contrib import metrics

from Preprocess import Preprocess

import pandas as pd
import numpy as np
from nltk import word_tokenize

from tensorflow_addons.layers import CRF 

In [231]:
# Locations of the IOB-formatted corpus splits.
train = '../data/ner-disease/train.iob'
test = '../data/ner-disease/test.iob'
dev = '../data/ner-disease/dev.iob'
dev_predicted = '../data/ner-disease/dev-predicted.iob'

# A single Preprocess instance is reused for both splits, train first.
preprocess = Preprocess()


def _load_split(filepath):
    """Feed one IOB file to the shared `preprocess` object and return its output pair."""
    preprocess.text_to_data(filepath=filepath)
    return preprocess.preprocess_data()


X, y = _load_split(train)
Xtest, y_true = _load_split(test)

In [232]:
# Token-level frame: one row per word with its tag; 'Sentence' and 'POS'
# start out empty and are filled in by the following cells.
df = pd.DataFrame(columns=['Sentence', 'Word', 'POS', 'Tag']).assign(Word=X, Tag=y)

In [None]:
# Fill the 'POS' column with an NLTK part-of-speech tag for every token.
# `nltk` itself is not imported by the import cell (only `word_tokenize` is),
# so it is imported here to keep this cell runnable on a fresh kernel.
import nltk

# Each token is tagged on its own (as a one-token "sentence"), exactly as the
# per-row loop did, but pos_tag_sents resolves the tagger once for the whole
# corpus instead of once per word.
tagged_tokens = nltk.pos_tag_sents([[word] for word in df['Word']])

# Assign the whole column in one step: element-wise chained assignment
# (df['POS'][i] = ...) may write to a temporary copy and is a no-op under
# pandas Copy-on-Write.
df['POS'] = [tagged[0][1] for tagged in tagged_tokens]

In [None]:
# Label the first token of every sentence with 'Sentence: <n>' (n = 1, 2, ...).
# A sentence starts at the first row and at every row that follows a '.' token;
# all other rows stay NaN and are forward-filled in a later cell.
#
# Done as whole-column operations instead of df['Sentence'][i] = ...:
# chained assignment may write to a temporary copy (and never reaches `df`
# under pandas Copy-on-Write), and the i + 1 write ran past the last row
# whenever the corpus ended with '.'.
starts_sentence = df['Word'].eq('.').shift(1, fill_value=True)
sentence_number = starts_sentence.cumsum()
df['Sentence'] = ('Sentence: ' + sentence_number.astype(str)).where(starts_sentence)

In [None]:
# Work on an independent (deep) copy so `df` keeps the raw, unfilled columns.
dfnew = df.copy(deep=True)

In [None]:
# dfnew['Sentence'] = dfnew['Sentence'].astype(str)

In [None]:
# Propagate each sentence label down to all tokens of that sentence.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill").
dfnew = dfnew.ffill()

# Strip the literal 'Sentence: ' prefix so only the number remains.
# The previous positional slice s[9:] was one character short (the prefix is
# 10 characters long), leaving a leading space in every ID, and it chopped
# further characters each time the cell was re-run; an anchored replace is
# exact and idempotent.
dfnew["Sentence"] = dfnew["Sentence"].str.replace(r"^Sentence: ", "", regex=True)
# dfnew["Sentence"] = dfnew["Sentence"].astype("int32")
dfnew.head()

In [None]:
# Persist the sentence-annotated token table (index column included).
dfnew.to_csv(path_or_buf='../data/ner-disease/DatasetTrain.csv')

In [None]:
# Corpus size: distinct sentence IDs and total token rows.
sentence_total = dfnew["Sentence"].nunique()
word_total = dfnew.shape[0]
print("Total number of sentences in the dataset: {:,}".format(sentence_total))
print("Total words in the dataset: {:,}".format(word_total))

In [None]:
# Frequency of each part-of-speech tag across all tokens.
pos_frequencies = dfnew["POS"].value_counts()
pos_frequencies.plot.bar(figsize=(10, 5));

In [None]:
# Frequency of the entity tags, leaving out the dominant "O" (outside) tag.
entity_tags = dfnew.loc[dfnew["Tag"] != "O", "Tag"]
entity_tags.value_counts().plot.bar(figsize=(10, 5))

In [None]:
# Number of tokens per sentence, as a one-column frame, plus its histogram.
word_counts = dfnew.groupby("Sentence")["Word"].count().to_frame(name="Word count")
word_counts.hist(bins=50, figsize=(8, 6));

In [None]:
# Length (in tokens) of the longest sentence; used later as the padded
# sequence length and the model's input length.
# The column is selected by label: word_counts.max()[0] relied on positional
# [] access on a label-indexed Series, which newer pandas deprecates/removes.
# int() turns the NumPy scalar into a plain Python int.
MAX_SENTENCE = int(word_counts["Word count"].max())
print("Longest sentence in the corpus contains {} words.".format(MAX_SENTENCE))

In [None]:
# First sentence ID whose token count equals the maximum.
is_longest = word_counts["Word count"] == MAX_SENTENCE
longest_sentence_id = word_counts.index[is_longest][0]
print("ID of the longest sentence is {}.".format(longest_sentence_id))

In [None]:
# Rebuild the longest sentence by joining its tokens with single spaces.
longest_sentence_words = dfnew.loc[dfnew["Sentence"] == longest_sentence_id, "Word"]
longest_sentence = longest_sentence_words.str.cat(sep=' ')
print("The longest sentence in the corpus is:\n")
print(longest_sentence)

In [None]:
# Vocabulary of distinct words and distinct tags.
# The sets are sorted so the list order - and therefore every index assigned
# from it in the following cells - is the same on every kernel start.
# A bare list(set(...)) of strings is ordered by randomised string hashes,
# which made word/tag indices differ between runs.
# key=str keeps the sort well-defined even if a non-string value slipped in.
all_words = sorted(set(dfnew["Word"].values), key=str)
all_tags = sorted(set(dfnew["Tag"].values), key=str)

print("Number of unique words: {}".format(dfnew["Word"].nunique()))
print("Number of unique tags : {}".format(dfnew["Tag"].nunique()))

In [None]:
# Word <-> integer lookup tables. Real words are numbered from 2 upward;
# indices 0 and 1 are reserved for the two special tokens added afterwards.
word2index = dict(zip(all_words, range(2, len(all_words) + 2)))
word2index["--UNKNOWN_WORD--"] = 0
word2index["--PADDING--"] = 1

# Reverse mapping: index -> word.
index2word = dict(zip(word2index.values(), word2index.keys()))

In [None]:
# Show the ten entries with the smallest indices (special tokens come first).
lowest_entries = sorted(word2index.items(), key=lambda entry: entry[1])[:10]
for word, idx in lowest_entries:
    print(word, idx)

In [None]:
# Round-trip sanity check: word -> index -> word.
test_word = "examinations"
test_word_idx = word2index[test_word]
test_word_lookup = index2word[test_word_idx]

forward_message = "The index of the word {} is {}.".format(test_word, test_word_idx)
reverse_message = "The word with index {} is {}.".format(test_word_idx, test_word_lookup)
print(forward_message)
print(reverse_message)

In [None]:
# Tag <-> integer lookup tables. Real tags are numbered from 1 upward;
# index 0 is reserved for the padding tag.
tag2index = dict(zip(all_tags, range(1, len(all_tags) + 1)))
tag2index["--PADDING--"] = 0

# Reverse mapping: index -> tag.
index2tag = {position: label for label, position in tag2index.items()}

In [None]:
def to_tuples(data):
    """Return the rows of `data` as a list of (word, pos, tag) tuples.

    `data` must provide the columns "Word", "POS" and "Tag"; row order is kept.
    """
    words, pos_tags, tags = (
        data[column].values.tolist() for column in ("Word", "POS", "Tag")
    )
    return list(zip(words, pos_tags, tags))

# One list of (word, pos, tag) tuples per sentence.
# Only the three columns to_tuples reads are handed to apply(); applying to
# the full frame also passes the grouping column, which pandas >= 2.2 flags
# with a DeprecationWarning. The result is unchanged.
# Note: sentence IDs are strings, so groups come out in lexicographic order.
sentences = (
    dfnew.groupby("Sentence")[["Word", "POS", "Tag"]]
    .apply(to_tuples)
    .tolist()
)

print(sentences[0])

In [None]:
# Separate each sentence into its word sequence (X) and its tag sequence (y).
X = [[token for token, _pos, _tag in sentence] for sentence in sentences]
y = [[tag for _token, _pos, tag in sentence] for sentence in sentences]
print("X[0]:", X[0])
print("y[0]:", y[0])

In [None]:
# Replace every word and tag by its integer index.
X = [list(map(word2index.__getitem__, tokens)) for tokens in X]
y = [list(map(tag2index.__getitem__, tags)) for tags in y]
print("X[0]:", X[0])
print("y[0]:", y[0])

In [None]:
# Right-pad every sequence with the padding index up to MAX_SENTENCE entries.
word_pad = word2index["--PADDING--"]
tag_pad = tag2index["--PADDING--"]
X = [tokens + [word_pad] * (MAX_SENTENCE - len(tokens)) for tokens in X]
y = [tags + [tag_pad] * (MAX_SENTENCE - len(tags)) for tags in y]
print("X[0]:", X[0])
print("y[0]:", y[0])

In [None]:
# One-hot encode the tag indices: row i of the identity matrix is the
# one-hot vector for tag index i.
TAG_COUNT = len(tag2index)
one_hot_rows = np.eye(TAG_COUNT)
y = [one_hot_rows[tags] for tags in y]
print("X[0]:", X[0])
print("y[0]:", y[0])

In [None]:
# Hold out 10% of the sentences for testing; the seed is fixed so the split
# is repeatable for a given sentence order.
split_parts = train_test_split(X, y, test_size=0.1, random_state=1234)
X_train, X_test, y_train, y_test = split_parts

print("Number of sentences in the training dataset: {}".format(len(X_train)))
print("Number of sentences in the test dataset : {}".format(len(X_test)))

In [None]:
# Convert the nested Python lists into NumPy arrays for Keras.
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
# Model and training hyperparameters.
WORD_COUNT = len(index2word)  # vocabulary size, including the two special tokens; Embedding input dimension
DENSE_EMBEDDING = 50  # Embedding output dimension
LSTM_UNITS = 50  # units of the LSTM wrapped by the Bidirectional layer
LSTM_DROPOUT = 0.1  # passed to the LSTM as recurrent_dropout
DENSE_UNITS = 100  # width of the time-distributed Dense layer
BATCH_SIZE = 256  # batch size used by ner_model.fit
MAX_EPOCHS = 5  # number of epochs used by ner_model.fit

In [None]:
# BiLSTM-CRF tagger: Embedding -> Bidirectional LSTM -> per-timestep Dense -> CRF.
#
# NOTE(review): the CRF layer is imported from tensorflow_addons, while
# crf_loss / crf_accuracy come from keras_contrib (whose own CRF import is
# commented out in the import cell) and the remaining layers come from the
# standalone `keras` package. Presumably the keras_contrib loss/metric expect
# keras_contrib's CRF layer - confirm that this combination compiles and
# trains in the target environment before relying on the results.
input_layer = layers.Input(shape=(MAX_SENTENCE,))

# Map word indices to dense vectors of size DENSE_EMBEDDING.
model = layers.Embedding(WORD_COUNT, DENSE_EMBEDDING, embeddings_initializer="uniform", input_length=MAX_SENTENCE)(input_layer)

# return_sequences=True keeps one output vector per token for sequence labelling.
model = layers.Bidirectional(layers.LSTM(LSTM_UNITS, recurrent_dropout=LSTM_DROPOUT, return_sequences=True))(model)

# The same Dense projection is applied independently at every timestep.
model = layers.TimeDistributed(layers.Dense(DENSE_UNITS, activation="relu"))(model)

crf_layer = CRF(units=TAG_COUNT)
output_layer = crf_layer(model)

ner_model = Model(input_layer, output_layer)

loss = losses.crf_loss
acc_metric = metrics.crf_accuracy
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras optimizers.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

ner_model.compile(optimizer=opt, loss=loss, metrics=[acc_metric])

ner_model.summary()

In [None]:
# Train the tagger; 10% of the training data is held out for validation and
# the returned History object is kept for later inspection.
history = ner_model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
    validation_split=0.1,
    verbose=2,
)