In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from math import nan
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Attention
import keras
import tensorflow as tf
# from keras_contrib.layers import CRF
from keras import Model, Input
import matplotlib.pyplot as plt
from tensorflow.python.keras.layers import SpatialDropout1D
import nltk
from nltk.stem import WordNetLemmatizer

from time import sleep
from tqdm import tqdm

In [2]:
def preprocess_dataset(dataset_name, word_column, sentence_column, tag_column, other_tag):
    data = pd.read_csv(dataset_name, encoding="latin1")
    data = data.fillna(method="ffill")

    # words = list(set(data["Word"].values))
    words = list(set(data[word_column].values))
    words.append("ENDPAD")
    num_words = len(words)

    tags = list(set(data[tag_column].values))
    num_tags = len(tags)

    class SentenceGetter(object):
        def __init__(self, data):
            self.n_sent = 1
            self.data = data
            self.empty = False
            agg_func = lambda s: [(w, t) for w, t in zip(s[word_column].values.tolist(), s[tag_column].values.tolist())]
            self.grouped = self.data.groupby(sentence_column).apply(agg_func)
            self.sentences = [s for s in self.grouped]

        def get_next(self):
            try:
                s = self.grouped["Sentence {}".format(self.n_sent)]
                self.n_sent += 1
                return s
            except:
                return None

    getter = SentenceGetter(data)
    sentences = getter.sentences

    word2idx = {w: i for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    max_len = data.groupby([sentence_column], sort=False).size().max()

    x = [[word2idx[w[0]] for w in s] for s in sentences]
    x = pad_sequences(maxlen=max_len, sequences=x, padding="post", value=word2idx["ENDPAD"])

    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx[other_tag])

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

    return x_train, x_test, y_train, y_test, word2idx, tag2idx, max_len, words, tags

In [3]:
def build_matrix_embeddings(path, num_tokens, embedding_dim, word_index):


    hits, misses = 0, 0
    embeddings_index = {}

    print('Loading file...')

    sleep(0.5)

    for line in tqdm(open(path, encoding='utf-8')):
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    print("Processed %s Word Vectors." % len(embeddings_index))

    sleep(0.5)

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))

    for word, i in tqdm(word_index.items()):
        if i >= num_tokens:
            continue
        try:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                embedding_vector = embeddings_index.get(str(word).lower())
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    hits += 1
                else:
                    embedding_vector = embeddings_index.get(str(word).upper())
                    if embedding_vector is not None:
                        embedding_matrix[i] = embedding_vector
                        hits += 1
                misses += 1
        except:
            embedding_matrix[i] = embeddings_index.get('UNK')

    print("Hits: %d Tokens | Miss: %d Tokens" % (hits, misses))

    return embedding_matrix

In [4]:
dataset = 'kaggle/input/covid_subset_0.5perc.csv'
#dataset = 'kaggle/input/ner_dataset.csv'
x_train, x_test, y_train, y_test, word2idx, tag2idx, max_len, words, tags = preprocess_dataset(dataset, 'word', 'sentence', 'entity', 'Other')

In [5]:
file_path = 'kaggle/embed/glove.6B.100d.txt'

glove_embeddings = \
build_matrix_embeddings(path=file_path,
                        num_tokens=len(words), 
                        embedding_dim=100,
                        word_index=word2idx)

Loading file...


400000it [00:08, 49199.55it/s]


Processed 400000 Word Vectors.


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27637/27637 [00:00<00:00, 811108.71it/s]

Hits: 13412 Tokens | Miss: 14225 Tokens





In [6]:
print('Tensorflow version:', tf.__version__)
print('GPU detected:', tf.config.list_physical_devices('GPU'))

idx2tag = {v: k for k, v in tag2idx.items()}
idx2word = {i: w for w, i in word2idx.items()}

input_word = Input(shape=(max_len,))
#model = Embedding(input_dim=len(words), output_dim=128, input_length=max_len)(input_word)
model = Embedding(input_dim=glove_embeddings.shape[0], output_dim=glove_embeddings.shape[1],\
                  weights=[glove_embeddings], trainable=True)(input_word)
model = SpatialDropout1D(0.2)(model)
rnn = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.1))(model)
# attention = Attention()([rnn, rnn])
# hidden1 = keras.layers.Dense(len(tags), activation='relu')(attention)
# hidden2 = keras.layers.Dense(len(tags), activation='relu')(hidden1)
out = TimeDistributed(Dense(len(tags), activation="softmax"))(rnn)
model = Model(input_word, out)
model.summary()

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    batch_size=32,
    epochs=3,
    verbose=1
)

model.evaluate(x_test, y_test)

Tensorflow version: 2.10.1
GPU detected: []
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding (Embedding)       (None, 512, 100)          2763700   
                                                                 
 tf.identity (TFOpLambda)    (None, 512, 100)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 512, 256)         234496    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 512, 117)         30069     
 ibuted)                                                         
                                                                 
Total params: 3,0

[0.9166202545166016, 0.8162207007408142]

In [7]:
i = np.random.randint(0, x_train.shape[0])
# i = 151
print(i)
p = model.predict(np.array([x_train[i]]))
p = np.argmax(p, axis=-1)
y_true = y_train[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" * 30)
for w, true, pred in zip(x_train[i], y_true, p[0]):
    if w == len(words) - 1:
        break
    print("{:15}{}\t{}".format(words[w], tags[true], tags[pred]))
print("----------------------------")

679
Word           True 	 Pred

------------------------------
first          B-ORDINAL	Other
case           Other	Other
of             Other	Other
coronavirus    B-CORONAVIRUS	B-CORONAVIRUS
disease        Other	Other
2019           B-DATE	Other
covid-19       B-CORONAVIRUS	Other
pneumonia      B-DISEASE_OR_SYNDROME	Other
in             Other	Other
taiwan         B-GPE	Other
an             Other	Other
outbreak       Other	Other
of             Other	Other
respiratory    B-CHEMICAL	Other
illness        I-CHEMICAL	Other
proved         Other	Other
to             Other	Other
be             Other	Other
infected       Other	Other
by             Other	Other
a              Other	Other
2019           B-DATE	Other
novel          Other	Other
coronavirus    B-CORONAVIRUS	B-CORONAVIRUS
officially     Other	Other
named          Other	Other
coronavirus    B-CORONAVIRUS	B-CORONAVIRUS
disease        Other	Other
2019           B-DATE	Other
covid-19       B-CORONAVIRUS	Other
,              Other	Other
was

In [8]:
i = 65
print(i)
p = model.predict(np.array([x_train[i]]))

65


In [9]:
p[0][7]

array([1.0149045e-03, 4.9902797e-02, 2.8836224e-04, 1.0499849e-03,
       6.2999665e-04, 7.2337146e-04, 3.7516817e-04, 1.4316953e-03,
       1.4341905e-03, 1.5450266e-04, 3.1493750e-04, 2.4996619e-04,
       7.3772838e-04, 3.3050840e-04, 1.0587914e-03, 6.7634846e-04,
       1.7188586e-03, 6.9928134e-04, 4.5405333e-03, 5.7481008e-04,
       2.8238422e-04, 4.9291463e-03, 8.9148022e-03, 1.9839773e-04,
       3.2986901e-04, 3.9629903e-02, 6.2550191e-04, 4.1925260e-03,
       2.7366157e-04, 3.6528232e-04, 7.8856869e-04, 3.8730475e-04,
       1.0111126e-03, 5.6302984e-04, 2.4939733e-04, 1.6066578e-03,
       9.4540743e-03, 6.9909199e-04, 4.7315713e-02, 3.4822110e-04,
       3.0291028e-04, 8.9252442e-03, 2.3532055e-02, 1.2037166e-02,
       2.4140051e-03, 5.0438003e-04, 4.1516149e-01, 2.3991968e-04,
       3.9364002e-03, 2.8326095e-03, 2.0275955e-04, 2.8856448e-04,
       4.6837382e-04, 5.4264455e-03, 1.1171360e-02, 3.4942050e-04,
       1.0920324e-03, 1.2438878e-03, 1.8678027e-03, 5.1643222e

In [10]:
print([(idx2word[w], idx2tag[t]) for w, t in zip(x_test[107], y_test[107])])

[('patients', 'B-GROUP'), ('particularly', 'Other'), ('those', 'Other'), ('with', 'Other'), ('severe', 'B-ORG'), ('symptoms', 'I-ORG'), ('as', 'Other'), ('the', 'Other'), ('clinical', 'Other'), ('effects', 'Other'), ('might', 'Other'), ('be', 'Other'), ('related', 'Other'), ('to', 'Other'), ('the', 'Other'), ('indication', 'Other'), ('(severities', 'Other'), ('of', 'Other'), ('illness', 'B-DISEASE_OR_SYNDROME'), (',', 'Other'), ('the', 'Other'), ('timing', 'Other'), ('of', 'Other'), ('intervention,', 'Other'), ('the', 'Other'), ('dose', 'Other'), ('and', 'Other'), ('duration', 'Other'), ('of', 'Other'), ('corticosteroids', 'B-CHEMICAL'), ('therapy', 'Other'), ('9', 'Other'), ('.', 'Other'), ('of', 'Other'), ('note,', 'Other'), ('as', 'Other'), ('documented', 'Other'), ('in', 'Other'), ('a', 'Other'), ('series', 'Other'), ('of', 'Other'), ('randomized', 'Other'), ('clinical', 'Other'), ('trials', 'Other'), ('(rct),', 'Other'), ('low', 'Other'), ('or', 'Other'), ('physiologic', 'Other'),