In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from CRF import CRF
import pandas as pd
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from model_word2vec import Model
from gensim.models import FastText
import logging

In [2]:
# Record the TensorFlow version in use for this run.
print(tf.__version__)

2.2.0


In [3]:
def load_data(path):
    """Read a tab-separated token/label file into parallel sentence lists.

    Each non-blank line holds a token and its label separated by a tab; a line
    consisting only of a newline ends the current sentence.

    Args:
        path: Location of the file to read.

    Returns:
        A tuple ``(X, Y)`` where ``X[i]`` is the list of tokens of sentence
        ``i`` and ``Y[i]`` is the matching list of labels.

    Raises:
        IndexError: If a non-blank line has no tab-separated second field.
    """
    X, Y = [], []
    x, y = [], []

    # The context manager closes the handle even if parsing raises.
    with open(path, "r") as f:
        for line in f:
            if line == "\n":
                # Sentence boundary: store what was collected and start over.
                X.append(x)
                Y.append(y)
                x, y = [], []
            else:
                data = line.strip().split('\t')
                x.append(str(data[0]))
                y.append(str(data[1]))

    # Flush the last sentence when the file does not end with a blank line.
    if len(x) > 0:
        X.append(x)
        Y.append(y)
    return X, Y

In [4]:
# Locations of the training and evaluation splits.
train_data_path = "./train/Genia4ERtask1.iob2"
eval_data_path = "./evaluation/Genia4EReval1.iob2"

# Parse each split into per-sentence token lists and label lists.
X_train, Y_train = load_data(train_data_path)
X_eval, Y_eval = load_data(eval_data_path)

# Keep a reference to the raw evaluation tokens before X_eval is rebound.
X_eval_origin = X_eval

# Sentence lengths, measured in tokens.
lengths_train = list(map(len, X_train))
lengths_eval = list(map(len, X_eval))

In [5]:
# Longest sentence across both splits; every sentence is padded to this length.
max_len_train = max(len(sentence) for sentence in X_train)
max_len_eval = max(len(sentence) for sentence in X_eval)
max_len = max(max_len_train, max_len_eval)
print("max_len: ", max_len)

# Right-pad the token sequences with the placeholder token 'Backlight'.
X_train = tf.keras.preprocessing.sequence.pad_sequences(
    X_train,
    dtype=object,
    value='Backlight',
    padding="post",
    maxlen=max_len
)
X_eval = tf.keras.preprocessing.sequence.pad_sequences(
    X_eval,
    dtype=object,
    value='Backlight',
    padding="post",
    maxlen=max_len
)
print("X_train:\n", X_train)
print("X_eval:\n", X_eval)

# FastText is fed plain nested lists built from both padded splits.
# NOTE(review): this corpus contains the padding token and the evaluation
# sentences - confirm that training the embeddings on both is intended.
X = np.concatenate((X_train, X_eval), axis=0).tolist()

# Summarise the corpus instead of printing it: X is a nested Python list, so
# print(X) emits every token and trips the notebook's IOPub data rate limit.
print("X: {} sentences, first tokens: {}".format(len(X), X[0][:10]))

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
ft_model = FastText(X, min_count=1, size=200, iter=20)
ft_model.save("fasttext.model")
# The nested-list copy is only needed for training; drop it afterwards.
del X

max_len:  208
X_train:
 [['IL-2' 'gene' 'expression' ... 'Backlight' 'Backlight' 'Backlight']
 ['Activation' 'of' 'the' ... 'Backlight' 'Backlight' 'Backlight']
 ['In' 'primary' 'T' ... 'Backlight' 'Backlight' 'Backlight']
 ...
 ['This' 'gp160' 'treatment' ... 'Backlight' 'Backlight' 'Backlight']
 ['Effects' 'similar' 'to' ... 'Backlight' 'Backlight' 'Backlight']
 ['The' 'aberrant' 'activation' ... 'Backlight' 'Backlight' 'Backlight']]
X_eval:
 [['Number' 'of' 'glucocorticoid' ... 'Backlight' 'Backlight' 'Backlight']
 ['The' 'study' 'demonstrated' ... 'Backlight' 'Backlight' 'Backlight']
 ['In' 'the' 'lymphocytes' ... 'Backlight' 'Backlight' 'Backlight']
 ...
 ['Higher' 'nuclear' 'content' ... 'Backlight' 'Backlight' 'Backlight']
 ['CONCLUSION' '.' 'Backlight' ... 'Backlight' 'Backlight' 'Backlight']
 ['It' 'is' 'conceivable' ... 'Backlight' 'Backlight' 'Backlight']]


2020-06-14 05:38:55,585 : INFO : resetting layer weights
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

2020-06-14 05:39:11,255 : INFO : collecting all words and their counts
2020-06-14 05:39:11,256 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-06-14 05:39:11,624 : INFO : PROGRESS: at sentence #10000, processed 2080000 words, keeping 15440 word types
2020-06-14 05:39:12,004 : INFO : PROGRESS: at sentence #20000, processed 4160000 words, keeping 23125 word types
2020-06-14 05:39:12,101 : INFO : collected 25104 word types from a corpus of 4659616 raw words and 22402 sentences
2020-06-14 05:39:12,102 : INFO : Loading a fresh vocabulary
2020-06-14 05:39:12,185 : INFO : effective_m

In [6]:
# Inspect the vector the FastText model returns for the padding token.
ft_model.wv['Backlight']

array([ 3.3016303e+00,  3.4687457e+00,  3.0948467e+00, -4.4990101e+00,
        8.3867097e-01,  9.0488768e+00, -2.8340123e+00, -6.0428786e-01,
        1.7930250e-01, -3.6419766e+00,  5.4491673e-02,  2.6600723e+00,
       -1.4920245e+00, -2.1181319e+00,  2.5842035e+00, -6.6178761e+00,
       -4.9707189e+00, -1.1181245e+00,  3.1023738e-01,  2.4177692e+00,
        1.5093932e-01,  3.3681993e+00, -4.3830051e+00,  2.5112476e+00,
       -2.3588431e+00, -1.3748438e-02, -1.4787818e-01,  8.6635238e-01,
        1.6370355e+00, -7.0977297e+00, -4.8077069e-03,  6.3028288e+00,
        1.2784752e+00, -1.8567324e+00, -2.9085598e+00, -5.4660845e-01,
        4.5925527e+00,  2.3911141e-01, -2.2149954e+00, -4.1357155e+00,
       -4.1071038e+00,  1.7571224e+00, -2.3496962e+00, -2.7928078e+00,
        6.4037180e+00, -1.8147177e+00, -3.1739278e+00,  4.2749283e-01,
       -5.1916748e-01, -3.5767794e+00, -7.0898479e-01,  2.8378897e+00,
        2.9629257e-01, -2.7969213e+00,  2.8647742e+00,  1.9151322e+00,
      

In [7]:
def _embed_sentences(sentences):
    """Replace every token of every sentence by its FastText vector."""
    return [[ft_model.wv[token] for token in tokens] for tokens in sentences]


X_train = _embed_sentences(X_train)
X_eval = _embed_sentences(X_eval)

In [8]:
# Sanity check: report the array shape of each embedded split.
for embedded in (X_train, X_eval):
    print(np.array(embedded).shape)

(18546, 208, 200)
(3856, 208, 200)


In [9]:
# Index 0 is reserved for the padding label, matching value=0 used below.
label2int = {"Backlight": 0}

# Give each unseen label the next free index, scanning train first, then eval.
for split in (Y_train, Y_eval):
    for labels in split:
        for label in labels:
            label2int.setdefault(label, len(label2int))
label_count = len(label2int)
print("label_count: ", label_count)

# Swap label strings for their integer indices.
Y_train = [[label2int[label] for label in labels] for labels in Y_train]
Y_eval = [[label2int[label] for label in labels] for labels in Y_eval]

# Right-pad to max_len with index 0, then add a trailing axis of size 1.
Y_train = np.expand_dims(
    tf.keras.preprocessing.sequence.pad_sequences(
        Y_train, value=0, padding="post", maxlen=max_len
    ),
    axis=-1,
)
Y_eval = np.expand_dims(
    tf.keras.preprocessing.sequence.pad_sequences(
        Y_eval, value=0, padding="post", maxlen=max_len
    ),
    axis=-1,
)
print("Y_train:", Y_train.shape)
print("Y_eval:", Y_eval.shape)

label_count:  12
Y_train: (18546, 208, 1)
Y_eval: (3856, 208, 1)


In [10]:
# Settings passed to Model (imported from model_word2vec).
training_config = {"batch_size": 512, "epochs": 300}
model = Model(**training_config)

In [11]:
# Train on the embedded training sentences and their padded integer labels.
model.fit(X_train, Y_train, label_count=label_count)

(18546, 208, 200)
(18546, 208, 1)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 208, 200)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 208, 64)           59648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 208, 64)           0         
_________________________________________________________________
crf_layer (CRF)              (None, 208)               948       
Total params: 60,596
Trainable params: 60,596
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300

In [12]:
# Run the trained model on the embedded evaluation sentences.
Y_pred = model.predict(X_eval)

In [13]:
# Display the raw predictions.
Y_pred

array([[3, 3, 4, ..., 0, 0, 0],
       [3, 3, 3, ..., 0, 0, 0],
       [3, 3, 6, ..., 0, 0, 0],
       ...,
       [3, 3, 3, ..., 0, 0, 0],
       [3, 3, 0, ..., 0, 0, 0],
       [3, 3, 3, ..., 0, 0, 0]], dtype=int32)

In [14]:
# Display the padded reference labels for comparison with Y_pred.
Y_eval

array([[[3],
        [3],
        [4],
        ...,
        [0],
        [0],
        [0]],

       [[3],
        [3],
        [3],
        ...,
        [0],
        [0],
        [0]],

       [[3],
        [3],
        [6],
        ...,
        [0],
        [0],
        [0]],

       ...,

       [[3],
        [3],
        [3],
        ...,
        [0],
        [0],
        [0]],

       [[3],
        [3],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[3],
        [3],
        [3],
        ...,
        [0],
        [0],
        [0]]], dtype=int32)

In [15]:
# List the distinct values that occur in the predictions.
Y_pred_unique = np.unique(Y_pred)
print(Y_pred_unique)

[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [16]:
# Reverse lookup: integer index -> label string.
int2label = {index: name for name, index in label2int.items()}

In [17]:
# Translate every predicted index back into its label string.
Y_pred = [[int2label[index] for index in row] for row in Y_pred]

In [18]:
# Each position holds a one-element list; unpack it and map it to its label.
Y_eval = [[int2label[index] for (index,) in row] for row in Y_eval.tolist()]

In [19]:
length = len(Y_pred)
y = []        # reference labels of the real (unpadded) tokens, flattened
y_prime = []  # predicted labels for the same tokens, flattened

answer_path = './check/answer_ft.iob2'
# Create the output folder if needed so a fresh checkout does not fail here.
os.makedirs(os.path.dirname(answer_path), exist_ok=True)

with open(answer_path, 'w') as f:
    for i in range(length):
        # Only the first lengths_eval[i] positions are real tokens.
        for j in range(lengths_eval[i]):
            # A padding label predicted for a real token is recorded as "O".
            if Y_pred[i][j] == "Backlight":
                Y_pred[i][j] = "O"
            # One "token<TAB>label" line per token.
            f.write("{}\t{}\n".format(X_eval_origin[i][j], Y_pred[i][j]))

            y.append(Y_eval[i][j])
            y_prime.append(Y_pred[i][j])
        # A blank line separates sentences.
        f.write('\n')

In [20]:
# Per-label precision/recall/F1 over the unpadded evaluation tokens.
print(classification_report(y, y_prime))

              precision    recall  f1-score   support

       B-DNA       0.72      0.69      0.70      1056
       B-RNA       0.72      0.76      0.74       118
 B-cell_line       0.58      0.60      0.59       500
 B-cell_type       0.82      0.66      0.73      1921
   B-protein       0.72      0.84      0.77      5067
       I-DNA       0.74      0.85      0.79      1789
       I-RNA       0.78      0.84      0.81       187
 I-cell_line       0.56      0.74      0.64       989
 I-cell_type       0.85      0.73      0.79      2991
   I-protein       0.78      0.77      0.77      4774
           O       0.97      0.96      0.97     81647

    accuracy                           0.93    101039
   macro avg       0.75      0.77      0.76    101039
weighted avg       0.93      0.93      0.93    101039

