# Import Character Vocab


In [159]:
import numpy as np
import json
import tensorflow as tf

In [160]:
# Load the character -> integer-id vocabulary used to encode every input window.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open('files/vocab.json', 'r', encoding='utf-8') as f:
  CHAR_INDICES = json.load(f)

In [161]:
# Show the full character -> id mapping that was loaded from files/vocab.json.
print(CHAR_INDICES)

{' ': 0, '(': 1, ')': 2, ',': 3, '-': 4, '0': 5, '1': 6, '2': 7, '3': 8, '4': 9, '5': 10, '6': 11, '7': 12, '8': 13, '9': 14, 'A': 15, 'B': 16, 'C': 17, 'D': 18, 'E': 19, 'F': 20, 'G': 21, 'H': 22, 'I': 23, 'J': 24, 'K': 25, 'L': 26, 'M': 27, 'N': 28, 'O': 29, 'P': 30, 'Q': 31, 'R': 32, 'S': 33, 'T': 34, 'U': 35, 'V': 36, 'W': 37, 'X': 38, 'Y': 39, 'Z': 40, 'a': 41, 'b': 42, 'c': 43, 'd': 44, 'e': 45, 'f': 46, 'g': 47, 'h': 48, 'i': 49, 'j': 50, 'k': 51, 'l': 52, 'm': 53, 'n': 54, 'o': 55, 'p': 56, 'q': 57, 'r': 58, 's': 59, 't': 60, 'u': 61, 'v': 62, 'w': 63, 'x': 64, 'y': 65, 'z': 66, '<pad>': 67, '<unk>': 68}


# Preprocessing text data

In [162]:

# Number of character ids in each input window; used as the default window
# length of create_dataset() and as the first entry of _input_shape.
look_back = 10

In [163]:
def create_dataset(text, look_back = look_back):
  """
  Encode boundary-annotated text into sliding windows and one-hot labels.

  `text` is a string in which '|' marks a cut position (for example
  "Meth|ane"). The '|' markers are never encoded; every other character
  produces one sample.

  For each character the sample is the list of the last `look_back` character
  ids ending at that character, left-padded with the '<pad>' id. Its label is
  1 when the character is immediately preceded by '|' (it starts a new
  token) and 0 otherwise. The first character is always labelled 1.

  Characters missing from CHAR_INDICES are encoded as '<unk>', the same way
  text_pred_preprocessing() handles them, instead of raising KeyError.

  Parameters:
    text: annotated string using '|' as the cut marker.
    look_back: window length (default: the module-level look_back at the
      time this function was defined).

  Returns:
    (X, y) where X is np.array of the encoded windows and y is
    tf.one_hot(labels, 2).
  """
  X, y = [], []
  # A leading marker makes the very first character count as a token start.
  text = '|' + text
  data = [CHAR_INDICES['<pad>']] * look_back
  for i in range(1, len(text)):
    current_char = text[i]
    before_char = text[i-1]

    # Cut markers only influence the label of the character that follows.
    if current_char == '|':
      continue

    # Fall back to '<unk>' for out-of-vocabulary characters.
    if current_char not in CHAR_INDICES:
      current_char = '<unk>'

    # Slide the window: drop the oldest id, append the newest.
    data = data[1:] + [CHAR_INDICES[current_char]]  # X data

    target = 1 if before_char == '|' else 0  # y data
    X.append(data)
    y.append(target)

  return np.array(X), tf.one_hot(y, 2)

In [164]:
def text_pred_preprocessing(text, sequence_len=10):
  """
  Encode un-annotated text into sliding windows of character ids.

  Works like create_dataset() but takes plain text (no '|' markers) and
  returns no labels. One window is produced per character of `text`: the
  last `sequence_len` character ids ending at that character, left-padded
  with the '<pad>' id. Characters missing from CHAR_INDICES are encoded as
  '<unk>'.

  Parameters:
    text: the string to encode.
    sequence_len: window length (default 10).
      NOTE(review): presumably this must equal the look_back used to build
      the training data -- confirm when changing either value.

  Returns:
    np.array built from the list of windows, one row per character.
  """
  X = []
  data = [CHAR_INDICES['<pad>']] * sequence_len
  for char in text:
    char = char if char in CHAR_INDICES else '<unk>'  # check char in dictionary
    # Slide the window: drop the oldest id, append the newest.
    data = data[1:] + [CHAR_INDICES[char]]  # X data
    X.append(data)
  return np.array(X)

In [165]:
def word_tokenize(text, class_):
    """
    Split `text` into pieces at the positions flagged in `class_`.

    Parameters:
      text: the string to split.
      class_: sequence of per-character labels (list or array); a value of 1
        at index i means a new piece starts at text[i].

    Returns:
      List of substrings of `text`, in order.

    Index 0 is always treated as the start of the first piece. Without that,
    any characters in front of the first flagged position would be dropped
    whenever class_[0] is not 1.
    """
    labels = np.asarray(class_).ravel()
    if labels.size == 0:
        return []

    # Start offsets of every piece: 0, then each flagged index after it.
    cut_indexs = [0]
    cut_indexs.extend(i for i, value in enumerate(labels) if i > 0 and value == 1)
    # Sentinel so the last piece runs to the end of the labelled range.
    cut_indexs.append(len(labels))

    # Each piece spans from one cut up to (not including) the next cut.
    return [text[start:stop] for start, stop in zip(cut_indexs, cut_indexs[1:])]

In [166]:
def decode_label(y):
  """Return, as a NumPy array, the index of the largest value along the last axis of `y`."""
  class_ids = tf.argmax(y, axis=-1)
  return class_ids.numpy()

In [167]:
# Load the plain name dictionary and its '|'-annotated counterpart.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open('files/Dictionary.json', 'r', encoding='utf-8') as f1, \
     open('files/Dictionary_cut.json', 'r', encoding='utf-8') as f2:
    Dict = json.load(f1)
    Dict_cut = json.load(f2)

In [168]:
# Print "<key> <number of entries>" for both dictionaries, with a dashed rule
# between the two listings.
for position, table in enumerate((Dict, Dict_cut)):
    if position:
        print('-'*15)
    for key in table:
        print(key, len(table[key]))

alkane 10
cyclo_alkane 8
alkene 25
dialkene 69
cyclo_alkene 8
alkyne 9
dialkyne 49
cyclo_alkyne 8
---------------
alkane 10
cyclo_alkane 8
alkene 25
dialkene 69
cyclo_alkene 8
alkyne 9
dialkyne 49
cyclo_alkyne 8


# Create Dataset

In [169]:
# Iterating a dict yields its keys, so the last list element is the last key.
last_key = list(Dict)[-1]
print(last_key)

cyclo_alkyne


In [170]:
# Concatenate every annotated name into one string, placing '| |' (cut, space,
# cut) between consecutive names. str.join puts the separator strictly between
# items, so no "is this the last name?" check is needed. The previous
# value-equality check would also have skipped the separator after any earlier
# name equal to the final one.
dataset_cut = '| |'.join(
    name for names in Dict_cut.values() for name in names
)
print(dataset_cut[:101])
print(len(dataset_cut.replace('|','')))

Meth|ane| |Eth|ane| |Prop|ane| |But|ane| |Pent|ane| |Hex|ane| |Hept|ane| |Oct|ane| |Non|ane| |Dec|ane
2452


In [171]:
# Concatenate every plain name into one string with a single space between
# consecutive names. str.join puts the separator strictly between items, so no
# "is this the last name?" check is needed. The previous value-equality check
# would also have skipped the space after any earlier name equal to the final
# one.
dataset = ' '.join(
    name for names in Dict.values() for name in names
)
print(dataset[:73])
print(len(dataset))

Methane Ethane Propane Butane Pentane Hexane Heptane Octane Nonane Decane
2452


In [185]:
# X_train / y come from the '|'-annotated text; X_test encodes the same names
# without annotations, so both arrays have one row per character.
X_train, y = create_dataset(dataset_cut)
X_test = text_pred_preprocessing(dataset)

print(y.numpy())
print(f"{X_train.shape} {y.shape}")
print(X_test.shape)

[[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]]
(2452, 10) (2452, 2)
(2452, 10)


In [173]:
# Build the input pipeline in one expression so re-running the cell is idempotent.
# Keras does not shuffle a tf.data.Dataset passed to fit(), and consecutive
# rows here are overlapping windows of neighbouring characters, so shuffle
# explicitly (buffer = whole dataset, reshuffled every epoch) before batching.
training_data = (
    tf.data.Dataset.from_tensor_slices((X_train, y))
    .shuffle(buffer_size=len(X_train), reshuffle_each_iteration=True)
    .batch(128)
)

# Create Model

## Model Architecture

In [174]:
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Embedding
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

In [175]:
# (window length, vocabulary size); print the vocabulary size.
_input_shape = look_back, len(CHAR_INDICES)
print(_input_shape[-1])

69


In [176]:
# Character-level boundary classifier, assembled layer by layer.
Model = Sequential(name='model')

# Character id -> dense vector; both the input and output size equal the vocabulary size.
Model.add(Embedding(len(CHAR_INDICES), _input_shape[1]))

# Bidirectional LSTM over the window; only the final state is returned and the
# forward and backward outputs are summed.
Model.add(
    Bidirectional(
        LSTM(_input_shape[1]//2, return_sequences=False),
        merge_mode='sum',
        weights=None,
    )
)

# Intermediate dense layer (no activation given) followed by the 2-way softmax head.
Model.add(Dense(_input_shape[1]//4))
Model.add(Dense(2, activation='softmax'))

In [177]:
# Print the layer-by-layer summary of the model defined above.
Model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 69)          4761      
_________________________________________________________________
bidirectional_6 (Bidirection (None, 34)                28288     
_________________________________________________________________
dense_12 (Dense)             (None, 17)                595       
_________________________________________________________________
dense_13 (Dense)             (None, 2)                 36        
Total params: 33,680
Trainable params: 33,680
Non-trainable params: 0
_________________________________________________________________


## Training Model

In [178]:
# Labels are one-hot with two classes, hence categorical cross-entropy.
Model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)
# Kept as the last expression so the returned History object is displayed.
Model.fit(training_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2821ec275b0>

In [179]:
# Per-character class probabilities, shown rounded to 3 decimals.
pred_proba = Model.predict(X_test)
print(np.round(pred_proba, 3))

[[0.037 0.963]
 [0.994 0.006]
 [1.    0.   ]
 ...
 [0.006 0.994]
 [0.955 0.045]
 [0.962 0.038]]


In [180]:
# Collapse the one-hot labels and predicted probabilities to class indices.
y_decode = decode_label(y)
pred = decode_label(pred_proba)

print(f"true: {y_decode}")
print(f"predict: {pred}")
print(f"y: {y_decode.shape} ans: {pred.shape}")

true: [1 0 0 ... 1 0 0]
predict: [1 0 0 ... 1 0 0]
y: (2452,) ans: (2452,)


In [181]:
# Count same item between y (label) and pred (prediction)
elem_same = (y_decode == pred).sum()

print("Same =",elem_same,", Not Same =",pred.shape[0]-elem_same)
print("\nConfusion Matrix:")
print(tf.math.confusion_matrix(y_decode, pred, num_classes=2).numpy())

Same = 2426 , Not Same = 26

Confusion Matrix:
[[1002    3]
 [  23 1424]]


## Test on unseen data

In [182]:
# Sample compound name and the same name with '|' marking the expected cut positions.
myText = "Benzyl-N,2-dimethylpentan-3-aminoate"
myText_cut = "Benz|yl|-|N|,|2|-|di|meth|yl|pent|an|-|3|-|amin|oate"

In [183]:
# Encode the annotated sample, predict, and reduce both sides to class indices.
myText_test, my_y = create_dataset(myText_cut)
my_y_decode = decode_label(my_y)

pred_test_proba = Model.predict(myText_test)
pred_test = decode_label(pred_test_proba)

print(pred_test)
print(my_y_decode)

# Number of positions where the prediction and the label agree.
elem_same = np.sum(pred_test == my_y_decode)
elem_diff = len(pred_test) - elem_same
print("\nSame =",elem_same,", Not Same =",elem_diff)

print("\nConfusion Matrix:")
confusion = tf.math.confusion_matrix(my_y_decode, pred_test, num_classes=2)
print(confusion.numpy())

[1 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0]
[1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1 1 1 0 0 0 1 0 0 0]

Same = 30 , Not Same = 6

Confusion Matrix:
[[16  3]
 [ 3 14]]


In [184]:
# Split the raw sample at the predicted boundaries and show the pieces,
# first as a list and then joined with '|'.
words = word_tokenize(myText, pred_test)
print(words)
print(*words, sep='|')

['Benzyl', '-', 'N', ',', '2', '-', 'di', 'met', 'h', 'ylpent', 'an', '-', '3', '-', 'a', 'mino', 'ate']
Benzyl|-|N|,|2|-|di|met|h|ylpent|an|-|3|-|a|mino|ate
