In [1]:
# import os
# import shutil
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras import layers
# import tensorflow_hub as hub
# import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

In [2]:
def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

In [30]:
df = pd.read_csv('data/train/train.csv', sep='\t')
print("Number of sentences =", len(df))
print("\nData:")
print(df.iloc[:3])

Number of sentences = 1895

Data:
                                                text  label
0  5. Can regularly rinsing your nose with saline...      0
1  French police chief killed himself after #Char...      1
2  Coronavirus disease (COVID-19) advice for the ...      0


In [9]:
sum(df_dev['label'])

139

In [31]:
df_dev = pd.read_csv('data/dev/dev.csv', sep='\t')
print("Number of sentences =", len(df_dev))
print("\nData:")
print(df_dev.iloc[:3])

Number of sentences = 632

Data:
                                                text  label
0  COVID-19 Fact:\nAre hand dryers effective in k...      0
1  atruchecks when can we expect the result of my...      0
2  How does COVID-19 spread? \n\nPeople can catch...      0


In [28]:
df_test = pd.read_csv('data/test/test.csv', sep='\t')
print("Number of sentences =", len(df_test))
print("\nData:")
print(df_test.iloc[:3])

Number of sentences = 558

Data:
                                                text
0  How Does COVID-19 Spread? http http WCCO Thank...
1  brain_warrior I hate to keep saying it, but Ca...
2  Q. How are COVID-19 and influenza viruses diff...


In [32]:
sentences = df['text'].values
labels = df['label'].values
y_train = np.array(labels)

In [33]:
dev_sentences = df_dev['text'].values
dev_labels = df_dev['label'].values
y_dev = np.array(dev_labels)

In [29]:
test_sentences = df_test['text'].values

In [34]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token="<UNK>", num_words=10000)
tokenizer.fit_on_texts(sentences)

x_train = tokenizer.texts_to_matrix(sentences, mode="count") #BOW representation
x_dev = tokenizer.texts_to_matrix(dev_sentences, mode="count")
x_test = tokenizer.texts_to_matrix(test_sentences, mode="count")
vocab_size = x_train.shape[1]
print("Vocab size =", vocab_size)
print(x_train[0])

Vocab size = 10000
[ 0.  0. 16. ...  0.  0.  0.]


In [19]:
tokenizer.__dict__

{'word_counts': OrderedDict([('5', 134),
              ('can', 1449),
              ('regularly', 101),
              ('rinsing', 85),
              ('your', 1038),
              ('nose', 140),
              ('with', 1707),
              ('saline', 81),
              ('help', 290),
              ('prevent', 376),
              ('infection', 233),
              ('the', 8840),
              ('new', 1325),
              ('coronavirus', 1471),
              ('http', 10953),
              ('4', 377),
              ('eating', 121),
              ('garlic', 134),
              ('covid19malaysia', 5),
              ('6', 269),
              ('do', 913),
              ('vaccines', 124),
              ('against', 385),
              ('pneumonia', 112),
              ('protect', 261),
              ('you', 2498),
              ('7', 121),
              ('spraying', 95),
              ('alcohol', 185),
              ('or', 1203),
              ('chlorine', 86),
              ('all', 837),
        

In [35]:
xseq_train = tokenizer.texts_to_sequences(sentences)
print(xseq_train[0])

[312, 23, 434, 516, 33, 293, 19, 551, 117, 91, 155, 19, 3, 26, 22, 2, 90, 23, 358, 313, 117, 91, 155, 19, 3, 26, 22, 4443, 2, 128, 39, 348, 88, 391, 133, 11, 88, 3, 26, 22, 2, 359, 23, 462, 205, 30, 509, 45, 147, 33, 302, 126, 3, 26, 22, 3957, 2, 607, 44, 107, 16, 498, 510, 7, 598, 27, 185, 19, 3, 26, 22, 2, 566, 23, 52, 876, 850, 861, 126, 3, 26, 22, 2, 303, 16, 287, 517, 107, 7, 379, 3, 26, 22, 2, 695, 3, 26, 22, 297, 21, 216, 183, 770, 738, 2, 659, 360, 4, 202, 771, 59, 18, 91, 3, 26, 22, 113, 2, 749, 452, 621, 9, 877, 297, 126, 3, 26, 22, 2, 715, 29, 32, 65, 23, 21, 216, 7, 499, 19, 202, 9, 815, 878, 2, 879, 739, 205, 59, 18, 133, 11, 88, 29, 32, 9, 23, 21, 791, 2, 1138, 193, 816, 5, 1031, 33, 1288, 17, 303, 1445, 30, 64, 493, 880, 30, 1327, 2056, 59, 18, 380, 11, 2, 1216, 11, 23, 953, 28, 3, 22, 113, 29, 32, 971, 3, 26, 22, 59, 18, 380, 11, 53, 758, 2, 1074, 1406, 447, 5, 3, 1007, 30, 5, 1407, 740, 151, 1628, 1581, 59, 18, 91, 3, 22, 2691, 2, 32, 478, 759, 830, 39, 18, 119, 29, 32

In [36]:
xseq_dev = tokenizer.texts_to_sequences(dev_sentences)
print(xseq_dev[0])

[29, 32, 201, 16, 287, 517, 107, 7, 379, 3, 26, 22, 1046, 3, 536, 19, 33, 1202, 67, 2, 4376, 36, 16, 7, 201, 1, 1, 1]


In [37]:
xseq_test = tokenizer.texts_to_sequences(test_sentences)
print(xseq_test[0])

[44, 59, 29, 32, 119, 2, 2, 1, 363, 1, 11, 16, 3, 106, 997, 10, 23, 1259, 7, 3, 483, 5, 1460, 160, 116, 54, 2961, 463, 5, 214, 46, 2]


In [38]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 2048
xseq_train = pad_sequences(xseq_train, padding='post', maxlen=maxlen)
xseq_dev = pad_sequences(xseq_dev, padding='post', maxlen=maxlen)
xseq_test = pad_sequences(xseq_test, padding='post', maxlen=maxlen)
print(xseq_train[0])

[312  23 434 ...   0   0   0]


In [247]:
xseq_train = xseq_train.reshape(1895, 128, 8)
xseq_dev = xseq_dev.reshape(632, 128, 8)

In [39]:
from keras.layers import *
from keras.models import *
from keras import backend as K
class attention(Layer):
    def __init__(self, return_sequences=True):
        self.return_sequences = return_sequences

        super(attention,self).__init__()

    def build(self, input_shape):
        self.W=self.add_weight(name="att_weight", shape=(input_shape[-1],1),
                               initializer="normal")
        self.b=self.add_weight(name="att_bias", shape=(input_shape[1],1),
                               initializer="normal")

        super(attention,self).build(input_shape)


    def call(self, x):
        e = K.tanh(K.dot(x,self.W)+self.b)
        a = K.softmax(e, axis=1)
        output = x*a
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

In [25]:
import keras.backend as K
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [56]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64, input_length=maxlen),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [54]:
mc = tf.keras.callbacks.ModelCheckpoint('model/best_model-{val_get_f1:05f}.h5', monitor='val_get_f1', mode='max', verbose=0, save_best_only=True, initial_value_threshold=0.86)

In [251]:
xseq_dev

array([[  29,   32,  201, ...,    0,    0,    0],
       [   1,   85,   23, ...,    0,    0,    0],
       [  44,   59,   29, ...,    0,    0,    0],
       ...,
       [  33,  838, 4249, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0]])

In [274]:
t = np.array([1,2,3,4,5,6,7,8,9,10,11,12,1,2,3,4,5,6,7,8,9,10,11,12])
t = t.reshape(2,3,4)
t.reshape(2,1,3,4)

array([[[[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]]],


       [[[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]]]])

In [None]:
embed = Embedding(5000, 50, input_length=1024)(xseq_train)
embed_dev = Embedding(5000, 50, input_length=1024)(xseq_dev)
embed_dev

In [275]:
xseq_train = tf.reshape(embed, [1895, 64, 16, 50])
xseq_dev = tf.reshape(embed_dev, [632, 64, 16, 50])

In [281]:
Conv2D(filters=50, kernel_size=(3,3), padding='same', activation='relu', input_shape = (64,16,50))(xseq_train).shape

TensorShape([1895, 64, 16, 50])

In [None]:
while(True):
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length=2048))
    # model.add(Conv2D(filters=50, kernel_size=(3,50), padding='same', activation='relu'))
    # model.add(Dropout(0.5))
    # model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy', get_f1])
    history = model.fit(xseq_train, y_train, epochs=30, batch_size= 128,
                        validation_data = (xseq_dev, y_dev), verbose = 0, callbacks = [mc])

In [57]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy', get_f1])

In [58]:
history = model.fit(xseq_train, y_train, epochs=20, batch_size= 128,
                    validation_data = (xseq_dev, y_dev), verbose = 1, callbacks = [mc])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [150]:
from sklearn.metrics import f1_score

In [26]:
xseq_test

array([[  44,   59,   29, ...,    0,    0,    0],
       [ 972, 2151,   10, ...,    0,    0,    0],
       [ 136,   44,   16, ...,    0,    0,    0],
       ...,
       [ 507, 2215,    1, ...,    0,    0,    0],
       [1098,   66,   68, ...,    0,    0,    0],
       [  23,   29,   32, ...,    0,    0,    0]])

In [40]:
model1 = tf.keras.models.load_model('model/best_model-0.861965.h5', custom_objects={'get_f1':get_f1})
model2 = tf.keras.models.load_model('model/best_model-0.862929.h5', custom_objects={'get_f1':get_f1})
model3 = tf.keras.models.load_model('model/best_model-0.863069.h5', custom_objects={'get_f1':get_f1})
model4 = tf.keras.models.load_model('model/best_model-0.870752.h5', custom_objects={'get_f1':get_f1})
model5 = tf.keras.models.load_model('model/best_model-0.872530.h5', custom_objects={'get_f1':get_f1})

In [41]:
y_test_1 = model1.predict(np.array(xseq_test)[:,:1024])
y_test_2 = model2.predict(xseq_test[:,:1024])
y_test_3 = model3.predict(xseq_test)
y_test_4 = model4.predict(xseq_test)
y_test_5 = model5.predict(xseq_test)

In [59]:
y_test = model.predict(xseq_test)

In [60]:
import csv
index = 0
p_label = []
for value in y_test:
    if value[0] >= 0.5:
        p_label.append(1)
    else:
        p_label.append(0)
df = pd.DataFrame({'Id': list(range(len(p_label))), 'Predicted': p_label})
df.to_csv('data/test/test_pred.csv', sep=',', index=False, encoding='utf-8')

In [36]:
def get_01(k):
    r = []
    for i in k:
        if i >= 0.5:
            r.append(1)
        else:
            r.append(0)
    if sum(r) > 2:
        return 1
    else:
        return 0


In [37]:
import csv
index = 0
p_label = []
for i in range(558):
    temp = get_01([y_test_1[i],y_test_2[i],y_test_3[i],y_test_4[i],y_test_5[i]])
    p_label.append(temp)
df = pd.DataFrame({'Id': list(range(558)), 'Predicted': p_label})
df.to_csv('data/test/test_pred.csv', sep=',', index=False, encoding='utf-8')

In [47]:

df2 = pd.read_csv('pred/test.pred (45).csv')
df3 = pd.read_csv('pred/test_pred50.csv')
df4 = pd.read_csv('pred/test.pred (31).csv')
df6 = pd.read_csv('pred/test_pred55.csv')
df7 = pd.read_csv('pred/test.pred (35).csv')


In [66]:
result = df2['Predicted'] + df3['Predicted'] + df4['Predicted'] + y_test[:,0] + y_test_5[:,0]

In [67]:
p_label = []
for i in result:
    if i > 2:
        p_label.append(1)
    else:
        p_label.append(0)
p_label
df = pd.DataFrame({'Id': list(range(558)), 'Predicted': p_label})
df.to_csv('data/test/test_pred.csv', sep=',', index=False, encoding='utf-8')

In [68]:
sum(p_label)

140

In [70]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 2048, 64)          640000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 256)               33024     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 739,329
Trainable params: 739,329
Non-trainable params: 0
________________________________________________