In [None]:
# Read the training corpus: one text sample per line in X, one integer label per line in Y.
with open('data/X_train_new.txt', 'r') as f:
    X = f.read().splitlines()
with open('data/Y_train_new.txt', 'r') as f:
    Y = [int(label) for label in f.read().splitlines()]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for validation. The split is stratified on the
# labels and seeded so it is reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(
    X, Y, test_size=0.33, random_state=996, stratify=Y
)

In [None]:
import os
import numpy as np

path_to_glove_file = "model/w2v_new_200.txt"

# Build a token -> vector lookup from a whitespace-separated text file:
# the first field of each line is the token, the rest are its float32 coefficients.
# NOTE(review): if this file is in word2vec text format, its first line is a
# "<vocab_size> <dim>" header and would be stored here as a bogus 1-element
# vector — confirm the file layout.
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, dtype="f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
import json
from collections import Counter
from itertools import chain

# Shrink the vocabulary: count every space-separated token in the corpus and
# keep only non-empty tokens that occur at least 10 times.
# NOTE(review): the counts are taken over all of X, which also contains the
# samples later used for validation.
token_counts = Counter(chain.from_iterable(sentence.split(' ') for sentence in X))
VOCAB = [word for word, freq in token_counts.items() if word != '' and freq >= 10]

# Alternative: load a pre-constructed vocabulary instead.
# with open('data/dictionary.json', 'r') as f:
#     VOCAB = list(json.load(f).keys())[:len(embeddings_index)-1]

VOCAB_SIZE = len(VOCAB)

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf

# max_tokens is VOCAB_SIZE + 2 because the layer reserves two indices ahead of
# the supplied vocabulary: 0 for padding ('') and 1 for out-of-vocabulary tokens.
#
# No `ngrams` argument is passed on purpose: VOCAB only contains single
# space-split words, so any generated bigram/trigram could never be found in
# it. They would all be mapped to the OOV index and occupy the sequence slots
# that should hold padding.
#
# NOTE(review): the layer's default standardization lowercases the text and
# strips punctuation before lookup, while VOCAB is built from the raw tokens —
# confirm the corpus is already normalized, otherwise such entries never match.
vectorizer = TextVectorization(
    max_tokens=VOCAB_SIZE + 2,
    output_sequence_length=300,
    vocabulary=VOCAB,
)

In [None]:
# Map every token known to the vectorizer to its integer index.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# NOTE(review): get_vocabulary() may already include the padding and OOV
# entries, in which case "+ 2" just allocates two extra unused rows — confirm.
num_tokens = len(voc) + 2
embedding_dim = 200
hits = 0
misses = 0

# Prepare embedding matrix: row i holds the pretrained vector of the token with
# index i. Tokens without a pretrained vector keep their all-zero row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this token: leave the row at zero.
        misses += 1
    else:
        embedding_matrix[i] = embedding_vector
        hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import initializers

# Frozen, pretrained variant (currently disabled):
# embedding_layer = Embedding(
#     num_tokens,
#     embedding_dim,
#     embeddings_initializer=initializers.Constant(embedding_matrix),
#     trainable=False,
# )

# Active variant: an embedding table with the default initializer that is
# learned during training. embedding_matrix is not used by this layer.
embedding_layer = Embedding(input_dim=num_tokens, output_dim=embedding_dim)

In [None]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.regularizers import l1_l2	

# Input is one raw string per sample; vectorization and embedding happen inside
# the model graph.
int_sequences_input = keras.Input(shape=(1,), dtype=tf.string)
vect = vectorizer(int_sequences_input)
embedded_sequences = embedding_layer(vect)

# Stem: two Conv1D -> BatchNorm -> ReLU stages; only the first one is strided.
x = embedded_sequences
for filters, stride in ((32, 2), (64, 1)):
    x = layers.Conv1D(filters, 3, strides=stride, padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

# Input of the upcoming residual block, kept for the shortcut branch.
previous_block_activation = x

# Residual blocks: two ReLU -> Conv1D -> BatchNorm stages, a stride-2 max pool,
# then a stride-2 1x1 convolution of the block input added back as a shortcut.
for size in [128, 256, 512, 728]:
    for stage in range(2):
        x = layers.Activation("relu")(x)
        x = layers.Conv1D(size, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

    x = layers.MaxPooling1D(3, strides=2, padding="same")(x)

    # Project the block input so it matches the pooled branch before adding.
    residual = layers.Conv1D(size, 1, strides=2, padding="same")(
        previous_block_activation
    )

    x = layers.Add()([x, residual])
    previous_block_activation = x

# Final separable convolution with dropout before the recurrent layers.
x = layers.SeparableConv1D(1024, 3, padding="same")(x)
x = layers.BatchNormalization()(x)
x = layers.Activation("relu")(x)
x = layers.Dropout(0.5)(x)

# Two stacked LSTMs; the second returns only its last output.
x = layers.LSTM(256, dropout=0.3, return_sequences=True)(x)
x = layers.LSTM(256, dropout=0.3)(x)

# Dense head: two Dense -> BatchNorm -> ReLU stages.
for units in (200, 100):
    x = layers.Dense(units)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

# Single linear output unit, i.e. the model emits a raw score (no sigmoid).
preds = layers.Dense(1, activation="linear")(x)
model = keras.Model(int_sequences_input, preds)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Precision, Recall, AUC, BinaryAccuracy

# The model's output layer is linear, so predictions are logits. The loss is
# told so via from_logits=True, and the accuracy metric must threshold at 0.0
# (the logit of probability 0.5). The default threshold of 0.5 would be applied
# to the raw logits and misreport accuracy.
model.compile(
    loss=BinaryCrossentropy(from_logits=True),
    optimizer=Adam(learning_rate=0.001),
    metrics=[BinaryAccuracy(threshold=0.0)],
)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
# Multiply the learning rate by 0.2 once val_loss has not improved for 5 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5)
# Stop once val_loss has not improved for 8 epochs, and restore the weights of
# the best epoch. Without this the model would keep the weights from the last
# (stagnant) epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

In [None]:
# Train for up to 100 epochs, evaluating on the held-out split after each one.
# The callbacks lower the learning rate and stop early when val_loss stalls.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=100,
    validation_data=(x_val, y_val),
    callbacks=[reduce_lr, early_stop],
)