In [1]:
import os
import pydot
import graphviz
import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2
from tensorflow.keras import utils
from keras.layers import Convolution1D, MaxPooling1D, Flatten, Dense, Embedding, Activation, BatchNormalization

In [2]:
def ConvolutionalNet(vocabulary_size, embedding_dimension, input_length, embedding_weights=None):
    """Build the (uncompiled) 1-D convolutional binary classifier.

    Layer stack: frozen Embedding -> 3 x [Convolution1D(32 filters, width 2,
    L2-regularised) -> BatchNormalization -> ReLU] -> MaxPooling1D over all
    remaining time steps -> Flatten -> Dense(1) -> BatchNormalization -> sigmoid.

    Args:
        vocabulary_size: First argument of the Embedding layer (number of
            distinct token indices).
        embedding_dimension: Second argument of the Embedding layer (size of
            each embedding vector).
        input_length: Number of token indices per input sequence. Must be at
            least 4 so that one time step is left after the three width-2
            convolutions.
        embedding_weights: Optional initial embedding matrix, passed to the
            Embedding layer as ``weights=[embedding_weights]``. When ``None``
            the layer keeps its default initialisation.

    Returns:
        The assembled ``Sequential`` model. It is not compiled here.

    Raises:
        ValueError: If ``input_length`` is too short for the convolution stack.
    """
    conv_filters = 32
    conv_kernel_size = 2
    # One L2 factor per convolution block (the first block is regularised harder).
    conv_l2_factors = (0.005, 0.001, 0.001)

    # With Keras' default "valid" padding and stride 1, every width-k
    # convolution shortens the sequence by k - 1 steps. Pooling over exactly
    # what is left collapses the time axis to a single step. The original code
    # hard-coded 17 here, which is only correct for input_length == 20.
    pooled_steps = input_length - len(conv_l2_factors) * (conv_kernel_size - 1)
    if pooled_steps < 1:
        raise ValueError(
            "input_length=%r is too short: need at least %d time steps for %d "
            "convolutions of width %d"
            % (
                input_length,
                len(conv_l2_factors) * (conv_kernel_size - 1) + 1,
                len(conv_l2_factors),
                conv_kernel_size,
            )
        )

    model = Sequential()

    # NOTE(review): the embedding is frozen even when no pre-trained weights
    # are supplied, i.e. randomly initialised vectors are never updated.
    # Kept as in the original; confirm this is intended.
    embedding_kwargs = dict(input_length=input_length, trainable=False)
    if embedding_weights is not None:
        embedding_kwargs["weights"] = [embedding_weights]
    model.add(Embedding(vocabulary_size, embedding_dimension, **embedding_kwargs))

    for l2_factor in conv_l2_factors:
        model.add(Convolution1D(conv_filters, conv_kernel_size, kernel_regularizer=l2(l2_factor)))
        model.add(BatchNormalization())
        model.add(Activation("relu"))

    model.add(MaxPooling1D(pooled_steps))
    model.add(Flatten())

    model.add(Dense(1, kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Activation("sigmoid"))

    return model

In [3]:
# Number of token indices per sentence: used as `maxlen` when padding the
# sentences and as the model's `input_length`.
SEQUENCE_LENGTH = 20
# Size of each word-embedding vector, passed to the model as `embedding_dimension`.
EMBEDDING_DIMENSION = 30

def words_to_indices(inverse_vocabulary, words):
    """Translate each word into its index via ``inverse_vocabulary``.

    Args:
        inverse_vocabulary: Mapping from word to integer index.
        words: Iterable of words to look up, in order.

    Returns:
        A list with one index per input word, in the same order.

    Raises:
        KeyError: If a word is not present in ``inverse_vocabulary``.
    """
    indices = []
    for token in words:
        indices.append(inverse_vocabulary[token])
    return indices

# Fixed seed so that the shuffle and the train/test split are repeatable
# across "Restart Kernel -> Run All".
RANDOM_SEED = 42


def _load_padded_sequences(path, inverse_vocabulary, maxlen):
    """Read one sentence per line from `path` and return padded index sequences.

    Blank lines are skipped: `"...\\n".split("\\n")` ends with an empty string,
    which would otherwise become an all-padding row that still gets a label.
    """
    with open(path) as sentence_file:
        sentences = [line for line in sentence_file.read().split("\n") if line.strip()]
    return pad_sequences(
        [words_to_indices(inverse_vocabulary, sentence.split()) for sentence in sentences],
        maxlen=maxlen,
    )


if __name__ == "__main__":

    # The vocabulary is deliberately still split on "\n" without filtering, so
    # that word indices and len(vocabulary) stay exactly as before.
    # NOTE(review): presumably embeddings.npy was produced with the same
    # vocabulary list -- confirm.
    with open("../data/vocabulary.txt") as vocabulary_file:
        vocabulary = vocabulary_file.read().split("\n")
    inverse_vocabulary = {word: i for i, word in enumerate(vocabulary)}

    clickbait = _load_padded_sequences("../data/clickbait.preprocessed.txt", inverse_vocabulary, SEQUENCE_LENGTH)
    genuine = _load_padded_sequences("../data/genuine.preprocessed.txt", inverse_vocabulary, SEQUENCE_LENGTH)

    # Stack both classes; labels are a column vector: 1 = clickbait, 0 = genuine.
    X = np.concatenate([clickbait, genuine], axis=0)
    y = np.array([[1] * clickbait.shape[0] + [0] * genuine.shape[0]], dtype=np.int32).T

    # Shuffle samples and labels with the same (seeded) permutation.
    p = np.random.RandomState(RANDOM_SEED).permutation(y.shape[0])
    X = X[p]
    y = y[p]

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=RANDOM_SEED)

    embedding_weights = np.load("../models/embeddings.npy")
    # Fail early with a readable message instead of a shape error from inside Keras.
    expected_shape = (len(vocabulary), EMBEDDING_DIMENSION)
    if embedding_weights.shape != expected_shape:
        raise ValueError(
            "embeddings.npy has shape %r but the vocabulary/embedding settings require %r"
            % (embedding_weights.shape, expected_shape)
        )

    params = dict(
        vocabulary_size=len(vocabulary),
        embedding_dimension=EMBEDDING_DIMENSION,
        input_length=SEQUENCE_LENGTH,
        embedding_weights=embedding_weights,
    )
    model = ConvolutionalNet(**params)

In [4]:
# Draw the layer graph of `model` and save it as model.png.
# As the message this call emits says, it needs both the `pydot` package and a
# Graphviz installation to work.
tf.keras.utils.plot_model(
    model,
    to_file="model.png",
    # What to annotate on each node.
    show_shapes=True,
    show_dtype=False,
    show_layer_names=True,
    show_layer_activations=True,
    # Layout and rendering.
    rankdir="TB",
    dpi=96,
    # Which layers to include.
    expand_nested=True,
    layer_range=None,
)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')
