In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Conv1D, MaxPooling1D
from tensorflow.keras.layers import Embedding, BatchNormalization, Activation, Dropout, SpatialDropout1D

import gensim.downloader as api
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split

In [None]:
# Notebook-wide configuration

# Width of each word vector: the column count of the embedding matrix and the
# output dimension of the Embedding layer.
NUM_FEATURES = 50
# Directory prefix (including the trailing slash) prepended to data file names.
PATH = "data/"

In [None]:
# Load pre-trained word vectors through the Gensim downloader

# The vector width of the chosen model must equal NUM_FEATURES, because the
# embedding matrix built later has NUM_FEATURES columns. Alternatives that were
# tried here (change NUM_FEATURES to match when switching):
#   "glove-twitter-25", "glove-twitter-100", "word2vec-google-news-300"
GENSIM_MODEL_NAME = "glove-twitter-50"

word2vec = api.load(GENSIM_MODEL_NAME)

# Fail fast on a width mismatch instead of hitting a shape error later, when
# individual vectors are copied into the embedding matrix.
if word2vec.vector_size != NUM_FEATURES:
    raise ValueError(
        "Embedding width mismatch: " + GENSIM_MODEL_NAME + " has "
        + str(word2vec.vector_size) + " dimensions but NUM_FEATURES is "
        + str(NUM_FEATURES)
    )

In [None]:
# Load the cleaned article dataset (a CSV under PATH) into a DataFrame

print("Retrieving article data from disk...")
dataset = pd.read_csv(f"{PATH}dataset_clean.csv")
# Plain-text dump of the frame as a quick sanity check of what was loaded.
print(dataset)

In [None]:
# Split the data, tokenize and pad the articles, and (optionally) build the
# pre-trained embedding matrix.

USE_GENSIM = True
MAX_SEQ_LEN = 500
SPLIT_SEED = 42  # fixed seed so both splits below are identical on every run

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(dataset, test_size=0.2, random_state=SPLIT_SEED)
x_train = train.loc[:, "Article_Text"].values
y_train = train.loc[:, "Label"].values
x_test = test.loc[:, "Article_Text"].values
y_test = test.loc[:, "Label"].values
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=SPLIT_SEED
)

print("Tokenizing data...")
# NOTE(review): the vocabulary is fitted on all three splits. With frozen
# pre-trained vectors this lets validation/test-only words keep their vector;
# with a trainable embedding (USE_GENSIM = False) it exposes held-out
# vocabulary statistics to training -- consider fitting on x_train only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((x_train, x_valid, x_test)))

print("Converting to sequences...")
x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_valid_sequences = tokenizer.texts_to_sequences(x_valid)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

print("Padding sequences...")
# Pad or truncate every sequence to exactly MAX_SEQ_LEN token ids.
x_train_pad = pad_sequences(x_train_sequences, maxlen=MAX_SEQ_LEN)
x_valid_pad = pad_sequences(x_valid_sequences, maxlen=MAX_SEQ_LEN)
x_test_pad = pad_sequences(x_test_sequences, maxlen=MAX_SEQ_LEN)

print("\tTrain Sequence Shape:", x_train_pad.shape)
print("\tValid Sequence Shape:", x_valid_pad.shape)
print("\tTest Sequence Shape:", x_test_pad.shape)

# The vocabulary size is needed by the Embedding layer in both the Gensim and
# the non-Gensim configuration, so it is computed unconditionally. The +1 makes
# room for index 0, which the tokenizer's word_index never assigns to a word.
word_index = tokenizer.word_index
num_words = len(word_index) + 1

if USE_GENSIM:
    print("Retrieving word embeddings from Gensim...")
    # Row i holds the pre-trained vector of the word with tokenizer index i;
    # rows for words without a pre-trained vector stay all-zero.
    embedding_matrix = np.zeros((num_words, NUM_FEATURES))
    num_found = 0
    for word, i in word_index.items():
        # Membership test on the vectors object itself works on both Gensim
        # 3.x and 4.x (the `.vocab` attribute was removed in 4.0).
        if word in word2vec:
            embedding_matrix[i] = word2vec[word]
            num_found += 1
    print("\tWords with a pre-trained vector:", num_found, "of", len(word_index))

print("Done!")

In [None]:
# Build and train the model

BATCH_SIZE = 32
NUM_EPOCHS = 30
VALIDATION_STEPS = 10   # upper bound on validation batches evaluated per epoch
USE_CONVOLUTION = False
USE_BIDIRECTIONAL = True

# Shuffle (training only), batch and prefetch the datasets.
# Keras does not shuffle a tf.data.Dataset passed to fit(), so without the
# explicit shuffle every epoch would see the same batches in the same order.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train_pad, y_train))
train_dataset = train_dataset.shuffle(len(x_train_pad))
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid_pad, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE)
valid_dataset = valid_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Never ask for more validation batches than the validation set can supply.
num_valid_batches = -(-len(x_valid_pad) // BATCH_SIZE)  # ceiling division
validation_steps = min(VALIDATION_STEPS, num_valid_batches)

# Build the model
print("Building the model...")
model = Sequential()

if USE_GENSIM:
    # Frozen embedding layer initialised from the pre-trained vectors.
    model.add(Embedding(num_words, NUM_FEATURES, input_length=MAX_SEQ_LEN, weights=[embedding_matrix], trainable=False))
else:
    # Embedding learned from scratch together with the rest of the network.
    model.add(Embedding(num_words, NUM_FEATURES, input_length=MAX_SEQ_LEN))

if USE_CONVOLUTION:
    # Optional convolutional front end ahead of the recurrent layer.
    model.add(Conv1D(128, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.3))

if USE_BIDIRECTIONAL:
    model.add(Bidirectional(LSTM(50, dropout=0.5, recurrent_dropout=0.5)))
else:
    model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5))

# Single sigmoid unit: binary classification trained with binary cross-entropy.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print().
model.summary()
print()

# Train the model
print("Training the model...")
history = model.fit(train_dataset, epochs=NUM_EPOCHS, validation_data=valid_dataset, validation_steps=validation_steps)
print("Done")

In [None]:
# Graph the model history

def _plot_training_curve(history, metric, label):
    """Plot the per-epoch training and validation curves of one metric.

    Args:
        history: the object returned by ``model.fit``; its ``history`` dict
            must contain the keys ``metric`` and ``"val_" + metric``.
        metric: key of the training series, e.g. ``"accuracy"`` or ``"loss"``.
        label: human-readable name used in the title and on the y axis.
    """
    plt.plot(history.history[metric], color='blue')
    plt.plot(history.history['val_' + metric], color='green')
    plt.title('Training vs. Validation ' + label)
    plt.ylabel(label)
    plt.xlabel('Epoch')
    plt.legend(['Training', 'Validation'], loc='upper left')
    plt.show()


# Accuracy is a fraction in [0, 1], so it is reported as a percentage.
print("Training Accuracy:", str(round(history.history['accuracy'][-1] * 100, 2)) + "%")
print("Validation Accuracy:", str(round(history.history['val_accuracy'][-1] * 100, 2)) + "%")
_plot_training_curve(history, 'accuracy', 'Accuracy')

# Cross-entropy loss is not a percentage, so it is reported as the raw value.
print("Training Loss:", round(history.history['loss'][-1], 4))
print("Validation Loss:", round(history.history['val_loss'][-1], 4))
_plot_training_curve(history, 'loss', 'Loss')

In [None]:
# Evaluate the model on the test set

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score, acc = model.evaluate(x_test_pad, y_test, batch_size=BATCH_SIZE)
print('Test accuracy:', str(round(acc * 100, 2)) + "%")
# Cross-entropy loss is not a percentage, so it is reported as the raw value.
print('Test loss:', round(score, 4))