In [1]:
import json
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Conv2D

import gensim.downloader as api

from sklearn.model_selection import train_test_split

In [19]:
# Set constants

PATH = "data/"
NUM_FEATURES = 100

In [5]:
# Download Gensim data

word2vec = api.load("word2vec-google-news-300")



In [41]:
# Retrieve vocabulary from disk

print("Retrieving article data from disk...")
dataset = pd.read_csv(PATH + "dataset_clean.csv")
print(dataset)

Retrieving article data from disk...
                                          Article_Title  \
0     If anti-Sanders Democrats were serious, they’d...   
1     The Japanese prime minister is going to Pearl ...   
2     The best argument for each of the 2020 Democra...   
3     Instagram found a new place to show you ads: I...   
4                           My Week Without Apple Watch   
...                                                 ...   
6808  EpiPen Price Hikes Reportedly Added Millions T...   
6809  The First Year Home May Be Most Dangerous For ...   
6810  Mistakes That Fueled Ebola Spread Are Preventi...   
6811              Taylor Spear's GPS Guide On Self Care   
6812  Hacktivists Couldn't Have Pulled Off Cyber Att...   

                                           Article_Text         Publish_Date  \
0     democrats opposed to sen bernie sanders want y...  2020-02-29 20:00:00   
1     about six months ago president obama became th...  2016-12-27 14:20:01   
2     share al

In [42]:
# train, test = train_test_split(dataset)
# x_train = train.loc[:, "Article_Text"].values
# y_train = train.loc[:, "Label"].values
# x_test = test.loc[:, "Article_Text"].values
# y_test = test.loc[:, "Label"].values

# max_length = max([len(x.split()) for x in x_train])
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(x_train)
# sequences = tokenizer.texts_to_sequences(x_train)
# word_index = tokenizer.word_index
# x_train_pad = pad_sequences(sequences, maxlen=max_length)

# sequences_test = tokenizer.texts_to_sequences(x_test)
# x_test_pad = pad_sequences(sequences_test, maxlen=max_length)
    
# indices = np.arange(x_train_pad.shape[0])
# np.random.shuffle(indices)
# x_train_pad = x_train_pad[indices]
# y_train = y_train[indices]
    
# num_words = len(word_index) + 1
# embedding_matrix = np.zeros((num_words, NUM_FEATURES))
# for word, i in word_index.items():
#     if i > num_words or not word in word2vec.vocab:
#         continue
#     embedding_vector = word2vec[word]
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [43]:
# Prepare the data (split, tokenize, pad)

# Split data
print("Splitting data into train/test sets...")
train, test = train_test_split(dataset)
x_train = train.loc[:, "Article_Text"].values
y_train = train.loc[:, "Label"].values
x_test = test.loc[:, "Article_Text"].values
y_test = test.loc[:, "Label"].values

# Tokenize data
print("Tokenizing data...")
tokenizer = Tokenizer()
total_x = np.concatenate((x_train, x_test))
tokenizer.fit_on_texts(total_x)
print(tokenizer.word_index)

# Calculate some hyperparameters
print("Calculating hyperparameters (max_length and vocab_size)...")
max_length = max([len(x.split()) for x in total_x])
vocab_size = len(tokenizer.word_index) + 1
print("\t", "max_length", max_length)
print("\t", "vocab_size", vocab_size)

# Convert data to sequences
print("Converting data to sequences...")
x_train_tokens = tokenizer.texts_to_sequences(x_train)
x_test_tokens = tokenizer.texts_to_sequences(x_test)
print("\t", "x_train_tokens length", len(x_train_tokens))
print("\t", "x_test_tokens kength", len(x_test_tokens))

# Pad sequences
print("Padding sequences...")
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_length, padding="post")
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_length, padding="post")
print("\t", "x_train_pad shape", x_train_pad.shape)
print("\t", "x_test_pad shape", x_test_pad.shape)
print("Done!")

Splitting data into train/test sets...
Tokenizing data...
Calculating hyperparameters (max_length and vocab_size)...


	 max_length 10447
	 vocab_size 124599
Converting data to sequences...
	 x_train_tokens length 5109
	 x_test_tokens kength 1704
Padding sequences...
	 x_train_pad shape (5109, 10447)
	 x_test_pad shape (1704, 10447)
Done!


In [34]:
# Build and train the model

# Set constants
OUTPUT_SIZE = 128
BATCH_SIZE = 32
NUM_EPOCHS = 50
VALIDATION_STEPS = 10

# Batch and prefetch the dataset
tf_dataset = tf.data.Dataset.from_tensor_slices((x_train_pad, y_train))
tf_dataset = tf_dataset.batch(BATCH_SIZE)
tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Build the model
print("Building the model...")
model = Sequential()
model.add(Embedding(vocab_size, NUM_FEATURES, input_length=max_length))
# model.add(Embedding(num_words,
#                     NUM_FEATURES,
#                     weights=[embedding_matrix],
#                     input_length=max_length,
#                     trainable=False))
model.add(GRU(OUTPUT_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary(), "\n")

# Train the model
print("Training the model...")
history = model.fit(tf_dataset, epochs=NUM_EPOCHS, validation_data=(x_test_pad, y_test), validation_steps=VALIDATION_STEPS)
# history = model.fit(x_train_pad, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(x_test_pad, y_test))
print("Done")

Building the model...
Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 10447, 100)        12460500  
_________________________________________________________________
gru_22 (GRU)                 (None, 128)               88320     
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 129       
Total params: 12,548,949
Trainable params: 12,548,949
Non-trainable params: 0
_________________________________________________________________
None 

Training the model...
Train for 40 steps, validate on 1704 samples
Epoch 1/50
 1/40 [..............................] - ETA: 1:58:46

KeyboardInterrupt: 

In [246]:
# Evaluate the model

score, acc = model.evaluate(x_test_pad, y_test, batch_size=BATCH_SIZE)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.4593739266578968
Test accuracy: 0.83076924
