In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from gensim.parsing.porter import PorterStemmer
from sklearn.model_selection import train_test_split
import time

# The bundled "seaborn" style sheet was renamed to "seaborn-v0_8" in Matplotlib 3.6 and
# the old name was removed in later releases; use whichever name this install provides.
plt.style.use(style="seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
%matplotlib inline
# nltk.download('popular')

In [None]:
# Load the base training corpus and give its two columns stable names.
# NOTE(review): read_csv treats the first row as a header by default; if fulltrain.csv
# has no header row, that first sample is consumed as column names -- confirm.
train_set = pd.read_csv('raw_data/fulltrain.csv')
train_set.columns = ['label', 'text']

In [None]:
# Preview the first five rows, transposed so each sample is shown as a column.
train_set.head().T

In [None]:
# Count the number of texts under each label to check the class balance.
train_set.groupby('label').count()

#Add more reliable news training data

In [None]:
# Load the extra corpus and keep only the columns that train_set also has.
other_raw = pd.read_csv('raw_data/other_train.csv').drop(columns=['id', 'title', 'author'])

# Split on the ORIGINAL labels before relabelling. Previously the fake subset was taken
# from a frame that had already been filtered to label 0, so it was always empty.
# .copy() makes the relabelling below operate on independent frames (no chained-assignment
# warnings, no accidental writes into other_raw).
other_train = other_raw[other_raw['label'] == 0].copy()
other_train_fake = other_raw[other_raw['label'] == 1].copy()

# Map the source labels onto this notebook's label scheme: 0 -> 4 and 1 -> 2.
# NOTE(review): the section headings suggest 4 = reliable news and 2 = hoax -- confirm.
other_train['label'] = 4
other_train_fake['label'] = 2

# Append the first 7000 label-4 rows and all relabelled label-2 rows to train_set.
train_set = pd.concat([train_set, other_train.head(7000), other_train_fake])

#Adding more data to hoax

In [None]:
# Read the extra fake-news articles and keep only the article body.
# only takes US_News and Middle-east
# hoax_train = hoax_train[(hoax_train['subject'] == 'US_News') | (hoax_train['subject'] == 'Middle-east') | (hoax_train['subject'] == 'News')]
hoax_train = pd.read_csv('raw_data/Fake.csv').drop(columns=['title', 'subject', 'date'])

# Tag every row with label 2.
hoax_train['label'] = 2

# Reverse the row order, then renumber from 0; reset_index() keeps the old positions
# in a helper column named 'index'.
hoax_train = hoax_train.iloc[::-1].reset_index()

# Append ALL hoax rows to train_set (no row limit is applied here) and discard the
# helper 'index' column again.
train_set = pd.concat([train_set, hoax_train]).drop(columns=['index'])

In [None]:
# Re-check the per-label counts after appending the extra datasets.
train_set.groupby('label').count()

#Define utility functions

In [None]:
import string

# Translation table that deletes every ASCII punctuation character except '!' and '?'.
# Built once here instead of on every call, and spelled out by character rather than by
# slice positions inside string.punctuation.
_PUNCTUATION_TO_STRIP = str.maketrans(
    '', '', ''.join(ch for ch in string.punctuation if ch not in '!?')
)


def remove_punctuations(text):
    """Strip ASCII punctuation from ``text`` while keeping '!' and '?'.

    Args:
        text (str): Input string.

    Returns:
        str: ``text`` with every character of ``string.punctuation`` removed,
        except exclamation marks and question marks.
    """
    return text.translate(_PUNCTUATION_TO_STRIP)

# Lazily-filled cache of the English stop-word list, so the corpus is read only once
# instead of once per document.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text):
    """Lowercase ``text`` and drop English stop words.

    Args:
        text (str): Input string; it is split on whitespace.

    Returns:
        str: The remaining lowercased words joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # Lowercase each word once, then filter against the cached set.
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in _ENGLISH_STOP_WORDS)

def transform_lower(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def pad_sequences_and_truncate(sequence, max_len):
    """Force a single list to exactly ``max_len`` items.

    Longer lists are cut after ``max_len`` items; shorter ones are right-padded
    with zeros. Note that a later cell in this notebook defines another function
    with the same name (operating on a list of sequences), which replaces this
    one once that cell has run.
    """
    missing = max_len - len(sequence)
    if missing < 0:
        return sequence[:max_len]
    return sequence + [0] * missing

def perform_stemming(text):
    """Stem every whitespace-separated word of ``text`` with a Porter stemmer.

    Returns the stemmed words joined by single spaces.
    """
    stemmer = PorterStemmer()
    return " ".join(map(stemmer.stem, text.split()))


#Perform preprocessing

In [None]:
# Run the cleaning steps over the text column, one after another, in this exact order.
for step in (remove_punctuations, transform_lower, remove_stopwords, perform_stemming):
    train_set['text'] = train_set.text.map(step)

In [None]:
# Character length of every preprocessed text.
len_array = np.asarray(train_set['text'].str.len())

# Count how many texts share each length in one vectorised pass instead of a
# Python-level loop over a float-initialised dict.
lengths, counts = np.unique(len_array, return_counts=True)
len_dict = dict(zip(lengths, counts))

# plot the distribution (axes are clipped so the bulk of the data stays readable)
plt.bar(lengths, counts)
plt.xlim([0, 15000])
plt.ylim([0, 50])
plt.xlabel('Text Length')
plt.ylabel('Number of Texts')
plt.title('Distribution of Text Length')
plt.show()

#Split the text into paragraphs of 100 words

In [None]:
# split the text into paragraphs of 100 words each

# print("Initial length of train_set: ", len(train_set))

# paragraphs = []
# labels = []

# for i in range(len(train_set['text'].values)):
#     text = train_set['text'].values[i]
#     cur = text.split()
#     cur_label = train_set['label'].values[i]
#     for j in range(0, len(cur), 100):
#         if len(cur) < 100:
#             paragraphs.append(" ".join(cur))
#         else:
#             paragraphs.append(" ".join(cur[j:j+100]))
#         labels.append(cur_label)

# train_set = pd.DataFrame({'text': paragraphs, 'label': labels})

# print("Final length of train_set: ", len(train_set))
    

In [None]:
# print("Paragraph length: ", len(train_set['text'].values[23000].split()))

#Check result

In [None]:
# Display the preprocessed text column as a sanity check.
train_set.text

#Find max length of input data after preprocessing


In [None]:
# Word count (whitespace tokens) of every preprocessed training text.
train_word_counts = [len(doc.split()) for doc in train_set.text]

maxlen = max(train_word_counts, default=-1)
sumlen = sum(train_word_counts)
avglen = sumlen/len(train_set.text)
print("Maximum sentence lenth: ", maxlen)
print("Average sentence lenth: ", avglen)

#Create corpus


In [None]:
from nltk.tokenize import word_tokenize

def create_corpus_tk(df):
    """Tokenise the ``text`` column of ``df`` into lowercased word lists.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        list[list[str]]: One list of lowercased ``word_tokenize`` tokens per row,
        in row order.
    """
    # Read from the frame that was passed in; the previous version ignored `df`
    # and always tokenised the global `train_set`.
    return [
        [token.lower() for token in word_tokenize(text)]
        for text in df['text']
    ]

In [None]:
# Tokenise every training text.
corpus = create_corpus_tk(train_set)
# NOTE(review): len(corpus) is the number of tokenised texts (one list per row), not the
# number of distinct words, even though it is stored as `num_words` and later passed to
# the Keras Tokenizer as its vocabulary cap.
num_words = len(corpus)
print(num_words)

In [None]:
# Inspect the token list of the first text.
corpus[0]

##Train/Test split

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length (in tokens) that every model input is padded/truncated to.
padding_len = 1000

In [None]:
# Build the word -> integer index from every text in train_set.
# NOTE(review): the tokenizer is fitted before the train/held-out split, so held-out
# texts also contribute to the vocabulary.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_set['text'])

In [None]:
# Features are the raw preprocessed strings; targets are one-hot encoded labels.
X = train_set['text'].values
Y = pd.get_dummies(train_set['label']).values
# train_test_split returns (X_train, X_test, y_train, y_test) in that order, so the
# names below are misleading:
#   X_train         -> training texts
#   Y_train         -> held-out texts (NOT labels)
#   X_train_labels  -> one-hot labels of the training texts
#   Y_train_labels  -> one-hot labels of the held-out texts
X_train, Y_train, X_train_labels, Y_train_labels = train_test_split(X, Y, test_size = 0.1, random_state = 42)

In [None]:
# Shape of the training texts and of the held-out labels (see the naming note on the split).
print(X_train.shape)
print(Y_train_labels.shape)

In [None]:
# Replace each training text by its list of integer word indices.
X_train = tokenizer.texts_to_sequences(X_train)


In [None]:
# Show only the first few encoded texts; displaying the whole list floods the output.
X_train[:3]

In [None]:
# Y_train holds the held-out TEXTS (second value returned by train_test_split), so it
# is converted to integer sequences just like X_train.
Y_train = tokenizer.texts_to_sequences(Y_train)

In [None]:
def pad_sequences_and_truncate(sequences, max_len):
    """Bring every list in ``sequences`` to ``max_len`` items.

    * At most ``max_len`` items: right-pad with zeros.
    * More than ``max_len`` but fewer than ``2 * max_len`` items: keep the first ``max_len``.
    * ``2 * max_len`` items or more: keep the head and the tail of the sequence
      (``max_len // 2`` leading items plus the trailing remainder).

    Returns a new list of lists; the input lists are not modified.
    """
    def _fit(seq):
        size = len(seq)
        if size <= max_len:
            return seq + [0] * (max_len - size)
        if size < 2 * max_len:
            return seq[:max_len]
        # take the first and last max_len/2 words
        return seq[:max_len//2] + seq[-max_len//2:]

    return [_fit(seq) for seq in sequences]


In [None]:
# Apply the custom truncation / zero padding to the training sequences (X_train)
# and to the held-out sequences (Y_train).
X_train_padded_raw = pad_sequences_and_truncate(X_train, padding_len)
Y_train_padded_raw = pad_sequences_and_truncate(Y_train, padding_len)

# transform to numpy ndarray otherwise memory error
# (the lists were already brought to padding_len by pad_sequences_and_truncate above)
X_train_padded = pad_sequences(X_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')
Y_train_padded = pad_sequences(Y_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')

# Alternative kept for reference: let Keras do plain post-truncation on the raw sequences.
# X_train_padded = pad_sequences(X_train, padding='post', maxlen=padding_len, truncating='post')
# Y_train_padded = pad_sequences(Y_train, padding='post', maxlen=padding_len, truncating='post')

In [None]:
# Confirm the padded training data is an array and check its shape.
type(X_train_padded), X_train_padded.shape

In [None]:
# Compare the raw and padded length of one sample (index 11).
len(X_train[11]), len(X_train_padded[11])


#Create word embedding using gloVe

In [None]:
# Parse the GloVe text file into {word: float32 vector}. Each line is expected to be
# "<word> <v1> <v2> ...".
embedding_dict = {}
with open("glove.twitter.27B.200d.txt", "r", encoding="UTF-8") as f:
  for line in f:
    values = line.split()
    if not values:
      # skip blank lines instead of failing on values[0]
      continue
    word = values[0]
    vectors = np.asarray(values[1:], "float32")
    embedding_dict[word] = vectors
# No explicit f.close(): the `with` block has already closed the file.

In [None]:
# Show the vocabulary size and one sample entry; displaying the whole dictionary
# would flood the output with every loaded vector.
len(embedding_dict), next(iter(embedding_dict.items()), None)

In [None]:
# Row i of the matrix holds the GloVe vector of the word with tokenizer index i;
# rows for words missing from GloVe stay all-zero.
word_index = tokenizer.word_index
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 200))
for word, i in word_index.items():
  if i >= num_words:
    continue
  vector = embedding_dict.get(word)
  if vector is None:
    continue
  embedding_matrix[i] = vector

In [None]:
# Spot-check the embedding row stored for tokenizer index 10.
embedding_matrix[10]

# Prepare test set

In [None]:
# Load the evaluation set, name its two columns, and display the per-label counts.
# NOTE(review): read_csv treats the first row as a header by default; if the file has
# no header row, the first sample is lost -- confirm.
test = pd.read_csv('raw_data/balancedtest.csv')
test.columns = ['label', 'text']
test.groupby('label').count()

In [None]:
# Apply the same cleaning steps, in the same order, that were applied to train_set.
for step in (remove_punctuations, transform_lower, remove_stopwords, perform_stemming):
    test['text'] = test.text.map(step)

# Word count (whitespace tokens) of every preprocessed test text.
test_word_counts = [len(doc.split()) for doc in test.text]

maxlen = max(test_word_counts, default=-1)
sumlen = sum(test_word_counts)
avglen = sumlen/len(test.text)
print("Maximum sentence lenth: ", maxlen)
print("Average sentence lenth: ", avglen)

In [None]:
# Encode the test texts with the tokenizer fitted on train_set, then bring them to
# padding_len with the same two-step padding that is used for the training data.
test_sequences = tokenizer.texts_to_sequences(test['text'].values)
test_sequences = pad_sequences_and_truncate(test_sequences, padding_len)
X_test = pad_sequences(test_sequences, maxlen=padding_len, truncating='post', padding='post')

# One-hot encode the test labels.
Y_test = pd.get_dummies(test['label']).values

# Implement Custom Early Stopping Mechanism

In [None]:
class EarlyStopping:
    """Track a test-error series and stop training once it stops improving.

    An instance is called with the current test error and the model. The weights
    of the best call seen so far are kept in ``best_weights``; when ``patience``
    consecutive calls bring no improvement, ``model.stop_training`` is set to True
    and those weights are written back to the model.
    """

    def __init__(self, patience=7, verbose=False, delta=0, trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time test error improved.
            verbose (bool): If True, prints a message for each test error improvement.
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
            trace_func (function): trace print function.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0            # consecutive calls without improvement
        self.best_score = None      # best (negated) test error so far
        self.test_err_min = np.inf  # np.Inf alias no longer exists in NumPy 2
        self.delta = delta
        self.best_weights = None    # weights captured at the best call
        self.trace_func = trace_func

    def __call__(self, test_err, model):
        """Record ``test_err`` and stop/restore ``model`` when patience is exhausted.

        Args:
            test_err (float): Current test error (lower is better).
            model: Object exposing ``get_weights()``, ``set_weights()`` and a
                writable ``stop_training`` attribute.
        """
        # Negate so that "larger score" means "better".
        score = -test_err

        if self.best_score is None:
            # First call: whatever we see is the best so far.
            self.best_score = score
            self.save_checkpoint(test_err, model)
        elif score < self.best_score + self.delta:
            # No (sufficient) improvement.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            self.trace_func(f'Current Best Accuracy: {1 + self.best_score}')
            if self.counter >= self.patience:
                model.stop_training = True
                self.trace_func("Restoring model weights from the end of the best epoch.")
                model.set_weights(self.best_weights)
        else:
            self.best_score = score
            self.save_checkpoint(test_err, model)
            self.counter = 0

    def save_checkpoint(self, test_err, model):
        """Snapshot the model weights and remember ``test_err`` as the new minimum."""
        if self.verbose:
            self.trace_func(f'Test error decreased ({self.test_err_min:.6f} --> {test_err:.6f}). Saving model...')
        # Always capture the weights; this used to happen only when verbose=True,
        # so a quiet instance restored the weights of the very first call.
        self.best_weights = model.get_weights()
        self.test_err_min = test_err

In [None]:
import keras

class EarlyStoppingCustom(keras.callbacks.Callback):
    """Keras callback that feeds the per-epoch test error to ``early_stopping``.

    Relies on the notebook-level names ``X_test``, ``Y_test`` and ``early_stopping``.
    NOTE(review): the stopping decision is driven by the same X_test/Y_test data that
    is later used for the final evaluation -- confirm this is intended.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the test set and let ``early_stopping`` decide whether to stop.

        ``logs`` defaults to None rather than a shared mutable dict; it is not used here.
        """
        # evaluate() returns [loss, acc] for the metrics the model was compiled with,
        # so index 1 is the accuracy and 1 - accuracy is the error.
        test_err = 1 - self.model.evaluate(X_test, Y_test)[1]
        early_stopping(test_err, self.model)
        if self.model.stop_training:
            print("Early stopping at epoch, ", epoch + 1)

#Implementing LSTM baseline model (stacked Conv1D + bidirectional LSTM blocks)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, GRU, CuDNNLSTM
from keras.layers import Bidirectional, LeakyReLU, Activation
from keras import regularizers
import tensorflow as tf
import tensorflow_addons as tfa

# Hyper-parameters
filters = 100
kernel_size = 5
lstm_units = 32
embed_dim = 200
epochs = 100

early_stopping = EarlyStopping(patience=15, verbose=True)

# opt = tfa.optimizers.AdamW(learning_rate=0.0001, weight_decay=0.001)

model = Sequential()

# Frozen embedding layer initialised from the pre-built embedding matrix.
model.add(Embedding(input_dim=num_words, output_dim=embed_dim, weights=[embedding_matrix], input_length=padding_len, trainable=False))

# Three identical stages: dropout -> Conv1D -> LeakyReLU -> max-pool -> bidirectional LSTM.
# Only the last recurrent layer drops the time axis (return_sequences=False).
# A non-CuDNN variant that was tried instead of CuDNNLSTM:
#   Bidirectional(LSTM(lstm_units, dropout=0.5, recurrent_dropout=0.5, return_sequences=...))
for keep_sequence in (True, True, False):
    model.add(SpatialDropout1D(0.5))
    model.add(Conv1D(filters, kernel_size=kernel_size, kernel_regularizer=regularizers.l2(0.00001), padding='same'))
    model.add(LeakyReLU(alpha=0.2))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Bidirectional(CuDNNLSTM(lstm_units, return_sequences=keep_sequence)))

# Dense classification head ending in a 4-way softmax.
model.add(Dense(50, input_shape=(lstm_units,)))
model.add(Activation('relu'))
model.add(Dense(16, input_shape=(50,)))
model.add(Activation('relu'))
model.add(Dense(4, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

#Check GPU info and availability

In [None]:
# Print the list of local devices reported by TensorFlow.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
# Check whether this TensorFlow build was compiled with CUDA support.
tf.test.is_built_with_cuda()

In [None]:
# List the physical GPU devices TensorFlow reports.
tf.config.list_physical_devices('GPU')

In [None]:
# Train on the padded training sequences. Despite its name, X_train_labels holds the
# one-hot labels of the training texts (third value returned by train_test_split).
# validation_split=0.1 is passed to Keras in addition to the EarlyStoppingCustom
# callback, which evaluates on X_test/Y_test after every epoch.
history = model.fit(
    X_train_padded,
    X_train_labels,
    epochs=epochs,
    validation_split=0.1,
    verbose=1,
    batch_size=128,
    shuffle=True,
    callbacks=[EarlyStoppingCustom()]
)

##Final Evaluation

In [None]:
# evaluate() returns the loss followed by the compiled metrics (here: accuracy).
accr = model.evaluate(X_test, Y_test)
test_loss, test_acc = accr[0], accr[1]
print('Test set without paragraphs\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_acc))

#Calculates the macro-averaged F1-score over all classes

In [None]:
from sklearn.metrics import f1_score

# Class probabilities predicted for every test sample.
predict_x = model.predict(X_test)

# Collapse probability rows / one-hot rows to class indices.
classes_x = predict_x.argmax(axis=1)
actual_x = Y_test.argmax(axis=1)

# Unweighted mean of the per-class F1 scores.
f1_score(actual_x, classes_x, average='macro')