In [None]:
#Import Reqs
import pandas as pd
import numpy as np
import random
import nltk
import re
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split

import keras
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

from matplotlib import pyplot as plt


from keras.models import Sequential, Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, TimeDistributed, Activation, Dot, Reshape, Flatten 
from keras.utils.vis_utils import plot_model
from IPython.display import Image

from imblearn.under_sampling import RandomUnderSampler

In [2]:
#Initialize Hyperparameters
# Vocabulary cap handed to the Keras Tokenizer as num_words.
MAX_VOCAB_SIZE = 30000
# Number of leading tokens kept per example; also the padded sequence length.
MAX_SEN_LEN = 30

# Number of units in the LSTM layer.
LSTM_DIM = 128
# Width of each row of the embedding matrix / output size of the Embedding layer.
EMBEDDING_DIM = 100
# Mini-batch size used for model.fit and model.evaluate.
BATCH_SIZE = 32
# Number of training epochs passed to model.fit.
N_EPOCHS = 10

In [3]:
#Load Data Functions - taken from baseline model
from csv import DictReader

class DataSet():
    """One dataset split: the stance rows plus the article bodies they refer to.

    Attributes:
        stances:  list of row dicts read from ``<name>_stances.csv``; the
                  'Body ID' value of every row is converted to ``int``.
        articles: dict mapping integer body ID -> 'articleBody' text, read
                  from ``<name>_bodies.csv``.
    """

    def __init__(self, name="train", path="fnc-1"):
        """Load ``<name>_stances.csv`` and ``<name>_bodies.csv``.

        Args:
            name: file-name prefix of the split to load.
            path: accepted for compatibility with the baseline code but NOT
                used; both files are opened relative to the current working
                directory.

        Raises:
            OSError: if either CSV file cannot be opened.
            KeyError / ValueError: if a row lacks 'Body ID' (or 'articleBody')
                or the body ID is not an integer.
        """
        #self.path = path

        print("Reading dataset")
        bodies = name+"_bodies.csv"
        stances = name+"_stances.csv"

        self.stances = self.read(stances)
        articles = self.read(bodies)

        #make the body ID an integer value
        for s in self.stances:
            s['Body ID'] = int(s['Body ID'])

        #copy all bodies into a dictionary keyed by integer body ID
        self.articles = {
            int(article['Body ID']): article['articleBody'] for article in articles
        }

        print("Total stances: " + str(len(self.stances)))
        print("Total bodies: " + str(len(self.articles)))

    def read(self,filename):
        """Return every row of the CSV file `filename` as a list of dicts.

        The file is opened with ``newline=''`` as the csv module requires, so
        line breaks embedded in quoted fields (multi-line article bodies) reach
        the parser untouched instead of being rewritten by universal-newline
        translation.
        """
        with open(filename, "r", encoding='utf-8', newline='') as table:
            return list(DictReader(table))

In [4]:
#Pre-processing functions taken from baseline
# Single WordNet lemmatizer instance shared by every normalize_word() call.
_wnl = nltk.WordNetLemmatizer()
def normalize_word(w):
    """Lemmatize `w` with the shared lemmatizer, then lower-case the result."""
    lemma = _wnl.lemmatize(w)
    return lemma.lower()

def get_tokenized(s):
    """Split `s` with nltk.word_tokenize and return the normalized tokens as a list."""
    raw_tokens = nltk.word_tokenize(s)
    return list(map(normalize_word, raw_tokens))

def clean(s):
    """Return `s` reduced to its word-character runs, space-joined and lower-cased.

    Everything that does not match ``\\w`` (punctuation, whitespace, ...) acts
    as a separator and is dropped.
    """
    word_runs = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(word_runs)
    return joined.lower()

#def remove_stopwords(line):
#    # Removes stopwords from a list of tokens
#    return [word for word in line if word not in feature_extraction.text.ENGLISH_STOP_WORDS]


In [None]:
#Load & Preprocess Data
def _headline_body_tokens(headline, article):
    """Return the normalized tokens for a headline joined to its article body.

    The two texts are concatenated around the marker " ENDSEQUENCE ", passed
    through clean() and then through get_tokenized().
    """
    return get_tokenized(clean(headline + " ENDSEQUENCE " + article))


# ---- training split: headlines, body IDs, stance labels and token lists ----
trainingDataset = DataSet()

trainHeadlines_unaltered = []
trainBodyID = []
labels = []
trainingText = []

for stance_row in trainingDataset.stances:
    headline = stance_row['Headline']
    body_id = stance_row['Body ID']
    stance = stance_row['Stance']
    article = trainingDataset.articles[body_id]

    trainHeadlines_unaltered.append(headline)
    trainBodyID.append(body_id)
    labels.append(stance)
    trainingText.append(_headline_body_tokens(headline, article))


# ---- competition split: same pipeline, but no labels are collected ----
competitionDataset = DataSet("competition_test")
testHeadlines_unaltered = []
testBodyID_unaltered = []  # never filled in this cell; kept so the name stays defined
testBodyID = []
testText = []

for stance_row in competitionDataset.stances:
    headline = stance_row['Headline']
    body_id = stance_row['Body ID']
    article = competitionDataset.articles[body_id]

    testHeadlines_unaltered.append(headline)
    testBodyID.append(body_id)
    testText.append(_headline_body_tokens(headline, article))


# One row per competition stance; column keys are kept exactly as originally spelled.
test_columns = {
    'testHeadlines_unaltered': testHeadlines_unaltered,
    'BodyID': testBodyID,
    'compliledText': testText,
}
Test = pd.DataFrame(test_columns)

In [None]:
#Tokenizer
# Build the vocabulary from the first MAX_SEN_LEN tokens of every training example.
truncated_training_texts = []
for token_list in trainingText:
    truncated_training_texts.append(' '.join(token_list[:MAX_SEN_LEN]))

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(truncated_training_texts)

In [None]:
print("Number of words in vocabulary:", len(tokenizer.word_index))

# Encode the truncated training texts as integer ids, then pad/truncate at the
# end so every row has exactly MAX_SEN_LEN entries.
training_sentences = []
for token_list in trainingText:
    training_sentences.append(' '.join(token_list[:MAX_SEN_LEN]))
encoded_training = tokenizer.texts_to_sequences(training_sentences)
X = pad_sequences(encoded_training, maxlen=MAX_SEN_LEN, padding='post', truncating='post')

In [None]:
#Split into Train & Validation
# 80/20 split with a fixed seed. The split is stratified on the stance labels so
# both parts keep the same class proportions and every stance present in
# `labels` appears in the training AND the validation part (an unstratified
# split of imbalanced labels can leave a rare class under-represented).
# NOTE: X_test / y_test are the *validation* split, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels,
    test_size=0.2,
    random_state=10,
    stratify=labels,
)

In [None]:
#Optional, currently disabled: randomly undersample the training split
#(aimed at the over-represented "unrelated" class) to reduce class-imbalance bias
#under_sampler = RandomUnderSampler(random_state=42)
#X_train, y_train = under_sampler.fit_resample(X_train, y_train)

In [None]:
#Load Embeddings  - word2vec?

# Word2Vec expects an iterable of sentences, each a list of tokens, so it is
# trained on the tokenized training documents. Passing tokenizer.word_index
# (a dict) would be iterated as its keys, i.e. single word strings, and gensim
# would then treat every word as a "sentence" of characters and learn vectors
# for characters instead of words.
# The vector size is tied to EMBEDDING_DIM so the vectors fit the embedding
# matrix; the keyword is `vector_size` in gensim >= 4 and `size` in gensim 3.x.
try:
    embeddings = Word2Vec(trainingText, vector_size=EMBEDDING_DIM, min_count=2)
except TypeError:
    embeddings = Word2Vec(trainingText, size=EMBEDDING_DIM, min_count=2)

# One row per vocabulary word; available on both gensim 3.x and 4.x.
vocab_size = len(embeddings.wv.vectors)
print('Number of words in this w2v model:', vocab_size)
print('Dimension of w2v:', embeddings.vector_size)

In [None]:
#embedding matrix
if embeddings.vector_size != EMBEDDING_DIM:
    raise ValueError(
        "word2vec vectors have dimension %d but EMBEDDING_DIM is %d"
        % (embeddings.vector_size, EMBEDDING_DIM))

# Every row starts as small uniform noise; words without a word2vec vector keep it.
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, EMBEDDING_DIM))

# Look vectors up through the KeyedVectors object (`.wv`): indexing the Word2Vec
# model itself is deprecated in gensim 3.x and raises TypeError in gensim 4.x,
# which an `except KeyError` would not catch.
word_vectors = embeddings.wv
for word, i in tokenizer.word_index.items(): # i=0 is the embedding for the zero padding
    if word in word_vectors:
        embeddings_matrix[i] = word_vectors[word]

In [None]:
#One-Hot Encoding
# Fixed class order. It equals the alphabetical order pd.get_dummies would pick
# when all four stances are present, and the index -> stance mapping used when
# predictions are decoded.
STANCE_CLASSES = ['agree', 'disagree', 'discuss', 'unrelated']

def one_hot_stances(stance_labels):
    """One-hot encode stance strings into a DataFrame with one column per class.

    Columns always follow STANCE_CLASSES, even when a class does not occur in
    `stance_labels`, so separately encoded splits are guaranteed to line up.

    Args:
        stance_labels: iterable of stance strings, or an already encoded
            DataFrame (returned unchanged so the cell can be re-run safely).

    Returns:
        float32 DataFrame of shape (len(stance_labels), len(STANCE_CLASSES)).

    Raises:
        ValueError: if a label is not one of STANCE_CLASSES.
    """
    if isinstance(stance_labels, pd.DataFrame):
        return stance_labels
    as_categorical = pd.Categorical(stance_labels, categories=STANCE_CLASSES)
    if as_categorical.isna().any():
        raise ValueError("stance labels must all be one of %r" % (STANCE_CLASSES,))
    return pd.get_dummies(as_categorical, dtype="float32")

y_train = one_hot_stances(y_train)
y_test = one_hot_stances(y_test)


In [None]:
#Define + compile the LSTM model

# Frozen embedding layer initialised from embeddings_matrix; index 0 is masked as padding.
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index)+1,
    output_dim=EMBEDDING_DIM,
    weights=[embeddings_matrix],
    trainable=False,
    name='word_embedding_layer',
    mask_zero=True,
)

# Embedding -> LSTM (last state only) -> 4-unit tanh layer -> 4-way softmax.
# Alternatives tried for the hidden layer (left disabled):
#   Dense(4, activation='sigmoid', name='sig_layer')
#   Dense(4, activation='ReLU', name='relu_layer')
model = Sequential([
    embedding_layer,
    LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer'),
    Dense(4, activation='tanh', name='tanh_layer'),
    Dense(4, activation='softmax', name='output_layer'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Show the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
#Train Model + test on validation set
import time

fit_started_at = time.time()

# Fit on the training split; the held-out split is scored after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=N_EPOCHS,
    batch_size=BATCH_SIZE,
)

print("Total time: ", time.time() - fit_started_at, "seconds")

In [None]:
# Accuracy per epoch: training curve first, validation curve second.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['train', 'Validate'])
plt.show()

In [None]:
# Loss per epoch: training curve first, validation curve second.
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['train', 'Validate'])
plt.show()

In [None]:
#visualization for the report
# Write the architecture diagram to a PNG file, then display that file inline.
diagram_file = 'basic_lstm_classifier.png'
plot_model(model, to_file=diagram_file, show_shapes=True, show_layer_names=True)
Image(diagram_file)

In [None]:
#Prepare Prediction
#combinedText =[]
#for index,row in Test.iterrows():
#  x = row['Headline'] + " ENDSEQUENCE " + row['Article']
#  combinedText.append(x)


# Encode the competition texts exactly like the training texts: keep the first
# MAX_SEN_LEN tokens, map them to ids, pad/truncate at the end.
competition_sentences = []
for token_list in testText:
    competition_sentences.append(' '.join(token_list[:MAX_SEN_LEN]))
encoded_competition = tokenizer.texts_to_sequences(competition_sentences)
X_predict = pad_sequences(encoded_competition, maxlen=MAX_SEN_LEN, padding='post', truncating='post')

predictions = model.predict(X_predict)

In [None]:
# Map each prediction's highest-scoring output index to its stance name;
# any index other than 0, 1 or 2 is reported as "unrelated".
stance_by_index = {0: "agree", 1: "disagree", 2: "discuss"}
stances = [
    stance_by_index.get(int(np.argmax(pred)), "unrelated")
    for pred in predictions
]


In [None]:
# Assemble the submission table (one row per competition stance) and write it out.
headlines = Test['testHeadlines_unaltered']
# No trailing comma here: `x = value,` binds a 1-tuple, not the column itself.
bodyid = Test['BodyID']
answers = pd.DataFrame({'Headline': headlines,
                        'Body ID': bodyid,
                        'Stance': stances})

answers.to_csv('answer.csv', index=False, encoding='utf-8')

In [None]:
#Accuracy on test set
y_set = pd.read_csv('competition_test_stances.csv')

# NOTE: despite its name, y_pred holds the one-hot GROUND-TRUTH stances that the
# model's outputs are scored against. The class order is pinned explicitly so the
# columns line up with the model's output indices (0=agree, 1=disagree,
# 2=discuss, 3=unrelated) even if a class were absent from the file.
y_pred = pd.get_dummies(
    pd.Categorical(y_set['Stance'],
                   categories=['agree', 'disagree', 'discuss', 'unrelated']),
    dtype='float32')

# The labels are re-read from disk, so make sure they pair up row-for-row
# with the encoded competition inputs.
assert len(y_pred) == len(X_predict), "label rows and encoded test rows differ in count"

score, acc = model.evaluate(X_predict, y_pred,
                            batch_size=BATCH_SIZE)
print("Test loss:", score)
print("Test accuracy:", acc)