In [None]:
import os

import tensorflow as tf

# Create the TensorFlow session with the GPU `allow_growth` option switched on.
tf_config = tf.ConfigProto()
gpu_options = tf_config.gpu_options  # alias of the config's GPU sub-message
gpu_options.allow_growth = True
sess = tf.Session(config=tf_config)

In [None]:
import keras
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Optimizer
from keras import callbacks
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional,concatenate
from keras.models import Model
import matplotlib as mpl
%matplotlib inline
from matplotlib import pyplot as plt
from keras.utils import plot_model 
from keras.optimizers import Adam
from IPython.display import Image
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Seed NumPy's global random generator; np.random.uniform is used further down
# to initialise the embedding matrix.
# NOTE(review): only NumPy is seeded in the visible cells — no seed is set for
# TensorFlow/Keras here.
np.random.seed(17)


import sys
import csv
from sklearn.ensemble import GradientBoostingClassifier
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
from csv import DictReader
from csv import DictWriter
import codecs

from utils.system import parse_params, check_version

# Load data

In [None]:
# Read the four CSV files from ./fnc-1 (training and competition-test
# bodies/stances), in the same order as the names on the left-hand side.
train_bodies, train_stances, test_bodies, test_stances = (
    pd.read_csv(csv_path)
    for csv_path in (
        './fnc-1/train_bodies.csv',
        './fnc-1/train_stances.csv',
        './fnc-1/competition_test_bodies.csv',
        './fnc-1/competition_test_stances.csv',
    )
)

# Merge and concatenate

In [None]:
# Integer code for each stance label. The scoring cell later turns these
# integers back into strings with LABELS[int(...)], so the codes must follow
# the order of that list.
STANCE_TO_ID = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}

# Assign the result back instead of calling replace(..., True) on the column:
# `inplace` is keyword-only in pandas 2.x, and an in-place change on a column
# selection is not guaranteed to reach the parent frame. A dict-based replace
# leaves already-encoded values untouched, so re-running this cell is harmless.
train_stances['Stance'] = train_stances['Stance'].replace(STANCE_TO_ID)
# Attach each article body to its stance rows via the shared 'Body ID' column.
train_merge = pd.merge(train_stances, train_bodies, on='Body ID')

test_stances['Stance'] = test_stances['Stance'].replace(STANCE_TO_ID)
test_merge = pd.merge(test_stances, test_bodies, on='Body ID')

test_merge.head(10)

In [None]:
# --- Hyper-parameters ---------------------------------------------------------
MAX_SENT_LEN = 16        # headline length in tokens (truncate/pad target)
MAX_VOCAB_SIZE = 50000   # passed to Tokenizer(num_words=...)
LSTM_DIM = 100           # units of the LSTM wrapped by Bidirectional
EMBEDDING_DIM = 50       # width of the embedding matrix / Embedding layers
BATCH_SIZE = 128         # batch size handed to model.fit
N_EPOCHS = 1             # NOTE(review): not referenced by the visible training cell
MAX_SENT_LEN1 = 100      # article-body length in tokens (truncate/pad target)
MAX_VOCAB_SIZE1 = 50000  # NOTE(review): not referenced in the visible cells

# Coerce both text columns to str before tokenising.
test_merge['Headline'] = test_merge['Headline'].apply(str)
train_merge['Headline'] = train_merge['Headline'].apply(str)
test_merge['articleBody'] = test_merge['articleBody'].apply(str)
train_merge['articleBody'] = train_merge['articleBody'].apply(str)

# Shuffle the training rows with a fixed random_state.
# NOTE: re-running this cell shuffles the already shuffled frame again.
train_merge = train_merge.sample(frac=1, random_state=10)

# Split every headline / article body into a list of word tokens.
word_seq_train_stances = list(map(text_to_word_sequence, train_merge['Headline']))
word_seq_train_bodies = list(map(text_to_word_sequence, train_merge['articleBody']))

word_seq_test_stances = list(map(text_to_word_sequence, test_merge['Headline']))
word_seq_test_bodies = list(map(text_to_word_sequence, test_merge['articleBody']))

# Create a list for embedding

In [None]:
# Pool every tokenised text into one list — train headlines, train bodies,
# test headlines, test bodies, in that order. The tokenizer is fitted on it.
word_seq = [
    *word_seq_train_stances,
    *word_seq_train_bodies,
    *word_seq_test_stances,
    *word_seq_test_bodies,
]

In [None]:

# Report the 75th-percentile token count separately for training headlines and
# training article bodies. Each line carries its own label so the two numbers
# cannot be confused in the output.
headline_lengths = [len(seq) for seq in word_seq_train_stances]
body_lengths = [len(seq) for seq in word_seq_train_bodies]
print('75th Percentile Headline Length:', np.percentile(headline_lengths, 75))
print('75th Percentile Article Body Length:', np.percentile(body_lengths, 75))


# Pass the words through tokenizer

In [None]:
# Only carriage returns, tabs and newlines are filtered by the tokenizer; the
# texts were already split into word tokens by text_to_word_sequence.
filter_list = '\r\t\n'
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters=filter_list)
# word_seq is already a list of token lists, so it is passed as-is.
tokenizer.fit_on_texts(word_seq)
print("Number of words in vocabulary:", len(tokenizer.word_index))

# Training headlines -> integer id sequences: keep the first MAX_SENT_LEN
# tokens, then pad/truncate at the end ('post') to exactly MAX_SENT_LEN.
headline_texts_train = [' '.join(tokens[:MAX_SENT_LEN]) for tokens in word_seq_train_stances]
X = pad_sequences(
    tokenizer.texts_to_sequences(headline_texts_train),
    maxlen=MAX_SENT_LEN,
    padding='post',
    truncating='post',
)


In [None]:
# Training article bodies -> integer id sequences: keep the first MAX_SENT_LEN1
# tokens, then pad/truncate at the end ('post') to exactly MAX_SENT_LEN1.
body_texts_train = [' '.join(tokens[:MAX_SENT_LEN1]) for tokens in word_seq_train_bodies]
X1 = pad_sequences(
    tokenizer.texts_to_sequences(body_texts_train),
    maxlen=MAX_SENT_LEN1,
    padding='post',
    truncating='post',
)

In [None]:
# Test article bodies -> integer id sequences, prepared the same way as the
# training bodies (first MAX_SENT_LEN1 tokens, 'post' padding/truncation).
body_texts_test = [' '.join(tokens[:MAX_SENT_LEN1]) for tokens in word_seq_test_bodies]
X_test_bodies = pad_sequences(
    tokenizer.texts_to_sequences(body_texts_test),
    maxlen=MAX_SENT_LEN1,
    padding='post',
    truncating='post',
)

In [None]:
# Display the first three padded test-body sequences as a quick sanity check.
X_test_bodies[:3]

In [None]:
# Test headlines -> integer id sequences, prepared the same way as the
# training headlines (first MAX_SENT_LEN tokens, 'post' padding/truncation).
headline_texts_test = [' '.join(tokens[:MAX_SENT_LEN]) for tokens in word_seq_test_stances]
X_test_stances = pad_sequences(
    tokenizer.texts_to_sequences(headline_texts_test),
    maxlen=MAX_SENT_LEN,
    padding='post',
    truncating='post',
)

# Split the training data into train and validation sets

In [None]:
# Training targets: integer-encode the stance column, then one-hot encode it.
y = train_merge['Stance']
encoder_train = LabelEncoder().fit(y)
encoded_train = encoder_train.transform(y)
dummy_y_train = np_utils.to_categorical(encoded_train)

# Test targets stay as the raw 'Stance' column; the scoring cell indexes LABELS
# with them.
y_test = test_merge['Stance']

# The first 90% of the (already shuffled) training rows are used for fitting,
# the remaining rows for validation.
train_size = math.ceil(len(X) * 0.9)
train_size_end = math.ceil(len(X) * 0.2)  # NOTE(review): not used in the visible cells

X_train, X_val = X[:train_size], X[train_size:]
X1_train, X1_val = X1[:train_size], X1[train_size:]
y_train, y_val = dummy_y_train[:train_size], dummy_y_train[train_size:]

# Print the shapes: training arrays first, then validation arrays.
for split in (X_train, X1_train, y_train, X_val, X1_val, y_val):
    print(split.shape)

# import glove embedding

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

# GloVe vectors in their native text format, and the word2vec-format copy that
# is loaded afterwards.
# NOTE(review): the input is the 50d file although the output name says
# "500d"; the name is left unchanged so an existing converted file keeps working.
glove_input_file = 'd:/641/glove.twitter.27B.50d.txt'
word2vec_output_file = 'glove.500d.txt.word2vec'

# The conversion rewrites the whole embedding file, so skip it when a converted
# copy is already on disk. Delete that file to force a fresh conversion (for
# example after an interrupted run or a change of input file).
if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)



In [None]:
# Build the weight matrix for the Embedding layers, restricted to the words in
# our vocabulary. Every row starts as small uniform random noise; rows whose
# word has a pre-trained vector are then overwritten with that vector.
embeddings = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

vocab_rows = len(tokenizer.word_index) + 1  # +1 because row 0 is kept for the zero padding
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(vocab_rows, EMBEDDING_DIM))

for word, row in tokenizer.word_index.items():
    try:
        pretrained_vector = embeddings[word]
    except KeyError:
        # No pre-trained vector for this word: keep its random row.
        continue
    embeddings_matrix[row] = pretrained_vector

# The full pre-trained table is no longer needed once the matrix is filled.
del embeddings

In [None]:
from keras.models import Sequential

# Generate 2 models for headline and article body separately

In [None]:
def create_model2():
    """Build the headline branch of the network.

    The branch is a Sequential stack of a frozen (trainable=False) Embedding
    layer initialised from ``embeddings_matrix`` with ``mask_zero=True`` and
    input length ``MAX_SENT_LEN``, followed by two Dense layers (50 then 25
    units) that are created without an ``activation`` argument.

    Returns:
        The un-compiled Sequential model.
    """
    embedding = Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=EMBEDDING_DIM,
        weights=[embeddings_matrix],
        input_length=MAX_SENT_LEN,
        trainable=False,
        name='word_embedding_layer',
        mask_zero=True,
    )
    return Sequential([
        embedding,
        Dense(50, name='dense_layer'),
        Dense(25, name='output_layer1'),
    ])

In [None]:
def create_model3():
    """Build the article-body branch of the network.

    The branch is a Sequential stack of a frozen (trainable=False) Embedding
    layer initialised from ``embeddings_matrix`` with ``mask_zero=True`` and
    input length ``MAX_SENT_LEN1``, followed by two Dense layers (50 then 25
    units) that are created without an ``activation`` argument.

    Returns:
        The un-compiled Sequential model.
    """
    embedding = Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=EMBEDDING_DIM,
        input_length=MAX_SENT_LEN1,
        weights=[embeddings_matrix],
        trainable=False,
        name='word_embedding_layer1',
        mask_zero=True,
    )
    return Sequential([
        embedding,
        Dense(50, name='dense_layer1'),
        Dense(25, name='output_layer2'),
    ])

# Concatenate the outputs, then pass them through a BiLSTM layer, a dropout layer and a dense layer

In [None]:
# Instantiate the headline branch and the article-body branch.
model_stance1 = create_model2()
model_bodies1 = create_model3()

# Join the two branch outputs along axis 1.
branch_outputs = [model_stance1.output, model_bodies1.output]
combinedInput = keras.layers.Concatenate(axis=1)(branch_outputs)

# Classification head: BiLSTM (final output only, return_sequences=False),
# dropout, then a 4-unit softmax layer.
head_layers = (
    Bidirectional(LSTM(LSTM_DIM, return_sequences=False, name='Bidrectional_lstm1')),
    Dropout(rate=0.2, name='dropout_2'),
    Dense(4, activation="softmax"),
)
x = combinedInput
for head_layer in head_layers:
    x = head_layer(x)

model = Model(inputs=[model_stance1.input, model_bodies1.input], outputs=x)

In [None]:
# One-hot targets + softmax output -> categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Checkpoint callback: the file name embeds the epoch number and the validation
# loss; save_best_only=False, so saving is not restricted to the best val_loss.
filepath = "divide_then_merge_textsummarization_{epoch:02d}_{val_loss:.4f}.h5"
checkpoint = callbacks.ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=0,
    save_best_only=False,
)
callbacks_list = [checkpoint]

In [None]:
# Print the layer-by-layer summary of the combined model.
model.summary()

In [None]:
# Render the model graph (with layer names and shapes) to a PNG file, then
# display that file inline.
plot_model(model, to_file='bilstm_concate_model.png', show_layer_names=True, show_shapes=True)
Image('bilstm_concate_model.png')

# Train model

In [None]:
# Fit on the headline/body training arrays and evaluate on the held-out
# validation slice after each epoch.
# NOTE(review): the epoch count is the literal 40 here; the N_EPOCHS constant
# defined in the configuration cell is not what this call uses.
history = model.fit(
    x=[X_train, X1_train],
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=40,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=([X_val, X1_val], y_val),
)

# Weighted score

In [None]:
# Predict class probabilities for the competition test set, take the most
# likely class index per row, and translate the indices to label strings.
test_probabilities = model.predict([X_test_stances, X_test_bodies])
predicted_ids = np.argmax(test_probabilities, axis=-1)
predicted = [LABELS[int(class_id)] for class_id in predicted_ids]

# Gold labels: the integer stance codes of the test frame mapped through LABELS.
actual = [LABELS[int(stance_id)] for stance_id in y_test]

report_score(actual, predicted)

# accuracy and loss plot

In [None]:
# Plot the per-epoch curves recorded by model.fit: accuracy first, then loss.
#
# The second curve in each figure comes from the validation_data passed to
# model.fit (not from the competition test set), so it is labelled
# 'validation' rather than 'test'.
#
# Older Keras releases store accuracy under 'acc'/'val_acc'; Keras >= 2.3 uses
# 'accuracy'/'val_accuracy'. Use whichever key this history contains.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

for metric_key, metric_label in ((acc_key, 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title('model ' + metric_label)
    plt.ylabel(metric_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()