In [12]:
import pandas as pd
import numpy as np
import os
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical,plot_model

from keras.models import Input,Model,Sequential
from keras.layers import LSTM,Embedding,Dropout,Activation,Reshape,Dense,GRU,Add,Flatten,concatenate

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences


# Dataset Preparation

In [3]:
# Preview the first rows of the stance dataset.
# NOTE(review): `data` is not created in any cell visible in this section -- it
# presumably comes from an earlier load cell; confirm it is defined on a fresh
# Restart & Run All.
data.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,stance_cat,stance_base,jaccard_similarity
0,police find mass graves least 15 bodies near m...,712,unrelated,danny boyle directing untitled film seth rogen...,3,unrelated,0.0
1,hundreds palestinians flee floods gaza israel ...,158,agree,hundreds palestinians evacuated homes sunday m...,0,related,79.545455
2,christian bale passes role steve jobs actor re...,137,unrelated,30 year old moscow resident hospitalized wound...,3,unrelated,0.0
3,hbo apple talks 15 month apple tv streaming se...,1034,unrelated,reuters canadian soldier shot canadian war mem...,3,unrelated,0.0
4,spider burrowed tourist stomach chest,1923,disagree,fear arachnophobes story bunbury spiderman mig...,1,related,28.301887


In [4]:
# Stack every headline followed by every article body into one flat array so a
# single vocabulary can be built over both text columns.
corpus = np.r_[data['Headline'].values, data['articleBody'].values]

# Sanity check: expected size (two text columns per row) vs. actual size.
# The expected value is derived from the frame instead of a hard-coded row count.
print(len(data) * 2)
print(len(corpus))  # the first len(data) entries are Headlines, the rest are articleBody texts

# Unique space-separated tokens over the whole corpus. Building the set
# directly avoids materialising one huge intermediate list of all tokens.
vocabulary = list({token for sentence in corpus for token in sentence.split(' ')})
vocab_length = len(vocabulary)
print("Vocabulary Length is {0}".format(vocab_length))


99944
99944
Vocabulary Length is 23226


In [5]:
# Hyper-parameters shared by the cells below.
max_features = 5_000        # passed as `num_words` to the Keras Tokenizer
max_nb_words = 24_000       # only referenced from commented-out code in this section
EMBEDDING_DIM = 50          # width of the embedding matrices (glove.6B.50d vectors)
MAX_SEQUENCE_LENGTH = 64    # padded length used by the one-hot baseline

# BASELINE - ONE HOT ENCODING

In [6]:

# Baseline features: hash every token to an integer id with Keras' `one_hot`
# (hash space of size vocab_length), then pad each document at the end to
# MAX_SEQUENCE_LENGTH ids.
headline_texts = data['Headline'].tolist()
encoded_docs_headline = [one_hot(text, vocab_length) for text in headline_texts]
padded_docs_headline = pad_sequences(
    encoded_docs_headline, maxlen=MAX_SEQUENCE_LENGTH, padding='post'
)

body_texts = data['articleBody'].tolist()
encoded_docs_body = [one_hot(text, vocab_length) for text in body_texts]
padded_docs_body = pad_sequences(
    encoded_docs_body, maxlen=MAX_SEQUENCE_LENGTH, padding='post'
)

# One-hot targets built from the integer stance codes in `stance_cat`.
labels = to_categorical(data['stance_cat'])


In [7]:
# Baseline two-branch model: each text input gets its own trainable embedding
# followed by a per-timestep Dense layer; the two branches are concatenated,
# flattened and classified into the 4 stance classes.
input_headline = Input(shape=[MAX_SEQUENCE_LENGTH], name='input_headline')
embedding_headline = Embedding(
    vocab_length, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH
)(input_headline)
dense_headline = Dense(16, activation='relu')(embedding_headline)

input_body = Input(shape=[MAX_SEQUENCE_LENGTH], name='input_body')
embedding_body = Embedding(
    vocab_length, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH
)(input_body)
dense_body = Dense(16, activation='relu')(embedding_body)

# NOTE: despite its name this is a concatenation, not an element-wise addition.
addition_layer = concatenate([dense_body, dense_headline])
flatten = Flatten()(addition_layer)
# Softmax (not sigmoid): the labels are one-hot over mutually exclusive classes
# and the loss is categorical cross-entropy, which expects the outputs to form
# a probability distribution over the classes.
output = Dense(4, activation='softmax')(flatten)

model_combined = Model(inputs=[input_headline, input_body], outputs=output)

model_combined.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [8]:
# Print the layer-by-layer summary (output shapes, parameter counts, wiring)
# of the baseline model.
model_combined.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_body (InputLayer)         [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_headline (InputLayer)     [(None, 64)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 64, 50)       1161300     input_body[0][0]                 
__________________________________________________________________________________________________
embedding (Embedding)           (None, 64, 50)       1161300     input_headline[0][0]             
______________________________________________________________________________________________

In [9]:
# Positional 80/20 hold-out: the first 80% of rows train the model, the
# remaining rows are used for validation. No shuffling happens here.
headline_cut = int(len(padded_docs_headline) * 0.8)
padded_docs_headline_train, padded_docs_headline_test = (
    padded_docs_headline[:headline_cut, :],
    padded_docs_headline[headline_cut:, :],
)

body_cut = int(len(padded_docs_body) * 0.8)
padded_docs_body_train, padded_docs_body_test = (
    padded_docs_body[:body_cut, :],
    padded_docs_body[body_cut:, :],
)

labels_cut = int(len(labels) * 0.8)
labels_train, labels_test = labels[:labels_cut, :], labels[labels_cut:, :]


In [10]:
# Train the baseline for 5 epochs, validating on the held-out rows each epoch.
baseline_train_inputs = [padded_docs_headline_train, padded_docs_body_train]
baseline_val_inputs = [padded_docs_headline_test, padded_docs_body_test]
model_combined.fit(
    x=baseline_train_inputs,
    y=labels_train,
    validation_data=(baseline_val_inputs, labels_test),
    epochs=5,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fba5de614c0>

# Bi-Directional GRU (note: the model defined below currently uses a unidirectional LSTM over GloVe embeddings)

In [13]:
# Directory that holds the GloVe text files. It can be overridden with the
# GLOVE_DIR environment variable so the notebook is not tied to one machine;
# the original absolute path is kept as the fallback.
GLOVE_DIR = os.environ.get(
    "GLOVE_DIR",
    "/home/abhinav/fake_news_challenge/fake_news_challenge/glove",
)
def setup_embedding_index(glove_dir=None, filename="glove.6B.50d.txt"):
    """Load pre-trained word vectors from a GloVe text file.

    Each non-empty line is split on whitespace: the first field is the word,
    the remaining fields are parsed as its float32 vector.

    Parameters
    ----------
    glove_dir : str, optional
        Directory containing the GloVe file. Defaults to the module-level
        ``GLOVE_DIR`` when omitted.
    filename : str, optional
        Name of the GloVe file inside ``glove_dir``.

    Returns
    -------
    dict
        Mapping ``word -> numpy.ndarray`` (dtype float32).

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    if glove_dir is None:
        glove_dir = GLOVE_DIR

    embedding_index = {}
    # `with` guarantees the handle is closed even if parsing a line raises.
    with open(os.path.join(glove_dir, filename), encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline): nothing to index.
                continue
            embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embedding_index
# Load the GloVe vectors once; the embedding-matrix cells below look words up
# in this dictionary.
embeddings_index = setup_embedding_index()

In [14]:
# Word-level tokenizer for the headlines; `max_features` is passed as `num_words`.
tokenizer = Tokenizer(num_words=max_features, split=' ')
headline_column = data['Headline']
tokenizer.fit_on_texts(headline_column.values)

word_index = tokenizer.word_index
# One more than the number of indexed words: Keras word indices start at 1, so
# this leaves room for row 0 in the embedding matrix.
vocab_headline_length = len(word_index) + 1
print(vocab_headline_length)

# Integer-encode the headlines and pad each one at the end to 16 ids.
encoded_docs = tokenizer.texts_to_sequences(headline_column)
padded_docs_headline = pad_sequences(encoded_docs, maxlen=16, padding='post')

3255


In [15]:
# Embedding matrix for the headline vocabulary: row i receives the GloVe vector
# of the word whose tokenizer index is i. Words that GloVe does not know (and
# row 0) keep their all-zero initialisation.
embedding_matrix_headline = np.zeros((vocab_headline_length, EMBEDDING_DIM))

for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix_headline[row] = glove_vector

# Width of one embedding vector; reused when the Embedding layers are built.
dims = embedding_matrix_headline.shape[1]
print(dims)

50


In [16]:
# Tokenizer for the article bodies. This rebinds `tokenizer`, so from here on
# `tokenizer.word_index` refers to the body vocabulary.
tokenizer = Tokenizer(num_words=max_features, split=' ')
body_column = data.loc[:, 'articleBody']
tokenizer.fit_on_texts(body_column.values)
vocab_body_length = len(tokenizer.word_index) + 1

# Encode the article bodies -- the same column the tokenizer was fitted on.
# Encoding 'Headline' here would feed headline text (indexed with the body
# vocabulary) into the model's body input.
encoded_docs = tokenizer.texts_to_sequences(body_column)
padded_docs_body = pad_sequences(encoded_docs, maxlen=48, padding='post')
print(vocab_body_length)
vocab_length = max(vocab_body_length, vocab_headline_length)

23045


In [17]:
# Show the (rows, padded length) shape of the headline and body inputs, one per line.
print(padded_docs_headline.shape, padded_docs_body.shape, sep='\n')

(49972, 16)
(49972, 48)


In [18]:
# NOTE(review): the commented-out lines in this cell are leftovers from an
# earlier experiment; the only live statement is the `word_index` assignment.
# tokenizer.fit_on_texts(data.loc[:,'articleBody'].values)
# encoded_docs= tokenizer.texts_to_sequences(data.loc[:,'articleBody'])

# X_en = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
# X_encoded = np.concatenate((X_encoded,X_en),axis=1)
# Keep a handle on the word -> index mapping of the current `tokenizer`
# (the one most recently fitted, on 'articleBody').
word_index = tokenizer.word_index
# num_words = min(max_nb_words,len(word_index))
# print('Number of words',num_words)

In [19]:
# Number of distinct words indexed by the current tokenizer.
print(len(word_index))

23044


In [20]:
# Embedding matrix for the article-body vocabulary: row i receives the GloVe
# vector of the word whose tokenizer index is i. Words that GloVe does not know
# (and row 0) keep their all-zero initialisation.
embedding_matrix_body = np.zeros((vocab_body_length, EMBEDDING_DIM))

for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix_body[row] = glove_vector

# Width of one embedding vector; reused when the Embedding layers are built.
dims = embedding_matrix_body.shape[1]
print(dims)

50


In [21]:
from keras.optimizers import SGD

# Two-input model on frozen GloVe embeddings: the headline (16 ids) and body
# (48 ids) are embedded separately, concatenated along the time axis into one
# 64-step sequence, and read by a single LSTM followed by a dense classifier.
input_headline = Input(shape=[16], name='input_headline')
embedding_layer_headline = Embedding(
    vocab_headline_length,
    dims,
    weights=[embedding_matrix_headline],
    input_length=16,
    trainable=False,
)(input_headline)

input_body = Input(shape=[48], name='input_body')
embedding_layer_body = Embedding(
    vocab_body_length,
    dims,
    weights=[embedding_matrix_body],
    input_length=48,
    trainable=False,
)(input_body)

# NOTE: despite its name this is a concatenation (axis=1 is the time axis),
# not an element-wise addition.
addition_layer = concatenate([embedding_layer_headline, embedding_layer_body], axis=1)
lstm = LSTM(units=64)(addition_layer)
drop = Dropout(0.25)(lstm)
dense = Dense(64, activation='relu')(drop)

# Softmax (not sigmoid): the labels are one-hot over mutually exclusive classes
# and the loss is categorical cross-entropy, which expects the outputs to form
# a probability distribution over the classes.
output = Dense(4, activation='softmax')(dense)

model_combined_lstm = Model(inputs=[input_headline, input_body], outputs=output)

# `learning_rate` is the current name of the argument formerly spelled `lr`.
sgd = SGD(learning_rate=0.0001, decay=1e-6, momentum=0.9, nesterov=True)

model_combined_lstm.compile(
    optimizer=sgd,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [22]:
# Print the layer-by-layer summary (output shapes, parameter counts, wiring)
# of the GloVe + LSTM model.
model_combined_lstm.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_headline (InputLayer)     [(None, 16)]         0                                            
__________________________________________________________________________________________________
input_body (InputLayer)         [(None, 48)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 16, 50)       162750      input_headline[0][0]             
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 48, 50)       1152250     input_body[0][0]                 
____________________________________________________________________________________________

In [23]:
# One-hot targets built from the integer stance codes in `stance_cat`.
labels = to_categorical(data['stance_cat'])

# Positional 90/10 hold-out: the first 90% of rows train the model, the
# remaining rows are used for validation. No shuffling happens here.
headline_cut = int(len(padded_docs_headline) * 0.9)
padded_docs_headline_train = padded_docs_headline[:headline_cut, :]
padded_docs_headline_test = padded_docs_headline[headline_cut:, :]

body_cut = int(len(padded_docs_body) * 0.9)
padded_docs_body_train = padded_docs_body[:body_cut, :]
padded_docs_body_test = padded_docs_body[body_cut:, :]

labels_cut = int(len(labels) * 0.9)
labels_train = labels[:labels_cut, :]
labels_test = labels[labels_cut:, :]


In [24]:
# Train the GloVe + LSTM model for 10 epochs (training batches shuffled each
# epoch), validating on the held-out rows.
lstm_train_inputs = [padded_docs_headline_train, padded_docs_body_train]
lstm_val_inputs = [padded_docs_headline_test, padded_docs_body_test]
model_combined_lstm.fit(
    x=lstm_train_inputs,
    y=labels_train,
    validation_data=(lstm_val_inputs, labels_test),
    epochs=10,
    shuffle=True,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fba589f62b0>