In [42]:
import pickle
import csv
import numpy as np
import pandas as pd
import tensorflow as tf


from utils.dataset import DataSet
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Import data

In [9]:
def _tokenizeUniqueTexts(dataset):
    """Tokenize every distinct headline and every distinct body of `dataset` once.

    Parameters
    ----------
    dataset : object exposing `stances` (iterable of dicts with 'Headline' and
        'Body ID' keys) and `articles` (indexable by Body ID, yielding body text).

    Returns
    -------
    (headlineTokenLists, bodyTokenLists, headlinesSeen, bodiesSeen)
        The two lists hold token lists in first-seen order; the two dicts map
        headline text / Body ID to the very same token-list objects.
    """
    headlineTokenLists, bodyTokenLists = [], []
    # seen headlines and body tracker to ensure we don't have duplicates when building TF for corpus
    headlinesSeen, bodiesSeen = {}, {}

    for stance in dataset.stances:
        headlineText = stance['Headline']
        if headlineText not in headlinesSeen:
            headlineTokens = word_tokenize(headlineText)
            headlineTokenLists.append(headlineTokens)
            headlinesSeen[headlineText] = headlineTokens

        bodyId = stance['Body ID']
        if bodyId not in bodiesSeen:
            bodyTokens = word_tokenize(dataset.articles[bodyId])
            bodyTokenLists.append(bodyTokens)
            bodiesSeen[bodyId] = bodyTokens

    return headlineTokenLists, bodyTokenLists, headlinesSeen, bodiesSeen


trainDataset = DataSet()
testDataset = DataSet("competition_test")

# Not referenced by any cell visible in this notebook; kept so existing code that
# expects the name keeps working.
FullCorpus = []

# One shared helper replaces the two copy-pasted loops, so train and test are
# guaranteed to be tokenized and de-duplicated in exactly the same way.
(tokenizedTrainHeadlines,
 tokenizedTrainBodies,
 trainHeadlinesSeen,
 trainBodiesSeen) = _tokenizeUniqueTexts(trainDataset)

(tokenizedTestHeadlines,
 tokenizedTestBodies,
 testHeadlinesSeen,
 testBodiesSeen) = _tokenizeUniqueTexts(testDataset)

Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


## Build tokenizers and count vectorizers

In [10]:
# Every unique training headline and body, re-joined into one space-separated string.
trainTexts = [' '.join(tokens) for tokens in tokenizedTrainHeadlines + tokenizedTrainBodies]

# NOTE(review): "<UNK>" is a bare string, not a token list, so ' '.join walks its
# characters and the extra document is "< U N K >" rather than one "<UNK>" token.
# Left as-is because changing it could alter the fitted vocabularies — confirm intent.
unkText = ' '.join("<UNK>")

# Keras tokenizer fitted on the training texts plus the extra document above.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainTexts + [unkText])

# Words known to the Keras tokenizer, used as the fixed CountVectorizer vocabulary.
vocabulary = list(tokenizer.word_index.keys())

# Bag-of-words counts for the training texts over that fixed vocabulary.
countVectorizer = CountVectorizer(vocabulary=vocabulary)
trainCorpusBagOfWords = countVectorizer.fit_transform(trainTexts)

# TF-IDF model fitted on the same documents; it learns its own vocabulary.
tfVectorizer = TfidfVectorizer().fit(trainTexts + [unkText])


## Build feature and label vectors

In [11]:
# One-hot label layout: 0=unrelated 1=discuss 2=agree 3=disagree
_STANCE_ONE_HOT = {
    'unrelated': [1, 0, 0, 0],
    'discuss': [0, 1, 0, 0],
    'agree': [0, 0, 1, 0],
}
# Any other stance value falls through to the last class, as the original
# chained conditional did.
_FALLBACK_ONE_HOT = [0, 0, 0, 1]


def _buildFeaturesAndLabels(dataset, headlinesSeen, bodiesSeen):
    """Build one feature row and one one-hot label per stance of `dataset`.

    Each feature row is the concatenation of the headline TF-IDF vector, the
    body TF-IDF vector and their cosine similarity (a single trailing value).

    Parameters
    ----------
    dataset : object whose `stances` is an iterable of dicts with 'Headline',
        'Body ID' and 'Stance' keys.
    headlinesSeen : dict mapping headline text to its token list.
    bodiesSeen : dict mapping Body ID to its token list.

    Returns
    -------
    (features, labels) : two lists, aligned with `dataset.stances`.

    Raises
    ------
    KeyError if a stance references a headline or Body ID missing from the
    lookup dicts.
    """
    # The same headline/body recurs across many stances, so each text is
    # TF-IDF-transformed once and kept as a sparse row.
    headlineTfidfCache = {}
    bodyTfidfCache = {}

    features, labels = [], []
    for stance in dataset.stances:
        labels.append(list(_STANCE_ONE_HOT.get(stance['Stance'], _FALLBACK_ONE_HOT)))

        headlineKey = stance['Headline']
        if headlineKey not in headlineTfidfCache:
            headlineTfidfCache[headlineKey] = tfVectorizer.transform(
                [' '.join(headlinesSeen[headlineKey])])

        bodyKey = stance['Body ID']
        if bodyKey not in bodyTfidfCache:
            bodyTfidfCache[bodyKey] = tfVectorizer.transform(
                [' '.join(bodiesSeen[bodyKey])])

        tfidfHeadline = headlineTfidfCache[headlineKey].toarray()
        tfidfBody = bodyTfidfCache[bodyKey].toarray()
        tfidf_cos = cosine_similarity(tfidfHeadline, tfidfBody)[0].reshape(1, 1)

        # No CountVectorizer term vectors are built here: they never entered
        # the feature row, so computing them per stance was pure overhead.
        features.append(np.hstack((tfidfHeadline, tfidfBody, tfidf_cos)).ravel())

    return features, labels


trainFeatures, trainLabels = _buildFeaturesAndLabels(
    trainDataset, trainHeadlinesSeen, trainBodiesSeen)

testFeatures, testLabels = _buildFeaturesAndLabels(
    testDataset, testHeadlinesSeen, testBodiesSeen)


In [12]:
# Convert the per-stance feature rows into NumPy arrays, rebinding the same names.
# Done one statement at a time so each source list can be released before the next conversion.
trainFeatures = np.array(trainFeatures)
testFeatures = np.array(testFeatures)

In [13]:
# Convert the one-hot label lists into NumPy arrays, rebinding the same names.
trainLabels = np.array(trainLabels)
testLabels = np.array(testLabels)

In [14]:
# Sanity check: display (number of test stances, feature width).
testFeatures.shape

(25413, 46675)

In [15]:
# Sanity check: display (number of training stances, feature width); the width should match the test matrix.
trainFeatures.shape

(49972, 46675)

# Build Models

In [16]:
from keras.models import Sequential
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional


# Training hyper-parameters shared by every model in this notebook.
BATCH_SIZE = 512
N_EPOCHS = 10

# Width of one feature row, read from the assembled training matrix
# (an earlier variant derived it as 2*len(tokenizer.word_index) + 1).
INPUT_DIM = trainFeatures.shape[1]

## Sigmoid Model

In [20]:
# Single hidden layer (256 sigmoid units) followed by a 4-way softmax over the stance classes.
modelSig = Sequential([
    Dense(256, activation='sigmoid', input_dim=INPUT_DIM, name="activation_layer"),
    Dense(4, activation='softmax', name='output_layer'),
])
modelSig.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 activation_layer (Dense)    (None, 256)               11949056  
                                                                 
 output_layer (Dense)        (None, 4)                 1028      
                                                                 
Total params: 11,950,084
Trainable params: 11,950,084
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Categorical cross-entropy matches the one-hot labels; accuracy is the only tracked metric.
modelSig.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [22]:
# Train the sigmoid model.
# NOTE(review): validation_data is the test split itself, so the per-epoch
# validation metrics are not an independent hold-out — confirm this is intended.
modelSig.fit(
    x=trainFeatures,
    y=trainLabels,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    validation_data=(testFeatures, testLabels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b89c29d0>

In [24]:
# evaluate() yields the loss followed by the compiled metric; only the accuracy is reported.
score, acc = modelSig.evaluate(
    x=testFeatures,
    y=testLabels,
    batch_size=BATCH_SIZE,
)
print(acc)

0.8462991118431091


## Relu Model

In [17]:
# Same architecture as the other baselines, with a ReLU hidden layer (256 units) and a 4-way softmax output.
modelRelu = Sequential([
    Dense(256, activation='relu', input_dim=INPUT_DIM, name="activation_layer"),
    Dense(4, activation='softmax', name='output_layer'),
])
modelRelu.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 activation_layer (Dense)    (None, 256)               11949056  
                                                                 
 output_layer (Dense)        (None, 4)                 1028      
                                                                 
Total params: 11,950,084
Trainable params: 11,950,084
Non-trainable params: 0
_________________________________________________________________


2022-03-30 14:19:37.309706: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Categorical cross-entropy matches the one-hot labels; accuracy is the only tracked metric.
modelRelu.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [19]:
# Train the ReLU model.
# NOTE(review): validation_data is the test split itself, so the per-epoch
# validation metrics are not an independent hold-out — confirm this is intended.
modelRelu.fit(
    x=trainFeatures,
    y=trainLabels,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    validation_data=(testFeatures, testLabels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ae9abdc0>

In [20]:
# evaluate() yields the loss followed by the compiled metric; only the accuracy is reported.
score, acc = modelRelu.evaluate(
    x=testFeatures,
    y=testLabels,
    batch_size=BATCH_SIZE,
)
print(acc)

0.8485814332962036


## Tanh Model

In [29]:
# Same architecture as the other baselines, with a tanh hidden layer (256 units) and a 4-way softmax output.
modelTanh = Sequential([
    Dense(256, activation='tanh', input_dim=INPUT_DIM, name="activation_layer"),
    Dense(4, activation='softmax', name='output_layer'),
])
modelTanh.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 activation_layer (Dense)    (None, 256)               11949056  
                                                                 
 output_layer (Dense)        (None, 4)                 1028      
                                                                 
Total params: 11,950,084
Trainable params: 11,950,084
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Categorical cross-entropy matches the one-hot labels; accuracy is the only tracked metric.
modelTanh.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [31]:
# Train the tanh model.
# NOTE(review): validation_data is the test split itself, so the per-epoch
# validation metrics are not an independent hold-out — confirm this is intended.
modelTanh.fit(
    x=trainFeatures,
    y=trainLabels,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    validation_data=(testFeatures, testLabels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b8c351f0>

In [32]:
# evaluate() yields the loss followed by the compiled metric; only the accuracy is reported.
score, acc = modelTanh.evaluate(
    x=testFeatures,
    y=testLabels,
    batch_size=BATCH_SIZE,
)
print(acc)

0.8359894752502441


## Relu Experiments

Of the three models above, ReLU achieved the highest test accuracy (0.849, versus 0.846 for sigmoid and 0.836 for tanh), so it will be used in the final model after experimenting with dropout rates and normalization.

In [77]:
# ReLU experiment: dropout and batch normalization applied to the raw input
# features, ahead of the same hidden/output layers as the baseline ReLU model.
modelRelu2 = Sequential()
# Declare the input shape up front. Previously `input_dim` was passed to the
# Dense layer, which is not the first layer here, so Sequential ignored it and
# the model stayed unbuilt until the first call to fit().
modelRelu2.add(Input(shape=(INPUT_DIM,)))
# NOTE(review): dropout and batch-norm act on the inputs rather than on the
# hidden activations — order kept exactly as written; confirm it is intended.
modelRelu2.add(Dropout(rate=0.1, name='dropout_1'))
modelRelu2.add(BatchNormalization(name='bn'))
modelRelu2.add(Dense(256, activation='relu', name="activation_layer"))
modelRelu2.add(Dense(4, activation='softmax', name='output_layer'))

In [78]:
# Same optimizer, loss and metric as the baseline models, so results are comparable.
modelRelu2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [79]:
# Train the dropout + batch-norm ReLU variant.
# NOTE(review): validation_data is the test split itself, so the per-epoch
# validation metrics are not an independent hold-out — confirm this is intended.
modelRelu2.fit(
    x=trainFeatures,
    y=trainLabels,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    validation_data=(testFeatures, testLabels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b9957d60>

In [80]:
# evaluate() yields the loss followed by the compiled metric; only the accuracy is reported.
score, acc = modelRelu2.evaluate(
    x=testFeatures,
    y=testLabels,
    batch_size=BATCH_SIZE,
)
print(acc)

0.7352142333984375


# Build Results csv for Relu model

Load the saved final ReLU model from disk.

In [22]:
from keras.models import load_model

In [23]:
# Load a previously saved Keras model from the path "FinalRelu.model".
# NOTE(review): no cell visible in this notebook saves a model to that path —
# presumably it was written in an earlier session; confirm the file exists and
# was trained on features of the same width before relying on a fresh run.
model = load_model("FinalRelu.model")

In [27]:
# Run the loaded model on the test feature matrix; rows of `predicted` are indexed
# in the same order as testDataset.stances by the result-building cell.
# Assumes each row holds one score per stance class — TODO confirm for the loaded model.
predicted = model.predict(testFeatures)

In [39]:
# Class index -> stance name: 0=unrelated 1=discuss 2=agree 3=disagree.
# Any index not listed maps to 'disagree', as in the original fall-through.
_INDEX_TO_STANCE = {0: 'unrelated', 1: 'discuss', 2: 'agree'}

# Predictions are matched to stances by position, so the lengths must agree.
assert len(predicted) == len(testDataset.stances), \
    "predictions and test stances are misaligned"

# np.argmax returns the first index of each row's maximum — the same class the
# hand-rolled np.where(row == np.amax(row))[0][0] picked — for all rows at once.
predictedIndices = np.argmax(predicted, axis=1)

# Column-oriented result: headline and body id are copied from the test
# stances, the stance column is the predicted class name.
result = {
    "Headline": [testStance['Headline'] for testStance in testDataset.stances],
    "Body ID": [testStance['Body ID'] for testStance in testDataset.stances],
    "Stance": [_INDEX_TO_STANCE.get(int(classIndex), 'disagree')
               for classIndex in predictedIndices],
}

In [43]:
# Turn the column dict into a DataFrame (the name `result` is rebound) and
# write it out without the index column.
result = pd.DataFrame(result)
result.to_csv(path_or_buf='answer.csv', encoding='utf-8', index=False)