# Stance Detection

In [277]:
import torch
import pandas as pd
import nltk
import numpy as np
import preprocessing, feature_engineering, helpers
import importlib
from collections import Counter
from sklearn.metrics import confusion_matrix
import score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import scipy
import matplotlib.pyplot as plt
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import time

In [98]:
importlib.reload(preprocessing)
importlib.reload(feature_engineering)
importlib.reload(helpers)
preprocess = preprocessing.Preprocessing()

In [157]:
train_stances = pd.read_csv("fn_data/train_stances.csv")
train_stances = train_stances.loc[lambda x: x.Stance != "unrelated"]
print(train_stances.shape)
train_stances.head()

(13427, 3)


Unnamed: 0,Headline,Body ID,Stance
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
4,Spider burrowed through tourist's stomach and ...,1923,disagree
5,'Nasa Confirms Earth Will Experience 6 Days of...,154,agree
8,Banksy 'Arrested & Real Identity Revealed' Is ...,1739,agree
10,Gateway Pundit,2327,discuss


In [158]:
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [299]:
stances_tr, stances_val = preprocess.train_test_split(train_bodies, train_stances)
stances_tr.shape, stances_val.shape

((10948, 3), (2479, 3))

In [300]:
ct = Counter(stances_val['Stance'])
print(ct.most_common(1)[0][1]/len(list(stances_val["Stance"])))

0.654296087131908


In [67]:
glove_dict = preprocess.get_glove_dict("glove.6B.50d")

In [208]:
list(stances_tr.iloc[14,:])

['Pope Francis turns out not to have made pets in heaven comment',
 1905,
 'disagree']

In [303]:
def process_text_stance(text, glove_dict, n_words = 20):
    def process_word_stance(word):
        #50d word vector
        if word in glove_dict:
            wv = glove_dict[word]
        else:
            wv = np.random.normal(scale=0.6, size=(50, ))
        #4d sentiment
        sent = preprocess.get_sentiment(word)
        #36d one-hot encoding of part of speech
        pos = nltk.pos_tag(word)[1]
        pos_encoding = [(1 if tag == pos else 0) for tag in preprocess.pos_tags]
        #boolean flag for negating word
        is_neg = (1 if word in preprocess.negating_words_lemmatized else 0)
        wv = np.concatenate([wv, [sent["pos"], sent["neg"], sent["neu"], sent["compound"], is_neg], pos_encoding])
        return wv
    tokens = preprocess.get_clean_tokens(text, False)
    if len(tokens)>=n_words:
        tokens = tokens[:n_words]
        text_encoding = np.array([process_word_stance(token) for token in tokens])
    elif len(tokens)<n_words:
        padding = [np.zeros((91,))]*(n_words-len(tokens))
        text_encoding = [process_word_stance(token) for token in tokens]+padding
        text_encoding = np.array(text_encoding)
    return text_encoding
#     if len(tokens)>=20:
#         tokens = tokens[:20]
#         text_encoding = np.concatenate([process_word_stance(token) for token in tokens])
#     elif len(tokens)<20:
#         padding = [np.zeros((91,))]*(20-len(tokens))
#         text_encoding = [process_word_stance(token) for token in tokens]+padding
#         text_encoding = np.concatenate(text_encoding)
#     return text_encoding

def process_bodies_stance(df, glove_dict):
    body_info = {}
    ids = list(df["Body ID"])
    for i in range(len(ids)):
        if i % 100 == 0 and i != 0:
            print("processed "+str(i))
        body_info[ids[i]] = process_text_stance(preprocess.get_body(ids[i],df), glove_dict, 40)
    print("done! processed " + str(len(ids)))
    return body_info

def process_feats_stance(data, body_dict, glove_dict):
    headline, body_id = data[0], int(data[1])
    return np.concatenate([process_text_stance(headline, glove_dict), body_dict[body_id]])

In [307]:
preprocess.get_clean_tokens("No, it's not Tiger Woods selling an island in Lake Mälaren didn't", False)

['no', 'not', 'tiger', 'wood', 'selling', 'island', 'lake', 'mälaren', 'didn']

In [305]:
body_dict = process_bodies_stance(train_bodies, glove_dict)

processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
processed 1200
processed 1300
processed 1400
processed 1500
processed 1600
done! processed 1683


In [306]:
start = time.time()
train_feats = [process_feats_stance(i, body_dict, glove_dict) for i in stances_tr.values]
val_feats = [process_feats_stance(i, body_dict, glove_dict) for i in stances_val.values]
end = time.time()
print(int(end-start))

77


In [313]:
train_feats[0].shape

(60, 91)

In [None]:
def get_batch(data, targets, i,batch_size):
    batches = data[i*batch_size:i*batch_size+batch_size]
    results = targets[i*batch_size:i*batch_size+batch_size]
    results = [(2 if result == "agree" else (1 if result == "discuss" else 0)) for result in results]
    return np.array(batches),np.array(results)

In [310]:
# Parameters
learning_rate = 0.01
num_epochs = 10
batch_size = 250
display_step = 1

In [294]:
# Test the Model
def eval_model(model):
    correct = 0
    total = 0
    batch_x_test,batch_y_test = get_batch(val_feats,[str(x[-1]) for x in stances_val.values],0,len(stances_val))
    inputs = Variable(torch.FloatTensor(batch_x_test))
    labels = torch.LongTensor(batch_y_test)
    outputs = model(inputs)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum()

    print('Accuracy: %d %%' % (100 * correct / total))
    return predicted

In [246]:
predicted_l = [i.item() for i in list(predicted)]
Counter(predicted_l)

Counter({0: 126, 1: 1678, 2: 944})

In [297]:
def score_model(predictions):
    true_label = [(2 if x[-1] == "agree" else (1 if x[-1] == "discuss" else 0)) for x in stances_val.values]
    matrix = confusion_matrix(true_label,predictions)
    print('confusion matrix: \n{}\n'.format(matrix))
    #use FNC scorer to generate score report
    label_prediction = [("agree" if x == 2 else ("discuss" if x == 1 else "disagree")) for x in predicted_l]
    label_actual = pd.DataFrame(stances_val)['Stance']
    score.report_score(label_actual, label_prediction)

In [324]:
#architecture from https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

class CNN(nn.Module):
    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x): 
        #x (batch size, 60, 91)
        embedded = x.unsqueeze(1) 
        #embedded (batch size, 1, 60, 91)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] 
        #conv_n = [batch size, n_filters, 60 - filter_sizes[n]]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] 
        #pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1)) 
        #cat = [batch size, n_filters * len(filter_sizes)]
        return self.fc(cat)

In [327]:
EMBEDDING_DIM = 91
N_FILTERS = 100
FILTER_SIZES = [3,4,5,6,7]
OUTPUT_DIM = 3
DROPOUT = 0.5

model = CNN(EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [328]:
#Train the Model
for epoch in range(num_epochs):
    total_batch = int(len(train_feats)/batch_size)
    # Loop over all batches
    for i in range(total_batch):
        batch_x,batch_y = get_batch(train_feats, [str(x[-1]) for x in stances_tr.values],i,batch_size)
        inputs = Variable(torch.FloatTensor(batch_x))
        labels = Variable(torch.LongTensor(batch_y))

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, len(train_feats)//batch_size, loss.data[0]))



Epoch [1/10], Step [10/43], Loss: 1.0179
Epoch [1/10], Step [20/43], Loss: 0.8079
Epoch [1/10], Step [30/43], Loss: 0.6389
Epoch [1/10], Step [40/43], Loss: 0.6790
Epoch [2/10], Step [10/43], Loss: 0.6695
Epoch [2/10], Step [20/43], Loss: 0.6424
Epoch [2/10], Step [30/43], Loss: 0.5114
Epoch [2/10], Step [40/43], Loss: 0.5600
Epoch [3/10], Step [10/43], Loss: 0.5682
Epoch [3/10], Step [20/43], Loss: 0.5268
Epoch [3/10], Step [30/43], Loss: 0.4264
Epoch [3/10], Step [40/43], Loss: 0.5136
Epoch [4/10], Step [10/43], Loss: 0.4838
Epoch [4/10], Step [20/43], Loss: 0.4582
Epoch [4/10], Step [30/43], Loss: 0.4077
Epoch [4/10], Step [40/43], Loss: 0.4609
Epoch [5/10], Step [10/43], Loss: 0.4282
Epoch [5/10], Step [20/43], Loss: 0.3978
Epoch [5/10], Step [30/43], Loss: 0.3438
Epoch [5/10], Step [40/43], Loss: 0.3910
Epoch [6/10], Step [10/43], Loss: 0.4043
Epoch [6/10], Step [20/43], Loss: 0.3717
Epoch [6/10], Step [30/43], Loss: 0.3334
Epoch [6/10], Step [40/43], Loss: 0.3826
Epoch [7/10], St

In [329]:
# Test the Model
predicted = eval_model(model)

Accuracy: 74 %


In [330]:
predicted_l = [i.item() for i in list(predicted)]
Counter(predicted_l)

Counter({0: 13, 1: 1921, 2: 545})

In [331]:
score_model(predicted_l)

confusion matrix: 
[[   7  101   67]
 [   4 1491  127]
 [   2  329  351]]

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    351    |     2     |    329    |     0     |
-------------------------------------------------------------
| disagree  |    67     |     7     |    101    |     0     |
-------------------------------------------------------------
|  discuss  |    127    |     4     |   1491    |     0     |
-------------------------------------------------------------
| unrelated |     0     |     0     |     0     |     0     |
-------------------------------------------------------------
Score: 2006.5 out of 2479.0	(80.9398951189996%)
