# Stance Detection

In [31]:
import torch
import pandas as pd
import nltk
import numpy as np
import preprocessing, feature_engineering, helpers
import importlib
from collections import Counter
from sklearn.metrics import confusion_matrix
import score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import scipy
import matplotlib.pyplot as plt
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import time
import itertools
import utils

In [9]:
"""
    If you are downloading NLTK for the first time, uncomment this line in order to download the corpus. 
"""
#nltk.download()

'\n    If you are downloading NLTK for the first time, uncomment this line in order to download the corpus. \n'

In [32]:
importlib.reload(preprocessing)
importlib.reload(feature_engineering)
importlib.reload(helpers)
importlib.reload(utils)
preprocess = preprocessing.Preprocessing()

In [16]:
# uncomment the bottom line if you're getting an error: 
# AttributeError: 'LazyConfigValue' object has no attribute 'lower
# get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [17]:
train_stances = pd.read_csv("fn_data/train_stances.csv")
train_stances = train_stances.loc[lambda x: x.Stance != "unrelated"]
print(train_stances.shape)
train_stances.head()

(13427, 3)


Unnamed: 0,Headline,Body ID,Stance
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
4,Spider burrowed through tourist's stomach and ...,1923,disagree
5,'Nasa Confirms Earth Will Experience 6 Days of...,154,agree
8,Banksy 'Arrested & Real Identity Revealed' Is ...,1739,agree
10,Gateway Pundit,2327,discuss


In [18]:
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [19]:
stances_tr, stances_val = preprocess.train_test_split(train_bodies, train_stances)
stances_tr.shape, stances_val.shape

((10529, 3), (2898, 3))

In [20]:
ct = Counter(stances_val['Stance'])
print(ct)
print(ct.most_common(1)[0][1]/len(list(stances_val["Stance"])))

Counter({'discuss': 1866, 'agree': 810, 'disagree': 222})
0.6438923395445134


In [25]:
preprocess.extract_word_embeddings("glove.6B.50d")
glove_dict = preprocess.get_glove_dict("glove.6B.50d")

In [33]:
[(nltk.pos_tag([x]),preprocess.get_sentiment(x)) for x in preprocess.get_clean_tokens(list(stances_tr.iloc[2,:])[0], False)]

[([('nasa', 'NN')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('confirms', 'NNS')],
  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('earth', 'NN')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('experience', 'NN')],
  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('day', 'NN')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('total', 'JJ')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('darkness', 'NN')],
  {'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.25}),
 ([('december', 'NN')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('fake', 'NN')], {'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.4767}),
 ([('news', 'NN')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('story', 'NN')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('go', 'VB')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 ([('viral', 'JJ')], {'neg': 0.0, 'neu': 1.0, 'pos': 0.0,

In [34]:
preprocess.cosine_similarity(glove_dict['reveal'], glove_dict['revealed'])

0.7902431610341373

In [35]:
body_dict = preprocess.process_bodies_stance(train_bodies, glove_dict)

processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
processed 1200
processed 1300
processed 1400
processed 1500
processed 1600
done! processed 1683


In [36]:
start = time.time()
train_feats = [preprocess.process_feats_stance(i, body_dict, glove_dict) for i in stances_tr.values]
val_feats = [preprocess.process_feats_stance(i, body_dict, glove_dict) for i in stances_val.values]
end = time.time()
print(int(end-start))

61


In [37]:
def get_batch(data, targets, i,batch_size):
    batches = data[i*batch_size:i*batch_size+batch_size]
    results = targets[i*batch_size:i*batch_size+batch_size]
    results = [(2 if result == "agree" else (1 if result == "discuss" else 0)) for result in results]
    return np.array(batches),np.array(results)

In [82]:
print(len(stances_val))

2898


In [87]:
# Test the Model
def eval_model(model):
    correct = 0
    total = 0
    # batch_x_test,batch_y_test = get_batch(val_feats,[str(x[-1]) for x in stances_val.values],0,70)
    batch_x_test,batch_y_test = get_batch(val_feats,[str(x[-1]) for x in stances_val.values],0,len(stances_val))
    model.eval()
    predicted = None
    with torch.no_grad():
        inputs = Variable(torch.FloatTensor(batch_x_test))
        labels = torch.LongTensor(batch_y_test)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        print(f'output data: {outputs.data.shape}')
        print(f'labels shape: {labels.size(0)}')
        correct += (predicted == labels).sum()

    print('Accuracy: %d %%' % (100 * correct / total))
    return predicted

In [39]:
def score_model(predictions):    
    #use FNC scorer to generate score report
    label_prediction = [("agree" if x == 2 else ("discuss" if x == 1 else "disagree")) for x in predictions]
    label_actual = pd.DataFrame(stances_val)['Stance']
    matrix = confusion_matrix(label_actual,label_prediction)
    print('confusion matrix: \n{}\n'.format(matrix))
    score.report_score(label_actual, label_prediction)
    return matrix

In [40]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    correct = (preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [67]:
class RNN(nn.Module): 
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        #self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        #embedded = self.embedding(x)
        
        output, hidden = self.rnn(x)
        
        #assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [42]:
#architecture from https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

class CNN(nn.Module):
    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x): 
        #x (batch size, 60, 91)
        embedded = x.unsqueeze(1) 
        #embedded (batch size, 1, 60, 91)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] 
        #conv_n = [batch size, n_filters, 60 - filter_sizes[n]]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] 
        #pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1)) 
        #cat = [batch size, n_filters * len(filter_sizes)]
        return self.fc(cat)

In [71]:
#INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 92
HIDDEN_DIM = 256
OUTPUT_DIM = 3

batch_size = 70
num_epochs = 5
display_step = 1


model = RNN(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

In [72]:
def train(model, total_batch, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for i in range(total_batch):
        batch_x,batch_y = get_batch(train_feats, [str(x[-1]) for x in stances_tr.values],i,batch_size)
        inputs = Variable(torch.FloatTensor(batch_x))
        labels = Variable(torch.LongTensor(batch_y))
        
        optimizer.zero_grad()
        predictions = model(inputs)
        loss = criterion(predictions, labels)
        _, predicted = torch.max(predictions.data, 1)
        acc = binary_accuracy(predicted, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / total_batch, epoch_acc / total_batch

def evaluate(model, total_batch, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for i in range(total_batch):
            batch_x,batch_y = get_batch(val_feats, [str(x[-1]) for x in stances_val.values],i,batch_size)
            inputs = Variable(torch.FloatTensor(batch_x))
            labels = Variable(torch.LongTensor(batch_y))
            predictions = model(inputs)
            loss = criterion(predictions, labels)
            _, predicted = torch.max(predictions.data, 1)
            acc = binary_accuracy(predicted, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / total_batch, epoch_acc / total_batch

In [73]:
batches_train= int(len(train_feats)/batch_size)
batches_val = int(len(val_feats)/batch_size)

for epoch in range(num_epochs):

    train_loss, train_acc = train(model, batches_train, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, batches_val, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.841 | Train Acc: 65.49% | Val. Loss: 0.840 | Val. Acc: 64.32% |
| Epoch: 02 | Train Loss: 0.792 | Train Acc: 66.89% | Val. Loss: 0.840 | Val. Acc: 64.32% |
| Epoch: 03 | Train Loss: 0.790 | Train Acc: 66.89% | Val. Loss: 0.840 | Val. Acc: 64.32% |
| Epoch: 04 | Train Loss: 0.789 | Train Acc: 66.89% | Val. Loss: 0.840 | Val. Acc: 64.32% |
| Epoch: 05 | Train Loss: 0.788 | Train Acc: 66.89% | Val. Loss: 0.841 | Val. Acc: 64.32% |


In [88]:
# Test the Model
predicted = eval_model(model)

output data: torch.Size([70, 3])
labels shape: 70
Accuracy: 58 %


In [89]:
predicted_l = [i.item() for i in list(predicted)]
Counter(predicted_l)

Counter({1: 70})

In [90]:
matrix = score_model(predicted_l)
utils.plot_confusion_matrix(matrix, classes=["Agree","Disagree", "Discuss"],
                      title='Normalized confusion matrix')
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [2898, 70]

In [121]:
true_label = [(2 if x[-1] == "agree" else (1 if x[-1] == "discuss" else 0)) for x in stances_val.values]
[list(x) for x in list(confusion_matrix(true_label,predicted_l))]

[[0, 155, 17], [0, 1723, 71], [0, 553, 165]]

In [123]:
torch.save(model.state_dict(), './saved_models/CNN_refuting_ft.pth')

# train finalized model below

In [99]:
model_final = CNN(EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

In [122]:
fts = train_feats+val_feats
tr_labels = [str(x[-1]) for x in stances_tr.values]+[str(x[-1]) for x in stances_val.values]

def train_final(model, total_batch, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model_final.train()
    for i in range(total_batch):
        batch_x,batch_y = get_batch(fts, tr_labels,i,batch_size)
        inputs = Variable(torch.FloatTensor(batch_x))
        labels = Variable(torch.LongTensor(batch_y))
        
        optimizer.zero_grad()
        predictions = model_final(inputs)
        loss = criterion(predictions, labels)
        _, predicted = torch.max(predictions.data, 1)
        acc = binary_accuracy(predicted, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / total_batch, epoch_acc / total_batch

In [104]:
batches_train= int(len(fts)/batch_size)

for epoch in range(num_epochs):
    train_loss, train_acc = train_final(model_f, batches_train, optimizer, criterion)
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.727 | Train Acc: 69.69% |
| Epoch: 02 | Train Loss: 0.555 | Train Acc: 77.40% |
| Epoch: 03 | Train Loss: 0.465 | Train Acc: 80.90% |


In [105]:
torch.save(model_final.state_dict(), './CNN_model_softmax_final.pth')