# Stance Detection

In [1]:
import torch
import pandas as pd
import nltk
import numpy as np
import preprocessing, feature_engineering, helpers
import importlib
from collections import Counter
from sklearn.metrics import confusion_matrix
import score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import scipy
import matplotlib.pyplot as plt
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import time
import itertools
import utils
import importlib
import re

ModuleNotFoundError: No module named 'preprocessing'

In [None]:
# Re-import the project modules so that edits made to them on disk are picked
# up without restarting the kernel, then build the shared Preprocessing helper
# that the rest of the notebook uses through the name `preprocess`.
importlib.reload(preprocessing)
importlib.reload(feature_engineering)
importlib.reload(helpers)
importlib.reload(utils)
preprocess = preprocessing.Preprocessing()

In [None]:
# Load the headline/stance table and drop every row whose Stance is
# "unrelated"; only the remaining stances are modelled in this notebook.
train_stances = pd.read_csv("fn_data/train_stances.csv")
train_stances = train_stances.loc[lambda x: x.Stance != "unrelated"]
print(train_stances.shape)
train_stances.head()

In [None]:
# Load the article bodies table (looked up by "Body ID" further below).
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

In [None]:
# Split the stances into a training and a validation frame using the project
# helper. NOTE(review): the split strategy lives in preprocessing.py and is not
# visible here - confirm how bodies are shared between the two sets.
stances_tr, stances_val = preprocess.train_test_split(train_bodies, train_stances)
stances_tr.shape, stances_val.shape

In [None]:
# Label distribution of the validation (ct) and training (ct2) splits.
ct,ct2 = Counter(stances_val['Stance']),Counter(stances_tr['Stance'])
print(ct, ct2)
# Share of the most frequent validation label, i.e. the accuracy obtained by
# always predicting the majority class.
print(ct.most_common(1)[0][1]/len(list(stances_val["Stance"])))

In [None]:
# Load the word-vector lookup. process_word_stance below falls back to a
# 50-element zero vector for unknown words, so 50-d vectors are expected here.
glove_dict = preprocess.get_glove_dict("glove.6B.50d")

In [None]:
# [(nltk.pos_tag([x]),preprocess.get_sentiment(x)) for x in preprocess.get_clean_tokens(list(stances_tr.iloc[2,:])[0], False)]

In [None]:
# preprocess.cosine_similarity(glove_dict['reveal'], glove_dict['revealed'])

In [None]:
# Oversample the "disagree" class: append two extra copies of every
# "disagree" row (so each appears three times), then shuffle the rows and
# rebuild a clean 0..n-1 index. The shuffle is unseeded, so the row order
# differs between runs.
disagrees = stances_tr[stances_tr["Stance"]=="disagree"]
stances_tr = pd.concat([stances_tr, disagrees, disagrees]).sample(frac=1).reset_index(drop=True)

In [None]:
# Label counts of the training split after oversampling.
Counter(stances_tr['Stance'])

In [None]:
# IPython-only help query (not valid in a plain Python script): displays the
# documentation of preprocess.get_clean_tokens.
?preprocess.get_clean_tokens

In [None]:
def _one_hot_pos(pos, short_tags):
    """One-hot encode the tag `pos` over `short_tags`.

    An exact match wins; otherwise the longest entry of `short_tags` that is a
    prefix of `pos` is used. If nothing matches, the encoding is all zeros.
    """
    short_tags = list(short_tags)
    if pos in short_tags:
        hit = short_tags.index(pos)
    else:
        prefixes = [k for k, short in enumerate(short_tags)
                    if short and pos.startswith(short)]
        hit = max(prefixes, key=lambda k: len(short_tags[k])) if prefixes else None
    return [(1 if k == hit else 0) for k in range(len(short_tags))]


def process_word_stance(word, glove_dict):
    """Build the feature vector of a single token.

    The returned 1-D array is the concatenation of
      * the word vector from `glove_dict` (a 50-element zero vector when the
        word is unknown),
      * the four sentiment scores "pos", "neg", "neu" and "compound" returned
        by `preprocess.get_sentiment`,
      * a negating-word flag and a refuting-word flag (computed on the stem),
      * a one-hot part-of-speech encoding over `preprocess.pos_short`.

    With 50-d word vectors and a 16-entry `pos_short` this yields 72 values,
    the width of the zero padding rows used elsewhere in this notebook.

    Args:
        word: the token to encode.
        glove_dict: mapping from word to its embedding vector.

    Returns:
        numpy.ndarray with the concatenated features.
    """
    #50d word vector
    if word in glove_dict:
        wv = glove_dict[word]
    else:
        wv = np.zeros((50, ))
    #4d sentiment
    sent = preprocess.get_sentiment(word)
    #16d one-hot encoding of part of speech (shortened)
    # nltk.pos_tag expects a LIST of tokens and returns [(token, tag), ...].
    # Handing it the bare string tags every character separately, so the
    # former `nltk.pos_tag(word)[1][0]` picked the word's second character
    # (and raised IndexError for one-letter words) instead of a POS tag.
    pos = nltk.pos_tag([word])[0][1]
    # NOTE(review): pos_short is defined in preprocessing.py (not visible
    # here); presumably abbreviated Penn Treebank tags - confirm. The helper
    # accepts either full tags or tag prefixes.
    pos_encoding = _one_hot_pos(pos, preprocess.pos_short)
    #boolean flag for negating word
    stemmed_word = preprocess.stem_word(word)
    is_neg = (1 if stemmed_word in preprocess.negating_words_stemmed else 0)
    is_refuting = (1 if stemmed_word in preprocess.refuting_words_stemmed else 0)
    embedding = np.concatenate([wv, [sent["pos"], sent["neg"], sent["neu"], sent["compound"], is_neg, is_refuting], pos_encoding])
    return embedding

def process_text_stance(text, glove_dict, n_words = 20):
    """Encode `text` as a fixed-length stack of per-token feature vectors.

    The text is tokenised with `preprocess.get_clean_tokens(text, False)`;
    at most the first `n_words` tokens are encoded with
    `process_word_stance`. Shorter texts are filled up to `n_words` rows
    with 72-element zero vectors.

    Args:
        text: raw text to encode.
        glove_dict: mapping from word to its embedding vector.
        n_words: number of rows in the result.

    Returns:
        numpy.ndarray with one row per token slot.
    """
    tokens = preprocess.get_clean_tokens(text, False)
    rows = [process_word_stance(tok, glove_dict) for tok in tokens[:n_words]]
    missing = n_words - len(rows)
    if missing > 0:
        # right-pad short texts with all-zero feature rows
        rows = rows + [np.zeros((72,))] * missing
    return np.array(rows)

def process_bodies_stance(df, glove_dict):
    """Encode every article body listed in `df`.

    For each value of the "Body ID" column the body text is fetched with
    `preprocess.get_body` and encoded with `process_text_stance` using 40
    token slots. Progress is printed every 100 bodies.

    Args:
        df: bodies table with a "Body ID" column.
        glove_dict: mapping from word to its embedding vector.

    Returns:
        dict mapping body id -> encoded body.
    """
    body_ids = list(df["Body ID"])
    encoded = {}
    for count, body_id in enumerate(body_ids):
        if count and count % 100 == 0:
            print("processed " + str(count))
        body_text = preprocess.get_body(body_id, df)
        encoded[body_id] = process_text_stance(body_text, glove_dict, 40)
    print("done! processed " + str(len(body_ids)))
    return encoded

def process_feats_stance(data, body_dict, glove_dict):
    """Assemble the model input for one (headline, body) pair.

    Args:
        data: row whose first item is the headline text and whose second
            item is the body id.
        body_dict: mapping body id -> encoded body, as produced by
            `process_bodies_stance`.
        glove_dict: mapping from word to its embedding vector.

    Returns:
        numpy.ndarray: the encoded headline, one all-zero separator row of
        width 72, then the encoded body, stacked along the first axis.
    """
    headline = data[0]
    body_id = int(data[1])
    headline_rows = process_text_stance(headline, glove_dict)
    separator = np.zeros((1, 72))
    return np.concatenate([headline_rows, separator, body_dict[body_id]])

In [None]:
# Encode every article body once up front; the headline/body pairs below look
# their body up by id instead of re-encoding it.
body_dict = process_bodies_stance(train_bodies, glove_dict)

In [None]:
# Build the model inputs for every training and validation row and report
# how long the feature extraction took, in whole seconds.
start = time.time()
train_feats = [process_feats_stance(i, body_dict, glove_dict) for i in stances_tr.values]
val_feats = [process_feats_stance(i, body_dict, glove_dict) for i in stances_val.values]
end = time.time()
print(int(end-start))

In [None]:
# Sanity check of the validation feature tensor's shape
# (examples, time steps, features per step).
np.array(val_feats).shape

In [None]:
def get_batch(data, targets, i, batch_size):
    """Return the i-th mini-batch of features and numeric labels.

    Args:
        data: sequence of per-example feature arrays.
        targets: sequence of stance labels aligned with `data`.
        i: zero-based batch index.
        batch_size: number of examples per batch.

    Returns:
        (features, labels): `features` is the stacked slice of `data` with
        its first two axes swapped (the example axis becomes axis 1);
        `labels` is a float array with "agree" -> 2.0, "discuss" -> 1.0 and
        every other label -> 0.0.
    """
    def encode(label):
        if label == "agree":
            return 2.0
        if label == "discuss":
            return 1.0
        return 0.0

    start = i * batch_size
    stop = start + batch_size
    labels = [encode(label) for label in targets[start:stop]]
    features = np.array(data[start:stop])
    return np.swapaxes(features, 0, 1), np.array(labels)

In [None]:
# Test the Model
def eval_model(model):
    """Print the model's accuracy on the whole validation set.

    The complete validation set (`val_feats` / `stances_val`) is run through
    `model` as a single batch with gradients disabled.

    Args:
        model: network mapping an input batch to per-class scores.

    Returns:
        Tensor with the predicted class index of every validation example.
    """
    val_labels = [str(row[-1]) for row in stances_val.values]
    batch_x_test, batch_y_test = get_batch(val_feats, val_labels, 0, len(stances_val))

    model.eval()
    predicted = None
    correct, total = 0, 0
    with torch.no_grad():
        inputs = Variable(torch.FloatTensor(batch_x_test))
        labels = torch.LongTensor(batch_y_test)
        outputs = model(inputs)
        # index of the highest score along the class axis
        predicted = torch.max(outputs.data, 1)[1]
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy: %d %%' % (100 * correct / total))
    return predicted

In [None]:
def score_model(predictions):
    """Print the confusion matrix and the FNC score report for `predictions`.

    Args:
        predictions: iterable of class indices (2 = agree, 1 = discuss,
            anything else = disagree), aligned with `stances_val`.

    Returns:
        The confusion matrix of actual vs. predicted labels.
    """
    def to_label(code):
        if code == 2:
            return "agree"
        return "discuss" if code == 1 else "disagree"

    #use FNC scorer to generate score report
    label_prediction = [to_label(code) for code in predictions]
    label_actual = pd.DataFrame(stances_val)['Stance']
    matrix = confusion_matrix(label_actual, label_prediction)
    print('confusion matrix: \n{}\n'.format(matrix))
    score.report_score(label_actual, label_prediction)
    return matrix

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of positions where `preds` equals `y`.

    The result is a ratio, not a count: 8 matches out of 10 gives 0.8.
    """
    hits = preds == y
    n_correct = hits.float().sum()  # float so the division below is not integer division
    return n_correct / len(hits)

In [None]:
# Vanilla RNN
class RNN(nn.Module):
    """Unidirectional Elman RNN followed by a linear classifier.

    The classifier reads the final hidden state of the top recurrent layer.
    After every forward pass the per-time-step outputs of the recurrent layer
    are kept in `self.output` so they can be inspected afterwards.

    Args:
        embedding_dim: number of features per time step.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of classes.
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability between stacked recurrent layers.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()

        # nn.RNN applies dropout only between stacked layers; with a single
        # layer it has no effect and merely triggers a warning, so pass 0.
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers,
                          dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # kept for interface parity with the other models; not used in forward
        self.dropout = nn.Dropout(dropout)
        self.output = None

    def forward(self, x):
        """Return class scores of shape [batch size, output dim].

        x = [sent len, batch size, emb dim]
        """
        output, hidden = self.rnn(x)
        #output = [sent len, batch size, hid dim]
        #hidden = [num layers, batch size, hid dim]
        self.output = output
        # Take the top layer's final state. The former `hidden.squeeze(0)`
        # was only correct for n_layers == 1; with stacked layers it left the
        # layer axis in place and produced [num layers, batch, output dim].
        return self.fc(hidden[-1])

In [None]:
# LSTM
class RNN_LSTM(nn.Module):
    """Bidirectional LSTM followed by a two-layer classifier.

    Args:
        embedding_dim: number of features per time step.
        hidden_dim: size of the hidden state of each direction.
        output_dim: number of classes.
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability, used between stacked recurrent layers
            and on the concatenated final hidden state.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer it has no effect and merely triggers a warning, so pass 0.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True,
                           dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class scores of shape [batch size, output dim]."""
        #x = [sent len, batch size, emb dim]
        output, (hidden, cell) = self.rnn(x)
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        #hidden = [batch size, hid dim * num directions]
        # No squeeze here: `hidden` is already 2-D, and squeezing axis 0 would
        # drop the batch axis whenever the batch holds a single example.
        fc = self.fc(hidden)
        fc2 = self.fc2(F.relu(fc))
        return fc2

In [None]:
# GRU
class RNN_GRU(nn.Module):
    """Bidirectional GRU followed by a two-layer classifier.

    Args:
        embedding_dim: number of features per time step.
        hidden_dim: size of the hidden state of each direction.
        output_dim: number of classes.
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability, used between stacked recurrent layers
            and on the concatenated final hidden state.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()

        # nn.GRU applies dropout only between stacked layers; with a single
        # layer it has no effect and merely triggers a warning, so pass 0.
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True,
                          dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class scores of shape [batch size, output dim].

        x = [sent len, batch size, emb dim]
        """
        output, hidden = self.rnn(x)
        #hidden = [num layers * num directions, batch size, hid dim]
        #concat the final forward (hidden[-2]) and backward (hidden[-1]) states
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        #hidden = [batch size, hid dim * num directions]
        # No squeeze here: `hidden` is already 2-D, and squeezing axis 0 would
        # drop the batch axis whenever the batch holds a single example.
        fc = self.fc(hidden)
        fc2 = self.fc2(F.relu(fc))
        return fc2

In [None]:
# Training schedule
num_epochs = 8
batch_size = 250

# Model hyper-parameters
EMBEDDING_DIM = 72  # features per token; the padding rows above are 72 wide
OUTPUT_DIM = 3      # get_batch encodes three classes: 2 = agree, 1 = discuss, 0 = other
DROPOUT = 0.2
# The two settings below are not referenced by any cell visible in this notebook.
N_FILTERS = 100
FILTER_SIZE = 5

In [None]:
# Single-layer vanilla RNN with a 128-unit hidden state, paired with its own
# Adam optimiser so that (model, optimiser) tuples can be queued for training.
model1 = RNN(EMBEDDING_DIM, 128, OUTPUT_DIM, 1, DROPOUT)
opt1 = torch.optim.Adam(model1.parameters(), lr=2e-4)
m1 = model1, opt1

# Loss shared by every queued model.
criterion = nn.CrossEntropyLoss()

In [None]:
# (model, optimiser) pairs to train, in order.
queue = [m1]

In [None]:
def train(model, total_batch, optimizer, criterion):
    """Run one training epoch over `train_feats` / `stances_tr`.

    Batches of the global `batch_size` are taken in order; the model is
    updated after every batch.

    Args:
        model: network to train.
        total_batch: number of batches to process.
        optimizer: optimiser bound to the model's parameters.
        criterion: loss taking (class scores, target indices).

    Returns:
        (mean loss per batch, mean accuracy per batch).

    Raises:
        ZeroDivisionError: if `total_batch` is 0.
    """
    # The label column does not change during the epoch: extract it once
    # instead of rebuilding the list from the DataFrame for every batch.
    targets = [str(row[-1]) for row in stances_tr.values]
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for i in range(total_batch):
        batch_x, batch_y = get_batch(train_feats, targets, i, batch_size)
        # plain tensors: torch.autograd.Variable is a deprecated no-op wrapper
        inputs = torch.FloatTensor(batch_x)
        labels = torch.LongTensor(batch_y)
        optimizer.zero_grad()
        predictions = model(inputs)
        loss = criterion(predictions, labels)
        # predicted class = index of the highest score along the class axis
        _, predicted = torch.max(predictions.data, 1)
        acc = binary_accuracy(predicted, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / total_batch, epoch_acc / total_batch

def evaluate(model, total_batch, criterion):
    """Compute loss and accuracy over `val_feats` / `stances_val`.

    Batches of the global `batch_size` are taken in order with the model in
    evaluation mode and gradients disabled.

    Args:
        model: network to evaluate.
        total_batch: number of batches to process.
        criterion: loss taking (class scores, target indices).

    Returns:
        (mean loss per batch, mean accuracy per batch).

    Raises:
        ZeroDivisionError: if `total_batch` is 0.
    """
    # The label column does not change during evaluation: extract it once
    # instead of rebuilding the list from the DataFrame for every batch.
    targets = [str(row[-1]) for row in stances_val.values]
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for i in range(total_batch):
            batch_x, batch_y = get_batch(val_feats, targets, i, batch_size)
            # plain tensors: torch.autograd.Variable is a deprecated no-op wrapper
            inputs = torch.FloatTensor(batch_x)
            labels = torch.LongTensor(batch_y)
            predictions = model(inputs)
            loss = criterion(predictions, labels)
            # predicted class = index of the highest score along the class axis
            _, predicted = torch.max(predictions.data, 1)
            acc = binary_accuracy(predicted, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / total_batch, epoch_acc / total_batch

In [None]:
# Number of FULL batches per epoch; int() truncates, so the trailing examples
# that do not fill a whole batch are never used for training or validation.
batches_train= int(len(train_feats)/batch_size)
batches_val = int(len(val_feats)/batch_size)

# Train every queued (model, optimiser) pair for num_epochs epochs, printing
# the train/validation metrics after each epoch and the total wall-clock time
# in whole seconds at the end. `model` stays bound to the last trained model
# and is reused by the cells below.
for x in queue:
    model = x[0]
    optimizer = x[1]
    print("\n")
    start = time.time()
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, batches_train, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, batches_val, criterion)

        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    end = time.time()
    print(int(end-start))

In [None]:
# Run only the first validation example (a batch of size 1) through the last
# trained model so that its per-time-step activations can be inspected.
with torch.no_grad():
    batch_x,batch_y = get_batch(val_feats, [str(x[-1]) for x in stances_val.values],0,1)
    inputs = Variable(torch.FloatTensor(batch_x))
    labels = Variable(torch.LongTensor(batch_y))
    predictions = model(inputs)
    # predictions = [batch size, output dim]: the class axis is 1, as in
    # train/evaluate. Reducing over axis 0 (the batch axis) always gave index 0.
    _, predicted = torch.max(predictions.data, 1)
# model.output = [sent len, batch size, hid dim]; move the batch axis first and
# drop it (batch size is 1), leaving one row of hidden activations per time step.
output = model.output.permute([1,0,2]).squeeze(0)

In [None]:
# try comparing L2 norm for each input. This is for WHOLE hidden state
# ignore for now
"""hidden_mag = np.array([np.linalg.norm(i) for i in output])
print(scipy.stats.describe(hidden_mag))
print(hidden_mag.shape)""";

In [None]:
# Activations of one specific hidden unit across all time steps; these are
# the values compared against the input words below.
# `output` holds one row per time step and one column per hidden unit, so
# swapping the axes and taking row 0 selects hidden unit 0.
cell = np.array(np.swapaxes(output, 0, 1)[0])
print(scipy.stats.describe(cell))

In [None]:
# Get the full body and headline text of the first validation example
# (the same example that was run through the model above)
body = stances_val.iloc[0]["Body ID"]
text_body = preprocess.get_body(body,train_bodies)
text_headline = stances_val.iloc[0]["Headline"]

# Get the tokens that are actually fed into the network
# (at most 20 headline tokens followed by at most 40 body tokens)
tokens_body = preprocess.get_clean_tokens(text_body, False)[:40]
tokens_headline = preprocess.get_clean_tokens(text_headline, False)[:20]
tokens = np.concatenate((tokens_headline, tokens_body))

# Raw, space-separated words of the same text, limited to the first 20
# headline words and the first 40 body words, for display
text_body = text_body.split(" ")
text_headline = text_headline.split(" ")
text = np.concatenate((text_headline[:20], text_body[:40]))

In [None]:
# Pair every displayed word with the activation of its token.
# NOTE: get_values is defined in the cell below and must be run first.
values = get_values(text, tokens, cell)
# `v` was never defined anywhere in the notebook; the slices are meant to be
# taken from `values`.
values_body = values[-40:] # Gets the body text
values_headline = values[:-40] # Gets the headline text
values_json = {"body":values_body, "headline":values_headline}

In [None]:
def get_values(text, tokens, cell):
    """Pair every raw word of `text` with the activation of its token.

    Each word is cleaned, lemmatised and stop-word filtered with the
    `preprocess` helpers. Words that leave no token get the value "0"; the
    others get `cell[k]`, where `k` is the position of the word's first token
    in `tokens`, searched from just after the previous match so that repeated
    words map to successive occurrences.

    Args:
        text: sequence of raw words, in reading order.
        tokens: sequence of the tokens fed to the network, in order.
        cell: per-position activations, indexed like `tokens`.
            NOTE(review): this assumes position k of `cell` belongs to
            `tokens[k]`. process_feats_stance pads the headline to 20 rows
            and inserts a separator row before the body, which may shift
            the body positions - confirm the alignment with the caller.

    Returns:
        list with one single-entry dict {word: str(value)} per word of `text`.

    Raises:
        ValueError: if a word's token does not occur in the rest of `tokens`.
    """
    tokens = list(tokens)
    j = 0 # search start in tokens, so duplicate tokens map to successive positions
    body = []

    for word in text:
        test = preprocess.clean(word)
        test = preprocess.get_tokenized_lemmas(test)
        test = preprocess.remove_stopwords(test, True)
        if len(test) == 0:
            body.append({word: str(0)})
        else:
            # list.index(value, start) returns the ABSOLUTE position. The
            # former `list(tokens[j:]).index(...)` returned a position relative
            # to j that was then used to index `cell` directly, so almost every
            # word received cell[0].
            index = tokens.index(test[0], j)
            body.append({word: str(cell[index])})
            j = index + 1
    return body

In [None]:
# Write the per-word activations to disk for the visualisation.
# The 'Vis' directory must already exist; open() does not create it.
import json
with open('Vis/activations.json', 'w') as outfile:  
    json.dump(values_json, outfile)

In [None]:
# TODO: Recreate the whole sentence and body, with non-token words having a value of 0
# and turn into a JSON for viz. 

In [None]:
# TODO: Find a way to quickly identify cells that have meaningful patterns