# ReadMe

This notebook contains a pipeline for text normalization and feature engineering for stance detection between two bodies of text, such as a title and an abstract.  The pipeline was originally composed and executed for the [Fake News Challenge](http://www.fakenewschallenge.org/) (FNC), whose dataset is roughly 70,000 observations.  On the FNC data, this pipeline achieved roughly 98% and 96% accuracy on the training and testing sets, respectively. The cells below normalize the *vt_contra_v9_covid19_metadata_200425.csv* data, complete the feature engineering, and then use the saved FNC model weights to predict stances for the contradiction data, encoded as {'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3}.

Feature engineering was done on Google Compute Engine, as Colab does not offer enough RAM.  

# Preliminary

In [None]:
# Uncomment to install the third-party dependencies into the kernel's environment:
# %pip install xgboost
# %pip install dtale
# %pip install gensim

In [1]:
import os
import sys
import csv

import pandas as pd
#pd.set_option('display.max_rows', None)
# pd.options.display.float_format = '{:, .2f}'.format
pd.set_option('display.max_colwidth',500)
pd.set_option('display.max_columns', 100)

import numpy as np
from numpy import save, load
from numpy import savez_compressed
from scipy.sparse import csr_matrix
from scipy.sparse import vstack
import copy
import pickle

#from scipy.misc import comb, logsumexp
from sklearn.manifold import TSNE #a tool to visualize high dimensional data
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD # dimensionality reduction using truncated SVD (AKA LSA)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import accuracy_score


import xgboost as xgb
from sklearn.model_selection import GridSearchCV


import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import gutenberg
from nltk.collocations import *
import string #python module
import re # python regex module
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize

import gensim
from gensim.models import Word2Vec

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

np.random.seed(0)

from sklearn.preprocessing import normalize
from functools import reduce

In [2]:
# Load the contradiction metadata CSV from the working directory and preview
# its first row.
df_0 = pd.read_csv("vt_contra_v9_covid19_metadata_200425.csv")
df_0.head(1)

Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url,title_abstract
0,12519,uk8rfroj,b5d303cbcfe6be92d733ec593118b388db77452e,PMC,Complete Genome Sequence of a 2019 Novel Coronavirus (SARS-CoV-2) Strain Isolated in Nepal,10.1128/mra.00169-20,PMC7067954,32165386.0,cc-by,"A complete genome sequence was obtained for a severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) strain isolated from an oropharyngeal swab specimen of a Nepalese patient with coronavirus disease 2019 (COVID-19), who had returned to Nepal after traveling to Wuhan, China.",3/12/2020,"Sah, Ranjit; Rodriguez-Morales, Alfonso J.; Jha, Runa; Chu, Daniel K. W.; Gu, Haogao; Peiris, Malik; Bastola, Anup; Lal, Bibek Kumar; Ojha, Hemant Chanda; Rabaan, Ali A.; Zambrano, Lysien I.; Costello, Anthony; Morita, Kouichi; Pandey, Basu Dev; Poon, Leo L. M.",Microbiol Resour Announc,,,True,True,comm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7067954/,"complete genome sequence of a 2019 novel coronavirus (sars-cov-2) strain isolated in nepal a complete genome sequence was obtained for a severe acute respiratory syndrome coronavirus 2 (sars-cov-2) strain isolated from an oropharyngeal swab specimen of a nepalese patient with coronavirus disease 2019 (covid-19), who had returned to nepal after traveling to wuhan, china."


In [3]:
# Isolate the germane features: the record identifier and the two text fields.
# An explicit deep copy keeps later edits from touching df_0.
df1 = df_0[['cord_uid','title', 'abstract']].copy(deep=True)

In [4]:
# Report how many distinct values each retained column holds.
n_unique_ids = df1["cord_uid"].nunique()
n_unique_titles = df1["title"].nunique()
n_unique_abstracts = df1["abstract"].nunique()
print("Number of unique cord_uid's: %s" % n_unique_ids)
print("Number of unique title's: %s" % n_unique_titles)
print("Number of unique abstract's: %s" % n_unique_abstracts)

Number of unique cord_uid's: 3956
Number of unique title's: 3911
Number of unique abstract's: 3941


# Preprocessing -- Normalize Text data

In [5]:
# Work on a deep copy so df1 keeps the original casing, then lowercase both
# text fields.
df_2 = df1.copy(deep=True)
for text_col in ('title', 'abstract'):
    df_2[text_col] = df_2[text_col].str.lower()

# Tokenize: a token is a run of two or more word characters, so punctuation and
# single-character tokens are dropped.
tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")
df_2['title_tokens'] = df_2['title'].map(tokenizer.tokenize)
df_2['abstract_tokens'] = df_2['abstract'].map(tokenizer.tokenize)
df_2.head(3)

Unnamed: 0,cord_uid,title,abstract,title_tokens,abstract_tokens
0,uk8rfroj,complete genome sequence of a 2019 novel coronavirus (sars-cov-2) strain isolated in nepal,"a complete genome sequence was obtained for a severe acute respiratory syndrome coronavirus 2 (sars-cov-2) strain isolated from an oropharyngeal swab specimen of a nepalese patient with coronavirus disease 2019 (covid-19), who had returned to nepal after traveling to wuhan, china.","[complete, genome, sequence, of, 2019, novel, coronavirus, sars, cov, strain, isolated, in, nepal]","[complete, genome, sequence, was, obtained, for, severe, acute, respiratory, syndrome, coronavirus, sars, cov, strain, isolated, from, an, oropharyngeal, swab, specimen, of, nepalese, patient, with, coronavirus, disease, 2019, covid, 19, who, had, returned, to, nepal, after, traveling, to, wuhan, china]"
1,ivwn4nhl,"first cases of coronavirus disease 2019 (covid-19) in the who european region, 24 january to 21 february 2020","in the who european region, covid-19 surveillance was implemented 27 january 2020. we detail the first european cases. as at 21 february, nine european countries reported 47 cases. among 38 cases studied, 21 were linked to two clusters in germany and france, 14 were infected in china. median case age was 42 years; 25 were male. late detection of the clusters’ index cases delayed isolation of further local cases. as at 5 march, there were 4,250 cases.","[first, cases, of, coronavirus, disease, 2019, covid, 19, in, the, who, european, region, 24, january, to, 21, february, 2020]","[in, the, who, european, region, covid, 19, surveillance, was, implemented, 27, january, 2020, we, detail, the, first, european, cases, as, at, 21, february, nine, european, countries, reported, 47, cases, among, 38, cases, studied, 21, were, linked, to, two, clusters, in, germany, and, france, 14, were, infected, in, china, median, case, age, was, 42, years, 25, were, male, late, detection, of, the, clusters, index, cases, delayed, isolation, of, further, local, cases, as, at, march, there,..."
2,4yuw7jo3,network-based drug repurposing for novel coronavirus 2019-ncov/sars-cov-2,"human coronaviruses (hcovs), including severe acute respiratory syndrome coronavirus (sars-cov) and 2019 novel coronavirus (2019-ncov, also known as sars-cov-2), lead global epidemics with high morbidity and mortality. however, there are currently no effective drugs targeting 2019-ncov/sars-cov-2. drug repurposing, representing as an effective drug discovery strategy from existing drugs, could shorten the time and reduce the cost compared to de novo drug discovery. in this study, we present ...","[network, based, drug, repurposing, for, novel, coronavirus, 2019, ncov, sars, cov]","[human, coronaviruses, hcovs, including, severe, acute, respiratory, syndrome, coronavirus, sars, cov, and, 2019, novel, coronavirus, 2019, ncov, also, known, as, sars, cov, lead, global, epidemics, with, high, morbidity, and, mortality, however, there, are, currently, no, effective, drugs, targeting, 2019, ncov, sars, cov, drug, repurposing, representing, as, an, effective, drug, discovery, strategy, from, existing, drugs, could, shorten, the, time, and, reduce, the, cost, compared, to, de,..."


In [6]:
nltk.download('punkt') # sentence tokenizer models
nltk.download('gutenberg') # Project Gutenberg text corpus
nltk.download('stopwords') # stop-word lists read by stopwords.words(...)

[nltk_data] Downloading package punkt to /home/durdenjax/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/durdenjax/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/durdenjax/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Stop words to drop: NLTK's English list plus a few punctuation-style tokens.
stopwords_list = stopwords.words('english')
stopwords_list += ["''", '""', '...', '``',"_"]

# Membership tests run once per token; a set makes each test a hash lookup
# instead of a scan through the whole list. The filtering result is unchanged.
stopwords_set = set(stopwords_list)

# Keep every token that is not a stop word.
df_2['title_tokens'] = df_2['title_tokens'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopwords_set])
df_2['abstract_tokens'] = df_2['abstract_tokens'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopwords_set])

In [8]:
# English Snowball stemmer shared by both columns.
stemmer = nltk.stem.SnowballStemmer('english')

# Stem title_tokens and abstract_tokens in place. Series.map transforms just
# the one column that is needed; the previous DataFrame.apply(axis=1) built a
# full row object for every record only to read a single field from it.
df_2['title_tokens'] = df_2['title_tokens'].map(
    lambda tokens: [stemmer.stem(tok) for tok in tokens])
df_2['abstract_tokens'] = df_2['abstract_tokens'].map(
    lambda tokens: [stemmer.stem(tok) for tok in tokens])
df_2.head(1)

Unnamed: 0,cord_uid,title,abstract,title_tokens,abstract_tokens
0,uk8rfroj,complete genome sequence of a 2019 novel coronavirus (sars-cov-2) strain isolated in nepal,"a complete genome sequence was obtained for a severe acute respiratory syndrome coronavirus 2 (sars-cov-2) strain isolated from an oropharyngeal swab specimen of a nepalese patient with coronavirus disease 2019 (covid-19), who had returned to nepal after traveling to wuhan, china.","[complet, genom, sequenc, 2019, novel, coronavirus, sar, cov, strain, isol, nepal]","[complet, genom, sequenc, obtain, sever, acut, respiratori, syndrom, coronavirus, sar, cov, strain, isol, oropharyng, swab, specimen, nepales, patient, coronavirus, diseas, 2019, covid, 19, return, nepal, travel, wuhan, china]"


# Feature Engineering

## Basic Count Features

In [9]:
# https://github.com/Cisco-Talos/fnc-1/blob/master/tree_model/ngram.py

# create functions to build n_grams
def getUnigram(words):
    """Return the unigrams of ``words``: the token list itself (same object, not a copy)."""
    return words

def getBigram(words, join_string, skip=0):
    """Build bigrams from a list of words.

    Each word is paired with the word 1..skip+1 positions after it, and the
    pair is joined with ``join_string``. With ``skip=0`` only adjacent words
    are paired.

    A list with fewer than two words cannot form a pair, so the unigrams are
    returned instead.
    """
    n_words = len(words)
    if n_words <= 1:
        # too short for a pair: fall back to the unigrams
        return getUnigram(words)
    return [
        join_string.join([words[start], words[start + gap]])
        for start in range(n_words - 1)
        for gap in range(1, skip + 2)
        if start + gap < n_words
    ]
                    
def getTrigram(words, join_string, skip=0):
    """Build trigrams from a list of words.

    For every start position the second word lies 1..skip+1 positions after
    the first, and the third word 1..skip+1 positions after the second; the
    three words are joined with ``join_string``. With ``skip=0`` only runs of
    three adjacent words are produced.

    A list with fewer than three words falls back to its bigrams.
    """
    n_words = len(words)
    if n_words <= 2:
        # too short for a triple: fall back to the bigrams
        return getBigram(words, join_string, skip)
    trigrams = []
    for first in range(n_words - 2):
        for gap1 in range(1, skip + 2):
            second = first + gap1
            for gap2 in range(1, skip + 2):
                third = second + gap2
                # third < n_words implies second < n_words because gap2 >= 1
                if third < n_words:
                    trigrams.append(join_string.join([words[first], words[second], words[third]]))
    return trigrams
    
def getFourgram(words, join_string):
    """Build four-grams (runs of four adjacent words) from a list of words.

    Each run of four consecutive words is joined with ``join_string``.
    A list with fewer than four words falls back to its trigrams.
    """
    L = len(words)
    if L > 3:
        # range, not xrange: xrange does not exist on Python 3
        return [
            join_string.join([words[i], words[i+1], words[i+2], words[i+3]])
            for i in range(L-3)
        ]
    # too short for a four-gram: fall back to the trigrams
    return getTrigram(words, join_string)



def getBiterm(words, join_string):
    """
        Pair every word with every later word in the list.

        Input: a list of words, e.g., ['I', 'am', 'Denny', 'boy']
        Output: a list of biterm, e.g., ['I_am', 'I_Denny', 'I_boy', 'am_Denny', 'am_boy', 'Denny_boy']
        (_ is the join_string in this example.)

        A list with fewer than two words falls back to its unigrams.
    """
    n_words = len(words)
    if n_words <= 1:
        # too short for a pair: fall back to the unigrams
        return getUnigram(words)
    return [
        join_string.join([words[left], words[right]])
        for left in range(n_words - 1)
        for right in range(left + 1, n_words)
    ]
    
def getTriterm(words, join_string):
    """
        Combine every three words of the list, keeping their original order.

        Input: a list of words, e.g., ['I', 'am', 'Denny', 'boy']
        Output: a list of triterm, e.g., ['I_am_Denny', 'I_am_boy', 'I_Denny_boy', 'am_Denny_boy']
        (_ is the join_string in this example.)

        Only order-preserving combinations are produced, not permutations:
        ['I', 'am', 'Denny'] yields just ['I_am_Denny'].
        A list with fewer than three words falls back to its biterms.
    """
    L = len(words)
    if L > 2:
        lst = []
        # range, not xrange: xrange does not exist on Python 3
        for i in range(L-2):
            for j in range(i+1,L-1):
                for k in range(j+1,L):
                    lst.append( join_string.join([words[i], words[j], words[k]]) )
        return lst
    # too short for a triple: fall back to the biterms
    return getBiterm(words, join_string)

In [10]:
# Separator placed between the words of every bigram / trigram.
join_str = "_"

# Unigrams: the stemmed token lists themselves.
df_2["title_unigram"] = df_2["title_tokens"].map(getUnigram)
df_2["abstract_unigram"] = df_2["abstract_tokens"].map(getUnigram)

# Bigrams: adjacent word pairs built from the unigrams.
df_2["title_bigram"] = df_2["title_unigram"].map(lambda tokens: getBigram(tokens, join_str))
df_2["abstract_bigram"] = df_2["abstract_unigram"].map(lambda tokens: getBigram(tokens, join_str))

# Trigrams: adjacent word triples built from the unigrams.
df_2["title_trigram"] = df_2["title_unigram"].map(lambda tokens: getTrigram(tokens, join_str))
df_2["abstract_trigram"] = df_2["abstract_unigram"].map(lambda tokens: getTrigram(tokens, join_str))

In [11]:
# Per-field n-gram statistics for title and abstract:
#   count_of_<field>_<gram>         total number of n-grams
#   count_of_unique_<field>_<gram>  number of distinct n-grams
#   ratio_of_unique_<field>_<gram>  distinct / total

grams = ["unigram", "bigram", "trigram"]
feat_names = ["title", "abstract"]

for feat_name in feat_names:
    for gram in grams:
        gram_lists = df_2[feat_name + "_" + gram]
        # Series.map touches only the one column needed; the row-wise
        # DataFrame.apply(axis=1) it replaces built a full row per record.
        df_2["count_of_%s_%s" % (feat_name, gram)] = gram_lists.map(len)
        df_2["count_of_unique_%s_%s" % (feat_name, gram)] = \
            gram_lists.map(lambda items: len(set(items)))
        # NOTE(review): when a field has no tokens both counts are 0 and this
        # ratio is NaN (0/0), not 0 — confirm that matches how the FNC
        # training features were computed.
        df_2["ratio_of_unique_%s_%s" % (feat_name, gram)] = \
            df_2["count_of_unique_%s_%s"%(feat_name,gram)] / df_2["count_of_%s_%s"%(feat_name,gram)]

In [12]:
# Overlapping n-gram counts between each title and its own abstract.

for gram in grams:
    # Count title n-grams that also occur in the corresponding abstract.
    # The abstract's n-gram set is built once per row; previously it was
    # rebuilt inside the comprehension for every single title n-gram.
    overlap_counts = []
    for title_grams, abstract_grams in zip(df_2["title_" + gram], df_2["abstract_" + gram]):
        abstract_set = set(abstract_grams)
        overlap_counts.append(sum(1. for w in title_grams if w in abstract_set))
    df_2["count_of_title_%s_in_abstract" % gram] = overlap_counts

    # Ratio of overlapping n-grams to all title n-grams.
    # NOTE(review): a title with no n-grams gives 0/0 = NaN here.
    df_2["ratio_of_title_%s_in_abstract" % gram] = \
        df_2["count_of_title_%s_in_abstract" % gram] / df_2["count_of_title_%s" % gram]

In [13]:
# Number of sentences in each title and abstract (lowercased text, split by
# NLTK's sent_tokenize).
for feat_name in feat_names:
    sentence_lists = df_2[feat_name].map(sent_tokenize)
    df_2['len_sent_%s' % feat_name] = sentence_lists.map(len)

In [15]:
 # save basic count features to disk for later use

# Select the basic count feature columns by name: anything containing
# "count", "ratio" or "len_sent".
feat_names_bcf = [
    col for col in df_2.columns
    if any(tag in col for tag in ("count", "ratio", "len_sent"))
]


feat_names_bcf

['count_of_title_unigram',
 'count_of_unique_title_unigram',
 'ratio_of_unique_title_unigram',
 'count_of_title_bigram',
 'count_of_unique_title_bigram',
 'ratio_of_unique_title_bigram',
 'count_of_title_trigram',
 'count_of_unique_title_trigram',
 'ratio_of_unique_title_trigram',
 'count_of_abstract_unigram',
 'count_of_unique_abstract_unigram',
 'ratio_of_unique_abstract_unigram',
 'count_of_abstract_bigram',
 'count_of_unique_abstract_bigram',
 'ratio_of_unique_abstract_bigram',
 'count_of_abstract_trigram',
 'count_of_unique_abstract_trigram',
 'ratio_of_unique_abstract_trigram',
 'count_of_title_unigram_in_abstract',
 'ratio_of_title_unigram_in_abstract',
 'count_of_title_bigram_in_abstract',
 'ratio_of_title_bigram_in_abstract',
 'count_of_title_trigram_in_abstract',
 'ratio_of_title_trigram_in_abstract',
 'len_sent_title',
 'len_sent_abstract']

In [16]:
# Extract the basic count features as a plain NumPy array (rows x features).
xBasicCounts = df_2[feat_names_bcf].to_numpy()
print(type(xBasicCounts))
print(xBasicCounts.shape)

<class 'numpy.ndarray'>
(3957, 26)


In [17]:
# Persist the basic count feature matrix for later use, using the highest
# pickle protocol available.
with open("basic_count_features.pkl", "wb") as outfile:
    pickle.dump(xBasicCounts, outfile, protocol=pickle.HIGHEST_PROTOCOL)

## Latent Semantic Analysis

### TF-IDF

In [None]:
def cat_text(x):
    """Return one string: the title unigrams followed by the abstract unigrams, all space-separated."""
    title_text = ' '.join(x['title_unigram'])
    abstract_text = ' '.join(x['abstract_unigram'])
    return title_text + ' ' + abstract_text

# Concatenate each title with its abstract so one TF-IDF vectorizer can learn
# the combined vocabulary.
df_2['all_text'] = df_2.apply(cat_text, axis=1).tolist()

In [None]:
# Learn the shared 1- to 3-gram vocabulary (and idf) from the concatenated
# title + abstract strings. fit() returns the fitted vectorizer itself.
vec = TfidfVectorizer(ngram_range=(1, 3)).fit(df_2['all_text'])
vocabulary = vec.vocabulary_

In [None]:
# Vectorize titles and abstracts over the shared vocabulary learned above, so
# both matrices have identical columns.
# max_df / min_df are no longer passed: scikit-learn ignores both whenever an
# explicit vocabulary is supplied, so listing them only suggested a pruning
# step that never ran. The resulting matrices are unchanged.
# NOTE(review): each vectorizer is fit on its own column, so titles and
# abstracts get separate IDF weights — kept as-is; confirm this matches how
# the FNC training features were built.

# title unigrams -> tf-idf matrix
vec_Title = TfidfVectorizer(ngram_range=(1, 3), vocabulary=vocabulary)
Title_tfidf = vec_Title.fit_transform(df_2['title_unigram'].map(' '.join))
print ("Title_tfidf.shape:" + str(Title_tfidf.shape))

# abstract unigrams -> tf-idf matrix
vec_abstract = TfidfVectorizer(ngram_range=(1, 3), vocabulary=vocabulary)
abstract_tfidf = vec_abstract.fit_transform(df_2['abstract_unigram'].map(' '.join))
print ("abstract_tfidf.shape:" +  str(abstract_tfidf.shape))

# save title tfidf for later use
outfilename_title_tfidf = "title_tfidf.pkl"
with open (outfilename_title_tfidf, 'wb') as outfile:
    pickle.dump(Title_tfidf, outfile, -1)

# save abstract tfidf for later use
outfilename_abstract_tfidf = "abstract_tfidf.pkl"
with open(outfilename_abstract_tfidf, "wb") as outfile:
    pickle.dump(abstract_tfidf, outfile, -1)


In [None]:
def cosine_sim(x, y):
    """Return the cosine similarity between two single-row vectors as a scalar.

    scikit-learn's ``cosine_similarity`` takes 2-D inputs and returns a 2-D
    matrix of pairwise similarities. This wrapper is meant to be mapped over
    paired rows of two feature matrices: 1-D NumPy rows are reshaped to
    ``(1, n)``, any other input is passed through unchanged, and the single
    entry of the resulting matrix is returned. Callers then add a trailing
    axis to turn the per-row scalars into a one-column 2-D feature array.

    Best effort: if the similarity cannot be computed, the two inputs and the
    error are printed and ``0.`` is returned instead of raising.
    """
    try:
        if isinstance(x, np.ndarray):
            x = x.reshape(1, -1)
        if isinstance(y, np.ndarray):
            y = y.reshape(1, -1)
        return cosine_similarity(x, y)[0][0]
    except Exception as exc:
        # Exception rather than a bare except, so a kernel interrupt
        # (KeyboardInterrupt) still stops a long-running map over all rows.
        print (x)
        print (y)
        print ("cosine_sim failed: %r" % (exc,))
        return 0.

In [None]:
# Row-wise cosine similarity between each title's and its abstract's tf-idf
# vector, stored as a single-column 2-D array.
tfidf_similarities = [
    cosine_sim(title_row, abstract_row)
    for title_row, abstract_row in zip(Title_tfidf, abstract_tfidf)
]
simTfidf_train = np.asarray(tfidf_similarities)[:, np.newaxis]

print(simTfidf_train.shape)

# save for later use
outfilename_simtfidf_train = "sim_tfidf.pkl"
with open(outfilename_simtfidf_train, "wb") as outfile:
    pickle.dump(simTfidf_train, outfile, protocol=pickle.HIGHEST_PROTOCOL)


### SVD

In [None]:
# Stack the title rows on top of the abstract rows so the SVD basis is learned
# from both. The stack stays sparse: TruncatedSVD accepts scipy sparse input
# directly, whereas .toarray() on a 1- to 3-gram tf-idf matrix allocates a
# dense rows x vocabulary array. (vstack is imported in the imports cell.)
x_title_abstract_tfidf = vstack((Title_tfidf, abstract_tfidf), format="csr")
svd = TruncatedSVD(n_components=100, n_iter=15, random_state = 42)

print(x_title_abstract_tfidf.shape)

# fit on the stacked title + abstract rows
svd.fit(x_title_abstract_tfidf)

In [None]:
# Project the title tf-idf features onto the fitted SVD components.
x_title_Svd = svd.transform(Title_tfidf)
print ('x_title_Svd.shape:')
print (x_title_Svd.shape)

# Save for later use. The array to dump is x_title_Svd; the previous name,
# TitleSvdTrain, is not defined anywhere in this notebook and raised NameError.
with open("title_svd.pkl", "wb") as outfile:
    pickle.dump(x_title_Svd, outfile, -1)


# Project the abstract tf-idf features onto the same components.
x_abstract_Svd = svd.transform(abstract_tfidf)
print ('x_abstract_Svd.shape:')
print (x_abstract_Svd.shape)

# save for later use
with open("abstract_svd.pkl", "wb") as outfile:
    pickle.dump(x_abstract_Svd, outfile, -1)


In [None]:
# Row-wise cosine similarity between each title's and its abstract's SVD
# vector, stored as a single-column 2-D array.
svd_similarities = [
    cosine_sim(title_row, abstract_row)
    for title_row, abstract_row in zip(x_title_Svd, x_abstract_Svd)
]
simSvd_train = np.asarray(svd_similarities)[:, np.newaxis]
print ('sim_svd_train shape:')
print (simSvd_train.shape)

# save for later use
with open("sim_svd.pkl", "wb") as outfile:
    pickle.dump(simSvd_train, outfile, protocol=pickle.HIGHEST_PROTOCOL)

### Word2Vec Word Embeddings

In [None]:
# Columns read by the word2vec cells; they reference the same token lists as
# the *_tokens columns (no copy is made).
# NOTE(review): those tokens were lowercased and stemmed earlier in the
# notebook (e.g. "complet", "genom"), so lookups in a vocabulary of unstemmed
# words may skip many of them — confirm this matches how the FNC training
# features were built.
df_2["title_unigram_vec"] = df_2["title_tokens"]
df_2["abstract_unigram_vec"] = df_2["abstract_tokens"]

In [None]:
# Load the pre-trained word2vec vectors (binary word2vec format) from the
# working directory. The cells below index this object by word and add the
# vectors into a 300-element accumulator.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
title_unigram_array = df_2["title_unigram_vec"].values
print("df_2 title_unigram_vec type: %s" % type(df_2["title_unigram_vec"]))
print("title_unigram_array type: %s" % type(title_unigram_array))
print()


# One vector per title: the sum of the word2vec vectors of every token the
# model knows, starting from a 300-element zero list (so a title with no known
# token stays all zeros). Rows are then scaled by normalize().
title_vec = np.array([
    reduce(np.add, [model[tok] for tok in tokens if tok in model], [0.]*300)
    for tokens in title_unigram_array
])
title_vec_norm = normalize(title_vec)
print("title_vec type: %s" % type(title_vec))
print("title_vec shape:" +  str(title_vec.shape))
print()
print("title_vec_norm type: %s" % type(title_vec_norm))
print("title_vec_norm shape:" + str(title_vec_norm.shape))

# save word embeddings
with open("title_w2v.pkl", "wb") as outfile:
    pickle.dump(title_vec_norm, outfile, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
abstract_unigram_array = df_2["abstract_unigram_vec"].values
print("df_2 abstract_unigram_vec type: %s" % type(df_2["abstract_unigram_vec"]))
print("abstract_unigram_array type: %s" % type(abstract_unigram_array))
print()

# One vector per abstract: the sum of the word2vec vectors of every token the
# model knows, starting from a 300-element zero list. Rows are then scaled by
# normalize().
abstract_vec = np.array([
    reduce(np.add, [model[tok] for tok in tokens if tok in model], [0.]*300)
    for tokens in abstract_unigram_array
])
abstract_vec_norm = normalize(abstract_vec)

print("abstract_vec type: %s" % type(abstract_vec))
print("abstract_vec:" +  str(abstract_vec.shape))
print()
print("abstract_vec_norm type: %s" % type(abstract_vec_norm))
print("abstract_vec_norm shape:" + str(abstract_vec_norm.shape))

# save word embeddings
with open("abstract_w2v.pkl", "wb") as outfile:
    pickle.dump(abstract_vec_norm, outfile, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Row-wise cosine similarity between the title and abstract word2vec
# features, stored as a single-column 2-D array.
w2v_similarities = [
    cosine_sim(title_row, abstract_row)
    for title_row, abstract_row in zip(title_vec_norm, abstract_vec_norm)
]
simVec_w2v = np.asarray(w2v_similarities)[:, np.newaxis]
print(type(simVec_w2v))
print(simVec_w2v.shape)
print("simVec_w2v num dimensions:" + str(simVec_w2v.ndim))
print(simVec_w2v[0:2])

# save for later use
with open("sim_w2v.pkl", "wb") as outfile:
    pickle.dump(simVec_w2v, outfile, protocol=pickle.HIGHEST_PROTOCOL)

### Sentiment Features

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Fetch the VADER lexicon before SentimentIntensityAnalyzer is instantiated below.
nltk.download('vader_lexicon')

In [None]:
# Analyzer used by compute_sentiment: it scores each sentence of an
# observation, and compute_sentiment returns the average of those scores.

sid = SentimentIntensityAnalyzer() # https://www.nltk.org/howto/sentiment.html

def compute_sentiment(sentences):
    """Average the polarity scores of ``sentences``.

    Each sentence is scored with ``sid.polarity_scores`` (see
    https://www.nltk.org/howto/sentiment.html); the per-sentence score dicts
    are stacked into a DataFrame and the column means are returned as a
    pandas Series. An empty list of sentences yields an empty Series.
    """
    per_sentence_scores = [sid.polarity_scores(sentence) for sentence in sentences]
    return pd.DataFrame(per_sentence_scores).mean()

In [None]:
# --- Title sentiment ---
# Split each title into sentences, average the per-sentence polarity scores,
# and append them to df_2 as T_* columns.
df_2["title_senti"] = df_2['title'].map(sent_tokenize) # nltk's method sent_tokenize()
title_scores = df_2['title_senti'].apply(compute_sentiment)
title_scores = title_scores.rename(
    columns={'compound':'T_compound', 'neg':'T_neg', 'neu':'T_neu', 'pos':'T_pos'})
df_2 = pd.concat([df_2, title_scores], axis=1)

Title_Senti = df_2[['T_compound','T_neg','T_neu','T_pos']].values
print ('Title_Senti shape:' + str(Title_Senti.shape))
print()

# save title sentiment
with open("title_sentiment.pkl", "wb") as outfile:
    pickle.dump(Title_Senti, outfile, protocol=pickle.HIGHEST_PROTOCOL)


# --- Abstract sentiment ---
# Same steps for the abstracts, appended as A_* columns.
df_2["abstract_senti"] = df_2['abstract'].map(sent_tokenize) # nltk's method sent_tokenize()
abstract_scores = df_2['abstract_senti'].apply(compute_sentiment)
abstract_scores = abstract_scores.rename(
    columns={'compound':'A_compound', 'neg':'A_neg', 'neu':'A_neu', 'pos':'A_pos'})
df_2 = pd.concat([df_2, abstract_scores], axis=1)

Abstract_Senti = df_2[['A_compound','A_neg','A_neu','A_pos']].values
print ('Abstract_Senti shape:' + str(Abstract_Senti.shape))

# save abstract sentiment
with open("abstract_sentiment.pkl", "wb") as outfile:
    pickle.dump(Abstract_Senti, outfile, protocol=pickle.HIGHEST_PROTOCOL)

# Combine Engineered Features into One Dataset

In [18]:
def _load_pickle(path):
    """Open ``path`` in binary mode, unpickle its single object and return it."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Read every saved feature block back from the CORD_data/ directory.
# NOTE(review): the cells above write these files to the working directory,
# not to CORD_data/ — confirm the files are moved there before this cell runs.
basic_count_feats = _load_pickle("CORD_data/basic_count_features.pkl")

# The tf-idf matrices are densified on load.
title_tfidf = _load_pickle("CORD_data/title_tfidf.pkl").toarray()
abstract_tfidf = _load_pickle("CORD_data/abstract_tfidf.pkl").toarray()
sim_tfidf = _load_pickle("CORD_data/sim_tfidf.pkl")

title_svd = _load_pickle("CORD_data/title_svd.pkl")
abstract_svd = _load_pickle("CORD_data/abstract_svd.pkl")
sim_svd = _load_pickle("CORD_data/sim_svd.pkl")

title_w2v = _load_pickle("CORD_data/title_w2v.pkl")
abstract_w2v = _load_pickle("CORD_data/abstract_w2v.pkl")
sim_w2v = _load_pickle("CORD_data/sim_w2v.pkl")

title_sentiment = _load_pickle("CORD_data/title_sentiment.pkl")
abstract_sentiment = _load_pickle("CORD_data/abstract_sentiment.pkl")
    

In [19]:
# Feature blocks that make up the final design matrix, in column order.
vectors = [basic_count_feats, sim_tfidf,
           title_svd, abstract_svd, sim_svd,
           title_w2v, abstract_w2v, sim_w2v,
           title_sentiment, abstract_sentiment]

# Sanity check: print rank, shape and type of every block before stacking.
# The loop variable is deliberately not called `vec`, because that name holds
# the fitted TfidfVectorizer earlier in the notebook and would be overwritten.
for feature_block in vectors:
    print(feature_block.ndim)
    print(feature_block.shape)
    print(type(feature_block))

In [22]:
# Stack all feature blocks side by side into one design matrix.
cord_data = np.hstack(vectors)

In [23]:
# Dimensions of the stacked feature matrix.
cord_data.shape

(3957, 837)

In [24]:
# Persist the combined feature matrix with pickle protocol 4.
with open("engineered_cord_data.pkl", "wb") as all_data:
    pickle.dump(cord_data, all_data, protocol=4)

# XGBoost

In [None]:
# Load the fitted FNC model and the engineered contradiction features.
# Context managers close each file as soon as it has been read; the previous
# pickle.load(open(...)) form left both handles open.
# NOTE: pickle can execute arbitrary code while loading — only load files you
# produced yourself or otherwise trust.
with open("/content/drive/My Drive/CoronaWhy/mini_task_CONTRADICTION/data_files/FNC data for CORD/XGB_fit_fnc_features_for_cord.pkl", "rb") as model_file:
    loaded_FNC_model = pickle.load(model_file)

with open("/content/drive/My Drive/CoronaWhy/mini_task_CONTRADICTION/data_files/CONTRADICTION_data/engineered_cord_data.pkl", "rb") as features_file:
    loaded_contra_features = pickle.load(features_file)


In [None]:
# Run the loaded FNC model's predict() on the engineered contradiction
# features; the result has one prediction per feature row.
predictions = loaded_FNC_model.predict(loaded_contra_features)

In [None]:
# Wrap the predictions in a single-column frame named 'predictions'.
df_preds = pd.DataFrame(predictions, columns = ['predictions'])

# Reload the metadata CSV (same file name as the one read at the top of the
# notebook) so the predictions can be viewed next to the original text.
contra_txt_df = pd.read_csv("/content/drive/My Drive/CoronaWhy/mini_task_CONTRADICTION/data_files/vt_contra_v9_covid19_metadata_200425.csv")

In [None]:
# Attach the predictions to the metadata as a new column. axis=1 places the
# frames side by side, aligned on the row index; the previous axis=0 stacked
# the predictions underneath the metadata as extra, mostly-NaN rows.
assert len(contra_txt_df) == len(df_preds), "metadata and predictions must have the same number of rows"
df_txt_and_preds = pd.concat([contra_txt_df, df_preds], axis = 1)

In [None]:
# Preview the combined metadata + predictions frame. The previous name,
# df_preds1, is not defined anywhere in this notebook and raised NameError.
df_txt_and_preds.head(1)