# dataset: drug_individual_claims_filtered(1).csv_
### using mancon for training

# Preliminary

In [None]:
#pip install xgboost
#pip install dtale
#!pip install gensim

In [1]:
import os
import sys
import csv

import pandas as pd
#pd.set_option('display.max_rows', None)
# pd.options.display.float_format = '{:, .2f}'.format
pd.set_option('display.max_colwidth',500)
pd.set_option('display.max_columns', 100)

import numpy as np
from numpy import save, load
from numpy import savez_compressed
from scipy.sparse import csr_matrix
from scipy.sparse import vstack
import copy
import pickle

#from scipy.misc import comb, logsumexp
from sklearn.manifold import TSNE #a tool to visualize high dimensional data
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD # dimensionality reduction using truncated SVD (AKA LSA)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import accuracy_score


import xgboost as xgb
from sklearn.model_selection import GridSearchCV


import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import gutenberg
from nltk.collocations import *
import string #python module
import re # python regex module
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize

import gensim
from gensim.models import Word2Vec

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

np.random.seed(0)

from sklearn.preprocessing import normalize
from functools import reduce



In [2]:
# import data
#   train_df: tab-separated ManConCorpus sentence pairs (its "label" column becomes the target below)
#   test_df : Excel sheet of drug-claim pairs (text1/text2 plus annotation columns)
train_df = pd.read_csv("manconcorpus_sent_pairs_200516.tsv", delimiter = "\t", encoding = "utf-8")
test_df = pd.read_excel("drug_individual_claims_similarity_annotated_v05.19.xlsx")

In [None]:
train_df.info()

In [None]:
train_df.head()

In [None]:
train_df['label'].value_counts()

In [3]:
# Encode the three ManCon classes as the string codes '0' / '1' / '2'.
label_codes = {'neutral': '0', 'entailment': '1', 'contradiction': '2'}

# Write the encoded labels back to the frame explicitly. Calling
# replace(inplace=True) on a column pulled out of the frame raises
# chained-assignment warnings and does not reliably update train_df across
# pandas versions.
train_df["label"] = train_df["label"].replace(label_codes)
y_train = train_df["label"]

# to_csv() takes no file-mode argument: the stray "rb" was being read as
# `index` by old pandas and as `sep` by newer releases. Keep the index, which
# is what the old call effectively did.
y_train.to_csv("y_train.csv", index=True)

In [None]:
type(y_train)

In [4]:
y_train.shape

(17911,)

In [5]:
y_train.value_counts()

0    15217
1     1966
2      728
Name: label, dtype: int64

In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28643 entries, 0 to 28642
Data columns (total 9 columns):
paper1_cord_uid     28643 non-null object
paper2_cord_uid     28643 non-null object
text1               28643 non-null object
text2               28643 non-null object
similarity_score    28643 non-null float64
drugs1              28643 non-null object
drugs2              28643 non-null object
annotation          107 non-null object
evaluated_for       104 non-null object
dtypes: float64(1), object(8)
memory usage: 2.2+ MB


In [None]:
# isolate germane features in test_df: only the two claim texts are kept, and
# they are renamed to match the ManCon column names (text_a / text_b)
test_df = test_df[['text1', 'text2']].rename(columns={"text1": "text_a", "text2": "text_b"})

# add a guid column to the cord df so we can eventually stack all the data and
# preprocess together. The ids continue right after the training rows; this is
# the value the previously hard-coded 17911 stood for (len(train_df)).
first_test_guid = len(train_df)
test_df.insert(0, 'guid', range(first_test_guid, first_test_guid + len(test_df)))

In [None]:
test_df.head()

In [None]:
# Stack the training pairs on top of the test pairs and keep only the columns
# the rest of the notebook works with.
frames = [train_df, test_df]
data = pd.concat(frames, sort=False).loc[:, ['guid', 'text_a', 'text_b', 'label']]
data.info()

In [None]:
# Open a D-Tale view of the combined frame for manual inspection.
import dtale
d = dtale.show(data, ignore_duplicate=True)
d
# Uncomment to shut the D-Tale instance down again.
#d.kill(data)

# Preprocessing -- Normalize Text data

In [None]:
# Cast every column to str and lowercase it.
# NOTE(review): this is applied to all columns, so guid and label become strings
# as well and missing labels turn into the text 'nan' -- confirm that is intended.
data = data.apply(lambda x: x.astype(str).str.lower())

In [None]:
# tokenize: the pattern keeps runs of two or more word characters, so
# punctuation and single-character tokens are dropped
tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")
for text_col in ("text_a", "text_b"):
    data[text_col + "_tokens"] = data[text_col].map(tokenizer.tokenize)

In [None]:
# Fetch the NLTK resources used below.
nltk.download('punkt') # sentence tokenizer models (used by sent_tokenize)
nltk.download('gutenberg') # text corpus backing the nltk.corpus.gutenberg import
nltk.download('stopwords') # stop-word lists (used by stopwords.words)

In [None]:
# instantiate list of stop words and other characters/punctuation to remove
stopwords_list = stopwords.words('english')
stopwords_list += ["''", '""', '...', '``', "_"]

# Every token is tested for membership, so look it up in a set built once
# instead of scanning the list for each token.
stopwords_set = set(stopwords_list)

# remove stop words / keep everything except stopwords_list
for tokens_col in ('text_a_tokens', 'text_b_tokens'):
    data[tokens_col] = data[tokens_col].apply(
        lambda tokens: [item for item in tokens if item not in stopwords_set])

In [None]:
# alias stemmer method
stemmer = nltk.stem.SnowballStemmer('english')

# Stem every token of text_a_tokens and text_b_tokens.
# The column itself is mapped instead of using DataFrame.apply(axis=1) over whole
# rows: only one column is read, so building a row object per record is wasted
# work, and Series.map always yields exactly one list per row.
for tokens_col in ('text_a_tokens', 'text_b_tokens'):
    data[tokens_col] = data[tokens_col].map(
        lambda tokens: [stemmer.stem(item) for item in tokens])

# Feature Engineering

## Basic Count Features

In [None]:
# https://github.com/Cisco-Talos/fnc-1/blob/master/tree_model/ngram.py

# create functions to build n_grams
def getUnigram(words):
    """Return the unigrams of *words*: the token list itself (same object, not a copy)."""
    unigrams = words
    return unigrams

def getBigram(words, join_string, skip=0):
    """Return the bigrams of *words*, each joined with *join_string*.

    With ``skip=0`` only adjacent tokens are paired; a larger ``skip`` also
    pairs a token with the tokens up to ``skip`` positions further along.
    A list with fewer than two tokens falls back to its unigrams.
    """
    n_words = len(words)
    if n_words <= 1:
        # too short for a bigram: set it as unigram
        return getUnigram(words)

    bigrams = []
    for start in range(n_words - 1):
        # partner positions start+1 .. start+skip+1, clipped to the end of the list
        last_partner = min(start + skip + 2, n_words)
        for partner in range(start + 1, last_partner):
            bigrams.append(join_string.join([words[start], words[partner]]))
    return bigrams
                    
def getTrigram(words, join_string, skip=0):
    """Return the trigrams of *words*, each joined with *join_string*.

    ``skip`` is the maximum number of tokens that may be skipped between two
    consecutive members of a trigram (0 means strictly adjacent tokens).
    A list with fewer than three tokens falls back to its bigrams.
    """
    n_words = len(words)
    if n_words <= 2:
        # too short for a trigram: set as bigram
        return getBigram(words, join_string, skip)

    gaps = range(1, skip + 2)
    trigrams = []
    for first in range(n_words - 2):
        for gap_one in gaps:
            second = first + gap_one
            for gap_two in gaps:
                third = second + gap_two
                # third > second, so checking the last position covers both
                if third < n_words:
                    trigrams.append(
                        join_string.join([words[first], words[second], words[third]]))
    return trigrams
    
def getFourgram(words, join_string):
    """Return the four-grams of *words*, each joined with *join_string*.

    Only strictly adjacent tokens are combined. A list with fewer than four
    tokens falls back to its trigrams.
    """
    n_words = len(words)
    if n_words > 3:
        # range() replaces the Python-2-only xrange(), which is a NameError on Python 3
        return [
            join_string.join([words[i], words[i + 1], words[i + 2], words[i + 3]])
            for i in range(n_words - 3)
        ]
    # too short for a four-gram: set it as trigram
    return getTrigram(words, join_string)



def getBiterm(words, join_string):
    """
        Pair every token with each token that follows it (not only the adjacent one).

        Input: a list of words, e.g., ['I', 'am', 'Denny', 'boy']
        Output: a list of biterm, e.g., ['I_am', 'I_Denny', 'I_boy', 'am_Denny', 'am_boy', 'Denny_boy']
        (with _ as join_string). A list with fewer than two words falls back to its unigrams.
    """
    n_words = len(words)
    if n_words <= 1:
        # set it as unigram
        return getUnigram(words)

    return [
        join_string.join([words[left], words[right]])
        for left in range(n_words - 1)
        for right in range(left + 1, n_words)
    ]
    
def getTriterm(words, join_string):
    """
        Combine every three tokens that appear in order (i < j < k), not only adjacent ones.

        Input: a list of words, e.g., ['I', 'am', 'Denny', 'boy']
        Output: a list of triterm, e.g., ['I_am_Denny', 'I_am_boy', 'I_Denny_boy', 'am_Denny_boy']
        (with _ as join_string). Token order is preserved, so these are combinations,
        not permutations. A list with fewer than three words falls back to its biterms.
    """
    n_words = len(words)
    if n_words > 2:
        # range() replaces the Python-2-only xrange(), which is a NameError on Python 3
        return [
            join_string.join([words[i], words[j], words[k]])
            for i in range(n_words - 2)
            for j in range(i + 1, n_words - 1)
            for k in range(j + 1, n_words)
        ]
    # too short for a triterm: set it as biterm
    return getBiterm(words, join_string)

In [None]:
join_str = "_"
text_sides = ("text_a", "text_b")

# generate unigram columns from the token columns
for side in text_sides:
    data[side + "_unigram"] = data[side + "_tokens"].map(getUnigram)

# generate bigram, then trigram columns from the unigram columns
for gram_name, make_gram in (("bigram", getBigram), ("trigram", getTrigram)):
    for side in text_sides:
        data["%s_%s" % (side, gram_name)] = \
            data[side + "_unigram"].map(lambda toks: make_gram(toks, join_str))


In [None]:
# For each text column and each n-gram size, add three features:
#   count_of_<col>_<gram>         total number of n-grams
#   count_of_unique_<col>_<gram>  number of distinct n-grams
#   ratio_of_unique_<col>_<gram>  distinct / total

grams = ["unigram", "bigram", "trigram"]
feat_names = ["text_a", "text_b"]

for feat_name in feat_names:
    for gram in grams:
        gram_lists = data[feat_name + "_" + gram]
        total_col = "count_of_%s_%s" % (feat_name, gram)
        unique_col = "count_of_unique_%s_%s" % (feat_name, gram)

        # Only one column is read, so map it directly rather than applying row-wise.
        data[total_col] = gram_lists.map(len)
        data[unique_col] = gram_lists.map(lambda items: len(set(items)))

        # A text with no tokens has zero n-grams; 0/0 would produce NaN, so report
        # a ratio of 0 for those rows (what the earlier try_divide helper returned).
        data["ratio_of_unique_%s_%s" % (feat_name, gram)] = \
            (data[unique_col] / data[total_col]).fillna(0.)

In [None]:
# overlapping n-grams count

def _count_shared(a_grams, b_grams):
    """Count the n-grams of text_a (duplicates included) that also occur in text_b."""
    # Build the lookup set once per row, not once per n-gram being tested.
    b_lookup = set(b_grams)
    return sum(1. for w in a_grams if w in b_lookup)


for gram in grams:
    # count grams appearing in text_a that are also inside its corresponding text_b
    data["count_of_text_a_%s_in_text_b" % gram] = [
        _count_shared(a_grams, b_grams)
        for a_grams, b_grams in zip(data["text_a_" + gram], data["text_b_" + gram])
    ]

    # ratio of overlapping grams to total text_a grams; rows whose text_a has no
    # n-grams would give 0/0 = NaN, so they get a ratio of 0
    data["ratio_of_text_a_%s_in_text_b" % gram] = \
        (data["count_of_text_a_%s_in_text_b" % gram] / data["count_of_text_a_%s" % gram]).fillna(0.)

In [None]:
# count number of sentences in text_a and text_b (feat_names) using nltk's sent_tokenize
for feat_name in feat_names:
    data['len_sent_%s' % feat_name] = data[feat_name].apply(lambda x: len(sent_tokenize(x)))

In [None]:
 # save basic count features to disk for later use

feat_names_bcf = [ n for n in data.columns \
                if "count" in n \
                or "ratio" in n \
                or "len_sent" in n]


feat_names_bcf

In [None]:
# Pull the basic count feature columns out as a plain array (one row per pair)
# and report its type and shape.
xBasicCounts = data[feat_names_bcf].values
print(type(xBasicCounts))
print(xBasicCounts.shape)

In [None]:
# Checkpoint the frame to disk.
# NOTE(review): CSV stores the list-valued token / n-gram columns as their text
# representation, and the index is written as an unnamed first column; whoever
# reads this file back has to parse those columns into lists again.
data.to_csv("data_with_ngrams_&_basic_Counts.csv")

In [None]:
# Persist the basic count feature array; protocol -1 selects the highest pickle
# protocol available.
with open("basic_count_features.pkl", "wb") as outfile:
    #pickle.dump(feat_names, outfile, -1)
    pickle.dump(xBasicCounts, outfile, -1)

## Latent Semantic Analysis

### TF-IDF

In [None]:
import ast

# The token / n-gram columns were lists when the CSV was written, so the file
# holds their text form (e.g. "['foo', 'bar']"). Parse them back into lists while
# reading; otherwise every later ' '.join(x) or `for y in x` would walk over the
# characters of that string instead of over tokens.
list_columns = [
    "%s_%s" % (side, kind)
    for side in ("text_a", "text_b")
    for kind in ("tokens", "unigram", "bigram", "trigram")
]
data = pd.read_csv(
    "data_with_ngrams_&_basic_Counts.csv",
    converters={col: ast.literal_eval for col in list_columns},
)

In [None]:
def cat_text(x):
    """Return the unigrams of text_a followed by those of text_b as one space-separated string.

    *x* is a row (or any mapping) with 'text_a_unigram' and 'text_b_unigram' entries.
    """
    text_a_part = ' '.join(x['text_a_unigram'])
    text_b_part = ' '.join(x['text_b_unigram'])
    return text_a_part + ' ' + text_b_part

# concatenate text_a and text_b row by row so we can fit a tfidf vectorizer that will learn the combined vocabulary
data['all_text'] = list(data.apply(cat_text, axis = 1))

In [None]:
# fit a TfidfVectorizer on the concatenated strings (fit learns the vocabulary and idf)
# Uni-, bi- and tri-grams are all part of the vocabulary (ngram_range = (1, 3)).

#vec = TfidfVectorizer(ngram_range = (1, 3), max_df= 0.8, min_df= 2)
vec = TfidfVectorizer(ngram_range = (1, 3))
vec.fit(data['all_text'])
# term -> column index mapping, reused by the per-column vectorizers
vocabulary = vec.vocabulary_

In [None]:
data.head(1)

In [None]:
# transform text_a unigrams into tf-idf vectors over the shared vocabulary
# NOTE(review): per the scikit-learn docs max_df / min_df are ignored when a
# vocabulary is supplied, and fit_transform re-learns the idf weights from this
# column alone, so text_a and text_b end up with different idf weights -- confirm
# this is intended (vec.transform would reuse the idf learned on all_text).
vec_text_a = TfidfVectorizer(ngram_range=(1,3), max_df=0.8, min_df= 2, vocabulary=vocabulary)
text_a_tfidf = vec_text_a.fit_transform(data['text_a_unigram'].map(lambda x: ' '.join(x)))
print ("text_a_tfidf.shape:" + str(text_a_tfidf.shape))

# transform text_b unigrams the same way, over the same vocabulary
vec_text_b = TfidfVectorizer(ngram_range=(1, 3), max_df=0.8, min_df=2, vocabulary=vocabulary)
text_b_tfidf = vec_text_b.fit_transform(data['text_b_unigram'].map(lambda x: ' '.join(x)))
print ("text_b_tfidf.shape:" +  str(text_b_tfidf.shape))

# save text_a tfidf for later use
with open ("text_a_tfidf.pkl", 'wb') as outfile:
    pickle.dump(text_a_tfidf, outfile, -1)
    
# save text_b tfidf for later use
with open("text_b_tfidf.pkl", "wb") as outfile:
    pickle.dump(text_b_tfidf, outfile, -1) 

In [None]:
def cosine_sim(x, y):
    """Return the cosine similarity of a single pair of vectors as a scalar.

    scikit-learn's cosine_similarity works on 2-D inputs and returns a matrix
    of pairwise similarities. This wrapper is meant to be mapped over two
    aligned collections of rows: 1-D numpy arrays are reshaped to one-row
    matrices, any other input is passed through unchanged, and the single
    entry of the resulting matrix is returned.

    Best-effort: if the similarity cannot be computed, the error and both
    inputs are printed and 0. is returned, so one bad row does not abort the
    whole feature computation.
    """
    try:
        # cosine_similarity needs 2-D input: turn 1-D arrays into single-row matrices
        if isinstance(x, np.ndarray):
            x = x.reshape(1, -1)
        if isinstance(y, np.ndarray):
            y = y.reshape(1, -1)
        d = cosine_similarity(x, y)[0][0]
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop a long run.
        print(err)
        print(x)
        print(y)
        d = 0.
    return d

In [None]:
# calculate cosine similarity between the tf-idf vectors of text_a and text_b, row by row;
# [:, np.newaxis] turns the 1-D result into a single-column 2-D array

simTfidf = np.asarray(list(map(cosine_sim,text_a_tfidf, text_b_tfidf)))[:, np.newaxis]

print(simTfidf.shape)

# save for later use
with open("sim_tfidf.pkl", "wb") as outfile:
    pickle.dump(simTfidf, outfile, -1)


### SVD

In [None]:
from scipy.sparse import vstack

# Stack the text_a and text_b tf-idf rows so the SVD is fitted on both.
# The matrix is kept sparse: TruncatedSVD accepts scipy sparse input directly,
# whereas .toarray() would allocate a dense (2 * n_rows, n_vocabulary) array.
x_text_a_text_b_tfidf = vstack((text_a_tfidf, text_b_tfidf))
svd = TruncatedSVD(n_components=100, n_iter=15, random_state = 42)

print(x_text_a_text_b_tfidf.shape)

# fit to the combined train-test set
svd.fit(x_text_a_text_b_tfidf)

In [None]:
# project the text_a tfidf features onto the fitted svd components
x_text_a_Svd = svd.transform(text_a_tfidf)
print ('x_text_a_Svd.shape:')
print (x_text_a_Svd.shape)

# save for later use
with open("text_a_svd.pkl", "wb") as outfile:
    pickle.dump(x_text_a_Svd, outfile, -1)
    
    
# project the text_b tfidf features onto the same svd components
x_text_b_Svd = svd.transform(text_b_tfidf)
print ('x_text_b_Svd.shape:')
print (x_text_b_Svd.shape)

# save for later use
with open("text_b_svd.pkl", "wb") as outfile:
    pickle.dump(x_text_b_Svd, outfile, -1)


In [None]:
# calculate cosine similarity between the svd vectors of text_a and text_b for each record;
# [:, np.newaxis] turns the 1-D result into a single-column 2-D array

simSvd = np.asarray(list(map(cosine_sim, x_text_a_Svd, x_text_b_Svd)))[:, np.newaxis]
print ('simSvd shape:')
print (simSvd.shape)

# save for later use
with open("sim_svd.pkl", "wb") as outfile:
    pickle.dump(simSvd, outfile, -1)

### BioWordVec Word Embeddings

In [None]:
# The embedding lookups below read these columns; they hold the same values as the token columns.
# NOTE(review): the per-row iteration further down expects each entry to be a
# list of tokens -- confirm the token columns are lists at this point.
data["text_a_unigram_vec"] = data["text_a_tokens"]
data["text_b_unigram_vec"] = data["text_b_tokens"]

In [None]:
# load pre-trained model: word vectors stored in binary word2vec format
# (the cells below assume 200-dimensional vectors)
model = gensim.models.KeyedVectors.load_word2vec_format('bio_embedding_intrinsic', 
                                                        binary=True)

In [None]:
text_a_unigram_array = data["text_a_unigram_vec"].values
print("text_a_unigram_vec type: %s" % type(data["text_a_unigram_vec"]))
print("text_a_unigram_array type: %s" % type(text_a_unigram_array))
print()


# One vector per text: the element-wise sum of the embeddings of its tokens that
# are in the model, starting from a 200-long zero list (so a text with no known
# token stays all zeros). normalize() is then applied with its default settings.
text_a_vec = np.array([
    reduce(np.add, [model[token] for token in tokens if token in model], [0.]*200)
    for tokens in text_a_unigram_array
])
text_a_vec_norm = normalize(text_a_vec)
print("text_a_vec type: %s" % type(text_a_vec))
print("text_a_vec shape:" +  str(text_a_vec.shape))
print()
print("text_a_vec_norm type: %s" % type(text_a_vec_norm))
print("text_a_vec_norm shape:" + str(text_a_vec_norm.shape))

# save word embeddings
with open("text_a_BioWordVec.pkl", "wb") as outfile:
    pickle.dump(text_a_vec_norm, outfile, -1)

In [None]:
text_b_unigram_array = data["text_b_unigram_vec"].values
print("text_b_unigram_vec type: %s" % type(data["text_b_unigram_vec"]))
print("text_b_unigram_array type: %s" % type(text_b_unigram_array))
print()

# One vector per text: the element-wise sum of the embeddings of its tokens that
# are in the model, starting from a 200-long zero list (so a text with no known
# token stays all zeros). normalize() is then applied with its default settings.
text_b_vec = np.array([
    reduce(np.add, [model[token] for token in tokens if token in model], [0.]*200)
    for tokens in text_b_unigram_array
])
text_b_vec_norm = normalize(text_b_vec)

print("text_b_vec type: %s" % type(text_b_vec))
print("text_b_vec:" +  str(text_b_vec.shape))
print()
print("text_b_vec_norm type: %s" % type(text_b_vec_norm))
print("text_b_vec_norm shape:" + str(text_b_vec_norm.shape))

# save word embeddings
with open("text_b_BioWordVec.pkl", "wb") as outfile:
    pickle.dump(text_b_vec_norm, outfile, -1)

In [None]:
# compute cosine similarity between the text_a & text_b embedding features, row by row;
# [:, np.newaxis] turns the 1-D result into a single-column 2-D array
simVec_BioWordVec = np.asarray(list(map(cosine_sim, text_a_vec_norm, text_b_vec_norm)))[:, np.newaxis]
print(type(simVec_BioWordVec))
print(simVec_BioWordVec.shape)
print("simVec_BioWordVec num dimensions:" + str(simVec_BioWordVec.ndim))
print(simVec_BioWordVec[0:2])

# save for later use
with open("sim_BioWordVec.pkl", "wb") as outfile:
    pickle.dump(simVec_BioWordVec, outfile, -1)

### Sentiment Features

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
nltk.download('vader_lexicon')

In [None]:
# calculate the polarity scores of each sentence in a text and return their average

sid = SentimentIntensityAnalyzer() # https://www.nltk.org/howto/sentiment.html

def compute_sentiment(sentences):
    """Return the column-wise mean of the polarity scores of *sentences*.

    Each sentence is scored with sid.polarity_scores (https://www.nltk.org/howto/sentiment.html);
    the per-sentence score dicts are put into a DataFrame and averaged, giving
    a Series with one entry per score key.
    """
    scores = [sid.polarity_scores(sentence) for sentence in sentences]
    return pd.DataFrame(scores).mean()

In [None]:
# text_a: split each text into sentences, average the per-sentence polarity scores,
# and append the resulting score columns to the frame
data["text_a_senti"] = data['text_a'].apply(lambda x: sent_tokenize(x)) # nltk's method sent_tokenize()
data = pd.concat([data, data['text_a_senti'].apply(lambda x: compute_sentiment(x))], axis=1)
# rename right away so the text_b scores added below do not collide with these columns
data.rename(columns={'compound':'T_a_compound', 'neg':'T_a_neg', 'neu':'T_a_neu', 'pos':'T_a_pos'}, inplace=True)

text_a_Senti = data[['T_a_compound','T_a_neg','T_a_neu','T_a_pos']].values
print ('text_a_Senti shape:' + str(text_a_Senti.shape))
print()

# save text_a sentiment
with open("text_a_sentiment.pkl", "wb") as outfile:
    pickle.dump(text_a_Senti, outfile, -1)
    


# text_b: same steps
# NOTE(review): the column prefix here is lower-case 't_b_' while text_a uses 'T_a_'.
data["text_b_senti"] = data['text_b'].apply(lambda x: sent_tokenize(x)) # nltk's method sent_tokenize()
data = pd.concat([data, data['text_b_senti'].apply(lambda x: compute_sentiment(x))], axis=1)
data.rename(columns={'compound':'t_b_compound', 'neg':'t_b_neg', 'neu':'t_b_neu', 'pos':'t_b_pos'}, inplace=True)

text_b_Senti = data[['t_b_compound','t_b_neg','t_b_neu','t_b_pos']].values
print ('text_b_Senti shape:' + str(text_b_Senti.shape))

# save text_b sentiment
with open("text_b_sentiment.pkl", "wb") as outfile:
    pickle.dump(text_b_Senti, outfile, -1)

# combine engineered features into one dataset

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE: pickle.load can execute code embedded in the file; only load files
    # that were written by this notebook.
    with open(path, "rb") as infile:
        return pickle.load(infile)


basic_count_feats = _load_pickle("basic_count_features.pkl")

# The tf-idf matrices are left sparse: they are not part of the `vectors` list
# assembled below, and converting them to dense arrays only costs memory.
text_a_tfidf = _load_pickle("text_a_tfidf.pkl")
text_b_tfidf = _load_pickle("text_b_tfidf.pkl")
sim_tfidf = _load_pickle("sim_tfidf.pkl")

text_a_svd = _load_pickle("text_a_svd.pkl")
text_b_svd = _load_pickle("text_b_svd.pkl")
sim_svd = _load_pickle("sim_svd.pkl")

text_a_BioWordVec = _load_pickle("text_a_BioWordVec.pkl")
text_b_BioWordVec = _load_pickle("text_b_BioWordVec.pkl")
sim_BioWordVec = _load_pickle("sim_BioWordVec.pkl")

text_a_sentiment = _load_pickle("text_a_sentiment.pkl")
text_b_sentiment = _load_pickle("text_b_sentiment.pkl")
    

In [None]:
# Feature blocks that make up the final matrix, in column order.
vectors = [basic_count_feats, sim_tfidf,
           text_a_svd, text_b_svd, sim_svd,
           text_a_BioWordVec, text_b_BioWordVec, sim_BioWordVec,
           text_a_sentiment, text_b_sentiment]

# Sanity check before stacking: report rank, shape and type of every block.
# The loop variable is not called `vec`, so the TfidfVectorizer bound to `vec`
# earlier in the notebook is not overwritten.
for feature_block in vectors:
    print(feature_block.ndim)
    print(feature_block.shape)
    print(type(feature_block))

In [None]:
# Stack all feature blocks side by side into one feature matrix.
contra_data = np.hstack(vectors)

In [None]:
contra_data.shape

In [None]:
# Persist the combined feature matrix (pickle protocol 4).
with open ("all_engineered_contra_data.pkl", "wb") as all_data:
    pickle.dump(contra_data, all_data, protocol = 4)