In [57]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd
import numpy as np
import nltk
import string
import re
import math
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

## Step 1 - Import pre-trained Word Embedding Model

In [58]:
google_model = gensim.models.KeyedVectors.load_word2vec_format('../../Downloads/GoogleNews-vectors-negative300.bin', binary = True)

In [None]:
google_model['nasa']

In [59]:
#This is the list of words for which the google model has a vector
google_vocab = google_model.vocab.keys()

## Step 2 - Find Word Embeddings

### 2.1- Embeddings of Claims

In [60]:
my_claims = pd.read_csv('Snopes_articles_with_predicted_stance.csv', index_col = 0)
my_claims.head()

Unnamed: 0,claim,articleHeadline,predictedStance
0,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Snopes.com,for
1,Meijer is offering 100 off Back to School coup...,Fake Meijer 100 back-to-school coupon goes vir...,against
2,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Hoax - Trendolizer,observing
3,Meijer is offering 100 off Back to School coup...,100 OFF Meijer Coupon Deals April 2017 HotDeal...,against
4,Meijer is offering 100 off Back to School coup...,10 off 100 in Visa Gift Cards at Meijer - Freq...,for


In [61]:
#Takes in a string. Tokenizes it, removes any punctuation in it
#Does NOT change the case of the words though.
def preprocessing(text):
    if pd.isnull(text): #if article doesn't exist. Some claims have <10 articles.
        return ""
    for c in string.punctuation:
        text = text.replace(c,"")
    words = text.split()
    return words

In [62]:
#For a claim or an articleHeadline, this function will return the word embedding for it
def calc_vector(current_text, previous_text, previous_vector):
    
    #if article doesn't exist. Some claims have <10 articles
    if pd.isnull(current_text):
        return []
    
    #as each claim is repeated many times, this will be efficient for the 2nd through 10th time that we are finding the vector 
    #for a particular claim
    if current_text==previous_text:
        return previous_vector
    
    #if this is the first time we are finding the vector for this claim
    words = preprocessing(current_text)
    
    text_vec = []
    for word in words:
        #checking if the word as written is in google model
        if word in google_vocab:
            text_vec =+ google_model[word]
        
        #checking if word in lowercase is in google model
        elif word.lower() in google_vocab:
            text_vec =+ google_model[word.lower()]
        
        #checking if word in uppercase is in google model
        elif word.upper() in google_vocab:
            text_vec =+ google_model[word.upper()]
        
        #checking if word in capital case (first character capitalized) is in model
        elif word.capitalize() in google_vocab:
            text_vec =+ google_model[word.capitalize()]
        
        #if not, just leave text_vec unchanged
        else:
            text_vec = text_vec
    
    #updating previous text and previous vector
    previous_text = current_text
    previous_vector = text_vec
    
    return text_vec

In [63]:
claim_word_embeddings = [] #This will be the first column in the new dataframe. 'claimWordEmbedding'
previous_claim = ''
previous_vector =[]
for claim in my_claims['claim']:
    claim_word_embeddings.append(calc_vector(claim, previous_claim, previous_vector))

In [64]:
len(claim_word_embeddings)

46772

In [65]:
unique_claims = []
length = len(claim_word_embeddings)
for index in range(0, length, 10):
    unique_claims.append(claim_word_embeddings[index])

In [66]:
len(unique_claims)

4678

I now have the word embeddings for the claims. Now I have to get the word embeddings for the articles

### 2.2 Word Embeddings of Article Headlines

In [67]:
df = pd.read_csv('Snopes_articles_with_predicted_stance.csv', index_col = 0)
df.head()

Unnamed: 0,claim,articleHeadline,predictedStance
0,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Snopes.com,for
1,Meijer is offering 100 off Back to School coup...,Fake Meijer 100 back-to-school coupon goes vir...,against
2,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Hoax - Trendolizer,observing
3,Meijer is offering 100 off Back to School coup...,100 OFF Meijer Coupon Deals April 2017 HotDeal...,against
4,Meijer is offering 100 off Back to School coup...,10 off 100 in Visa Gift Cards at Meijer - Freq...,for


In [75]:
#claim_embeddings is a list of numpy arrays. This should be a list of lists of numpy arrays
article_embeddings = []
articles = df['articleHeadline']
multiples = math.floor(df.shape[0]/10)

previous_claim2 = ""
previous_vector2 = []

for i in range(10):
    article_embeddings.append([])
    
for i in range(df.shape[0]):
    current = calc_vector(articles[i], previous_claim2, previous_vector2)
    article_embeddings[i%10].append(current)
#         article_embeddings[i].append(calc_vector(articles[i + num*10], previous_claim2, previous_vector2))

In [76]:
for i in range(10):
    print(len(article_embeddings[i]))

4678
4678
4677
4677
4677
4677
4677
4677
4677
4677


Now, article_embeddings is a list. Each element of article_embeddings[5], for example, is the fifth article for every claim. Currently, because the number of articles is not a multiple of 10, the lists are not of equal size. So I will add dummy vectors to make them even.

In [77]:
for i in range(2, 10):
    article_embeddings[i].append([])
    
for i in range(10):
    print(len(article_embeddings[i]))

4678
4678
4678
4678
4678
4678
4678
4678
4678
4678


Now they are even.

At this point, I have all the word embeddings I need. I will try building an XGBoost model just with the word embeddings and see if it is respectable. Otherwise, I will create the following binary features for every article - 

    - whether it contains a hedging word
    - whether it contains a question mark

In [78]:
final_df = pd.DataFrame()
final_df['claimEmbedding'] = unique_claims
for index in range(10):
    text = 'article' + str(index)
    final_df[text] = article_embeddings[index]

In [79]:
final_df.head()

Unnamed: 0,claimEmbedding,article0,article1,article2,article3,article4,article5,article6,article7,article8,article9
0,"[-0.125977, -0.0322266, -0.0568848, 0.220703, ...","[0.097168, 0.196289, -0.146484, 0.0849609, 0.1...","[0.0976563, -0.00927734, -0.267578, -0.28125, ...","[0.355469, 0.0201416, -0.065918, -0.0722656, 0...","[0.0888672, 0.00328064, 0.026001, -0.141602, -...","[0.065918, 0.0922852, -0.0952148, 0.0839844, 0...","[0.0556641, 0.378906, -0.179688, 0.330078, 0.0...","[0.251953, -0.191406, -0.121582, -0.00708008, ...","[0.0727539, -0.0927734, -0.174805, -0.022583, ...","[-0.00695801, 0.106445, -0.00830078, -0.023071...","[-0.0703125, -0.151367, -0.0932617, 0.0561523,..."
1,"[0.149414, 0.0654297, -0.0272217, -0.129883, 0...","[0.115234, -0.0805664, -0.078125, 0.120605, -0...","[-0.125977, 0.0253906, 0.166992, 0.550781, -0....","[-0.201172, -0.0349121, -0.0742188, 0.0581055,...",[],"[0.101563, -0.267578, -0.0922852, 0.182617, -0...","[0.0456543, -0.157227, 0.0820313, 0.171875, 0....","[0.0456543, -0.157227, 0.0820313, 0.171875, 0....","[0.0380859, 0.291016, 0.0522461, 0.269531, -0....","[0.00595093, 0.310547, 0.0605469, 0.12793, -0....","[-0.151367, -0.283203, 0.402344, -0.0976563, 0..."
2,"[0.078125, -0.0390625, -0.0766602, 0.416016, -...","[0.19043, -0.425781, -0.550781, 0.164063, -0.5...","[0.117676, 0.0532227, -0.324219, 0.0771484, 0....","[-0.090332, 0.0600586, -0.240234, 0.193359, -0...","[0.251953, -0.191406, -0.121582, -0.00708008, ...","[0.078125, -0.0390625, -0.0766602, 0.416016, -...","[-0.135742, 0.141602, -0.00756836, 0.143555, 0...","[-0.135742, 0.141602, -0.00756836, 0.143555, 0...","[0.00823975, -0.236328, 0.234375, -0.223633, -...","[-0.161133, 0.0786133, 0.0913086, -0.363281, -...","[0.0947266, 0.328125, -0.048584, -0.00665283, ..."
3,"[0.111816, 0.189453, 0.176758, -0.022583, 0.18...","[0.0351563, 0.208008, -0.417969, -0.165039, -0...","[0.111816, 0.189453, 0.176758, -0.022583, 0.18...","[-0.237305, -0.380859, -0.148438, 0.106445, 0....","[-0.0708008, -0.0664063, 0.0240479, -0.0500488...","[0.251953, -0.191406, -0.121582, -0.00708008, ...","[0.515625, 0.162109, -0.0263672, -0.0578613, -...","[-0.0610352, -0.314453, 0.150391, -0.0595703, ...","[-0.0610352, -0.314453, 0.150391, -0.0595703, ...","[-0.00958252, 0.371094, 0.0639648, 0.0922852, ...","[-0.0456543, 0.0908203, 0.0446777, 0.365234, -..."
4,"[0.0810547, -0.0400391, 0.050293, -0.157227, -...","[0.0429688, -0.0130615, 0.0703125, -0.161133, ...","[0.0810547, -0.0400391, 0.050293, -0.157227, -...","[0.0810547, -0.0400391, 0.050293, -0.157227, -...","[-0.24707, 0.00927734, 0.00787354, 0.176758, -...","[-0.0981445, -0.324219, -0.412109, -0.0517578,...","[0.121094, 0.145508, 0.145508, -0.206055, 0.04...","[0.0284424, -0.036377, -0.225586, 0.0322266, 0...","[0.0267334, -0.0908203, 0.027832, 0.204102, 0....","[0.0263672, 0.0300293, 0.032959, 0.245117, 0.1...","[-0.0334473, 0.0253906, -0.300781, 0.0356445, ..."


Excluding the veracity information (which I will read from the JSON files) this is ~85% of the dataframe I want. There are two problems now -

1. There are some articles for which the google model has no words. So for those, the word embedding vector is just empty. I thought I fixed this problem but I can solve this by just going through and replacing them with the means of the columns.

2. I had previously decided that for the claims with missing articles, I would just put those embeddings as a vector of zeros. On second thoughts, it might be a better idea to replace them with the means of the respective columns.

I will first replace the zero vectors, and the empty ones because the zero vectors will affect the column mean, which I do not want

EDIT: I am not sure how to remove the zero vectors. So I will just replace the empty ones with the column mean and see how that works

In [80]:
final_df = final_df.fillna(final_df.mean())
final_df[-5:]

Unnamed: 0,claimEmbedding,article0,article1,article2,article3,article4,article5,article6,article7,article8,article9
4673,"[0.12793, 0.00341797, 0.195313, 0.283203, -0.0...","[-0.273438, -0.134766, -0.225586, 0.0644531, 0...","[-0.236328, -0.0249023, -0.169922, -0.185547, ...","[0.0834961, 0.0649414, 0.155273, 0.166016, 0.2...","[-0.236328, -0.0249023, -0.169922, -0.185547, ...","[0.3125, -0.289063, -0.177734, 0.302734, 0.146...","[0.423828, -0.00717163, 0.105469, -0.147461, -...","[0.125, -0.124023, -0.137695, 0.130859, -0.394...","[-0.0383301, 0.12207, 0.168945, 0.0239258, -0....","[-0.0673828, -0.0981445, 0.363281, 0.10791, 0....","[-0.236328, -0.0249023, -0.169922, -0.185547, ..."
4674,"[-0.0446777, 0.0571289, 0.328125, 0.0629883, 0...","[0.220703, -0.0703125, -0.246094, -0.273438, -...","[-0.0922852, 0.203125, -0.186523, 0.15918, -0....","[0.0537109, 0.15625, -0.192383, 0.146484, -0.0...","[-0.0644531, 0.182617, 0.0639648, 0.0625, -0.0...","[-0.236328, -0.0249023, -0.169922, -0.185547, ...","[-0.0175781, 0.224609, -0.0683594, 0.0673828, ...","[0.125977, -0.18457, -0.304688, 0.0568848, -0....","[0.124023, 0.28125, -0.0272217, -0.02771, -0.0...","[-0.324219, 0.163086, -0.255859, 0.0495605, -0...","[-0.414063, 0.0153198, -0.123535, 0.251953, 0...."
4675,"[-0.0175781, 0.224609, -0.0683594, 0.0673828, ...","[0.172852, 0.283203, -0.28125, 0.417969, -0.07...","[0.0378418, 0.200195, 0.267578, 0.0088501, -0....","[0.0834961, 0.0649414, 0.155273, 0.166016, 0.2...","[-0.126953, 0.208984, -0.106445, 0.0471191, -0...","[0.109375, -0.0375977, -0.0693359, -0.149414, ...","[0.251953, -0.191406, -0.121582, -0.00708008, ...","[0.240234, 0.144531, -0.177734, -0.0297852, 0....","[0.00656128, 0.222656, -0.0292969, 0.179688, -...","[-0.105957, 0.213867, 0.118652, -0.0314941, -0...","[0.0529785, -0.0742188, -0.130859, 0.171875, 0..."
4676,"[0.245117, -0.0177002, -0.0634766, 0.128906, 0...","[0.0529785, -0.0742188, -0.130859, 0.171875, 0...","[0.0529785, -0.0742188, -0.130859, 0.171875, 0...","[-0.105957, 0.213867, 0.118652, -0.0314941, -0...","[0.117676, 0.0532227, -0.324219, 0.0771484, 0....","[0.376953, 0.175781, 0.169922, 0.197266, -0.00...","[0.138672, 0.119141, -0.359375, 0.235352, -0.0...","[-0.201172, -0.198242, -0.244141, -0.0471191, ...","[-0.0510254, 0.120605, -0.0125732, 0.0308838, ...","[-0.0585938, -0.0375977, 0.0727539, 0.108887, ...","[-0.0498047, -0.328125, -0.0415039, 0.271484, ..."
4677,"[0.0568848, 0.0742188, 0.0432129, -0.106934, -...","[0.155273, -0.318359, -0.133789, 0.0766602, -0...","[-0.209961, -0.0354004, -0.158203, 0.00357056,...",[],[],[],[],[],[],[],[]
