In [65]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import math
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Step 1 - Import pre-trained Word Embedding Model

In [2]:
# Load the pre-trained GoogleNews word2vec vectors from the binary word2vec file.
google_model = KeyedVectors.load_word2vec_format(
    '../../Downloads/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Sanity check: look up the vector stored for the token 'nasa' to confirm the model loaded.
google_model['nasa']

In [4]:
#This is the collection of words for which the google model has a vector.
#gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`, so prefer the
#new attribute when it exists and fall back to the old one otherwise. Either way the
#result is a dict keys view, which supports fast `word in google_vocab` checks.
google_vocab = (
    google_model.key_to_index
    if hasattr(google_model, "key_to_index")
    else google_model.vocab
).keys()

## Step 2 - Find Word Embeddings

### 2.1 Word Embeddings of Claims

In [53]:
# Read the Snopes claim/article table, using the first CSV column as the index,
# and preview the first rows.
my_claims = pd.read_csv(
    'Snopes_articles_with_predicted_stance.csv',
    index_col=0,
)
my_claims.head()

Unnamed: 0,claim,articleHeadline,predictedStance
0,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Snopes.com,against
1,Meijer is offering 100 off Back to School coup...,Fake Meijer 100 back-to-school coupon goes vir...,against
2,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Hoax - Trendolizer,against
3,Meijer is offering 100 off Back to School coup...,100 OFF Meijer Coupon Deals April 2017 HotDeal...,against
4,Meijer is offering 100 off Back to School coup...,10 off 100 in Visa Gift Cards at Meijer - Freq...,against


In [54]:
#Takes in a string. Removes any punctuation in it, then splits it on whitespace.
#Does NOT change the case of the words though.
def preprocessing(text):
    """Strip ASCII punctuation from ``text`` and split it into words.

    Parameters
    ----------
    text : str or missing value
        The claim or headline to process. ``None`` / ``NaN`` mean the article
        doesn't exist (some claims have <10 articles).

    Returns
    -------
    list of str
        The whitespace-separated words with every ``string.punctuation``
        character deleted. Case is left untouched. A missing ``text`` gives an
        empty list, so the return type is always a list.

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so a hyphenated
    word such as ``back-to-school`` becomes the single token ``backtoschool``.
    """
    if pd.isnull(text):
        # Previously returned "" here; an empty list keeps the return type consistent.
        return []
    for c in string.punctuation:
        text = text.replace(c, "")
    return text.split()

In [55]:
#For a claim or an articleHeadline, this function will return the word embedding for it
def calc_vector(current_text, previous_text, previous_vector):
    """Return the embedding of ``current_text`` as the sum of its word vectors.

    Parameters
    ----------
    current_text : str or missing value
        The claim or headline to embed.
    previous_text : str
        The text embedded on the previous call, as tracked by the caller.
    previous_vector : array-like
        The vector that was returned for ``previous_text``.

    Returns
    -------
    numpy.ndarray or the ``previous_vector`` object
        * a zero vector of length 300 when ``current_text`` is missing
          (some claims have <10 articles);
        * ``previous_vector`` itself (not a copy) when ``current_text`` equals
          ``previous_text``, so repeated claims are not recomputed;
        * otherwise the sum of the ``google_model`` vectors of every word that
          can be found in ``google_vocab``. A text with no known word gives a
          zero vector.

    Notes
    -----
    Each word is looked up as written, then lowercased, uppercased and
    capitalized; the first variant present in the vocabulary is used.

    The function cannot update the caller's ``previous_text`` /
    ``previous_vector`` (rebinding parameters is local to the call), so the
    caller must remember the last text and vector and pass them back in.
    """
    #if article doesn't exist. Some claims have <10 articles
    if pd.isnull(current_text):
        return np.zeros(300)

    #as each claim is repeated many times, reuse the vector computed last time
    if current_text == previous_text:
        return previous_vector

    # Start from a zero vector so the result is always a length-300 array,
    # even if none of the words are in the vocabulary.
    # 300 is hard-coded here; it presumably matches the loaded model's vector size -- TODO confirm.
    text_vec = np.zeros(300)
    for word in preprocessing(current_text):
        # Try the word as written, then lower / upper / capitalized forms.
        for candidate in (word, word.lower(), word.upper(), word.capitalize()):
            if candidate in google_vocab:
                # `+=` accumulates; the original `=+` overwrote text_vec with
                # the latest word's vector instead of adding to it.
                text_vec += google_model[candidate]
                break

    return text_vec

In [56]:
claim_word_embeddings = [] #This will be the first column in the new dataframe. 'claimWordEmbedding'
previous_claim = ''
previous_vector = []
for claim in my_claims['claim']:
    claim_vector = calc_vector(claim, previous_claim, previous_vector)
    claim_word_embeddings.append(claim_vector)
    # Remember the last claim and its vector so the next call can reuse the result:
    # calc_vector cannot update these names itself, and consecutive rows repeat the claim.
    previous_claim, previous_vector = claim, claim_vector

In [58]:
# Number of claim embeddings produced by the loop above (one per value in my_claims['claim']).
len(claim_word_embeddings)

46754

I now have the word embeddings for the claims. Next, I need the word embeddings for the article headlines.

### 2.2 Word Embeddings of Article Headlines

In [59]:
# Read the same Snopes table again, this time as `df`, for the article-headline embeddings,
# and preview the first rows.
df = pd.read_csv(
    'Snopes_articles_with_predicted_stance.csv',
    index_col=0,
)
df.head()

Unnamed: 0,claim,articleHeadline,predictedStance
0,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Snopes.com,against
1,Meijer is offering 100 off Back to School coup...,Fake Meijer 100 back-to-school coupon goes vir...,against
2,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Hoax - Trendolizer,against
3,Meijer is offering 100 off Back to School coup...,100 OFF Meijer Coupon Deals April 2017 HotDeal...,against
4,Meijer is offering 100 off Back to School coup...,10 off 100 in Visa Gift Cards at Meijer - Freq...,against


In [98]:
#article_embeddings is a list of 10 lists of numpy arrays:
#article_embeddings[k] collects the embedding of every row whose position i satisfies i % 10 == k.
#NOTE(review): this assumes each claim occupies exactly 10 consecutive rows -- confirm,
#since the row count is not a multiple of 10.
article_embeddings = [[] for _ in range(10)]
articles = df['articleHeadline']
multiples = math.floor(df.shape[0]/10)  # not used in this cell

previous_claim2 = ""
previous_vector2 = []

# Iterate by position rather than looking up articles[i] by index label, so the
# loop does not depend on the DataFrame index being exactly 0..n-1.
for i, headline in enumerate(articles):
    current = calc_vector(headline, previous_claim2, previous_vector2)
    article_embeddings[i % 10].append(current)
    # Keep the cache arguments in step with the row just processed.
    previous_claim2, previous_vector2 = headline, current

In [99]:
# Show how many embeddings ended up in each of the ten buckets.
for bucket in article_embeddings:
    print(len(bucket))

4676
4676
4676
4676
4675
4675
4675
4675
4675
4675


Now, article_embeddings is a list of ten lists. For example, article_embeddings[5] holds the embedding of the sixth article (index 5) for every claim. Because the number of articles is not a multiple of 10, the lists are currently not all the same size, so I will append dummy (all-zero) vectors to the shorter lists to make them even.

In [100]:
# Pad every bucket with zero vectors up to the length of the longest bucket.
# Computing the target length (instead of hard-coding which buckets are short)
# keeps this cell idempotent: re-running it adds nothing once the lists are even.
target_len = max(len(bucket) for bucket in article_embeddings)
for bucket in article_embeddings:
    bucket.extend(np.zeros(300) for _ in range(target_len - len(bucket)))

for bucket in article_embeddings:
    print(len(bucket))

4676
4676
4676
4676
4676
4676
4676
4676
4676
4676


Now they are even.

At this point, I have all the word embeddings I need. I will try building an XGBoost model just with the word embeddings and see if it is respectable. Otherwise, I will create the following binary features for every article - 

    - whether it contains a hedging word
    - whether it contains a question mark