In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)



## Step 1 - Import pre-trained Word Embedding Model

In [2]:
# Load the pre-trained word2vec vectors from the binary GoogleNews file.
# The path is relative to the notebook's working directory.
WORD2VEC_BIN_PATH = '../../Downloads/GoogleNews-vectors-negative300.bin'
google_model = KeyedVectors.load_word2vec_format(WORD2VEC_BIN_PATH, binary=True)

In [None]:
# Look up the vector the model stores for the token 'nasa' and display it
# (presumably a spot check that the model loaded; this cell has no execution count).
google_model['nasa']

In [27]:
#These are the words (a dict keys view, not a list) for which the google model has a vector.
#gensim 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`, so use whichever
#attribute this installation provides; the cell then runs on both gensim 3.x and 4.x.
if hasattr(google_model, 'key_to_index'):
    google_vocab = google_model.key_to_index.keys()
else:
    google_vocab = google_model.vocab.keys()

## Step 2 - Find Word Embeddings

In [7]:
# Read the cleaned claims table; the CSV column at position 1 becomes the index.
CLAIMS_CSV_PATH = 'my_claims_csv_cleaned.csv'
my_claims = pd.read_csv(CLAIMS_CSV_PATH, index_col=1)
# Preview the first rows.
my_claims.head()

Unnamed: 0.1,Unnamed: 0,claimId,claimTruthiness,claimHeadline,articleId,articleVersion,articleHeadline,articleHeadlineStance
0,0,1,unknown,Meijer is offering 100 off Back to School coup...,0,1,100 Meijer Coupon - Snopes.com,for
1,1,1,unknown,Meijer is offering 100 off Back to School coup...,1,1,Fake Meijer 100 back-to-school coupon goes vir...,for
2,2,1,unknown,Meijer is offering 100 off Back to School coup...,2,1,100 Meijer Coupon - Hoax - Trendolizer,for
3,3,1,unknown,Meijer is offering 100 off Back to School coup...,3,1,100 OFF Meijer Coupon Deals April 2017 HotDeal...,for
4,4,1,unknown,Meijer is offering 100 off Back to School coup...,4,1,10 off 100 in Visa Gift Cards at Meijer - Freq...,for


In [54]:
#Strips punctuation from a string and splits it into whitespace-separated tokens.
#Letter case is left exactly as written.
def preprocessing(text):
    """Return the tokens of `text` after deleting every `string.punctuation` character.

    Punctuation is deleted, not replaced by a space, so characters on either
    side of a punctuation mark are joined ("back-to-school" -> "backtoschool").
    Tokens are then produced by splitting on runs of whitespace.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)
    return text.translate(drop_punctuation).split()

In [55]:
#For a claim or an articleHeadline, this function will return the word embedding for it
def calc_vector(current_text, previous_text, previous_vector):
    """Return the embedding of `current_text` as the sum of its word vectors.

    Parameters
    ----------
    current_text : str
        The claim or article headline to embed.
    previous_text : str
        The text that was embedded on the previous call.
    previous_vector :
        The value returned for `previous_text`.

    Returns
    -------
    `previous_vector` itself (the same object) when `current_text` equals
    `previous_text`; otherwise the element-wise sum of the `google_model`
    vectors of every token found in `google_vocab`. If no token is found,
    an empty list is returned.

    Notes
    -----
    This function cannot update the caller's `previous_text` /
    `previous_vector` variables: the caller must pass the last text and the
    last returned vector on each call for the shortcut to take effect.
    """
    #as each claim is repeated many times, reuse the vector that was already
    #computed when the text is the same as on the previous call
    if current_text == previous_text:
        return previous_vector

    word_vectors = []
    for word in preprocessing(current_text):
        #try the word as written, then lowercase, uppercase and capitalized
        #(first character upper, rest lower); the first spelling found in the
        #model is used and words found under no spelling are skipped
        for candidate in (word, word.lower(), word.upper(), word.capitalize()):
            if candidate in google_vocab:
                word_vectors.append(google_model[candidate])
                break

    #no token had a vector: keep the historical "empty list" result
    if not word_vectors:
        return []

    #sum the collected vectors (the former `text_vec =+ vec` parsed as
    #`text_vec = +vec` and therefore kept only the last matched word)
    return np.sum(word_vectors, axis=0)

In [56]:
claim_word_embeddings = [] #This will be the first column in the new dataframe. 'claimWordEmbedding'
previous_claim = ''
previous_vector = []
for claim in my_claims['claimHeadline']:
    current_vector = calc_vector(claim, previous_claim, previous_vector)
    claim_word_embeddings.append(current_vector)
    #calc_vector cannot rebind these names for us, so remember the last claim
    #and its vector here; without this the repeated-claim shortcut never fires
    previous_claim, previous_vector = claim, current_vector

In [57]:
# Number of embeddings collected: the loop above appends one per entry of my_claims['claimHeadline'].
len(claim_word_embeddings)

46820

I now have the word embeddings for the claims. Next, I need to compute the word embeddings for the article headlines.