In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd
import numpy as np
import nltk
import string
import re
import math
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder



## Step 1 - Import pre-trained Word Embedding Model

In [2]:
google_model = gensim.models.KeyedVectors.load_word2vec_format('../../Downloads/GoogleNews-vectors-negative300.bin', binary = True)

In [3]:
google_model['nasa']

array([-0.07226562,  0.1171875 ,  0.16113281,  0.25976562, -0.14257812,
       -0.08837891,  0.16992188, -0.00640869, -0.15917969,  0.03112793,
       -0.11083984, -0.34179688, -0.1484375 ,  0.16796875, -0.07080078,
        0.26171875,  0.07617188,  0.19628906, -0.00476074,  0.21972656,
       -0.4140625 , -0.28320312,  0.11035156,  0.05517578, -0.1484375 ,
       -0.08984375,  0.05737305,  0.09716797, -0.19921875, -0.25      ,
        0.07617188,  0.05664062,  0.11279297, -0.11376953,  0.06982422,
       -0.48828125, -0.25585938,  0.18457031,  0.18457031, -0.32226562,
       -0.11865234, -0.03735352,  0.66015625,  0.3828125 ,  0.07910156,
        0.3046875 , -0.09716797,  0.14160156, -0.06591797,  0.23632812,
       -0.09277344,  0.01220703,  0.15527344,  0.25390625, -0.42773438,
        0.11425781, -0.15039062, -0.45117188,  0.36328125, -0.09277344,
       -0.22460938, -0.0559082 ,  0.1875    , -0.07275391,  0.18847656,
       -0.36132812,  0.12207031, -0.07080078,  0.19335938,  0.19

In [4]:
#This is the list of words for which the google model has a vector
google_vocab = google_model.vocab.keys()

## Step 2 - Find Word Embeddings

### 2.1- Embeddings of Claims

In [44]:
my_claims = pd.read_csv('Snopes_articles_with_predicted_stance.csv', index_col = 0)
my_claims.head()

Unnamed: 0,claim,articleHeadline,predictedStance
0,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Snopes.com,for
1,Meijer is offering 100 off Back to School coup...,Fake Meijer 100 back-to-school coupon goes vir...,against
2,Meijer is offering 100 off Back to School coup...,100 Meijer Coupon - Hoax - Trendolizer,observing
3,Meijer is offering 100 off Back to School coup...,100 OFF Meijer Coupon Deals April 2017 HotDeal...,against
4,Meijer is offering 100 off Back to School coup...,10 off 100 in Visa Gift Cards at Meijer - Freq...,for


In [6]:
#Takes in a string. Tokenizes it, removes any punctuation in it
#Does NOT change the case of the words though.
def preprocessing(text):
    if pd.isnull(text): #if article doesn't exist. Some claims have <10 articles.
        return ""
    for c in string.punctuation:
        text = text.replace(c,"")
    words = text.split()
    return words

In [7]:
#For a claim or an articleHeadline, this function will return the specified element of the word embedding for it
def calc_vector(current_text, previous_text, previous_vector):
    
    #if article doesn't exist. Some claims have <10 articles
    if pd.isnull(current_text):
        return []
    
    #as each claim is repeated many times, this will be efficient for the 2nd through 10th time that we are finding the vector 
    #for a particular claim
    if current_text==previous_text:
        return previous_vector
    
    #if this is the first time we are finding the vector for this claim
    words = preprocessing(current_text)
    
    text_vec = []
    for word in words:
        #checking if the word as written is in google model
        if word in google_vocab:
            text_vec =+ google_model[word]
        
        #checking if word in lowercase is in google model
        elif word.lower() in google_vocab:
            text_vec =+ google_model[word.lower()]
        
        #checking if word in uppercase is in google model
        elif word.upper() in google_vocab:
            text_vec =+ google_model[word.upper()]
        
        #checking if word in capital case (first character capitalized) is in model
        elif word.capitalize() in google_vocab:
            text_vec =+ google_model[word.capitalize()]
        
        #if not, just leave text_vec unchanged
        else:
            text_vec = text_vec
    
    #updating previous text and previous vector
    previous_text = current_text
    previous_vector = text_vec
    
    return text_vec

# Model Information

Column 0-299 - Word Embedding of claim

Columns 300-599 - Word embedding of article headline

Column 600 - Predicted stance (categorical)

Column 601 - Claim veracity

In [8]:
claim_columns = []
for index in range(300):
    claim_columns.append([])

In [9]:
previous_vector2 = ""
previous_claim2 = ""
for claim in my_claims['claim']:
    vector = calc_vector(claim, previous_claim2, previous_vector2)
    for index in range(300):
        if vector==[]:
            claim_columns[index].append("")
        else:
            claim_columns[index].append(vector[index])

The above cell takes the word embedding for each vector and puts in its own column. For example - claim_columns[0] has the first element of each claim's word embedding. claim_columns[1] has the second element, claim_columns[299] has the last element of each claim's word embedding.

These will be the first 300 columns of my dataframe

In [10]:
article_columns = []
for index in range(300):
    article_columns.append([])

In [11]:
previous_vector2 = ""
previous_claim2 = ""
for article in my_claims['articleHeadline']:
    vector = calc_vector(article, previous_claim2, previous_vector2)
    for index in range(300):
        if vector==[]:
            article_columns[index].append("")
        else:
            article_columns[index].append(vector[index])

In [12]:
final_df = pd.DataFrame()

#transferring claim_columns to this dataframe
for index in range(300):
    final_df['claim'+str(index)] = claim_columns[index]
    
#transferring article_columns
for index in range(300):
    final_df['article'+str(index)] = article_columns[index]

In [13]:
final_df = final_df.fillna(final_df.mean())

In [14]:
final_df.head(15)

Unnamed: 0,claim0,claim1,claim2,claim3,claim4,claim5,claim6,claim7,claim8,claim9,...,article290,article291,article292,article293,article294,article295,article296,article297,article298,article299
0,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.322266,0.00793457,-0.558594,-0.162109,0.416016,-0.057373,-0.117676,-0.0437012,-0.213867,0.0157471
1,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.0126343,-0.115234,-0.125,0.00674438,0.148438,-0.240234,0.050293,0.121582,-0.0727539,-0.0157471
2,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.139648,-0.147461,-0.722656,0.349609,0.097168,0.257812,-0.341797,-0.228516,0.0927734,-0.108887
3,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.0605469,-0.185547,-0.115234,0.0280762,0.106445,-0.265625,-0.209961,0.0786133,0.0966797,-0.125
4,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.139648,-0.0230713,-0.361328,0.273438,0.306641,-0.126953,-0.0202637,-0.135742,-0.0456543,0.0678711
5,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,-0.183594,0.0480957,-0.308594,0.0683594,0.170898,0.232422,0.0241699,-0.0600586,0.306641,-0.28125
6,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.273438,0.108887,-0.228516,-0.140625,-0.0167236,-0.25,0.632812,-0.117188,-0.15625,-0.196289
7,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.0639648,-0.0644531,-0.112305,0.108398,0.103027,-0.0424805,0.166992,0.131836,-0.00262451,-0.130859
8,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,-0.137695,0.0673828,-0.239258,-0.0947266,0.22168,-0.104004,0.211914,-0.116211,0.115723,-0.0527344
9,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.090332,-0.21582,-0.132812,-0.0203857,0.162109,-0.169922,-0.10498,-0.0206299,-0.0114136,-0.017334


In [15]:
predicted_stance = my_claims['predictedStance']
le = LabelEncoder()
predicted_stances = le.fit_transform(predicted_stance)

#against = 0, for = 1, observing = 2

In [16]:
predicted_stances[0:5]

array([1, 0, 2, 0, 1], dtype=int64)

In [17]:
final_df['predictedStance'] = predicted_stances
final_df.head()

Unnamed: 0,claim0,claim1,claim2,claim3,claim4,claim5,claim6,claim7,claim8,claim9,...,article291,article292,article293,article294,article295,article296,article297,article298,article299,predictedStance
0,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,0.00793457,-0.558594,-0.162109,0.416016,-0.057373,-0.117676,-0.0437012,-0.213867,0.0157471,1
1,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,-0.115234,-0.125,0.00674438,0.148438,-0.240234,0.050293,0.121582,-0.0727539,-0.0157471,0
2,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,-0.147461,-0.722656,0.349609,0.097168,0.257812,-0.341797,-0.228516,0.0927734,-0.108887,2
3,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,-0.185547,-0.115234,0.0280762,0.106445,-0.265625,-0.209961,0.0786133,0.0966797,-0.125,0
4,-0.125977,-0.0322266,-0.0568848,0.220703,-0.0192871,0.427734,-0.112305,0.0703125,-0.0712891,-0.0966797,...,-0.0230713,-0.361328,0.273438,0.306641,-0.126953,-0.0202637,-0.135742,-0.0456543,0.0678711,1


In [18]:
final_df.shape

(46772, 601)

Now the only thing left is to bring in the claim veracities from the Snopes JSON files.

My plan - 

1. First open my_claims_csv_cleaned.csv (because this has all the claims-article pairs) and then read the veracities into here. I will create a new column for this. I will save this as a new CSV (claims_with_veracities.csv)

2. The problem now is that Snopes_articles_with_predicted_stance.csv has about 100 fewer articles in it (some claims have more than 10 articles associated with it). So for each claim which is present in Snopes_articles_with_predicted_stance.csv, I will search for that claim in claims_with_veracities.csv and then read off the corresponding veracity.

In [19]:
import json
import glob

In [45]:
my_claims2 = pd.read_csv('my_claims_csv_cleaned.csv', index_col = 0)

In [46]:
veracity_dict = {}
claims_file = glob.glob('Snopes\*.json')
index = 1
for l in claims_file:
    item = json.loads(open(l).read())
    veracity_dict[index] = item['Credibility']
    index+=1

In [47]:
def label_veracity(row):
    index = row['claimId']
    return veracity_dict[index]

In [48]:
my_claims2['veracity'] = my_claims2.apply (lambda row: label_veracity (row),axis=1)

In [49]:
#There are four types of veracities - True, False, Mostly True, Mostly False

#Number of instances of each - True: 12450, False:33177, Mostly True:279, Mostly False: 914

#Because Mostly True and Mostly False are not that common, I am going to combine them into True and False. That is, I will 
#change Mostly True into True and Mostly False into False.

new_column = []
for v in my_claims2['veracity']:
    if v=="mostly false":
        new_column.append("False")
    elif v=="mostly true":
        new_column.append("True")
    else:
        new_column.append(v)
        
my_claims2['veracity'] = new_column

In [71]:
relevant_csv = my_claims2[['claimHeadline', 'veracity']]
relevant_csv.head()

Unnamed: 0,claimHeadline,veracity
0,Meijer is offering 100 off Back to School coup...,False
1,Meijer is offering 100 off Back to School coup...,False
2,Meijer is offering 100 off Back to School coup...,False
3,Meijer is offering 100 off Back to School coup...,False
4,Meijer is offering 100 off Back to School coup...,False


Now I just need to do step 2 - map the claims from relevant_csv to Snopes_articles_with_predicted_stance.csv