In [85]:
import pandas
import nltk

In [86]:
#nltk.download()
#Uncomment and run nltk.download() above if NLTK raises a LookupError for missing resources.

In [87]:
# Load the article bodies; index_col=False tells pandas not to use the first column as the index.
bodies = pandas.read_csv('./fn_data/train_bodies.csv', index_col = False)

In [88]:
def map_bodies(data, body_col):
    """
    Build a lookup from Body ID to article text.

    data: DataFrame
    body_col : string

    data represents a DataFrame containing Body IDs and actual text bodies.
    The Body ID is read from the first column (position 0).

    Ex:    Body ID                                        articleBody
    0           0  A small meteorite crashed into a wooded area i...
    1           4  Last week we hinted at what was to come as Ebo...
    2           5  (NEWSER) – Wonder how long a Quarter Pounder w...
    3           6  Posting photos of a gun-toting child online, I...
    4           7  At least 25 suspected Boko Haram insurgents we...
    5           8  There is so much fake stuff on the Internet in...
    6           9  (CNN) -- A meteorite crashed down in Managua, ...

    body_col is the name of the column containing article text bodies

    Returns: dictionary such that {Body ID : Body Text}. When a Body ID
    appears in more than one row, the text of the last such row is kept.
    """
    # Read the text from the column the caller named. The earlier version
    # always took position 1 and used body_col only to count rows, so any
    # frame whose text column was not second returned the wrong data.
    return dict(zip(data.iloc[:, 0], data[body_col]))

In [89]:
# Body ID -> raw article text.
bodydict = map_bodies(bodies, 'articleBody')

In [90]:
def tokenize_dict(dictionary):
    """
    dictionary : dictionary

    Accepts a mapping from Body ID to raw body text and builds a new
    dictionary from Body ID to the list of tokens that nltk.word_tokenize
    produces for that text. The input dictionary is not modified.
    """
    return {
        body_id: nltk.word_tokenize(text)
        for body_id, text in dictionary.items()
    }

In [91]:
# Tokenize from the raw texts instead of from `bodydict` itself so this cell
# can be re-run: feeding the already-tokenized dictionary back in would hand
# token lists (not strings) to nltk.word_tokenize.
bodydict = tokenize_dict(map_bodies(bodies, 'articleBody'))

In [92]:
# Spot check: POS-tag the entry stored under Body ID 0.
tagged = nltk.pos_tag(bodydict.get(0))

In [104]:
def tag_tokens(dictionary):
    """
    dictionary : dictionary

    Accepts a mapping from Body ID to a tokenized body and builds a new
    dictionary from Body ID to the result of nltk.pos_tag on those tokens.
    The input dictionary is not modified.
    """
    return {
        body_id: nltk.pos_tag(tokens)
        for body_id, tokens in dictionary.items()
    }

In [105]:
# Load the headline/stance rows; index_col=False tells pandas not to use the first column as the index.
headlines = pandas.read_csv('./fn_data/train_stances.csv', index_col = False)

In [106]:
# Key: value in column 1 of `headlines`; value: column 0 of the same row.
# A later row with the same key replaces the earlier one.
headline_dict = {
    headlines.iloc[row, 1]: headlines.iloc[row, 0]
    for row in range(len(headlines['Headline']))
}

In [107]:
# Spot check: the headline stored under key 712.
headline_dict.get(712)

'A Mass Grave Points to a Student Massacre in Mexico'

In [108]:
# Body ID -> POS-tagged tokens, built from the raw `bodies` frame in one pass.
tokenizeddict = tag_tokens(tokenize_dict(map_bodies(bodies, 'articleBody')))

In [109]:
# Pair each headline with the tagged body that shares its key
# (None when tokenizeddict has no entry for that key).
master_dict = {
    body_id: [headline, tokenizeddict.get(body_id)]
    for body_id, headline in headline_dict.items()
}

In [127]:
# Spot check: tokenize and POS-tag the headline stored under key 295.
headline_embedding = nltk.pos_tag(nltk.word_tokenize(master_dict.get(295)[0]))

In [150]:
def match_nouns(headline, body, noun_tags=("NN", "NNP", "NNS")):
    """
    Count the noun tokens of a tagged body that also occur as nouns in a headline.

    headline : string
    body : list of (token, tag) pairs, or None
    noun_tags : collection of POS tag strings that are treated as nouns

    The headline is tokenized and POS-tagged with nltk, and every headline
    token whose tag is in noun_tags is collected. Each (token, tag) pair in
    body whose tag is in noun_tags and whose token is exactly equal
    (case-sensitive) to one of the collected headline nouns adds one to the
    count, so a noun repeated in the body is counted each time it appears.

    The default noun_tags does not include "NNPS"; pass it explicitly to
    count that tag as well.

    Returns: integer count; 0 when body is None or empty.
    """
    # A missing body (e.g. the result of dict.get for an unknown Body ID)
    # cannot match anything; return early instead of failing on iteration.
    if not body:
        return 0

    noun_tags = frozenset(noun_tags)
    head_nouns = {
        token
        for token, tag in nltk.pos_tag(nltk.word_tokenize(headline))
        if tag in noun_tags
    }
    return sum(
        1 for token, tag in body
        if tag in noun_tags and token in head_nouns
    )

In [151]:
def complete_dictionary(bodydata, body_col, headlinedata):
    """
    Combine headlines, tagged bodies and noun-overlap counts per Body ID.

    bodydata : DataFrame
    body_col : string
    headlinedata : DataFrame

    bodydata and body_col are passed to map_bodies; the resulting texts are
    tokenized with tokenize_dict and POS-tagged with tag_tokens.
    headlinedata is read by position: column 0 is used as the headline and
    column 1 as the Body ID.

    Returns: dictionary such that
        {Body ID : [headline, tagged body tokens, noun match count]}
    where the count comes from match_nouns(headline, tagged body tokens).
    Entries are keyed by Body ID, so when several rows of headlinedata share
    a Body ID only the last such row's headline is kept.
    """
    tagged_bodies = tag_tokens(tokenize_dict(map_bodies(bodydata, body_col)))

    # Read from the headlinedata argument. The earlier version indexed the
    # notebook-level `headlines` frame here, so the parameter was ignored
    # whenever a different frame was passed in.
    headline_dict = dict(zip(headlinedata.iloc[:, 1], headlinedata.iloc[:, 0]))

    master_dict = dict()
    for body_id, headline in headline_dict.items():
        tagged_body = tagged_bodies.get(body_id)
        master_dict[body_id] = [
            headline,
            tagged_body,
            match_nouns(headline, tagged_body),
        ]

    return master_dict

In [154]:
# Build the Body ID -> [headline, tagged body, noun match count] mapping from the loaded frames.
final = complete_dictionary(bodies, 'articleBody', headlines)

In [155]:
# Inspect the record stored under key 154: [headline, tagged body tokens, noun match count].
print(final.get(154))

['’6 Days Darkness in December 2014′ 100% Fake; NASA Confirmed 3 Days Total Darkness Hoax as Well', [('Thousands', 'NNS'), ('of', 'IN'), ('people', 'NNS'), ('have', 'VBP'), ('been', 'VBN'), ('duped', 'VBN'), ('by', 'IN'), ('a', 'DT'), ('fake', 'JJ'), ('news', 'NN'), ('story', 'NN'), ('claiming', 'VBG'), ('that', 'IN'), ('Nasa', 'NNP'), ('has', 'VBZ'), ('forecast', 'VBN'), ('a', 'DT'), ('total', 'JJ'), ('blackout', 'NN'), ('of', 'IN'), ('earth', 'NN'), ('for', 'IN'), ('six', 'CD'), ('days', 'NNS'), ('in', 'IN'), ('December', 'NNP'), ('.', '.'), ('The', 'DT'), ('story', 'NN'), (',', ','), ('entitled', 'VBN'), ('``', '``'), ('Nasa', 'NNP'), ('Confirms', 'NNP'), ('Earth', 'NNP'), ('Will', 'NNP'), ('Experience', 'NNP'), ('6', 'CD'), ('Days', 'NNP'), ('of', 'IN'), ('Total', 'NNP'), ('Darkness', 'NNP'), ('in', 'IN'), ('December', 'NNP'), ('2014', 'CD'), ('!', '.'), ("''", "''"), ('originated', 'VBD'), ('from', 'IN'), ('Huzlers.com', 'NNP'), (',', ','), ('a', 'DT'), ('website', 'JJ'), ('well',

In [159]:
# Incorporating stance: key is column 1 of `headlines`, value is column 2 of
# the same row. A later row with the same key replaces the earlier one.
stance_dict = {
    headlines.iloc[row, 1]: headlines.iloc[row, 2]
    for row in range(len(headlines['Headline']))
}

In [161]:
# Three parallel lists, one entry per key of `final`, in its iteration order.
ids = list(final)
noun_count = [final.get(body_id)[2] for body_id in ids]
stance = [stance_dict.get(body_id) for body_id in ids]

In [171]:
# Assemble the parallel lists into one table, one column per list.
d = {
    'ID': ids,
    'Noun Matches': noun_count,
    'Stance': stance,
}
df = pandas.DataFrame(d)

In [172]:
# Show the assembled ID / Noun Matches / Stance table.
print(df)

        ID  Noun Matches     Stance
0      712             0  unrelated
1      158             0  unrelated
2      137             0  unrelated
3     1034             0  unrelated
4     1923             0  unrelated
5      154            22      agree
6      962             2  unrelated
7     2033             0  unrelated
8     1739             0  unrelated
9      882             3  unrelated
10    2327             0  unrelated
11    1468             9   disagree
12    1003             0  unrelated
13    2132             0      agree
14      47             0  unrelated
15     615             0  unrelated
16    2463             0  unrelated
17     295             1  unrelated
18     570             0  unrelated
19     608             0  unrelated
20    1500            19   disagree
21    1681             0  unrelated
22    1545             0  unrelated
23    1196             0  unrelated
24    1014             3      agree
25     633             0  unrelated
26      56             0  un

In [175]:
# Persist the table to disk (the DataFrame index is written as the first column).
df.to_csv('noun_count_vs_stance.csv')