In [85]:
import pandas
import nltk

In [86]:
#nltk.download()
#Uncomment and run nltk.download() above if NLTK reports missing resources (e.g. tokenizer or tagger data).

In [87]:
# Load the article bodies CSV; index_col=False tells pandas not to use the first column as the index.
bodies = pandas.read_csv('./fn_data/train_bodies.csv', index_col = False)

In [88]:
def map_bodies(data, body_col):
    """
    Build a lookup from Body ID to article text.

    data : DataFrame
        Frame whose FIRST column holds the Body IDs and which has a column
        named `body_col` holding the article text.

        Ex:    Body ID                                        articleBody
        0           0  A small meteorite crashed into a wooded area i...
        1           4  Last week we hinted at what was to come as Ebo...
        2           5  (NEWSER) – Wonder how long a Quarter Pounder w...
        3           6  Posting photos of a gun-toting child online, I...
        4           7  At least 25 suspected Boko Haram insurgents we...
        5           8  There is so much fake stuff on the Internet in...
        6           9  (CNN) -- A meteorite crashed down in Managua, ...

    body_col : string
        Name of the column containing the article text bodies.

    Returns: dictionary such that {Body ID : Body Text}. If a Body ID occurs
    on more than one row, the text from the last such row is kept.

    Raises: KeyError if `body_col` is not a column of `data`.
    """
    # Look the text column up by name first so a bad `body_col` still surfaces
    # as a KeyError. Previously `body_col` was only used to count rows and the
    # text was always read from the second column, whatever name was passed.
    texts = data[body_col]
    # Body IDs are still taken positionally from the first column.
    body_ids = data.iloc[:, 0]
    # A single zip over the two columns replaces one scalar .iloc lookup per cell.
    return dict(zip(body_ids, texts))

In [89]:
# Build the {Body ID: article text} lookup from the loaded bodies frame.
bodydict = map_bodies(bodies, 'articleBody')

In [90]:
def tokenize_dict(dictionary):
    """
    Tokenize every article body in a Body ID -> Body mapping.

    dictionary : dictionary
        Maps each Body ID to its raw body text.

    Returns a new dictionary with the same keys (in the same order) whose
    values are the token lists produced by nltk.word_tokenize for each body.
    The input dictionary is left unmodified.
    """
    return {
        body_id: nltk.word_tokenize(body_text)
        for body_id, body_text in dictionary.items()
    }

In [91]:
# Tokenize starting from the raw `bodies` frame instead of from `bodydict`
# itself: the previous self-referential form (bodydict = tokenize_dict(bodydict))
# was not idempotent, so re-running this cell passed already-tokenized lists
# back into the tokenizer.
bodydict = tokenize_dict(map_bodies(bodies, 'articleBody'))

In [92]:
# POS-tag the token list stored under Body ID 0.
tagged = nltk.pos_tag(bodydict.get(0))

In [104]:
def tag_tokens(dictionary):
    """
    POS-tag every tokenized body in a Body ID -> tokens mapping.

    dictionary : dictionary
        Maps each Body ID to its tokenized body (a list of tokens).

    Returns a new dictionary with the same keys (in the same order) whose
    values are the results of nltk.pos_tag on each token list. The input
    dictionary is left unmodified.
    """
    return {
        body_id: nltk.pos_tag(tokens)
        for body_id, tokens in dictionary.items()
    }

In [105]:
# Load the stances CSV; index_col=False tells pandas not to use the first column as the index.
headlines = pandas.read_csv('./fn_data/train_stances.csv', index_col = False)

In [106]:
# Map the value in column 1 of each row (used as the Body ID key) to the value
# in column 0 (the headline text).
# NOTE: dictionary keys are unique, so if several rows share the same key only
# the headline from the last such row is kept.
headline_dict = {
    headlines.iloc[row, 1]: headlines.iloc[row, 0]
    for row in range(len(headlines['Headline']))
}

In [107]:
# Spot-check: show the headline stored under key 712.
headline_dict.get(712)

'A Mass Grave Points to a Student Massacre in Mexico'

In [108]:
# Full body pipeline from the raw frame: Body ID -> text -> tokens -> POS-tagged tokens.
# Despite its name, `tokenizeddict` holds the tagged output of tag_tokens.
tokenizeddict = tag_tokens(tokenize_dict(map_bodies(bodies, 'articleBody')))

In [109]:
# Combine both lookups by key: {Body ID: [headline, tagged body tokens]}.
# The second list element is None when tokenizeddict has no entry for the key.
master_dict = {
    body_id: [headline, tokenizeddict.get(body_id)]
    for body_id, headline in headline_dict.items()
}

In [110]:
# Inspect the [headline, tagged body tokens] entry stored under key 0.
master_dict.get(0)

['Italian fisherman catches monstrous 280-pound catfish',
 [('A', 'DT'),
  ('small', 'JJ'),
  ('meteorite', 'NN'),
  ('crashed', 'VBN'),
  ('into', 'IN'),
  ('a', 'DT'),
  ('wooded', 'JJ'),
  ('area', 'NN'),
  ('in', 'IN'),
  ('Nicaragua', 'NNP'),
  ("'s", 'POS'),
  ('capital', 'NN'),
  ('of', 'IN'),
  ('Managua', 'NNP'),
  ('overnight', 'NN'),
  (',', ','),
  ('the', 'DT'),
  ('government', 'NN'),
  ('said', 'VBD'),
  ('Sunday', 'NNP'),
  ('.', '.'),
  ('Residents', 'NNP'),
  ('reported', 'VBD'),
  ('hearing', 'VBG'),
  ('a', 'DT'),
  ('mysterious', 'JJ'),
  ('boom', 'NN'),
  ('that', 'WDT'),
  ('left', 'VBD'),
  ('a', 'DT'),
  ('16-foot', 'JJ'),
  ('deep', 'JJ'),
  ('crater', 'NN'),
  ('near', 'IN'),
  ('the', 'DT'),
  ('city', 'NN'),
  ("'s", 'POS'),
  ('airport', 'NN'),
  (',', ','),
  ('the', 'DT'),
  ('Associated', 'NNP'),
  ('Press', 'NNP'),
  ('reports', 'NNS'),
  ('.', '.'),
  ('Government', 'NNP'),
  ('spokeswoman', 'NN'),
  ('Rosario', 'NNP'),
  ('Murillo', 'NNP'),
  ('said'

In [114]:
# Tokenize and POS-tag the headline stored under key 0.
# Despite the name, this is the output of nltk.pos_tag, not a vector embedding.
headline_embedding = nltk.pos_tag(nltk.word_tokenize(master_dict.get(0)[0]))

In [120]:
# Collect the distinct headline tokens whose tag is exactly "NN"
# (other tags such as "NNS" or "NNP" do not match this equality test).
headline_nouns = {token for token, tag in headline_embedding if tag == "NN"}

In [126]:
# Print every (body noun, headline noun) combination for the entry under key 0:
# for each headline noun, walk the tagged body tokens and emit those tagged "NN".
for headline_noun in headline_nouns:
    for body_token, body_tag in master_dict.get(0)[1]:
        if body_tag != "NN":
            continue
        print(body_token, headline_noun)

meteorite catfish
area catfish
capital catfish
overnight catfish
government catfish
boom catfish
crater catfish
city catfish
airport catfish
spokeswoman catfish
committee catfish
government catfish
event catfish
meteorite catfish
asteroid catfish
asteroid catfish
diameter catfish
Earth catfish
weekend catfish
crater catfish
meteorite catfish
radius catfish
depth catfish
volcanologist catfish
committee catfish
meteorite catfish
meteorite catfish
asteroid catfish
planet catfish
night catfish
rock catfish
adviser catfish
one catfish
streak catfish
anyone catfish
photo catfish
something catfish
loud catfish
boom catfish
night catfish
anything catfish
sky catfish
porch catfish
nothing catfish
blast catfish
bomb catfish
wave catfish
site catfish
crater catfish
airport catfish
air catfish
force catfish
base catfish
state catfish
meteorite fisherman
area fisherman
capital fisherman
overnight fisherman
government fisherman
boom fisherman
crater fisherman
city fisherman
airport fisherman
spokesw