# Test anchor detection on annotated data
We've annotated a set of tweets (generated [here](determine_anchor_role.ipynb#Annotate-anchors), annotated [here](https://docs.google.com/spreadsheets/d/1_1vwu_UgVlSgIZ09pvKkqK-wGBaK1zhb-9k-k7T5dyA/edit?usp=sharing)) with two English speakers. Let's try to detect anchoring using the following strategies:

- state descriptor (LOC, STATE)
- description phrase (LOC + DESCRIPTOR)

We will test the overall precision and recall.

We will also test the effect of adding the following two detection strategies:

- compound descriptor (LOC + NOUN + DESCRIPTOR)
- conjunction descriptor (LOC + LOC + DESCRIPTOR)

In [1]:
import numpy as np
import pandas as pd

In [None]:
## load importance data for anchors
import pickle
## add importance vars
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a lookup file from a trusted source.
# The context manager closes the file handle as soon as the lookup is read.
with open('/hg190/corpora/GeoNames/allCountriesSimplified_lookup_US.pickle', 'rb') as geonames_file:
    geonames_data = pickle.load(geonames_file)
# for every name in the lookup keep the largest value over all of its entries
geonames_max_pop = {k : v.loc[:, 'population'].max() for k,v in geonames_data.items()}
geonames_max_alt_names = {k : v.loc[:, 'alternate_name_count'].max() for k,v in geonames_data.items()}
# test
print(geonames_max_pop['san juan'])
print(geonames_max_alt_names['san juan'])

In [173]:
## TODO: fix NEs in annotations => capital and spaced to fit parse matching...UGH!!
# load data
annotated_data = pd.read_csv('../../data/mined_tweets/NE_twitter_anchor_sample_annotated.tsv', sep='\t', index_col=False)
display(annotated_data.head())
agree_cols = ['state_gold_agreement', 'descriptor_gold_agreement']
# a row only counts as agreed if both agreement flags are set (row-wise minimum)
annotated_data = annotated_data.assign(**{'all_gold_agreement' : annotated_data.loc[:, agree_cols].min(axis=1)})
print('%d/%d agreement'%(annotated_data.loc[:, 'all_gold_agreement'].sum(), annotated_data.shape[0]))
print(annotated_data.loc[:, agree_cols].sum(axis=0))
annotated_data_agree = annotated_data[annotated_data.loc[:, 'all_gold_agreement']==1]
print('%d/%d agree'%(annotated_data_agree.shape[0], annotated_data.shape[0]))
# add gold markers: one label per anchor type, taken as the minimum over the two annotators
# (presumably both annotator labels are identical on the agreed rows kept here)
annotated_data_agree = annotated_data_agree.assign(**{
    'state_gold' : annotated_data_agree.loc[:, ['state_gold_1', 'state_gold_2']].min(axis=1),
    'descriptor_gold' : annotated_data_agree.loc[:, ['descriptor_gold_1', 'descriptor_gold_2']].min(axis=1),
})

# gold_cols was never defined in this notebook (NameError on a fresh kernel);
# a NE is a gold anchor if either anchor type is marked
gold_cols = ['state_gold', 'descriptor_gold']
annotated_data_agree = annotated_data_agree.assign(**{
    'all_gold' : annotated_data_agree.loc[:, gold_cols].max(axis=1)
})
# need valid loc marker for later
annotated_data_agree = annotated_data_agree.assign(**{'valid_loc':1})

Unnamed: 0,data_name,NE,txt,state_gold_1,descriptor_gold_1,state_gold_2,descriptor_gold_2,state_gold_agreement,descriptor_gold_agreement,id
0,florence,mayfair,Homes in the Mayfair neighborhood of Lumberton...,0,1,0,1,1,1,1041724904327061504
1,harvey,yale,"VIDEO: I-10 at Yale, The Heights, Houston (res...",0,1,0,1,1,1,901931817862758400
2,florence,new_bern,RT @EdValleeWx: Our models specifically used f...,1,0,1,0,1,1,1039450960312102912
3,florence,wilmington,RT @WMO: Hurricane #Florence is likely to make...,1,0,1,0,1,1,1039621551052791808
4,florence,cape_fear,RT @ABC: LATEST: Hurricane #Florence a Categor...,1,0,1,0,1,1,1039714656217128960


199/226 agreement
state_gold_agreement         218
descriptor_gold_agreement    206
dtype: int64
199/226 agree


Load the text data, because we need the complete list of valid NEs for each status.

In [116]:
# from ast import literal_eval
txt_data = pd.read_csv('../../data/mined_tweets/combined_tweet_tag_data_NE_flat.gz', sep='\t', index_col=False, compression='gzip')
# drop rows whose NE is a float NaN
txt_data = txt_data.loc[~txt_data.loc[:, 'NE'].apply(lambda ne: type(ne) is float and np.isnan(ne))]
# keep only the rows flagged as valid locations
txt_data = txt_data.loc[txt_data.loc[:, 'valid_loc'] == 1]
# use the corrected data name as the data name
txt_data = txt_data.rename(columns={'data_name_fixed':'data_name'})
# disabled alternative: load stored parses and combine them into one row per id
# parsed_data = pd.read_csv('../../data/mined_tweets/combined_tweet_NE_flat_data_parsed.gz', sep='\t', index_col=False, compression='gzip', converters={'parse' : lambda x: literal_eval(x), 'id' : np.int64})
# # combine parse data!! one row per id
# parsed_data = pd.concat([pd.Series([i, list(x.loc[:, 'parse'].values)], index=['id', 'parse']) for i,x in parsed_data.groupby('id')], axis=1).transpose()
display(txt_data.head())
# display(parsed_data.head())

Unnamed: 0,id,txt,data_name,username,date,lang,NE,NE_type,NE_LOC,valid_loc,NE_fixed,has_descriptor,NE_fixed_clean,max_population,max_alternate_name_count,max_population_anchor,max_population_diff,max_alternate_name_count_anchor,max_alternate_name_count_diff
17,899098735367647232,Tropical Depression #Harvey is 1543 miles SSE ...,harvey,"#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_...",2017-08-19 22:40:00,en,Raleigh,LOCATION,True,True,raleigh,False,raleigh,451066.0,43.0,False,0.0,False,0.0
31,899258634223353856,"#Harvey , #Illinois #firefighters ' #pension o...",harvey,"#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_...",2017-08-20 09:15:00,en,Chicago,LOCATION,True,True,chicago,False,chicago,2720546.0,70.0,False,0.0,False,0.0
33,899272853954101249,"Harvey's. 2380 Wyandotte Street West, Windsor,...",harvey,"#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_...",2017-08-20 10:12:00,en,Windsor,LOCATION,True,True,windsor,False,windsor,28778.0,16.0,False,0.0,False,0.0
49,899425632215588864,NHC_Atlantic: #Harvey 's remnants are likely t...,harvey,"#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_...",2017-08-20 20:19:00,en,Bay,LOCATION,True,True,bay,False,bay,15402.0,15.0,False,0.0,False,0.0
55,899443698693529601,@NHC_Atlantic #Harvey 's remnants are likely t...,harvey,"#Irma,#HurricaneIrma,#Harvey,#HurricaneHarvey_...",2017-08-20 21:31:00,en,Bay,LOCATION,True,True,bay,False,bay,15402.0,15.0,False,0.0,False,0.0


In [174]:
# only statuses that were annotated are needed from the text data
txt_data_relevant = txt_data[txt_data.loc[:, 'id'].isin(annotated_data.loc[:, 'id'].unique())]
annotated_data_full = []
# iteratively build full data to avoid copying
# annotated NEs from txt data
for id_i, data_i in annotated_data_agree.groupby('id'):
    txt_data_i = txt_data_relevant[txt_data_relevant.loc[:, 'id']==id_i]
    # compare on the fixed NE form, exposed under the same column name as the annotations
    txt_data_i = txt_data_i.drop('NE', axis=1).rename(columns={'NE_fixed' : 'NE'})
    # keep only the NEs of this status that were not annotated
    txt_data_i = txt_data_i[~txt_data_i.loc[:, 'NE'].isin(data_i.loc[:, 'NE'].values)]
    # the two frames have different columns; sort=False avoids the pandas
    # FutureWarning about implicit column sorting (columns are re-selected below anyway)
    data_i = pd.concat([data_i, txt_data_i], axis=0, sort=False)
    # un-annotated NEs have no gold/agreement values => 0
    data_i = data_i.fillna(0)
    data_i = data_i.loc[:, annotated_data_agree.columns]
    annotated_data_full.append(data_i)
annotated_data_full = pd.concat(annotated_data_full, axis=0)
display(annotated_data_full.head())

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


Unnamed: 0,data_name,NE,txt,state_gold_1,descriptor_gold_1,state_gold_2,descriptor_gold_2,state_gold_agreement,descriptor_gold_agreement,id,all_gold_agreement,state_gold,descriptor_gold,all_gold,valid_loc
49,harvey,corpus_christi,The first Hurricane Watches for this part of #...,0.0,0.0,0.0,0.0,1.0,1.0,900454330544926720,1.0,0.0,0.0,0.0,1
1542,harvey,houston,The first Hurricane Watches for this part of #...,0.0,0.0,0.0,0.0,0.0,0.0,900454330544926720,0.0,0.0,0.0,0.0,1
50,harvey,rockport,"As of 10 a.m., the modeling has #Harvey making...",0.0,0.0,0.0,0.0,1.0,1.0,900765014142976000,1.0,0.0,0.0,0.0,1
4309,harvey,corpus_christi,"As of 10 a.m., the modeling has #Harvey making...",0.0,0.0,0.0,0.0,0.0,0.0,900765014142976000,0.0,0.0,0.0,0.0,1
51,harvey,brownsville,The eye of #HurricaneHarvey is now showing on ...,1.0,0.0,1.0,0.0,1.0,1.0,900894782763012096,1.0,1.0,0.0,1.0,1


Let's parse the data ourselves.

In [201]:
from importlib import reload
import parse_twitter_data
reload(parse_twitter_data)
from parse_twitter_data import parse_data
# parse every status once: a single (id, txt) row per id goes to the parser
annotated_data_parsed = parse_data(annotated_data_full.loc[:, ['id', 'txt']].drop_duplicates('id'))
# gather all parse rows that belong to the same id into one list per id
annotated_data_parsed = pd.DataFrame([
    [status_id, list(status_parses.loc[:, 'parse'].values)]
    for status_id, status_parses in annotated_data_parsed.groupby('id')
])
annotated_data_parsed.columns = ['id', 'parse']
# attach the combined parse to every NE row of the same status
annotated_data_parsed = pd.merge(annotated_data_parsed, annotated_data_full, on='id')
display(annotated_data_parsed.head())

Unnamed: 0,id,parse,data_name,NE,txt,state_gold_1,descriptor_gold_1,state_gold_2,descriptor_gold_2,state_gold_agreement,descriptor_gold_agreement,all_gold_agreement,state_gold,descriptor_gold,all_gold,valid_loc
0,900454330544926720,"[[[The, DET, 3, det, 0], [first, ADJ, 3, amod,...",harvey,corpus_christi,The first Hurricane Watches for this part of #...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1
1,900454330544926720,"[[[The, DET, 3, det, 0], [first, ADJ, 3, amod,...",harvey,houston,The first Hurricane Watches for this part of #...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,900765014142976000,"[[[As, ADP, 0, ROOT, 0], [of, ADP, 0, prep, 1]...",harvey,rockport,"As of 10 a.m., the modeling has #Harvey making...",0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1
3,900765014142976000,"[[[As, ADP, 0, ROOT, 0], [of, ADP, 0, prep, 1]...",harvey,corpus_christi,"As of 10 a.m., the modeling has #Harvey making...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,900894782763012096,"[[[The, DET, 1, det, 0], [eye, NOUN, 6, nsubj,...",harvey,brownsville,The eye of #HurricaneHarvey is now showing on ...,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1


In [205]:
# swap underscores for spaces in the NE strings (the NE column is re-added as the last column)
annotated_data_parsed = (
    annotated_data_parsed
    .drop('NE', axis=1)
    .assign(NE=annotated_data_parsed.loc[:, 'NE'].apply(lambda ne: ne.replace('_', ' ')))
)
# look up the importance values per NE; names missing from the lookups get 0.
annotated_data_parsed = annotated_data_parsed.assign(
    max_population=annotated_data_parsed.loc[:, 'NE'].apply(lambda ne: geonames_max_pop.get(ne, 0.)),
    max_alternate_names=annotated_data_parsed.loc[:, 'NE'].apply(lambda ne: geonames_max_alt_names.get(ne, 0.)),
)

In [206]:
# add a copy of each parse whose first token field is lower-cased;
# skipped when the column already exists
if 'parse_clean' not in annotated_data_parsed.columns:
    annotated_data_parsed = annotated_data_parsed.assign(
        parse_clean=annotated_data_parsed.loc[:, 'parse'].apply(
            lambda sentences: [
                [[token[0].lower(), token[1], token[2], token[3], token[4]] for token in sentence]
                for sentence in sentences
            ]
        )
    )

In [207]:
# preview the first rows of annotated_data_parsed
display(annotated_data_parsed.head())

Unnamed: 0,id,parse,data_name,txt,state_gold_1,descriptor_gold_1,state_gold_2,descriptor_gold_2,state_gold_agreement,descriptor_gold_agreement,all_gold_agreement,state_gold,descriptor_gold,all_gold,valid_loc,max_population,max_alternate_names,parse_clean,NE
0,900454330544926720,"[[[The, DET, 3, det, 0], [first, ADJ, 3, amod,...",harvey,The first Hurricane Watches for this part of #...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1,324074,31,"[[[the, DET, 3, det, 0], [first, ADJ, 3, amod,...",corpus christi
1,900454330544926720,"[[[The, DET, 3, det, 0], [first, ADJ, 3, amod,...",harvey,The first Hurricane Watches for this part of #...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2296224,60,"[[[the, DET, 3, det, 0], [first, ADJ, 3, amod,...",houston
2,900765014142976000,"[[[As, ADP, 0, ROOT, 0], [of, ADP, 0, prep, 1]...",harvey,"As of 10 a.m., the modeling has #Harvey making...",0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1,10490,10,"[[[as, ADP, 0, ROOT, 0], [of, ADP, 0, prep, 1]...",rockport
3,900765014142976000,"[[[As, ADP, 0, ROOT, 0], [of, ADP, 0, prep, 1]...",harvey,"As of 10 a.m., the modeling has #Harvey making...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,324074,31,"[[[as, ADP, 0, ROOT, 0], [of, ADP, 0, prep, 1]...",corpus christi
4,900894782763012096,"[[[The, DET, 1, det, 0], [eye, NOUN, 6, nsubj,...",harvey,The eye of #HurricaneHarvey is now showing on ...,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1,183887,24,"[[[the, DET, 1, det, 0], [eye, NOUN, 6, nsubj,...",brownsville


## Detect anchors
Now we try to detect anchors using the parse trees.

In [215]:
from importlib import reload
import data_helpers
reload(data_helpers)
from data_helpers import detect_anchor_by_type
def detect_anchor_prec_recall(data, anchor_types_to_use=['descriptor', 'state'], gold_var='all_gold', anchor_var='max_population', data_var='data_name', verbose=True):
    """
    Compute precision and recall for anchor detection.

    Rows of ``data`` are grouped by ``id``; for every group
    ``detect_anchor_by_type`` is called and its four outputs are combined
    column-wise as 'state', 'descriptor', 'compound' and 'list'. The
    prediction for a NE is the row-wise maximum over the columns named in
    ``anchor_types_to_use``. Only rows with ``all_gold_agreement == 1`` are
    scored against ``gold_var``.

    :param data: DataFrame with the columns 'id', 'NE', 'txt', 'parse', 'parse_clean', 'all_gold_agreement' and ``gold_var``
    :param anchor_types_to_use: subset of ['state', 'descriptor', 'compound', 'list'] to combine into the prediction
    :param gold_var: column holding the gold label (a value of 1 marks a gold anchor)
    :param anchor_var: passed to detect_anchor_by_type as ``anchor_var``
    :param data_var: passed to detect_anchor_by_type as ``data_name_var``
    :param verbose: passed to detect_anchor_by_type as ``verbose``; the default True keeps the previous behavior
    :returns: prec, rec, all_pred_data -- precision, recall (NaN when the respective
        denominator is zero) and one row per scored NE with the columns
        'anchor', 'NE', 'txt', 'parse', 'id'
    """
    # copy so that any sequence is accepted and neither the caller's nor the default list is aliased
    anchor_types_to_use = list(anchor_types_to_use)
    gold_vals = []
    pred_vals = []
    all_anchor_types = ['state', 'descriptor',  'compound', 'list']
    agree_col = 'all_gold_agreement'
    NE_col = 'NE'
    parse_col = 'parse_clean'
    anchor_col = 'anchor'
    all_pred_data = []
    for id_i, data_i in data.groupby('id'):
        anchor_state, anchor_descriptor, anchor_compound, anchor_list = detect_anchor_by_type(data_i, parse_var=parse_col, anchor_var=anchor_var, data_name_var=data_var, verbose=verbose)
        pred_i = pd.concat([anchor_state, anchor_descriptor, anchor_compound, anchor_list], axis=1)
        pred_i.columns = all_anchor_types
        # restrict to annotated NEs
        data_i_annotated = data_i[data_i.loc[:, agree_col]==1]
        NE_i_annotated = data_i_annotated.loc[:, NE_col]
        # assumes the detector output is indexed by NE string -- TODO confirm
        pred_i = pred_i.loc[NE_i_annotated, :]
        # a NE is predicted as anchor if any selected strategy marks it
        pred_i = pred_i.loc[:, anchor_types_to_use].max(axis=1)
        pred_i_data = pd.DataFrame(pred_i.copy(), columns=[anchor_col])
        pred_vals += list(pred_i_data.loc[:, anchor_col].values)
        gold_vals += list(data_i_annotated.loc[:, gold_var].values)
        # save rest of data for debugging
        N = pred_i_data.shape[0]
        pred_i_data = pred_i_data.assign(**{
            'NE' : pred_i_data.index,
            'txt' : [data_i.loc[:, 'txt'].iloc[0],]*N,
            'parse' : [data_i.loc[:, 'parse'].iloc[0],]*N,
            'id' : [id_i,]*N,
        })
        all_pred_data.append(pred_i_data)
    gold_vals = np.array(gold_vals)
    pred_vals = np.array(pred_vals)
    gold_anchor_idx = np.where(gold_vals == 1)[0]
    true_pos_count = pred_vals[gold_anchor_idx].sum()
    pred_count = pred_vals.sum()
    gold_count = len(gold_anchor_idx)
    # undefined scores are reported as NaN instead of dividing by zero
    prec = true_pos_count / pred_count if pred_count > 0 else np.nan
    rec = true_pos_count / gold_count if gold_count > 0 else np.nan
    if(len(all_pred_data) > 0):
        all_pred_data = pd.concat(all_pred_data, axis=0)
    else:
        # no groups at all: pd.concat would raise on an empty list
        all_pred_data = pd.DataFrame(columns=[anchor_col, 'NE', 'txt', 'parse', 'id'])
    # find false negatives => improve recall!
    # positions in gold_vals/pred_vals line up with the rows of all_pred_data
    FN_idx = np.where((gold_vals == 1) & (pred_vals == 0))[0]
    print('%d/%d FN'%(len(FN_idx), len(gold_vals)))
    display(all_pred_data.iloc[FN_idx, :].head(10).loc[:, ['NE', 'txt', 'parse']].values)
    return prec, rec, all_pred_data

In [216]:
# disabled alternative: evaluate several combinations of anchor types, one per loop iteration
# anchor_types_to_use = [['state'], ['descriptor', 'state'], ['descriptor', 'state', 'compound'], ['descriptor', 'state', 'list'], ['descriptor', 'state', 'compound', 'list']]
# each inner list is one combination of anchor types to evaluate
anchor_types_to_use = [['state', 'descriptor', 'compound', 'list']]
for anchor_types_to_use_i in anchor_types_to_use:
    # prec/rec/pred_data keep the values of the last evaluated combination
    prec, rec, pred_data = detect_anchor_prec_recall(annotated_data_parsed, anchor_types_to_use=anchor_types_to_use_i)
    print('anchor_types = %s, P=%.3f, R=%.3f'%(','.join(anchor_types_to_use_i), prec, rec))

full parse [[['the', 'DET', 3, 'det', 0], ['first', 'ADJ', 3, 'amod', 1], ['hurricane', 'PROPN', 3, 'compound', 2], ['watches', 'PROPN', 3, 'ROOT', 3], ['for', 'ADP', 3, 'prep', 4], ['this', 'DET', 6, 'det', 5], ['part', 'NOUN', 4, 'pobj', 6], ['of', 'ADP', 6, 'prep', 7], ['#texas', 'NOUN', 7, 'pobj', 8], ['in', 'ADP', 3, 'prep', 9], ['3', 'NUM', 11, 'compound', 10], ['267', 'NUM', 12, 'nummod', 11], ['days', 'NOUN', 9, 'pobj', 12], ['(', 'PUNCT', 3, 'punct', 13], ['9yrs', 'NUM', 3, 'appos', 14], ['!', 'PUNCT', 3, 'punct', 15], [')', 'PUNCT', 3, 'punct', 16]], [['like', 'ADP', 0, 'ROOT', 0], ['houston', 'PROPN', 0, 'pobj', 1], ['&', 'CCONJ', 1, 'cc', 2], ['corpus', 'PROPN', 5, 'compound', 3], ['christi', 'PROPN', 5, 'compound', 4], ['#txwx', 'PROPN', 1, 'conj', 5], ['#harvey', 'PROPN', 10, 'nmod', 6], ['h', 'NOUN', 10, 'intj', 7], ['/', 'SYM', 9, 'punct', 8], ['t', 'NOUN', 10, 'compound', 9], ['@kathrynprociv', 'PROPN', 10, 'ROOT', 10]]]
candidate 0=corpus christi
anchor NE candidates 

full parse [[['rain', 'NOUN', 3, 'nmod', 0], ['and', 'CCONJ', 0, 'cc', 1], ['lots', 'NOUN', 0, 'conj', 2], ['wind', 'NOUN', 3, 'ROOT', 3], ['in', 'ADP', 3, 'prep', 4], ['san', 'PROPN', 6, 'compound', 5], ['marcos', 'PROPN', 7, 'compound', 6], ['tx', 'PROPN', 4, 'pobj', 7], ['.', 'PUNCT', 3, 'punct', 8]], [['checked', 'VERB', 0, 'ROOT', 0], ['in', 'PART', 0, 'prt', 1], ['with', 'ADP', 0, 'prep', 2], ['my', 'ADJ', 4, 'poss', 3], ['parentals', 'NOUN', 2, 'pobj', 4], ['and', 'CCONJ', 0, 'cc', 5], ['everyone', 'NOUN', 7, 'nsubj', 6], ['is', 'VERB', 0, 'conj', 7], ['ok', 'ADJ', 7, 'acomp', 8], ['with', 'ADP', 7, 'prep', 9], ['a', 'DET', 12, 'det', 10], ['little', 'ADJ', 12, 'amod', 11], ['damage', 'NOUN', 9, 'pobj', 12], ['.', 'PUNCT', 7, 'punct', 13]], [['thanks', 'NOUN', 1, 'compound', 0], ['#hurricaneharvey', 'PUNCT', 1, 'ROOT', 1]]]
candidate 0=san marcos
anchor NE candidates = 
full parse [[["i'm", 'NUM', 0, 'ROOT', 0], ['at', 'ADP', 0, 'prep', 1], ['a', 'DET', 3, 'det', 2], ['shelter',

full parse [[['noaa', 'PROPN', 13, 'nsubj', 0], [':', 'PUNCT', 0, 'punct', 1], ['rainfall', 'NOUN', 0, 'acl', 2], ['total', 'NOUN', 2, 'dobj', 3], ['from', 'ADP', 3, 'prep', 4], ['#harvey', 'PROPN', 4, 'pobj', 5], ['for', 'ADP', 3, 'prep', 6], ['cedar', 'PROPN', 8, 'compound', 7], ['bayou', 'PROPN', 6, 'pobj', 8], ['in', 'ADP', 8, 'prep', 9], ['harris', 'PROPN', 11, 'compound', 10], ['county', 'PROPN', 12, 'compound', 11], ['texas', 'PROPN', 9, 'pobj', 12], ['is', 'VERB', 13, 'ROOT', 13], ['at', 'ADP', 13, 'prep', 14], ['51.88', 'NUM', 16, 'nummod', 15], ['”', 'NOUN', 14, 'pobj', 16], ['a', 'DET', 20, 'det', 17], ['contiguous', 'ADJ', 20, 'amod', 18], ['us', 'PROPN', 20, 'compound', 19], ['record', 'NOUN', 13, 'attr', 20], ['for', 'ADP', 20, 'prep', 21], ['any', 'DET', 24, 'det', 22], ['tropical', 'ADJ', 24, 'amod', 23], ['system', 'NOUN', 21, 'pobj', 24], ['.', 'PUNCT', 13, 'punct', 25]]]
candidate 0=harris county
anchor NE candidates = 
full parse [[['president', 'PROPN', 1, 'compoun

full parse [[["we're", 'PROPN', 3, 'nsubjpass', 0], ['gonna', 'VERB', 3, 'aux', 1], ['be', 'VERB', 3, 'auxpass', 2], ['headed', 'VERB', 3, 'ROOT', 3], ['to', 'ADP', 3, 'prep', 4], ['woodsboro', 'PROPN', 6, 'compound', 5], ['tx', 'PROPN', 7, 'compound', 6], ['pop', 'NOUN', 4, 'pobj', 7], ['.', 'PUNCT', 3, 'punct', 8]], [['1', 'NUM', 1, 'nummod', 0], ['512', 'NUM', 1, 'ROOT', 1], ['tomorrow', 'NOUN', 1, 'npadvmod', 2], ['to', 'PART', 4, 'aux', 3], ['drop', 'VERB', 1, 'relcl', 4], ['off', 'PART', 4, 'prt', 5], ['supplies', 'NOUN', 4, 'dobj', 6], ['.', 'PUNCT', 1, 'punct', 7]], [['#harvey', 'PROPN', 1, 'compound', 0], ['amazon', 'PROPN', 3, 'compound', 1], ['wish', 'PROPN', 3, 'compound', 2], ['list', 'NOUN', 3, 'ROOT', 3]]]
candidate 0=woodsboro
anchor NE candidates = 
full parse [[['@kellycass', 'PUNCT', 2, 'nmod', 0], ['good', 'ADJ', 2, 'amod', 1], ['morning', 'NOUN', 2, 'ROOT', 2], ['from', 'ADP', 2, 'prep', 3], ['morgantown', 'PROPN', 5, 'compound', 4], ['wv', 'PROPN', 3, 'pobj', 5], 

full parse [[['my', 'ADJ', 1, 'poss', 0], ['family', 'NOUN', 4, 'nsubj', 1], ['in', 'ADP', 1, 'prep', 2], ['miami', 'PROPN', 2, 'pobj', 3], ['evacuated', 'VERB', 4, 'ROOT', 4], ['yesterday', 'NOUN', 4, 'npadvmod', 5], ['to', 'ADP', 4, 'prep', 6], ['tampa', 'PROPN', 6, 'pobj', 7], ['.', 'PUNCT', 4, 'punct', 8]], [['they', 'PRON', 1, 'nsubj', 0], ["aren't", 'VERB', 7, 'nmod', 1], ['out', 'ADP', 1, 'prep', 2], ['of', 'ADP', 2, 'prep', 3], ['#irma', 'PROPN', 3, 'pobj', 4], ["'", 'PUNCT', 7, 'punct', 5], ['s', 'PRON', 7, 'nmod', 6], ['path', 'NOUN', 7, 'ROOT', 7], ['completely', 'ADV', 7, 'advmod', 8], ['but', 'CCONJ', 7, 'cc', 9], ["i'm", 'NUM', 7, 'conj', 10], ['so', 'ADV', 12, 'advmod', 11], ['grateful', 'ADJ', 12, 'ROOT', 12], ['they', 'PRON', 14, 'nsubj', 13], ['were', 'VERB', 12, 'ccomp', 14], ['able', 'ADJ', 14, 'acomp', 15], ['to', 'PART', 17, 'aux', 16], ['leave', 'VERB', 15, 'xcomp', 17], ['.', 'PUNCT', 12, 'punct', 18]]]
candidate 0=tampa
anchor NE candidates = miami
data NE tree

full parse [[['@msnbc', 'PROPN', 1, 'nsubj', 0], ['appreciate', 'VERB', 1, 'ROOT', 1], ['thoroughness', 'NOUN', 1, 'dobj', 2], ['on', 'ADP', 2, 'prep', 3], ['#irma', 'PROPN', 3, 'pobj', 4], ['but', 'CCONJ', 3, 'cc', 5], ['at', 'ADP', 11, 'prep', 6], ['what', 'ADJ', 8, 'det', 7], ['point', 'NOUN', 6, 'pobj', 8], ['do', 'VERB', 11, 'aux', 9], ['you', 'PRON', 11, 'nsubj', 10], ['cover', 'VERB', 11, 'ROOT', 11], ['prep', 'NOUN', 14, 'nmod', 12], ['/', 'SYM', 14, 'punct', 13], ['problems', 'NOUN', 15, 'compound', 14], ['north', 'PROPN', 11, 'dobj', 15], ['of', 'ADP', 15, 'prep', 16], ['keys', 'PROPN', 19, 'nmod', 17], ['/', 'SYM', 19, 'punct', 18], ['miami', 'PROPN', 16, 'pobj', 19], ['?', 'PUNCT', 11, 'punct', 20]], [['&', 'CCONJ', 1, 'cc', 0], ["how's", 'NOUN', 1, 'ROOT', 1], ['houston', 'PROPN', 1, 'npadvmod', 2], ['?', 'PUNCT', 1, 'punct', 3]], [['nature', 'NOUN', 0, 'ROOT', 0], ['itself', 'PRON', 0, 'appos', 1], ['?', 'PUNCT', 0, 'punct', 2]], [['!', 'PUNCT', 0, 'ROOT', 0]]]
candidate 

full parse [[['#irma', 'PROPN', 0, 'ROOT', 0], ['seeing', 'VERB', 0, 'acl', 1], ['pockets', 'NOUN', 1, 'dobj', 2], ['of', 'ADP', 2, 'prep', 3], ['heavy', 'ADJ', 5, 'amod', 4], ['traffic', 'NOUN', 3, 'pobj', 5], ['sb', 'PROPN', 1, 'ccomp', 6], ['i', 'PRON', 8, 'nsubj', 7], ['75', 'NUM', 1, 'ccomp', 8], ['from', 'ADP', 1, 'prep', 9], ['gainesville', 'PROPN', 9, 'pobj', 10], ['thru', 'ADP', 1, 'prep', 11], ['ocala', 'PROPN', 11, 'pobj', 12], ['as', 'ADP', 15, 'mark', 13], ['people', 'NOUN', 15, 'nsubj', 14], ['return', 'VERB', 1, 'advcl', 15], ['to', 'ADP', 15, 'prep', 16], ['their', 'ADJ', 18, 'poss', 17], ['homes', 'NOUN', 16, 'pobj', 18]]]
candidate 0=ocala
anchor NE candidates = gainesville
data NE tree=[['ocala', 'PROPN', 11, 'pobj', 12]]
NE parse token at tree=0, token=13:
['ocala', 'PROPN', 11, 'pobj', 12]
NE parent token:
['thru', 'ADP', 1, 'prep', 11]
candidate 1=gainesville
anchor NE candidates = 
full parse [[['volunteers', 'NOUN', 12, 'nsubj', 0], ['from', 'ADP', 0, 'prep', 1]

full parse [[['this', 'DET', 1, 'nsubj', 0], ['is', 'VERB', 1, 'ROOT', 1], ['how', 'ADV', 6, 'advmod', 2], ['ponce', 'PROPN', 5, 'compound', 3], ['puerto', 'PROPN', 5, 'compound', 4], ['rico', 'PROPN', 6, 'nsubj', 5], ['looked', 'VERB', 1, 'ccomp', 6], ['about', 'ADV', 8, 'advmod', 7], ['30', 'NUM', 9, 'nummod', 8], ['mins', 'NOUN', 10, 'npadvmod', 9], ['ago', 'ADV', 6, 'advmod', 10], ['.', 'PUNCT', 1, 'punct', 11]], [['#maria', 'PUNCT', 0, 'ROOT', 0]]]
candidate 0=ponce
anchor NE candidates = 
full parse [[['rt', 'PROPN', 1, 'compound', 0], ['@hurrtrackerapp', 'PROPN', 7, 'dep', 1], [':', 'PUNCT', 7, 'punct', 2], ['breaking', 'VERB', 7, 'ccomp', 3], [':', 'PUNCT', 7, 'punct', 4], ['hurricane', 'PROPN', 6, 'compound', 5], ['#maria', 'PUNCT', 7, 'nsubj', 6], ['makes', 'VERB', 7, 'ROOT', 7], ['landfall', 'NOUN', 7, 'dobj', 8], ['near', 'ADP', 8, 'prep', 9], ['yabucoa', 'PROPN', 12, 'compound', 10], ['puerto', 'PROPN', 12, 'compound', 11], ['rico', 'PROPN', 9, 'pobj', 12], ['as', 'ADP', 7

full parse [[['seeking', 'VERB', 0, 'ROOT', 0], ['info', 'NOUN', 0, 'dobj', 1], ['on', 'ADP', 1, 'prep', 2], ['my', 'ADJ', 4, 'poss', 3], ['aunt', 'NOUN', 2, 'pobj', 4], ['milly', 'PROPN', 6, 'compound', 5], ['bodon', 'PROPN', 4, 'appos', 6], ['&', 'CCONJ', 6, 'cc', 7], ['luis', 'PROPN', 6, 'conj', 8], ['in', 'ADP', 4, 'prep', 9], ['jayuya', 'PROPN', 12, 'compound', 10], ['puerto', 'PROPN', 12, 'compound', 11], ['rico', 'PROPN', 9, 'pobj', 12], ['(', 'PUNCT', 12, 'punct', 13], ['cuabey', 'PROPN', 12, 'appos', 14], [')', 'PUNCT', 12, 'punct', 15], ['.', 'PUNCT', 0, 'punct', 16]], [['its', 'ADJ', 2, 'poss', 0], ['only', 'ADV', 2, 'advmod', 1], ['accessible', 'ADJ', 2, 'ROOT', 2], ['by', 'ADP', 2, 'prep', 3], ['helicopter.no', 'NOUN', 8, 'amod', 4], ['power', 'NOUN', 7, 'nmod', 5], ['/', 'SYM', 7, 'punct', 6], ['phone', 'NOUN', 8, 'compound', 7], ['#hurricanemaria', 'PROPN', 3, 'pobj', 8]]]
candidate 0=jayuya
anchor NE candidates = 
full parse [[['#sanjuan', 'NOUN', 0, 'ROOT', 0], ['#puer

full parse [[['dodges', 'VERB', 0, 'ROOT', 0], ['the', 'DET', 3, 'det', 1], ['draft', 'NOUN', 3, 'compound', 2], ['attacks', 'NOUN', 0, 'dobj', 3], ['john', 'PROPN', 5, 'compound', 4], ['mccain', 'PROPN', 3, 'appos', 5], ['.', 'PUNCT', 0, 'punct', 6]], [['plays', 'VERB', 8, 'nsubj', 0], ['golf', 'NOUN', 0, 'dobj', 1], ['in', 'ADP', 0, 'prep', 2], ['nj', 'PROPN', 5, 'compound', 3], ['attacks', 'NOUN', 5, 'compound', 4], ['mayor', 'NOUN', 2, 'pobj', 5], ['of', 'ADP', 5, 'prep', 6], ['hurricane', 'NOUN', 6, 'pobj', 7], ['ravaged', 'VERB', 8, 'ROOT', 8], ['san', 'PROPN', 10, 'compound', 9], ['juan', 'PROPN', 8, 'dobj', 10], ['.', 'PUNCT', 8, 'punct', 11]], [['#puertorico', 'PROPN', 2, 'compound', 0], ['#weakness', 'PROPN', 2, 'compound', 1], ['#trump', 'PUNCT', 2, 'ROOT', 2]]]
candidate 0=san juan
anchor NE candidates = 
full parse [[['can', 'VERB', 3, 'aux', 0], ['we', 'PRON', 3, 'nsubj', 1], ['please', 'INTJ', 3, 'intj', 2], ['help', 'VERB', 3, 'ROOT', 3], ['puerto', 'PROPN', 5, 'compoun

full parse [[['rt', 'PROPN', 1, 'compound', 0], ['@edvalleewx', 'PROPN', 1, 'ROOT', 1], [':', 'PUNCT', 1, 'punct', 2], ['our', 'ADJ', 4, 'poss', 3], ['models', 'NOUN', 10, 'nsubj', 4], ['specifically', 'ADV', 6, 'advmod', 5], ['used', 'VERB', 4, 'acl', 6], ['for', 'ADP', 6, 'prep', 7], ['forecasting', 'VERB', 7, 'pcomp', 8], ['hurricanes', 'NOUN', 8, 'dobj', 9], ['have', 'VERB', 10, 'ROOT', 10], ['great', 'ADJ', 12, 'amod', 11], ['agreement', 'NOUN', 10, 'dobj', 12], ['in', 'ADP', 12, 'prep', 13], ['#florence', 'NOUN', 15, 'nsubj', 14], ['making', 'VERB', 13, 'pcomp', 15], ['landfall', 'NOUN', 15, 'dobj', 16], ['near', 'ADP', 16, 'prep', 17], ['new', 'PROPN', 20, 'compound', 18], ['bern', 'PROPN', 20, 'compound', 19], ['nc', 'PROPN', 17, 'pobj', 20], ['…', 'PUNCT', 10, 'punct', 21]]]
candidate 0=new bern
anchor NE candidates = 
full parse [[['rt', 'PROPN', 1, 'compound', 0], ['@wmo', 'PROPN', 1, 'ROOT', 1], [':', 'PUNCT', 1, 'punct', 2], ['hurricane', 'PROPN', 4, 'compound', 3], ['#flo

full parse [[['late', 'ADJ', 1, 'amod', 0], ['dinner', 'NOUN', 1, 'ROOT', 1], ['for', 'ADP', 1, 'prep', 2], ['linemen', 'NOUN', 2, 'pobj', 3], ['in', 'ADP', 3, 'prep', 4], ['maxton', 'PROPN', 6, 'compound', 5], ['nc', 'PROPN', 4, 'pobj', 6], ['tonight', 'NOUN', 1, 'npadvmod', 7], ['.', 'PUNCT', 1, 'punct', 8]], [['it', 'PRON', 1, 'nsubj', 0], ['’', 'VERB', 1, 'ROOT', 1], ['s', 'VERB', 1, 'case', 2], ['windy', 'ADJ', 1, 'amod', 3], ['and', 'CCONJ', 3, 'cc', 4], ['rainy', 'ADJ', 3, 'conj', 5], ['but', 'CCONJ', 1, 'cc', 6], ['the', 'DET', 8, 'det', 7], ['base', 'NOUN', 9, 'nsubj', 8], ['camp', 'NOUN', 1, 'conj', 9], ['up', 'PART', 9, 'prt', 10], ['and', 'CCONJ', 10, 'cc', 11], ['operational', 'ADJ', 10, 'conj', 12], ['and', 'CCONJ', 9, 'cc', 13], ['is', 'VERB', 9, 'conj', 14], ['able', 'ADJ', 14, 'acomp', 15], ['to', 'PART', 17, 'aux', 16], ['get', 'VERB', 15, 'xcomp', 17], ['crews', 'NOUN', 21, 'nsubj', 18], ['a', 'DET', 21, 'det', 19], ['hot', 'ADJ', 21, 'amod', 20], ['meal', 'NOUN', 17

full parse [[['@weatherchannel', 'PROPN', 2, 'punct', 0], ['this', 'DET', 2, 'nsubj', 1], ['is', 'VERB', 2, 'ROOT', 2], ['from', 'ADP', 2, 'prep', 3], ['williamstown', 'PROPN', 5, 'compound', 4], ['ma', 'PROPN', 3, 'pobj', 5], ['.', 'PUNCT', 2, 'punct', 6]], [['our', 'ADJ', 1, 'poss', 0], ['bridge', 'NOUN', 2, 'nsubj', 1], ['is', 'VERB', 2, 'ROOT', 2], ['about', 'ADJ', 2, 'acomp', 3], ['to', 'PART', 6, 'aux', 4], ['get', 'VERB', 6, 'auxpass', 5], ['wiped', 'VERB', 3, 'xcomp', 6], ['out', 'PART', 6, 'prt', 7], ['!', 'PUNCT', 2, 'punct', 8]], [['even', 'ADV', 1, 'advmod', 0], ['mass', 'PROPN', 4, 'nsubjpass', 1], ['is', 'VERB', 4, 'aux', 2], ['being', 'VERB', 4, 'auxpass', 3], ['affected', 'VERB', 4, 'ROOT', 4], ['by', 'ADP', 4, 'agent', 5], ['#florence', 'PROPN', 5, 'pobj', 6]]]
candidate 0=williamstown
anchor NE candidates = 
full parse [[['looks', 'VERB', 0, 'ROOT', 0], ['like', 'ADP', 3, 'mark', 1], ['i', 'PRON', 3, 'nsubj', 2], ['made', 'VERB', 0, 'advcl', 3], ['it', 'PRON', 3, 'dob

full parse [[['thanks', 'NOUN', 13, 'npadvmod', 0], ['to', 'ADP', 0, 'prep', 1], ['the', 'DET', 4, 'det', 2], ['amazing', 'ADJ', 4, 'amod', 3], ['generosity', 'NOUN', 1, 'pobj', 4], ['of', 'ADP', 4, 'prep', 5], ['people', 'NOUN', 5, 'pobj', 6], ['in', 'ADP', 4, 'prep', 7], ['and', 'CCONJ', 7, 'cc', 8], ['around', 'ADP', 7, 'conj', 9], ['the', 'DET', 12, 'det', 10], ['triangle', 'PROPN', 12, 'compound', 11], ['@wral', 'PROPN', 9, 'pobj', 12], ['helped', 'VERB', 13, 'ROOT', 13], ['fill', 'VERB', 13, 'xcomp', 14], ['5', 'NUM', 16, 'nummod', 15], ['buses', 'NOUN', 14, 'dative', 16], ['5', 'NUM', 18, 'nummod', 17], ['trailers', 'NOUN', 14, 'dobj', 18], ['3', 'NUM', 20, 'nummod', 19], ['vans', 'NOUN', 18, 'appos', 20], ['and', 'CCONJ', 20, 'cc', 21], ['a', 'DET', 24, 'det', 22], ['small', 'ADJ', 24, 'amod', 23], ['truck', 'NOUN', 20, 'conj', 24], ['with', 'ADP', 24, 'prep', 25], ['supplies', 'NOUN', 25, 'pobj', 26], ['for', 'ADP', 26, 'prep', 27], ['#florence', 'NOUN', 29, 'compound', 28], [

full parse [[['our', 'ADJ', 4, 'poss', 0], ['first', 'ADJ', 2, 'amod', 1], ['#hurricanemichael', 'PROPN', 4, 'compound', 2], ['response', 'NOUN', 4, 'compound', 3], ['teams', 'NOUN', 6, 'nsubj', 4], ['have', 'VERB', 6, 'aux', 5], ['arrived', 'VERB', 6, 'ROOT', 6], ['at', 'ADP', 6, 'prep', 7], ['a', 'DET', 10, 'det', 8], ['staging', 'NOUN', 10, 'compound', 9], ['area', 'NOUN', 7, 'pobj', 10], ['in', 'ADP', 10, 'prep', 11], ['okaloosa', 'PROPN', 13, 'compound', 12], ['county', 'PROPN', 14, 'compound', 13], ['fl', 'PROPN', 11, 'pobj', 14], ['just', 'ADV', 16, 'advmod', 15], ['across', 'ADP', 6, 'prep', 16], ['the', 'DET', 18, 'det', 17], ['b', 'NOUN', 16, 'pobj', 18], ['…', 'PUNCT', 6, 'punct', 19]]]
candidate 0=okaloosa county
anchor NE candidates = 
full parse [[['posting', 'VERB', 0, 'ROOT', 0], ['for', 'ADP', 0, 'prep', 1], ['a', 'DET', 3, 'det', 2], ['friend', 'NOUN', 1, 'pobj', 3], [':', 'PUNCT', 0, 'punct', 4], ['“', 'INTJ', 0, 'appos', 5], ['if', 'ADP', 8, 'mark', 6], ['anyone', '

full parse [[['devastating', 'ADJ', 1, 'amod', 0], ['damage', 'NOUN', 1, 'ROOT', 1], [':', 'PUNCT', 1, 'punct', 2], ['this', 'DET', 4, 'nsubj', 3], ['is', 'VERB', 4, 'ROOT', 4], ['my', 'ADJ', 7, 'poss', 5], ['2nd', 'ADJ', 7, 'amod', 6], ['year', 'NOUN', 4, 'attr', 7], ['living', 'VERB', 7, 'acl', 8], ['in', 'ADP', 8, 'prep', 9], ['florida', 'PROPN', 9, 'pobj', 10], ['during', 'ADP', 8, 'prep', 11], ['hurricane', 'PROPN', 13, 'compound', 12], ['season', 'PROPN', 11, 'pobj', 13], ['and', 'CCONJ', 7, 'cc', 14], ['the', 'DET', 16, 'det', 15], ['images', 'NOUN', 18, 'nsubj', 16], ["don't", 'VERB', 18, 'aux', 17], ['get', 'VERB', 4, 'conj', 18], ['easier', 'ADJ', 18, 'acomp', 19], ['to', 'PART', 21, 'aux', 20], ['watch', 'VERB', 19, 'xcomp', 21], ['.', 'PUNCT', 4, 'punct', 22]], [['this', 'DET', 1, 'det', 0], ['picture', 'NOUN', 2, 'nsubj', 1], ['shows', 'VERB', 2, 'ROOT', 2], ['the', 'DET', 4, 'det', 3], ['devastation', 'NOUN', 2, 'dobj', 4], ['left', 'VERB', 4, 'acl', 5], ['behind', 'ADV',

full parse [[['check', 'VERB', 0, 'ROOT', 0], ['out', 'PART', 0, 'prt', 1], ['calhoun', 'PROPN', 0, 'dobj', 2], ['(', 'PUNCT', 2, 'punct', 3], ['98', 'NUM', 5, 'nummod', 4], ['%', 'NOUN', 2, 'appos', 5], ['out', 'PART', 5, 'prt', 6], [')', 'PUNCT', 2, 'punct', 7], ['gulf', 'PROPN', 8, 'ROOT', 8], ['(8', 'PROPN', 8, 'punct', 9], ['6', 'NUM', 11, 'nummod', 10], ['%', 'NOUN', 8, 'appos', 11], [')', 'PUNCT', 8, 'punct', 12], ['jackson', 'PROPN', 16, 'nmod', 13], ['(8', 'PROPN', 16, 'punct', 14], ['3', 'NUM', 16, 'nummod', 15], ['%', 'NOUN', 16, 'ROOT', 16], [')', 'PUNCT', 16, 'punct', 17], ['liberty', 'PROPN', 18, 'ROOT', 18], ['(', 'PUNCT', 18, 'punct', 19], ['71', 'NUM', 21, 'nummod', 20], ['%', 'NOUN', 18, 'appos', 21], [')', 'PUNCT', 18, 'punct', 22], ['and', 'CCONJ', 18, 'cc', 23], ['bay', 'PROPN', 18, 'conj', 24], ['(', 'PUNCT', 24, 'punct', 25], ['56', 'NUM', 27, 'nummod', 26], ['%', 'NOUN', 24, 'appos', 27], [')', 'PUNCT', 24, 'punct', 28], ['.', 'PUNCT', 18, 'punct', 29]], [['then

10/199 FN


array([['harris county',
        'Sending prayers - More than 1,700 square miles of Harris County in Texas in underwater - more than New York City & Chicago combined. #Harvey <URL>',
        list([[['Sending', 'VERB', 0, 'ROOT', 0], ['prayers', 'NOUN', 0, 'dobj', 1], ['More', 'ADJ', 5, 'amod', 2], ['than', 'ADP', 5, 'quantmod', 3], ['1', 'NUM', 5, 'compound', 4], ['700', 'NUM', 7, 'nummod', 5], ['square', 'ADJ', 7, 'amod', 6], ['miles', 'NOUN', 0, 'dobj', 7], ['of', 'ADP', 7, 'prep', 8], ['Harris', 'PROPN', 10, 'compound', 9], ['County', 'PROPN', 8, 'pobj', 10], ['in', 'ADP', 10, 'prep', 11], ['Texas', 'PROPN', 11, 'pobj', 12], ['in', 'ADP', 0, 'prep', 13], ['underwater', 'NOUN', 13, 'pobj', 14], ['more', 'ADJ', 14, 'amod', 15], ['than', 'ADP', 15, 'prep', 16], ['New', 'PROPN', 18, 'compound', 17], ['York', 'PROPN', 19, 'compound', 18], ['City', 'PROPN', 22, 'nsubj', 19], ['&', 'CCONJ', 19, 'cc', 20], ['Chicago', 'PROPN', 19, 'conj', 21], ['combined', 'VERB', 1, 'relcl', 22], ['.', 'PU

anchor_types = state,descriptor,compound,list, P=0.843, R=0.896


model|P|R
---|---|---
state+descriptor|0.988|0.833
state+descriptor+compound|0.966|0.875
state+descriptor+conjunction|0.976|0.865
state+descriptor+compound+conjunction|0.966|0.875
state+descriptor+compound_lax+conjunction|0.843|0.896

For "compound_lax" we try to find any anchor ("[Beaufort]_1 [Colleton County]_2") in the NE parent subtree; for "compound" we only try to find a state anchor. Unsurprisingly the "compound_lax" strategy improves recall at the cost of precision.