# Compare NER vs lexicon retrieval
In which we determine whether NER or lexicon matching is a better method for toponym resolution.

In [5]:
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
from data_helpers import clean_raw_txt

## Load data

In [66]:
fb_group_data = pd.read_csv('../../data/facebook-maria/combined_group_data.tsv', sep='\t', index_col=False)
print(fb_group_data.head())

          group_id   status_author_id  \
0  351272391991842  10159623474155515   
1  351272391991842  10155877836122783   
2  351272391991842  10215020607489848   
3  351272391991842   1716175958400853   
4  351272391991842    141039043205101   

                                      status_message status_lang  \
0  Estoy preocupada por mi tia vicky vazquez que ...          es   
1  Para alguien que lo necesite... Dale Share..A ...          es   
2       One unsafe well in Dorado on Carr 694 km 4.5          en   
3  En apoyo al alcalde Carlos Lopez y al municipi...          es   
4  Pueblo de Puerto Rico no se monten en las gran...          es   

                         status_id  
0  351272391991842_362333787552369  
1  351272391991842_361695457616202  
2  351272391991842_361487960970285  
3  351272391991842_361453717640376  
4  351272391991842_358929311226150  


In [67]:
# limit to Spanish data
lang = 'es'
fb_group_data_es = fb_group_data[fb_group_data.loc[:, 'status_lang'] == lang]
print('%d %s statuses'%(fb_group_data_es.shape[0], lang))

23662 es statuses


In [8]:
print('\n'.join(fb_group_data_es.loc[:,'status_message'].head(100).values))

Estoy preocupada por mi tia vicky vazquez que desde el dia antes del huracan no se de ella pues ella es una mujer enferma tiene que usar oxijeno y ella me dijo el dia antes del huracan  que su oxijeno se avia acabado y que no se sentia bien pero que se estava dando terapias con la maquina del asma ella vive en higuillar en Dorado creo qye se llama arenar en la calle que vive al frente de su casa antes hera una vaqueria yo espero que ella este bien por favor si alguien del grupo es de esa aria y la conose digale que su sobrina la esta buscando gracias por favor de compartir
Para alguien que lo necesite... Dale Share..A todos nuestros amigos, pacientes y al publico en general;Hipnoterapia Clínica Ericksoniana Sistémica se une al esfuerzo de recuperación que todos vivimos en Puerto Rico y reanuda sus servicios de Hipnosis desde 10 de octubre del 2017.Durante los meses de Octubre/2017 y Noviembre/2017 estaremos ofreciendo servicios de recuperación por la crisis del Huracán María, con una o

In [68]:
print(fb_group_data_es.columns)

Index([u'group_id', u'status_author_id', u'status_message', u'status_lang',
       u'status_id'],
      dtype='object')


In [97]:
import data_helpers
reload(data_helpers)
from data_helpers import clean_raw_txt
import codecs
# preprocess and dump to file
# need to keep track of indices for cross-referencing
fb_group_data_es_txt_df = fb_group_data_es.loc[:, ['status_message', 'status_id']].copy()
fb_group_data_es_txt = fb_group_data_es.loc[:, 'status_message'].values.tolist()
# clean, tokenize
# non-lowercased to help out the NER tagger
tokenizer = TweetTokenizer()
fb_group_data_es_txt_df.loc[:, 'status_message_clean'] = clean_raw_txt(fb_group_data_es_txt, lower=False)
print('%d lines to start'%(len(fb_group_data_es_txt)))
# get rid of lines that only contain URL
fb_group_data_es_txt_df = fb_group_data_es_txt_df[fb_group_data_es_txt_df.loc[:, 'status_message_clean'].apply(lambda x: x.strip()!='<URL>')]
print('%d lines after URL filter'%(fb_group_data_es_txt_df.shape[0]))
fb_group_data_es_txt_df.loc[:, 'status_message_clean'] = fb_group_data_es_txt_df.loc[:, 'status_message_clean'].apply(lambda x: ' '.join(tokenizer.tokenize(x)))

23662 lines to start
23148 lines after URL filter


In [98]:
# write to file
fb_group_data_es_txt = fb_group_data_es_txt_df.loc[:, 'status_message_clean']
out_file_name = '../../data/facebook-maria/combined_group_statuses.txt'
with codecs.open(out_file_name, 'w', encoding='utf-8') as out_file:
    # TODO: linebreak delimiter to help re-split data after tagging?
    out_file.write('\nSOLSOLSOL\t'.join(fb_group_data_es_txt))

In [46]:
! head -100 ../../data/facebook-maria/combined_group_statuses.txt

Estoy preocupada por mi tia vicky vazquez que desde el dia antes del huracan no se de ella pues ella es una mujer enferma tiene que usar oxijeno y ella me dijo el dia antes del huracan que su oxijeno se avenida ia acabado y que no se sentia bien pero que se estavenida a dando terapias con la maquina del asma ella vive en higuillar en Dorado creo qye se llama arenar en la calle que vive al frente de su casa antes hera una vaqueria yo espero que ella este bien por favenida or si alguien del grupo es de esa aria y la conose digale que su sobrina la esta buscando gracias por favenida or de compartir
SOLSOLSOL	Para alguien que lo necesite ... Dale Share .. A to1 nuestros amigos , pacientes y al publico en general ; Hipnoterapia Clínica Ericksoniana Sistémica se une al esfuerzo de recuperación que to1 vivimos en Puerto Rico y reanuda sus servicios de Hipnosis desde 10 de octubre del 2017.Durante los meses de Octubre / 2017 y Noviembre / 2017 estaremos ofreciendo servicios de recuperación po

## Run NER
Use the NER tagger to extract locations.

In [99]:
!bash tag_spanish_txt.sh

Invoked on Wed Nov 29 14:03:10 EST 2017 with arguments: -loadClassifier classifiers/spanish.ancora.distsim.s512.crf.ser.gz -textFile ../../../data/facebook-maria/combined_group_statuses.txt
loadClassifier=classifiers/spanish.ancora.distsim.s512.crf.ser.gz
textFile=../../../data/facebook-maria/combined_group_statuses.txt
Loading classifier from classifiers/spanish.ancora.distsim.s512.crf.ser.gz ... done [1.1 sec].
Untokenizable: � (U+FFFD, decimal: 65533)
CRFClassifier tagged 1031371 words in 57852 documents at 16179.64 words per second.


In [100]:
# fix weird delimiters
import data_helpers
reload(data_helpers)
from data_helpers import clean_tagged_txt
import codecs
ner_tagged_file = '../../data/facebook-maria/combined_group_statuses_ner.txt'
tagged_txt = [l.strip() for l in codecs.open(ner_tagged_file, 'r', encoding='utf-8')]
print(len(tagged_txt))
delim = 'SOLSOLSOL'
fixed_txt = clean_tagged_txt(tagged_txt, delim=delim)
print(len(fb_group_data_es_txt))
print(len(fixed_txt))

57852
processed 0 lines
processed 10000 lines
processed 20000 lines
processed 30000 lines
processed 40000 lines
processed 50000 lines
23148
23144


We're missing data. Maybe some lines got glued together?

In [101]:
# why are we missing data??
# iterate line by line and find line with wrong number of tokens
from itertools import izip
token_diff = 20
for z, (i,j) in enumerate(izip(fb_group_data_es_txt, fixed_txt)):
    i_tokens = i.split(' ')
    j_tokens = j.split(' ')
    if(abs(len(i_tokens) - len(j_tokens)) >= token_diff):
        print('error line found at index %d! %d vs. %d tokens'%(z, len(i_tokens), len(j_tokens)))
        print(i)
        print(j)
        break

error line found at index 12! 150 vs. 124 tokens
Dios , tú que escuchas nuestros ruegos , nuestras oraciones , nuestro dolor , nuestras ansiedades , desesperacion y vez nuestros corazones rotos con lo que está sucediendo en mi Puerto Rico ... Por favor mi Dios , ayuda a toda mi familia , amistades y a toda mi gente en Puerto Rico a encontrar las fuerzas , la paz y la paciencia para poder lidear con esta pesadilla diaria ... � � Señor a ti te imploro con todo mi corazón y toda mi alma que levantes a mi isla querida de Puerto Rico ! ! ! � � � � Que le des la fe y la esperanza para volver a empezar ... en el nombre del Padre , del Hijo y del Espíritu Santo , Amen ... � � � � � � � � � � � � � � � � � � � �
Dios/ORG ,/O tú/O que/O escuchas/O nuestros/O ruegos/O ,/O nuestras/O oraciones/O ,/O nuestro/O dolor/O ,/O nuestras/O ansiedades/O ,/O desesperacion/O y/O vez/O nuestros/O corazones/O rotos/O con/O lo/O que/O está/O sucediendo/O en/O mi/O Puerto/LUG Rico/LUG .../O Por/O favor/O mi/O Di

Not sure what's going on, but maybe some short lines in the original data got skipped.

Sanity check: make sure the tagged locations make sense.

In [102]:
from data_helpers import collect_entities_from_txt
tagged_entities_complete = map(collect_entities_from_txt, fixed_txt)
tagged_entities_type = map(lambda x: collect_entities_from_txt(x, include_type=True), fixed_txt)
print(tagged_entities[0:10])
print(tagged_entities_type[0:10])

[[u'Dorado'], [u'Dale_Share', u'Hipnoterapia_Cl\xednica_Ericksoniana_Sist\xe9mica', u'Puerto_Rico', u'Hipnosis', u'Octubre', u'Noviembre_/_2017', u'Hurac\xe1n_Mar\xeda', u'Hipnoterapia_Individual', u'Hipnoterapia', u'Caguas', u'11am', u'630pm', u'Turabo_Gardens', u'Caguas', u'Santa_Ana', u'Famcoop_-RRB-'], [u'Carlos_Lopez', u'Dorado_Dorado'], [u'Pueblo', u'Puerto_Rico', u'America', u'Puerto_Rico', u'shelter', u'America', u'Puerto_Rico'], [u'Puerto_Rico', u'Puerto_Rico', u'America', u'Nacion_Borincana', u'Trino_Dios', u'Puerto_Rico', u'Puerto_Rico', u'Cordero', u'Dios', u'Dios', u'Puerto_Rico'], [u'Guayama'], [u'Videos'], [u'Puerto_Rico', u'Florida', u'Maria', u'PR', u'Florida', u'Centro', u'Ayuda', u'Servicio', u'Dale_Copy', u'Paste_-LRB-_Share'], [u'Estoy', u'Ivan_J', u'Porrata', u'Tulio', u'Vicky', u'Utuado', u'Toa_Baja', u'P', u'Box', u'Dorado', u'Facebook'], [u'Alguien']]
[[u'Dorado/ORG'], [u'Dale_Share/PERS', u'Hipnoterapia_Cl\xednica_Ericksoniana_Sist\xe9mica/ORG', u'Puerto_Rico/

In [103]:
from collections import Counter
tagged_entities_flat = reduce(lambda x,y: x+y, tagged_entities)
tagged_entities_type_flat = reduce(lambda x,y: x+y, tagged_entities_type)
tagged_entity_counts = Counter(tagged_entities_flat)
tagged_entity_type_counts = Counter(tagged_entities_type_flat)
# print most common
top_k = 10
print(tagged_entity_counts.most_common(top_k))
print(tagged_entity_type_counts.most_common(top_k))

[(u'Alguien', 2268), (u'Dios', 1600), (u'Puerto_Rico', 1385), (u'PR', 1050), (u'Guayama', 874), (u'Hola', 671), (u'Gracias', 660), (u'Coamo', 478), (u'Barranquitas', 460), (u'Ponce', 419)]
[(u'Alguien/ORG', 2099), (u'Puerto_Rico/LUG', 1328), (u'Dios/PERS', 1050), (u'PR/ORG', 1003), (u'Guayama/LUG', 837), (u'Gracias/OTROS', 637), (u'Hola/ORG', 631), (u'Dios/OTROS', 501), (u'Barranquitas/LUG', 443), (u'Coamo/LUG', 433)]


In [104]:
# breakdown by entity type
entity_type_flat = map(lambda x: x.split('/')[1], tagged_entities_type_flat)
entity_type_counts = Counter(entity_type_flat)
print(entity_type_counts.items())

[(u'', 28), (u'LUG', 18442), (u'PERS', 27067), (u'7', 2), (u'cz3yxqThank', 1), (u'ORG', 29553), (u'OTROS', 8352), (u'171', 1)]


Most mentions are either location, person or organization. GOOD.

## Build sample data
Let's start from scratch with a new sample of data, taken over the whole corpus. $N=500$.

In [105]:
pd.np.random.seed(123)
N = 500
# need to include text and post index for later cross-referencing
fb_group_data_sample_idx = pd.np.random.choice(fb_group_data_es_txt_df.index, size=N, replace=False)
fb_group_data_sample = fb_group_data_es_txt_df.loc[fb_group_data_sample_idx, :]
# write to file: raw data and pre-annotation data
sample_file_name = '../../data/facebook-maria/all_group_sample_statuses.txt'
sample_annotate_file_name = '../../data/facebook-maria/all_group_sample_statuses_annotated.txt'
with codecs.open(sample_file_name, 'w', encoding='utf-8') as sample_output:
    for l in fb_group_data_sample.loc[:, 'status_message_clean'].values.tolist():
        sample_output.write('%s\n'%(l))
with codecs.open(sample_annotate_file_name, 'w', encoding='utf-8') as annotate_output:
    for i, r in fb_group_data_sample.iterrows():
        annotate_output.write('%s\t%s\n'%(r.loc['status_id'], r.loc['status_message_clean']))

...lots of annotation later...

In [138]:
# annotated text
sample_annotate_file_name = '../../data/facebook-maria/all_group_sample_statuses_annotated.txt'
sample_annotations = [l.strip() for l in codecs.open(sample_annotate_file_name, 'r', encoding='utf-8')]
print('\n'.join(sample_annotations[:10]))

486819048360070_495834210791887	Las ayudas donde estan parando ?
1773209126315380_1798559843780308	@Clandestino te agararon de toalla pa limpiarle el nombre al otro ... Recuerda que tu vives del pueblo y lo que hiso el señor Narmito Representante desde la luna se notava que era para su uso personal VIP ! @Clandestino consejo no metas tus manos al fuego por algien recuerda que tu eres lo q eres gracias al pueblo despues de 3 dias da cara 2 verciones de amistad 1 video y la gente pasando habre y sed ! ! ?? ?? ?? ?? DESPUES QUE PASAN EL VIDEO REPARTE LOS SUMINISTROS ESA MISMA NOCHE CUANDO SUPUESTAMENTE LOS ESTABA GUARDANDO PAL OTRO DIA ?? ?? ?? ?? ?? ?? Narmito Ortiz CHARLATANahhhh bloqueenme este tambie . l .
1979604895658060_1988090978142785	Mañana van a [[Corozal]]{CITY} ! Tienes que ver el vídeo ! ! #uni1porpuertorico
1773209126315380_1774270739542552	Alguien sabe de Juan Antonio ( Tony ) Colon ?
1988415758095382_1988888744714750	Esta es parte de mi familia mi hermano Emanuel Rojas ( 

In [139]:
import re
from unidecode import unidecode
# extract mentions
annotate_matcher = re.compile(r'(\[\[[^\]]+\]\])(\{[A-Z]+\})')
def extract_annotations(x):
    # TODO: include index number!
    annotations = [(x[2:-2], y[1:-1]) for (x,y) in annotate_matcher.findall(x)]
    return annotations
sample_annotation_mentions = map(extract_annotations, sample_annotations)
print(sample_annotation_mentions[:10])

[[], [], [(u'Corozal', u'CITY')], [], [(u'San Antonio', u'CITY'), (u'callejon', u'TOPO'), (u'Medina', u'TOPO')], [(u'VEGA ALTA', u'CITY')], [(u'Vista Monte', u'TOPO')], [], [(u'Barranquitas', u'CITY')], [(u'palmarejos', u'CITY')]]


Let's get some basic counts, like toponym type and most frequent toponyms.

In [140]:
from collections import Counter
sample_annotation_mentions_flat = reduce(lambda x,y: x+y, sample_annotation_mentions)
topo_names, topo_types = zip(*sample_annotation_mentions_flat)
topo_name_counts = Counter(topo_names)
topo_type_counts = Counter(topo_types)
print(topo_name_counts.most_common(10))
print(topo_type_counts.most_common())

[(u'Guayama', 26), (u'Vega Alta', 20), (u'Ponce', 15), (u'Coamo', 14), (u'Utuado', 11), (u'Yabucoa', 11), (u'Lajas', 11), (u'Vega Baja', 10), (u'Quebradillas', 9), (u'Barranquitas', 8)]
[(u'CITY', 366), (u'TOPO', 180), (u'UNK', 105)]


In [291]:
# percentages lol
topo_type_count_series = pd.Series(topo_type_counts)
topo_type_count_series /= topo_type_count_series.sum()
print(topo_type_count_series)

CITY    0.562212
TOPO    0.276498
UNK     0.161290
dtype: float64


Most toponyms mentioned tend to be the big cities, and there's a lot of `UNK` toponyms too! This will be a problem for the resolution engine.

How many statuses have at least one `TOPO`?

In [294]:
from __future__ import division
topo_status = filter(lambda x: any(map(lambda y: y[1]=='TOPO', x)), sample_annotation_mentions)
print(len(topo_status) / len(sample_annotation_mentions))

0.266


## Extract all locations

In [155]:
# need to rewrite the annotated data first because of mishaps the first time around
# annotate_cleaner_1 = re.compile('(\[\[[^\]]\]\])')
annotate_cleaner_1 = re.compile('(\[\[)|(\]\])')
annotate_cleaner_2 = re.compile('\{[^}]+\}')
clean_annotations = lambda x: annotate_cleaner_1.sub('', annotate_cleaner_2.sub('', x))
sample_annotations_clean = map(clean_annotations, sample_annotations)
print('\n'.join(sample_annotations_clean[:10]))
# write to file
sample_file_name = '../../data/facebook-maria/all_group_sample_statuses.txt'
with codecs.open(sample_file_name, 'w', encoding='utf-8') as sample_file:
    sample_file.write('\n'.join(sample_annotations_clean))

486819048360070_495834210791887	Las ayudas donde estan parando ?
1773209126315380_1798559843780308	@Clandestino te agararon de toalla pa limpiarle el nombre al otro ... Recuerda que tu vives del pueblo y lo que hiso el señor Narmito Representante desde la luna se notava que era para su uso personal VIP ! @Clandestino consejo no metas tus manos al fuego por algien recuerda que tu eres lo q eres gracias al pueblo despues de 3 dias da cara 2 verciones de amistad 1 video y la gente pasando habre y sed ! ! ?? ?? ?? ?? DESPUES QUE PASAN EL VIDEO REPARTE LOS SUMINISTROS ESA MISMA NOCHE CUANDO SUPUESTAMENTE LOS ESTABA GUARDANDO PAL OTRO DIA ?? ?? ?? ?? ?? ?? Narmito Ortiz CHARLATANahhhh bloqueenme este tambie . l .
1979604895658060_1988090978142785	Mañana van a Corozal ! Tienes que ver el vídeo ! ! #uni1porpuertorico
1773209126315380_1774270739542552	Alguien sabe de Juan Antonio ( Tony ) Colon ?
1988415758095382_1988888744714750	Esta es parte de mi familia mi hermano Emanuel Rojas ( nolito / n

NER first.

In [156]:
!bash tag_spanish_txt.sh ../../data/facebook-maria/all_group_sample_statuses.txt

Invoked on Fri Dec 01 12:35:08 EST 2017 with arguments: -loadClassifier classifiers/spanish.ancora.distsim.s512.crf.ser.gz -textFile ../../../data/facebook-maria/all_group_sample_statuses.txt
loadClassifier=classifiers/spanish.ancora.distsim.s512.crf.ser.gz
textFile=../../../data/facebook-maria/all_group_sample_statuses.txt
Loading classifier from classifiers/spanish.ancora.distsim.s512.crf.ser.gz ... done [1.2 sec].
Untokenizable: ️ (U+FE0F, decimal: 65039)
CRFClassifier tagged 20978 words in 1787 documents at 7868.72 words per second.


In [228]:
ner_file = '../../data/facebook-maria/all_group_sample_statuses_ner.txt'
ner_annotation_lines = [l.strip() for l in codecs.open(ner_file, 'r', encoding='utf-8')]

In [229]:
# fix annotation text!!
from data_helpers import clean_tagged_txt
delim = '[0-9]{10,}_[0-9]{10,}'
ner_annotation_lines = clean_tagged_txt(ner_annotation_lines, delim=delim)

processed 0 lines


In [233]:
from data_helpers import collect_entities_from_txt
ner_annotations = map(collect_entities_from_txt, ner_annotation_lines)
# replace underscore with space!!
ner_annotations = map(lambda x: map(lambda y: y.replace('_', ' '), x), ner_annotations)
print(ner_annotations[:20])

[[], [u'Narmito Representante', u'VIP', u'Narmito Ortiz CHARLATANahhhh'], [u'Corozal'], [u'Juan Antonio -LRB- Tony -RRB- Colon'], [u'Emanuel Rojas -LRB- nolito / nolo -RRB-', u'Liz', u'Asuncion Hernandez', u'Barrio San Antonio', u'Medina'], [u'VEGA'], [u'R\xedo Abajo', u'Vista Monte'], [u'SJ', u'Miami', u'NY'], [u'Barranquitas'], [u'To\xf1o Toro', u'Toros'], [u'Dios bendigas'], [u'Raul A', u'Ortiz -LRB- Goyito', u'Funeraria Rivera', u'Margarita Vazquez', u'Marliz Torres', u'Juan R', u'V\xe1zquez', u'Rabanal', u'Carmelo Rolon Jimenez', u'PR'], [u'Qien'], [u'FB', u'Estado'], [u'Porfin', u'Dios', u'PR'], [u'Viejo San Juan'], [u'Puerto Rico', u'Mar\xeda', u'Puerto Rico', u'Sali', u'Puerto Rico', u'Facebook'], [u'Palo Hincado', u'Nelson'], [u'fucking', u'Porq'], [u'Buzon', u'Davila']]


Lexicon second.

In [179]:
import data_helpers
reload(data_helpers)
from data_helpers import collect_lexicon_toponyms_from_txt
sample_file_name = '../../data/facebook-maria/all_group_sample_statuses.txt'
sample_statuses = [l.strip() for l in codecs.open(sample_file_name, 'r', encoding='utf-8')]
lexicon_file = '../../data/geo_files/toponym_lexicon_filtered.txt'
lexicon = [l.strip() for l in codecs.open(lexicon_file, 'r', encoding='utf-8')]
lexicon_annotations = collect_lexicon_toponyms_from_txt(sample_statuses, lexicon)
print('\n'.join(sample_statuses[:10]))
print('\n'.join(map(str, lexicon_annotations[:10])))

486819048360070_495834210791887	Las ayudas donde estan parando ?
1773209126315380_1798559843780308	@Clandestino te agararon de toalla pa limpiarle el nombre al otro ... Recuerda que tu vives del pueblo y lo que hiso el señor Narmito Representante desde la luna se notava que era para su uso personal VIP ! @Clandestino consejo no metas tus manos al fuego por algien recuerda que tu eres lo q eres gracias al pueblo despues de 3 dias da cara 2 verciones de amistad 1 video y la gente pasando habre y sed ! ! ?? ?? ?? ?? DESPUES QUE PASAN EL VIDEO REPARTE LOS SUMINISTROS ESA MISMA NOCHE CUANDO SUPUESTAMENTE LOS ESTABA GUARDANDO PAL OTRO DIA ?? ?? ?? ?? ?? ?? Narmito Ortiz CHARLATANahhhh bloqueenme este tambie . l .
1979604895658060_1988090978142785	Mañana van a Corozal ! Tienes que ver el vídeo ! ! #uni1porpuertorico
1773209126315380_1774270739542552	Alguien sabe de Juan Antonio ( Tony ) Colon ?
1988415758095382_1988888744714750	Esta es parte de mi familia mi hermano Emanuel Rojas ( nolito / n

## Compute precision/recall
Remember! This is just for toponym extraction, not for resolution.

In [315]:
# known
sample_annotation_mentions_known = map(lambda x: filter(lambda y: y[1]!='UNK', x), sample_annotation_mentions)
# topos only
sample_annotation_mentions_topo = map(lambda x: filter(lambda y: y[1]=='TOPO', x), sample_annotation_mentions)

In [316]:
gold_mentions = map(lambda x: zip(*x)[0] if len(x)>0 else [], sample_annotation_mentions_known)
gold_topo_mentions = map(lambda x: zip(*x)[0] if len(x)>0 else [], sample_annotation_mentions_topo)

In [319]:
print(gold_mentions[:10])
print(gold_topo_mentions[:10])

[[], [], (u'Corozal',), [], (u'San Antonio', u'callejon', u'Medina'), (u'VEGA ALTA',), (u'Vista Monte',), [], (u'Barranquitas',), (u'palmarejos',)]
[[], [], [], [], (u'callejon', u'Medina'), [], (u'Vista Monte',), [], [], []]


## All locations

Let's evaluate NER and the lexicon on all locations first, then sub-city locations.

### NER performance

In [306]:
import data_helpers
reload(data_helpers)
from data_helpers import test_precision_recall
ner_false, ner_missed, ner_precision, ner_recall = test_precision_recall(ner_annotations, gold_mentions)

In [307]:
print('NER precision=%.3f, recall=%.3f'%(ner_precision, ner_recall))

NER precision=0.173, recall=0.457


This is pretty bad! Will it help precision if we restrict to location mentions?

In [308]:
ner_type_annotations = map(lambda x: collect_entities_from_txt(x, include_type=True), ner_annotation_lines)
ner_type_annotations = map(lambda x: map(lambda y: y.replace('_', ' '), x), ner_type_annotations)
ner_type_annotations = map(lambda x: map(lambda y: y.split('/'), x), ner_type_annotations)
loc_type = 'LUG'
ner_loc_annotations = map(lambda x: filter(lambda y: y[1]==loc_type, x), ner_type_annotations)
# remove type
ner_loc_annotations = map(lambda x: map(lambda y: y[0], x), ner_loc_annotations)
ner_loc_annotations_flat = reduce(lambda x,y: x+y, ner_loc_annotations)
print(ner_loc_annotations_flat[:10])

[u'Barrio San Antonio', u'VEGA', u'R\xedo Abajo', u'Miami', u'Barranquitas', u'Funeraria Rivera', u'Viejo San Juan', u'Puerto Rico', u'Puerto Rico', u'Puerto Rico']


These locations make sense! So are they not showing up in the gold annotations or what?

In [309]:
ner_false, ner_missed, ner_precision, ner_recall = test_precision_recall(ner_loc_annotations, gold_mentions)
print('LOC NER precision=%.3f, recall=%.3f'%(ner_precision, ner_recall))

LOC NER precision=0.434, recall=0.273


Precision improved a lot but recall fell. What are we missing?

In [310]:
ner_missed_flat = reduce(lambda x,y: x+y, ner_missed)
ner_false_flat = reduce(lambda x,y: x+y, ner_false)
print(ner_missed_flat[:10])
print(ner_false_flat[:10])

[u'Corozal', u'Medina', u'San Antonio', u'callejon', u'VEGA ALTA', u'Vista Monte', u'palmarejos', u'Rabanal', u'corozal', u'guarico']
[u'Barrio San Antonio', u'VEGA', u'R\xedo Abajo', u'Miami', u'Funeraria Rivera', u'Puerto Rico', u'Puerto Rico', u'Puerto Rico', u'Buzon', u'Para\xedso']


In [311]:
print(gold_mentions[20:40])
print(ner_missed[20:40])
print(ner_false[20:40])

[(u'Urbanizaci\xf3n Manuel J Rivera', u'Coamo'), (u'urbanizaci\xf3n El Para\xedso', u'Ponce'), (u'carr 104', u'Mayaquez'), (u'Calle Victoria', u'El Dorado'), [], (u'Yabucoa',), (u'Cerro Gordo', u'Dorado'), [], (u'Guayama',), [], (u'Yabucoa',), [], (u'Quebradillas',), (u'Santa Ana', u'Arriba'), (u'Las Parcelas', u'Carmen'), [], (u'Guayama', u'Guayama', u'Guayama'), (u'Olimpo',), (u'Palos Blanco',), (u'VEGA ALTA',)]
[[u'Coamo', u'Urbanizaci\xf3n Manuel J Rivera'], [u'urbanizaci\xf3n El Para\xedso'], [u'Mayaquez', u'carr 104'], [u'Calle Victoria', u'El Dorado'], [], [u'Yabucoa'], [u'Cerro Gordo', u'Dorado'], [], [], [], [], [], [], [u'Arriba', u'Santa Ana'], [u'Carmen', u'Las Parcelas'], [], [], [], [u'Palos Blanco'], [u'VEGA ALTA']]
[[], [u'Para\xedso'], [], [u'Calle Victoria K11'], [], [], [], [], [u'Florida'], [], [u'Boston Massachusetts'], [], [u'ZELLOWalkie Talkie'], [], [u'Parcelas Carmen'], [], [], [], [u'Barrio Palos Blanco Serca'], []]


False negatives are fine-grained entities:
- `callejon`
- `Vista Monte`
- `carr 104`

False positives are coarse-grained entities:
- `Puerto Rico`
- `Florida`

### Lexicon performance

In [312]:
# first convert all gold mentions to lowercase
gold_mentions_lower = map(lambda x: map(lambda y: y.lower(), x), gold_mentions)
lex_false, lex_missed, lex_precision, lex_recall = test_precision_recall(lexicon_annotations, gold_mentions_lower)

In [313]:
print('lexicon precision=%.3f, recall=%.3f'%(lex_precision, lex_recall))

lexicon precision=0.499, recall=0.540


Woah!! This is so good!! Why did we ever do anything besides this??

In [314]:
print(gold_mentions_lower[20:40])
print(lex_missed[20:40])
print(lex_false[20:40])

[[u'urbanizaci\xf3n manuel j rivera', u'coamo'], [u'urbanizaci\xf3n el para\xedso', u'ponce'], [u'carr 104', u'mayaquez'], [u'calle victoria', u'el dorado'], [], [u'yabucoa'], [u'cerro gordo', u'dorado'], [], [u'guayama'], [], [u'yabucoa'], [], [u'quebradillas'], [u'santa ana', u'arriba'], [u'las parcelas', u'carmen'], [], [u'guayama', u'guayama', u'guayama'], [u'olimpo'], [u'palos blanco'], [u'vega alta']]
[[u'urbanizaci\xf3n manuel j rivera'], [u'ponce', u'urbanizaci\xf3n el para\xedso'], [u'carr 104', u'mayaquez'], [], [], [], [], [], [], [], [], [], [], [u'arriba'], [u'carmen', u'las parcelas'], [], [u'guayama', u'guayama'], [], [u'palos blanco'], []]
[[], [u'para\xedso'], [u'suarez'], [u'dorado', u'la princesa'], [], [], [], [], [u'florida'], [], [u'camino nuevo'], [], [], [u'capilla', u'capilla de santa ana'], [], [], [], [], [], []]


False negatives are respellings:

- `carr 104`
- `mayaguez`

False positives are overlap errors and irrelevant toponyms:
- `dorado` (extracted from `el dorado`)
- `la princesa`

## Toponyms only

In [320]:
ner_false, ner_missed, ner_precision, ner_recall = test_precision_recall(ner_loc_annotations, gold_topo_mentions)
print('TOPO NER precision=%.3f, recall=%.3f'%(ner_precision, ner_recall))
gold_topo_mentions_lower = map(lambda x: map(lambda y: y.lower(), x), gold_topo_mentions)
lex_false, lex_missed, lex_precision, lex_recall = test_precision_recall(lexicon_annotations, gold_topo_mentions_lower)
print('TOPO lex precision=%.3f, recall=%.3f'%(lex_precision, lex_recall))

TOPO NER precision=0.051, recall=0.101
TOPO lex precision=0.143, recall=0.472


### Combined performance
Can we combine the NER and lexicon annotations into a super-group?

This will require ~indexing~ the data, one word at a time.

In [246]:
# get rid of per-line indices
index_matcher = re.compile('[0-9]{10,}_[0-9]{10,}')
sample_annotate_file_name = '../../data/facebook-maria/all_group_sample_statuses_annotated.txt'
sample_annotations = [l.strip() for l in codecs.open(sample_annotate_file_name, 'r', encoding='utf-8')]
sample_annotations = map(lambda x: index_matcher.sub('', x).strip(), sample_annotations)

In [266]:
a = 'blah,blah'
a_split = a.split(',')

In [267]:
a_split.insert(1, 'x')
print(a_split)

['blah', 'x', 'blah']


In [282]:
def index_txt(txt):
    """
    Add index after each token.
    """
    txt_indexed = []
    end_matcher = re.compile('[^\]]{[A-Z]+}')
    punct_matcher = re.compile('\.\.+')
    txt_split = txt.split(' ')
    ctr = 0
    for t in txt_split:
        # put index after word, not phrase
        if(t != ''):
            if(punct_matcher.match(t)):
                t_split = t.split('.')
                for t_ in t_split:
                    txt_indexed.append('%s_%d'%(t_, ctr))
                    ctr += 1
            else:
                if(t.endswith('}')):
                    t_1, t_2 = t.split(']]')
                    t_1 = '%s_%d'%(t_1, ctr)
                    t = '%s]]%s'%(t_1, t_2)
                else:
                    t = '%s_%d'%(t, ctr)
                txt_indexed.append(t)
                ctr += 1
    txt_indexed = ' '.join(txt_indexed)
    return txt_indexed

In [257]:
test_txt = sample_annotations[10]
print(index_txt(test_txt))

(u'vaba', u'{UNK}')
Dios_0 bendigas_1 si_2 alguien_3 saber_4 de_5 brenda_6 marie_7 adorno_8 de_9 [[vega_10 vaba_11]]{UNK} ley_12 que_13 diga_14 algo_15 no_16 sabemos_17 nada_18 de_19 ella_20 por_21 favor_22


In [None]:
# preprocessing steps to make sure text is consistent between original and NER tagged
'..' '.'
'!!' '!'
'`' ''
'<URL>' 'URL'
':' ' : '
'#' ' # '
'-RCB-' ''
'-' ' - '

In [290]:
for i, (l1, l2) in enumerate(izip(sample_annotations, ner_annotation_lines)):
    l1_tokens = l1.strip().split(' ')
    l2_tokens = l2.strip().split(' ')
    l1_len = len(l1_tokens)
    l2_len = len(l2_tokens)
    if(l1_len != l2_len):
        print('error at index %d with len %d and %d'%(i, l1_len, l2_len))
        print(index_txt(l1))
#         print('<%s>'%(l1_tokens[-1].encode('utf-8')))
        print(index_txt(l2))

error at index 11 with len 122 and 125
Por_0 favor_1 miren_2 bien_3 _4 _5 _6 _7 Mi_8 hermano_9 Raul_10 A_11 ._12 Torres_13 Ortiz_14 (_15 Goyito_16 que_17 trabajaba_18 en_19 la_20 [[Funeraria_21 Rivera_22]]{UNK} )_23 y_24 su_25 esposa_26 Margarita_27 Vazquez_28 _29 _30 _31 Mi_32 sobrina_33 Marliz_34 Torres_35 y_36 su_37 esposo_38 Juan_39 R_40 ._41 Vázquez_42 (_43 ambos_44 trabajan_45 en_46 la_47 farmacia_48 [[Rabanal_49]]{CITY} y_50 mi_51 tio_52 Carmelo_53 Rolon_54 Jimenez_55 _56 _57 _58 _59 Si_60 los_61 han_62 visto_63 por_64 favor_65 dejemen_66 saber_67 algo_68 !_69 !_70 !_71 ??_72 ??_73 ??_74 ??_75 ??_76 ??_77 ??_78 ??_79 ??_80 ??_81 ??_82 ??_83 ??_84 ??_85 ??_86 ??_87 Here_88 is_89 my_90 inmediate_91 family_92 in_93 PR_94 _95 _96 _97 My_98 brother_99 and_100 his_101 wife_102 ,_103 my_104 niece_105 and_106 her_107 husband_108 and_109 my_110 uncle_111 _112 _113 _114 If_115 you_116 have_117 seen_118 them_119 please_120 let_121 me_122 know_123 ??_124 ??_125 ??_126 ??_127 ??_128 ??_129 ?

In [258]:
test_ner_txt = ner_annotation_lines[10]
print(index_txt(test_ner_txt))

Dios/OTROS_0 bendigas/OTROS_1 si/O_2 alguien/O_3 saber/O_4 de/O_5 brenda/O_6 marie/O_7 adorno/O_8 de/O_9 vega/O_10 vaba/O_11 ley/O_12 que/O_13 diga/O_14 algo/O_15 no/O_16 sabemos/O_17 nada/O_18 de/O_19 ella/O_20 por/O_21 favor/O_22


Maybe this is the wrong way to go about it!

TODO: Let's "zip" together the lexicon and NER annotation lists and, in the case of shared mentions, we'll split them based on approximate collocation.

### Neural NER
Let's test the output of [this](https://github.com/glample/tagger) neural NER tagger that has been trained on news text (I think?).

In [None]:
# already tagged
neural_ner_annotated_status_file = '../../data/facebook-maria/'