In [1]:
import pandas as pd

# Create entity recognition model / text matcher

In [2]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English
from spacy.tokens import Span
from spacy import displacy

PhraseMatcher matches on large terminology lists vs Matcher which matches only token patterns  
Source: https://spacy.io/usage/rule-based-matching

### Create list of terms / test PhraseMatcher class
Lists are from assorted sources, including topic-terms lists related to natural disasters and manually created keywords from Hurricane Harvey tweets. 

In [3]:
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER") # case insensitive matching

In [4]:
s = '''attic
basement
bathroom
bedroom
cellar
closet
den
dining room
front yard
garage
hall
hallway
kitchen
laundry
living room
master bedroom
office
pantry
patio
playroom
porch
staircase
study
sun room
TV room
workshop'''

In [5]:
HOUSE_AREA = s.split('\n')

In [6]:
ACCESSIBILITY = ['wheelchair', 'stuck', 'special needs']
YOUNG = ['newborn', 'baby', 'kid', 'kids', 'child', 'children']
ELDERLY = ['elderly', 'grandma', 'grandpa', 'grandparents', 'grandmother']
PETS = ['cat', 'dog', 'pet']

In [7]:
SHELTER = ['living', 'evacuated', 'makeshift', 'necessities', 'tarpaulins',
       'refugee camps', 'stoves', 'plastic sheeting', 'relief assistance',
       'evacuation', 'food items', 'refuge', 'accommodation',
       'embankments', 'roof', 'housing', 'volunteers',
       'emergency shelter', 'emergency evacuation center', 'tent',
       'village', 'emergency', 'humanitarian assistance', 'families',
       'temporary housing', 'camps', 'rations', 'constructed',
       'relocated', 'blankets', 'pet detective', 'made homeless',
       'refugee camp', 'damaged home', 'houses', 'villages', 'meals',
       'buildings', 'affected communities', 'host families',
       'storm shelter', 'refugee shelters', 'sandbag homes', 'shelters',
       'sleeping', 'classrooms', 'plastic sheets', 'basic needs',
       'clothes', 'assistance', 'clothing', 'left homeless',
       'construction', 'family kits', 'temporary accommodation',
       'houses collapsed', 'slum', 'winterised', 'permanent houses',
       'cyclone shelters', 'permanent housing', 'relief organizations',
       'rendered homeless', 'resettled', 'worst hit areas',
       'accommodation centres', 'several villages', 'sewage',
       'homes damaged', 'sought refuge', 'household items',
       'whose houses', 'corrugated', 'basic necessities',
       'low lying areas', 'guest families', 'floods caused',
       'communities affected', 'overcrowded', 'flood hit areas',
       'devastated areas', 'houses damaged', 'evacuating',
       'makeshift shelters', 'slums', 'hygiene items', 'whose homes',
       'winterized', 'makeshift homes', 'quake survivors',
       'isolated areas', 'mobile clinics', 'tarpaulin', 'sheltered',
       'shelter kits', 'shelter material', 'cooking utensils',
       'severely malnourished', 'barracks', 'water containers',
       'dwellings', 'school buildings', 'habitable shelter',
       'prefabricated', 'crowded', 'mass emergency shelter',
       'winterized tents', 'tented', 'warm clothing', 'vulnerable areas',
       'emergency shelters', 'damaged homes', 'tarps',
       'coastal districts', 'emergency operations', 'refugee agency',
       'sanitation services', 'temporary camps', 'total loss',
       'destitute', 'firewood', 'transitional shelter', 'camping',
       'housed', 'latrine', 'transitional shelters', 'building materials',
       'prone areas', 'floating homes', 'makeshift camps', 'huts',
       'bedding', 'resettle', 'mattresses', 'quake zone',
       'household kits', 'mountainous areas', 'uprooted',
       'storage facilities', 'post disaster emergency shelter',
       'taking refuge', 'tent village', 'emergency relief efforts',
       'shacks', 'refugee families', 'accommodation centers',
       'warm clothes', 'houses destroyed', 'makeshift tents',
       'collapsed buildings', 'monsoon flooding', 'flood survivors',
       'monsoon floods', 'airstrips', 'villages near', 'heavy floods',
       'winterization', 'families displaced', 'winter clothing',
       'permanent homes', 'thatched', 'temporary schools', 'orphanage',
       'mountainous regions', 'severe flood', 'jerrycans',
       'mountain villages', 'tent camps', 'quake relief', 'iron sheets',
       'rebuild homes', 'huddled', 'newly displaced',
       'temporary settlements', 'water recedes', 'housing units',
       'uprooted trees', 'sought shelter', 'essential household items',
       'unsanitary', 'damaged buildings', 'tent villages',
       'medical evacuation', 'rural villages', 'permanent shelters',
       'homes destroyed', 'dwelling', 'water reservoirs',
       'medical clinics', 'apartment', 'inaccessible areas',
       'temporary housing camp', 'sandbags', 'transit camps',
       'devastating flood', 'displaced thousands', 'relief coordinator',
       'polythene sheets', 'relief centres', 'families displaced by',
       'have taken refuge', 'squalid', 'apartments', 'seeking shelter',
       'irrigate', 'orphanages', 'earthquake resistant',
       'refugees fleeing', 'adequate shelter', 'cyclone survivors',
       'vast areas', 'plastic sheet', 'sanitary facilities',
       'winterised tents', 'thatch', 'outlying areas', 'cramped',
       'spontaneous camps', 'accommodations', 'heaters',
       'evacuation centers', 'seek shelter', 'displacement camps',
       'sleeping bags', 'water tank', 'being sheltered',
       'construction material', 'makeshift camp', 'iron sheeting',
       'transitional housing', 'shanty', 'temporary latrines',
       'kitchen kits', 'declared unsafe', 'cantonment', 'bednets',
       'relocation sites', 'nearby towns', 'being housed',
       'washing facilities', 'sewage systems', 'household utensils',
       'habitable', 'communities devastated', 'winterisation',
       'sleeping outside', 'unsanitary conditions', 'basic healthcare',
       'permanent structures', 'spontaneous settlements', 'flooded homes',
       'flood plain', 'humanitarian conditions', 'repair kits',
       'kerosene stoves', 'river embankments', 'granaries',
       'flood waters recede', 'temporary shelter materials',
       'life saving supplies', 'camp management', 'catastrophic flooding',
       'dormitories', 'shipping container comes', 'evacuation shelters',
       'roofing materials', 'christmas mass', 'informal settlements',
       'temporary classrooms', 'other basic necessities',
       'building latrines', 'handpumps', 'habitation', 'roofed',
       'makeshift tent', 'evacuation centre', 'flood areas',
       'cooked meals', 'outlying villages', 'camp out', 'bathrooms',
       'build temporary shelters', 'water catchment',
       'temporary structures', 'roofing sheets', 'resettlement sites',
       'tented camps', 'water storage tanks', 'eastern areas',
       'winter clothes', 'shipping container homes',
       'displaced residents', 'temporary learning spaces', 'sanctuary',
       'flood stricken', 'basic household items', 'unhygienic',
       'persons displaced', 'homeless shelter', 'rescue personnel',
       'temporary accommodation centres', 'hard hit areas',
       'bathing facilities', 'plastic tarpaulins',
       'adequate sanitation facilities', 'areas surrounding',
       'insulation', 'resettlement site', 'sleeping outdoors',
       'temporary toilets', 'morgue', 'toiletries', 'buildings destroyed',
       'without proper shelter', 'ramshackle', 'fleeing conflict',
       'marooned villagers', 'floodplain', 'sleep outside', 'evacuee',
       'basement', 'encampment', 'woolen blankets', 'remain unaccounted',
       'water contaminated', 'tent cities', 'snowline',
       'building shelters', 'campsites', 'corrugated iron sheeting',
       'wool blankets', 'sanitary latrines', 'shelter cluster',
       'hygiene articles', 'flood zones', 'tentage', 'thatched houses',
       'buildings damaged', 'basic amenities', 'bamboo poles',
       'isolation units', 'unhygienic conditions', 'squatter',
       'quake affectees', 'evacuate residents', 'prefab',
       'crowded conditions', 'defecate', 'kitchen equipment',
       'prefabricated houses', 'displacement sites', 'have found refuge',
       'tarp', 'wooden houses', 'after devastating floods',
       'earthquake affectees', 'refuges', 'distributing hygiene kits',
       'hostel', 'crowded camps', 'family survival kits',
       'construct temporary', 'welfare centers', 'overcrowded camps',
       'residential houses', 'squatters', 'neighboring areas',
       'became homeless', 'poorly constructed', 'kerosene lamps',
       'villages destroyed', 'temporary settlement', 'flood ravaged',
       'dormitory', 'cooking facilities', 'single family shelters',
       'tented villages', 'soup kitchens', 'plywood', 'evacuation plans',
       'resistant homes', 'residential buildings', 'lodging',
       'tented camp', 'partially damaged houses', 'zinc sheets',
       'air dropped', 'build latrines', 'relocation camps',
       'flood damages', 'parental controls', 'coastal residents',
       'several buildings', 'dwelling units',
       'proper sanitation facilities', 'cramped conditions', 'rehoused',
       'temporarily accommodated', 'halal meals', 'beddings',
       'kitchen items', 'hostels', 'refugee arrivals',
       'basic necessities such', 'ground sheets', 'relocation centres',
       'shanties', 'constructing latrines', 'deplorable conditions',
       'transit site', 'plastic tarps', 'temporarily housed',
       'hygiene parcels', 'heating stoves', 'transit sites',
       'sanitary supplies', 'tents erected', 'temporary housing units',
       'emergency accommodation', 'temporary shelter sites',
       'internally displaced person', 'encampments',
       'personal hygiene items', 'shelters across', 'mobile toilets',
       'shift shelters', 'shantytowns', 'vulnerable residents',
       'currently sheltering', 'cottages', 'cleaning kits',
       'thatched roofs', 'sleep outdoors', 'overcrowded conditions',
       'tarpaulin sheet', 'high rise buildings', 'host communities',
       'living quarters', 'displaced persons camp', 'cookware',
       'hygiene sets', 'plastic tents', 'makeshift shelter',
       'sized tents', 'remain marooned', 'landless families',
       'resettlement centers', 'forcibly displaced', 'emergency housing',
       'housing units collapsed', 'makeshift settlements',
       'spontaneous camp', 'campsite', 'transit centers', 'rockslides',
       'drainage ditches', 'communal latrines', 'communal kitchens',
       'cutlery', 'sturdier', 'providing cooked food', 'shelterless',
       'winter coats', 'settlement sites', 'squalid conditions',
       'sanitary kits', 'makeshift huts', 'larger towns', 'scavenge',
       'temporarily sheltered', 'families fleeing', 'plastic tarpaulin',
       'religious buildings', 'temporary learning centres']

In [8]:
WATER = ['river basin', 'liters', 'water pumps', 'water containers',
       'latrines', 'stagnant', 'salts', 'muddy', 'water bladders',
       'filters', 'clean', 'sanitation services', 'pump',
       'water purifying', 'lake', 'dams', 'chlorine tablets', 'soap',
       'drinking water', 'dirty', 'irrigation systems', 'water supplies',
       'clean drinking water', 'clean drinking', 'boreholes', 'canal',
       'canals', 'herbal tea', 'overflowed', 'safe drinking',
       'bottled water', 'contaminated', 'water system',
       'sanitation systems', 'boiled water', 'water source', 'purifiers',
       'pipe', 'pumps', 'upstream', 'disinfectant', 'sewage', 'lakes',
       'ponds', 'floodwater', 'bathing', 'water quality', 'pollution',
       'drain', 'water level', 'washing facilities',
       'drinking water sources', 'disinfection', 'soil moisture',
       'purify', 'disinfectants', 'salty', 'river valleys',
       'sanitary facilities', 'waterway', 'chlorinated',
       'sand filtration', 'polluted water', 'salt water', 'tributary',
       'provide safe drinking', 'spillway', 'bleach', 'sewerage',
       'river basins', 'liquid', 'surface water', 'drinkable',
       'sewerage system', 'saltwater', 'boiling', 'seawater',
       'water trucking', 'disinfected', 'water treatment plants',
       'water tankers', 'flood waters recede', 'water filters',
       'drainage systems', 'fetch water', 'water treatment units',
       'water treatment plant', 'sales force', 'dirty water',
       'water reservoirs', 'water pump', 'pond', 'purifying tablets',
       'catchments', 'rainwater harvesting', 'providing safe drinking',
       'toilet facilities', 'disinfect', 'sewage system',
       'boiling of water', 'piped water', 'drinking water supply',
       'mineral water', 'purified', 'treatment plants',
       'irrigation water', 'temporary latrines', 'disinfecting',
       'drinkable water', 'underground water', 'gallon', 'urine',
       'irrigate', 'water catchment', 'overflowing rivers',
       'halogen tablets', 'personal hygiene', 'soil erosion',
       'storage tanks', 'wetlands', 'water storage tanks', 'piping',
       'desalination', 'proper sanitation', 'unclean', 'shallow wells',
       'borehole', 'irrigation canals', 'handpumps', 'muddy water',
       'adequate sanitation', 'boil', 'water filtration', 'groundwater',
       'sewage systems', 'septic', 'water tank', 'contaminating',
       'sea water', 'swollen rivers', 'environmental sanitation',
       'pumping stations', 'filtration units', 'water tankering',
       'aquifer', 'lakh cusecs', 'washing hands', 'waste water',
       'brackish', 'storage tank', 'bathing facilities',
       'purifying water', 'creek', 'water bowsers',
       'adequate sanitation facilities', 'non potable', 'soil salinity',
       'cleaning wells', 'contaminate', 'filtered', 'poor drainage',
       'silted', 'purified water', 'storage containers',
       'collapsible water', 'faecal', 'deep wells', 'saline water',
       'fish ponds', 'cubic meter', 'sewer', 'sanitary latrines',
       'chlorinate', 'water pans', 'nitrate', 'wetland', 'wastewater',
       'poor sanitary', 'drinking contaminated water',
       'floodwaters receded', 'silting', 'sanitation practices',
       'irrigating', 'household latrines', 'aquifers', 'flooded rivers',
       'chlorinated water', 'evaporation', 'water tanker',
       'contaminated drinking water', 'untreated water', 'seepage',
       'treatment chemicals', 'artesian', 'irrigation canal', 'riverbed',
       'sewers', 'arsenic', 'bladder tanks', 'fresh drinking water',
       'septic tank', 'unclean water', 'stagnant pools',
       'inadequate sanitation', 'septic tanks',
       'water storage containers', 'water catchments', 'excreta',
       'drainage channels', 'chlorine solution', 'water filter',
       'water contaminated', 'snowmelt', 'cisterns', 'purifier', 'creeks',
       'river beds', 'washing clothes', 'poor sanitary conditions',
       'drinking contaminated', 'chlorinating', 'communal latrines',
       'water company', 'water drainage', 'salty water', 'water purifier',
       'contaminated wells', 'drinking water supplies', 'pollutants',
       'constructing latrines', 'digging wells', 'municipal water',
       'construct latrines', 'body of water', 'supplying drinking water',
       'contaminating water', 'heavily polluted', 'sediments',
       'collapsible water tanks', 'sewage treatment',
       'supplying clean water', 'water storage', 'stinking',
       'water bladder', 'irrigated crops', 'proper drainage',
       'water quality testing', 'irrigated lands', 'radioactive water',
       'pump stations', 'contaminants', 'salinity levels', 'flushing',
       'potable drinking', 'litre water tanks', 'dewatering',
       'water line', 'radioactive waste', 'hand washing facilities',
       'salinated', 'watersheds', 'potable water supply',
       'water consumption', 'rivulets', 'water intrusion',
       'proper sanitation facilities', 'chloride',
       'plastic water containers', 'water mains', 'irrigation equipment',
       'artesian well', 'bacteriological', 'filthy water',
       'artesian wells', 'muddy waters', 'leaching',
       'desalination plants', 'drainages', 'filtration plants',
       'natural springs', 'prevent waterborne diseases',
       'radioactive materials', 'filtration system',
       'water filtration system', 'irrigation ditches', 'chlorine powder',
       'drinking dirty', 'feces', 'sewerage systems', 'drip irrigation',
       'unsafe drinking water', 'drainage canals', 'water gushing',
       'bore wells', 'drink contaminated', 'bottled drinking',
       'hydrological gauging', 'brackish water', 'aluminium sulphate',
       'drinking purposes', 'environmental clean',
       'water filtration plants', 'stagnant flood', 'artificial lake',
       'piped water system', 'drainage ditches', 'irrigation purposes',
       'watercourses', 'water treatment chemicals', 'water storage tank',
       'bottled drinking water', 'sedimentation',
       'potable drinking water', 'putrid', 'rainwater collection',
       'purified drinking', 'boil drinking', 'make water',
       'boiling drinking', 'wastewater treatment',
       'potentially contaminated', 'bathing places', 'freshwater lake',
       'hand washing stations', 'flush latrines', 'drilling boreholes',
       'toxic chemicals', 'aquaculture ponds', 'submersible pumps',
       'purifying packets', 'ceramic water filters', 'sewage pipes',
       'drink contaminated water', 'harvesting tanks', 'pure drinking',
       'waste disposal systems', 'water tap', 'heating systems',
       'lime powder', 'liter water containers', 'water testing kits',
       'collect rainwater', 'tankering water', 'chemical fertilizers',
       'supply pipelines', 'pumping machines', 'drinking dirty water',
       'plastic water tanks', 'purified drinking water',
       'contaminated floodwater', 'water pumping station',
       'boil drinking water', 'effluent', 'electric pump',
       'microbiological', 'shallower', 'pollute', 'liquid waste',
       'calcium hypochlorite', 'receptacles', 'cleaning debris',
       'safe hygiene practices', 'tankered', 'coastal waters',
       'electric pumps', 'water lilies', 'rain water', 'borewells',
       'residual chlorine', 'chlorination tablets', 'highly contaminated',
       'improve drainage', 'bladder tank', 'desalinated',
       'waste disposal system', 'murky water', 'fetid', 'fluoride',
       'storm drains', 'pure drinking water', 'excreta disposal',
       'filtration systems', 'drilling wells', 'water runoff',
       'waste removal', 'drinking polluted', 'waste dumps',
       'storage bladders', 'handpump', 'submersible pump',
       'avoid contamination', 'turbid', 'groundwater levels',
       'irrigation dams', 'sanitizer', 'improved sanitation facilities',
       'chlorinating water', 'radioactive contamination',
       'drinking wells', 'water collection tanks', 'oral saline',
       'earth dams', 'poor drainage system', 'drink dirty',
       'repairing wells', 'inadequate drainage', 'rehabilitating wells',
       'existing boreholes', 'acidic', 'through portable tankers',
       'hydroelectric dams', 'disinfecting wells', 'glacial lakes',
       'irrigation networks', 'prevent soil erosion',
       'installing water tanks', 'desalination units',
       'irrigation reservoirs', 'dewatering pumps',
       'reactor pressure vessels', 'rainwater tanks', 'silted up',
       'water filtration systems', 'store rainwater', 'polluted wells',
       'animal urine', 'water collection points', 'stormwater',
       'shallow tube', 'filtered water', 'chlorination points',
       'turbidity', 'sand filters', 'flowing downstream',
       'radioactive substances', 'perennial rivers', 'polluted waters',
       'rainwater catchment', 'deeper wells', 'adequate drainage',
       'pumping equipment', 'polluted drinking water',
       'heavily contaminated', 'oral rehydration sachets', 'drill wells',
       'pump station', 'drinking untreated', 'contaminated ponds',
       'drilled wells', 'flowing rivers', 'hydropower plants',
       'stagnating water', 'consuming contaminated', 'digging latrines',
       'contaminated soil', 'harvest rainwater', 'portable tankers',
       'rainwater harvesting systems', 'litre tanks', 'refilling',
       'unsanitary living conditions', 'chlorinate water', 'salt content',
       'contaminates', 'treat waterborne diseases',
       'handwashing stations', 'purifies', 'stagnant floodwater',
       'underground aquifers', 'vast lakes', 'seasonal streams',
       'pump wells', 'minor irrigation', 'drinking untreated water',
       'pumps installed', 'flow downstream', 'pump sets', 'algae',
       'deep tube wells', 'wash basins', 'swelled rivers',
       'saltwater intrusion', 'desalination plant', 'chemical packets',
       'water purifying powder', 'pond water', 'piped water systems']

In [9]:
# simple matcher
# matcher.add(ent_label, callback, spacy parsed terms)
matcher.add('HOUSE_AREA', None, *list(nlp.pipe(HOUSE_AREA)))
matcher.add('ACCESSIBILITY', None, *list(nlp.pipe(ACCESSIBILITY)))
matcher.add('YOUNG', None, *list(nlp.pipe(YOUNG)))
matcher.add('ELDERLY', None, *list(nlp.pipe(ELDERLY)))
matcher.add('PETS', None, *list(nlp.pipe(PETS)))
matcher.add('SHELTER', None, *list(nlp.pipe(SHELTER)))
matcher.add('WATER', None, *list(nlp.pipe(WATER)))

In [10]:
# Load Port Arthur tweets, already preprocessed with wordninja to untangle words
port_arthur = pd.read_pickle('./harvey_port_arthur_tweets.pkl')

In [11]:
print(port_arthur.shape)
port_arthur.head()

(85,)


0    2231 Fredrick st port Arthur to 77640409720832...
1    RESCUE ALERT Six people in Port Arthur need re...
2                     2320 COX ST PORT ARTHUR TX 77640
3                                                     
4                           y'all got a team out in PA
Name: processed, dtype: object

In [12]:
# Gather lists
lists = [HOUSE_AREA, ACCESSIBILITY, YOUNG, ELDERLY, PETS, SHELTER, WATER]
listnames = ['HOUSE_AREA', 'ACCESSIBILITY', 'YOUNG', 'ELDERLY', 'PETS', 'SHELTER', 'WATER']

In [13]:
%%time
# simple loop to check if a keyword is in a message
# pretty fast with few enough terms but how does this fit with the final output? 
for i, doc in enumerate(port_arthur):
    for L, name in zip(lists,listnames): 
        for keyword in L:
            if keyword in doc:
                print(i, name, keyword)

6 ELDERLY grandma
15 YOUNG kid
15 YOUNG kids
21 YOUNG kid
21 YOUNG kids
22 ELDERLY grandma
30 ELDERLY grandma
36 ELDERLY grandmother
41 SHELTER roof
45 YOUNG kid
45 YOUNG kids
49 YOUNG child
52 HOUSE_AREA den
62 SHELTER assistance
63 SHELTER assistance
64 ELDERLY grandmother
66 SHELTER roof
69 ACCESSIBILITY stuck
69 ELDERLY grandpa
69 ELDERLY grandparents
71 PETS dog
73 ACCESSIBILITY special needs
73 YOUNG child
74 ACCESSIBILITY wheelchair
74 ELDERLY grandmother
78 ELDERLY grandmother
80 ELDERLY grandmother
82 HOUSE_AREA attic
82 ACCESSIBILITY stuck
82 YOUNG kid
82 YOUNG kids
CPU times: user 21.9 ms, sys: 16.6 ms, total: 38.6 ms
Wall time: 47.6 ms


In [14]:
%%time
# same printing task but with spacy
entities = []
for i, doc in enumerate(nlp.pipe(port_arthur)):
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id] # get unicode ID
        span = doc[start:end] # get matched slice 
        print(i, rule_id, span.text)

6 ELDERLY grandma
13 ELDERLY GRANDPA
13 YOUNG KIDS
15 YOUNG kids
21 YOUNG kids
22 ELDERLY grandma
24 YOUNG KIDS
28 ELDERLY GRANDPARENTS
30 ELDERLY grandma
36 ELDERLY grandmother
38 SHELTER Roof
41 SHELTER roof
45 YOUNG kids
49 YOUNG child
50 SHELTER APARTMENTS
52 ELDERLY Elderly
54 ACCESSIBILITY STUCK
55 ELDERLY Grandma
58 SHELTER ROOF
58 YOUNG NEWBORN
58 YOUNG BABY
62 SHELTER assistance
63 SHELTER assistance
64 ELDERLY grandmother
66 SHELTER roof
67 ACCESSIBILITY SPECIAL NEEDS
67 ACCESSIBILITY STUCK
69 ELDERLY grandparents
69 ACCESSIBILITY stuck
71 PETS dog
73 ACCESSIBILITY special needs
73 YOUNG child
74 ELDERLY grandmother
74 ACCESSIBILITY wheelchair
75 SHELTER EMERGENCY
78 ELDERLY grandmother
80 ELDERLY grandmother
82 ACCESSIBILITY stuck
82 HOUSE_AREA attic
82 YOUNG kids
83 YOUNG KIDS
CPU times: user 211 ms, sys: 8.42 ms, total: 220 ms
Wall time: 247 ms


In [15]:
# Older way of custom spacy entity matcher - spacy 2.1 came out with EntityRuler
class EntityMatcher(object):
    name = 'entity_matcher'
    
    def __init__(self, nlp, term_lists, labels):
        self.matcher = PhraseMatcher(nlp.vocab)
        for tl, label in zip(term_lists, labels):
            self.matcher.add(label, None, *list(nlp.pipe(tl)))
            
    def __call__(self, doc):
        matches = self.matcher(doc) # one at a time
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc

In [17]:
nlp = spacy.load('en_core_web_md')
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [18]:
# there was overlap between some of the term_lists, so truncated it
lists = [HOUSE_AREA, ACCESSIBILITY, YOUNG, ELDERLY, PETS]
listnames = ['HOUSE_AREA', 'ACCESSIBILITY', 'YOUNG', 'ELDERLY', 'PETS']

In [19]:
entity_matcher = EntityMatcher(nlp, lists, listnames)
nlp.add_pipe(entity_matcher, before='ner')
print(nlp.pipe_names)

['tagger', 'parser', 'entity_matcher', 'ner']


In [23]:
%%time 
spacy_docs = []
for i, doc in enumerate(nlp.pipe(port_arthur)):
    print(i, [(ent.text, ent.label_) for ent in doc.ents])
    if i in [24, 74, 82]:
        spacy_docs.append(doc)

0 [('Fredrick', 'PERSON'), ('Arthur', 'PERSON'), ('776404097208322', 'CARDINAL'), ('Linda', 'PERSON')]
1 [('Six', 'CARDINAL'), ('Port Arthur', 'FAC'), ('5235', 'CARDINAL')]
2 [('2320', 'CARDINAL'), ('COX', 'ORG'), ('ST', 'GPE'), ('77640', 'DATE')]
3 []
4 []
5 [('2611', 'CARDINAL'), ('12 th', 'QUANTITY'), ('Port Arthur', 'FAC'), ('77640', 'DATE'), ('74', 'CARDINAL')]
6 [('one', 'CARDINAL'), ('grandma', 'ELDERLY'), ('Ophelia Kin', 'PERSON'), ('10 st Port Art hu', 'WORK_OF_ART')]
7 [('4005 4 th', 'QUANTITY'), ('8322353561', 'DATE')]
8 [('4209 7', 'QUANTITY'), ('St Port', 'FAC'), ('Arthur Tx Water', 'FAC')]
9 [('6', 'CARDINAL'), ('5307', 'CARDINAL'), ('Gulf way', 'EVENT'), ('Apt 161', 'FAC'), ('TX', 'GPE')]
10 [('Port Arthur', 'FAC')]
11 []
12 [('221 pecan', 'QUANTITY'), ('77575', 'CARDINAL')]
13 [('5', 'CARDINAL'), ('PROCTOR PORT', 'FAC'), ('409', 'CARDINAL'), ('2561', 'GPE')]
14 [('1329 Barbara Ln Port', 'QUANTITY')]
15 [('Jacque lyn Foreman', 'PERSON'), ('Daisy Ave Port', 'FAC'), ('3', 

In [26]:
samp = spacy_docs[1]

In [28]:
displacy.render(spacy_docs[0], style='ent', page=False, minify=True, jupyter=True)

In [29]:
displacy.render(spacy_docs[1], style='ent', page=False, minify=True, jupyter=True)
# better coloring system? 

In [30]:
displacy.render(spacy_docs[2], style='ent', page=False, minify=True, jupyter=True)

The default NER has a hard time with the address info. Might need to use a blank NER.

### To do:
1. Create more comprehensive and relevant term lists - need better categorization
2. Use EntityRuler instead of EntityMatcher?
    - generate a list of patterns in json form
```py
{"label": "ORG", "pattern": "Apple"}
{"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
```

In [None]:
# Other lists:
tweet_keywords = ['ablaze', 'accident', 'aftershock', 'airplane accident', 'ambulance', 'annihilated', 'annihilation', 'apocalypse', 'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked', 'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze', 'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood', 'bloody', 'blown up', 'body bag', 'body bagging', 'body bags', 'bomb', 'bombed', 'bombing', 'bridge collapse', 'buildings burning', 'buildings on fire', 'burned', 'burning', 'burning buildings', 'bush fires', 'casualties', 'casualty', 'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall', 'collapse', 'collapsed', 'collide', 'collided', 'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge', 'deluged', 'demolish', 'demolished', 'demolition', 'derail', 'derailed', 'derailment', 'desolate', 'desolation', 'destroy', 'destroyed', 'destruction', 'detonate', 'detonation', 'devastated', 'devastation', 'disaster', 'displaced', 'drought', 'drown', 'drowned', 'drowning', 'dust storm', 'earthquake', 'electrocute', 'electrocuted', 'emergency', 'emergency plan', 'emergency services', 'engulfed', 'epicentre', 'evacuate', 'evacuated', 'evacuation', 'explode', 'exploded', 'explosion', 'eyewitness', 'famine', 'fatal', 'fatalities', 'fatality', 'fear', 'fire', 'fire truck', 'first responders', 'flames', 'flattened', 'flood', 'flooding', 'floods', 'forest fire', 'forest fires', 'hail', 'hailstorm', 'harm', 'hazard', 'hazardous', 'heat wave', 'hellfire', 'hijack', 'hijacker', 'hijacking', 'hostage', 'hostages', 'hurricane', 'injured', 'injuries', 'injury', 'inundated', 'inundation', 'landslide', 'lava', 'lightning', 'loud bang', 'mass murder', 'mass murderer', 'massacre', 'mayhem', 'meltdown', 'military', 'mudslide', 'natural disaster', 'nuclear disaster', 'nuclear reactor', 'obliterate', 'obliterated', 'obliteration', 'oil spill', 'outbreak', 'pandemonium', 'panic', 'panicking', 'police', 'quarantine', 'quarantined', 'radiation emergency', 'rainstorm', 'razed', 'refugees', 'rescue', 'rescued', 'rescuers', 'riot', 'rioting', 'rubble', 'ruin', 'sandstorm', 'screamed', 'screaming', 'screams', 'seismic', 'sinkhole', 'sinking', 'siren', 'sirens', 'smoke', 'snowstorm', 'storm', 'stretcher', 'structural failure', 'suicide bomb', 'suicide bomber', 'suicide bombing', 'sunk', 'survive', 'survived', 'survivors', 'terrorism', 'terrorist', 'threat', 'thunder', 'thunderstorm', 'tornado', 'tragedy', 'trapped', 'trauma', 'traumatised', 'trouble', 'tsunami', 'twister', 'typhoon', 'upheaval', 'violent storm', 'volcano', 'war zone', 'weapon', 'weapons', 'whirlwind', 'wild fires', 'wildfire', 'windstorm', 'wounded', 'wounds', 'wreck', 'wreckage', 'wrecked']    

# Generate fake messages 

In [31]:
import random
random.choice(HOUSE_AREA)

'front yard'