The aim is to use NLP to identify articles that are 'relevant' to the challenge, that is, articles that refer to:
    - houses / homes being destroyed, damaged, etc.
    - people / families being displaced, evacuated, etc.

This is an initial exploration of how to use the spaCy library for this task.

In [1]:
import pandas as pd
import spacy
# NOTE(review): TfidfVectorizer is not referenced by any cell visible in this notebook.
from sklearn.feature_extraction.text import TfidfVectorizer
# Do not truncate long text cells when DataFrames of sentences are displayed.
# NOTE(review): -1 is the legacy "no limit" value; newer pandas releases
# presumably expect None instead — confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
import re
from collections import Counter

In [3]:
# Load the English spaCy pipeline once; every later cell reuses this `nlp` object.
# NOTE(review): the 'en' shortcut name is presumably only available in older
# spaCy releases — confirm the pinned version before re-running.
nlp = spacy.load('en')

In [242]:
# Article texts downloaded from the Training Data (about 290 articles; the
# sample-size cell further down reports 291 rows).
# Missing values are replaced with empty strings right after loading.
df = pd.read_csv(
    'https://s3-us-west-1.amazonaws.com/simon.bedford/d4d/article_contents.csv'
).fillna('')

In [210]:
# Reporting terms, as listed in the challenge description.
reporting_terms = [
    'displaced',
    'evacuated',
    'forced to flee',
    'homeless',
    'in relief camp',
    'sheltered',
    'relocated',
    'destroyed housing',
    'partially destroyed housing',
    'uninhabitable housing',
]

In [211]:
# Specified reporting units from challenge description, grouped by unit type.
reporting_units = {
    'people': ['people', 'persons', 'individuals', 'children', 'inhabitants', 'residents', 'migrants'],
    'households': ['families', 'households', 'houses', 'homes']
}

# Pattern matching any reporting-unit word above. Later cells search noun-chunk
# text with `units_regex`, but no visible cell defines it, so it is built here
# to keep the notebook runnable from a fresh kernel.
# NOTE(review): reconstructed from `reporting_units`; confirm it matches the
# definition that was originally used to produce the saved outputs.
units_regex = re.compile(
    '|'.join(re.escape(unit) for units in reporting_units.values() for unit in units)
)

#### A quick example

In [214]:
# A single sentence used to try out the noun-chunk / head-word matching.
example_sentence = (
    "At least 60 homes were destroyed across three districts, "
    "said provincial spokesman Mohammad Yusufi."
)

In [215]:
# Parse the example and print every noun chunk that mentions homes/houses and
# whose syntactic head is one of the damage words, together with that head.
doc = nlp(u"{}".format(example_sentence))
for chunk in doc.noun_chunks:
    head_text = chunk.root.head.text
    mentions_home = re.search(r'homes|houses', chunk.text)
    if mentions_home and re.search(r'destroyed|damaged|washed away', head_text):
        print(chunk.text, head_text)

At least 60 homes destroyed


#### What are all the head words (mostly verbs) attached to noun chunks that mention homes or houses?

In [216]:
# Count the head word of every noun chunk mentioning homes/houses, across all
# (lower-cased) articles.
verbs = Counter(
    chunk.root.head.text
    for content in df['content']
    for chunk in nlp(u"{}".format(content.lower())).noun_chunks
    if re.search(r'homes|houses', chunk.text)
)

In [217]:
# Show the 20 most frequent head words with their counts.
verbs.most_common(20)

[('of', 70),
 ('destroyed', 64),
 ('damaged', 41),
 ('from', 34),
 ('were', 22),
 ('to', 21),
 ('leave', 13),
 ('flooded', 13),
 ('left', 8),
 ('inundated', 8),
 ('in', 7),
 ('lost', 7),
 ('fled', 7),
 ('are', 7),
 ('destroying', 7),
 ('evacuate', 7),
 ('collapsed', 7),
 ('submerged', 6),
 ('affected', 5),
 ('over', 5)]

#### Look across the article samples for similar sentences of events negatively impacting homes

In [218]:
# Head words that indicate a negative impact on homes, picked from the counts above.
# NOTE(review): 'washed away' is two words, while the pattern is applied to the
# text of a single head token — it presumably never matches; confirm.
house_impacts = re.compile('|'.join([
    'destroyed',
    'damaged',
    'flooded',
    'inundated',
    'lost',
    'collapsed',
    'submerged',
    'washed away',
]))

In [220]:
# Collect "<noun chunk>, <head word>" strings for every homes/houses chunk whose
# head word matches `house_impacts`, across all (lower-cased) articles.
example_phrases = [
    "{}, {}".format(chunk.text, chunk.root.head.text)
    for content in df['content']
    for chunk in nlp(u"{}".format(content.lower())).noun_chunks
    if re.search(r'homes|houses', chunk.text)
    and re.search(house_impacts, chunk.root.head.text)
]

In [221]:
# Wrap the collected phrases in a single-column DataFrame for display.
# Note: this rebinds `example_phrases` from a list to a DataFrame.
example_phrases = pd.DataFrame(example_phrases)
# Preview the first 10 phrases.
example_phrases.head(10)

Unnamed: 0,0
0,"at least 60 homes, destroyed"
1,"more than fifty homes, destroyed"
2,"over 1,500 homes, damaged"
3,"more than 120 houses, destroyed"
4,"as many as 1,770 homes, damaged"
5,"three wounded and 150 houses, destroyed"
6,"several homes, damaged"
7,"four homes, flooded"
8,"116 houses, destroyed"
9,"their homes, lost"


#### Other ways you could say the same thing

In [222]:
# Same kind of event, phrased in the active voice ("flood destroyed ... homes").
example_sentence = 'A large flood destroyed 100 homes yesterday'

In [223]:
# Re-run the homes/houses check on the new example, this time using the
# compiled `house_impacts` pattern for the head word.
doc = nlp(u"{}".format(example_sentence))
for chunk in doc.noun_chunks:
    head_text = chunk.root.head.text
    mentions_home = re.search(r'homes|houses', chunk.text)
    if mentions_home and re.search(house_impacts, head_text):
        print(chunk.text, head_text)

100 homes destroyed


#### So this is already picked up by the existing code

#### What is the broader list of actions that seem to apply to the reporting 'units'?

In [224]:
# Count the head word of every noun chunk that matches `units_regex`, across
# all (lower-cased) articles.
# NOTE(review): `units_regex` is not defined in any cell visible in this
# notebook — it must exist in the kernel before this cell runs.
verbs = Counter(
    chunk.root.head.text
    for content in df['content']
    for chunk in nlp(u"{}".format(content.lower())).noun_chunks
    if re.search(units_regex, chunk.text)
)

In [225]:
# Show the 30 most frequent head words with their counts.
verbs.most_common(30)

[('of', 255),
 ('to', 80),
 ('destroyed', 65),
 ('evacuated', 57),
 ('were', 56),
 ('affected', 48),
 ('displaced', 43),
 ('died', 42),
 ('damaged', 42),
 ('from', 41),
 ('killed', 39),
 ('for', 37),
 ('left', 32),
 ('are', 31),
 ('with', 28),
 ('forced', 27),
 ('on', 17),
 ('said', 17),
 ('had', 16),
 ('evacuate', 15),
 ('injured', 14),
 ('families', 14),
 ('flooded', 14),
 ('told', 14),
 ('leave', 13),
 ('leaving', 11),
 ('reported', 11),
 ('in', 10),
 ('lost', 9),
 ('fled', 9)]

In [226]:
# Broader set of head words: the home-impact words plus the displacement /
# evacuation words seen in the counts above.
# NOTE(review): 'washed away' is two words, while the pattern is applied to the
# text of a single head token — it presumably never matches; confirm.
actions_regex = re.compile("|".join([
    "destroyed",
    "damaged",
    "flooded",
    "inundated",
    "lost",
    "collapsed",
    "submerged",
    "washed away",
    "evacuated",
    "affected",
    "displaced",
    "evacuate",
    "fled",
]))

In [227]:
def parse_article(article):
    """Return the sentences of ``article`` that mention an affected reporting unit.

    The article is lower-cased and parsed once with the notebook-level spaCy
    pipeline ``nlp``. A sentence is kept when at least one of its noun chunks
    matches ``units_regex`` and the text of that chunk's syntactic head matches
    ``actions_regex``.

    Args:
        article: Article text as a string.

    Returns:
        A list of the matching sentences (lower-cased strings) in document
        order. Each sentence appears at most once, even when several of its
        noun chunks match.
    """
    doc = nlp(u"{}".format(article.lower()))
    relevant_sentences = []
    for sentence in doc.sents:
        # Use the noun chunks of the already-parsed sentence span instead of
        # running the whole pipeline a second time on the sentence text.
        for chunk in sentence.noun_chunks:
            if re.search(units_regex, chunk.text) and re.search(actions_regex, chunk.root.head.text):
                relevant_sentences.append(str(sentence))
                # One match is enough; continuing would add the same sentence
                # again for every further matching chunk.
                break
    return relevant_sentences

In [228]:
# Flatten the relevant sentences of every article into one list, in article order.
relevant_sentences = [
    sentence
    for content in df['content']
    for sentence in parse_article(content)
]

In [229]:
# Wrap the collected sentences in a single-column DataFrame for display.
# Note: this rebinds `relevant_sentences` from a list to a DataFrame.
relevant_sentences = pd.DataFrame(relevant_sentences)
# Preview the first 15 sentences.
relevant_sentences.head(15)

Unnamed: 0,0
0,"at least 60 homes were destroyed across three districts, said provincial spokesman mohammad yusufi."
1,more than fifty homes and shops were destroyed and thousands of acres of farmland flooded.
2,"bna report that at least 7 people were killed and over 1,500 homes damaged."
3,"quoting an official from the badakhshan provincial government, xinhua also said that the foods had damaged or destroyed more than 120 houses in the district."
4,"flash floods struck in the shuhada district of badakhshan on 07 july 2015, leaving at least 7 dead and around 300 families affected. since then the international organization for migration (iom) in afghanistan"
5,authorities have evacuated families from five buildings.
6,"later reports suggest that as many as 1,770 homes have been damaged."
7,"the report reads that the rains of march 28 and 29 left 2,160 people from sumbe and porto amboim in a situation of dire need, as one was killed, three wounded and 150 houses destroyed, laving 483 homeless."
8,"salta governor juan manuel urtubey rushed to the town, promising to help reconstruct damaged homes and buildings and confirming that 15 families have been evacuated to the town hall as a precaution against collapse."
9,"a profiling exercise led by nrc in 2005 found that 65,000 families were displaced during the 1988-1994 conflict between azerbaijan and armenia over nagorno-karabakh."


In [232]:
def relevant_article(article):
    """Tell whether ``article`` contains at least one relevant sentence.

    Args:
        article: Article text, passed straight to ``parse_article``.

    Returns:
        True if ``parse_article`` returns one or more sentences, otherwise False.
    """
    return len(parse_article(article)) > 0

In [233]:
# Number of articles (rows) in the sample.
total_sample = df.shape[0]
print("Total articles in sample is {}".format(total_sample))

Total articles in sample is 291


In [236]:
# Flag every article: adds a boolean 'is_relevant' column to `df` by running
# `relevant_article` on each article's content.
df['is_relevant'] = df['content'].apply(relevant_article)

In [238]:
# Share of flagged articles, as a percentage of the whole sample.
pc_relevant = df['is_relevant'].eq(1).sum() / total_sample * 100

In [239]:
# Report the percentage rounded to a whole number.
print("% of apparently relevant articles in sample is {:.0f}%".format(pc_relevant))

% of apparently relevant articles in sample is 61%


##### Obviously this is not using a complete list of terms, actions etc., and there are likely other types of phrases that need to be identified