In [1]:
import pandas as pd


In [2]:
import spacy
nlp = spacy.load('en')

In [3]:
# the csv file is from the training data
articleDF = pd.read_csv('datasets/article_contents.csv')

<h3>Inspecting what is inside the article_contents</h3>

In [4]:
articleDF.head(5)

Unnamed: 0,country,url,tag,title,meta_description,content
0,Afghanistan,http://www.independent.co.uk/news/world/asia/1...,Disasters,160 killed and hundreds left stranded by flood...,Flash flooding across Afghanistan and Pakistan...,Flash flooding across Afghanistan and Pakistan...
1,Afghanistan,http://floodlist.com/asia/afghanistan-flash-fl...,Disasters,Afghanistan – Flash Floods in Faryab and Baghl...,,"Afghanistan state news agency, Bakhtar News Ag..."
2,Afghanistan,http://floodlist.com/asia/afghanistan-6-dead-f...,Disasters,Afghanistan - 6 Dead as Flash Floods Hit Badak...,,Flash floods have struck once again in the Bad...
3,Afghanistan,http://reliefweb.int/report/afghanistan/afghan...,Disasters,Afghanistan Earthquake: Overview of Assessed N...,Afghanistan Earthquake: OCHA Situation Report ...,UN Office for the Coordination of Humanitarian...
4,Albania,http://www.euronews.com/2014/11/19/albania-flo...,Disasters,Albania floods kill at least 3 people | Euronews,Flooding in Albania has killed at least three ...,Flooding in Albania has killed at least three ...


In [10]:
print(articleDF['title'][0])

160 killed and hundreds left stranded by flooding across Afghanistan


In [11]:
print(articleDF['meta_description'][0])

Flash flooding across Afghanistan and Pakistan has left more than 160 dead and dozens stranded in one of South Asia's worst natural disasters this year, say officials.


In [6]:
articleDF['content'][0]

"Flash flooding across Afghanistan and Pakistan has left more than 160 dead and dozens stranded in one of South Asia's worst natural disasters this year, say officials.  The flooding, caused by unusually heavy rain, has left villagers stuck in remote areas without shelter, food or power.  Mountainous Afghanistan was the worst hit, with 61 people killed and approximately 500 traditional mud-brick homes washed away in more than a dozen villages in Sarobi, a rural district less than an hour from Kabul, officials said.  Floods left a village devastated in the remote eastern Afghan province of Nuristan. At least 60 homes were destroyed across three districts, said provincial spokesman Mohammad Yusufi. No one was killed.  Authorities have been unable to deliver aid to some badly affected villages by land as roads in the area are controlled by the Taliban, Yusufi added.  “We have asked the national government for help as have an overwhelming number of locals asking for assistance, but this is

In [7]:
articleDF['tag'].unique()

array(['Disasters', 'Conflict and violence'], dtype=object)

First attempt: running the entire content of 1 article through Spacy

In [8]:
parsed_text = nlp(articleDF['content'][0])
#after running through spacy, some stuff you can do e.g. POS-- 
"""for word in parsed_text:
    print(word.pos_, word)"""

In [41]:
#trying to get all the numbers in the article:
#not too bad but need some sort of thinking as to deciding which ones are releavnt!
ents = list(parsed_text.ents)
for entity in ents:
    sentence = entity.sent
    for idx, word in enumerate(sentence):
        if(word == entity.root and entity.label_ == 'CARDINAL'):
            print(entity,sentence[idx + 1: idx + 3])


more than 160 dead and
dozens stranded in
61 people killed
500 traditional mud
dozen villages in
At least 60 homes were
three districts,
At least 24 people were
two other eastern
More than fifty homes and
more than 80 lives,
18 million people,


In [99]:
'''
Ignore this for now. I stumbled across a tutuorial for a text processing pipeline using the below-- might come in 
handy later... Except I can't seem to find the link...!
'''
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def article_review(filename):
    """
    generator function to read in articles from the dataframe
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_article in nlp.pipe(article_review(filename), batch_size=10000, n_threads=4):
        
        for sent in parsed_article.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [91]:
# intuition is that the main info would be in the title-- for now just parse 
# this rather than trying to deal with the whole aritlce
def parse_title_for_units(title_text):
    parsed_text = nlp(title_text) #run title through spacy pipline
    ents = list(parsed_text.ents) #grab the named entities detected
    output = []
    for entity in ents:
         #the sentence in whcih the entity lives-- since for numbers, spacy doesn't include units e.g. 160 rather than 160 peopel
        sentence = entity.sent
        for idx, word in enumerate(sentence): 
            # for now, it seems that the only the ones with entity label == cardinal are relevant to the number of reporting units
            if(word == entity.root and entity.label_ == 'CARDINAL'):
                # the following to grab a few extra words round that entity to get the units
                if(len(sentence) > idx + 1):
                    output.append(entity.text_with_ws + sentence[idx + 1].text)
                else:
                    output.append(sentence[idx - 1].text_with_ws + entity.text)
    return output

    

In [92]:
#test the parser for the first title...
parse_title_for_units(articleDF['title'][0])

['160 killed', 'hundreds left']

In [93]:
#seems okay-- run it through the dataframe
articleDF['parse_title_for_units'] = articleDF['title'].map(parse_title_for_units)

In [94]:
articleDF.head(15)

Unnamed: 0,country,url,tag,title,meta_description,content,parse_title_for_units
0,Afghanistan,http://www.independent.co.uk/news/world/asia/1...,Disasters,160 killed and hundreds left stranded by flood...,Flash flooding across Afghanistan and Pakistan...,Flash flooding across Afghanistan and Pakistan...,"[160 killed, hundreds left]"
1,Afghanistan,http://floodlist.com/asia/afghanistan-flash-fl...,Disasters,Afghanistan – Flash Floods in Faryab and Baghl...,,"Afghanistan state news agency, Bakhtar News Ag...",[]
2,Afghanistan,http://floodlist.com/asia/afghanistan-6-dead-f...,Disasters,Afghanistan - 6 Dead as Flash Floods Hit Badak...,,Flash floods have struck once again in the Bad...,[]
3,Afghanistan,http://reliefweb.int/report/afghanistan/afghan...,Disasters,Afghanistan Earthquake: Overview of Assessed N...,Afghanistan Earthquake: OCHA Situation Report ...,UN Office for the Coordination of Humanitarian...,[]
4,Albania,http://www.euronews.com/2014/11/19/albania-flo...,Disasters,Albania floods kill at least 3 people | Euronews,Flooding in Albania has killed at least three ...,Flooding in Albania has killed at least three ...,[at least 3 people]
5,Algeria,http://floodlist.com/africa/torrential-rains-d...,Disasters,Torrential Rains Destroy 400 Homes in Algeria,,ALGIERS (AA) – Hundreds of homes have been des...,[400 Homes]
6,Angola,http://floodlist.com/africa/thousands-homes-da...,Disasters,Thousands of Homes Damaged by Floods in Luanda...,,Heavy rain on Monday 09 March 2015 flooded at ...,[Thousands of]
7,Angola,http://www.portalangop.co.ao/angola/en_us/noti...,Disasters,Cuanza Sul: Over thousand affected by rains,Cuanza Sul: Over thousand affected by rains - ...,The information is contained in a report on th...,[]
8,Argentina,http://www.argentinaindependent.com/tag/rio-pa...,Disasters,rio parana ‹ The Argentina Independent,,"When you boat your way somewhere, even if just...",[]
9,Argentina,http://www.buenosairesherald.com/article/20115...,Disasters,"Earthquake in Salta kills a 94-year-old woman,...","Earthquake in Salta kills a 94-year-old woman,...",The debris of collapsed houses is pictured in ...,[injures 30]


PROPN Afghanistan
PUNCT -
NUM 6
PROPN Dead
ADP as
PROPN Flash
PROPN Floods
PROPN Hit
PROPN Badakhshan
PROPN Province
ADV Once
ADV Again


In [95]:
articleDF.to_csv('datasets/refugess_training_text_parsing_test_1.csv')

In [96]:
articleDF[40:50]

Unnamed: 0,country,url,tag,title,meta_description,content,parse_title_for_units
40,China,http://news.xinhuanet.com/english/china/2014-0...,Disasters,China rainstorms kill at least 34,The Ministry of Civil Affairs on Thursday said...,"BEIJING, July 17 (Xinhua) -- The Ministry of C...",[least at least 34]
41,China,http://floodlist.com/asia/14-dead-337000-evacu...,Disasters,"14 dead, 337,000 Evacuated in Southern China F...",,"Further severe weather, floods and landslides ...","[14 dead, 337,000 Evacuated]"
42,China,http://news.xinhuanet.com/english/china/2014-0...,Disasters,Typhoon Matmo kills 13 in China,Typhoon Matmo has claimed 13 lives in China an...,"BEIJING, July 27 (Xinhua) -- Typhoon Matmo has...",[13 in]
43,China,http://floodlist.com/asia/25-dead-7-days-flood...,Disasters,25 Dead after 7 Days of Flooding in China,,The week-long flooding that has swept across s...,[25 Dead]
44,China,http://floodlist.com/asia/chongqing-floods-con...,Disasters,Chongqing Floods Continue – Over 40 Killed,,The flooding has continued in south western Ch...,[]
45,China,http://earthquake-report.com/2014/02/12/very-s...,Disasters,"Extremely dangerous earthquake in Xinjiang, Ch...",,The best independent earthquake reporting site...,"[60,000 severely, 85,000 people, 11000+ animals]"
46,China,http://news.xinhuanet.com/english/china/2014-1...,Disasters,Grief and rescue after SW China quake,A candlelight vigil was held early Wednesday m...,"JINGGU, Yunnan, Oct. 8 (Xinhua) -- A candlelig...",[]
47,China,http://news.xinhuanet.com/english/china/2014-0...,Disasters,Three dead in south China rainstorms,,"CHANGSHA, May 11 (Xinhua) -- Heavy rainstorms ...",[Three dead]
48,China,http://rt.com/news/china-earthquake-yongshan-y...,Disasters,"5.3 quake hits China’s southwest, at least 21 ...",A 5.3-magnitude earthquake in rural southwest ...,A 5.3-magnitude earthquake in rural southwest ...,[5.3 quake]
49,China,http://www.chinadaily.com.cn/china/2014-04/01/...,Disasters,"21 dead, 4 missing in S China rainfall - China...",Deadly rains over the last few days have also ...,BEIJING - Heavy rainfall in South China over t...,"[21 dead, 4 missing]"
