In [1]:
%load_ext autoreload
%autoreload 2
# Database set-up switches, read by the initialisation code later in this cell.
# Set a switch to True to set up the corresponding database the first time:
#   initialize_id      -> the 'id' database
#   initialize_id_test -> the 'id_test' database
initialize_id = False
initialize_id_test = True
# Forwarded unchanged to init_db() as the keyword argument of the same name.
# NOTE(review): the name suggests init_db wipes existing data when this is
# True -- confirm in internal_displacement.model.model before enabling it
# together with initialize_id.
i_know_this_will_delete_everything = True

import os
import sys
# Put the parent of the current working directory on sys.path (once) so the
# `internal_displacement` imports in this notebook can be resolved from there.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from internal_displacement.model.model import init_db

def _build_db_url(db, password, host, user='jupyter'):
    """Return the PostgreSQL connection URL for database ``db`` on ``host``.

    Args:
        db: Name of the database (last path component of the URL).
        password: Password for ``user``.
        host: Database host, as read from the DB_HOST environment variable.
        user: Database user name.

    Returns:
        A URL of the form ``postgresql://user:password@host/db``.
    """
    return 'postgresql://{user}:{password}@{db_host}/{db}'.format(
        user=user, password=password, db_host=host, db=db)


db_host = os.environ.get('DB_HOST')
if not db_host:
    # Fail here with a clear message; otherwise the URL would silently be
    # built as 'postgresql://...@None/...' and only fail on first connection.
    raise RuntimeError(
        'The DB_HOST environment variable must be set to the database host.')

# NOTE(review): the credentials are hard-coded and the two databases use
# different passwords ('jupyter' vs 'tester'); kept exactly as they were --
# confirm both are intended and consider moving them to the environment.
id_db_url = _build_db_url('id', 'jupyter', db_host)
id_test_db_url = _build_db_url('id_test', 'tester', db_host)

if initialize_id:
    init_db(id_db_url,
            i_know_this_will_delete_everything=i_know_this_will_delete_everything)

if initialize_id_test:
    init_db(id_test_db_url,
            i_know_this_will_delete_everything=i_know_this_will_delete_everything)

# `db_url` is used by a later cell to create the SQLAlchemy engine, so it must
# exist even when no database is being initialised. The choice matches the
# previous behaviour whenever a flag is set ('id' only if it alone is
# initialised, otherwise 'id_test') and falls back to 'id_test' when neither
# flag is set, where `db_url` used to be undefined.
if initialize_id and not initialize_id_test:
    db_url = id_db_url
else:
    db_url = id_test_db_url

In [2]:
import spacy
import json
from sqlalchemy import create_engine
from sqlalchemy import exc
from datetime import datetime
from internal_displacement.model.model import Status, Session, Category, Article, Content, Country, CountryTerm, \
    Location, Report, ReportDateSpan, ArticleCategory, Base
from internal_displacement.scraper import Scraper
from internal_displacement.interpreter import Interpreter
from internal_displacement.pipeline import Pipeline
from internal_displacement.add_countries import load_countries, delete_countries
import pandas as pd

In [3]:
# Create the SQLAlchemy engine for the database URL built in the set-up cell.
engine = create_engine(db_url)
# Bind the project's Session to that engine, then instantiate the session
# object that the following cells pass to load_countries, Pipeline and queries.
# NOTE(review): Session is presumably a sqlalchemy sessionmaker -- confirm in
# internal_displacement.model.model.
Session.configure(bind=engine)
session = Session()

In [4]:
# Run the project's country loader against the open session.
# NOTE(review): presumably inserts the Country / CountryTerm reference rows
# queried at the end of this notebook -- confirm in
# internal_displacement.add_countries.
load_countries(session)

In [5]:
# Scraper instance that is later handed to the Pipeline.
scraper = Scraper()
# spaCy language pipeline; used below to lemmatise the relevance terms and
# passed to the Interpreter.
# NOTE(review): the 'en' shortcut name is not available in spaCy v3+, where a
# full package name (e.g. 'en_core_web_sm') is required -- confirm the pinned
# spaCy version.
nlp = spacy.load('en')
# Vocabulary lists passed positionally to the Interpreter constructor in a
# later cell. Each list is a plain list of strings.

# 'stranded' was listed twice in the original list; the duplicate is removed.
person_reporting_terms = [
    'displaced', 'evacuated', 'forced', 'flee', 'homeless', 'relief camp',
    'sheltered', 'relocated', 'stranded', 'stuck',
    'killed', 'dead', 'died', 'drown',
]

structure_reporting_terms = [
    'destroyed', 'damaged', 'swept', 'collapsed',
    'flooded', 'washed', 'inundated', 'evacuate',
]

person_reporting_units = [
    'families', 'person', 'people', 'individuals', 'locals', 'villagers',
    'residents', 'occupants', 'citizens', 'households', 'life',
]

structure_reporting_units = [
    'home', 'house', 'hut', 'dwelling', 'building', 'shop', 'business',
    'apartment', 'flat', 'residence',
]

# Directory handed to the Interpreter as its data path.
data_path = '../data'

# Keywords used for article relevance; the Interpreter receives their lemmas,
# not the raw strings.
relevant_article_terms = [
    'Rainstorm', 'hurricane', 'tornado', 'rain', 'storm', 'earthquake',
]
# All keywords are joined with spaces and lemmatised in a single spaCy call;
# one lemma is collected per resulting token.
relevant_article_lemmas = [
    token.lemma_ for token in nlp(" ".join(relevant_article_terms))
]

In [6]:
# Build the Interpreter from the spaCy pipeline, the four term/unit lists, the
# relevance lemmas and the data directory (all positional, in this order),
# plus the paths of the default pickled classifier model and encoder.
interpreter = Interpreter(
    nlp,
    person_reporting_terms,
    structure_reporting_terms,
    person_reporting_units,
    structure_reporting_units,
    relevant_article_lemmas,
    data_path,
    model_path='../internal_displacement/classifiers/default_model.pkl',
    encoder_path='../internal_displacement/classifiers/default_encoder.pkl',
)

In [7]:
# Wire the database session, the scraper and the interpreter into the Pipeline
# whose process_url() is called for each URL further down.
pipeline = Pipeline(session, scraper, interpreter)

In [8]:
# Read the training dataset CSV into a DataFrame; its 'URL' column is
# extracted in the next cell.
# NOTE(review): `test_urls` is a DataFrame here and is rebound to a list of
# URLs by the next cell -- a separate name would make that cell re-runnable.
test_urls = pd.read_csv('../data_extract/idmc_uniteideas_training_dataset.csv')

In [9]:
# Replace the loaded DataFrame with the plain list of its 'URL' values.
# The guard makes this cell idempotent: once `test_urls` is already a list,
# re-running the cell no longer fails on `list['URL']`.
if isinstance(test_urls, pd.DataFrame):
    test_urls = test_urls['URL'].tolist()

In [22]:
# Number of URLs from the head of `test_urls` to run through the pipeline.
n_urls_to_process = 20

for url in test_urls[:n_urls_to_process]:
    try:
        pipeline.process_url(url)
    except exc.IntegrityError:
        # A failed flush/commit leaves the session unusable until it is rolled
        # back; roll back so the remaining URLs can still be processed.
        session.rollback()
        # Report the skipped URL instead of swallowing the error silently.
        print('Skipped (database integrity error): {}'.format(url))

In [23]:
# Count the Article rows currently stored in the database.
session.query(Article).count()

20

In [24]:
# Fetch the article with id 19 for inspection in the cells that follow.
# NOTE(review): the id is hard-coded and .first() returns None when no row
# matches, in which case the attribute accesses below would fail.
article = session.query(Article).filter_by(id=19).first()

In [25]:
# Show the `url` field of the selected article.
article.url

'http://floodlist.com/africa/torrential-rains-destroy-400-homes-in-algeria'

In [26]:
# Show the `status` field of the selected article.
article.status

'processed'

In [27]:
# Show the `domain` field of the selected article (the captured value
# includes the URL scheme).
article.domain

'http://floodlist.com'

In [28]:
# Show the `publication_date` field of the selected article.
article.publication_date

datetime.datetime(2015, 3, 25, 13, 7, 44)

In [29]:
# Show the `language` field of the selected article.
article.language

'en'

In [30]:
# Show the `relevance` field of the selected article.
article.relevance

True

In [34]:
# Show the `category` value of the first entry in the article's categories.
# NOTE(review): indexing with [0] raises IndexError if the article has no
# categories.
article.categories[0].category

'other'

In [38]:
# Show event term, subject term and quantity of the article's first report.
first_report = article.reports[0]
(first_report.event_term, first_report.subject_term, first_report.quantity)

('destroy', 'residence', 400)

In [44]:
# Show code and description of the first location of the article's first report.
first_location = article.reports[0].locations[0]
(first_location.code, first_location.description)

('DZA', 'Tamanrasset')

In [45]:
# Print every term attached to the country of the first location of the
# article's first report, one per line.
location_country = article.reports[0].locations[0].country
for term in location_country.terms:
    print(term.term)

Algeria
People's Democratic Republic of Algeria
