In [1]:
%load_ext autoreload
%autoreload 2
# One-time database setup switches. Change these to True to set up the DB the first time.
# Safety acknowledgement forwarded to init_db(), which raises RuntimeError unless it is True.
i_know_this_will_delete_everything = True
# When True, the `if initialize_id_test:` branch below connects and runs init_db().
initialize_id_test = True
# When True, the `if initialize_id:` branch below connects and runs init_db().
initialize_id = False

import os
import sys
from sqlalchemy import create_engine
from sqlalchemy import exc
from sqlalchemy import func
from sqlalchemy import Table, text
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from internal_displacement.model.model import Status, Session, Category, Article, Content, Country, CountryTerm, \
    Location, Report, ReportDateSpan, ArticleCategory, Base

def init_db(db_url, i_know_this_will_delete_everything=False):
    """
    (Re)initialise the database at ``db_url`` by executing ``schema.sql``.

    Warning! This will delete everything in the database!

    The schema is executed on a connection created from ``db_url`` itself, so the
    function does not depend on any notebook-global session being defined first.

    :param db_url: SQLAlchemy database URL of the database to initialise
    :param i_know_this_will_delete_everything: explicit acknowledgement; must be True
    :raises RuntimeError: if ``i_know_this_will_delete_everything`` is not truthy
    """
    if not i_know_this_will_delete_everything:
        raise RuntimeError("Tried to init_db without knowing it would delete everything!")
    # Path is relative to the notebook's working directory.
    sql_path = '../internal_displacement/model/schema.sql'
    with open(sql_path, 'r') as schema:
        schema_sql = schema.read()
    engine = create_engine(db_url)
    try:
        # engine.begin() commits on success and rolls back if the script fails.
        with engine.begin() as connection:
            connection.execute(text(schema_sql))
    finally:
        # Release the pooled connections of this short-lived engine.
        engine.dispose()
    
# Host of the PostgreSQL server. Falls back to 'localdb', the host that was previously
# hard-coded in the connection URL, when DB_HOST is not set.
db_host = os.environ.get('DB_HOST') or 'localdb'
# Template with real placeholders, so the keyword arguments passed to .format() take effect.
db_url_template = 'postgresql://{user}:{password}@{db_host}/{db}'

# NOTE(review): credentials are hard-coded; consider reading them from the environment.
# NOTE: if both flags are False no `session` is created, and later cells that use it will fail.

if initialize_id:
    # Main database.
    db_url = db_url_template.format(
        user='jupyter', password='jupyter', db_host=db_host, db='id')
    engine = create_engine(db_url)
    Session.configure(bind=engine)
    session = Session()
    init_db(db_url, i_know_this_will_delete_everything=i_know_this_will_delete_everything)

if initialize_id_test:
    # Test database; same user/password/database the previous literal URL connected with.
    db_url = db_url_template.format(
        user='tester', password='tester', db_host=db_host, db='id_test')
    engine = create_engine(db_url)
    Session.configure(bind=engine)
    session = Session()
    init_db(db_url, i_know_this_will_delete_everything=i_know_this_will_delete_everything)

In [6]:
import spacy
import json
from datetime import datetime
from internal_displacement.scraper import Scraper
from internal_displacement.interpreter import Interpreter
from internal_displacement.pipeline import Pipeline
from internal_displacement.add_countries import load_countries, delete_countries
import pandas as pd

In [7]:
# df = pd.read_csv('../data/article_contents.csv', encoding='utf8')
# Pre-load the list of countries into the database, using the `session`
# opened in the database set-up cell.
load_countries(session)

In [8]:
# Scraper used further down to fetch and parse article pages.
scraper = Scraper()
# spaCy English pipeline. NOTE: the parser data must be installed
# (`python -m spacy.en.download all`); without it sentence boundary detection
# raises a ValueError when an article is processed.
nlp = spacy.load('en')

# Vocabulary handed to the Interpreter below.
# Each term appears once ('stranded' used to be listed twice).
person_reporting_terms = [
    'displaced', 'evacuated', 'forced', 'flee', 'homeless', 'relief camp',
    'sheltered', 'relocated', 'stranded', 'stuck', "killed", "dead", "died", "drown"
]

structure_reporting_terms = [
    'destroyed', 'damaged', 'swept', 'collapsed',
    'flooded', 'washed', 'inundated', 'evacuate'
]

person_reporting_units = ["families", "person", "people", "individuals", "locals", "villagers", "residents",
                          "occupants", "citizens", "households", "life"]

structure_reporting_units = ["home", "house", "hut", "dwelling", "building", "shop", "business", "apartment",
                             "flat", "residence"]

relevant_article_terms = ['Rainstorm', 'hurricane',
                          'tornado', 'rain', 'storm', 'earthquake']
# Lemmatise the terms with spaCy by parsing them as one space-joined string.
relevant_article_lemmas = [t.lemma_ for t in nlp(
    " ".join(relevant_article_terms))]

# Data directory, relative to the notebook's working directory.
data_path = '../data'

In [10]:
# Build the Interpreter from the spaCy pipeline, the vocabulary lists defined above,
# the data directory and the default classifier / encoder pickles.
interpreter = Interpreter(
    nlp,
    person_reporting_terms,
    structure_reporting_terms,
    person_reporting_units,
    structure_reporting_units,
    relevant_article_lemmas,
    data_path,
    model_path='../internal_displacement/classifiers/default_model.pkl',
    encoder_path='../internal_displacement/classifiers/default_encoder.pkl',
)

In [11]:
# Combine the database session, the scraper and the interpreter into one Pipeline object.
pipeline = Pipeline(session, scraper, interpreter)

In [12]:
# Read the training dataset and keep only its 'URL' column, as a plain Python list.
test_urls = (
    pd.read_csv('../data/idmc_uniteideas_training_dataset.csv')['URL']
    .tolist()
)

In [13]:
# Wrap one URL from the training list (index 17) in an Article with status NEW.
article = Article(url=test_urls[17], status=Status.NEW)
# Scrape that URL; the result is unpacked into six fields.
content, publish_date, title, content_type, authors, domain = scraper.scrape(article.url)
# Show the scraped text as the cell output.
content

'ALGIERS (AA) – Hundreds of homes have been destroyed in Algeria‘s southern city of Tamanrasset following several days of torrential rainfall, a local humanitarian aid official said Wednesday.  The city was pounded by rainfall from March 19 to March 24, according to Ghanom Sudani, a member of a government-appointed humanitarian aid committee.  He added that heavy rains had destroyed as many as 400 residences.  “Hundreds of families have had to leave their homes after they were inundated with water,” Sudani told The Anadolu Agency.  www.aa.com.tr/en  Last month neighbouring Tunisia experienced heavy rainfall and flooding in Jendouba City.'

In [15]:
# Run the interpreter on the scraped article text.
# NOTE: this needs spaCy's English dependency-parse data. Without it spaCy raises
# "ValueError: sentence boundary detection requires the dependency parse";
# install the data with `python -m spacy.en.download all`.
reports = interpreter.process_article_new(content)

ValueError: sentence boundary detection requires the dependency parse, which requires data to be installed. If you haven't done so, run: 
python -m spacy.en.download all
to install the data