Exploration of what a possible pipeline could look like for processing a URL from start to finish.

Once finalized this can be implemented as a series of methods in `pipeline.py`.

Some questions to discuss:
    
    - Should we attempt end-to-end processing in one shot, or split it into steps?
    - Should we implement these as class methods, then initialize Pipeline() and carry out the steps one at a time?

This also highlights some pending issues to deal with in the Scraper and/or Interpreter code, namely:

1. Turn extracted dates into Date Windows
2. Turn extracted quantities into integers for saving to DB
3. Add step for classifying article

#### Set up the db

In [1]:
%load_ext autoreload
%autoreload 2
# Change these to True to set up the DB the first time
# Forwarded to init_db() as the keyword argument of the same name.
# NOTE(review): the name suggests init_db() drops existing data -- confirm in
# internal_displacement.model.model before running against a populated DB.
i_know_this_will_delete_everything = True
# When True, init_db() is run against the 'id_test' database (user 'tester').
initialize_id_test = True
# When True, init_db() is run against the 'id' database (user 'jupyter').
initialize_id = False

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from internal_displacement.model.model import init_db

# Host name of the PostgreSQL server, read from the DB_HOST environment
# variable (None when the variable is not set).
db_host = os.environ.get('DB_HOST')

# Both connection URLs are built up front so that `db_url` can always be bound,
# independently of which initialisation flags are switched on.
db_url_template = 'postgresql://{user}:{password}@{db_host}/{db}'
id_db_url = db_url_template.format(
    user='jupyter', password='jupyter', db_host=db_host, db='id')
id_test_db_url = db_url_template.format(
    user='tester', password='tester', db_host=db_host, db='id_test')

# `db_url` is what the session set-up further down connects to. It used to be
# assigned only inside the `if` branches, so running the notebook with both
# flags False left it undefined and `create_engine(db_url)` raised NameError.
# The selection keeps the earlier outcome for every flag combination that
# worked (the test DB wins unless only `initialize_id` is set) and falls back
# to the test DB when neither flag is set.
if initialize_id and not initialize_id_test:
    db_url = id_db_url
else:
    db_url = id_test_db_url

if initialize_id:
    init_db(id_db_url, i_know_this_will_delete_everything=i_know_this_will_delete_everything)

if initialize_id_test:
    init_db(id_test_db_url, i_know_this_will_delete_everything=i_know_this_will_delete_everything)

In [2]:
import spacy
import json
from sqlalchemy import create_engine
from datetime import datetime
from internal_displacement.model.model import Status, Session, Category, Article, Content, Country, CountryTerm, \
    Location, Report, ReportDateSpan, ArticleCategory, Base
from internal_displacement.scraper import Scraper
from internal_displacement.interpreter import Interpreter

#### Set up the session

In [3]:
# Create an engine for the database that `db_url` points at.
engine = create_engine(db_url)
# Bind the project's Session to that engine, then open the session that the
# rest of the notebook uses for all queries and commits.
Session.configure(bind=engine)
session = Session()

#### Receive the url

In [4]:
# Example article URL that is pushed through every pipeline step below.
url = 'http://www.independent.co.uk/news/world/asia/160-killed-and-hundreds-left-stranded-by-flooding-across-afghanistan-and-pakistan-8746566.html'

#### Create a new article based upon the url

In [5]:
# Store a new Article that carries only the URL and the NEW status; the
# remaining attributes are filled in after scraping. `article.id` is read by
# later cells, so the row is committed straight away.
article = Article(url=url, status=Status.NEW)
session.add(article)
session.commit()

#### Initialize the Scraper, and attempt to download the url and update article attributes

In [6]:
scraper = Scraper()
scrape_result = scraper.scrape(url)
content, publish_date, title, content_type, authors, domain = scrape_result

# Only a successful download updates the article row. When the scraper reports
# 'retrieval_failed' nothing is done yet: setting a "processing failed" status
# is not implemented.
if content != 'retrieval_failed':
    matching_articles = session.query(Article).filter(Article.id == article.id)
    fetched_fields = {
        "domain": domain,
        "status": Status.FETCHED,
        "title": title,
        "publication_date": publish_date,
        "authors": ", ".join(authors),
    }
    matching_articles.update(fetched_fields)
    session.commit()

#### Add the article content to DB

In [7]:
# The ORM row gets its own name instead of re-binding `content`: `content`
# holds the scraped text, and overwriting it with the Content object would make
# a re-run of this cell store an ORM instance as the article text.
article_content = Content(article_id=article.id, retrieval_date=datetime.now(),
                          content=content, content_type=content_type)
session.add(article_content)
session.commit()

#### Set up and initialize the Interpreter for carrying out report extraction etc.

In [8]:
# Load the spacy engine
# NOTE(review): 'en' is a shortcut model name. spaCy v3 removed shortcut links
# and expects the full package name (e.g. 'en_core_web_sm') -- confirm which
# spaCy version this project pins before upgrading.
nlp = spacy.load('en')

In [9]:
# Define the terms we want to use for article extraction

# Event terms for reports about people. Each term appears once: the list used
# to contain 'stranded' twice.
person_reporting_terms = [
    'displaced', 'evacuated', 'forced', 'flee', 'homeless', 'relief camp',
    'sheltered', 'relocated', 'stranded', 'stuck', 'killed', 'dead', 'died', 'drown',
]

# Event terms for reports about structures.
structure_reporting_terms = [
    'destroyed', 'damaged', 'swept', 'collapsed',
    'flooded', 'washed', 'inundated', 'evacuate',
]

# Units (subjects) counted in reports about people.
person_reporting_units = [
    "families", "person", "people", "individuals", "locals", "villagers",
    "residents", "occupants", "citizens", "households", "life",
]

# Units (subjects) counted in reports about structures.
structure_reporting_units = [
    "home", "house", "hut", "dwelling", "building", "shop", "business",
    "apartment", "flat", "residence",
]

# Terms that mark an article as relevant. They are joined into one
# space-separated string, run through the spaCy pipeline, and the lemma of
# every resulting token is kept.
relevant_article_terms = [
    'Rainstorm',
    'hurricane',
    'tornado',
    'rain',
    'storm',
    'earthquake',
]
relevant_terms_doc = nlp(" ".join(relevant_article_terms))
relevant_article_lemmas = [token.lemma_ for token in relevant_terms_doc]

In [10]:
# Initialize the interpreter with the spaCy pipeline, the term/unit lists and
# the relevance lemmas defined in the cells above.

# Path handed to the Interpreter as its last positional argument.
data_path = '../data'
interpreter = Interpreter(nlp, person_reporting_terms, structure_reporting_terms, person_reporting_units,
                          structure_reporting_units, relevant_article_lemmas, data_path)

#### Set article status to processing

In [12]:
# Mark the article as being processed. No commit is issued here; the change is
# written by the next session.commit().
article.status = Status.PROCESSING

#### Check and update language attribute

In [13]:
# Run the interpreter's language check on the stored article text, save the
# result on the article and commit.
article.language = interpreter.check_language(article.content.content)
session.commit()

#### Try to extract reports

In [14]:
# Extract reports from the stored article text. The result is iterated in the
# next step; each item exposes event_term, subject_term, tag_spans and
# locations.
reports = interpreter.process_article_new(article.content.content)

#### Process each report in turn

In [18]:
# Persist every extracted report together with the locations it mentions.
for rep in reports:
    # TODO: quantities are still extracted as strings, so a placeholder of 0 is
    # stored until they can be converted to integers.
    report = Report(article_id=article.id, event_term=rep.event_term,
                    subject_term=rep.subject_term, quantity=0,
                    tag_locations=json.dumps(rep.tag_spans),
                    analysis_date=datetime.now())
    session.add(report)
    session.commit()

    # Process each report location in turn
    for loc in rep.locations:
        country_code = interpreter.country_code(loc)
        if not country_code:
            # Locations without a country code are skipped.
            continue

        # Reuse the Country row for this code when one exists; otherwise
        # create it. The explicit None check avoids depending on the
        # truthiness of an ORM instance.
        country = session.query(Country).filter_by(code=country_code).one_or_none()
        if country is None:
            country = Country(code=country_code)
        session.add(country)

        location = Location(description=loc, country=country)
        session.add(location)
        # Link the location to the report before committing, so the
        # association is saved here instead of staying pending until some
        # later commit. One commit covers country, location and link.
        report.locations.append(location)
        session.commit()

#### Update the article relevance status

In [19]:
# The article is flagged as relevant exactly when it has at least one report.
status = bool(article.reports)
article.relevance = status
session.commit()

#### Set article status to processed

In [20]:
# Final pipeline step: mark the article as processed and commit.
article.status = Status.PROCESSED
session.commit()

#### PENDING

- Insert and add date spans to reports
- Call pre-trained ML model to classify article and save category / categories