# NPI News Classifier

This prototype app collects 100 news articles from the last 24 hours using the [GDELT](https://www.gdeltproject.org/) API. Machine learning is used to score and sort the articles according to whether the story is about NPI changes or not. Relevance scores range from 0 to 100. I manually labelled news headlines to create a training dataset for the ML algorithm, which can be viewed [here](https://docs.google.com/spreadsheets/d/1pjC0M53ES8BP9jH52ngjQPmtkkd2onIsoALC6AQaL3U/edit?usp=sharing) by colleagues with access to Google Sheets.

Please be patient as it is running on slow free infrastructure.

![alt text](https://www.countryside-jobs.com/perch/resources/hero/newspapers-w1600h500.jpg "Newspapers")

You can 

- change the source country of the news articles, e.g. by entering `spain` or `unitedkingdom` instead of `world`
- try a different GDELT query string [[docs](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/)]



In [None]:
# !pip install -Uqq fastbook
# import fastbook
# fastbook.setup_book()

from ipywidgets import interact

import requests
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import HTML
import urllib.request


from fastai.text.all import *


In [None]:
# Dropbox link to the exported classifier file used by this notebook.
MODEL_URL = "https://www.dropbox.com/s/wg7gaa9gz8lmfrm/classifier_2020-11-11_1807.pkl?dl=1"
# Download the model to a relative path (the current working directory).
# urlretrieve returns a (filename, headers) tuple; binding it to `_` keeps
# that tuple out of the cell output.
_ = urllib.request.urlretrieve(MODEL_URL, "classifier_2020-11-11_1807.pkl")

In [None]:
# Load the previously downloaded learner; `classifier` is the module-level
# model that output_prediction_table uses to score headlines.
# NOTE(review): this is a .pkl file fetched over the network. If load_learner
# unpickles it (as the extension suggests), loading can execute arbitrary
# code, so this is only safe while MODEL_URL points at a trusted file.
classifier = load_learner("classifier_2020-11-11_1807.pkl")

In [None]:
def get_articles(query: str, country: str, timeout: float = 30.0):
  """Request the latest matching English-language articles from GDELT.

  Asks the GDELT DOC 2.0 API for an RSS article list of at most 100 records
  published within the last day.

  Args:
    query: GDELT query string (keywords and operators).
    country: Value for GDELT's `sourcecountry:` filter (e.g. 'spain'), or
      'world' to apply no country filter.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The `requests.Response` returned by the API; its body is the RSS feed.

  Raises:
    requests.exceptions.Timeout: If the server does not answer within
      `timeout` seconds.
  """
  url = "https://api.gdeltproject.org/api/v2/doc/doc"
  query_modifier = '' if country == 'world' else f'sourcecountry:{country}'
  payload = {
      'query': f'{query} {query_modifier} sourcelang:english',
      'mode': "ArtList",
      'format': 'RSS',
      'maxrecords': 100,
      'timespan': '1d',
  }
  # requests has no default timeout; without one a stalled connection would
  # block the interactive widget indefinitely.
  return requests.get(url, params=payload, timeout=timeout)

def parse_articles(r: requests.Response):
  """Extract headline/URL pairs from an RSS response.

  Args:
    r: Response whose text is an RSS document containing `<item>` elements.

  Returns:
    A DataFrame with 'headline' and 'url' columns, one row per `<item>` that
    has both a `<title>` and a `<link>`. The DataFrame is empty (no rows, no
    columns) when no such item exists.
  """
  soup = BeautifulSoup(r.text, 'xml')
  # Skip incomplete items: a missing child tag is None, and reading `.text`
  # from it would raise AttributeError.
  data = [
      {'headline': item.title.text, 'url': item.link.text}
      for item in soup.find_all('item')
      if item.title is not None and item.link is not None
  ]
  return pd.DataFrame(data)

def output_prediction_table(query: str, country: str):
  """Fetch recent articles, score their headlines and render a ranked table.

  Args:
    query: Query string forwarded to `get_articles`.
    country: Source country forwarded to `get_articles` ('world' for all).

  Returns:
    An `IPython.display.HTML` table with a 0-100 integer 'relevance' column
    and a 'headline' column of links, sorted by descending relevance, or
    None when no articles were found.
  """
  import html  # local import: only needed for the escaping below

  r = get_articles(query, country)
  df = parse_articles(r)
  if df.empty:
    return None

  # Collapse duplicate headlines, keeping the first URL seen for each one.
  df = df.groupby('headline', as_index=False).first()
  headlines = df.headline.to_list()
  with classifier.no_bar() as clf:
    # Element [2] of predict()'s result is indexed per class; [1] is taken as
    # the score for the "relevant" class.
    # NOTE(review): presumably class 1 is "about NPI changes" - confirm
    # against the classifier's label order.
    predictions = [clf.predict(headline)[2][1] for headline in headlines]
  # NOTE(review): a batched alternative (classifier.dls.test_dl + get_preds)
  # was previously sketched here in comments; per-headline predict() is kept.
  df['relevance'] = [float(p) * 100 for p in predictions]
  df['relevance'] = df['relevance'].round().astype(int)
  df = df.sort_values('relevance', ascending=False)

  # Headlines and URLs come from external feeds. Escape them here and tell
  # to_html not to escape again, instead of un-escaping the whole rendered
  # table, which would turn any markup inside a headline into live HTML.
  df['headline'] = [
      '<a href="{}">{}</a>'.format(
          html.escape(url, quote=True), html.escape(headline, quote=False)
      )
      for url, headline in zip(df.url, df.headline)
  ]
  html_string = df[['relevance', 'headline']].to_html(index=False, escape=False)
  return HTML(html_string)

In [None]:
# Build the interactive UI: `query` and `country` are given string defaults,
# and interact calls output_prediction_table with the current values and
# displays what it returns.
interact(output_prediction_table, 
         query = "(covid OR coronavirus OR virus OR pandemic) (rules OR restrictions OR shutdown OR measures OR lockdown)",
         country="world")