## Setup and Imports

In [1]:
import spacy
import feedparser

# Quick get feeds

In [87]:
%mkdir ../data/feeds

In [2]:
!python ../package/src/get_rss_feeds.py \
        --rss-url "http://feeds.bbci.co.uk/news/politics/rss.xml" \
        --json ../data/feeds/bbc3.json

In [3]:
!python ../package/src/get_rss_feeds.py \
        --rss-url "http://feeds2.feedburner.com/ft/westminster" \
        --json ../data/feeds/ft3.json

In [4]:
!python ../package/src/get_rss_feeds.py \
        --rss-url "https://www.theguardian.com/politics/rss" \
        --json ../data/feeds/guardian3.json

## Data Sources

In [86]:
bbc_url = "http://feeds.bbci.co.uk/news/politics/rss.xml"
ft_url = "http://feeds2.feedburner.com/ft/westminster"
gaurdian_url = "https://www.theguardian.com/politics/rss"

data_sources = [bbc_url, ft_url, gaurdian_url]

In [None]:
for source in data_sources:
    
    # Find latest feeds
    feed = fetch_latest_feed(source)
    
    save_feed_metadata(feed, db)
    
    fetch_articles
    
    

In [None]:
>>> from tinydb import TinyDB, Query
>>> db = TinyDB('path/to/db.json')
>>> User = Query()
>>> db.insert({'name': 'John', 'age': 22})
>>> db.search(User.name == 'John')
[{'name': 'John', 'age': 22}]

In [None]:
""" Script to fetch news articles from RSS feeds and store text and meta data as a JSON file.
@author: Chris Musselle
"""
# Standard libs
import os
import sys
import json

# 3rd Party libs
import requests
import bs4
import feedparser


def get_articles(feed_url, json_filename='articles.json'):
    """ Update a JSON file to hold article links, published data and text data """

    feed = feedparser.parse(feed_url)

    # Read in articles already downloaded if they exist
    if os.path.exists(json_filename):
        JSON_articles = json.load(open(json_filename, 'r'))
    else:
        JSON_articles = {}

    article_counter = 0

    for item in feed['items']:

        # Use title of the article as an id
        title = item['title']

        # Only process article if we have not done so already
        if title not in JSON_articles:

            # Store basic info from feed
            article_url = item['link']
            article_published_date = item['published']
            JSON_articles[title] = {'url': article_url,
                                    'published_date': article_published_date}

            # Get full web content for link
            r = requests.get(article_url)

            # Parse HTML using BeautifulSoup
            soup = bs4.BeautifulSoup(r.content, 'lxml')

            # Find all the p tags
            p_tags = soup.find_all(name='p')

            # Extract just the text from the p tags
            p_tags_text = [p.text for p in p_tags]

            # Join all p tag strings by a newline
            all_text = '\n'.join(p_tags_text)

            # Store and increment counter
            JSON_articles[title]['text'] = all_text
            article_counter += 1

    # Write updated file.
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(JSON_articles, json_file, indent=4)

    print('Added {} new articles'.format(article_counter))


if __name__ == '__main__':

    # Pass Arguments
    args = sys.argv[1:]
    feed_url = args[0]
    filepath = args[1]

    # Create the directory if it does not already exist
    dirname = os.path.dirname(filepath)
    if dirname:
        if not os.path.exists(dirname):
            os.makedirs(dirname)

    # Get the latest articles and append to the JSON file given
    print('Fetching articles for {}'.format(feed_url))
    get_articles(feed_url, filepath)
print('Saving to {}'.format(filepath))

In [3]:
bbc_feed = feedparser.parse(bbc_url)

In [4]:
ft_feed = feedparser.parse(ft_url)

In [5]:
gaurdian_feed = feedparser.parse(gaurdian_url)

In [6]:
bbc_feed.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [7]:
ft_feed.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [8]:
gaurdian_feed.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [14]:
bbc_feed['namespaces']

{'': 'http://www.w3.org/2005/Atom',
 'content': 'http://purl.org/rss/1.0/modules/content/',
 'dc': 'http://purl.org/dc/elements/1.1/',
 'media': 'http://search.yahoo.com/mrss/'}

In [13]:
ft_feed['namespaces']

{'': 'http://www.w3.org/2005/Atom',
 'content': 'http://purl.org/rss/1.0/modules/content/',
 'dc': 'http://purl.org/dc/elements/1.1/',
 'feedburner': 'http://rssnamespace.org/feedburner/ext/1.0',
 'slash': 'http://purl.org/rss/1.0/modules/slash/',
 'sy': 'http://purl.org/rss/1.0/modules/syndication/',
 'wfw': 'http://wellformedweb.org/CommentAPI/'}

In [12]:
ft_feed['feed']

{'feedburner_info': {'uri': 'ft/westminster'},
 'language': 'en',
 'link': 'http://blogs.ft.com/westminster',
 'links': [{'href': 'http://blogs.ft.com/westminster',
   'rel': 'alternate',
   'type': 'text/html'},
  {'href': 'http://feeds.feedburner.com/ft/westminster',
   'rel': 'self',
   'type': 'application/rss+xml'},
  {'href': 'http://pubsubhubbub.appspot.com/',
   'rel': 'hub',
   'type': 'text/html'}],
 'subtitle': 'Jim Pickard and Kiran Stacey share their views on the UK’s political scene for the Financial Times',
 'subtitle_detail': {'base': 'http://feeds2.feedburner.com/ft/westminster',
  'language': None,
  'type': 'text/html',
  'value': 'Jim Pickard and Kiran Stacey share their views on the UK’s political scene for the Financial Times'},
 'sy_updatefrequency': '1',
 'sy_updateperiod': 'hourly',
 'title': 'Westminster blog',
 'title_detail': {'base': 'http://feeds2.feedburner.com/ft/westminster',
  'language': None,
  'type': 'text/plain',
  'value': 'Westminster blog'},
 '

## Useful components

In [None]:
last_updated_parsed = bbc_feed['feed']['updated_parsed']
last_updated = bbc_feed['feed']['updated']

In [None]:
status = bbc_feed['status']

In [15]:
g_url = "https://www.theguardian.com/politics/2017/may/06/andy-burnham-denies-jeremy-corbyn-snub-manchester-rally"

In [17]:
from bs4 import BeautifulSoup

import requests

content = requests.get(g_url)
content.ok

soup = BeautifulSoup(content.content, "html5lib")

print(soup.get_text())

x = soup.select('div.content__main')[0]
x.find(name='h1')

x.get_text()

ps = x.find_all(name='p')

len(ps)

p = ps[0]

paras = [p.get_text() for p in ps]

In [46]:
paras

['New mayor for Greater Manchester blames prior commitments for no-show at victory rally with party leader',
 '\nChris Johnston',
 '\n\nSaturday 6 May 2017 15.39\xa0BST\n\n\nFirst published on Saturday 6 May 2017 14.44\xa0BST\n\n',
 'The new Labour mayor of Greater Manchester, Andy Burnham, has denied snubbing Jeremy Corbyn after failing to join his party leader at a rally in the city following his election victory.',
 'Burnham said his absence from the event was “not in the slightest” intended as a snub, and that he had told Corbyn’s office earlier in the week that he had prior engagements.',
 'Burnham told the BBC: “I had made it clear … that I wouldn’t be able to be at the rally at 7 o’clock, because I had a lot of commitments, including family commitments. Jeremy came, fair enough, because people wanted to enjoy the moment.”',
 'Ian Lavery, Labour’s national campaigns coordinator, also dismissed speculation of a rift between Corbyn and Burnham.',
 'Speaking in Leicester on Saturday

In [47]:
paras

['New mayor for Greater Manchester blames prior commitments for no-show at victory rally with party leader',
 '\nChris Johnston',
 '\n\nSaturday 6 May 2017 15.39\xa0BST\n\n\nFirst published on Saturday 6 May 2017 14.44\xa0BST\n\n',
 'The new Labour mayor of Greater Manchester, Andy Burnham, has denied snubbing Jeremy Corbyn after failing to join his party leader at a rally in the city following his election victory.',
 'Burnham said his absence from the event was “not in the slightest” intended as a snub, and that he had told Corbyn’s office earlier in the week that he had prior engagements.',
 'Burnham told the BBC: “I had made it clear … that I wouldn’t be able to be at the rally at 7 o’clock, because I had a lot of commitments, including family commitments. Jeremy came, fair enough, because people wanted to enjoy the moment.”',
 'Ian Lavery, Labour’s national campaigns coordinator, also dismissed speculation of a rift between Corbyn and Burnham.',
 'Speaking in Leicester on Saturday

In [51]:
import re

def split_sentences(text, pattern):
    
    parts = re.split(pattern=pattern, string=text)
    return parts

In [56]:
paras[5]

'Burnham told the BBC: “I had made it clear … that I wouldn’t be able to be at the rally at 7 o’clock, because I had a lot of commitments, including family commitments. Jeremy came, fair enough, because people wanted to enjoy the moment.”'

In [67]:
re.split(r'([a-z]{2})\.  ?([A-Z][a-z ])', paras[5] + 'done.  And then ..', )

['Burnham told the BBC: “I had made it clear … that I wouldn’t be able to be at the rally at 7 o’clock, because I had a lot of commitments, including family commitmen',
 'ts',
 'Je',
 'remy came, fair enough, because people wanted to enjoy the moment.”do',
 'ne',
 'An',
 'd then ..']

In [84]:
>>> from nltk import tokenize
>>> p = "Good morning Dr. Adams. The patient is waiting for you in room number 3."
>>> tokenize.sent_tokenize(p)
['Good morning Dr. Adams.', 'The patient is waiting for you in room number 3.']

LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/Users/cmusselle/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************

In [70]:
import spacy

nlp = spacy.load('en')

doc = nlp(paras[5])

doc

list(doc.sents)