In [11]:
#!pip install newsapi-python
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import boto3
import json
import spacy
#from urllib.parse import quote

# Configure pandas so displayed frames show every column and untruncated text.
pd.set_option('display.max_columns', None)
# None means "no width limit"; the old -1 sentinel is deprecated and is
# rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [12]:
import os

# Import only the client class; a star import hides where names come from.
from newsapi import NewsApiClient

# Init
# The key is read from the NEWSAPI_KEY environment variable so it never has to
# be pasted into (and saved with) the notebook. Falls back to an empty key.
newsapi = NewsApiClient(api_key=os.environ.get('NEWSAPI_KEY', ''))

In [13]:
# /v2/everything
# Fetch the first page (up to 100 articles) of English-language matches.
# `from_param` is the oldest and `to` the newest publication date allowed, so
# the earlier date has to be passed as `from_param`.
all_articles = newsapi.get_everything(q='Australia Politics',
                                      from_param='2018-08-02',
                                      to='2018-08-20',
                                      language='en',
                                      sort_by='publishedAt',
                                      page=1,
                                      page_size=100)


In [14]:
#next(iter(all_articles))
#all_articles

In [15]:
# Inspect the response object: its Python type and its number of top-level entries.
type(all_articles), len(all_articles)

(dict, 3)

In [16]:
# List the top-level keys of the response.
all_articles.keys()

dict_keys(['status', 'totalResults', 'articles'])

In [17]:
# Total matches reported by the API versus the number of articles on this page.
all_articles['totalResults'], len(all_articles['articles'])

(798, 100)

In [18]:
#all_articles['articles']

In [19]:
# Tabulate the article records, then show the first record with its fields
# laid out as rows for easier reading.
all_df = pd.DataFrame(all_articles['articles'])
all_df.iloc[:1].T
# Alternative preview: all_df[['title','description']].head()

Unnamed: 0,0
author,
description,"It's on in Australian politics. Prime Minister Malcolm Turnbull will recontest his position, challenging his agitators to put up or shut up."
publishedAt,2018-08-20T23:18:46Z
source,"{'id': None, 'name': 'Stuff.co.nz'}"
title,Leadership spill declared in Australia
url,https://www.stuff.co.nz/world/australia/106436672/Leadership-spill-declared-in-Australia
urlToImage,https://resources.stuff.co.nz/content/dam/images/1/r/d/1/z/j/image.related.StuffLandscapeSixteenByNine.620x349.1rdb28.png/1534807321159.jpg


In [20]:
#all_df.publishedAt.iloc[[0,1,2]]
# this column needs to be in datetime format

### Script to fetch and store all pages of news articles in a dataframe all_df

In [21]:
# Search terms
PAGE_SIZE = 100  # number of articles requested per API call

# Time period of last 10 days; read the clock once so both bounds agree.
now = datetime.today()
today = now.strftime('%Y-%m-%d')
ldate = (now - timedelta(days=10)).strftime('%Y-%m-%d')
# NOTE(review): unclear whether complex query terms must be URL-encoded
# before being handed to the client - confirm.
query = 'Australia Politics'


def fetch_page(page):
    """Return the raw API response (a dict) for one page of the current search.

    Uses the module-level `query`, `ldate`, `today` and `PAGE_SIZE` values.
    """
    return newsapi.get_everything(q=query,
                                  from_param=ldate,
                                  to=today,
                                  language='en',
                                  sort_by='publishedAt',
                                  page=page,
                                  page_size=PAGE_SIZE)


# The first page also reports how many matches exist in total.
all_articles = fetch_page(1)
frames = [pd.DataFrame(all_articles['articles'])]

# Number of pages needed overall. Dividing by the fixed page size (rather than
# by the length of the first page) avoids a ZeroDivisionError when the search
# returns no articles at all.
pages = int(np.ceil(all_articles['totalResults'] / PAGE_SIZE))

for p in range(2, pages + 1):
    # `results` is kept as a name so later cells can inspect the last page.
    results = pd.DataFrame(fetch_page(p)['articles'])
    frames.append(results)

# Concatenate once at the end: DataFrame.append no longer exists in pandas 2.x
# and re-copying the frame on every iteration is quadratic.
all_df = pd.concat(frames, ignore_index=True)

In [22]:
# Check all results were fetched: rows collected, total reported by the API,
# and the number of pages that were requested.
len(all_df),all_articles['totalResults'], pages

(543, 543, 6)

In [23]:
# Show the last row of the combined frame next to the last row of the final
# page fetched by the loop (`results` only exists if more than one page was read).
all_df.iloc[-1],results.iloc[-1]

(author         Krystal Gordon                                                                                                                      
 description    Outback Queensland Mayor Geoff Morton apologises for sending nude photos of himself via the Diamantina Shire Council's email system.
 publishedAt    2018-08-31T00:25:55Z                                                                                                                
 source         {'id': 'abc-news-au', 'name': 'ABC News (AU)'}                                                                                      
 title          Mayor apologises for emailing nude photos on council email system                                                                   
 url            http://www.abc.net.au/news/2018-08-31/diamantina-mayor-geoff-morton-apologises-for-nude-photos-email/10186742                       
 urlToImage     http://www.abc.net.au/news/image/6750112-16x9-700x394.jpg                                 

In [24]:
# Expand the nested 'source' dicts into a frame of their own and copy the two
# fields across as flat columns.
source_df = pd.DataFrame(all_df['source'].tolist())
all_df['source_id'] = source_df['id']
all_df['source_name'] = source_df['name']

# The original dict column is now redundant; remove it to save memory.
del all_df['source']

# Parse the publication timestamps into datetime values.
all_df['publishedAt'] = pd.to_datetime(all_df['publishedAt'])

In [25]:
# Show the last row with its fields as rows, to confirm the new source_id /
# source_name columns and the parsed publishedAt value.
all_df.tail(1).T

Unnamed: 0,542
author,Krystal Gordon
description,Outback Queensland Mayor Geoff Morton apologises for sending nude photos of himself via the Diamantina Shire Council's email system.
publishedAt,2018-08-31 00:25:55
title,Mayor apologises for emailing nude photos on council email system
url,http://www.abc.net.au/news/2018-08-31/diamantina-mayor-geoff-morton-apologises-for-nude-photos-email/10186742
urlToImage,http://www.abc.net.au/news/image/6750112-16x9-700x394.jpg
source_id,abc-news-au
source_name,ABC News (AU)


In [26]:
# Potentially useful later: gap between each article's publication time and
# that of the row below it. The final row has no following row, so its gap is NaT.
all_df.publishedAt.diff(-1).tail(3) # time difference to next row e.g. diff(row0->row1) = 55seconds

540   00:12:11
541   00:22:32
542   NaT     
Name: publishedAt, dtype: timedelta64[ns]

### Get results from the AWS Comprehend API

In [45]:
# Preview the descriptions in rows 12-14 to pick sample text for entity extraction.
all_df['description'][12:15]

12    There are three women in the field - Mary-Lou Jarvis, a vice-president of the NSW Liberal party; Katherine O'Regan, a commercial board director, and Maxine Szramka, a rheumatologist.
13    Parents who drop off their kids before 8:30am at Burrowes State School, south of Brisbane, will soon have to pay a daily fee for before-school care — whether they like it or not.    
14    Andrew Bragg, widely considered the frontrunner in the Liberal preselection for the prize seat of Wentworth, bows out of the race, saying the party should pick a woman.              
Name: description, dtype: object

In [46]:
# Build the sample text by joining descriptions 12 and 14 directly
# (no separator is inserted between them).
descriptions = all_df['description']
str3 = descriptions[12] + descriptions[14]
print(str3)

There are three women in the field - Mary-Lou Jarvis, a vice-president of the NSW Liberal party; Katherine O'Regan, a commercial board director, and Maxine Szramka, a rheumatologist.Andrew Bragg, widely considered the frontrunner in the Liberal preselection for the prize seat of Wentworth, bows out of the race, saying the party should pick a woman.


In [47]:
# Create a Comprehend client in us-east-2.
comprehend = boto3.client(service_name='comprehend', region_name='us-east-2')

text = str3

# Ask the service for the entities in the sample text and pretty-print the
# response as key-sorted, indented JSON.
entities_response = comprehend.detect_entities(Text=text, LanguageCode='en')
print(json.dumps(entities_response, sort_keys=True, indent=4))

{
    "Entities": [
        {
            "BeginOffset": 10,
            "EndOffset": 21,
            "Score": 0.9020122289657593,
            "Text": "three women",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 37,
            "EndOffset": 41,
            "Score": 0.7247194647789001,
            "Text": "Mary",
            "Type": "PERSON"
        },
        {
            "BeginOffset": 42,
            "EndOffset": 52,
            "Score": 0.996584951877594,
            "Text": "Lou Jarvis",
            "Type": "PERSON"
        },
        {
            "BeginOffset": 78,
            "EndOffset": 95,
            "Score": 0.9161083698272705,
            "Text": "NSW Liberal party",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 97,
            "EndOffset": 114,
            "Score": 0.9973188638687134,
            "Text": "Katherine O'Regan",
            "Type": "PERSON"
        },
        {
            "BeginOffset": 1

In [48]:
# Sanity-check character offsets: this slice of str3 should be the name 'Andrew Bragg'.
str3[182:194]

'Andrew Bragg'

In [30]:
# i=1
# j=0
# while i <5:
#     str1 = ''.join(Description[j:i])
#     check=json.dumps(comprehend.detect_entities(Text=str1,LanguageCode='en'), sort_keys=True, indent=4)
#     check2=comprehend.detect_entities(Text=str1,LanguageCode='en')
#     #print(json.dumps(comprehend.detect_entities(Text=str1,LanguageCode='en'), sort_keys=True, indent=4))
#     #print(i)
#     #print(str1)
#     i+=1
#     j+=1

### Try spaCy

https://spacy.io/usage/spacy-101



In [49]:
# Need to install spacy and english model - large files and takes time to install

# pip install spacy
# python -m spacy download en

In [50]:
# Load the small English pipeline by its package name; the 'en' shortcut link
# is not available in spaCy v3 and later.
nlp = spacy.load('en_core_web_sm')
# Run the pipeline over the sample text and list the tokens it produced.
doc=nlp(str3)
print([t.text for t in doc])

['There', 'are', 'three', 'women', 'in', 'the', 'field', '-', 'Mary', '-', 'Lou', 'Jarvis', ',', 'a', 'vice', '-', 'president', 'of', 'the', 'NSW', 'Liberal', 'party', ';', 'Katherine', "O'Regan", ',', 'a', 'commercial', 'board', 'director', ',', 'and', 'Maxine', 'Szramka', ',', 'a', 'rheumatologist', '.', 'Andrew', 'Bragg', ',', 'widely', 'considered', 'the', 'frontrunner', 'in', 'the', 'Liberal', 'preselection', 'for', 'the', 'prize', 'seat', 'of', 'Wentworth', ',', 'bows', 'out', 'of', 'the', 'race', ',', 'saying', 'the', 'party', 'should', 'pick', 'a', 'woman', '.']


In [51]:
# Sentence segmentation: show the text of the first sentence found in `doc`.
list(doc.sents)[0].text

"There are three women in the field - Mary-Lou Jarvis, a vice-president of the NSW Liberal party; Katherine O'Regan, a commercial board director, and Maxine Szramka, a rheumatologist."

In [52]:
# Named entity recognition
# Load the small English pipeline by its package name; the 'en' shortcut link
# is not available in spaCy v3 and later.
nlp = spacy.load('en_core_web_sm')
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
doc=nlp(str3)

# One line per entity: its text, start/end character offsets, and label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

three 10 15 CARDINAL
Mary-Lou Jarvis 37 52 PERSON
NSW Liberal 78 89 ORG
Katherine O'Regan 97 114 PERSON
Maxine Szramka 149 163 PERSON
Andrew Bragg 182 194 PERSON
Liberal 237 244 GPE
Wentworth 280 289 GPE


In [53]:
# spaCy visualisations
from spacy import displacy
# Load the small English pipeline by its package name; the 'en' shortcut link
# is not available in spaCy v3 and later.
nlp = spacy.load('en_core_web_sm')
doc = nlp(str3)
# Render the sample text with its recognised entities highlighted inline.
displacy.render(doc, style='ent',jupyter=True)

In [54]:
# Collect the sentences of `doc` and draw the dependency parse of the first
# one inline (displacy.serve(sentence_spans, style='dep') is the
# standalone-server alternative).
sentence_spans = [sent for sent in doc.sents]
first_sentence = sentence_spans[0]
displacy.render(first_sentence, style='dep', jupyter=True,
                options={'distance': 50})

In [55]:
# Similarity
# Pairwise token similarity using the basic model ('en', aka en_core_web_sm)
# held in `nlp`.
tokens = nlp(u'dog cat banana')

# Every ordered pair, including each token paired with itself.
token_pairs = [(left, right) for left in tokens for right in tokens]
for left, right in token_pairs:
    print(left.text, right.text, left.similarity(right))

dog dog 1.0
dog cat 0.5390697
dog banana 0.28761008
cat dog 0.5390697
cat cat 1.0
cat banana 0.48752153
banana dog 0.28761008
banana cat 0.48752153
banana banana 1.0


In [56]:
# Same comparison with the larger model (115MB vs 35MB for 'en'), which takes
# a while to load. It is more accurate because it ships with pretrained word
# embedding vectors.
nlp = spacy.load('en_core_web_md')
tokens = nlp(u'dog cat banana')

# Every ordered pair, including each token paired with itself.
token_pairs = [(left, right) for left in tokens for right in tokens]
for left, right in token_pairs:
    print(left.text, right.text, left.similarity(right))


dog dog 1.0
dog cat 0.8016855
dog banana 0.24327648
cat dog 0.8016855
cat cat 1.0
cat banana 0.2815437
banana dog 0.24327648
banana cat 0.2815437
banana banana 1.0


In [57]:
# Process a short text ending in an emoji, then look at individual tokens,
# a span, the noun chunks and the sentence split.
emoji_text = (u"Peach emoji is where it has always been. Peach is the superior "
              u"emoji. It's outranking eggplant 🍑 ")
doc = nlp(emoji_text)

# Expected, in order: Peach / emoji / 🍑 / outranking eggplant
for piece in (doc[0], doc[1], doc[-1], doc[17:19]):
    print(piece.text)

noun_chunks = [chunk for chunk in doc.noun_chunks]
print(noun_chunks[0].text)  # Peach emoji

sentences = [sent for sent in doc.sents]
assert len(sentences) == 3
print(sentences[1].text)    # 'Peach is the superior emoji.'

Peach
emoji
🍑
outranking eggplant
Peach emoji
Peach is the superior emoji.


In [58]:
# Inspect linguistic attributes of individual tokens.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
# pos_ holds the coarse-grained part of speech and tag_ the fine-grained tag,
# so each label is paired with the attribute it actually describes.
print('Coarse-grained POS tag', apple.pos_, apple.pos)
print('Fine-grained POS tag', apple.tag_, apple.tag)
print('Word shape', apple.shape_, apple.shape)
print('Alphanumeric characters?', apple.is_alpha)
print('Punctuation mark?', apple.is_punct)

# Token 10 is the word "billion": not made of digits, but number-like.
billion = doc[10]
print('Digit?', billion.is_digit)
print('Like a number?', billion.like_num)
print('Like an email address?', billion.like_email)

Fine-grained POS tag PROPN 95
Coarse-grained POS tag NNP 15794550382381185553
Word shape Xxxxx 16072095006890171862
Alphanumeric characters? True
Punctuation mark? False
Digit? False
Like a number? True
Like an email address? False
