In [1]:
## Import Packages and Libraries ##

# Web parcing, scraping, etc.
import bs4 as bs # BeautifulSoup4 
import urllib3
import re
import requests # HTTP parser
import html5lib

# DataFrames and math
import pandas as pd
import numpy as np

# Output related packages 
import pprint as pp
import json

# Progress bar and delaying requests 
from tqdm import tnrange, tqdm_notebook #progress bars
from random import randint
import datetime
import time

In [2]:
# stretch Jupyter coding blocks to fit screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>")) 

# make it run on py2 and py3
from __future__ import division, print_function

# Data Mining I
This  notebook is intended to perform the following processes:

    1.1 Read-in news articles from newsAPI for a given date range, and up to five queries (passed as a list).

    1.2 Extract features native to the articles (e.g. url).

    1.3 Perform data cleanup and preprocessing.

    1.4 Split dataset into n-csv-files for distrubuted computation or batching.

___
### **Begin Data Mining I:** Read-in NewsAPI feed for a given date range

In [3]:
### NEWSAPI RELATED ###
# keys: 
mkey = '8ba091b7a47b4c9a9162a83ca72eb1ca'
ekey = '2bc85776a0c14af6b9937366ad683e2f'

# Install API 
#!pip install newsapi-python

# Import Client
from newsapi import NewsApiClient

# Initialize Client (create object)
news_api = NewsApiClient(api_key = mkey)
print(type(news_api))

<class 'newsapi.newsapi_client.NewsApiClient'>


__1.1 Read-in news articles from newsAPI for a given date range__

#### Function: **get_news**
Function establishes values to be used for control of loop then calls functions used to extract news article data

In [4]:
def get_news(query, start, stop, sort, lang, article_count, page_count):
    import math
    # extract information about response file to ensure proper loop control
    params = get_params(query, start, stop, sort, lang, article_count, page_count)

    # variable referencing
    status = params['status']
    results = params['totalResults']
    
    # Confirmation of data extraction
    print("\nVerify Read-in Process:", status)
    print("Number of Articles Correctly Extracted: ", results)
    print(type(params), params.keys())
           
    # per page article extraction stop variable -- if number of articles is greater than number articles per page
    loops = math.ceil(results/article_count)
    
    if page_count == 'all' or article_count <  results:
        print("\n\nExtracting News Data...\n")
        news_df = pd.DataFrame()
        # function is called withinin loop, is subject to number of pages available as a function of total no. articles
        for page in range(loops):
            page = page + 1
            df = news_data(query, start, stop, sort, lang, article_count, page_count)
            news_df = news_df.append(df)  #'.append' does not happen in place, so variable assignment stores dataframes    
        print("Process Completed.")
        return(news_df)            
    else:
        print("Invalid Parameters: Check values") 

#### Function: **get_params**
Function runs an initial newsAPI call, used to store values for controlling loops

In [5]:
# vanilla function for reading all articles, subject to parameters. 
# Can be used alone, or to extract values for iteration control.
def get_params(query, start, stop, sort, lang, article_count, page_count):
    print("\nExtracting Parameters for newsAPI...\n")
    params = news_api.get_everything(q = query,
                                     from_parameter= start,
                                     to= stop,
                                     sort_by= sort,
                                     language= lang,
                                     page_size= int(article_count)
                                    )
    
    # Confirmation of data extraction
    print("Read-in Status of Given Date Range:", params['status'])
    print("Number of Articles in Given Date Range: ", params['totalResults'])
    
    return(params)

#### Function: **news_data**
Function handles cases, and extracts values within 'articles'. Returns dataframe of contents: 


*Index(['author', 'description', 'publishedAt', 'source', 'title', 'url','urlToImage'],dtype='object')*


In [6]:
# Function can handle various relationships between no.pages and no.articles
def news_data(query, start, stop, sort, lang, article_count, page_count):
    if page_count == 'all':
        params = news_api.get_everything(q = query,
                                         from_parameter= start,
                                         to= stop,
                                         sort_by= sort,
                                         language= lang,
                                         page_size= int(article_count)
                                         )
    else: 
        params = news_api.get_everything(q = query,
                                         from_parameter= start,
                                         to= stop,
                                         sort_by= sort,
                                         language= lang,
                                         page_size= int(article_count),
                                         page = int(page_count)
                                         )
    values = pd.DataFrame(params['articles'])
    return(values)

#### User provided parameters and function call.

In [26]:
#01/26/18 to 03/26/18
query = 'Bitcoin'         # can handle a list of up to five search topics
start = '2018-01-26'      # yyyy-mm-dd
stop = '2018-03-26'
sort = 'publishedAt'
lang = 'en'
article_count = int(100)  # default is 20
page_count = 'all'        # enter 1, 2, ... Notes: 'all' iterates over all articLes

In [27]:
# object is the result of the following functions: 'get_params', 'get_news', and 'get_data'
news = get_news(query, start, stop, sort, lang, article_count, page_count)


Extracting Parameters for newsAPI...

Read-in Status of Given Date Range: ok
Number of Articles in Given Date Range:  24773

Verify Read-in Process: ok
Number of Articles Correctly Extracted:  24773
<class 'dict'> dict_keys(['status', 'totalResults', 'articles'])


Extracting News Data...

Process Completed.


#### Explore nested key/value pairs from newsAPI data

In [28]:
print(len(news))
print(news.keys())
news.head(5)

24800
Index(['author', 'description', 'publishedAt', 'source', 'title', 'url',
       'urlToImage'],
      dtype='object')


Unnamed: 0,author,description,publishedAt,source,title,url,urlToImage
0,"Cecchetti, Schoenholtz","Despite recent technological advances, the cos...",2018-03-27T00:00:00Z,"{'id': None, 'name': 'Voxeu.org'}",The stubbornly high cost of remittances,https://voxeu.org/article/stubbornly-high-cost...,https://voxeu.org/sites/default/files/image/Fr...
1,Reuters,Twitter Inc will start banning cryptocurrency ...,2018-03-26T23:43:50Z,"{'id': None, 'name': 'Cio.com.au'}",Twitter to ban cryptocurrency ads,https://www.cio.com.au/article/635378/twitter-...,https://d2r9nfiii89r0l.cloudfront.net/article/...
2,,See more 'Bitcoin' images on Know Your Meme!,2018-03-26T23:36:52Z,"{'id': None, 'name': 'Knowyourmeme.com'}",Bitcoin | c71.jpg,http://knowyourmeme.com/photos/1355307-bitcoin,http://i0.kym-cdn.com/photos/images/facebook/0...
3,James Mickleboro,The bitcoin (BTC) price has bounced back from ...,2018-03-26T23:35:38Z,"{'id': None, 'name': 'Fool.com.au'}",Why the bitcoin (BTC) price was smashed overnight,https://www.fool.com.au/2018/03/27/why-the-bit...,https://www.fool.com.au/wp-content/uploads/201...
4,newsfeeds@nzherald.co.nz,NEW YORK (AP) — Twitter says it will ban or re...,2018-03-26T23:25:06Z,"{'id': None, 'name': 'Nzherald.co.nz'}","Twitter to ban cryptocurrency ads, joining Fac...",http://www.nzherald.co.nz/business/news/articl...,/pb/resources/assets/img/fallback-promo-image....


__1.2 Extract features native to the articles__

#### Function: **get_info**
Function extracts variables from dataframe and stores each as a list, returning all of them as a single dataframe.

__Note:__ *urlToImage* is not included in this process, as we are uncertain as to the value of the feature

In [29]:
def get_info(df):
    # for deepcopy()
    import copy
    
    author = []
    title = []
    publisher = []
    publish_url = []
    timeStamp = []
    description = []
    
    # loop appends rows to respective lists 
    for col_name in df:
        for index in df[col_name]:
            if col_name == 'author':
                author.append(index)
            elif col_name == 'title':
                title.append(index)
            elif col_name == 'source':
                name = index['name']
                publisher.append(name)
            elif col_name == 'url':
                publish_url.append(index)
            elif col_name == 'publishedAt':
                timeStamp.append(index)
            elif col_name == 'description':
                description.append(index)
            else:
                continue
    
    # merge lists and return them as dataframe.
    df = pd.DataFrame({'author' : author,
                       'title' : title,
                       'publisher' : publisher,
                       'source_url' : publish_url,
                       'timeStamp' : timeStamp,
                       'description' : description})
    
    return(df)
        

#### Completed newsAPI Read-in Process: 
##### newsDF contains features extracted from raw newsAPI feed, for a given data range, and query.

In [30]:
# Object creation
newsDF = get_info(news)

In [31]:
# Verifying correct data extraction
print("\nDataFrame Dimensions:", newsDF.shape, "\n")
newsDF.head(3)


DataFrame Dimensions: (24800, 6) 



Unnamed: 0,author,description,publisher,source_url,timeStamp,title
0,"Cecchetti, Schoenholtz","Despite recent technological advances, the cos...",Voxeu.org,https://voxeu.org/article/stubbornly-high-cost...,2018-03-27T00:00:00Z,The stubbornly high cost of remittances
1,Reuters,Twitter Inc will start banning cryptocurrency ...,Cio.com.au,https://www.cio.com.au/article/635378/twitter-...,2018-03-26T23:43:50Z,Twitter to ban cryptocurrency ads
2,,See more 'Bitcoin' images on Know Your Meme!,Knowyourmeme.com,http://knowyourmeme.com/photos/1355307-bitcoin,2018-03-26T23:36:52Z,Bitcoin | c71.jpg


__1.3 Perform data cleanup and preprocessing.__



The following functions perform basic clean up on a dataframe. The purpose is to prepare the file to write-out (csv).  
 

In [32]:
# Replace 'None' values
def findNone(df):
    """
     Receives pandas datraframe, and removes null entries from author feature
    """
    print("Removing 'None' values in author feature...")
    author = df['author']
    publisher = df['publisher']
    
    for i in range(len(df)):
        if pd.isnull(author.loc[i]):
            author.loc[i] = publisher.loc[i]
    return(df)

In [33]:
# Remove gaps 
def gapStrip(df):
    """
    Receives pandas dataframe and leading and traling empty space`
    """
    df.columns = map(str.strip, df.columns) 
    print("Removing leading and trailing spaces and tabs...")
    # element-wise operation
    f = lambda x: x.strip() if (isinstance(x,str)) else x
    df = df.applymap(f)
    return(df)

In [34]:
# Standardize time stamps
def std_timeStamp(df):
    """
    Receives pandas dataframe and standardizes time stamps 
    """
    import datetime
    # Check to see time stamps are in zero timezones
    print("Converting Time Stamps to Desired Standard Formating...")
    for time in df['timeStamp']:
        if time.endswith('Z'):
            df['timeStamp'] = pd.to_datetime(df['timeStamp'],
                                             infer_datetime_format = True,
                                             utc = True)                       # returns a type '.Timestamp'
            return(df)
        else:
            print("Revisit appropriate variable or function to deal with time zones that are not zero")

In [35]:
def feature_clean(df):
    """
    Performs Generic Cleanup and Preprocessing on a given dataframe sourced from newsAPI
    """
    temp = findNone(df)           # removes missing values from author column
    temp2 = gapStrip(temp)        # remove leading and trailing white space
    temp3 = std_timeStamp(temp2)  # convert time stamps to 'utc' standard
    return(temp3)

In [36]:
riskEx_df = feature_clean(newsDF)

Removing 'None' values in author feature...
Removing leading and trailing spaces and tabs...
Converting Time Stamps to Desired Standard Formating...


In [37]:
# checking
riskEx_df.tail(5)

Unnamed: 0,author,description,publisher,source_url,timeStamp,title
24795,Anna Hensel,"Twitter, following in the footsteps of Google ...",Venturebeat.com,https://venturebeat.com/2018/03/26/twitter-to-...,2018-03-26 18:36:42+00:00,"Twitter to start banning cryptocurrency ads, j..."
24796,Camila Russo,Cryptocurrency exchanges and wallet services a...,Livemint.com,https://www.livemint.com/Industry/XpaNirgkBbmC...,2018-03-26 18:30:47+00:00,"Twitter joins Facebook, Google in banning cryp..."
24797,Bram de Haas,Bitcoin has crashed pretty badly since the end...,Seekingalpha.com,https://seekingalpha.com/article/4158925-buy-b...,2018-03-26 18:21:07+00:00,"Why Buy Bitcoin At $8,169.80?"
24798,Financial Times,Social media sites are under pressure to prote...,Financial Times,https://www.ft.com/content/bddd293a-3118-11e8-...,2018-03-26 18:18:25+00:00,Twitter cracks down on cryptocurrency ads
24799,Michelle Meyers,"As was expected, the company joins Facebook an...",Cnet.com,https://www.cnet.com/news/twitter-confirms-its...,2018-03-26 18:17:15+00:00,Twitter confirms it's banning cryptocurrency a...


In [38]:
## Check file size
riskEx_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24800 entries, 0 to 24799
Data columns (total 6 columns):
author         24800 non-null object
description    24800 non-null object
publisher      24800 non-null object
source_url     24800 non-null object
timeStamp      24800 non-null datetime64[ns, UTC]
title          24800 non-null object
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 1.1+ MB


__1.4 Write out to csv.__

In [None]:
# write out n-csv-files each with 100 rows. Process is done to reduce computational load
riskEx_df.to_csv('rawData.csv', index_label = False)

__Note:__ if wanting to create batches of raw data files, use the following

In [1]:
#def df_to_csvs(df):
#    articlesPage = int(100)
#    totalArticles = len(df)
#    batchSize=round(totalArticles/articlesPage)          # number of rows in single output file
        
#    for id, df_i in  enumerate(np.array_split(df, batchSize)):
#        df_i.to_csv('rawData_{id}.csv'.format(id=id), index_label = False)                 

### **End Data Mining I:** Read-in NewsAPI feed for a given date range
___