In [1]:
## Import Packages and Libraries ##

# Web parcing, scraping, etc.
import bs4 as bs # BeautifulSoup4 
import urllib3
import re
import requests # HTTP parser
import html5lib

# DataFrames and math
import pandas as pd
import numpy as np

# Output related packages 
import pprint as pp
import json

# Progress bar and delaying requests 
from tqdm import tnrange, tqdm_notebook #progress bars
from random import randint
import datetime
import time

In [2]:
# stretch Jupyter coding blocks to fit screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>")) 

# make it run on py2 and py3
from __future__ import division, print_function

# Data Mining I
This  notebook is intended to perform the following processes:

    1.1 Read-in news articles from newsAPI for a given date range, and up to five queries (passed as a list).

    1.2 Extract features native to the articles (e.g. url).

    1.3 Perform data cleanup and preprocessing.

    1.4 Split dataset into n-csv-files for distrubuted computation or batching.

___
### **Begin Data Mining I:** Read-in NewsAPI feed for a given date range

In [3]:
### NEWSAPI RELATED ###
# keys: 
mkey = '8ba091b7a47b4c9a9162a83ca72eb1ca'
ekey = '2bc85776a0c14af6b9937366ad683e2f'

# Install API 
#!pip install newsapi-python

# Import Client
from newsapi import NewsApiClient

# Initialize Client (create object)
news_api = NewsApiClient(api_key = mkey)
print(type(news_api))

<class 'newsapi.newsapi_client.NewsApiClient'>


__1.1 Read-in news articles from newsAPI for a given date range__

#### Function: **get_news**
Function establishes values to be used for control of loop then calls functions used to extract news article data

In [4]:
def get_news(query, start, stop, sort, lang, article_count, page_count):
    import math
    # extract information about response file to ensure proper loop control
    params = get_params(query, start, stop, sort, lang, article_count, page_count)

    # variable referencing
    status = params['status']
    results = params['totalResults']
    
    # Confirmation of data extraction
    print("\nVerify Read-in Process:", status)
    print("Number of Articles Correctly Extracted: ", results)
    print(type(params), params.keys())
           
    # per page article extraction stop variable -- if number of articles is greater than number articles per page
    loops = math.ceil(results/article_count)
    
    if page_count == 'all' or article_count <  results:
        print("\n\nExtracting News Data...\n")
        news_df = pd.DataFrame()
        # function is called withinin loop, is subject to number of pages available as a function of total no. articles
        for page in range(loops):
            df = news_data(query, start, stop, sort, lang, article_count, page_count)
            news_df = news_df.append(df)  #'.append' does not happen in place, so variable assignement stores dataframes    
        print("Process Completed.")
        return(news_df)            
    else:
        print("Invalid Parameters: Check values") 

#### Function: **get_params**
Function runs an initial newsAPI call, used to store values for controlling loops

In [5]:
# vanilla function for reading all articles, subject to parameters. 
# Can be used alone, or to extract values for iteration control.
def get_params(query, start, stop, sort, lang, article_count, page_count):
    print("\nExtracting Parameters for newsAPI...\n")
    params = news_api.get_everything(q = query,
                                     from_parameter= start,
                                     to= stop,
                                     sort_by= sort,
                                     language= lang,
                                     page_size= int(article_count)
                                    )
    
    # Confirmation of data extraction
    print("Read-in Status of Given Date Range:", params['status'])
    print("Number of Articles in Given Date Range: ", params['totalResults'])
    
    return(params)

#### Function: **news_data**
Function handles cases, and extracts values within 'articles'. Returns dataframe of contents: 


*Index(['author', 'description', 'publishedAt', 'source', 'title', 'url','urlToImage'],dtype='object')*


In [6]:
# Function can handle various relationships between no.pages and no.articles
def news_data(query, start, stop, sort, lang, article_count, page_count):
    if page_count == 'all':
        params = news_api.get_everything(q = query,
                                         from_parameter= start,
                                         to= stop,
                                         sort_by= sort,
                                         language= lang,
                                         page_size= int(article_count)
                                         )
    else: 
        params = news_api.get_everything(q = query,
                                         from_parameter= start,
                                         to= stop,
                                         sort_by= sort,
                                         language= lang,
                                         page_size= int(article_count),
                                         page = int(page_count)
                                         )
    values = pd.DataFrame(params['articles'])
    return(values)

#### User provided parameters and function call.

In [7]:
query = 'Bitcoin'         # can handle a list of up to five search topics
start = '2017-10-08'      # yyyy-mm-dd
stop = '2018-01-08'
sort = 'publishedAt'
lang = 'en'
article_count = int(100)  # default is 20
page_count = 'all'        # enter 1, 2, ... Notes: 'all' iterates over all articLes

In [8]:
# object is the result of the following functions: 'get_params', 'get_news', and 'get_data'
news = get_news(query, start, stop, sort, lang, article_count, page_count)


Extracting Parameters for newsAPI...

Read-in Status of Given Date Range: ok
Number of Articles in Given Date Range:  2876

Verify Read-in Process: ok
Number of Articles Correctly Extracted:  2876
<class 'dict'> dict_keys(['status', 'totalResults', 'articles'])


Extracting News Data...

Process Completed.


#### Explore nested key/value pairs from newsAPI data

In [9]:
print(len(news))
print(news.keys())
news.head(5)

2900
Index(['author', 'description', 'publishedAt', 'source', 'title', 'url',
       'urlToImage'],
      dtype='object')


Unnamed: 0,author,description,publishedAt,source,title,url,urlToImage
0,Phillip Molnar,A British real estate company Monday launched ...,2018-01-09T00:00:00Z,"{'id': None, 'name': 'Sandiegouniontribune.com'}","Purplebricks, flat fee real estate listers, la...",http://www.sandiegouniontribune.com/business/r...,http://www.trbimg.com/img-5a54093a/turbine/sd-...
1,,``In parts of the continent - especially comme...,2018-01-09T00:00:00Z,"{'id': None, 'name': 'Ml-implode.com'}",Why African millennials can't get enough of Bi...,http://ml-implode.com/staticnews/2018-01-09_Wh...,
2,"Associated Press, By Associated Press","NEW YORK — Kodak, which traces its roots to th...",2018-01-09T00:00:00Z,"{'id': None, 'name': 'Bostonherald.com'}",Kodak surges at it becomes latest 'cryptocurre...,http://www.bostonherald.com/news/national/2018...,
3,,``JPMorgan Chase CEO Jamie Dimon has walked ba...,2018-01-09T00:00:00Z,"{'id': None, 'name': 'Ml-implode.com'}","Dimon: ""I regret calling bitcoin a fraud""",http://ml-implode.com/staticnews/2018-01-09_Di...,
4,,"``Staff at the regulatory agency ""expressed co...",2018-01-09T00:00:00Z,"{'id': None, 'name': 'Ml-implode.com'}",Fund managers say bitcoin ETF proposals withdr...,http://ml-implode.com/staticnews/2018-01-09_Fu...,


__1.2 Extract features native to the articles__

#### Function: **get_info**
Function extracts variables from dataframe and stores each as a list, returning all of them as a single dataframe.

__Note:__ *urlToImage* is not included in this process, as we are uncertain as to the value of the feature

In [10]:
def get_info(df):
    # for deepcopy()
    import copy
    
    author = []
    title = []
    publisher = []
    publish_url = []
    timeStamp = []
    description = []
    
    # loop appends rows to respective lists 
    for col_name in df:
        for index in df[col_name]:
            if col_name == 'author':
                author.append(index)
            elif col_name == 'title':
                title.append(index)
            elif col_name == 'source':
                name = index['name']
                publisher.append(name)
            elif col_name == 'url':
                publish_url.append(index)
            elif col_name == 'publishedAt':
                timeStamp.append(index)
            elif col_name == 'description':
                description.append(index)
            else:
                continue
    
    # merge lists and return them as dataframe.
    df = pd.DataFrame({'author' : author,
                       'title' : title,
                       'publisher' : publisher,
                       'source_url' : publish_url,
                       'timeStamp' : timeStamp,
                       'description' : description})
    
    return(df)
        

#### Completed newsAPI Read-in Process: 
##### newsDF contains features extracted from raw newsAPI feed, for a given data range, and query.

In [11]:
# Object creation
newsDF = get_info(news)

In [12]:
# Verifying correct data extraction
print("\nDataFrame Dimensions:", newsDF.shape, "\n")
newsDF.head(3)


DataFrame Dimensions: (2900, 6) 



Unnamed: 0,author,description,publisher,source_url,timeStamp,title
0,Phillip Molnar,A British real estate company Monday launched ...,Sandiegouniontribune.com,http://www.sandiegouniontribune.com/business/r...,2018-01-09T00:00:00Z,"Purplebricks, flat fee real estate listers, la..."
1,,``In parts of the continent - especially comme...,Ml-implode.com,http://ml-implode.com/staticnews/2018-01-09_Wh...,2018-01-09T00:00:00Z,Why African millennials can't get enough of Bi...
2,"Associated Press, By Associated Press","NEW YORK — Kodak, which traces its roots to th...",Bostonherald.com,http://www.bostonherald.com/news/national/2018...,2018-01-09T00:00:00Z,Kodak surges at it becomes latest 'cryptocurre...


__1.3 Perform data cleanup and preprocessing.__



The following functions perform basic clean up on a dataframe. The purpose is to prepare the file to write-out (csv).  
 

In [13]:
# Replace 'None' values
def findNone(df):
    """
     Receives pandas datraframe, and removes null entries from author feature
    """
    print("Removing 'None' values in author feature...")
    author = df['author']
    publisher = df['publisher']
    
    for i in range(len(df)):
        if pd.isnull(author.loc[i]):
            author.loc[i] = publisher.loc[i]
    return(df)

In [14]:
# Remove gaps 
def gapStrip(df):
    """
    Receives pandas dataframe and leading and traling empty space`
    """
    df.columns = map(str.strip, df.columns) 
    print("Removing leading and trailing spaces and tabs...")
    # element-wise operation
    f = lambda x: x.strip() if (isinstance(x,str)) else x
    df = df.applymap(f)
    return(df)

In [15]:
# Standardize time stamps
def std_timeStamp(df):
    """
    Receives pandas dataframe and standardizes time stamps 
    """
    import datetime
    # Check to see time stamps are in zero timezones
    print("Converting Time Stamps to Desired Standard Formating...")
    for time in df['timeStamp']:
        if time.endswith('Z'):
            df['timeStamp'] = pd.to_datetime(df['timeStamp'], 
                                            infer_datetime_format = True,
                                            utc = True)                       # returns a type '.Timestamp'
            return(df)
        else:
            print("Revisit appropriate variable or function to deal with time zones that are not zero")

In [16]:
def feature_clean(df):
    """
    Performs Generic Cleanup and Preprocessing on a given dataframe sourced from newsAPI
    """
    temp = findNone(df)            # removes missing values from author column
    temp2 = gapStrip(temp)      # remove leading and trailing white space
    temp3 = std_timeStamp(temp2)  # convert time stamps to starndard
    return(temp3)

In [17]:
riskEx_df = feature_clean(newsDF)
riskEx_df.head(5)

Removing 'None' values in author feature...
Removing leading and trailing spaces and tabs...
Converting Time Stamps to Desired Standard Formating...


Unnamed: 0,author,description,publisher,source_url,timeStamp,title
0,Phillip Molnar,A British real estate company Monday launched ...,Sandiegouniontribune.com,http://www.sandiegouniontribune.com/business/r...,2018-01-09 00:00:00+00:00,"Purplebricks, flat fee real estate listers, la..."
1,Ml-implode.com,``In parts of the continent - especially comme...,Ml-implode.com,http://ml-implode.com/staticnews/2018-01-09_Wh...,2018-01-09 00:00:00+00:00,Why African millennials can't get enough of Bi...
2,"Associated Press, By Associated Press","NEW YORK — Kodak, which traces its roots to th...",Bostonherald.com,http://www.bostonherald.com/news/national/2018...,2018-01-09 00:00:00+00:00,Kodak surges at it becomes latest 'cryptocurre...
3,Ml-implode.com,``JPMorgan Chase CEO Jamie Dimon has walked ba...,Ml-implode.com,http://ml-implode.com/staticnews/2018-01-09_Di...,2018-01-09 00:00:00+00:00,"Dimon: ""I regret calling bitcoin a fraud"""
4,Ml-implode.com,"``Staff at the regulatory agency ""expressed co...",Ml-implode.com,http://ml-implode.com/staticnews/2018-01-09_Fu...,2018-01-09 00:00:00+00:00,Fund managers say bitcoin ETF proposals withdr...


__1.4 Split dataset into n-csv-files for distrubuted computation or batching.__

In [18]:
def df_to_csvs(df):
    articlesPage = int(100)
    totalArticles = len(df)
    batchSize=round(totalArticles/articlesPage)          # number of rows in single output file
        
    for id, df_i in  enumerate(np.array_split(df, batchSize)):
        df_i.to_csv('rawData_{id}.csv'.format(id=id), index_label = False)                 

In [19]:
# write out n-csv-files each with 100 rows. Process is done to reduce computational load.
df_to_csvs(riskEx_df)

### **End Data Mining I:** Read-in NewsAPI feed for a given date range
___