In [1]:
# https://altair-viz.github.io/case_studies/exploring-weather.html
# look into this once you get started with EDA
# conda install -c conda-forge altair 


In [121]:
## Import Packages and Libraries ##

# Web parsing, scraping, etc.
import bs4 as bs # BeautifulSoup4 
import urllib3
import re
import requests # HTTP parser
import html5lib

# DataFrames and math
import pandas as pd
import numpy as np

# Output related packages 
import pprint as pp

# read-in and write-out
import csv

In [123]:
# make it run on py2 and py3
# (a __future__ import has to be the first statement of the code it appears in)
from __future__ import division, print_function

# stretch Jupyter coding blocks to fit screen
# `display` and `HTML` are taken from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Data Mining II
This  notebook is intended to perform the following processes:

    2.1 Read-in batched csv files and perform content extraction.

    2.2 The notebook uses beautifulsoup to extract paragraph content of each url in batch.

    2.3 Content extraction is written out as a matching csv file for future concatenation/merging

___
### **Begin Data Mining II:** Per-url, article content extraction


__2.1 Read-in batched csv files and perform content extraction.__

In [178]:
# Load the raw article metadata; later cells read its 'author' and 'source_url' columns.
df = pd.read_csv('rawData.csv')

In [125]:
# Show the last three rows as a quick check of the loaded frame.
df.tail(3)

Unnamed: 0,author,description,publisher,source_url,timeStamp,title
24797,Bram de Haas,Bitcoin has crashed pretty badly since the end...,Seekingalpha.com,https://seekingalpha.com/article/4158925-buy-b...,2018-03-26 18:21:07+00:00,"Why Buy Bitcoin At $8,169.80?"
24798,Financial Times,Social media sites are under pressure to prote...,Financial Times,https://www.ft.com/content/bddd293a-3118-11e8-...,2018-03-26 18:18:25+00:00,Twitter cracks down on cryptocurrency ads
24799,Michelle Meyers,"As was expected, the company joins Facebook an...",Cnet.com,https://www.cnet.com/news/twitter-confirms-its...,2018-03-26 18:17:15+00:00,Twitter confirms it's banning cryptocurrency a...


__2.2 Use beautifulsoup to extract paragraph content of each url in batch.__

#### Function: **get_article**
get_article controls two functions that extract the contents of a given url by accessing its paragraphs.

In [126]:
def validate_site(site):
    """
    Screen a url before it is downloaded.

    Parameters
    ----------
    site : str (or any object that str() can convert)
        The url to check.

    Returns
    -------
    '' when the path component of the url contains 'mp3' or 'rlslog'
    (the site is flagged for removal), otherwise `site` itself.
    """
    from urllib.parse import urlparse

    # urlparse requires a string; only the path is inspected, not the host name
    path = urlparse(str(site)).path

    # validates that site is not a podcast ('mp3') or an 'rlslog' link
    if 'mp3' in path or 'rlslog' in path:
        return ''                                    # flagged for removal
    return site

In [127]:
def get_content(site, timeout=30):
    """
    Download a page and return its paragraph tags.

    Parameters
    ----------
    site : str
        Url of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server; forwarded to requests.get. Without a
        timeout one unresponsive host can stall the whole batch indefinitely.

    Returns
    -------
    list
        Every '<p>' element found in the page. An empty list is returned when
        the request itself fails, so a single bad url does not abort the run.
    """
    try:
        src = requests.get(site, timeout=timeout).content   # raw body of the http response
    except requests.exceptions.RequestException as err:
        print('\t Request failed for', site, '-->', err)
        return []
    soup = bs.BeautifulSoup(src, 'lxml')        # parsed with the lxml parser (the lxml package must be installed)
    return soup.find_all('p')                   # finds all paragraphs '<p>' in html object

In [128]:
def extract_from_content(body):
    """
    Flatten a sequence of parsed paragraphs into a single string.

    Every item of `body` must expose a `.text` attribute. The paragraph texts
    are joined with a tab character so they can be split apart again later.
    """
    return "\t".join(paragraph.text for paragraph in body)

In [129]:
def get_article(site):
    """
    Return the paragraph text of the article at `site`.

    Two marker strings are returned instead of text when extraction is not
    possible:
      '.mp3: Invalid Type' -- validate_site flagged the url (returned '')
      '403 Forbidden'      -- no paragraph text could be extracted
    """
    checked = validate_site(site)
    if not checked:                                 # empty string is falsy -> url was flagged
        return '.mp3: Invalid Type'

    text = extract_from_content(get_content(checked))
    # an empty string is falsy -> nothing was extracted from the page
    return text if text else '403 Forbidden'

#### Function: **get_text**
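Function receives one article row, resolves the real url when the author is a known aggregator, and returns the extracted content (and any corrected url) as lists to be merged into the output dataframe.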

In [130]:
def get_text(df, timeout=30):
    """
    Extract the article text for a single article row.

    Parameters
    ----------
    df : pandas.Series or other mapping
        One article row; only its 'author' and 'source_url' entries are read.
    timeout : float or tuple, optional
        Seconds to wait when fetching the aggregator page (forwarded to
        requests.get).

    Returns
    -------
    dict
        {'url': [...], 'content': [...]}.
        'content' holds the one string returned by get_article.
        'url' is empty for ordinary rows. For the aggregator
        'Ml-implode.com' it holds the url that was actually scraped: the link
        found on the aggregator page, or the original url when no link could
        be found.
    """
    content = []
    url = []
    author = df['author']
    link = df['source_url']

    if author == 'Ml-implode.com':
        # The aggregator page only points at the real article, so follow its link.
        target = None
        try:
            r = requests.get(link, timeout=timeout)
            soup = bs.BeautifulSoup(r.text, 'html.parser')
            source = soup.find(id="LIJIT_title")        # manually found -- may differ moving forward
            # find() returns None when the element is missing; guard each step
            anchor = source.find('a') if source is not None else None
            target = anchor.get('href') if anchor is not None else None
        except requests.exceptions.RequestException as err:
            print('\t Could not resolve aggregator link for', link, '-->', err)

        if target:
            link = target                               # the correct url
        url.append(link)
        content.append(get_article(link))               # send url out for extraction

    else:
        content.append(get_article(link))

    return {'url': url, 'content': content}

#### Function: **combine_df**
Function merges the content (and the corrected url, if any) returned by get_text into the article's row, so that the rows can then be combined into a single dataframe.

In [131]:
def combine_df(df1, df2):
    """
    Attach the extracted text to a copy of an article row.

    df1 is the article row and df2 the dictionary with 'content' and 'url'
    lists. The returned copy gains a 'contents' entry (the joined 'content'
    list). When the row's author is the known aggregator, its 'source_url'
    is replaced with the joined 'url' list. df1 itself is not modified.
    """
    import copy

    merged = copy.deepcopy(df1)
    merged['contents'] = "".join(df2['content'])    # list of strings -> one string

    # known aggregator: swap in the url the aggregator referenced
    if df1['author'] == 'Ml-implode.com':
        merged['source_url'] = "".join(df2['url'])  # 'list' item -> 'str' item
    return merged

In [132]:
def replace_author_name(df):
    """
    Return the network location (host name) of a url.

    Parameters
    ----------
    df : str (or any object that str() can convert)
        Despite the parameter name this is a single url, not a dataframe.

    Returns
    -------
    str
        The netloc part of the url, i.e. the url without scheme and path
        ('www.cnbc.com' for 'https://www.cnbc.com/2018/...'). An empty string
        is returned when the value has no netloc.
    """
    from urllib.parse import urlparse

    # urlparse requires a string
    return urlparse(str(df)).netloc          #.netloc extract the main url -- i.e. excludes path

In [133]:
# Replace 'None' values
def rm_false_author(df,start):
    """
    Replace the aggregator's name in the 'author' and 'publisher' columns.

    Every row whose author is 'Ml-implode.com' gets the host name of its
    'source_url' written into both 'author' and 'publisher'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'author', 'source_url' and 'publisher' columns. It is
        updated in place and also returned.
    start : int
        No longer used; kept so existing calls keep working. It used to be
        the label of the row to overwrite, which only hit the right row when
        it happened to equal the position of the aggregator row.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    # rows that still carry the known aggregator as author
    is_aggregator = df['author'] == 'Ml-implode.com'

    if is_aggregator.any():
        # host name of each affected row's url, in row order
        domains = df.loc[is_aggregator, 'source_url'].map(replace_author_name)
        # .values assigns by position, so the row labels do not matter
        df.loc[is_aggregator, 'author'] = domains.values
        df.loc[is_aggregator, 'publisher'] = domains.values    # replaces instances of incorrect publisher name
    return(df)

In [306]:
def stitch_df(df, start, halt, init):
    """
    Extract the article text for one batch of rows and assemble the result.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of the current batch. Rows are looked up by index label, so
        the labels start+init ... halt+init-1 must all be present.
    start, halt : int
        Bounds of the batch before the offset is applied (halt is exclusive).
    init : int
        Offset added to `start` and `halt` to obtain the index labels.

    Returns
    -------
    pandas.DataFrame
        One row per processed article, indexed from 0, holding the row's
        values plus the 'contents' entry added by combine_df. The frame is
        empty when the range contains no rows.
    """
    import copy
    df = copy.deepcopy(df)              # work on a private copy of the batch
    data = pd.DataFrame()
    start += init
    halt += init

    # displays progress
    print("Batching range:", start+1,"-", halt)

    # merges the rows one by one and calls the functions that provide each row with data
    while start < halt:
        #print('  extracting article', start+1)
        df1 = copy.deepcopy(df.loc[start])
        df2 = get_text(df1)

        # combine_df returns a single row; turn it into a one-row frame.
        # pd.concat replaces DataFrame.append, which newer pandas no longer provides.
        new_row = combine_df(df1, df2).to_frame().T
        if data.empty:
            datum = new_row.reset_index(drop=True)
        else:
            datum = pd.concat([data, new_row], ignore_index=True)  # keep entries with same row number

        ## minor clean up ##
        # optional: data = data.drop(columns=['description'])          # removes redundant column
        # `datum` is indexed from 0, so the row just added sits at len(datum) - 1
        data = rm_false_author(datum, len(datum) - 1)                  # replaces invalid author entry

        start += 1
    return(data)

__Batch Control and out-to-csv fail safe__

In [327]:
def batch_control(df, batch, init, out_path='riskEx_df.csv'):
    """
    Extract article content batch by batch, saving the csv after every batch.

    The csv is rewritten after each batch so that an error part-way through
    loses at most the batch that was running.

    Parameters
    ----------
    df : pandas.DataFrame
        The articles to process.
    batch : int
        Number of rows per batch; must be positive.
    init : int
        Index label of the first row of `df`; forwarded to stitch_df.
    out_path : str, optional
        The csv file that is created or extended.

    Raises
    ------
    ValueError
        If `batch` is not positive (the loop could never finish).
    """
    import os

    if batch <= 0:
        raise ValueError("batch must be a positive integer")

    start = 0
    terminate = len(df)
    print("Number of articles to be extracted:",terminate)
    print("Total number of batches:", int(np.ceil(terminate/batch)),"\n")
    while start < terminate:

        # ensures we don't over expand range: the last batch stops at the end of df
        halt = min(start + batch, terminate)

        temp = df[start:halt]
        temp_out = stitch_df(temp, start, halt, init)

        # Test for the file explicitly. Catching IOError around the whole update
        # would also catch a failed write and then replace the csv with only this batch.
        if os.path.isfile(out_path):
            print('\t Updating existing csv file')
            temp_in = pd.read_csv(out_path)
            # pd.concat replaces DataFrame.append, which newer pandas no longer provides
            temp_out = pd.concat([temp_in, temp_out], ignore_index=True)
            temp_out.to_csv(out_path, index_label = False)
            print('\t Updated batch saved to csv')
        else:
            print("\t Creating initial csv file")
            temp_out.to_csv(out_path, index_label = False)
            print('\t Initial batch saved to csv')

        start += batch  # Batch control
    print("\n PROJECT COMPLETED\n ")

__2.3 Write out content as csv file__

In [348]:
## Batching and project sizes ##
# time to 100 articles:    02:28
# time to 500 articles:    12:25
batch_size = 100
# Each project below is a positional slice of `df`.
# Presumably each one is passed to batch_control in its own run -- TODO confirm.
project1 = df[:1000]       # 22:49
project2 = df[1000:5000]
# NOTE(review): rows 5000-5299 are not covered by any slice -- confirm this is intentional.
project3 = df[5300:10000]
project4 = df[10000:15000]
project5 = df[15000:20000]
# NOTE(review): this slice starts at 19500, so rows 19500-19999 are also in project5 -- confirm.
project6 = df[19500:]

## __NOTE__

__To ensure that no data is overwritten, update the initializer (`init`) to the appropriate value, i.e. the first row value of the dataframe being passed, before running batch_control.__

In [None]:
# execute extraction and write out
init = 25000 # KEY: initialize to the first row value in dataframe being passed
# NOTE(review): `project7` is not defined in the cells shown here; set the project
# name and `init` to match the slice that is actually being run.

#batch_control(project7, batch_size, init)

__Finally:__ The following carries out some minor preprocessing

In [351]:
# Manual check
riskEx = pd.read_csv('riskEx_df.csv')
# info() prints its report itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
riskEx.info()
riskEx.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24800 entries, 0 to 24799
Data columns (total 7 columns):
author         24800 non-null object
contents       24800 non-null object
description    24800 non-null object
publisher      24800 non-null object
source_url     24800 non-null object
timeStamp      24800 non-null object
title          24800 non-null object
dtypes: object(7)
memory usage: 1.5+ MB
None


Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
24795,Anna Hensel,"Above: Bitcoin / Cryptocurrency\tTwitter, foll...","Twitter, following in the footsteps of Google ...",Venturebeat.com,https://venturebeat.com/2018/03/26/twitter-to-...,2018-03-26 18:36:42+00:00,"Twitter to start banning cryptocurrency ads, j..."
24796,Camila Russo,|E-Paper\tNew York: Twitter Inc. is joining Fa...,Cryptocurrency exchanges and wallet services a...,Livemint.com,https://www.livemint.com/Industry/XpaNirgkBbmC...,2018-03-26 18:30:47+00:00,"Twitter joins Facebook, Google in banning cryp..."
24797,Bram de Haas,403 Forbidden,Bitcoin has crashed pretty badly since the end...,Seekingalpha.com,https://seekingalpha.com/article/4158925-buy-b...,2018-03-26 18:21:07+00:00,"Why Buy Bitcoin At $8,169.80?"
24798,Financial Times,Hannah Murphy\tTwitter will ban adverts for ce...,Social media sites are under pressure to prote...,Financial Times,https://www.ft.com/content/bddd293a-3118-11e8-...,2018-03-26 18:18:25+00:00,Twitter cracks down on cryptocurrency ads
24799,Michelle Meyers,CNET también está disponible en español.\tDon'...,"As was expected, the company joins Facebook an...",Cnet.com,https://www.cnet.com/news/twitter-confirms-its...,2018-03-26 18:17:15+00:00,Twitter confirms it's banning cryptocurrency a...


In [353]:
# Removes rows with inaccessible data
def rm_site_error(df):
    """
    Remove the rows whose article text could not be extracted.

    A row is dropped when its 'contents' value is exactly one of the marker
    strings '403 Forbidden' or '.mp3: Invalid Type'. One message is printed
    per dropped row, in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'contents' column. It is not modified, and its index may
        hold any labels (it does not have to run 0..n-1).

    Returns
    -------
    pandas.DataFrame
        A copy of `df` without the flagged rows; the remaining rows keep
        their original index labels.
    """
    messages = {
        '403 Forbidden': "\t Error Found: 403 Forbidden --> Dropping site from dataframe...",
        '.mp3: Invalid Type': "\t Error Found: .mp3 Invalid Type --> Dropping site from dataframe...",
    }

    content = df['contents']
    flagged = content.isin(list(messages))      # True for every row holding a marker string

    for marker in content[flagged]:
        print(messages[marker])

    # select the surviving rows once instead of dropping rows one at a time
    return df.loc[~flagged].copy()

In [None]:
# Drop the articles flagged as inaccessible; rm_site_error works on a copy, so riskEx is left as it is.
use_riskEx = rm_site_error(riskEx)

In [357]:
# Preview the first rows only instead of rendering the whole frame.
use_riskEx.head(10)

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
0,"Cecchetti, Schoenholtz",Research-based policy analysis and commentary ...,"Despite recent technological advances, the cos...",Voxeu.org,https://voxeu.org/article/stubbornly-high-cost...,2018-03-27 00:00:00+00:00,The stubbornly high cost of remittances
1,Reuters,Sign up now to get free exclusive access to re...,Twitter Inc will start banning cryptocurrency ...,Cio.com.au,https://www.cio.com.au/article/635378/twitter-...,2018-03-26 23:43:50+00:00,Twitter to ban cryptocurrency ads
4,newsfeeds@nzherald.co.nz,\t NEW YORK (AP) — Twitter says it will ban or...,NEW YORK (AP) — Twitter says it will ban or re...,Nzherald.co.nz,http://www.nzherald.co.nz/business/news/articl...,2018-03-26 23:25:06+00:00,"Twitter to ban cryptocurrency ads, joining Fac..."
5,"David Cottle, Analyst, David Cottle",Get the best parts of DailyFX.com in the new D...,"The New Zealand Dollar had a busy morning, wit...",Dailyfx.com,https://www.dailyfx.com/forex/market_alert/201...,2018-03-26 23:19:00+00:00,"New Zealand Dollar Firm After Trade Beat,Eyes ..."
6,Kellie Ell,\n\n\n\n var postLoadFunctions = {}...,One expert said the majority of initial coin o...,CNBC,https://www.cnbc.com/2018/03/26/how-to-evaluat...,2018-03-26 23:18:00+00:00,How to evaluate risky initial coin offerings a...
7,"Amanda Lee, Josh Ye, Amanda Lee, Josh Ye","\tBlockchain\t‘Where there’s demand, there wil...",Initial coin offerings (ICOs) may well have be...,Scmp.com,http://www.scmp.com/tech/china-tech/article/21...,2018-03-26 23:15:26+00:00,How agents are helping investors buy into init...
8,Aaron Diamant,Sign in using your wsbtv profile\tNeed a profi...,Bill payment sites down as City of Atlanta wor...,Wsbtv.com,https://www.wsbtv.com/news/local/atlanta/cyber...,2018-03-26 23:13:00+00:00,Bill payment sites down as city of Atlanta wor...
9,Thomas Delahunty,"-Bitcoin news, price, information & analysis\t...",Cyber-criminals attempting to con people out o...,Newsbtc.com,https://www.newsbtc.com/2018/03/26/desperate-m...,2018-03-26 23:10:29+00:00,Desperate Man Dresses Up As Raccoon To Steal B...
10,Scott Scanlon,Nongaming venues inside Las Vegas casinos such...,Nongaming venues inside Las Vegas casinos such...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/las-...,2018-03-26 23:03:17+00:00,Las Vegas venues use cryptocurrencies to lure ...
11,newsfeeds@nzherald.co.nz,"\t Bitcoin began the week on a down note, decl...","Bitcoin began the week on a down note, declini...",Nzherald.co.nz,http://www.nzherald.co.nz/business/news/articl...,2018-03-26 22:56:45+00:00,"Bitcoin breaches US$8,000 level as March slump..."


In [356]:
## RUN AFTER ENTIRE DF IS COMPLETELY EXTRACTED ##
# saves dataframe as a preprocessed and cleaned (slightly) DataFrame
# `use_riskEx` is the cleaned frame; `temp_out` only exists inside batch_control.
use_riskEx.to_csv('use_riskEx.csv', index_label = False)

#### Completed Process: 
##### use_riskEx represents a dataframe of extracted features, their values, and the text content of each article's body

### **End Data Mining II:** Per-url, article content extraction
___