In [1]:
# https://altair-viz.github.io/case_studies/exploring-weather.html
# look into this once you get started with EDA
# conda install -c conda-forge altair 


In [2]:
## Import Packages and Libraries ##

# Web parcing, scraping, etc.
import bs4 as bs # BeautifulSoup4 
import urllib3
import re
import requests # HTTP parser
import html5lib

# DataFrames and math
import pandas as pd
import numpy as np

# Output related packages 
import pprint as pp

# read-in and write-out
import csv

In [3]:
# stretch Jupyter coding blocks to fit screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>")) 

# make it run on py2 and py3
from __future__ import division, print_function

# Data Mining II
This  notebook is intended to perform the following processes:

    2.1 Read-in batched csv files and perform content extraction.

    2.2 The notebook uses beautifulsoup to extract paragraph content of each url in batch.

    2.3 Content extraction is written out as a matching csv file for future concatenation/merging

___
### **Begin Data Mining II:** Per-url, article content extraction


__2.1 Read-in batched csv files and perform content extraction.__

In [4]:
# Extract all .csv elements in folder
import glob
csv_all = glob.glob('*.csv') # elements of list are string name for each .csv file
csv_all

['rawData_6.csv',
 'rawData_7.csv',
 'rawData_5.csv',
 'rawData_4.csv',
 'rawData_0.csv',
 'rawData_1.csv',
 'rawData_3.csv',
 'rawData_2.csv',
 'rawData_9.csv',
 'rawData_8.csv']

__2.2 Use beautifulsoup to extract paragraph content of each url in batch.__

#### Function: **get_article**
get_artilcle controls two function that extract the contents of a given url -- by accessing the paragraphs.

In [5]:
def validate_site(site):
    from urllib.parse import urlparse
    import copy
    url = copy.copy(site) 
    
    # urlparse requires a string 
    info = urlparse(str(url))
    if 'mp3' in info.path:            # validates that site is not a podcast
        return('')                    # flagged for removal
    else:
        return(url)                    

In [6]:
def get_content(site):
    src = requests.get(site).content            # accesses content of html object
    soup = bs.BeautifulSoup(src, 'lxml')        # object creation used in extracting paragraphs using built-in html parser
    body = soup.find_all('p')                   # finds all paragraphs '<p>' in html object
    return(body)

In [7]:
def extract_from_content(body):
    sentence = [parags.text for parags in body]
    text = "\t".join(sentence)                  # tab delimeter for easier extraction
    return(text)

In [8]:
def get_article(site):
    valid_site = validate_site(site)
    if not valid_site:
        return('.mp3: Invalid Type')                # uses 'falsy' check if string is empty 
    else:
        body = get_content(valid_site)
        text = extract_from_content(body)
        if not text:                                # uses 'falsy' check if string is empty 
            return('403 Forbidden')
    return(text)

#### Function: **get_text**
Function loops over all urls and creates a list to be appended to newsDF.

In [9]:
def get_text(df):
    """
    Accepts a 6xn dataframe and returns a 1xn list and a modifed dataframe
    Note: stitching a dataframes to a list is much faster that updating existing one with a new column
    """
    import copy

    content = []
    url = []
    author = copy.copy(df['author'])
    link = copy.copy(df['source_url'])

    for name in author:
       
        if name == 'Ml-implode.com': 
            r = requests.get(link)
            soup = bs.BeautifulSoup(r.text, 'html.parser')
            source = soup.find(id="LIJIT_title")            # manually found -- may differ moving forward
            link = source.find('a').get('href')             # gets the 'href inside the a tag -- i.e. the correct url
            url.append(link)
            content.append(get_article(link))               # send correct url out for extraction
        
        else:
            content.append(get_article(link))
        
    return({'url':url, 'content':content})

#### Function: **combine_df**
Function sends list of urls to get_text and get_articles for content extraction. Function then combines all data into a single dataframe.

In [10]:
def combine_df(df1, df2):
    import copy
    author = copy.deepcopy(df1['author'])
    df = copy.deepcopy(df1)
    df['contents'] = "".join(df2['content']) # converts list to string
    
    # catches aggregator names and replaces them with that of the buplisher
    #checks for know aggregator, replaces it with the url referenced by aggregator
    for i in range(len(df)):
        if author == 'Ml-implode.com':
            df['source_url'] = "".join(df2['url']) # converts 'list' item to 'str' item
    return(df)

In [11]:
def replace_author_name(df):
    from urllib.parse import urlparse
    import copy
    #data = copy(df)
    url = copy.copy(df) 
    
    # urlparse requires a string 
    info = urlparse(str(url))
    return(info.netloc)          #.netloc extract the main url -- i.e. excludes path

In [12]:
# Replace 'None' values
def rm_false_author(df):
    """
     Receives pandas datraframe, and removes aggregator name from author feature
    """
    author = df['author']
    source = df['source_url']
    publisher = df['publisher']
    
    #checks for know aggregator, replaces it with the url referenced by aggregator
    for i in range(len(df)):
        if author.loc[i] == 'Ml-implode.com':
            author.loc[i] = replace_author_name(source.loc[i])  # parameter is correct url as type 'str'
            publisher.loc[i] = author.loc[i]                    #replaces instances of incorrect publisher name
    return(df)

In [13]:
def stitch_df(df):
    import copy
    df = copy.deepcopy(df)
    data = pd.DataFrame()
    
    # for-loop merges dataframes and calls other functions that provide dataframe with data
    for i in range(len(df)):
        df1 = copy.deepcopy(df.loc[i])
        df2 = get_text(df1)
        datum = data.append(combine_df(df1,df2))
        
        ## minor clean up ##
        # optional: data = data.drop(columns=['description'])     # removes redundant column        
        data = rm_false_author(datum)                             # replaces invalid author entry
    return(data)

__do something__

In [14]:
# Removes rows with inaccessible data
def rm_site_error(df):
    """
     Receives pandas datraframe, and removes rows containing data not accessible by BeautifulSoup
    """
    import copy
    data = copy.deepcopy(df)
    content = data['contents']
    for i in range(len(content)):
        if content.loc[i] == '403 Forbidden':
            print("Error Found: 403 Forbidden --> Dropping site from dataframe...")
            # df.drop(df.index[[1,3]])
            data.drop([i], axis = 0, inplace = True)
            #df = data[data.index != '403 Forbidden']
            #print(data.index[i], type(data.index[i]))
            #iris.ix[iris['sepal length (cm)'] >= 5]
            #data = df.ix[df['contents'] == '403 Forbidden' ]
        elif content.loc[i] == '.mp3: Invalid Type':
            print("Error Found: .mp3 Invalid Type --> Dropping site from dataframe...")
            data.drop([i], axis = 0, inplace = True)
    return(data)

In [15]:
#takes list of the csv-s, reads them and outputs list of df-s
def make_lst_dfs(lst_csv):
    lst_dfs = []
    for i in lst_csv:
        df= pd.read_csv(i)
        df = df.reset_index(drop = True)
        lst_dfs.append(df)
    return lst_dfs

#### Completed Process: 
##### riskEx_df represents a dataframe of extracted features, their values, and the text content of each article's body

__The following cell has an approximate run time of:__ 
#### **5 HOURS**

Use *riskEx_df.csv* or *riskEx_df.json* instead.

In [16]:
def control(lst):
    for item in range(len(lst)):
        print(item, lst[item])
        
        extract_this = stitch_df(lst[item])
        clean_this = rm_site_error(exract_this) 
        item.to_csv('riskEx_{id}.csv'.format(id=id), index_label = False)
#    for id, df_i in  enumerate(np.array_split(df, batchSize)):
        

In [17]:
chunk = pd.read_csv('rawData_0.csv')


In [18]:
cunk_df = stitch_df(chunk)

In [19]:
cunk_df.to_csv('short')

### **End Data Mining II:** Per-url, article content extraction
___

In [75]:
articles1000 = pd.read_csv('articles1000')

In [76]:
articles1000_content = stitch_df(articles1000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


SSLError: ("bad handshake: SysCallError(54, 'ECONNRESET')",)