In [1]:
import newspaper
import csv
import pandas as pd

In [2]:
file = 'data/training_dataset.csv'

In [3]:
def urls_from_csv(csv_file, column=None):
    '''
    Takes csv directory and returns list of URLs
    '''
    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        contents = list(reader)
    
    urls = [line[1] for line in contents[1:]]
    return urls, contents

In [4]:
def urls_to_df(csv_file, column=None):
    '''
    Takes csv directory and returns list of URLs
    '''
    df = pd.read_csv(csv_file)
    df.columns = [x.lower() for x in df.columns]
    urls = list(df['url'])
    return urls, df

In [5]:
urls, contents = urls_from_csv(file)

In [33]:
len(contents[0])

3

In [6]:
def remove_newline(text):
    ''' Removes new line and &nbsp characters.
    '''
    text = text.replace('\n', ' ')
    text = text.replace('\xa0', ' ')
    return text

In [7]:
def html_report(link, nlp=False):
    report = {}
    a = newspaper.Article(link)
    a.download()
    a.parse()
    report['domain'] = a.source_url
    report['title'] = a.title
    report['authors'] = a.authors
    report['date_pub'] = a.publish_date
    report['text'] = remove_newline(a.text)
    # tag the type of article
    ## currently default to text but should be able to determine img/video etc
    report['type'] = 'text'
    return report

In [8]:
urls, df = urls_to_df(file)

In [9]:
def scrape_from_urls(urls):
    reports = []
    for url in urls:
        if url[-3:] == 'pdf':
            continue
        else:
            report = html_report(url)
            reports.append(report)
            
    return reports

In [24]:
url = urls[1]

In [25]:
a = newspaper.Article(url)

In [26]:
a.download()

In [27]:
a.parse()

In [110]:
keys = report[0].keys()
with open('data.csv', 'w') as f:
    dict_writer = csv.DictWriter(f, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(report)

In [54]:
contents

[['Country_or_region', 'URL', 'Tag'],
 ['Abyei Area',
  'http://www.securitycouncilreport.org/atf/cf/%7B65BFCF9B-6D27-4E9C-8CD3-CF6E4FF96FF9%7D/S_2015_302.pdf',
  'Conflict and violence'],
 ['Afghanistan',
  'http://www.independent.co.uk/news/world/asia/160-killed-and-hundreds-left-stranded-by-flooding-across-afghanistan-and-pakistan-8746566.html',
  'Disasters'],
 ['Afghanistan',
  'http://reliefweb.int/sites/reliefweb.int/files/resources/Natural%20Hazards%20Update%204-10%20February_1.pdf',
  'Disasters'],
 ['Afghanistan',
  'https://www.humanitarianresponse.info/en/operations/afghanistan/document/rafdump01-jan-2015-till-31-dec-2015-0',
  'Disasters'],
 ['Afghanistan',
  'http://floodlist.com/asia/afghanistan-flash-floods-faryab-baghlan-8-dead',
  'Disasters'],
 ['Afghanistan',
  'http://floodlist.com/asia/afghanistan-6-dead-flash-floods-kofab-badakhshan-july-2015',
  'Disasters'],
 ['Afghanistan',
  'http://reliefweb.int/report/afghanistan/afghanistan-earthquake-overview-assessed-nee

In [56]:
def urls_from_csv(dataset, column=None, header=1):
    '''
    Takes csv in the form of the training dataset and returns list of URLs
    Parameters
    ----------
    csv: path to csv file containing urls
    column: integer number (0 indexed) or name of column with urls
            if not given, function will try to find column with urls
    header: used to index beginning of rows
            defaults to 1, assumes header present

    Returns
    -------
    urls: a list of URLs
    '''
    # if a column is given
    if column:
        # check whether it is a valid integer
        if isinstance(column, int) and column < len(dataset[0]):
            # take urls from that column
            urls = [line[column] for line in dataset[header:]]
        # if a column name is given, check header also selected and is present    
        elif isinstance(column, str) and header == 1 and column in dataset[0]:
            # find the column index containing the string
            column = dataset[0].index(column)
            urls = [line[column] for line in dataset[header:]]
        elif isinstance(column, str) and header == 0:
            raise ValueError("Invalid use of column name."
                             "No header present in dataset.")
        elif isinstance(column, str) and column not in dataset[0]:
            raise ValueError("Invalid column name."
                             "Column name specified not in dataset."
                             "Please use a valid column name.")
        else:
            raise ValueError("Column index not in range of dataset."
                            "Please choose a valid column index.")
    # if no column specified, try to find by looking for  
    elif column is None:
        first_row = dataset[header]
        index = [i for i, s in enumerate(first_row) if 'http' in s]
        urls = [line[index] for line in dataset[header:]]
    else:
        raise ValueError("Can't find any URLs!")

    return urls

In [58]:
def csv_read(csvfile):
    '''
    Takes csv in the form of the training dataset and returns as list of lists
    representing each row.
    Parameters
    ----------
    csvfile: directory of csv file

    Returns
    -------
    dataset: dataset including header as list of lists
    '''
    with open(csvfile, 'r') as f:
        reader = csv.reader(f)
        dataset = list(reader)
    return dataset

In [68]:
def csv2dict(filename):
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        contents = [line for line in reader]
    return contents

In [100]:
dicting = csv2dict(file)

In [76]:
dicting[0]

OrderedDict([('Country_or_region', 'Abyei Area'),
             ('URL',
              'http://www.securitycouncilreport.org/atf/cf/%7B65BFCF9B-6D27-4E9C-8CD3-CF6E4FF96FF9%7D/S_2015_302.pdf'),
             ('Tag', 'Conflict and violence'),
             ('dave', 'barry')])

In [98]:
def dict2csv(filename, dataset):
    with open(filename, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=dataset[0].keys())
        writer.writerows(dataset)

In [101]:
dict2csv('out.csv', dicting)

In [102]:
import pandas as pd

In [103]:
a = pd.read_csv('out.csv')
a.head()

Unnamed: 0,Abyei Area,http://www.securitycouncilreport.org/atf/cf/%7B65BFCF9B-6D27-4E9C-8CD3-CF6E4FF96FF9%7D/S_2015_302.pdf,Conflict and violence
0,Afghanistan,http://www.independent.co.uk/news/world/asia/1...,Disasters
1,Afghanistan,http://reliefweb.int/sites/reliefweb.int/files...,Disasters
2,Afghanistan,https://www.humanitarianresponse.info/en/opera...,Disasters
3,Afghanistan,http://floodlist.com/asia/afghanistan-flash-fl...,Disasters
4,Afghanistan,http://floodlist.com/asia/afghanistan-6-dead-f...,Disasters
