In [1]:
import csv
from urllib.parse import urlparse

import newspaper
import pandas as pd

In [2]:
# Input CSV read by the URL-loading calls in this notebook; the path is relative.
file = 'data/training_dataset.csv'

In [3]:
def urls_from_csv(csv_file, column=None):
    '''
    Read a CSV file and return the URLs held in one of its columns.

    The first row is treated as a header and is never returned as a URL.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file.
    column : int or str, optional
        Which column holds the URLs. An int is a zero-based column index; a
        str is matched case-insensitively against the header row. Defaults
        to None, which selects the second column (index 1).

    Returns
    -------
    urls : list of str
        Value of the selected column for every non-blank data row.
    contents : list of list of str
        Every row of the file as parsed by csv.reader, header included.

    Raises
    ------
    ValueError
        If `column` is a name that does not appear in the header row.
    IndexError
        If a non-blank data row is too short to contain the selected column.
    '''
    # newline='' leaves newline handling to the csv module, so quoted fields
    # that contain line breaks are parsed correctly.
    with open(csv_file, 'r', newline='') as f:
        contents = list(csv.reader(f))

    if column is None:
        index = 1
    elif isinstance(column, str):
        header = [name.strip().lower() for name in contents[0]] if contents else []
        try:
            index = header.index(column.strip().lower())
        except ValueError:
            raise ValueError(
                'column %r not found in header of %s' % (column, csv_file)
            ) from None
    else:
        index = column

    # csv.reader yields an empty list for a blank line; skip those rows
    # instead of failing on the index lookup.
    urls = [row[index] for row in contents[1:] if row]
    return urls, contents

In [4]:
def urls_to_df(csv_file, column=None):
    '''
    Load a CSV file into a DataFrame and return the URLs from one column.

    Column names are lower-cased before the lookup, so the header may use
    any capitalisation.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file, passed straight to pandas.read_csv.
    column : str, optional
        Name of the column holding the URLs, matched case-insensitively.
        Defaults to None, which selects the 'url' column.

    Returns
    -------
    urls : list
        Values of the selected column, one per DataFrame row and in the
        same order. Missing cells are not filtered out, so the list stays
        aligned with the rows of `df`.
    df : pandas.DataFrame
        The loaded data with lower-cased column names.

    Raises
    ------
    KeyError
        If the selected column is not present in the file.
    '''
    df = pd.read_csv(csv_file)
    # str() keeps the lower-casing safe for non-string column labels.
    df.columns = [str(name).lower() for name in df.columns]
    key = 'url' if column is None else str(column).lower()
    urls = list(df[key])
    return urls, df

In [5]:
# Load the URL list and the raw parsed rows with the csv-module loader.
urls, contents = urls_from_csv(file)

In [6]:
def remove_newline(text):
    ''' Returns text with every newline and non-breaking space (U+00A0)
    replaced by an ordinary space.
    '''
    # One translation table handles both substitutions in a single pass.
    substitutions = str.maketrans({'\n': ' ', '\xa0': ' '})
    return text.translate(substitutions)

In [13]:
def html_report(link, nlp=False):
    '''
    Download and parse one web article and summarise it as a dict.

    Parameters
    ----------
    link : str
        URL of the article to fetch.
    nlp : bool, optional
        When True, also run newspaper's Article.nlp() and add 'keywords'
        and 'summary' entries to the report. Defaults to False.

    Returns
    -------
    dict
        Keys 'domain', 'title', 'authors', 'date_pub', 'text' and 'type',
        plus 'keywords' and 'summary' when `nlp` is True. 'text' has
        newlines and non-breaking spaces replaced by spaces.
    '''
    article = newspaper.Article(link)
    article.download()
    article.parse()

    report = {
        'domain': article.source_url,
        'title': article.title,
        'authors': article.authors,
        'date_pub': article.publish_date,
        'text': remove_newline(article.text),
        # tag the type of article
        ## currently default to text but should be able to determine img/video etc
        'type': 'text',
    }

    if nlp:
        # NOTE(review): Article.nlp() is understood to need NLTK tokenizer
        # data installed -- confirm in this environment before enabling.
        article.nlp()
        report['keywords'] = article.keywords
        report['summary'] = article.summary

    return report

In [8]:
# Load the same file through pandas. This rebinds `urls`, replacing the list
# produced by the urls_from_csv call.
urls, df = urls_to_df(file)

In [26]:
def scrape_from_urls(urls):
    '''
    Build an html_report for every URL that does not point at a PDF.

    Parameters
    ----------
    urls : iterable
        Candidate URLs. Entries that are not non-empty strings (for example
        None or NaN from a missing CSV cell) are skipped.

    Returns
    -------
    list of dict
        One html_report result per scraped URL, in input order. PDF links
        and skipped entries contribute nothing to the list.
    '''
    reports = []
    for url in urls:
        # Missing cells arrive as None/NaN rather than strings; there is
        # nothing to fetch for them and slicing them would raise.
        if not isinstance(url, str) or not url.strip():
            continue

        lowered = url.strip().lower()
        # Compare case-insensitively and also look at the URL path, so that
        # '.PDF' and 'report.pdf?download=1' are recognised as PDFs too.
        if lowered.endswith('pdf') or urlparse(lowered).path.endswith('.pdf'):
            continue

        reports.append(html_report(url))

    return reports

In [24]:
# Take the first URL as a sample for a manual check.
url = urls[0]

In [25]:
# Show the third-from-last character of the sample URL.
url[-3]

'p'

In [17]:
# First five URLs only, as a small subset for a trial run of the scraper.
urls_test = urls[0:5]

In [18]:
# Display the trial subset.
urls_test

['http://www.securitycouncilreport.org/atf/cf/%7B65BFCF9B-6D27-4E9C-8CD3-CF6E4FF96FF9%7D/S_2015_302.pdf',
 'http://www.independent.co.uk/news/world/asia/160-killed-and-hundreds-left-stranded-by-flooding-across-afghanistan-and-pakistan-8746566.html',
 'http://reliefweb.int/sites/reliefweb.int/files/resources/Natural%20Hazards%20Update%204-10%20February_1.pdf',
 'https://www.humanitarianresponse.info/en/operations/afghanistan/document/rafdump01-jan-2015-till-31-dec-2015-0',
 'http://floodlist.com/asia/afghanistan-flash-floods-faryab-baghlan-8-dead']

In [21]:
# Scrape the trial subset into a list of report dicts.
reports = scrape_from_urls(urls_test)

In [22]:
# Display the scraped reports.
reports

[{'authors': [],
  'date_pub': None,
  'domain': 'http://www.securitycouncilreport.org',
  'text': '',
  'title': '',
  'type': 'text'},
 {'authors': ['Heather Saul'],
  'date_pub': datetime.datetime(2013, 8, 5, 12, 33, 51, tzinfo=tzoffset(None, 3600)),
  'domain': 'http://www.independent.co.uk',
  'text': "Flash flooding across Afghanistan and Pakistan has left more than 160 dead and dozens stranded in one of South Asia's worst natural disasters this year, say officials.  The flooding, caused by unusually heavy rain, has left villagers stuck in remote areas without shelter, food or power.  Mountainous Afghanistan was the worst hit, with 61 people killed and approximately 500 traditional mud-brick homes washed away in more than a dozen villages in Sarobi, a rural district less than an hour from Kabul, officials said.  Floods left a village devastated in the remote eastern Afghan province of Nuristan. At least 60 homes were destroyed across three districts, said provincial spokesman Moh

In [14]:
# Placeholder only: `report` is rebound to the result of html_report in the
# following cell.
report = {}

In [15]:
# Scrape a single article as a one-off check of html_report.
report = html_report('http://floodlist.com/asia/afghanistan-flash-floods-faryab-baghlan-8-dead')

In [16]:
# Display the single report.
report

{'authors': [],
 'date_pub': datetime.datetime(2015, 5, 11, 9, 15, 5, tzinfo=tzutc()),
 'domain': 'http://floodlist.com',
 'text': 'Afghanistan state news agency, Bakhtar News Agency (BNA) report that at least 7 people have been killed in flash floods in Faryab Province in the north of the country. Flash floods in Baghlan Province have killed 1 person and injured around 10 others.  Flash floods struck on 08 May 2015 in Faryab Province after a period of heavy rainfall. The districts of Garyzan, Pashtunkot and Belcheragh were worst affected. BNA report that at least 7 people were killed and over 1,500 homes damaged. The Faizabada-Takhar highway have been closed to traffic and wide areas of crops and orchards have suffered damaged.  Kuwaiti News Agency (KUNA) also report that flooding struck in the Baghlan-i-Markazi district of Baghlan province, where 1 person was killed and several injured early on Saturday 09 May 2015.  “There was heavy rain in Baghlan-e-Markazi district Friday evening 

In [141]:
# The CSV writer call in this notebook passes `report` to writerows, which
# takes a sequence of row dicts, so wrap the single report in a list.
# Guarded so that re-running this cell does not nest the list a second time.
if not isinstance(report, list):
    report = [report]

In [107]:
# Field names for the CSV header, taken from the first report dict.
keys = report[0].keys()

In [110]:
# Write the report rows to data.csv with a header row.
# newline='' stops the file object re-translating the line endings the csv
# module already writes (otherwise blank rows appear on Windows). The
# explicit utf-8 encoding avoids depending on the platform default, since
# scraped text can contain arbitrary Unicode.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    dict_writer = csv.DictWriter(f, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(report)