In [21]:
# module to capture source data and write to a CSV
import os, errno, array, csv, json, math, random, urllib, urllib2, json, re
from datetime import datetime
import zipfile
import db_settings
import psycopg2

def download_file(url, filename=None):    
    if filename is None:
        p = url.split('/')
        filename = p[-1]
    if os.path.isfile(filename):
        print 'File already exists'
        return
    test_directory(filename)
    u = urllib2.urlopen(url)
    f = open(filename, 'wb')
    meta = u.info()
    try:
        file_size = int(meta.getheaders("Content-Length")[0])
        print "Downloading: %s Bytes: %s" % (filename, file_size)
    except IndexError:
        # can't get the header, so just download
        urllib.urlretrieve(url, filename)
        print 'Download Finished'
        return

    file_size_dl = 0
    block_sz = 8192
    increment = (file_size / block_sz) / 100
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        file_size_dl += len(buffer)
        f.write(buffer)
        if increment > 0 and (file_size_dl / block_sz)%increment == 0:
            status = r"%10d  [%3d%%]" % (file_size_dl, ((file_size_dl / block_sz) / increment))
            print status,
    print 'Download Finished'
    f.close()

def test_directory(filename):
    path = os.path.dirname(filename)
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

def write_to_csv(state, records, overwrite=False):
    """Write ``records`` to ``csvs/<state lower-cased>-data.csv``.

    state     -- string used (lower-cased) as the file name prefix.
    records   -- non-empty list of dicts; the keys of the first dict become
                 the CSV header.
    overwrite -- when False (default) an existing file is left untouched and
                 IOError is raised.

    Returns True after the file has been written.
    Raises IOError if the file exists and ``overwrite`` is False, and
    IndexError if ``records`` is empty. Inputs are validated before the
    ``csvs`` directory is created, so a rejected call leaves nothing behind.
    """
    filename = 'csvs/' + state.lower() + '-' + 'data' + '.csv'

    if os.path.isfile(filename) and not overwrite:
        raise IOError('File already exists. Specify overwrite = True in function parameters to overwrite.')
    if len(records) == 0:
        raise IndexError('State object has not data!')

    try:
        os.makedirs(os.path.dirname(filename))
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    with open(filename, 'wb') as f:
        writer = csv.DictWriter(f, fieldnames=records[0].keys())
        writer.writeheader()
        writer.writerows(records)
        # single-argument print(...) produces the same text under the Python 2 print statement
        print('Wrote %s rows to %s' % (len(records), filename))
    return True

    
def strip_and_encode_dict(d):
    """Return a copy of ``d`` with text UTF-8 encoded and whitespace-stripped.

    Every key is encoded to UTF-8 and stripped. Values are handled by type:
    nested dicts recurse into this function, lists go through
    strip_and_encode_list, non-empty unicode values are encoded to UTF-8,
    and non-empty byte strings are stripped. Falsy values (None, '', 0, ...)
    and other types are copied unchanged.
    """
    new_dict = dict()
    for key, value in d.items():
        # Encode the key whatever the value type is, so nested containers do
        # not leave a mix of unicode and byte-string keys.
        key = key.encode('utf-8').strip()
        if isinstance(value, dict):
            value = strip_and_encode_dict(value)
        elif isinstance(value, list):
            value = strip_and_encode_list(value)
        elif value:
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            if isinstance(value, str):
                value = value.strip()
        new_dict[key] = value
    return new_dict
def strip_and_encode_list(l):
    """Return a new list with every element of ``l`` cleaned.

    Dict elements are delegated to strip_and_encode_dict, list elements are
    processed recursively, non-empty unicode strings are encoded to UTF-8,
    and byte strings are stripped of surrounding whitespace. Anything else
    is passed through untouched.
    """
    def _clean(entry):
        # containers first, then scalar text handling
        if isinstance(entry, dict):
            return strip_and_encode_dict(entry)
        if isinstance(entry, list):
            return strip_and_encode_list(entry)
        if entry and isinstance(entry, unicode):
            entry = entry.encode('utf-8')
        if isinstance(entry, str):
            entry = entry.strip()
        return entry

    return [_clean(entry) for entry in l]

def st_time(func):
    """
        st decorator to calculate the total time of a func

        Wraps ``func`` so that each call prints
        "Function=<name>, Time=<elapsed seconds>" and then returns whatever
        ``func`` returned. The wrapper keeps the wrapped function's name and
        docstring.
    """
    # Imported locally: the cell that defines this decorator imports neither.
    import time
    from functools import wraps

    @wraps(func)
    def st_func(*args, **keyArgs):
        t1 = time.time()
        r = func(*args, **keyArgs)
        t2 = time.time()
        # single-argument print(...) produces the same text under the Python 2 print statement
        print("Function=%s, Time=%s" % (func.__name__, t2 - t1))
        return r

    return st_func

In [None]:
from selenium.webdriver import Firefox, FirefoxProfile
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import os, errno, time

class ffdriver(Firefox):
    def __init__(self, download_directory):
        self.download_directory = download_directory
        # To prevent download dialog
        profile = FirefoxProfile()
        print "Download directory is:", self.download_directory
        profile.set_preference('browser.download.folderList', 2) # custom location
        profile.set_preference('browser.download.manager.showWhenStarting', False)
        profile.set_preference('browser.download.dir', self.download_directory)
        profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'text/csv,application/vnd.ms-excel')
        profile.set_preference('browser.helperApps.alwaysAsk.force', 'false')

        #super(ffdriver, self).__init__() #
        Firefox.__init__(self, profile)
        
        self.implicitly_wait(60) # seconds
        
    def clear_downloads(self):
        try:
            os.makedirs(self.download_directory)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
            else:
                for f in os.listdir(self.download_directory):
                    os.remove(os.path.join(self.download_directory, f))
    def count_downloads(self):
        return len(os.listdir(self.download_directory))
    


In [17]:
import requests, json
from lxml import html
from datetime import datetime, timedelta

class arcgis_service(object):
    """Small client for the ``/query`` endpoint of one ArcGIS REST layer.

    Attributes:
      service_url  -- layer URL the queries are appended to.
      ids          -- object ids returned by get_ids(), or None before the call.
      records      -- raw feature dicts returned by get_records(), or None.
      flat_records -- cleaned single-level dicts from get_flat_records(), or None.
    """
    def __init__(self, service_url):
        # e.g. http://dog.dnr.alaska.gov/arcgis/rest/services/DOGMapServices/MapServer/9
        self.service_url = service_url
        self.ids = None
        self.records = None
        self.flat_records = None

    def get_ids(self):
        """Fetch, store and return the object ids of every record (where 1=1)."""
        query = '/query?where=1%3D1&returnIdsOnly=true&f=pjson'
        url = self.service_url + query
        r = requests.get(url)
        response = r.json()
        # Treat a null "objectIds" value as "no records" instead of failing
        # with a TypeError when iterating None.
        self.ids = list(response['objectIds'] or [])
        print('Obtained IDs for %s records.' % len(self.ids))
        return self.ids

    def get_records(self):
        """Fetch all features in batches of 100 ids; store and return them.

        Each batch request is tried up to 10 times, sleeping 4, 8, ... 2048
        seconds after a ConnectionError; after the last failure
        requests.exceptions.ConnectionError is raised. If a response contains
        'exceededTransferLimit' the loop stops and the records gathered so
        far are kept.
        """
        import time  # the cell defining this class does not import time itself

        if not self.ids:
            self.get_ids()
        self.records = list()
        query = '/query?f=pjson&outSR=4326&returnGeometry=true&returnGeometry=true&outFields=*&objectIds='
        for i in xrange(0, len(self.ids), 100):
            query_ids = [str(j) for j in self.ids[i:i+100]]
            query_url = self.service_url + query + '%2C+'.join(query_ids)
            for attempt in range(2, 12):
                try:
                    r = requests.get(query_url)
                except requests.exceptions.ConnectionError:
                    print('Request timed out. Waiting %s seconds then trying again.' % (2 ** attempt))
                    time.sleep(2 ** attempt)
                else:
                    break
            else:
                print('Failed after 10 retries at %s' % query_url)
                raise requests.exceptions.ConnectionError('Failed after 10 retries at ' + query_url)

            response = r.json()
            if 'exceededTransferLimit' in response:
                print('Too many records. Breaking...')
                break
            self.records.extend(response['features'])
        print('retrieved ' + str(len(self.records)) + ' records')
        return self.records

    # results from ArcGIS server usually come with geometry and attributes
    # dictionaries. This method combines them into a single-level dictionary
    def get_flat_records(self):
        """Merge every sub-dict of each record into one dict; clean, store and return them.

        Records are fetched first when none are loaded. The merged rows are
        passed through strip_and_encode_list before being stored.
        """
        if not self.records:
            self.get_records()
        flat_records = []
        for record in self.records:
            row = dict()
            for part in record.values():
                row.update(part)
            flat_records.append(row)
        self.flat_records = strip_and_encode_list(flat_records)
        return self.flat_records

    def ticks_to_ymd(self, ts):
        """Convert epoch milliseconds to an unpadded 'Y-M-D' string.

        Returns None for falsy input (None, 0, ''). A value that cannot be
        represented as a date is reported with 'bad TS:' and the error is
        re-raised.
        """
        if not ts or ts == u'':
            return None
        try:
            d = datetime(1970, 1, 1) + timedelta(seconds=(ts / 1000))
        except (ValueError, OverflowError):
            print('bad TS: %s' % str(ts))
            raise
        return '-'.join([str(d.year), str(d.month), str(d.day)])

In [None]:
# Alabama - Finished
# NOTE(review): `state` is a plain string here, but download_files() and
# parse() below read `state.download_directory`, which a str does not have --
# confirm where the download directory is meant to come from.
state = 'AL'

def download_files():
    """Drive Firefox through the OGB well database page and export one file per status.

    Empties the download directory, opens the page, selects radio button
    "RadioButtonList1_9", then for every entry of drop-down "DropDownList3"
    except 'Select a status...' selects it and clicks 'btn_status_xl'.
    Afterwards waits until no file ending in 'part' remains in the download
    directory.

    Returns True when the number of entries in the download directory equals
    the number of drop-down options minus one, otherwise False.
    """
    driver = ffdriver(state.download_directory)
    driver.clear_downloads()
    driver.get("http://www.ogb.state.al.us/ogb/database.aspx")
    assert "OGB Well Database" in driver.title
    driver.find_element_by_id("RadioButtonList1_9").click()
    select = Select(driver.find_element_by_id("DropDownList3"))
    options = [str(opt.get_attribute("value")) for opt in select.options]
    # one less than the option count -- presumably to discount the
    # 'Select a status...' entry skipped in the loop below
    num_options = len(options) - 1
    for option in options:
        if option != 'Select a status...':
            print option,
            # the drop-down is looked up again on every pass before selecting
            select = Select(driver.find_element_by_id("DropDownList3"))
            select.select_by_value(option)#.click()
            time.sleep(3)
            #print option,
            driver.find_element_by_id('btn_status_xl').click()
            time.sleep(1)
    # names ending in 'part' are treated as downloads still in progress
    while len([f for f in os.listdir(state.download_directory) if f[-4:]=='part']) > 0:
        time.sleep(1) # if current download in process, wait...
    driver.close()
    # count_downloads() only lists the directory, so it is called after close()
    if driver.count_downloads() == num_options:
        print "successfully downloaded", num_options, "files"
        return True
    else:
        print 'something went wrong!'
        return False

def parse():
    from BeautifulSoup import BeautifulSoup # bs4 crashes python repeatedly. bs3 seems more stable
    table_data = table_headers = []

    for filename in os.listdir(state.download_directory):
        with open(os.path.join(state.download_directory, filename), 'rb') as f:
            html_data = f.read()
        soup = BeautifulSoup(html_data)
        rows = soup('tr')
        if table_data == []:
            table_headers = [cell.text.strip().encode('utf-8') for cell in rows[0]("th")]
        table_data += [[(cell.text).strip().encode('utf-8') for cell in row("td")] for row in rows[1:]]

    rows = []
    for item in table_data:
        if len(item) != len(table_headers):
            print 'invalid row:', item
            table_data.remove(item)
            continue
        
        row = dict(zip(table_headers, item))
        rows.append(row)

    return rows

download_files()
rows = parse()
write_to_csv(state, rows, True)

In [None]:
# Alaska - Finished, Working
state = 'AK'
well_source_url = None
source_url = ''
description = """
"""
write_to_db(state.name, source_url, well_source_url, description)


from datetime import datetime, timedelta

if 'ak_records' not in locals():
    ak_gis = arcgis_service('http://dog.dnr.alaska.gov/arcgis/rest/services/DOGMapServices/MapServer/9')
    ak_records = ak_gis.get_records()

print ak_records[0]

rows = []
for record in ak_records:
    row = dict()
    for k, v in record['geometry'].iteritems():
        row[k] = v
    for k, v in record['attributes'].iteritems():
        row[k] = v
    ts = spud = None
    if record['attributes']['SDate']:
        ts = record['attributes']['SDate'] / 1000
        ts = record['attributes']['CDate'] / 1000
    elif record['attributes']['PDate']: 
        ts = record['attributes']['PDate'] / 1000
    if ts:
        d = datetime.utcfromtimestamp(ts) if ts > 0 else datetime(1970, 1, 1) + timedelta(seconds=(ts))
        spud = '-'.join([str(d.year), str(d.month), str(d.day)])
    row['Spud_Date'] = spud
    rows.append(row)

write_to_csv(state, rows, True)

In [42]:
# Arizona
state = 'AZ'

source = arcgis_service('http://services.azgs.az.gov/arcgis/rest/services/aasggeothermal/AZWellHeaders/MapServer/0')
az_records = source.get_flat_records()

# values of 'apino' that mean "no API number"
missing_api = {'', u'', 'urn:ogc:def:nil:OGC:1.0:missing'}
records = [x for x in az_records if x['apino'] and x['apino'].strip() not in missing_api]
for row in records:
    # ticks_to_ymd returns None for empty/None input, so `or` falls back to
    # the end-of-drilling date whenever the spud date is missing -- including
    # when it is None, which a comparison against '' does not catch.
    row['date'] = source.ticks_to_ymd(row['spuddate']) or source.ticks_to_ymd(row['endeddrillingdate'])

write_to_csv(state, records, True)

Obtained IDs for 4774 records.
retrieved 4774 records
Wrote 1135 rows to csvs/az-data.csv


True

In [None]:
# Arkansas
state = 'AR'

def download_files():
    driver = ffdriver(state.download_directory)
    driver.implicitly_wait(600) # 10 minutes
    driver.clear_downloads()
    driver.get('http://www.aogc2.state.ar.us/welldata/Wells/Default.aspx')
    assert "Production & Well Data" in driver.title
    criteria_select = Select(driver.find_element_by_id("cpMainContent_ddlCriteria"))
    criteria_select.select_by_visible_text('Well Type')
    time.sleep(3)
    well_type_select = Select(driver.find_element_by_id('cpMainContent_ddlListItem'))
    options = [str(opt.get_attribute("value")) for opt in well_type_select.options]

    for opt_idx in range(1, len(options)):
        driver.get('http://www.aogc2.state.ar.us/welldata/Wells/Default.aspx')
        assert "Production & Well Data" in driver.title   
        criteria_select = Select(driver.find_element_by_id("cpMainContent_ddlCriteria"))
        criteria_select.select_by_visible_text('Well Type')
        time.sleep(3)
        well_type_select = Select(driver.find_element_by_id('cpMainContent_ddlListItem'))
        options = [str(opt.get_attribute("value")) for opt in well_type_select.options]
        well_type_select.select_by_value(options[opt_idx])
        driver.find_element_by_id('cpMainContent_btnSubmit').click()
        time.sleep(3)
        driver.find_element_by_id('cpMainContent_btnExcel').click()
        while (len(os.listdir(state.download_directory)) < opt_idx
            and len([f for f in os.listdir(state.download_directory) if f[-4:]=='part']) > 0):
            time.sleep(15) # if current download in process, wait...
    driver.quit()
                         
    if driver.count_downloads() == len(options) - 1:
        print "successfully downloaded", num_options, "files"
        return True
    else:
        print 'something went wrong!'
        return False

download_files()

In [None]:
# California
state = State('CA')

In [None]:
# Colorado
state = State('CO')

In [None]:
# Connecticut
state = State('CT')

In [None]:
# Delaware
state = State('DE')

In [None]:
# District of Columbia
state = State('DC')

In [None]:
# Florida
state = State('FL')

In [None]:
# Georgia
state = State('GA')

In [None]:
# Hawaii
state = State('HI')

In [None]:
# Idaho
state = State('ID')

In [None]:
# Illinois
state = State('IL')

In [None]:
# Indiana - Scraping Finished and working
import Queue
from threading import Thread

def get_records(q, service):
    """Fetch the flattened records of the ArcGIS layer at ``service`` and put the list on ``q``."""
    q.put(arcgis_service(service).get_flat_records())

# One worker thread per PDMS layer; each puts its list of flat records on the queue.
services = [
    'https://gis.indiana.edu/arcgis/rest/services/PDMS/Basic_PDMS/MapServer/1',
    'https://gis.indiana.edu/arcgis/rest/services/PDMS/Basic_PDMS/MapServer/2',
]
q = Queue.Queue()
threads = [Thread(target=get_records, args=(q, service)) for service in services]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Drain the queue: every item is one layer's list of records.
records = []
while not q.empty():
    records.extend(q.get())

import requests, Queue
from lxml import html

# Scrape PDMS to get dates
def get_details(q, rows):
    """Fetch the well-events page for each row and queue one enriched copy per event.

    q    -- queue that receives the result dicts.
    rows -- iterable of dicts; each must have an 'IGS_ID' key.

    For every row the page wellEvents.cfm?igsID=<IGS_ID> is requested, with
    up to 12 attempts and a sleep of 2**attempt seconds after each
    ConnectionError; requests.exceptions.ConnectionError is raised when all
    attempts fail. Each queued dict is a copy of the row with 'PermitNo',
    'Date', 'Status' and 'Operator' added.

    NOTE(review): `time` is used here but not imported in this cell.
    """
    for row in rows:
        url = 'https://igs.indiana.edu/pdms/wellEvents.cfm?igsID=%s' % str(row['IGS_ID'])
        for attempt in range(12):
            try:
                page = requests.get(url)
            except requests.exceptions.ConnectionError as e:
                # print the back-off delay, then wait that long before retrying
                print str(2**attempt),
                time.sleep(2**attempt)
            else:
                break
        else:
            # only reached when no attempt succeeded
            print 'Failed after 12 retries at', url
            raise requests.exceptions.ConnectionError                
        tree = html.fromstring(page.content)
        # every element with id "indEventsTable", plus text nodes taken from
        # fixed row/column positions inside those elements
        events    = tree.xpath('//*[@id="indEventsTable"]')
        permits   = tree.xpath('//*[@id="indEventsTable"]/tr[1]/td[1]/text()')  
        dates     = tree.xpath('//*[@id="indEventsTable"]/tr[3]/td[1]/text()')
        statuses  = tree.xpath('//*[@id="indEventsTable"]/tr[1]/td[3]/text()')
        operators = tree.xpath('//*[@id="indEventsTable"]/tr[1]/td[2]/text()')
        # One pass when exactly one table matched, otherwise one fewer than the match count.
        # NOTE(review): the indexing below assumes each text list has at least
        # that many entries; a shorter list raises IndexError -- confirm
        # against the page structure.
        for idx in range(1 if len(events)==1 else len(events)-1):
            permit = permits[idx].encode('utf-8').strip() if permits[idx] else None
            date = dates[idx].encode('utf-8').strip() if dates[idx] else None
            status = statuses[idx].encode('utf-8').strip() if statuses[idx] else None
            operator = operators[idx].encode('utf-8').strip() if operators[idx] else None
            # copy so the caller's row is not modified
            tmp_row = dict(row)
            tmp_row['PermitNo'] = permit
            tmp_row['Date'] = date
            tmp_row['Status'] = status
            tmp_row['Operator'] = operator
            q.put(tmp_row)
            #results.append((permit, date, status))

threads = []
q = Queue.Queue()
workers = 25
increment = len(records) // workers
for idx in range(0, len(records), increment):
    t = Thread(target=get_details, args=(q, records[idx:idx+increment]))
    threads.append(t)
    t.start()
for t in threads:
    t.join()

in_records = []
while not q.empty():
    in_records.append(q.get())

print 'finished scraping', str(len(in_records)), 'records'

write_to_csv(in_records, True)

In [None]:
# Iowa
state = State('IA')

In [None]:
# Kansas
state = State('KS')

In [None]:
# Kentucky
state = State('KY')

In [None]:
# Louisiana
state = State('LA')

In [None]:
# Maine
state = State('ME')

In [None]:
# Maryland
state = State('MD')

In [None]:
# Massachusetts
state = State('MA')

In [None]:
# Michigan
state = State('MI')

In [None]:
# Minnesota
state = State('MN')

In [None]:
# Mississippi
state = State('MS')

In [None]:
# Missouri
state = State('MO')

In [None]:
# Montana
state = State('MT')

In [None]:
# Nebraska
state = State('NE')

In [None]:
# Nevada
state = State('NV')

In [None]:
# New Hampshire
state = State('NH')

In [None]:
# New Jersey
state = State('NJ')

In [None]:
# New Mexico
state = State('NM')

In [None]:
# New York
state = State('NY')

In [None]:
# North Carolina
state = State('NC')

In [None]:
# North Dakota
state = State('ND')

In [None]:
# Ohio
state = State('OH')

In [None]:
# Oklahoma
state = State('OK')

In [None]:
# Oregon
state = State('OR')

In [None]:
# Pennsylvania
import codecs
# NOTE(review): `state` is a plain string, but the functions below read
# `state.download_directory` -- confirm where the directory should come from.
state = 'PA'


def download_files():
    """Run the PA 'Spud_External_Data' report for all dates and export it as CSV.

    Empties the download directory, opens the report viewer, sets the date
    range to 01/01/1800 - 12/31/2099, clicks 'View Report', then picks
    'CSV (comma delimited)' from the save drop-down. Waits until the download
    directory is non-empty and contains no file ending in 'part'.

    Returns True when exactly one entry is in the download directory
    afterwards, otherwise False.
    """
    driver = ffdriver(state.download_directory)
    driver.clear_downloads()
    driver.get("http://www.depreportingservices.state.pa.us/ReportServer/Pages/ReportViewer.aspx?/Oil_Gas/Spud_External_Data")
    assert "Spud_External_Data" in driver.title
    start_date = driver.find_element_by_id("ReportViewerControl_ctl04_ctl03_txtValue")
    start_date.clear()
    start_date.send_keys("01/01/1800")
    end_date = driver.find_element_by_id("ReportViewerControl_ctl04_ctl05_txtValue")
    end_date.clear()
    end_date.send_keys("12/31/2099")
    driver.find_element_by_id("ReportViewerControl_ctl04_ctl00").click() # Click 'View Report'
    time.sleep(15) # wait 15 seconds for the report to run (an earlier comment said "about two minutes")
    save_dropdown = driver.find_element_by_id('ReportViewerControl_ctl05_ctl04_ctl00_ButtonLink')
    save_dropdown.click()
    download_link = driver.find_element_by_partial_link_text('CSV (comma delimited)')
    download_link.click()
    time.sleep(10)
    # switch to the first window handle before waiting on the download
    driver.switch_to_window(driver.window_handles[0])
    # wait while nothing has arrived yet or a file ending in 'part' is still present
    while (len(os.listdir(state.download_directory)) < 1) or (len([f for f in os.listdir(state.download_directory) if f[-4:]=='part']) > 0):
        #print 'downloading'
        time.sleep(3) # if current download in process, wait...
    driver.quit()
    # count_downloads() only lists the directory, so it is called after quit()
    if driver.count_downloads() == 1:
        print "successfully downloaded file"
        return True
    else:
        print 'something went wrong!'
        return False
    
def _skip_utf8_bom(lines):
    """Yield ``lines`` unchanged, minus a UTF-8 byte-order mark at the start of the first line."""
    first = True
    for line in lines:
        if first:
            first = False
            if line.startswith(codecs.BOM_UTF8):
                line = line[len(codecs.BOM_UTF8):]
        yield line


def parse_to_csv():
    """Combine every downloaded CSV into csvs/pa-data.csv.

    Each file in the download directory is read as CSV (a leading UTF-8 BOM
    is dropped). A first cell equal to '1/1/1800' is blanked. All rows,
    header rows included, are written to csvs/pa-data.csv in the order read.

    Returns True.
    """
    # NOTE(review): `state` is assigned a plain string in this cell, so
    # `state.download_directory` needs confirming.
    output_path = 'csvs/pa-data.csv'
    rows = []
    for filename in os.listdir(state.download_directory):
        # Python 2's csv module works on byte strings, so read bytes and keep
        # the cells as UTF-8 bytes from input to output.
        with open(os.path.join(state.download_directory, filename), 'rb') as f:
            for row in csv.reader(_skip_utf8_bom(f)):
                if row and row[0] == '1/1/1800':
                    row[0] = ''
                rows.append(row)

    try:
        os.makedirs(os.path.dirname(output_path))
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    with open(output_path, 'wb') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerows(rows)

    return True

#if download_files():
parse_to_csv()

In [None]:
# Rhode Island
state = State('RI')

In [None]:
# South Carolina
state = State('SC')

In [None]:
# South Dakota
state = State('SD')

In [None]:
# Tennessee
state = 'TN'



import requests
# for whatever reason, urllib2 gets stuck in an endless redirect. Since it's a CSV file, we just
# use requests instead

# NOTE(review): the URL embeds what looks like a session identifier
# (22741039565748) -- confirm it stays valid between runs.
src = 'http://environment-online.state.tn.us:8080/pls/enf_reports/f?p=9034:34300:22741039565748:CSV::::'
output_path = 'csvs/tn-data.csv'

r = requests.get(src)
# fail loudly instead of saving an HTTP error page as if it were the data
r.raise_for_status()
test_directory(output_path)  # make sure csvs/ exists
with open(output_path, 'wb') as f:
    f.write(r.content)
# report the path that was written, not the file object's repr
print('%s downloaded' % output_path)


In [None]:
# Texas
state = State('TX')

In [None]:
# Utah
state = State('UT')

In [None]:
# Vermont
state = State('VT')

In [None]:
# Virginia
state = State('VA')

In [None]:
# Washington
state = State('WA')

In [None]:
# West Virginia
state = State('WV')

In [None]:
# Wisconsin
state = State('WI')

In [None]:
# Wyoming
state = State('WY')

In [None]:
# Placeholders for the offshore regions. Each assignment replaces the
# previous one, so only the last value remains after the cell runs.
# Alaska Offshore
state = 'AK1'
# Pacific Coast Offshore
state = 'CA1'
# Northern Gulf of Mexico
state = 'TX1'
# Atlantic Coast Offshore
state = 'DC1'
