This notebook takes the well data stored in a local database and renders out a binary file with the well location data in EarthTime format.

## Clean wells that are outside of their states

The dataset has ~50,000 wells whose coordinates fall well outside the boundaries of their recorded state. The code below creates a 50 km buffer around each state boundary (to allow for offshore wells) and then sets the location to NULL for any well that falls outside its state's buffer.

For validation purposes:
```sql
-- DROP MATERIALIZED VIEW IF EXISTS outliers;
CREATE MATERIALIZED VIEW outliers as 
SELECT wells.* FROM wells
JOIN states_shape sf ON wells.state = sf.stusps
AND NOT ST_Within(wells.location::geometry, sf.geom);
```

In [None]:
%%sql
CREATE MATERIALIZED VIEW states_buffered AS
-- Each state outline is reprojected to SRID 2955 so that the buffer distance (50000)
-- is expressed in that projection's linear units, then transformed back to SRID 4326
-- so it can be compared with well locations.
-- NOTE(review): SRID 2955 appears to be a single-UTM-zone projection; for states far
-- from that zone the buffer width is presumably only approximate -- confirm this is acceptable.
SELECT gid, stusps, name, ST_Transform(ST_Buffer(ST_Transform(geom, 2955), 50000), 4326) as geom
FROM states_shape;

In [None]:
%%sql
UPDATE wells SET location = NULL
-- Rows are matched by api, so every wells row that shares an api with an
-- out-of-buffer row has its location cleared, not only the offending row.
WHERE api IN (
    SELECT api FROM wells
    -- Inner join: wells whose state has no matching stusps in states_buffered are left untouched.
    JOIN states_buffered sf ON wells.state = sf.stusps
    WHERE NOT ST_Within(wells.location::geometry, sf.geom)
)

## Export Data

In [1]:
import os, array, csv, json, math, random
from datetime import datetime
from datetime import date
import db_settings
import psycopg2

def LonLatToPixelXY(lonlat):
    """Project a (longitude, latitude) pair in degrees to Mercator pixel coordinates.

    Longitude is mapped linearly so that -180..180 becomes 0..256; latitude goes
    through the Mercator log-tangent formula, with the equator landing at y = 128
    and y decreasing as latitude increases.

    Args:
        lonlat: a two-item sequence ``(lon, lat)`` in degrees.

    Returns:
        A two-element list ``[x, y]``.

    Raises:
        ValueError: if ``lonlat`` does not unpack into exactly two values, or if
            the latitude makes the tangent non-positive (e.g. -90), which is
            outside the domain of ``math.log``.
    """
    lon, lat = lonlat

    pixel_x = (lon + 180.0) * 256.0 / 360.0

    # Mercator: ln(tan(pi/4 + lat/2)), with the angle written out in degrees -> radians.
    half_angle = (lat + 90.0) * math.pi / 360.0
    mercator = math.log(math.tan(half_angle))
    pixel_y = 128.0 - mercator * 128.0 / math.pi

    return [pixel_x, pixel_y]

## Traditional - Lat / Lon / Time

In [2]:
# Fetch (date, state, lon, lat) for every oil/gas well that still has a location.
# The result is left in `wells` for the export cells below.
query = """
    SELECT date, state,
        ST_X(location::geometry) AS lon,
        ST_Y(location::geometry) AS lat
    FROM wells
    WHERE type IN ('OIL', 'GAS', 'OILANDGAS') AND
        location IS NOT NULL;
"""

conn = psycopg2.connect(database=db_settings.DB, user=db_settings.USER, password=db_settings.PASSWD, host=db_settings.HOST)
try:
    # `with conn` only scopes the transaction (commit on success, rollback on error);
    # it does not close the connection, hence the explicit close() in `finally`.
    with conn:
        with conn.cursor() as cur:
            try:
                cur.execute(query)
            except psycopg2.Error as e:
                # Show the failing statement and the server message, then re-raise:
                # carrying on to fetchall() would only mask the real error with a second one.
                print (query)
                print (e.pgerror)
                raise
            wells = cur.fetchall()
finally:
    conn.close()

In [4]:
# the traditional version: one (x, y, epochtime) float32 triple per well
data = []

for well in wells:
    well_date, state, lon, lat = well
    x, y = LonLatToPixelXY([lon,lat])
    if well_date:
        # Promote the date to midnight of that day so it can be subtracted from the epoch.
        well_date = datetime.combine(well_date, datetime.min.time())
        epochtime = (well_date - datetime(1970, 1, 1)).total_seconds()
    else:
        # Wells without a date are written with epoch time 0.
        epochtime = 0
    data += [x,y,epochtime]

# Use a context manager so the output file is flushed and closed deterministically.
with open('data/data-plain.bin', 'wb') as out_file:
    array.array('f', data).tofile(out_file)

### Fill in missing dates with random dates
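* For Texas, wells with no date get a random date between 1894 and 1984
* For New Mexico, dates earlier than 1901 (presumably placeholders) are replaced with random dates between 1922 (first commercial well) and 1984
* For Oklahoma, dates earlier than 1897 are replaced with random dates between 1907 and 1984
* Any other wells with no date fade in at random dates between 1960 and 1984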

In [5]:
from random import randrange
from datetime import timedelta

def random_date(start = date(1900, 1, 1), end = date(1984, 1, 1)):
    """Pick a random day in the half-open range ``[start, end)``.

    A whole number of days, drawn with ``randrange`` from 0 up to (but not
    including) the number of days between ``start`` and ``end``, is added to
    ``start``.

    Args:
        start: first day that can be returned (a ``date``; defaults to 1900-01-01).
        end: day just past the last one that can be returned (defaults to 1984-01-01).

    Returns:
        ``start`` shifted forward by the randomly chosen number of days.

    Raises:
        ValueError: if ``end`` is not at least one day after ``start``
            (``randrange`` rejects an empty range).
    """
    span_days = (end - start).days
    offset = timedelta(days=randrange(span_days))
    return start + offset

# Same (x, y, epochtime) float32 triples as the plain export, but wells with a
# missing or suspiciously early date get a random replacement date.
# Note: the random module is not seeded here, so each run produces different dates.
data = []

for well in wells:
    well_date, state, lon, lat = well
    x, y = LonLatToPixelXY([lon,lat])
    if well_date:
        # Dated wells: only very early NM / OK dates are replaced.
        if state == 'NM' and well_date < date(1901, 1, 1):
            well_date = random_date(start = date(1922, 1, 1))
        elif state == 'OK' and well_date < date(1897, 1, 1):
            well_date = random_date(start = date(1907, 1, 1))
    elif state == 'TX':
        # Undated Texas wells are spread from 1894 onwards.
        well_date = random_date(start = date(1894, 1, 1))
    else:
        # Every other undated well is spread from 1960 onwards.
        well_date = random_date(start = date(1960, 1, 1))

    # Promote the date to midnight of that day so it can be subtracted from the epoch.
    well_date = datetime.combine(well_date, datetime.min.time())
    epochtime = (well_date - datetime(1970, 1, 1)).total_seconds()
    data += [x,y,epochtime]

# Use a context manager so the output file is flushed and closed deterministically.
with open('data/data.bin', 'wb') as out_file:
    array.array('f', data).tofile(out_file)

# Experimental - Lat / Lon / Time / Active

In [None]:
# Extract well data from database and write to .bin file
# Each record is written as four float32 values: x, y, epochtime, active (1 or 0).

import db_settings
import psycopg2, psycopg2.extras, ast, json, math, array
from datetime import datetime
result_file = 'data/decay.bin'

# One row per api (DISTINCT ON keeps the first row in ORDER BY order).
# Rows without a location are excluded, matching the plain export query: the
# cleaning step sets out-of-state locations to NULL, and a NULL GeoJSON string
# cannot be parsed below.
query = """
SELECT DISTINCT ON(api) api, ST_AsGeoJSON(location) AS location, 
date,
date_part('year', date) AS year,
date_part('month', date) AS month,
date_part('day', date) AS day,
CASE WHEN status = 'ACTIVE' THEN 'True' ELSE 'False' END AS active
FROM wells
WHERE type IN ('OIL', 'OILANDGAS', 'GAS')
AND date IS NOT NULL
AND location IS NOT NULL
ORDER BY api, date, capture_time DESC;
"""
data = []
conn = psycopg2.connect(database=db_settings.DB, user=db_settings.USER, password=db_settings.PASSWD, host=db_settings.HOST)
try:
    # `with conn` only scopes the transaction; the connection is closed in `finally`.
    with conn:
        # A named cursor is used so rows are iterated rather than fetched in one call.
        with conn.cursor(name='wells_cur', cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(query)
            for record in cur:
                # The query guarantees a non-NULL date. A dedicated name is used so the
                # `date` class imported at the top of the notebook is not overwritten.
                well_date = datetime(int(record['year']), int(record['month']), int(record['day']))
                epochtime = (well_date - datetime(1970, 1, 1)).total_seconds()
                value = 1 if record['active'] == 'True' else 0
                # ST_AsGeoJSON returns JSON text, so parse it as JSON.
                point = json.loads(record['location'])
                lon, lat = point['coordinates'][0], point['coordinates'][1]
                # Same Mercator pixel projection as LonLatToPixelXY, inlined so this
                # cell does not depend on earlier cells having been run.
                x = (lon + 180.0) * 256.0 / 360.0
                y = 128.0 - math.log(math.tan((lat + 90.0) * math.pi / 360.0)) * 128.0 / math.pi
                data += [x, y, epochtime, value]
finally:
    conn.close()

# Use a context manager so the output file is flushed and closed deterministically.
with open(result_file, 'wb') as out_file:
    array.array('f', data).tofile(out_file)
# Four floats per record; integer division keeps the count an int.
print ('wrote', str(len(data) // 4), 'records to', result_file)