# Convert Postgres Data to Zips

This notebook hacks `traffic_prophet` to convert the 15-minute bin volume data table `czhu.btp_centreline_volumes` into zip files of text files. One zip file is written per year and direction (positive or negative), and each zip contains one text file per centreline ID.

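Each text file is tab-delimited and is written without a header row (the header line in the example below is shown for reference only). The columns are: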

```
nonsense	centreline_id	dir_bin	count_bin	volume	count_type
1921:43951732	8237152	-1	28-Jan-2016 00:00:00	50	1
1922:43951733	8237152	-1	28-Jan-2016 00:15:00	33	1
...
```

In [21]:
%matplotlib inline
import sys
sys.path.append('./bdit_traffic_prophet/')
import importlib
import matplotlib.pyplot as plt
import numpy as np
import knowyourdata as kyd
import zipfile
import warnings

import pandas as pd
from traffic_prophet import cfg
import pathlib, os
import configparser

from traffic_prophet import connection
from traffic_prophet.countmatch import reader

# Default Matplotlib colour cycle.
defaultcolours = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Database settings and API tokens are read from ~/.charlesconfig.
filepath = pathlib.Path.home().joinpath('.charlesconfig')
if not os.path.isfile(filepath):
    # Fail with an actionable message rather than a bare ValueError.
    raise ValueError(
        "configuration file not found: {0}".format(filepath))

# Connections to the 15-minute volume table and the centreline lon-lat table.
vol_conn = connection.Connection(filepath, 'POSTGRES',
                                 'czhu.btp_centreline_volumes')
ll_conn = connection.Connection(filepath, 'POSTGRES',
                                'czhu.btp_centreline_lonlat')

# Mapbox / Plotly credentials come from the same configuration file.
config = configparser.RawConfigParser()
config.read(filepath.as_posix())
MAPBOX_TOKEN = config['MAPBOX']['token']
PLOTLY_USER = config['PLOTLY']['user']
PLOTLY_KEY = config['PLOTLY']['key']

In [2]:
from traffic_prophet.countmatch import reader

In [17]:
# Collect the centreline IDs of HW401 counts from the yearly TEPs zip
# archives (negative and positive directions, 2006 through 2016).
zipsn = [
    "../../VolumeModel/TEPS-EEDrun/PRTCS/negative/15min_counts_{0}.zip".format(year)
    for year in range(2006, 2017)
]
zipsp = [
    "../../VolumeModel/TEPS-EEDrun/PRTCS/positive/15min_counts_{0}.zip".format(year)
    for year in range(2006, 2017)
]
zips = zipsn + zipsp
rdr = reader.ReaderZip(zips)

# Each entry is [filename, centreline_id] for a count whose filename
# contains 're'.
re_centrelines = []
for zf in rdr.source:
    for c in rdr.get_zipreader(zf):
        if 're' not in c['filename']:
            continue
        re_centrelines.append([c['filename'], c['centreline_id']])

# De-duplicate the IDs.
re_centreline_ids = list({entry[1] for entry in re_centrelines})

In [28]:
# Centreline IDs are the first column of the header-less mid_f_point.csv.
df = pd.read_csv(
    "../../VolumeModel/TEPS-EEDrun/PRTCS/negative/mid_f_point.csv",
    header=None,
)
midpoint_centerline_ids = list(df.iloc[:, 0].values)

In [33]:
# Centreline IDs are the 'centreline' column of Landuse_pop_lane_speed.xlsx.
df = pd.read_excel(
    "../../VolumeModel/TEPS-EEDrun/PRTCS/negative/locals/Landuse_pop_lane_speed.xlsx"
)
landuse_centreline_ids = list(df.loc[:, 'centreline'].values)

In [35]:
# Keep only the centreline IDs present in both the midpoint file and the
# land use spreadsheet.
available_teps_centreline_ids = list(
    set(midpoint_centerline_ids) & set(landuse_centreline_ids))

In [36]:
class ReaderPostgresRaw(reader.ReaderBase):
    """Hacked method of accessing the raw 15-minute bin table from Postgres.

    Rows are read one year at a time from the table named by
    ``self.source.tablename``, through a connection obtained from
    ``self.source.connect()``.  They can be iterated per
    (centreline ID, direction) pair with `get_pgreader`, or dumped to
    TEPs-style zipped text files with `write_db_to_zip`.
    """

    # Column order of the tab-delimited TEPs text files.
    _TEPS_COLUMNS = ['Nonsense', 'Centreline ID', 'Direction',
                     'Timestamp', '15-minute Volume', 'Count Type']

    def get_pgreader(self, year):
        """Yield one year of 15-minute counts per centreline ID and direction.

        Parameters
        ----------
        year : int
            Year to extract; interpolated into the query and compared with
            ``EXTRACT(year from count_bin)``.

        Yields
        ------
        dict
            Keys are 'filename' (always the dummy value 'fromPG'),
            'centreline_id' (int), 'direction' (int), 'data' (a DataFrame
            with columns 'Timestamp', '15-minute Volume' and 'Count Type')
            and 'year' (the `year` argument, unchanged).
        """
        sql_cmd = (
            ("SELECT centreline_id, dir_bin, count_bin, volume, count_type "
             "FROM {dbt} WHERE EXTRACT(year from count_bin) = {year} "
             "ORDER BY centreline_id, dir_bin, count_bin")
            .format(dbt=self.source.tablename, year=year))

        # The full result is loaded into memory here, so the connection only
        # needs to stay open for the query and not while the caller consumes
        # the generator.
        with self.source.connect() as db_con:
            all_data = pd.read_sql(sql_cmd, db_con,
                                   parse_dates=['count_bin', ])

        for key, df in all_data.groupby(['centreline_id', 'dir_bin']):
            centreline_id, direction = key

            data = df[['count_bin', 'volume', 'count_type']].copy()
            data.columns = ['Timestamp', '15-minute Volume', 'Count Type']
            data.reset_index(drop=True, inplace=True)

            # Filename is used to flag for HW401 data in Arman's zip files,
            # so just pass a dummy value here.  Note that we can't use
            # 'postgres' here since it contains 're'!
            yield {'filename': 'fromPG',
                   'centreline_id': int(centreline_id),
                   'direction': int(direction),
                   'data': data,
                   'year': year}

    @classmethod
    def _to_teps_text(cls, tc):
        """Return the tab-delimited TEPs text for one `get_pgreader` item."""
        data = tc['data']
        # Convert to DD-MMM-YYYY HH:MM:SS format favoured by Matlab.  The
        # time is spelled out as %H:%M:%S because the %T shorthand is not
        # supported by strftime on every platform.
        data['Timestamp'] = data['Timestamp'].dt.strftime("%d-%b-%Y %H:%M:%S")
        data['Nonsense'] = '999:9999999'
        data['Centreline ID'] = tc['centreline_id']
        data['Direction'] = tc['direction']
        # Output to csv, but dump to string instead of file.
        return data[cls._TEPS_COLUMNS].to_csv(
            None, sep='\t', na_rep='N/A', header=False, index=False)

    def write_db_to_zip(self, year, fpath="./"):
        """Write a year's worth of 15-minute bins to two zip archives.

        Creates ``15min_counts_<year>_positive.zip`` (direction > 0) and
        ``15min_counts_<year>_negative.zip`` (all other directions) in
        `fpath`.  Each archive holds one header-less, tab-delimited text file
        per centreline ID, named ``<centreline_id>_99999_<year>.txt``.

        Centreline IDs listed in the notebook-level `re_centreline_ids`
        (HW401), or missing from `available_teps_centreline_ids`, are skipped
        with a warning.

        Parameters
        ----------
        year : int
            Year to export; passed to `get_pgreader`.
        fpath : str, optional
            Directory in which to create the archives.  A trailing path
            separator is optional.  Default: current directory.
        """
        # Build the lookup sets once; the filters below run for every
        # (centreline ID, direction) group.
        hw401_ids = set(re_centreline_ids)
        teps_ids = set(available_teps_centreline_ids)

        positive_path = os.path.join(
            fpath, "15min_counts_{0}_positive.zip".format(year))
        negative_path = os.path.join(
            fpath, "15min_counts_{0}_negative.zip".format(year))

        # The context managers guarantee both archives are finalised and
        # closed, even if an error interrupts the export.
        with zipfile.ZipFile(positive_path, 'w') as fhzp, \
                zipfile.ZipFile(negative_path, 'w') as fhzn:
            for tc in self.get_pgreader(year):
                # Control sequence to prevent centreline_ids on HW401 and
                # those with no land use data from being included in zip.
                if tc['centreline_id'] in hw401_ids:
                    warnings.warn("{0} found in HW401 IDs!".format(tc['centreline_id']))
                    continue
                elif tc['centreline_id'] not in teps_ids:
                    warnings.warn("{0} doesn't have TEPs land use/geographic data!".format(tc['centreline_id']))
                    continue

                datastr = self._to_teps_text(tc)

                filename = "{0}_99999_{1}.txt".format(tc['centreline_id'], year)
                if tc['direction'] > 0:
                    fhzp.writestr(filename, datastr)
                else:
                    fhzn.writestr(filename, datastr)

In [37]:
# Reader over the raw 15-minute volume table, built on the `vol_conn` connection.
pgreader = ReaderPostgresRaw(vol_conn)

In [38]:
# Export the 2017 counts to positive/negative zips in the current directory.
pgreader.write_db_to_zip(2017)

In [39]:
# Export the 2018 counts to positive/negative zips in the current directory.
pgreader.write_db_to_zip(2018)