# TMC data - preprocessing

Read in raw xslx spreadsheets, format to a proper table, and save to csv.  

issue: https://github.com/CityofToronto/bdit_data-sources/issues/377  

In [1]:
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd 
# import configparser
# from psycopg2 import connect
# import psycopg2.sql as pg
# import pandas.io.sql as pandasql
import numpy as np 
import datetime
# import datetime as dt
# import rick

import importlib
import matplotlib.ticker as ticker
import matplotlib.font_manager as font_manager
import matplotlib.dates as mdates
from matplotlib.lines import Line2D # for legend

In [2]:
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Config

In [3]:
# # JUPYTERHUB
# CONFIG=configparser.ConfigParser()
# CONFIG.read('/home/cnangini/db.cfg')
# dbset=CONFIG['DBSETTINGS']
# con=connect(**dbset)
# i=0

# Palette

# Dictionaries

# Functions

## Line Chart

In [4]:
def get_ypos(df, sett):
    '''Output y-position of labels based on max of all data.'''
    
    label_ypos=(int(df[list(df)[1:]].max().max() + 
                        0.025*(df[list(df)[1:]].max().max())))
    
    return label_ypos

In [5]:
def get_install_dates(segment):
    '''Returns limits of installation date for given segment.
    '''
    
    query = '''SELECT segment, 
        lower(install) as min_date, 
        upper(install) as max_date
        FROM rapidto.miovision_segment_info_split
        WHERE segment='{segment}'
        GROUP BY segment, lower(install),upper(install)
        ORDER BY lower(install);
        '''.format(segment=segment)
    
    with con:
        df = pandasql.read_sql(query, con)
        
          
    lims=[[pd.to_datetime(df['min_date'].iloc[0]),
         pd.to_datetime(df['max_date'].iloc[0])]]
    
    return lims

In [6]:
def find_weekend_indices(df):
    xcol=list(df)[0]
    datetime_array=df[xcol]
    
    s = []
    for i in range(len(datetime_array) - 1):
        if datetime_array[i].weekday() >= 5:
            s.append([df[xcol][i], df[xcol][i + 1]])
#             s2.append(df[xcol][i + 1])  

    return s

In [7]:
def my_linechart(df_orig, sett):
    """Creates a line chart. x axis must be modified manually        
    Parameters
    -----------
    df : pandas dataframe
        Data for the line chart.
    sett: settings object containing styling and annotation parameters      
    
    Optional params:
    
    
    Returns 
    --------
    fig
        Matplotlib fig object
    ax 
        Matplotlib ax object
 
    """ 
    df=df_orig.copy()
    
    # -------------------------
    # Styling params
    lw=2 if 'lw' not in sett else sett['lw']
    fontsize=9 if 'fontsize' not in sett else sett['fontsize']
    
    # -------------------------
    # Setup the figure
    fig, ax =plt.subplots(1)
#     fig.set_size_inches(9.6, 2.3)
    fig.set_size_inches(18, 5)   #.set_size_inches(18, 10)
    ax = plt.gca()
    ax.tick_params(width=1, length=2)
    ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}')) # format y-axis values with comma
    mpl.rcParams['axes.linewidth'] = 0.3 # set the value globally
    
#     plt.grid()
    #plt.grid(b=True, which='both', color='0.5', linestyle='-')
    plt.grid(b=True, which='major', color='gray', linestyle='-')
    plt.grid(b=True, which='minor', color='#D3D3D3', linestyle='--')

    
    # -------------------------
    # Define line-number-dependent params
    num_lines=len(sett['data'].keys())
    
    col_names=['xcol']
    ymax_array=[]
    for n in range(num_lines):
        col_names.append('ycol_' + str(n))
        ymax_array.append(df.iloc[:,n+1].max())
        
    

    df.columns=col_names
    ymax=np.max(ymax_array)

    
    #-------------------------
    # y-axis
    ymax = ymax + ymax*.1
    if 'ymin' in sett:
        ymin=sett['ymin']
    else:
        ymin=0
          
    delta = (ymax - ymin)/4
    i = 0
    while True:
        delta /= 10
        i += 1
        if delta < 10:
            break
    if 'yinc' in sett:
        yinc=sett['yinc']
    else:
        yinc = int(round(delta+1)*pow(10,i))
                   
    plt.ylim(top=ymax, bottom=ymin)
    if 'ylab' in sett:
        align= 'center' if 'align' not in sett else sett['align']
        plt.ylabel(sett['ylab'], fontsize=fontsize)
    
    # Format x-axis ticks
    if 'major_loc' in sett: # x-values are dates
        if 'minor_loc' in sett:
            ax.xaxis.set_minor_locator(sett['minor_loc'])

    # --------------------------
    # Plot data
    for n in range(num_lines):
        ax.plot(df['xcol'], df['ycol_' + str(n)], linewidth=lw, 
                color = sett['data'][n]['lc'], linestyle=sett['data'][n]['ls'])

    # --------------------------
    # Legend
    if 'legend_label' in sett:
        if sett['legend_label']==True:
            legend_pos='upper left' if 'legend_pos' not in sett else sett['legend_pos']
            ncol=len(df.columns) if 'legend_ncol' not in sett else sett['legend_ncol']

            leg_array=[]
            custom_lines=[]
            for n in range(num_lines):
                leg_array.append(sett['data'][n]['leg'])
                custom_lines.append(
                    Line2D([0], [0], color=sett['data'][n]['lc'], lw=lw,
                          linestyle=sett['data'][n]['ls'])
                )        


            leg_font=fontsize if 'legend_font' not in sett else sett['legend_font']
            ax.legend(custom_lines, leg_array, loc=legend_pos, 
                      prop={"size": leg_font},
                      ncol=ncol, handlelength=3
                     )
    
    # --------------------------
    # Shaded areas
    if 'shaded' in sett:
        num_a=len(sett['shaded'].keys())
        
        for area in range(num_a):
            idx=sett['shaded'][area]['lims']
            facecolour=sett['shaded'][area]['c']
            zorder=0 if 'zorder' not in sett['shaded'][area] else (
                sett['shaded'][area]['zorder'])
            alpha=1 if 'alpha' not in sett['shaded'][area] else (
                sett['shaded'][area]['alpha'])
            
            # Shaded area left and right bds
            for i in range(len(idx)):
                bd1=idx[i][0]
                bd2=idx[i][1]
                
                ax.axvspan(bd1, bd2, 
                       facecolor=facecolour, edgecolor='none', alpha=alpha,
                          zorder=zorder)
            
            # Shaded area label
            if 'label' in sett['shaded'][area]:
                trans = ax.get_xaxis_transform()
        
                plt.text(                    
                    sett['shaded'][area]['label']['x'], # x posn of label
                    sett['shaded'][area]['label']['y'], # y posn of label
                    sett['shaded'][area]['label']['text'], 
                    color = sett['shaded'][area]['label']['colour'], 
                    fontsize=sett['shaded'][area]['label']['fontsize']
                )

# I Read xlsx data

In [8]:
data_dir='~/Documents/PROJECTS/BDITTO/WYS/VALIDATION/PREPROCESSED/'

In [9]:
fname_list=[
    '2020-04-27_MAIN SOUTHBOUND NORTH OF SWANWICK.xlsx', '2020-05-07_GATEWAY SOUTH EASTBOUND EAST OF DON MILLS.xlsx'
'2020-04-28_NEWPORT EASTBOUND WEST OF AUGUST.xlsx', '2020-05-11_UNDERHILL NORTHBOUND SOUTH OF MONARCHWOOD.xlsx'
'2020-04-29_PHARMACY NORTHBOUND SOUTH OF DENTON.xlsx', '2020-05-12_STRATFORD WESTBOUND WEST OF MILDENHALL.xlsx'
'2020-04-30_DENTON WESTBOUND EAST OF BYNG.xlsx',  '2020-05-13_BYNG NORTHBOUND SOUTH OF DENTON.xlsx'
'2020-05-04_GRENOBLE WESTBOUND WEST OF LEEWARD GLENWAY EAST.xlsx', '2020-05-14_PHARMACY SOUTHBOUND NORTH OF DENTON.xlsx'
'2020-05-05_JONES SOUTHBOUND SOUTH OF HUNTER.xlsx', '2020-05-19_BELSIZE WESTBOUND EAST OF CHESTON.xlsx'
'2020-05-06_JONES SOUTHBOUND SOUTH OF HARCOURT.xlsx', '2020-05-20_CHESTON SOUTHBOUND NORTH OF BELSIZE.xlsx'
]

In [10]:
fname_list

['2020-04-27_MAIN SOUTHBOUND NORTH OF SWANWICK.xlsx',
 '2020-05-07_GATEWAY SOUTH EASTBOUND EAST OF DON MILLS.xlsx2020-04-28_NEWPORT EASTBOUND WEST OF AUGUST.xlsx',
 '2020-05-11_UNDERHILL NORTHBOUND SOUTH OF MONARCHWOOD.xlsx2020-04-29_PHARMACY NORTHBOUND SOUTH OF DENTON.xlsx',
 '2020-05-12_STRATFORD WESTBOUND WEST OF MILDENHALL.xlsx2020-04-30_DENTON WESTBOUND EAST OF BYNG.xlsx',
 '2020-05-13_BYNG NORTHBOUND SOUTH OF DENTON.xlsx2020-05-04_GRENOBLE WESTBOUND WEST OF LEEWARD GLENWAY EAST.xlsx',
 '2020-05-14_PHARMACY SOUTHBOUND NORTH OF DENTON.xlsx2020-05-05_JONES SOUTHBOUND SOUTH OF HUNTER.xlsx',
 '2020-05-19_BELSIZE WESTBOUND EAST OF CHESTON.xlsx2020-05-06_JONES SOUTHBOUND SOUTH OF HARCOURT.xlsx',
 '2020-05-20_CHESTON SOUTHBOUND NORTH OF BELSIZE.xlsx']

In [107]:
from datetime import timedelta

def read_xls():
    '''Read set of xlsx files, extract the relevant data, and format into a proper dataframe.'''
    
    # set up output df
    df=pd.DataFrame(columns=['datetime_bin', 'location', 'class_type','count'])
    
    for fname in fname_list[:1]: # start with one file only
        # 1. Get data from spreadsheet and store in this_df
        this_df=pd.read_excel(data_dir + fname, skiprows=9).dropna().reset_index(drop=True)
        class_types=list(this_df)[1:] # class types in spreadsheet
        n_classes=len(list(this_df)[1:]) # number of classes in spreadsheet
        
        print(class_types)
        print(n_classes)
        
        # Convert all times to 24 hour notation in section after this_df['Time']=='Time'
        delta=datetime.timedelta(hours = 12)

        this_df.loc[this_df.index.values > this_df.loc[this_df['Time']=='Time'].index.values[0], 'Time']=(
            this_df.loc[this_df.index.values > 
                        this_df.loc[this_df['Time']=='Time'].index.values[0], 'Time'].apply(lambda x: (
            datetime.datetime.combine(datetime.date(1,1,1), x) + delta).time()))
        
        
        this_loc=list(pd.read_excel(data_dir + fname, skiprows=3))[1]
        obs_date=list(pd.read_excel(data_dir + fname, skiprows=5))[1]
        
        print(this_loc)
        print(obs_date.date())
        
        # drop second column headers
        this_df = this_df[this_df.Time != 'Time']
        
        # 2. Extract column values of this_df for each class_type
        # and append to one long array for each 
        timebin_array=[]
        location_array=[]
        classtype_array=[]
        count_array=[]
        for class_type in class_types:
            print(class_type)
            location_array = location_array + [this_loc for i in range(len(list(this_df['Time'].values)))]
            classtype_array = classtype_array + [class_type for i in range(len(list(this_df['Time'].values)))]
            timebin_array=timebin_array + list(this_df['Time'].values)
            count_array=count_array + list(this_df[class_type].values)
            
        # 3. Add all the arrays to the return df
        df['datetime_bin']=timebin_array
        df['location']=location_array
        df['class_type']=classtype_array
        df['count']=count_array

        
        
    return df

In [108]:
this_df=read_xls()
this_df

['Cars', 'Trucks', 'Bicycles', 'Transit', 'Pedestrians', 'Other']
6
Main Street Southbound North of Swanwick Avenue
2020-04-27
Cars
Trucks
Bicycles
Transit
Pedestrians
Other


Unnamed: 0,datetime_bin,location,class_type,count
0,10:00:00,Main Street Southbound North of Swanwick Avenue,Cars,6
1,10:05:00,Main Street Southbound North of Swanwick Avenue,Cars,8
2,10:10:00,Main Street Southbound North of Swanwick Avenue,Cars,14
3,10:15:00,Main Street Southbound North of Swanwick Avenue,Cars,8
4,10:20:00,Main Street Southbound North of Swanwick Avenue,Cars,15
5,10:25:00,Main Street Southbound North of Swanwick Avenue,Cars,13
6,10:30:00,Main Street Southbound North of Swanwick Avenue,Cars,16
7,10:35:00,Main Street Southbound North of Swanwick Avenue,Cars,15
8,10:40:00,Main Street Southbound North of Swanwick Avenue,Cars,23
9,10:45:00,Main Street Southbound North of Swanwick Avenue,Cars,13


In [36]:
this_df=read_xls()
this_df

['Cars', 'Trucks', 'Bicycles', 'Transit', 'Pedestrians', 'Other']
6
Main Street Southbound North of Swanwick Avenue
2020-04-27
[array([6, 8, 14, 8, 15, 13, 16, 15, 23, 13, 13, 14, 10, 7, 15, 16, 10, 20,
       12, 21, 16, 21, 14, 18, 16, 18, 15, 8, 15, 12, 18, 20, 21, 17, 18,
       17, 16, 25, 16, 18, 11, 20, 12, 11, 10, 13, 16, 12], dtype=object), array([0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0], dtype=object), array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 2, 0, 0, 0,
       0, 0, 2, 4, 0, 0, 2, 1, 1, 0, 1, 0, 2, 3, 0, 0, 3, 1, 5, 0, 1, 7,
       0, 1, 0, 0], dtype=object), array([1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 2, 0,
       1, 1, 1, 1], dtype=object), array([0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 5, 0, 0, 2, 1, 1, 5, 5, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 2, 3

Unnamed: 0,Time,Cars,Trucks,Bicycles,Transit,Pedestrians,Other
0,10:00:00,6,0,0,1,0,0
1,10:05:00,8,1,0,1,0,0
2,10:10:00,14,2,0,1,0,0
3,10:15:00,8,1,1,1,0,0
4,10:20:00,15,0,0,0,2,0
5,10:25:00,13,0,0,1,0,0
6,10:30:00,16,0,0,0,0,0
7,10:35:00,15,0,1,0,4,0
8,10:40:00,23,0,1,1,0,0
9,10:45:00,13,0,0,1,0,0
