# TMC data - preprocessing

Read in the raw xlsx spreadsheets, reshape them into a single long-format table, and save the result to csv.  

issue: https://github.com/CityofToronto/bdit_data-sources/issues/377  

In [1]:
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd 
# import configparser
# from psycopg2 import connect
# import psycopg2.sql as pg
# import pandas.io.sql as pandasql
import numpy as np 
import datetime
# import datetime as dt
# import rick

import importlib
import matplotlib.ticker as ticker
import matplotlib.font_manager as font_manager
import matplotlib.dates as mdates
from matplotlib.lines import Line2D # for legend

In [2]:
# Register pandas' date/time formatters and converters with matplotlib so that
# pandas datetime values can be plotted directly on an axis.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Config

In [3]:
# # JUPYTERHUB
# CONFIG=configparser.ConfigParser()
# CONFIG.read('/home/cnangini/db.cfg')
# dbset=CONFIG['DBSETTINGS']
# con=connect(**dbset)
# i=0

# Palette

# Dictionaries

# Functions

## Line Chart

In [4]:
def get_ypos(df, sett):
    '''Return the y-position for labels: overall data maximum plus 2.5%, truncated to int.

    The first column of `df` is skipped (it is not part of the maximum);
    every remaining column contributes. `sett` is accepted but not used.
    '''
    value_cols = list(df)[1:]
    peak = df[value_cols].max().max()
    return int(peak + 0.025 * peak)

In [5]:
def get_install_dates(segment):
    '''Return the installation date limits for the given segment.

    Parameters
    ----------
    segment : value matched against the `segment` column of
        rapidto.miovision_segment_info_split.

    Returns
    -------
    list
        A one-element list ``[[min_date, max_date]]`` holding the lower and
        upper bound of `install` (converted with ``pd.to_datetime``) for the
        first row of the result, ordered by lower bound.

    Raises
    ------
    IndexError
        If the query returns no rows for `segment`.

    Notes
    -----
    Relies on a module-level database connection named `con`.
    NOTE(review): the `%(segment)s` placeholder is the psycopg2 parameter
    style; this presumes `con` is a psycopg2 connection as in the
    commented-out config cell -- confirm before using another driver.
    '''
    # Bind `segment` as a query parameter instead of formatting it into the
    # SQL text, so quoting is handled by the driver.
    query = '''SELECT segment,
        lower(install) as min_date,
        upper(install) as max_date
        FROM rapidto.miovision_segment_info_split
        WHERE segment = %(segment)s
        GROUP BY segment, lower(install), upper(install)
        ORDER BY lower(install);
        '''

    with con:
        # pd.read_sql is the same function as pandas.io.sql.read_sql and is
        # available through the notebook's existing pandas import.
        df = pd.read_sql(query, con, params={'segment': segment})

    lims = [[pd.to_datetime(df['min_date'].iloc[0]),
             pd.to_datetime(df['max_date'].iloc[0])]]

    return lims

In [6]:
def find_weekend_indices(df):
    '''Return [start, end] pairs for every weekend entry in the first column.

    For each value in the first column of `df` except the last, if that value
    falls on a Saturday or Sunday (``weekday() >= 5``) the pair
    ``[value, next_value]`` is added to the result. The last value is never
    used as a start because it has no successor.

    Rows are walked by position, so the frame's index labels do not matter
    (e.g. a filtered frame whose index no longer starts at 0 works).

    Parameters
    ----------
    df : pandas dataframe whose first column holds datetime-like values.

    Returns
    -------
    list of [start, end] lists, in row order.
    '''
    stamps = list(df.iloc[:, 0])
    return [[start, end]
            for start, end in zip(stamps, stamps[1:])
            if start.weekday() >= 5]

In [7]:
def my_linechart(df_orig, sett):
    """Create a line chart with one line per entry in ``sett['data']``.

    The x-axis tick labels must be adjusted manually by the caller.

    Parameters
    -----------
    df_orig : pandas dataframe
        Data for the line chart. The first column holds the x values and each
        following column is one line. It must have exactly
        ``len(sett['data']) + 1`` columns because the columns are renamed
        internally. The input is not modified (a copy is used).
    sett : dict
        Styling and annotation parameters.

        Required:
            'data' : dict keyed 0..n-1; each entry has 'lc' (line colour) and
                'ls' (line style), plus 'leg' (legend text) when a legend is
                drawn.

        Optional:
            'lw' : line width (default 2)
            'fontsize' : font size for the y label and legend (default 9)
            'ymin' : lower y limit (default 0)
            'ylab' : y-axis label
            'major_loc', 'minor_loc' : when both are present, 'minor_loc' is
                installed as the x-axis minor locator
            'legend_label' : draw a legend when equal to True
            'legend_pos' : legend location (default 'upper left')
            'legend_ncol' : legend columns (default: number of df columns)
            'legend_font' : legend font size (default: fontsize)
            'shaded' : dict keyed 0..m-1; each entry has 'lims' (list of
                [left, right] bounds), 'c' (face colour), and optionally
                'zorder' (default 0), 'alpha' (default 1) and 'label'
                (dict with 'x', 'y', 'text', 'colour', 'fontsize').

        The keys 'yinc' and 'align' are ignored; they were never applied to
        the figure.

    Returns
    --------
    fig
        Matplotlib fig object
    ax
        Matplotlib ax object

    """
    df = df_orig.copy()

    # -------------------------
    # Styling params
    lw = sett['lw'] if 'lw' in sett else 2
    fontsize = sett['fontsize'] if 'fontsize' in sett else 9

    # -------------------------
    # Setup the figure
    fig, ax = plt.subplots(1)
    fig.set_size_inches(18, 5)
    ax.tick_params(width=1, length=2)
    # format y-axis values with comma
    ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))
    mpl.rcParams['axes.linewidth'] = 0.3  # set the value globally

    # The on/off flag is passed positionally: its keyword was renamed from
    # `b` to `visible` across Matplotlib releases, so neither name is portable.
    ax.grid(True, which='major', color='gray', linestyle='-')
    ax.grid(True, which='minor', color='#D3D3D3', linestyle='--')

    # -------------------------
    # Define line-number-dependent params
    num_lines = len(sett['data'].keys())

    col_names = ['xcol'] + ['ycol_' + str(n) for n in range(num_lines)]
    ymax_array = [df.iloc[:, n + 1].max() for n in range(num_lines)]

    df.columns = col_names
    ymax = np.max(ymax_array)

    # -------------------------
    # y-axis: leave 10% headroom above the largest value
    ymax = ymax + ymax*.1
    ymin = sett['ymin'] if 'ymin' in sett else 0

    ax.set_ylim(top=ymax, bottom=ymin)
    if 'ylab' in sett:
        ax.set_ylabel(sett['ylab'], fontsize=fontsize)

    # Format x-axis ticks
    # NOTE(review): 'major_loc' only gates the minor locator here; the major
    # locator itself is never applied -- confirm whether that is intended.
    if 'major_loc' in sett:  # x-values are dates
        if 'minor_loc' in sett:
            ax.xaxis.set_minor_locator(sett['minor_loc'])

    # --------------------------
    # Plot data
    for n in range(num_lines):
        ax.plot(df['xcol'], df['ycol_' + str(n)], linewidth=lw,
                color=sett['data'][n]['lc'], linestyle=sett['data'][n]['ls'])

    # --------------------------
    # Legend
    if 'legend_label' in sett and sett['legend_label'] == True:
        legend_pos = sett['legend_pos'] if 'legend_pos' in sett else 'upper left'
        ncol = sett['legend_ncol'] if 'legend_ncol' in sett else len(df.columns)

        # Proxy artists so the legend shows each line's colour and style
        leg_array = []
        custom_lines = []
        for n in range(num_lines):
            leg_array.append(sett['data'][n]['leg'])
            custom_lines.append(
                Line2D([0], [0], color=sett['data'][n]['lc'], lw=lw,
                       linestyle=sett['data'][n]['ls'])
            )

        leg_font = sett['legend_font'] if 'legend_font' in sett else fontsize
        ax.legend(custom_lines, leg_array, loc=legend_pos,
                  prop={"size": leg_font},
                  ncol=ncol, handlelength=3)

    # --------------------------
    # Shaded areas
    if 'shaded' in sett:
        num_a = len(sett['shaded'].keys())

        for area in range(num_a):
            area_sett = sett['shaded'][area]
            facecolour = area_sett['c']
            zorder = area_sett['zorder'] if 'zorder' in area_sett else 0
            alpha = area_sett['alpha'] if 'alpha' in area_sett else 1

            # Shaded area left and right bds
            for bd1, bd2 in area_sett['lims']:
                ax.axvspan(bd1, bd2,
                           facecolor=facecolour, edgecolor='none', alpha=alpha,
                           zorder=zorder)

            # Shaded area label
            if 'label' in area_sett:
                label = area_sett['label']
                ax.text(
                    label['x'],  # x posn of label
                    label['y'],  # y posn of label
                    label['text'],
                    color=label['colour'],
                    fontsize=label['fontsize']
                )

    return fig, ax

# I Define file list
The file names listed below are the output of `ls` in the data directory (`data_dir`).  

In [8]:
# Directory holding the input .xlsx files; the combined csv is written here as well.
# NOTE(review): hardcoded, user-specific path -- adjust before running elsewhere.
data_dir='~/Documents/PROJECTS/BDITTO/WYS/VALIDATION/PREPROCESSED/'

In [9]:
# Spreadsheets to process, one per observation; names follow
# '<YYYY-MM-DD>_<LOCATION>.xlsx' and are resolved relative to data_dir.
fname_list=[
    '2020-04-27_MAIN SOUTHBOUND NORTH OF SWANWICK.xlsx', 
    '2020-05-07_GATEWAY SOUTH EASTBOUND EAST OF DON MILLS.xlsx',
    '2020-04-28_NEWPORT EASTBOUND WEST OF AUGUST.xlsx', 
    '2020-05-11_UNDERHILL NORTHBOUND SOUTH OF MONARCHWOOD.xlsx',
    '2020-04-29_PHARMACY NORTHBOUND SOUTH OF DENTON.xlsx', 
    '2020-05-12_STRATFORD WESTBOUND WEST OF MILDENHALL.xlsx',
    '2020-04-30_DENTON WESTBOUND EAST OF BYNG.xlsx',
    '2020-05-13_BYNG NORTHBOUND SOUTH OF DENTON.xlsx',
    '2020-05-04_GRENOBLE WESTBOUND WEST OF LEEWARD GLENWAY EAST.xlsx', 
    '2020-05-14_PHARMACY SOUTHBOUND NORTH OF DENTON.xlsx',
    '2020-05-05_JONES SOUTHBOUND SOUTH OF HUNTER.xlsx',
    '2020-05-19_BELSIZE WESTBOUND EAST OF CHESTON.xlsx',
    '2020-05-06_JONES SOUTHBOUND SOUTH OF HARCOURT.xlsx', 
    '2020-05-20_CHESTON SOUTHBOUND NORTH OF BELSIZE.xlsx'
]

# II Loop through all files and output data in one dataframe
Output dataframe defined as `columns=['datetime_bin', 'location', 'class_type','count']`   
where:

- `datetime_bin` taken from `Time` column in each file  
- `location` taken from the `Location:` line at the top of each file, e.g. `Location:	Main Street Southbound North of Swanwick Avenue`  
- `class_type = [Cars,Trucks,Bicycles,Transit,Pedestrians,Other]` as read from the column headers in each file  
- `count` taken from the values in each `class_type` in the file  

In [10]:
from datetime import timedelta

def _pm_to_24h(t):
    '''Shift a 12-hour afternoon clock time to 24-hour notation (12:xx stays 12:xx).'''
    return t if t.hour >= 12 else t.replace(hour=t.hour + 12)


def read_xls(fnames=None, base_dir=None):
    '''Read a set of xlsx files and stack their counts into one long dataframe.

    For every spreadsheet the count table (header on the 10th row) is read,
    the times in the second section of the table are converted to 24-hour
    notation, the observation date is attached to every time, and one output
    row is produced per (time bin, class type).

    Parameters
    ----------
    fnames : list of str, optional
        Spreadsheet file names. Defaults to the module-level `fname_list`.
    base_dir : str, optional
        Directory prefix concatenated in front of each file name. Defaults to
        the module-level `data_dir`.

    Returns
    -------
    pandas dataframe
        Columns ``['datetime_bin', 'location', 'class_type', 'count']``. Rows
        are ordered by file, then class type, then time.

    Raises
    ------
    IndexError
        If a spreadsheet has no repeated 'Time' header row marking the start
        of the second section.

    Notes
    -----
    Times in the second section get 12 hours added only when their hour is
    below 12. A 12:xx entry is therefore kept as 12:xx instead of wrapping
    around to 00:xx.
    Progress (file name, location, date, running row count) is printed.
    '''
    if fnames is None:
        fnames = fname_list
    if base_dir is None:
        base_dir = data_dir

    # Accumulators for the output columns, filled across all files
    timebin_array = []
    location_array = []
    classtype_array = []
    count_array = []

    for idx, fname in enumerate(fnames, start=1):
        print('File ' + str(idx) + ': ' + fname)

        # 1. Open the workbook once and parse it three ways: the count table,
        #    and the header rows carrying the location and observation date.
        with pd.ExcelFile(base_dir + fname) as xls:
            this_df = xls.parse(skiprows=9).dropna().reset_index(drop=True)
            this_loc = list(xls.parse(skiprows=3))[1]
            obs_date = list(xls.parse(skiprows=5))[1]
        class_types = list(this_df)[1:]  # class types in spreadsheet

        # 2. Convert times to 24 hour notation in the section after the
        #    repeated header row (the row where this_df['Time']=='Time')
        header_idx = this_df.loc[this_df['Time'] == 'Time'].index.values[0]
        after_header = this_df.index.values > header_idx
        this_df.loc[after_header, 'Time'] = (
            this_df.loc[after_header, 'Time'].apply(_pm_to_24h))

        print(this_loc)
        print(obs_date.date())
        print('')

        # 3. Drop second column headers (copy so the assignment below acts on
        #    an independent frame rather than a slice)
        this_df = this_df[this_df.Time != 'Time'].copy()

        # 4. Add obs_date to times in Time col
        this_df['Time'] = this_df['Time'].apply(
            lambda x: datetime.datetime.combine(obs_date, x))

        # 5. Append this file's values, one run of rows per class type
        times = this_df['Time'].values
        n_rows = len(times)
        for class_type in class_types:
            location_array.extend([this_loc] * n_rows)
            classtype_array.extend([class_type] * n_rows)
            timebin_array.extend(times)
            count_array.extend(this_df[class_type].values)
            print('len count_array: ' + str(len(count_array)))

        print('')

    # 6. Build the return df from the accumulated columns
    return pd.DataFrame(
        {
            'datetime_bin': timebin_array,
            'location': location_array,
            'class_type': classtype_array,
            'count': count_array,
        },
        columns=['datetime_bin', 'location', 'class_type', 'count'],
    )

In [11]:
# Combine every spreadsheet in fname_list into one long-format dataframe.
df_all=read_xls()

File 1: 2020-04-27_MAIN SOUTHBOUND NORTH OF SWANWICK.xlsx
Main Street Southbound North of Swanwick Avenue
2020-04-27

len count_array: 48
len count_array: 96
len count_array: 144
len count_array: 192
len count_array: 240
len count_array: 288

File 2: 2020-05-07_GATEWAY SOUTH EASTBOUND EAST OF DON MILLS.xlsx
Gateway Boulevard South Eastbound East of Don Mills Road
2020-05-07

len count_array: 371
len count_array: 454
len count_array: 537
len count_array: 620
len count_array: 703
len count_array: 786

File 3: 2020-04-28_NEWPORT EASTBOUND WEST OF AUGUST.xlsx
Newport Avenue Eastbound West of August Avenue
2020-04-28

len count_array: 834
len count_array: 882
len count_array: 930
len count_array: 978
len count_array: 1026
len count_array: 1074

File 4: 2020-05-11_UNDERHILL NORTHBOUND SOUTH OF MONARCHWOOD.xlsx
Underhill Drive Northbound South of Monarchwood Crescent
2020-05-11

len count_array: 1148
len count_array: 1222
len count_array: 1296
len count_array: 1370
len count_array: 1444
len c

In [12]:
# Row count if each of the 14 files contributed 288 rows (the running total after the first file).
288*14

4032

In [13]:
# Actual size of the combined table, followed by a look at its contents.
print(df_all.shape)
df_all

(5472, 4)


Unnamed: 0,datetime_bin,location,class_type,count
0,2020-04-27 10:00:00,Main Street Southbound North of Swanwick Avenue,Cars,6
1,2020-04-27 10:05:00,Main Street Southbound North of Swanwick Avenue,Cars,8
2,2020-04-27 10:10:00,Main Street Southbound North of Swanwick Avenue,Cars,14
3,2020-04-27 10:15:00,Main Street Southbound North of Swanwick Avenue,Cars,8
4,2020-04-27 10:20:00,Main Street Southbound North of Swanwick Avenue,Cars,15
5,2020-04-27 10:25:00,Main Street Southbound North of Swanwick Avenue,Cars,13
6,2020-04-27 10:30:00,Main Street Southbound North of Swanwick Avenue,Cars,16
7,2020-04-27 10:35:00,Main Street Southbound North of Swanwick Avenue,Cars,15
8,2020-04-27 10:40:00,Main Street Southbound North of Swanwick Avenue,Cars,23
9,2020-04-27 10:45:00,Main Street Southbound North of Swanwick Avenue,Cars,13


In [14]:
# sort by datetime_bin
# (the sorted frame is only displayed here; df_all itself is left unchanged)
df_all.sort_values('datetime_bin').reset_index(drop=True)

Unnamed: 0,datetime_bin,location,class_type,count
0,2020-04-27 10:00:00,Main Street Southbound North of Swanwick Avenue,Cars,6
1,2020-04-27 10:00:00,Main Street Southbound North of Swanwick Avenue,Other,0
2,2020-04-27 10:00:00,Main Street Southbound North of Swanwick Avenue,Pedestrians,0
3,2020-04-27 10:00:00,Main Street Southbound North of Swanwick Avenue,Transit,1
4,2020-04-27 10:00:00,Main Street Southbound North of Swanwick Avenue,Bicycles,0
5,2020-04-27 10:00:00,Main Street Southbound North of Swanwick Avenue,Trucks,0
6,2020-04-27 10:05:00,Main Street Southbound North of Swanwick Avenue,Cars,8
7,2020-04-27 10:05:00,Main Street Southbound North of Swanwick Avenue,Other,0
8,2020-04-27 10:05:00,Main Street Southbound North of Swanwick Avenue,Pedestrians,0
9,2020-04-27 10:05:00,Main Street Southbound North of Swanwick Avenue,Transit,1


# III Check size

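The size of the final dataframe should be number of files * (number of data lines in each file * number of class types), assuming every file has 24 + 24 time bins and 6 class types. If the actual row count printed below is larger than this (as in the run recorded here), at least one file contains more than 48 time bins per class type; the `len count_array` increments printed by `read_xls` show which files those are. 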

In [15]:
# Expected rows: files * ((24 + 24) time bins per file * 6 class types).
len(fname_list) * ((24 + 24) * 6)

4032

In [16]:
# Actual (rows, columns) of the combined table, for comparison with the expected row count.
print(df_all.shape)

(5472, 4)


# IV Save to csv

In [17]:
# Write the table sorted by datetime_bin into data_dir; fields are separated by ';'
# and the dataframe index is not written.
savename='kertcher_all.csv'
df_all.sort_values('datetime_bin').reset_index(drop=True).to_csv(data_dir + savename, sep=';',index=False)