# TMC data - preprocessing

Read in raw xslx spreadsheets, format to a proper table, and save to csv.  

issue: https://github.com/CityofToronto/bdit_data-sources/issues/377  

In [1]:
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd 
# import configparser
# from psycopg2 import connect
# import psycopg2.sql as pg
# import pandas.io.sql as pandasql
import numpy as np 
import datetime
# import datetime as dt
# import rick

import importlib
import matplotlib.ticker as ticker
import matplotlib.font_manager as font_manager
import matplotlib.dates as mdates
from matplotlib.lines import Line2D # for legend

In [2]:
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Config

In [3]:
# # JUPYTERHUB
# CONFIG=configparser.ConfigParser()
# CONFIG.read('/home/cnangini/db.cfg')
# dbset=CONFIG['DBSETTINGS']
# con=connect(**dbset)
# i=0

# Palette

# Dictionaries

# Functions

## Line Chart

In [4]:
def get_ypos(df, sett):
    '''Output y-position of labels based on max of all data.'''
    
    label_ypos=(int(df[list(df)[1:]].max().max() + 
                        0.025*(df[list(df)[1:]].max().max())))
    
    return label_ypos

In [5]:
def get_install_dates(segment):
    '''Returns limits of installation date for given segment.
    '''
    
    query = '''SELECT segment, 
        lower(install) as min_date, 
        upper(install) as max_date
        FROM rapidto.miovision_segment_info_split
        WHERE segment='{segment}'
        GROUP BY segment, lower(install),upper(install)
        ORDER BY lower(install);
        '''.format(segment=segment)
    
    with con:
        df = pandasql.read_sql(query, con)
        
          
    lims=[[pd.to_datetime(df['min_date'].iloc[0]),
         pd.to_datetime(df['max_date'].iloc[0])]]
    
    return lims

In [6]:
def find_weekend_indices(df):
    xcol=list(df)[0]
    datetime_array=df[xcol]
    
    s = []
    for i in range(len(datetime_array) - 1):
        if datetime_array[i].weekday() >= 5:
            s.append([df[xcol][i], df[xcol][i + 1]])
#             s2.append(df[xcol][i + 1])  

    return s

In [7]:
def my_linechart(df_orig, sett):
    """Creates a line chart. x axis must be modified manually        
    Parameters
    -----------
    df : pandas dataframe
        Data for the line chart.
    sett: settings object containing styling and annotation parameters      
    
    Optional params:
    
    
    Returns 
    --------
    fig
        Matplotlib fig object
    ax 
        Matplotlib ax object
 
    """ 
    df=df_orig.copy()
    
    # -------------------------
    # Styling params
    lw=2 if 'lw' not in sett else sett['lw']
    fontsize=9 if 'fontsize' not in sett else sett['fontsize']
    
    # -------------------------
    # Setup the figure
    fig, ax =plt.subplots(1)
#     fig.set_size_inches(9.6, 2.3)
    fig.set_size_inches(18, 5)   #.set_size_inches(18, 10)
    ax = plt.gca()
    ax.tick_params(width=1, length=2)
    ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}')) # format y-axis values with comma
    mpl.rcParams['axes.linewidth'] = 0.3 # set the value globally
    
#     plt.grid()
    #plt.grid(b=True, which='both', color='0.5', linestyle='-')
    plt.grid(b=True, which='major', color='gray', linestyle='-')
    plt.grid(b=True, which='minor', color='#D3D3D3', linestyle='--')

    
    # -------------------------
    # Define line-number-dependent params
    num_lines=len(sett['data'].keys())
    
    col_names=['xcol']
    ymax_array=[]
    for n in range(num_lines):
        col_names.append('ycol_' + str(n))
        ymax_array.append(df.iloc[:,n+1].max())
        
    

    df.columns=col_names
    ymax=np.max(ymax_array)

    
    #-------------------------
    # y-axis
    ymax = ymax + ymax*.1
    if 'ymin' in sett:
        ymin=sett['ymin']
    else:
        ymin=0
          
    delta = (ymax - ymin)/4
    i = 0
    while True:
        delta /= 10
        i += 1
        if delta < 10:
            break
    if 'yinc' in sett:
        yinc=sett['yinc']
    else:
        yinc = int(round(delta+1)*pow(10,i))
                   
    plt.ylim(top=ymax, bottom=ymin)
    if 'ylab' in sett:
        align= 'center' if 'align' not in sett else sett['align']
        plt.ylabel(sett['ylab'], fontsize=fontsize)
    
    # Format x-axis ticks
    if 'major_loc' in sett: # x-values are dates
        if 'minor_loc' in sett:
            ax.xaxis.set_minor_locator(sett['minor_loc'])

    # --------------------------
    # Plot data
    for n in range(num_lines):
        ax.plot(df['xcol'], df['ycol_' + str(n)], linewidth=lw, 
                color = sett['data'][n]['lc'], linestyle=sett['data'][n]['ls'])

    # --------------------------
    # Legend
    if 'legend_label' in sett:
        if sett['legend_label']==True:
            legend_pos='upper left' if 'legend_pos' not in sett else sett['legend_pos']
            ncol=len(df.columns) if 'legend_ncol' not in sett else sett['legend_ncol']

            leg_array=[]
            custom_lines=[]
            for n in range(num_lines):
                leg_array.append(sett['data'][n]['leg'])
                custom_lines.append(
                    Line2D([0], [0], color=sett['data'][n]['lc'], lw=lw,
                          linestyle=sett['data'][n]['ls'])
                )        


            leg_font=fontsize if 'legend_font' not in sett else sett['legend_font']
            ax.legend(custom_lines, leg_array, loc=legend_pos, 
                      prop={"size": leg_font},
                      ncol=ncol, handlelength=3
                     )
    
    # --------------------------
    # Shaded areas
    if 'shaded' in sett:
        num_a=len(sett['shaded'].keys())
        
        for area in range(num_a):
            idx=sett['shaded'][area]['lims']
            facecolour=sett['shaded'][area]['c']
            zorder=0 if 'zorder' not in sett['shaded'][area] else (
                sett['shaded'][area]['zorder'])
            alpha=1 if 'alpha' not in sett['shaded'][area] else (
                sett['shaded'][area]['alpha'])
            
            # Shaded area left and right bds
            for i in range(len(idx)):
                bd1=idx[i][0]
                bd2=idx[i][1]
                
                ax.axvspan(bd1, bd2, 
                       facecolor=facecolour, edgecolor='none', alpha=alpha,
                          zorder=zorder)
            
            # Shaded area label
            if 'label' in sett['shaded'][area]:
                trans = ax.get_xaxis_transform()
        
                plt.text(                    
                    sett['shaded'][area]['label']['x'], # x posn of label
                    sett['shaded'][area]['label']['y'], # y posn of label
                    sett['shaded'][area]['label']['text'], 
                    color = sett['shaded'][area]['label']['colour'], 
                    fontsize=sett['shaded'][area]['label']['fontsize']
                )

# I Read xlsx data

In [8]:
data_dir='~/Documents/PROJECTS/BDITTO/WYS/VALIDATION/PREPROCESSED/'

In [9]:
fname_list=[
    '2020-04-27_MAIN SOUTHBOUND NORTH OF SWANWICK.xlsx', '2020-05-07_GATEWAY SOUTH EASTBOUND EAST OF DON MILLS.xlsx'
'2020-04-28_NEWPORT EASTBOUND WEST OF AUGUST.xlsx', '2020-05-11_UNDERHILL NORTHBOUND SOUTH OF MONARCHWOOD.xlsx'
'2020-04-29_PHARMACY NORTHBOUND SOUTH OF DENTON.xlsx', '2020-05-12_STRATFORD WESTBOUND WEST OF MILDENHALL.xlsx'
'2020-04-30_DENTON WESTBOUND EAST OF BYNG.xlsx',  '2020-05-13_BYNG NORTHBOUND SOUTH OF DENTON.xlsx'
'2020-05-04_GRENOBLE WESTBOUND WEST OF LEEWARD GLENWAY EAST.xlsx', '2020-05-14_PHARMACY SOUTHBOUND NORTH OF DENTON.xlsx'
'2020-05-05_JONES SOUTHBOUND SOUTH OF HUNTER.xlsx', '2020-05-19_BELSIZE WESTBOUND EAST OF CHESTON.xlsx'
'2020-05-06_JONES SOUTHBOUND SOUTH OF HARCOURT.xlsx', '2020-05-20_CHESTON SOUTHBOUND NORTH OF BELSIZE.xlsx'
]

In [10]:
fname_list

['2020-04-27_MAIN SOUTHBOUND NORTH OF SWANWICK.xlsx',
 '2020-05-07_GATEWAY SOUTH EASTBOUND EAST OF DON MILLS.xlsx2020-04-28_NEWPORT EASTBOUND WEST OF AUGUST.xlsx',
 '2020-05-11_UNDERHILL NORTHBOUND SOUTH OF MONARCHWOOD.xlsx2020-04-29_PHARMACY NORTHBOUND SOUTH OF DENTON.xlsx',
 '2020-05-12_STRATFORD WESTBOUND WEST OF MILDENHALL.xlsx2020-04-30_DENTON WESTBOUND EAST OF BYNG.xlsx',
 '2020-05-13_BYNG NORTHBOUND SOUTH OF DENTON.xlsx2020-05-04_GRENOBLE WESTBOUND WEST OF LEEWARD GLENWAY EAST.xlsx',
 '2020-05-14_PHARMACY SOUTHBOUND NORTH OF DENTON.xlsx2020-05-05_JONES SOUTHBOUND SOUTH OF HUNTER.xlsx',
 '2020-05-19_BELSIZE WESTBOUND EAST OF CHESTON.xlsx2020-05-06_JONES SOUTHBOUND SOUTH OF HARCOURT.xlsx',
 '2020-05-20_CHESTON SOUTHBOUND NORTH OF BELSIZE.xlsx']

In [90]:
from datetime import timedelta

def read_xls():
    '''Read set of xlsx files, extract the relevant data, and format into a proper dataframe.'''
    
    # set up output df
    df=pd.DataFrame(columns=['datetime_bin', 'location', 'class_type','count'])
    
    for fname in fname_list[:1]: # start with one file only
        this_df=pd.read_excel(data_dir + fname, skiprows=9).dropna().reset_index(drop=True)
        
        # Find index where second set of data starts
        idx_table2=this_df.loc[this_df['Time']=='Time'].index.values[0]
        print(idx_table2)
        
        # Convert all times after idx_table2 to 24 hour notation
        delta=datetime.timedelta(hours = 12)

        this_df.iloc[idx_table2+1:]['Time']=this_df.iloc[idx_table2+1:]['Time'].apply(lambda x: (
            datetime.datetime.combine(datetime.date(1,1,1), x) + delta).time())
        
        
        loc=list(pd.read_excel(data_dir + fname, skiprows=3))[1]
        obs_date=list(pd.read_excel(data_dir + fname, skiprows=5))[1]
        
        print(loc)
        print(obs_date.date())
        
        # drop second column headers
#         this_df = this_df[this_df.Time != 'Time']
        
    return this_df

In [91]:
this_df=read_xls()
this_df

24
Main Street Southbound North of Swanwick Avenue
2020-04-27


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Time,Cars,Trucks,Bicycles,Transit,Pedestrians,Other
0,10:00:00,6,0,0,1,0,0
1,10:05:00,8,1,0,1,0,0
2,10:10:00,14,2,0,1,0,0
3,10:15:00,8,1,1,1,0,0
4,10:20:00,15,0,0,0,2,0
5,10:25:00,13,0,0,1,0,0
6,10:30:00,16,0,0,0,0,0
7,10:35:00,15,0,1,0,4,0
8,10:40:00,23,0,1,1,0,0
9,10:45:00,13,0,0,1,0,0


In [74]:
this_df.iloc[25:].head()

Unnamed: 0,Time,Cars,Trucks,Bicycles,Transit,Pedestrians,Other
25,04:00:00,16,1,2,0,1,0
26,04:05:00,18,0,4,1,0,0
27,04:10:00,15,0,0,0,1,0
28,04:15:00,8,0,0,2,0,0
29,04:20:00,15,0,2,1,0,0


In [75]:
this_df.iloc[25:]['Time'].apply(lambda x: (datetime.datetime.combine(datetime.date(1,1,1),x) + delta).time())

25    16:00:00
26    16:05:00
27    16:10:00
28    16:15:00
29    16:20:00
30    16:25:00
31    16:30:00
32    16:35:00
33    16:40:00
34    16:45:00
35    16:50:00
36    16:55:00
37    17:00:00
38    17:05:00
39    17:10:00
40    17:15:00
41    17:20:00
42    17:25:00
43    17:30:00
44    17:35:00
45    17:40:00
46    17:45:00
47    17:50:00
48    17:55:00
Name: Time, dtype: object

In [56]:
from datetime import timedelta

this_time=df_test.iloc[25]['Time']
print(this_time)

# this_time.strftime('%H:%M:%S') # + datetime.timedelta(hours=12)

print(this_time)
print(this_time + timedelta(hours=9))

# pd.to_datetime(d).strftime('%H:%M:%S')

04:00:00
04:00:00


TypeError: unsupported operand type(s) for +: 'datetime.time' and 'datetime.timedelta'

In [64]:
delta=datetime.timedelta(hours = 12)

print((datetime.datetime.combine(datetime.date(1,1,1),this_time) + delta).time())

16:00:00
