# Melbourne bicycle data analysis

In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from tqdm.notebook import tqdm
import plotly.express as px
import geopandas as gpd


In [3]:
# Base directory of the bicycle-flow data: it holds the site listing workbook
# read below and the yearly Bicycle_Volume_Speed_* folders used later on.
rootdir = '../../data/DPC/bicycle_flows/'

# Open the Site Number Listing file. os.path.join is used because rootdir
# already ends with a separator, so formatting '{rootdir}/...' doubled it.
sites = pd.read_excel(os.path.join(rootdir, 'VicRoads_Bike_Site_Number_Listing.xlsx'))

# Column names, non-null counts and dtypes of the site listing.
sites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SITE_ID       93 non-null     int64  
 1   TFM_ID        93 non-null     int64  
 2   STRT_LAT      93 non-null     float64
 3   STRT_LONG     93 non-null     float64
 4   GPS           92 non-null     object 
 5   SITE_NAME     91 non-null     object 
 6   TFM_DESC      93 non-null     object 
 7   BEARING_DESC  93 non-null     object 
 8   DATA_SRC_CD   93 non-null     object 
 9   RGN_SHORT_NM  93 non-null     object 
 10  Comments      75 non-null     object 
dtypes: float64(2), int64(2), object(7)
memory usage: 8.1+ KB


In [9]:
# Summary statistics for the numeric site columns (the two ID columns and the
# start latitude/longitude).
sites.describe()

Unnamed: 0,SITE_ID,TFM_ID,STRT_LAT,STRT_LONG
count,93.0,93.0,93.0,93.0
mean,18008.935484,57763.236559,-37.832249,144.998457
std,12974.657259,10612.712297,0.112396,0.082142
min,6411.0,20082.0,-38.50772,144.73627
25%,7596.0,59454.0,-37.83228,144.97364
50%,9999.0,59477.0,-37.80677,144.98755
75%,32615.0,61176.0,-37.78399,145.0281
max,40005.0,70011.0,-37.74322,145.2911


In [10]:
# Plot every site from the listing on a map; hovering a marker shows the site
# name together with its SITE_ID and BEARING_DESC.
fig = px.scatter_mapbox(
    sites,
    lat="STRT_LAT",
    lon="STRT_LONG",
    hover_name="SITE_NAME",
    hover_data=["SITE_ID", "BEARING_DESC"],
    color_discrete_sequence=["fuchsia"],
    zoom=9,
    height=300,
)

# Use OpenStreetMap tiles and remove all margins around the figure.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)

# Save a standalone HTML copy of the map, then render it inline.
fig.write_html('Site_Map.html')
fig.show()


## Data validation 

The bicycle data is delivered as a large number of CSV files, without an accompanying data dictionary.  Before appending them into a master file, we need to be sure that every CSV shares the same columns, so we iterate over the files and tally each unique combination of column names.  Ideally, the result will be a single combination of columns whose tally equals the number of CSV files.  Let's see!

In [2]:
# NOTE(review): this whole validation cell is commented out, yet the next cell
# displays `tally`, and no other visible cell defines it. On a fresh kernel
# that cell will fail with a NameError unless this code is re-enabled or
# `tally` is restored from a cached result.
# NOTE(review): the code below creates the columns "count" and "len", but the
# saved `tally` output shows only "count" -- presumably the output comes from
# an earlier version of this cell; verify before re-enabling. The rootdir used
# here ('../data') also differs from the data path used by the other cells.
#
#rootdir ='../data'
#tally = pd.DataFrame(columns=["count","len"])
#counter = 0
#for subdir, dirs, files in tqdm(os.walk(rootdir)):
#    for file in files:
#        if ('.csv' in file) and ('.zip' not in file):
#            # read in the CSV if it contains records (at least one file doesn't!)
#            file_path = os.path.join(subdir,file)
#            if os.path.getsize(file_path) > 0:
#                df = pd.read_csv(os.path.join(subdir,file))
#                # store the list of columns in variable df_columns as a string
#                df_columns = f"{df.columns.to_list()}"
#                # if the CSV columns string is already in the tally index, increment it
#                if df_columns in tally.index:
#                    tally[tally.index==df_columns] += 1
#                # otherwise add the CSV columns string to the tally index
#                else:
#                    tally.loc[df_columns] = 1
#                # increment a counter; although theoretically this should equal the sum of the tallies!
#                counter+=1
#
#print(counter)

In [8]:
# Show how many CSV files share each distinct list of column names.
# NOTE(review): `tally` is only built by the commented-out validation cell
# above, so this line depends on that code having run in the current kernel.
tally

Unnamed: 0,count
"['DATA_TYPE', 'TIS_DATA_REQUEST', 'SITE_XN_ROUTE', 'LOC_LEG', 'DATE', 'TIME', 'CLASS', 'LANE', 'SPEED', 'WHEELBASE', 'HEADWAY', 'GAP', 'AXLE', 'AXLE_GROUPING', 'RHO', 'VEHICLE', 'DIRECTION']",13229


## Creating a master dataframe

Now that we know that all of the CSVs share the same column names, we will join the various files together to create a master dataframe to run the analysis on. Note that the cells below currently compile only the first entry of `data_years` (2017).

In [3]:
# Names of the yearly data folders located under the bicycle_flows directory.
data_years = ['Bicycle_Volume_Speed_2017',
'Bicycle_Volume_Speed_2018',
'Bicycle_Volume_Speed_2019',
'Bicycle_Volume_Speed_2020',
'Bicycle_Volume_Speed_2021']

# Only the first listed year is scanned here.
rootdir = f'../../data/DPC/bicycle_flows/{data_years[0]}'
csv_files = []

# os.walk yields one entry per directory, so the progress bar counts
# directories rather than individual CSV files.
for subdir, dirs, files in tqdm(os.walk(rootdir),desc="Getting CSV file paths...",unit="dirs"):
    for file in files:
        # Test the actual extension: a substring check would also accept names
        # such as 'x.csv.bak'. The '.zip' exclusion is kept from the original.
        if not file.endswith('.csv') or '.zip' in file:
            continue
        file_path = os.path.join(subdir,file)
        # Record only CSVs that contain data; zero-byte files are skipped.
        if os.path.getsize(file_path) > 0:
            csv_files.append(file_path)

print(f"Identified the locations of {len(csv_files)} valid CSV files to compile!")

Getting CSV file paths...: 0CSVs [00:00, ?CSVs/s]

Identified the locations of 1430 valid CSV files to compile!


In [4]:
# Fail early with an explicit message when the path scan found nothing,
# instead of relying on pd.concat's generic "No objects to concatenate".
if not csv_files:
    raise ValueError(f"No CSV files found under {rootdir}; nothing to compile.")

# Read each CSV into its own frame (first row is the header, no index column).
# The loop variable is named csv_path so it does not shadow the stdlib `csv`
# module name, and the list gets its own name so `dfs` is only ever a DataFrame.
csv_frames = [
    pd.read_csv(csv_path, index_col=None, header=0)
    for csv_path in tqdm(csv_files,desc=f"Reading csv files for {rootdir}...",unit="CSVs")
]

# Stack all frames row-wise into the master dataframe with a fresh RangeIndex.
dfs = pd.concat(csv_frames, axis=0, ignore_index=True)

# Release the per-file frames once they have been copied into the master frame.
del csv_frames

Reading csv files for ../../data/DPC/bicycle_flows/Bicycle_Volume_Speed_2017...:   0%|          | 0/1430 [00:0…

## Summary statistics

Let's look at the summary statistics for the master dataframe! First we'll look at what type of data is included in each column. Then we can check the max/min values, and the distribution of the data, to see if there are any outlying data points. 

In [5]:
# Column names, dtypes and memory footprint of the master dataframe.
# NOTE(review): the saved output lists no non-null counts; presumably pandas
# omits them for a frame of this size -- confirm, and request them explicitly
# if missing values need to be checked.
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9989156 entries, 0 to 9989155
Data columns (total 17 columns):
 #   Column            Dtype  
---  ------            -----  
 0   DATA_TYPE         object 
 1   TIS_DATA_REQUEST  object 
 2   SITE_XN_ROUTE     object 
 3   LOC_LEG           object 
 4   DATE              object 
 5   TIME              object 
 6   CLASS             object 
 7   LANE              object 
 8   SPEED             float64
 9   WHEELBASE         float64
 10  HEADWAY           float64
 11  GAP               float64
 12  AXLE              object 
 13  AXLE_GROUPING     object 
 14  RHO               float64
 15  VEHICLE           object 
 16  DIRECTION         object 
dtypes: float64(5), object(12)
memory usage: 1.3+ GB


In [6]:
# Preview the first rows of the master dataframe to see what a record looks like.
dfs.head()

Unnamed: 0,DATA_TYPE,TIS_DATA_REQUEST,SITE_XN_ROUTE,LOC_LEG,DATE,TIME,CLASS,LANE,SPEED,WHEELBASE,HEADWAY,GAP,AXLE,AXLE_GROUPING,RHO,VEHICLE,DIRECTION
0,IND,208,10223,59444,26/12/2016,03:13:06,15,1,22.2,1.1,0.0,0.1,2,1,1.0,CYCLE,S
1,IND,208,10223,59443,26/12/2016,04:13:44,15,0,28.6,1.0,3637.5,3637.5,2,1,1.0,CYCLE,N
2,IND,208,10223,59443,26/12/2016,05:10:53,15,0,25.3,1.0,3429.8,3429.7,2,1,1.0,CYCLE,N
3,IND,208,10223,59444,26/12/2016,05:11:28,15,1,17.7,1.0,7102.2,7102.1,2,1,1.0,CYCLE,S
4,IND,208,10223,59443,26/12/2016,05:42:05,15,0,30.3,1.0,1872.0,1871.8,2,1,1.0,CYCLE,N


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns: SPEED, WHEELBASE, HEADWAY, GAP and RHO.

dfs.describe()

Unnamed: 0,SPEED,WHEELBASE,HEADWAY,GAP,RHO
count,9989156.0,9989156.0,9989156.0,9989156.0,9989156.0
mean,21.83996,1.030735,189.0292,190.5951,0.9768114
std,6.683758,0.09946097,1348.285,1553.156,0.105566
min,0.3,0.0,0.0,0.0,0.0
25%,17.4,1.0,3.3,3.1,1.0
50%,21.9,1.0,26.7,26.5,1.0
75%,26.4,1.1,113.2,113.0,1.0
max,159.6,7.6,86400.0,569233.1,1.5


In [None]:
# NOTE(review): this cell plots plotly's bundled gapminder sample data, not the
# bicycle data, and it has no execution count -- presumably a template kept
# for a later map; adapt it to the analysis or remove it.
# It relies on the `px` alias from the imports cell at the top of the notebook.
df = px.data.gapminder().query("year == 2007")
fig = px.scatter_geo(df, locations="iso_alpha",
                     size="pop", # size of markers, "pop" is one of the columns of gapminder
                     )
fig.show()