### Reading in data and assessing what sort of information we can get from it

In [1]:
# !pip install sodapy
# !pip install geopandas
# !pip install folium
# !pip install mapclassify

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Imports

In [3]:
# For importing the data and using API
from sodapy import Socrata
import os
import zipfile as zf
import requests
from io import BytesIO

# Working with the data
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
#from sklearn.preprocessing import LabelEncoder
import json

# Creating Visualisations
# import plotly.graph_objs as go
# import plotly.express as px
# Visualisation
import matplotlib.pylab as plt
import folium
from folium import plugins
from folium import GeoJson
import mapclassify
import seaborn as sns
import branca.colormap as cm

#### COM Population Dataset
https://data.melbourne.vic.gov.au/resource/sp4r-xphj.json

In [4]:
domain = "data.melbourne.vic.gov.au"
data_file = 'sp4r-xphj'

In [5]:
apptoken = os.environ.get("SODAPY_APPTOKEN") # Anonymous app token
client = Socrata(domain, apptoken) 



##### View the Population dataset head sample

In [6]:
population_data = pd.DataFrame.from_dict(client.get_all(data_file))
print(population_data.shape)
population_data.head()

(16989, 5)


Unnamed: 0,geography,year,gender,age,value
0,City of Melbourne,2020,Female,Age 0-4,2683
1,City of Melbourne,2021,Female,Age 0-4,2945
2,City of Melbourne,2022,Female,Age 0-4,3212
3,City of Melbourne,2023,Female,Age 0-4,3515
4,City of Melbourne,2024,Female,Age 0-4,3833


In [7]:
def pop_data_by_year(dataset, year):
    """
    Filters and cleans the Population dataset returning a new pandas dataframe focused on the year passed to the function.
    
    Note that the year must be between 2020 and 2040 inclusive. 
    """
    # Extract the colomns of interest into "summary".
    summary = dataset[['geography', 'year', 'value']]
    # Convert datatypes and rename geography to suburb
    summary = summary.astype({'year':int, 'value':float, 'geography':str})
    summary.rename(columns={'geography':'suburb'}, inplace=True)
    # Extract the data matching the year passed from the summary.
    data = summary[summary['year'] == year]
   
    # Grouping the data by suburb while summing the population values. 
    data = pd.DataFrame(data.groupby('suburb')['value'].sum())
    data = data.reset_index()
    # Renaming the column "value" to "population_year" where year represents the year passed.
    data.rename(columns={'value':f'population_{year}'}, inplace=True)
    
    # Cleaning the data and reset indexes
    data['suburb'] = data['suburb'].replace(['Melbourne (CBD)', 'Melbourne (Remainder)'], ['Melbourne', 'Melbourne'])
    data = pd.DataFrame(data.groupby('suburb')[f'population_{year}'].sum())
    data = data.reset_index()
    
    # Removing unrequired data.
    subs_to_delete = ['West Melbourne (Industrial)', 'City of Melbourne']
    subs = [data.index[data['suburb']==sub].tolist()[0] for sub in subs_to_delete]
    data.drop(subs, inplace = True)

    data = data.reset_index(drop=True)
    data['suburb'] = data['suburb'].replace(['West Melbourne (Residential)'], ['West Melbourne'])
    
    # sort data
    data.sort_values('suburb', inplace = True)
    data = data.reset_index(drop=True)
    data['suburb'] = data['suburb'].astype(str)
    
    return data

#### Create dictionary for population data

In [8]:
def create_population_dict(data):
    '''Create dictionary with a datframe for all years in the dataset'''
    population = {}
    years = data['year'].unique().tolist()
    
    for year in years:
        population[int(year)] = year_data = pop_data_by_year(data, int(year))
    
    return population
    

population_layers = create_population_dict(population_data)
population_layers

{2020:              suburb  population_2020
 0           Carlton         68920.81
 1         Docklands         48213.64
 2    East Melbourne         18713.34
 3        Kensington         33948.22
 4         Melbourne        175344.97
 5   North Melbourne         52697.81
 6         Parkville         31881.88
 7    Port Melbourne           141.77
 8       South Yarra         14261.88
 9         Southbank         83619.16
 10   West Melbourne         24884.36,
 2021:              suburb  population_2021
 0           Carlton         66981.24
 1         Docklands         47711.07
 2    East Melbourne         18472.78
 3        Kensington         33988.86
 4         Melbourne        177233.63
 5   North Melbourne         51508.42
 6         Parkville         30837.88
 7    Port Melbourne           144.37
 8       South Yarra         14085.97
 9         Southbank         83582.46
 10   West Melbourne         25026.38,
 2022:              suburb  population_2022
 0           Carlton         6

In [9]:
# Importing victorian geo data from json file.
url = (
    'https://data.gov.au/geoserver/vic-suburb-locality-boundaries-psma-administrative-boundaries/'
    + 'wfs?request=GetFeature&typeName=ckan_af33dd8c_0534_4e18_9245_fc64440f742e&outputFormat=json')
vic_suburbs = gpd.read_file(url)
vic_suburbs_reduced = vic_suburbs[['vic_loca_2', 'geometry']]
vic_suburbs_reduced.rename(columns={'vic_loca_2':'suburb'}, inplace=True)
vic_suburbs_reduced.head()

Unnamed: 0,suburb,geometry
0,UNDERBOOL,"MULTIPOLYGON (((141.74552 -35.07229, 141.74552..."
1,NURRAN,"MULTIPOLYGON (((148.66877 -37.39571, 148.66876..."
2,WOORNDOO,"MULTIPOLYGON (((142.92288 -37.97886, 142.90449..."
3,DEPTFORD,"MULTIPOLYGON (((147.82336 -37.66001, 147.82313..."
4,YANAC,"MULTIPOLYGON (((141.27978 -35.99859, 141.27989..."


In [10]:
# Extract the suburbs of interest that match the population_data into "target_suburbs".
target_suburbs = population_layers[2022]['suburb'].str.upper()

# Locate the index of the target suburbs and store as a list in "subs"
subs = [vic_suburbs_reduced.index[vic_suburbs_reduced['suburb']==sub].tolist()[0] for sub in target_suburbs]

# Create a new dataframe for the melbourne suburbs "mel_suburbs" and reformat.
mel_suburbs = pd.DataFrame(vic_suburbs_reduced.iloc[subs])
mel_suburbs = mel_suburbs.reset_index(drop=True)
mel_suburbs['suburb'] = mel_suburbs['suburb'].str.title()
mel_suburbs

Unnamed: 0,suburb,geometry
0,Carlton,"MULTIPOLYGON (((144.97401 -37.80311, 144.97320..."
1,Docklands,"MULTIPOLYGON (((144.95376 -37.82363, 144.95336..."
2,East Melbourne,"MULTIPOLYGON (((144.97136 -37.80773, 144.97308..."
3,Kensington,"MULTIPOLYGON (((144.92282 -37.79913, 144.91977..."
4,Melbourne,"MULTIPOLYGON (((144.97797 -37.83867, 144.97803..."
5,North Melbourne,"MULTIPOLYGON (((144.95599 -37.80588, 144.95360..."
6,Parkville,"MULTIPOLYGON (((144.96521 -37.79315, 144.96460..."
7,Port Melbourne,"MULTIPOLYGON (((144.90749 -37.84326, 144.90652..."
8,South Yarra,"MULTIPOLYGON (((145.00455 -37.84131, 145.00453..."
9,Southbank,"MULTIPOLYGON (((144.97041 -37.83016, 144.97030..."


In [11]:
# Convert the DF to a GeoDF
melbourne_geo_data = gpd.GeoDataFrame(mel_suburbs) 

# Convert the GeoDF to a JSON format. 
melbourne_geo_data_json = melbourne_geo_data.to_json()
melbourne_geo_data_json[:200] # 2000 Char preview

'{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"suburb": "Carlton"}, "geometry": {"type": "MultiPolygon", "coordinates": [[[[144.97400757, -37.80311047], [144'

In [12]:
# Testing inbuilt GeoPands Features 

melbourne_geo_data.explore(melbourne_geo_data.area , legend=False)
# Hover over the suburbs Anj!! I think we add this as an interactive explore our traget suburbs, its just cool.

### Function to create layers for each year

In [13]:
# Create base map with Folium

melb_map = folium.Map(
    location=[-37.81368709240999, 144.95738102347036],
    #width=500, height=300,
    tiles='Cartodb Positron',
    zoom_start=12,
    min_zoom=10)

In [14]:
def create_map_layer(dataset, dataset_name, data_legend, col_name):
    folium.Choropleth(
        geo_data = melbourne_geo_data_json,
        name = dataset_name,
        data = dataset,
        columns = ['suburb', col_name],
        key_on='feature.properties.suburb', 
        fill_color = 'YlGn',
        fill_opacity = 0.7,
        line_opacity = 0.2,
        legend_name = data_legend,
    ).add_to(melb_map)

    folium.LayerControl().add_to(melb_map)
    
    '''The tooltips are not working...'''
#     # Add tooltips
#     folium.features.GeoJson(
#     data=dataset,
#     name=dataset_name,
#     smooth_factor=2,
#     style_function=lambda x: {'color':'black','fillColor':'transparent','weight':0.5},
#     tooltip=folium.features.GeoJsonTooltip(
#         fields=['suburb',
#                 col_name
#                ],
#         aliases=["Suburb:",
#                  "Population:"
#                 ], 
#         localize=True,
#         sticky=False,
#         labels=True,
#         style="""
#             background-color: #F0EFEF;
#             border: 2px solid black;
#             border-radius: 3px;
#             box-shadow: 3px;
#         """,
#         max_width=800,),
#             highlight_function=lambda x: {'weight':3,'fillColor':'grey'},
#         ).add_to(melb_map)  


In [15]:
def create_map_feature_group(dataset, dataset_name, data_legend, col_name, data_map):
    """Almost works..."""
    fg = folium.FeatureGroup(name=dataset_name,overlay=False).add_to(data_map)
    population =folium.Choropleth(
            geo_data = melbourne_geo_data_json,
            data = dataset,
            columns = ['suburb', col_name],  
            key_on = 'feature.properties.suburb', 
#             threshold_scale=custom_scale1, #use the custom scale we created for legend
            fill_color='YlGn',
            nan_fill_color="White", #Use white color if there is no data available for the county
            fill_opacity=0.7,
            line_opacity=0.2,
            legend_name = data_legend,
            highlight=True,
            overlay=False,
            line_color='black').geojson.add_to(fg)
    folium.LayerControl().add_to(data_map)
    return data_map

In [16]:
# Create feature groups
# fg1 = folium.FeatureGroup(name='New Covid-19 Cases Past 7 Days',overlay=False).add_to(us_map)
# fg2 = folium.FeatureGroup(name='Percent of Positive Cases Past 7 Days',overlay=False).add_to(us_map)


In [17]:
def add_population_to_map(dataframe,label, main_data, melb_map):    
    col_name = f'population_{label}'
    
    if col_name not in main_data.columns:
        main_data = main_data.join(dataframe.set_index(["suburb"]), on=["suburb"])
    
    '''Swap these two lines to either have one layer or attempt to group layers... (one so far)'''
#     melb_map = create_map_feature_group(main_data, f'Population Density ({label})', f'Population Density',col_name, melb_map)
    create_map_layer(main_data, f'Population Density ({label})', f'Population Density',col_name)

    return main_data, melb_map

In [18]:
year = 2022
data = population_layers[year]

melbourne_geo_data, melb_map = add_population_to_map(data, year, melbourne_geo_data, melb_map)

In [19]:
# folium.TileLayer('cartodbdark_matter',overlay=True,name="View in Dark Mode").add_to(melb_map)
# folium.TileLayer('cartodbpositron',overlay=True,name="Viw in Light Mode").add_to(melb_map)
# folium.LayerControl(collapsed=False).add_to(melb_map)
# melb_map.save("index.html") 

In [20]:
melb_map 

In [21]:
# year = 2021
# data = population_layers[year]

# add_population_to_map(data, year)

In [22]:
pop_data_2030 = pop_data_by_year(population_data, 2030)
pop_data_2030 

Unnamed: 0,suburb,population_2030
0,Carlton,86749.87
1,Docklands,69207.61
2,East Melbourne,23736.45
3,Kensington,49074.21
4,Melbourne,249852.84
5,North Melbourne,83938.89
6,Parkville,39775.58
7,Port Melbourne,4842.04
8,South Yarra,17033.03
9,Southbank,122042.52


In [23]:
# Different years
pop_data_22 = pop_data_by_year(population_data, 2022)

In [24]:
pop_data_22.head()

Unnamed: 0,suburb,population_2022
0,Carlton,68646.64
1,Docklands,49629.55
2,East Melbourne,19080.77
3,Kensington,34963.64
4,Melbourne,182472.75


In [25]:
# folium.Choropleth(
#     geo_data = melbourne_geo_data_json,
#     name = 'pop_22',
#     data = pop_data_22,
#     columns = ['geography', 'population'],
#     key_on='feature.properties.suburb', 
#     fill_color = 'YlGn',
#     fill_opacity = 0.7,
#     line_opacity = 0.2,
#     legend_name = 'Population Density 2022',
# ).add_to(melb_map)

# folium.LayerControl().add_to(melb_map)
# melb_map

In [26]:
pop_data_21 = pop_data_by_year(population_data, 2021)

In [27]:
# folium.Choropleth(
#     geo_data = melbourne_geo_data_json,
#     name = 'pop_21',
#     data = pop_data_21,
#     columns = ['geography', 'population'],
#     key_on='feature.properties.suburb', 
#     fill_color = 'YlGn',
#     fill_opacity = 0.7,
#     line_opacity = 0.2,
#     legend_name = 'Population Density 2021',
# ).add_to(melb_map)

# folium.LayerControl().add_to(melb_map)
# melb_map

-----

In [28]:
#returns a new DF with the difference between the two years as a number and a percentage.
def population_diff(population_data, year_1, year_2):
    
    start_year = pop_data_by_year(population_data, year_1)
    end_year = pop_data_by_year(population_data, year_2)

    combined = start_year.merge(end_year, left_on='suburb', right_on='suburb')
    
    combined['change #'] = combined[list(end_year)[1]] - combined[list(start_year)[1]]
    combined['change %'] = combined[list(start_year)[1]] / combined[list(end_year)[1]]
    
    return combined

In [29]:
pop_diff_2020_2030 = population_diff(population_data, 2020, 2030)
pop_diff_2020_2030

Unnamed: 0,suburb,population_2020,population_2030,change #,change %
0,Carlton,68920.81,86749.87,17829.06,0.794477
1,Docklands,48213.64,69207.61,20993.97,0.696652
2,East Melbourne,18713.34,23736.45,5023.11,0.78838
3,Kensington,33948.22,49074.21,15125.99,0.691773
4,Melbourne,175344.97,249852.84,74507.87,0.701793
5,North Melbourne,52697.81,83938.89,31241.08,0.627812
6,Parkville,31881.88,39775.58,7893.7,0.801544
7,Port Melbourne,141.77,4842.04,4700.27,0.029279
8,South Yarra,14261.88,17033.03,2771.15,0.837307
9,Southbank,83619.16,122042.52,38423.36,0.685164


In [30]:
# Simple function to return a fresh map.

def change_map(population_data, layer_name):
    # Create Bse Layer 
    my_map = folium.Map(
        location=[-37.81368709240999, 144.95738102347036],
        tiles='Cartodb Positron',
        zoom_start=13,
        min_zoom=10)

    # Create choropleth overlay
    folium.Choropleth(
        geo_data = melbourne_geo_data_json,
        name = layer_name,
        data = population_data,
        columns = ['suburb', 'change %'], # list(population_data)[1]
        key_on='feature.properties.suburb', 
        fill_color = 'YlGn',
        fill_opacity = 0.7,
        line_opacity = 0.2,
        legend_name = 'Growth of Population (%)',
    ).add_to(my_map)

    folium.LayerControl().add_to(my_map)
    
    return my_map

In [31]:
pop_change = population_diff(population_data, 2020, 2030) # Enter the two years to compare

my_map = change_map(pop_change, '2020 to 2030')
my_map

## GeoPandas Take 2

In [32]:
all_subs = gpd.read_file(url)
all_subs.head(2)

Unnamed: 0,id,lc_ply_pid,dt_create,dt_retire,loc_pid,vic_locali,vic_loca_1,vic_loca_2,vic_loca_3,vic_loca_4,vic_loca_5,vic_loca_6,vic_loca_7,geometry
0,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.1,6670,2011-08-31,,VIC2615,2012-04-27,,UNDERBOOL,,,G,,2,"MULTIPOLYGON (((141.74552 -35.07229, 141.74552..."
1,ckan_af33dd8c_0534_4e18_9245_fc64440f742e.2,6671,2011-08-31,,VIC1986,2012-04-27,,NURRAN,,,G,,2,"MULTIPOLYGON (((148.66877 -37.39571, 148.66876..."


In [33]:
all_subs_df = all_subs[['vic_loca_2', 'geometry']]
all_subs_df.rename(columns={'vic_loca_2':'suburb'}, inplace=True)
all_subs_df.head()

Unnamed: 0,suburb,geometry
0,UNDERBOOL,"MULTIPOLYGON (((141.74552 -35.07229, 141.74552..."
1,NURRAN,"MULTIPOLYGON (((148.66877 -37.39571, 148.66876..."
2,WOORNDOO,"MULTIPOLYGON (((142.92288 -37.97886, 142.90449..."
3,DEPTFORD,"MULTIPOLYGON (((147.82336 -37.66001, 147.82313..."
4,YANAC,"MULTIPOLYGON (((141.27978 -35.99859, 141.27989..."


In [34]:
# Extract the suburbs of interest that match the population_data into "target_suburbs".
target_subs = population_layers[2022]['suburb'].str.upper()

# Locate the index of the target suburbs and store as a list in "subs"
subs = [all_subs_df.index[all_subs_df['suburb']==sub].tolist()[0] for sub in target_subs]

#Remove unwanted rows and keep data in geo dataframe format
city_suburbs = all_subs_df.take(list(subs))
city_suburbs.reset_index(drop=True, inplace = True)
city_suburbs['suburb'] = city_suburbs['suburb'].str.title()

city_suburbs

Unnamed: 0,suburb,geometry
0,Carlton,"MULTIPOLYGON (((144.97401 -37.80311, 144.97320..."
1,Docklands,"MULTIPOLYGON (((144.95376 -37.82363, 144.95336..."
2,East Melbourne,"MULTIPOLYGON (((144.97136 -37.80773, 144.97308..."
3,Kensington,"MULTIPOLYGON (((144.92282 -37.79913, 144.91977..."
4,Melbourne,"MULTIPOLYGON (((144.97797 -37.83867, 144.97803..."
5,North Melbourne,"MULTIPOLYGON (((144.95599 -37.80588, 144.95360..."
6,Parkville,"MULTIPOLYGON (((144.96521 -37.79315, 144.96460..."
7,Port Melbourne,"MULTIPOLYGON (((144.90749 -37.84326, 144.90652..."
8,South Yarra,"MULTIPOLYGON (((145.00455 -37.84131, 145.00453..."
9,Southbank,"MULTIPOLYGON (((144.97041 -37.83016, 144.97030..."


In [35]:
type(city_suburbs)

geopandas.geodataframe.GeoDataFrame

In [36]:
city_suburbs.explore()

In [37]:
city_suburbs_pop = city_suburbs.join(population_layers[2022].set_index(["suburb"]), on=["suburb"])
city_suburbs_pop

Unnamed: 0,suburb,geometry,population_2022
0,Carlton,"MULTIPOLYGON (((144.97401 -37.80311, 144.97320...",68646.64
1,Docklands,"MULTIPOLYGON (((144.95376 -37.82363, 144.95336...",49629.55
2,East Melbourne,"MULTIPOLYGON (((144.97136 -37.80773, 144.97308...",19080.77
3,Kensington,"MULTIPOLYGON (((144.92282 -37.79913, 144.91977...",34963.64
4,Melbourne,"MULTIPOLYGON (((144.97797 -37.83867, 144.97803...",182472.75
5,North Melbourne,"MULTIPOLYGON (((144.95599 -37.80588, 144.95360...",52874.8
6,Parkville,"MULTIPOLYGON (((144.96521 -37.79315, 144.96460...",31820.68
7,Port Melbourne,"MULTIPOLYGON (((144.90749 -37.84326, 144.90652...",144.74
8,South Yarra,"MULTIPOLYGON (((145.00455 -37.84131, 145.00453...",14510.35
9,Southbank,"MULTIPOLYGON (((144.97041 -37.83016, 144.97030...",91839.18


In [38]:
type(city_suburbs_pop)

geopandas.geodataframe.GeoDataFrame

In [39]:
city_suburbs_pop.explore(column = 'population_2022', name = 'Population (thousands) 2022')

In [40]:
# Save the layer view as a map layer
pop_m = city_suburbs_pop.explore(column = 'population_2022', name = 'Population (thousands) 2022')

# Add layer control for user
folium.LayerControl().add_to(pop_m)

# display the map
pop_m

In [41]:
city_suburbs_pop = city_suburbs_pop.join(population_layers[2030].set_index(["suburb"]), on=["suburb"])
city_suburbs_pop

Unnamed: 0,suburb,geometry,population_2022,population_2030
0,Carlton,"MULTIPOLYGON (((144.97401 -37.80311, 144.97320...",68646.64,86749.87
1,Docklands,"MULTIPOLYGON (((144.95376 -37.82363, 144.95336...",49629.55,69207.61
2,East Melbourne,"MULTIPOLYGON (((144.97136 -37.80773, 144.97308...",19080.77,23736.45
3,Kensington,"MULTIPOLYGON (((144.92282 -37.79913, 144.91977...",34963.64,49074.21
4,Melbourne,"MULTIPOLYGON (((144.97797 -37.83867, 144.97803...",182472.75,249852.84
5,North Melbourne,"MULTIPOLYGON (((144.95599 -37.80588, 144.95360...",52874.8,83938.89
6,Parkville,"MULTIPOLYGON (((144.96521 -37.79315, 144.96460...",31820.68,39775.58
7,Port Melbourne,"MULTIPOLYGON (((144.90749 -37.84326, 144.90652...",144.74,4842.04
8,South Yarra,"MULTIPOLYGON (((145.00455 -37.84131, 145.00453...",14510.35,17033.03
9,Southbank,"MULTIPOLYGON (((144.97041 -37.83016, 144.97030...",91839.18,122042.52


In [42]:
com = city_suburbs.explore(name = 'Melbourne Cities')

city_suburbs.join(population_layers[2022].set_index(["suburb"]), on=["suburb"]).explore(m = com, column = 'population_2022', name = 'Population (thousands) 2022')
city_suburbs.join(population_layers[2030].set_index(["suburb"]), on=["suburb"]).explore(m = com, column = 'population_2030', name = 'Population (thousands) 2030')
city_suburbs.join(population_layers[2040].set_index(["suburb"]), on=["suburb"]).explore(m = com, column = 'population_2040', name = 'Population (thousands) 2040')

folium.TileLayer(control=True).add_to(com)  # use folium to add alternative tiles
folium.LayerControl().add_to(com)
com

#### VicRoads Traffic Dataset

In [43]:
traffic_url = 'https://vicroadsopendata-vicroadsmaps.opendata.arcgis.com/datasets/5512df2ff41e4941bacf868053dbfba9_0.csv?outSR=%7B%22latestWkid%22%3A3111%2C%22wkid%22%3A102171%7D'

In [44]:
traffic_data = pd.read_csv(traffic_url)

In [45]:
traffic_data.head()

Unnamed: 0,OBJECTID_1,OBJECTID,TIS_ID,HMGNS_FLOW_ID,HMGNS_LNK_ID,HMGNS_LNK_DESC,LGA_SHORT_NM,RGN_LONG_NM,ROAD_NBR,DECLARED_ROAD_NM,...,TWO_WAY_AADT_TRUCKS,ALLVEH_AMPEAK_AADT,ALLVEH_PMPEAK_AADT,GROWTH_RATE,CI,AM_PEAK_SPEED,OFF_PEAK_SPEED,PM_PEAK_SPEED,YR,LABEL
0,1,743,14915,14915,2006,MARYSVILLE-WOODS POINT ROAD btwn LAKE MOUNTAI...,YARRA RANGES,METROPOLITAN SOUTH EAST REGION,4961,MARYSVILLE-WOODS POINT ROAD,...,0.0,,,0.013,0.005,,,,2020,24* (13% 3*) EAST BOUND
1,2,650,14140,14140,8786,STEELS CREEK ROAD btwn WILLOWBEND DRIVE & ELT...,YARRA RANGES,METROPOLITAN SOUTH EAST REGION,9999,Not Applicable,...,40.0,,,0.019,0.002,,,,2020,373* (6% 22*) NORTH BOUND
2,3,701,12113,12113,6035,LATROBE ROAD btwn TANJIL EAST ROAD & GORDON S...,LATROBE,EASTERN REGION,5911,MORWELL-YALLOURN NORTH ROAD,...,160.0,,,0.015,0.009,,,,2020,"1,100* (6% 61*) NORTH BOUND"
3,4,702,12897,12897,7079,CASTERTON ROAD btwn GLENELG HIGHWAY & COLERAI...,SOUTHERN GRAMPIANS,SOUTH WESTERN REGION,2670,GLENELG HIGHWAY,...,340.0,,,0.02,0.001,,,,2020,801* (21% 165*) WEST BOUND
4,5,703,9893,9893,3475,HUTTON ROAD btwn CHAPEL ROAD & GREENS ROAD,DANDENONG,METROPOLITAN SOUTH EAST REGION,5168,BRAESIDE-DANDENONG ROAD,...,1500.0,1000.0,1100.0,0.003,0.002,,,,2020,"12,000 (6% 744*) WEST BOUND"


In [46]:
print(traffic_data['LGA_SHORT_NM'].unique())

['YARRA RANGES' 'LATROBE' 'SOUTHERN GRAMPIANS' 'DANDENONG' 'BRIMBANK'
 'WELLINGTON' 'WYNDHAM' 'KNOX' 'HUME' 'EAST GIPPSLAND' 'BAYSIDE' 'BENALLA'
 'MOIRA' 'NILLUMBIK' 'CASEY' 'LODDON' 'MAROONDAH' 'KINGSTON' 'GLENELG'
 'MORNINGTON PENINSULA' 'MANNINGHAM' 'DAREBIN' 'CORANGAMITE' 'SURF COAST'
 'TOWONG' 'WEST WIMMERA' 'BENDIGO' 'GEELONG' 'BASS COAST'
 '(MOUNT STIRLING)' 'MOYNE' 'QUEENSCLIFFE' 'BAW BAW' 'BALLARAT'
 'CENTRAL GOLDFIELDS' 'FRANKSTON' 'PYRENEES' 'WHITTLESEA' 'MARIBYRNONG'
 'INDIGO' 'MOUNT ALEXANDER' 'MELBOURNE' 'BANYULE' 'SOUTH GIPPSLAND'
 'STONNINGTON' 'GOLDEN PLAINS' 'SHEPPARTON' 'MORELAND' 'BOROONDARA'
 'NORTHERN GRAMPIANS' 'YARRA' 'ALPINE' 'WHITEHORSE' 'COLAC OTWAY'
 'WANGARATTA' 'MACEDON RANGES' 'ARARAT' 'HORSHAM' 'WODONGA' 'HOBSONS BAY'
 'CARDINIA' 'MILDURA' 'PORT PHILLIP' 'MOORABOOL' 'BULOKE' 'MITCHELL'
 'MURRINDINDI' 'MONASH' 'HINDMARSH' 'MELTON' 'MOONEE VALLEY' 'WARRNAMBOOL'
 'YARRIAMBIACK' 'MANSFIELD' 'GLEN EIRA' 'GANNAWARRA' 'CAMPASPE'
 'SWAN HILL' 'STRATHBOGIE' 'HEPB

In [47]:
melb_traffic = traffic_data.where(traffic_data['LGA_SHORT_NM']=='MELBOURNE')

In [48]:
melb_traffic['LGA_SHORT_NM'].unique()

array([nan, 'MELBOURNE'], dtype=object)

In [49]:
melb_traffic.dropna(subset = ['LGA_SHORT_NM'], inplace = True)
melb_traffic

Unnamed: 0,OBJECTID_1,OBJECTID,TIS_ID,HMGNS_FLOW_ID,HMGNS_LNK_ID,HMGNS_LNK_DESC,LGA_SHORT_NM,RGN_LONG_NM,ROAD_NBR,DECLARED_ROAD_NM,...,TWO_WAY_AADT_TRUCKS,ALLVEH_AMPEAK_AADT,ALLVEH_PMPEAK_AADT,GROWTH_RATE,CI,AM_PEAK_SPEED,OFF_PEAK_SPEED,PM_PEAK_SPEED,YR,LABEL
95,96.0,1288.0,53.0,53.0,3458.0,FOOTSCRAY ROAD btwn WESTERN LINK TOLLWAY Onra...,MELBOURNE,METROPOLITAN NORTH WEST REGION,2120.0,DOCKLANDS HIGHWAY,...,1700.0,,,-0.001,0.001,,,,2020.0,"16,000* (5% 833*) EAST BOUND"
104,105.0,1297.0,615.0,615.0,4654.0,CITY ROAD btwn ALEXANDRA AVENUE & SOUTHBANK B...,MELBOURNE,METROPOLITAN NORTH WEST REGION,2240.0,YARRA BANK HIGHWAY,...,1400.0,,,0.002,0.002,,,,2020.0,"19,000* (4% 754*) EAST BOUND"
134,135.0,1406.0,1092.0,1092.0,1555.0,WESTERN LINK TOLLWAY btwn WESTERN LINK TOLLWA...,MELBOURNE,METROPOLITAN NORTH WEST REGION,2999.0,WESTERN LINK TOLLWAY,...,14000.0,,,0.026,0.003,,,,2020.0,"58,000* (13% 7,400*) SOUTH BOUND"
156,157.0,1311.0,1359.0,1359.0,2886.0,HARKER STREET btwn FLEMINGTON ROAD & ERROL ST...,MELBOURNE,METROPOLITAN NORTH WEST REGION,5026.0,HARKER STREET,...,720.0,,,0.014,0.003,,,,2020.0,"5,800* (6% 361*) SOUTH WEST BOUND"
160,161.0,1315.0,2266.0,2266.0,2919.0,DYNON ROAD btwn DOCK LINK ROAD & LLOYD STREET,MELBOURNE,METROPOLITAN NORTH WEST REGION,5035.0,DYNON ROAD,...,3300.0,,,0.003,0.005,,,,2020.0,"17,000* (10% 1,800*) EAST BOUND"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14548,14549.0,15159.0,15242.0,15242.0,2474.0,FLINDERS STREET btwn ELIZABETH STREET & ST KI...,MELBOURNE,METROPOLITAN NORTH WEST REGION,9999.0,Not Applicable,...,1200.0,,,-0.014,0.006,,,,2020.0,"9,600* (6% 581*) EAST BOUND"
14549,14550.0,15160.0,15868.0,15868.0,2477.0,ELIZABETH STREET btwn COLLINS STREET & BOURKE...,MELBOURNE,METROPOLITAN NORTH WEST REGION,9999.0,Not Applicable,...,540.0,,,-0.007,0.001,,,,2020.0,"4,500* (6% 270*) SOUTH BOUND"
14570,14571.0,33578.0,6447.0,6447.0,8561.0,POWER STREET btwn WEST GATE FREEWAY & SOUTHER...,MELBOURNE,METROPOLITAN NORTH WEST REGION,8040.0,7824F 2240F,...,760.0,,,0.022,0.004,,,,2020.0,"15,000* (5% 764*) SOUTH EAST BOUND"
14589,14590.0,15446.0,13346.0,13346.0,7409.0,BOURKE STREET btwn SWANSTON STREET WALK & RUS...,MELBOURNE,METROPOLITAN NORTH WEST REGION,9999.0,Not Applicable,...,,,,-0.012,0.003,,,,2020.0,"3,700* (N/A) WEST BOUND"


In [50]:
list(melb_traffic.columns)

['OBJECTID_1',
 'OBJECTID',
 'TIS_ID',
 'HMGNS_FLOW_ID',
 'HMGNS_LNK_ID',
 'HMGNS_LNK_DESC',
 'LGA_SHORT_NM',
 'RGN_LONG_NM',
 'ROAD_NBR',
 'DECLARED_ROAD_NM',
 'LOCAL_ROAD_NM',
 'ALT_ROAD_NM',
 'RMA_CLSFCN_CD',
 'RMA_CLSFCN_GROUP',
 'RMA_DESC',
 'SRNS',
 'SRNS_CD',
 'SRNS_DESC',
 'RMC',
 'FLOW',
 'HMGNS_FLOW_LENGTH',
 'ALLVEHS_MMW',
 'ALLVEH_CALC',
 'ALLVEHS_AADT',
 'BUSES_MMW',
 'BUSES_AADT',
 'TRAMS_MMW',
 'TRAMS_AADT',
 'MOTORCYCLES_MMW',
 'MOTORCYCLES_AADT',
 'MOTORCYCLE_CALC',
 'TRUCKS_MMW',
 'TRUCKS_AADT',
 'TRUCK_CALC',
 'PER_TRUCKS_AADT',
 'EHV',
 'PER_HV',
 'PER_LV',
 'ESA',
 'TWO_WAY_AADT',
 'TWO_WAY_AADT_TRUCKS',
 'ALLVEH_AMPEAK_AADT',
 'ALLVEH_PMPEAK_AADT',
 'GROWTH_RATE',
 'CI',
 'AM_PEAK_SPEED',
 'OFF_PEAK_SPEED',
 'PM_PEAK_SPEED',
 'YR',
 'LABEL']

#### VicRoads Transportation Accidents Dataset

In [51]:
# This url is for a zip file which contains multiple csv files
crash_url = 'https://vicroadsopendatastorehouse.vicroads.vic.gov.au/opendata/Road_Safety/ACCIDENT.zip'

In [52]:
crash_request = requests.get(crash_url)

Read the zip file and view filename contents

In [53]:
crash_files = zf.ZipFile(BytesIO(crash_request.content))
print(crash_files.namelist())

['ACCIDENT.csv', 'ACCIDENT_CHAINAGE.csv', 'ACCIDENT_EVENT.csv', 'ACCIDENT_LOCATION.csv', 'ATMOSPHERIC_COND.csv', 'NODE.csv', 'NODE_ID_COMPLEX_INT_ID.csv', 'PERSON.csv', 'ROAD_SURFACE_COND.csv', 'Statistic Checks.csv', 'SUBDCA.csv', 'VEHICLE.csv']


Save the 'ACCIDENT.csv' file to a temp folder and load into a dataframe

In [54]:
accident = crash_files.extract('ACCIDENT.csv', 'temp')
accident = pd.read_csv(accident, low_memory=False)
accident.head()

Unnamed: 0,ACCIDENT_NO,ACCIDENTDATE,ACCIDENTTIME,ACCIDENT_TYPE,Accident Type Desc,DAY_OF_WEEK,Day Week Description,DCA_CODE,DCA Description,DIRECTORY,...,NO_PERSONS,NO_PERSONS_INJ_2,NO_PERSONS_INJ_3,NO_PERSONS_KILLED,NO_PERSONS_NOT_INJ,POLICE_ATTEND,ROAD_GEOMETRY,Road Geometry Desc,SEVERITY,SPEED_ZONE
0,T20060000010,13/01/2006,12:42:00,1,Collision with vehicle,6,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),MEL,...,6,0,1,0,5,1,1,Cross intersection,3,60
1,T20060000018,13/01/2006,19:10:00,1,Collision with vehicle,6,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),MEL,...,4,0,1,0,3,1,2,T intersection,3,70
2,T20060000022,14/01/2006,12:10:00,7,Fall from or in moving vehicle,7,Saturday,190,FELL IN/FROM VEHICLE,MEL,...,2,1,0,0,1,1,5,Not at intersection,2,100
3,T20060000023,14/01/2006,11:49:00,1,Collision with vehicle,7,Saturday,130,REAR END(VEHICLES IN SAME LANE),MEL,...,2,1,0,0,1,1,2,T intersection,2,80
4,T20060000026,14/01/2006,10:45:00,1,Collision with vehicle,7,Saturday,121,RIGHT THROUGH,MEL,...,3,0,3,0,0,1,5,Not at intersection,3,50
