In [None]:
from sodapy import Socrata
import pandas as pd
import time
from datetime import date
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from IPython.core.display import display, HTML

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go

# from jupyter_dash import JupyterDash
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output

## Part 1: Investigating the Pedestrian Sensor Data.
### Pedestrian sensor data - basic analysis.

This next dataset is pulled from the Melbourne Open Playground. The code block in the next cell needs to be uncommented to extract the data and copy it into a csv file that gets stored on your local drive.

This section of code will see whether the data in this dataset can be used to make predictions of the pedestrian counts.

Further sections will introduce new datasets and see if the extra information can help to make better, more accurate predictions.

In [None]:
## One-off download of the pedestrian sensor count history.
## Uncomment this cell on the first run: it queries the 'data.melbourne.vic.gov.au'
## Socrata endpoint, keeps the date/time, sensor and hourly count columns, and
## caches the result in 'sensor_history.csv' so later runs can read the file instead.
# def sensor_count():
#     client = Socrata('data.melbourne.vic.gov.au', 'nlPM0PQJSjzCsbVqntjPvjB1f', None)
#     sensor_data_id = "b2ak-trbp"
#     results = client.get(sensor_data_id, limit=5000000)
#     df = pd.DataFrame.from_records(results)
#     df = df[['year', 'month', 'mdate', 'day', 'time', 'sensor_id', 'sensor_name', 'hourly_counts']]
#     return df

# sensor_history = sensor_count()

# sensor_history.to_csv('sensor_history.csv', index=False)

In [None]:
# Load the cached pedestrian counts from the local csv (see the commented-out download cell).
sensor_history = pd.read_csv('sensor_history.csv')

In [None]:
#This function grabs the location (longitude and latitude) of the pedestrian sensors
def sensor_location():
    """Download the pedestrian sensor locations from the Melbourne Socrata endpoint.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the API, with the columns
        "Sensor ID", "Sensor Description", "lat" and "lon"; "lat" and "lon"
        are converted to floats.
    """
    client = Socrata('data.melbourne.vic.gov.au', 'nlPM0PQJSjzCsbVqntjPvjB1f', None)
    sensor_location_data_id = "h57g-5234"
    try:
        results = client.get(sensor_location_data_id)
    finally:
        # Release the underlying HTTP session even if the request fails.
        client.close()

    raw = pd.DataFrame.from_records(results)
    # Build the result as a new frame (rename + astype) instead of writing
    # columns onto a slice of `raw`, and keep the local name distinct from
    # the function's own name.
    locations = (
        raw[["sensor_id", "sensor_description", "latitude", "longitude"]]
        .rename(columns={
            "sensor_id": "Sensor ID",
            "sensor_description": "Sensor Description",
            "latitude": "lat",
            "longitude": "lon",
        })
        .astype({"lat": float, "lon": float})
    )
    return locations

# Keep the downloaded frame under its own name: rebinding `sensor_location`
# to the result would shadow the function and make this cell fail on a re-run.
sensor_coords = sensor_location()
sensor_coords['sensor_id'] = sensor_coords['Sensor ID'].astype(int)
sensor_coords = sensor_coords.drop(['Sensor ID', 'Sensor Description'], axis=1)

# Attach lat/lon to every count row. Dropping any lat/lon columns left behind
# by a previous run of this cell keeps the merge idempotent (no lat_x/lat_y).
sensor_history = (
    sensor_history
    .drop(columns=['lat', 'lon'], errors='ignore')
    .merge(sensor_coords, on='sensor_id', how='inner')
)

In [None]:
#Do some exploration of the dataset we just imported.
# info() writes its report itself and returns None, so it is called directly
# rather than wrapped in print() (which would add a stray "None" line).
sensor_history.info()
print("")
print(sensor_history.head())
print("")
# Correlations are only defined for numeric columns; selecting them explicitly
# avoids the error recent pandas versions raise on text columns such as
# month, day and sensor_name.
print(sensor_history.select_dtypes(include='number').corr())

In [None]:
#Let's do a quick linear regression to see how well we can model the relationships
#contained within the pedestrian sensor network data.
x = sensor_history.drop(columns='hourly_counts')
y = sensor_history.hourly_counts

# One-hot encode the categorical columns so the linear model can use them.
x_day = pd.get_dummies(x.day)
x_month = pd.get_dummies(x.month)
x_sensor = pd.get_dummies(x.sensor_name)

# Replace the raw categorical columns (and the sensor id, which duplicates the
# sensor name) with their one-hot encodings.
x_drop = x.drop(['month', 'day', 'sensor_name', 'sensor_id'], axis=1)
X = pd.concat([x_drop, x_day, x_month, x_sensor],axis=1)
X = X.fillna(0)

# A fixed seed gives the same 80/20 split - and therefore the same score -
# every time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

LR = LinearRegression()
LR.fit(X_train, y_train)
print("The R-squared score is: ", LR.score(X_test, y_test))

## Part 2: Adding new datasets:
### Add a new dataset: climate microsensors.

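We didn't get a great result from the previous model. The score it outputs is the 'R-squared' score. A score of 1 is perfect, and a score of 0 means the model does no better than always predicting the average count. On held-out test data the score can even fall below 0 for a model that does worse than that.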

Let's see if we can improve on this score by adding other datasets.

This next dataset is also pulled from the Melbourne Open Playground. The code block in the next cell needs to be uncommented to extract the data and copy it into a csv file that gets stored on your local drive.

The dataset is based on climate microsensors in Melbourne's CBD. For this analysis, I am only trying to get an idea of what the climate is like across the city of Melbourne as a whole, not going into the detail of each sensor location. So I am only grabbing the data from a couple of sensor sites (the two site IDs filtered for in the code below).

In [None]:
## One-off download of the climate microsensor readings.
## Uncomment this cell on the first run: it queries the 'data.melbourne.vic.gov.au'
## Socrata endpoint and caches the result in 'micro_history.csv'.
## NOTE(review): if the query returns nothing, `df` is never assigned and the
## `return df` below raises - worth guarding when this cell is re-enabled.
# def micro_count():
#     client = Socrata('data.melbourne.vic.gov.au', 'nlPM0PQJSjzCsbVqntjPvjB1f', None)
#     micro_data_id = "u4vh-84j8"
#     results = client.get(micro_data_id, limit=4000000)
#     if results:
#         df = pd.DataFrame.from_records(results)
#     return df

# micro_history = micro_count()

# micro_history.to_csv('micro_history.csv', index=False)

In [None]:
micro_history = pd.read_csv('micro_history.csv')

# Microsensor channel id -> name of the feature column its readings populate.
reading_columns = {'5a': 'temp', '5b': 'humidity', '5c': 'pressure',
                   '0a': 'part_2p5', '0b': 'part_10', '6': 'wind'}

# Keep only the channels of interest from the two chosen sites. The explicit
# copy means the column writes below act on an independent frame rather than
# on a slice of the csv data.
micro_history = micro_history[micro_history.sensor_id.isin(list(reading_columns))
                              & micro_history.site_id.isin([1003, 1009])].copy()

micro_history = micro_history.drop(['id', 'gateway_hub_id', 'type', 'units'], axis=1)

# Spread the single 'value' column into one column per measurement type;
# rows belonging to other channels are left as NaN in that column.
for channel, column in reading_columns.items():
    micro_history.loc[micro_history.sensor_id == channel, column] = micro_history.value

# The hour is read back from local_time below, so a date-only format string
# ('%Y-%m-%d') cannot describe these values; let pandas infer the format.
# NOTE(review): presumably ISO-8601 timestamps - confirm against the csv.
micro_history['local_time'] = pd.to_datetime(micro_history.local_time)
micro_history['year'] = micro_history.local_time.dt.year
micro_history['month'] = micro_history.local_time.dt.month_name()
micro_history['mdate'] = micro_history.local_time.dt.day
micro_history['time'] = micro_history.local_time.dt.hour

# Collapse to one row per hour, taking the maximum of each measurement.
micro_history = micro_history.drop(['site_id', 'sensor_id', 'value', 'local_time'], axis=1)
micro_history = micro_history.groupby(by=['year', 'month', 'mdate', 'time']).max()

# Join the hourly climate readings onto the pedestrian counts; the inner join
# keeps only the hours for which climate data exists.
ped_climate = sensor_history.merge(micro_history, on=['year', 'month', 'mdate', 'time'], how='inner')

x = ped_climate.drop(columns='hourly_counts')
y = ped_climate.hourly_counts

# One-hot encode the categorical columns.
x_day = pd.get_dummies(x.day)
x_month = pd.get_dummies(x.month)
x_sensor = pd.get_dummies(x.sensor_name)

x_drop = x.drop(['month', 'day', 'sensor_name', 'sensor_id'], axis=1)
X = pd.concat([x_drop, x_day, x_month, x_sensor],axis=1)
# Hours with no reading for a given measurement are NaN after the pivot; use 0.
X = X.fillna(0)

# A fixed seed gives the same 80/20 split - and therefore the same score -
# every time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

LR = LinearRegression()
LR.fit(X_train, y_train)
print("The R-squared score is: ", LR.score(X_test, y_test))

### Add a different dataset: school and public holidays:

Adding in the climate data resulted in an improvement of the R-squared score. Let's see if we can find other datasets to add to the model and get that score even higher. The next dataset was created manually - by looking up the details online, then entering them into a csv.
You will need to have this csv downloaded into your local directory for this to work.

This dataset has details of which dates are public holidays and which are school holidays. This could have an impact on how many people are walking around, right?

In [None]:
vic_holidays = pd.read_csv('vic_holidays.csv')

# Left join so every pedestrian count row is kept; dates missing from the
# holiday file come through as NaN and are replaced with 0.
ped_holidays = sensor_history.merge(vic_holidays, on=['year', 'month', 'mdate'], how='left')
ped_holidays =  ped_holidays.fillna(0)

x = ped_holidays.drop(columns='hourly_counts')
y = ped_holidays.hourly_counts

# One-hot encode the categorical columns.
x_day = pd.get_dummies(x.day)
x_month = pd.get_dummies(x.month)
x_sensor = pd.get_dummies(x.sensor_name)

x_drop = x.drop(['month', 'day', 'sensor_name', 'sensor_id'], axis=1)
X = pd.concat([x_drop, x_day, x_month, x_sensor],axis=1)

# A fixed seed gives the same 80/20 split - and therefore the same score -
# every time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
#Have a little look at this dataset: the first few rows of the counts joined with the holiday file.
ped_holidays.head()

In [None]:
# Fit a linear model on the holiday-augmented training split and report how
# much of the held-out variation in counts it explains.
LR = LinearRegression().fit(X_train, y_train)
print("The R-squared score is: ", LR.score(X_test, y_test))

### What about Covid-19?:

The holiday dataset only had a small, but consistent, positive effect on the scoring. Let's keep looking. The next dataset was found on the internet. You will need to have this csv downloaded into your local directory for this to work.

It contains statistics about covid-19, including historical data. It is expected that these numbers could have a reasonably large impact on the numbers of pedestrians.

**Source:** https://govtstats.covid19nearme.com.au/data/all.csv

In [None]:
covid_data = pd.read_csv('covid_data.csv')

# Victorian statistics kept as model features.
covid_stat_columns = ['VIC_CASES_LOCAL_LAST_24H', 'VIC_CASES_ACTIVE',
                      'VIC_CASES_LOCAL_LAST_7D', 'VIC_CASES_OVERSEAS_ACQUIRED_LAST_24H', 'VIC_CASES_OVERSEAS_ACQUIRED_LAST_7D',
                      'VIC_CASES_UNDER_INVESTIGATION_LAST_24H', 'VIC_CASES_UNDER_INVESTIGATION_LAST_7D',
                      'VIC_TESTS_LAST_7D', 'VIC_TESTS_PER_100K_LAST_7D']

# Copy so the column writes below act on an independent frame.
covid_data = covid_data[['DATE'] + covid_stat_columns].copy()

# fillna() returns a new object, so the result has to be assigned back -
# otherwise the missing values survive and reach the regression. Only the
# statistics are filled; a missing DATE must not be turned into 0.
covid_data[covid_stat_columns] = covid_data[covid_stat_columns].fillna(0)

covid_data['DATE'] = pd.to_datetime(covid_data.DATE, format='%Y-%m-%d')

# Split the date into the same key columns the pedestrian data uses.
covid_data['year'] = covid_data.DATE.dt.year
covid_data['month'] = covid_data.DATE.dt.month_name()
covid_data['mdate'] = covid_data.DATE.dt.day
covid_data['mon'] = covid_data.DATE.dt.month

sensor_covid = sensor_history.merge(covid_data, on=['year', 'month', 'mdate'], how='inner')

x = sensor_covid.drop(columns='hourly_counts')
y = sensor_covid.hourly_counts

# One-hot encode the categorical columns.
x_day = pd.get_dummies(x.day)
x_month = pd.get_dummies(x.month)
x_sensor = pd.get_dummies(x.sensor_name)

x_drop = x.drop(['month', 'day', 'sensor_name', 'sensor_id', 'mon', 'DATE'], axis=1)
X = pd.concat([x_drop, x_day, x_month, x_sensor],axis=1)

# A fixed seed gives the same 80/20 split - and therefore the same score -
# every time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
#Let's have a look at the newly created dataset:
print(sensor_covid.head())
print("")
# Correlations are only defined for numeric columns; selecting them explicitly
# avoids the error recent pandas versions raise on the text and date columns.
print(sensor_covid.select_dtypes(include='number').corr())

In [None]:
# Fit a linear model on the covid-augmented training split and report how
# much of the held-out variation in counts it explains.
LR = LinearRegression().fit(X_train, y_train)
print("The R-squared score is: ", LR.score(X_test, y_test))

## Part 3: Making the final dataset:

The Covid data also seems to have a small, positive impact on the scoring. What if we add all of these datasets together? Will the sum of the parts be greater, or will the different datasets just confuse the model?

In [None]:
# Columns that identify one pedestrian count row in all three datasets.
row_keys = ['year', 'month', 'mdate', 'day', 'time', 'sensor_id', 'sensor_name', 'hourly_counts']

# NOTE(review): lat/lon are carried by every input but are not merge keys, so
# the first merge keeps both copies under pandas' default '_x'/'_y' suffixes
# and the second merge contributes the unsuffixed 'lat'/'lon'. The duplicates
# are redundant features; they are kept here so the feature set stays the same.
merge_1 = ped_climate.merge(sensor_covid, on=row_keys, how='inner')

merged = merge_1.merge(ped_holidays, on=row_keys, how='inner')

# One-hot encode the categorical columns.
merge_days = pd.get_dummies(merged.day)
merge_months = pd.get_dummies(merged.month)
merge_sensor = pd.get_dummies(merged.sensor_name)
# 'mon' (numeric month) is kept: the dashboard filters on it.
merge_drop = merged.drop(['month', 'day', 'sensor_name', 'sensor_id', 'DATE'], axis=1)

merged_final = pd.concat([merge_drop, merge_days, merge_months, merge_sensor],axis=1)

X = merged_final.drop(columns='hourly_counts')
y = merged_final.hourly_counts

# A fixed seed gives the same 80/20 split - and therefore the same score -
# every time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

#And then do the prediction.
LR = LinearRegression()
LR.fit(X_train, y_train)
print("The R-squared score is: ", LR.score(X_test, y_test))

## Part 4: Creating the predictive models:

Ok, so now we have some datasets and we have had a look at their individual impacts - before finding that they are stronger when combined together. We have a better understanding of how different events affect the number of pedestrians going past different sensors.

But are we using the best model? There are other alternatives such as Decision Tree regressors, Random Forest regressors, Support Vector Machine regressors and even Deep Learning regression models.

However, Support Vector Machine regressors can take a long time to run when the data has many dimensions, and Deep Learning models are also resource intensive. Below we will limit ourselves to adding in a Decision Tree regressor and a Random Forest regressor. Even these can take a long time to run, but the results will hopefully be worth it!

In [None]:
# Fit the three candidate models on the same training split and report each
# one's R-squared score on the held-out test split.

#Linear Regression
LR = LinearRegression(fit_intercept=False)
LR.fit(X_train, y_train)
print("The basic Linear Regression R-squared score: ", LR.score(X_test, y_test))

#Decision Tree Regressor
# random_state makes the fitted tree reproducible between runs.
DT = DecisionTreeRegressor(max_depth = 75, random_state=42)
DT.fit(X_train, y_train)
print("The Decision Tree regressor's R-squared score: ", DT.score(X_test, y_test))

#Random Forest Regressor
# n_jobs=-1 uses every available core; random_state makes the forest reproducible.
RFR = RandomForestRegressor(n_estimators=150, max_depth=100, n_jobs= -1, max_features=100, random_state=42)
RFR.fit(X_train, y_train)
# The message is now closed with the same double quote that opens it; the
# mismatched quote was a syntax error that stopped the whole cell from running.
print("The Random Forest regressor's R-squared score: ", RFR.score(X_test, y_test))

## Part 5: Interacting with our predictive models:

The R-squared scores we have managed to create now are much better, with the Decision Tree being a huge jump over the basic Linear Regression, and the Random Forest being even better again.

So now we have these cool models, we need a really easy, intuitive way to investigate them. For that, we build an interactive interface using Plotly Dash.

In [None]:
# Dash application using the bootstrap SOLAR theme.
app = dash.Dash(external_stylesheets=[dbc.themes.SOLAR])

# Initial map: one marker per row of the merged data, sized by the hourly count.
fig = px.scatter_mapbox(
    merged_final,
    lat=merged_final.lat,
    lon=merged_final.lon,
    size=merged_final.hourly_counts,
    zoom=12.5,
)
# Base map style, map centre and tight margins, applied in a single layout update.
fig.update_layout(
    mapbox_style="carto-positron",
    mapbox_center_lon=144.96,
    mapbox_center_lat=-37.81,
    margin={"r": 5, "t": 5, "l": 5, "b": 5},
)

# Page layout: a header row (date picker + hour slider) above a two-column
# body (scenario selectors on the left; map and model dropdown on the right).
# Every component id below is referenced by the callback's Input/Output list.
app.layout = html.Div(id='parent', children=[ #main Div
    html.Div(id='header', children=[ #header Div

        html.Div([ #calendar selector Div
            html.P('Choose a date for analysis:'),
            dcc.DatePickerSingle(
                id = 'selector_date',
                month_format = 'MMMM Y',
                calendar_orientation = 'horizontal',
                placeholder = 'Select a date',
                date = date(2021, 6, 21),
                display_format = 'DD/MM/YYYY')
        ],
        style={'width': '15%', 'display': 'inline-block', 'verticalAlign': 'top', 'padding': '20px 20px 20px 20px'}),

        html.Div([ #hour slider Div
            html.P('Select the hour of the day:'),
            dcc.Slider(
                id='selector_hour',
                min=0,
                max=23,
                step=1,
                value=0,
                marks={0: 'midnight', 3: '3am', 6: '6am', 9: '9am', 12: 'midday',
                       15: '3pm', 18: '6pm', 21: '9pm'}
        )], style={'width': '85%', 'display': 'inline-block', 'textAlign':'left', 'verticalAlign': 'top', 'padding': '20px 20px 20px 20px'}),
    ]), #end of 'header' Div

    html.Div([ #map and various selectors Div

            html.Div([ #various selectors Div

                html.Hr(),
                html.P('Temperature:'),
                dcc.Slider(
                    id='selector_temp',
                    min=0,
                    max=50,
                    value=25,
                    marks = {0: '0C', 10: '10C', 20: '20C', 30: '30C', 40: '40C', 50: '50C'}
                    ),
                html.P('Humidity:'),
                 dcc.Slider(
                    id='selector_humid',
                    min=0,
                    max=100,
                    value=0,
                    marks = {0: '0%', 25: '25%', 50: '50%', 75: '75%', 100: '100%'}
                    ),
                html.P('Wind speed:'),
                 dcc.Slider(
                    id='selector_wind',
                    min=0,
                    max=100,
                    value=0,
                    marks = {0: 'Calm', 20: '20km/h', 40: '40km/h', 60: '60km/h', 80: '80km/h', 100: '100km/h'}
                    ),
                html.P('Air pressure:'),
                 dcc.Slider(
                    id='selector_pressure',
                    min=975,
                    max=1050,
                    value=975,
                    marks = {975: '975hPa', 1000: '1000hPa', 1025: '1025hPa', 1050: '1050hPa'}
                    ),
                html.P('Particulate concentration 2.5 microns:'),
                 dcc.Slider(
                    id='selector_part2p5',
                    min=0,
                    max=500,
                    value=0,
                    marks = {0: '0', 100: '100', 200: '200', 300: '300', 400: '400', 500: '500'}
                    ),
                html.P('Particulate concentration 10 microns:'),
                 dcc.Slider(
                    id='selector_part10',
                    min=0,
                    max=1000,
                    value=0,
                    # Each label matches its position (the 250 mark used to read '200').
                    marks = {0: '0', 250: '250', 500: '500', 750: '750', 1000: '1000'}
                    ),
                html.Hr(),
                html.P('Holiday type:'),
                dcc.RadioItems(id='selector_holiday',
                   options=[
                       {'label': 'School holiday ', 'value': 'SCH'},
                       {'label': 'Public holiday ', 'value': 'PUB'},
                       {'label': 'Both ', 'value': 'BOTH'},
                       {'label': 'Neither ', 'value': 'NONE'}
                   ],
                   value='NONE'
                ),
                html.Hr(),

                html.P('Covid cases under investigation in the previous 7 days:'),
                dcc.Input(id='selector_covid', type='number', min=0, max=10000, step=100, value=0),

            ],
            style={'height': '49%', 'width': '49%', 'display': 'inline-block', 'padding': '20px 20px 20px 20px'}), #various selectors Div

            html.Div(id = 'right_panel', children=[ #format and place the right side panel

            html.Div(id='map', children=[ #map div
                    dcc.Graph(id = 'world_map', figure = fig)
            ],
                    style={'textAlign':'center', 'verticalAlign': 'top', 'display': 'inline-block', 'padding': '0px 20px 20px 50px'}), #map selector Div
            html.Br(),
            html.Hr(),
            html.P('Select which predictor model to use:'),
            html.Div(id='predictor', children=[ #prediction model div
                    dcc.Dropdown(id = 'selector_model', #pick the predictive model to use
                        options = [
                            {'label':'Linear Regression', 'value':'LR' },
                            {'label': 'Decision Tree Regression', 'value':'DT'},
                            {'label': 'Random Forest Regression', 'value':'RFR'},
                        ],
                        value = 'LR'),
            ],
                    style={'textAlign':'center', 'width': '35%', 'display': 'inline-block'}),

        ], style={'textAlign':'center', 'width': '49%', 'display': 'inline-block'})
    ]) #end of 'map and various selectors' Div
]) #end of 'parent' Div

@app.callback(Output(component_id='world_map', component_property='figure'),
            Input(component_id='selector_temp', component_property='value'),
            Input(component_id='selector_humid', component_property='value'),
            Input(component_id='selector_wind', component_property='value'),
            Input(component_id='selector_part2p5', component_property='value'),
            Input(component_id='selector_part10', component_property='value'),
            Input(component_id='selector_pressure', component_property='value'),
            Input(component_id='selector_holiday', component_property='value'),
            Input(component_id='selector_covid', component_property='value'),
            Input(component_id='selector_date', component_property='date'),
            Input(component_id='selector_hour', component_property='value'),
            Input(component_id='selector_model', component_property='value'))
def selectors(temp, humid, wind, part_2p5, part_10, pressure, holiday, covid, indate, time, model):
    """Redraw the map with predicted pedestrian counts for the chosen scenario.

    The rows of ``merged_final`` for the selected date and hour are used as a
    template, their climate, holiday and covid columns are overwritten with the
    values chosen in the interface, and the selected model predicts a count for
    each row.

    Parameters
    ----------
    temp, humid, wind, part_2p5, part_10, pressure : number
        Values of the climate sliders.
    holiday : str
        'SCH', 'PUB', 'BOTH' or 'NONE'.
    covid : number
        Value of the "cases under investigation" input.
    indate : str
        ISO formatted date from the date picker.
    time : int
        Hour of the day (0-23).
    model : str
        'LR', 'DT' or 'RFR'.

    Returns
    -------
    plotly figure
        Map with one marker per template row, sized by the predicted count.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When an input is empty, the model key is unknown, or there is no
        template data for the chosen date and hour; the current map is kept.
    """
    # A cleared date picker, number box or dropdown reports None; there is
    # nothing sensible to predict from, so leave the map as it is.
    required = (temp, humid, wind, part_2p5, part_10, pressure, covid, indate, time, model)
    if any(value is None for value in required):
        raise dash.exceptions.PreventUpdate

    models = {'LR': LR, 'DT': DT, 'RFR': RFR}
    if model not in models:
        raise dash.exceptions.PreventUpdate

    date_object = date.fromisoformat(indate)

    chosen_hour = (
        (merged_final.year == date_object.year)
        & (merged_final.mon == date_object.month)
        & (merged_final.mdate == date_object.day)
        & (merged_final.time == time)
    )
    # Work on a copy so the overrides never touch merged_final itself.
    scenario = merged_final[chosen_hour].copy()
    if scenario.empty:
        # No template rows for that date/hour - predicting on nothing would fail.
        raise dash.exceptions.PreventUpdate

    overrides = {
        'school_hol': int(holiday in ('SCH', 'BOTH')),
        'pub_hol': int(holiday in ('PUB', 'BOTH')),
        'temp': temp,
        'humidity': humid,
        'wind': wind,
        'time': time,
        'pressure': pressure,
        'part_2p5': part_2p5,
        'part_10': part_10,
        'VIC_CASES_UNDER_INVESTIGATION_LAST_7D': covid,
    }
    # Only existing columns are overwritten, so the frame keeps exactly the
    # feature columns the models were trained on.
    for column, value in overrides.items():
        if column in scenario.columns:
            scenario[column] = value

    guess = models[model].predict(scenario.drop(columns='hourly_counts'))
    # Marker sizes cannot be negative, but a regression can predict below zero.
    guess = guess.astype(int).clip(min=0)

    fig = px.scatter_mapbox(scenario, lat=scenario.lat, size=guess, lon=scenario.lon)
    # uirevision is set to a constant so that the map view is not reset each time the callback redraws.
    fig.update_layout(mapbox_style="carto-positron", uirevision='scenario')
    fig.update_layout(margin={"r":5,"t":5,"l":5,"b":5})

    return fig
    
# Start the Dash server with its default settings.
# NOTE(review): when run from a notebook cell this call presumably keeps the
# cell busy until the server is stopped - confirm in your environment.
if __name__ == '__main__':
    app.run_server()

So there we have it. An awesome, fun way to interact with our predictive models!

Of course, these models and their predictions can always be improved. You can find new datasets to add, or you can look to make the models better through feature selection, feature reduction or feature engineering. The possibilities are endless.