In [54]:
from sodapy import Socrata
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML

### Pedestrian sensor data - basic analysis.

This next dataset is pulled from the Melbourne Open Playground. The code block in the next cell needs to be uncommented to extract the data and copy it into a csv file that gets stored on your local drive.

This section of code will see whether the data in this dataset can be used to make predictions of the pedestrian counts.

Further sections will introduce new datasets and see if the extra information can help to make better, more accurate predictions.

In [55]:
## One-off download of the pedestrian sensor count history.
## Uncomment this whole cell and run it once: it queries Socrata dataset
## "b2ak-trbp" on data.melbourne.vic.gov.au, keeps the eight columns listed
## below and writes them to 'sensor_history.csv', which the next cell reads.
## NOTE(review): the second argument to Socrata() (the app token) is hard-coded;
## consider loading it from an environment variable before sharing this notebook.
# def sensor_count():
#     client = Socrata('data.melbourne.vic.gov.au', 'nlPM0PQJSjzCsbVqntjPvjB1f', None)
#     sensor_data_id = "b2ak-trbp"
#     results = client.get(sensor_data_id, limit=5000000)
#     df = pd.DataFrame.from_records(results)
#     df = df[['year', 'month', 'mdate', 'day', 'time', 'sensor_id', 'sensor_name', 'hourly_counts']]
#     return df

# sensor_history = sensor_count()

# sensor_history.to_csv('sensor_history.csv', index=False)

In [56]:
# Load the pedestrian counts from the CSV saved in the working directory.
sensor_history = pd.read_csv('sensor_history.csv')

In [57]:
# Preview the first five rows to check that the columns loaded as expected.
sensor_history.head()

Unnamed: 0,year,month,mdate,day,time,sensor_id,sensor_name,hourly_counts
0,2019,November,1,Friday,17,34,Flinders St-Spark La,300
1,2019,November,1,Friday,17,39,Alfred Place,604
2,2019,November,1,Friday,17,37,Lygon St (East),216
3,2019,November,1,Friday,17,40,Lonsdale St-Spring St (West),627
4,2019,November,1,Friday,17,36,Queen St (West),774


In [58]:
# Restrict the analysis to the readings taken in December 2021.
is_dec_2021 = (sensor_history['year'] == 2021) & (sensor_history['month'] == 'December')
sensor_history = sensor_history[is_dec_2021]

In [59]:
# Target: the hourly pedestrian count.  Predictors: every other column.
y = sensor_history['hourly_counts']
x = sensor_history.drop(columns=['hourly_counts'])

In [60]:
# One-hot encode the three categorical predictors, one indicator frame each.
x_day, x_month, x_sensor = (
    pd.get_dummies(x[column]) for column in ('day', 'month', 'sensor_name')
)

In [61]:
# Swap the raw categorical columns (and the sensor id) for their indicator
# columns, then replace any remaining missing values with 0.
replaced_columns = ['month', 'day', 'sensor_name', 'sensor_id']
x_drop = x.drop(columns=replaced_columns)
X = pd.concat([x_drop, x_day, x_month, x_sensor], axis=1).fillna(0)

In [62]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=41,
)

In [63]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the training rows.  fit() returns the
# estimator itself, so it can be bound directly and echoed as the cell output.
LR = LinearRegression().fit(X_train, y_train)
LR

LinearRegression()

In [64]:
# R^2 of the baseline on the rows it was trained on.  X_test / y_test are not
# used here, so this number does not measure out-of-sample accuracy.
LR.score(X_train, y_train)

0.5177308372742289

In [65]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Ridge(normalize=True)` is rejected by current scikit-learn (the option was
# deprecated in 1.0 and later removed), so the feature scaling is done by an
# explicit StandardScaler step instead.  `ridge` is therefore a Pipeline; the
# fitted Ridge estimator itself is available as `ridge[-1]`.
# NOTE(review): the old option scaled columns by their L2 norm rather than
# their standard deviation, so the effective penalty for a given alpha is not
# identical.  With alpha this small the fit is close to ordinary least squares
# either way.
ridge = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=.000001))
ridge.fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
ridge.score(X_train, y_train)

0.5177345336894372

In [66]:
from sklearn.linear_model import RidgeCV

# Ridge regression with the library's default cross-validated choice of alpha.
ridgecv = RidgeCV().fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
ridgecv.score(X_train, y_train)

0.5177328642114642

In [67]:
from sklearn.linear_model import Lasso

# L1-penalised regression with a very small penalty weight.
lasso = Lasso(alpha=1e-5).fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
lasso.score(X_train, y_train)

0.5177345336882573

In [68]:
from sklearn.linear_model import LassoLars

# Lasso fitted with the LARS algorithm, same small penalty weight as above.
lassolars = LassoLars(alpha=1e-5).fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
lassolars.score(X_train, y_train)

0.5177345327992975

In [69]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with the library's default hyper-parameters.
bayridge = BayesianRidge().fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
bayridge.score(X_train, y_train)

0.5177317159027988

In [70]:
from sklearn.linear_model import ARDRegression

# ARD regression; only the `alpha_1` hyper-parameter is overridden.
ARD = ARDRegression(alpha_1=1e-5).fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
ARD.score(X_train, y_train)

0.5176551516536102

In [71]:
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `ElasticNet(normalize=True)` is rejected by current scikit-learn (the option
# was deprecated in 1.0 and later removed), so the feature scaling is done by
# an explicit StandardScaler step instead.  `EN` is therefore a Pipeline; the
# fitted ElasticNet estimator itself is available as `EN[-1]`.
# NOTE(review): the old option scaled columns by their L2 norm rather than
# their standard deviation, so the effective penalty for a given alpha is not
# identical.  With alpha this small the fit is close to ordinary least squares
# either way.
EN = make_pipeline(StandardScaler(with_mean=False), ElasticNet(alpha=0.0000001))
EN.fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
EN.score(X_train, y_train)

0.5177331076673444

In [72]:
from sklearn.linear_model import LassoLarsIC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `LassoLarsIC(normalize=True)` is rejected by current scikit-learn (the
# `normalize` option has been removed from the linear models), so the feature
# scaling is done by an explicit StandardScaler step instead.  `lassolarsic`
# is therefore a Pipeline; the fitted LassoLarsIC estimator itself is
# available as `lassolarsic[-1]`.
lassolarsic = make_pipeline(StandardScaler(with_mean=False), LassoLarsIC())
lassolarsic.fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
lassolarsic.score(X_train, y_train)

0.5176819677518061

In [73]:
from sklearn.linear_model import LassoLarsCV

# LARS-based Lasso with the library's default cross-validated choice of alpha.
lassolarscv = LassoLarsCV().fit(X_train, y_train)
# Training-set R^2, for comparison with the other linear models.
lassolarscv.score(X_train, y_train)

0.5177074611614844

### Add a new dataset: climate microsensors.

This next dataset is pulled from the Melbourne Open Playground. The code block in the next cell needs to be uncommented to extract the data and copy it into a csv file that gets stored on your local drive.

The dataset comes from climate microsensors in Melbourne's CBD. For this analysis, I only want a picture of the climate across the city as a whole, without going into the detail of each sensor location.

In [74]:
## One-off download of the climate microsensor readings.
## Uncomment this whole cell and run it once: it queries Socrata dataset
## "u4vh-84j8" on data.melbourne.vic.gov.au and writes the result to
## 'micro_history.csv', which the next cell reads.
## NOTE(review): the second argument to Socrata() (the app token) is hard-coded;
## consider loading it from an environment variable before sharing this notebook.
## NOTE(review): if the query returns nothing, `df` is never assigned and the
## `return df` line raises UnboundLocalError.
# def micro_count():
#     client = Socrata('data.melbourne.vic.gov.au', 'nlPM0PQJSjzCsbVqntjPvjB1f', None)
#     micro_data_id = "u4vh-84j8"
#     results = client.get(micro_data_id, limit=4000000)
#     if results:
#         df = pd.DataFrame.from_records(results)
#     return df

# micro_history = micro_count()

# micro_history.to_csv('micro_history.csv', index=False)

In [75]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

micro_history = pd.read_csv('micro_history.csv')

# Microclimate sensor code -> name of the feature column its readings go into.
reading_columns = {
    '5a': 'temp',
    '5b': 'humidity',
    '5c': 'pressure',
    '0a': 'part_2-5',
    '0b': 'part_10',
    '6': 'wind',
}

# Keep only the sensor codes above, and only sites 1003 and 1009.  The explicit
# copy makes the filtered frame independent of the raw data, so the column
# assignments below do not trigger SettingWithCopyWarning.
wanted_rows = (
    micro_history.sensor_id.isin(list(reading_columns))
    & micro_history.site_id.isin([1003, 1009])
)
micro_history = micro_history[wanted_rows].copy()
micro_history = micro_history.drop(columns=['id', 'gateway_hub_id', 'type', 'units'])

# Spread the single `value` column into one column per measurement; in each
# new column the rows belonging to other sensor codes stay NaN.
for code, column in reading_columns.items():
    micro_history.loc[micro_history.sensor_id == code, column] = micro_history.value

# Parse the full timestamp.  The previous format='%Y-%m-%d' had no time
# component even though the hour is extracted below; pandas >= 2.0 applies the
# format strictly and raises on the leftover time-of-day text.
# NOTE(review): assumes local_time holds ISO-style timestamps that include a
# time part - confirm against the CSV.
micro_history['local_time'] = pd.to_datetime(micro_history.local_time)
micro_history['year'] = micro_history.local_time.dt.year
micro_history['month'] = micro_history.local_time.dt.month_name()
micro_history['mdate'] = micro_history.local_time.dt.day
micro_history['time'] = micro_history.local_time.dt.hour

# Collapse to one row per hour.  max() skips NaN, so each measurement column
# ends up holding the highest reading recorded within that hour.
micro_history = (
    micro_history
    .drop(columns=['site_id', 'sensor_id', 'value', 'local_time'])
    .groupby(by=['year', 'month', 'mdate', 'time'])
    .max()
)

# `micro_history` is now indexed by the four time keys; merge matches those
# index levels against the same-named columns of `sensor_history`.  The inner
# join drops pedestrian rows for hours that have no climate reading.
ped_climate = sensor_history.merge(
    micro_history, on=['year', 'month', 'mdate', 'time'], how='inner'
)

ped_climate = ped_climate[(ped_climate.year == 2021) & (ped_climate.month == 'December')]

x = ped_climate.drop(columns='hourly_counts')
y = ped_climate.hourly_counts

x_day = pd.get_dummies(x.day)
x_month = pd.get_dummies(x.month)
x_sensor = pd.get_dummies(x.sensor_name)

x_drop = x.drop(['month', 'day', 'sensor_name', 'sensor_id'], axis=1)
X = pd.concat([x_drop, x_day, x_month, x_sensor], axis=1)
# NOTE(review): any climate reading still missing becomes 0 here (for example
# a temperature of 0) - confirm this imputation is intended.
X = X.fillna(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=41)

LR = LinearRegression()
LR.fit(X_train, y_train)

# R^2 on the training rows (X_test / y_test are not used here).
LR.score(X_train, y_train)

0.5494506421578875

### Add a different dataset: school and public holidays:

The next dataset was one that was created manually - by looking up the details online, then entering them into a csv.
You will need to have this csv downloaded into your local directory for this to work.

This dataset has details of which dates are public holidays and which are school holidays. This could have an impact on how many people are walking around, right?

In [76]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

vic_holidays = pd.read_csv('vic_holidays.csv')

# Attach the holiday columns to every pedestrian reading by calendar date.
# The left join keeps all readings; missing values in the merged frame (such
# as dates absent from the holiday file) are set to 0.
ped_holidays = (
    sensor_history
    .merge(vic_holidays, on=('year', 'month', 'mdate'), how='left')
    .fillna(0)
)

in_dec_2021 = (ped_holidays['year'] == 2021) & (ped_holidays['month'] == 'December')
ped_holidays = ped_holidays[in_dec_2021]

# Target is the hourly count; everything else is a predictor.
y = ped_holidays['hourly_counts']
x = ped_holidays.drop(columns='hourly_counts')

# One-hot encode the categorical predictors and swap them in for the raw columns.
x_day, x_month, x_sensor = (
    pd.get_dummies(x[column]) for column in ('day', 'month', 'sensor_name')
)
x_drop = x.drop(columns=['month', 'day', 'sensor_name', 'sensor_id'])
X = pd.concat([x_drop, x_day, x_month, x_sensor], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=41
)

LR = LinearRegression().fit(X_train, y_train)

# R^2 on the training rows (X_test / y_test are not used here).
LR.score(X_train, y_train)

0.5195048500861614