In [1]:
#!/usr/bin/env python

# make sure to install these packages before running:
!pip install sodapy

Collecting sodapy
  Downloading https://files.pythonhosted.org/packages/9e/74/95fb7d45bbe7f1de43caac45d7dd4807ef1e15881564a00eef489a3bb5c6/sodapy-2.1.0-py2.py3-none-any.whl
Installing collected packages: sodapy
Successfully installed sodapy-2.1.0


In [2]:
#import the libraries 
import numpy as np
import pandas as pd
from pandas import DataFrame as df, Series as se
#import the library for the API
from sodapy import Socrata

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Open an unauthenticated Socrata client. Passing None as the application
# token (and no username/password) restricts us to public data sets.
OPEN_DATA_DOMAIN = "data.melbourne.vic.gov.au"
PEDESTRIAN_COUNTS_DATASET = "b2ak-trbp"
PEDESTRIAN_COUNTS_ROW_LIMIT = 3391522  # upper bound on the number of rows requested

client = Socrata(OPEN_DATA_DOMAIN, None)

# Download the hourly pedestrian counts published by the City of Melbourne
# (the notebook author targets 2009-05-01 through 2020-10-31).
results = client.get(PEDESTRIAN_COUNTS_DATASET, limit=PEDESTRIAN_COUNTS_ROW_LIMIT)

# Turn the list of records returned by the API into a DataFrame.
results_df = pd.DataFrame.from_records(results)




In [4]:
# Preview the raw records as returned by the API (one row per sensor per hour).
results_df

Unnamed: 0,id,date_time,year,month,mdate,day,time,sensor_id,sensor_name,hourly_counts
0,2887628,2019-11-01T17:00:00.000,2019,November,1,Friday,17,34,Flinders St-Spark La,300
1,2887629,2019-11-01T17:00:00.000,2019,November,1,Friday,17,39,Alfred Place,604
2,2887630,2019-11-01T17:00:00.000,2019,November,1,Friday,17,37,Lygon St (East),216
3,2887631,2019-11-01T17:00:00.000,2019,November,1,Friday,17,40,Lonsdale St-Spring St (West),627
4,2887632,2019-11-01T17:00:00.000,2019,November,1,Friday,17,36,Queen St (West),774
...,...,...,...,...,...,...,...,...,...,...
3391517,3391518,2020-10-31T23:00:00.000,2020,October,31,Saturday,23,67,Flinders Ln -Degraves St (South),0
3391518,3391519,2020-10-31T23:00:00.000,2020,October,31,Saturday,23,68,Flinders Ln -Degraves St (North),0
3391519,3391520,2020-10-31T23:00:00.000,2020,October,31,Saturday,23,69,Flinders Ln -Degraves St (Crossing),0
3391520,3391521,2020-10-31T23:00:00.000,2020,October,31,Saturday,23,70,Errol Street (East),0


In [5]:
# Inspect the column dtypes of the raw frame before any casting is applied.
results_df.dtypes

id               object
date_time        object
year             object
month            object
mdate            object
day              object
time             object
sensor_id        object
sensor_name      object
hourly_counts    object
dtype: object

In [6]:
# Assemble a "day-MonthName-year" string (e.g. "1-November-2019") from the
# separate calendar columns; it is converted to a real datetime in a later cell.
day_part, month_part, year_part = results_df['mdate'], results_df['month'], results_df['year']
results_df['date'] = day_part + '-' + month_part + '-' + year_part

# The calendar/time columns (and the row id) are no longer needed once the
# combined date string exists.
redundant_columns = ['id', 'date_time', 'year', 'month', 'mdate', 'day', 'time']
results_df = results_df.drop(columns=redundant_columns)

In [7]:
# Cast the sensor identifier and the hourly count from object (string) dtype
# to integers; every other column keeps its dtype.
results_df = results_df.astype({'sensor_id': 'int', 'hourly_counts': 'int'})
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,1-November-2019
1,39,Alfred Place,604,1-November-2019
2,37,Lygon St (East),216,1-November-2019
3,40,Lonsdale St-Spring St (West),627,1-November-2019
4,36,Queen St (West),774,1-November-2019
...,...,...,...,...
3391517,67,Flinders Ln -Degraves St (South),0,31-October-2020
3391518,68,Flinders Ln -Degraves St (North),0,31-October-2020
3391519,69,Flinders Ln -Degraves St (Crossing),0,31-October-2020
3391520,70,Errol Street (East),0,31-October-2020


In [8]:
# Check the dtypes again after 'sensor_id' and 'hourly_counts' were cast to int.
results_df.dtypes

sensor_id         int64
sensor_name      object
hourly_counts     int64
date             object
dtype: object

In [9]:
# Convert the 'date' column from "day-MonthName-year" strings
# (e.g. "1-November-2019") to datetime64[ns].
# An explicit format is given instead of relying on format inference: the
# strings are not ISO-8601, so inference can fall back to slow element-by-element
# parsing on millions of rows, and an explicit format also makes any malformed
# value raise instead of being guessed at.
results_df['date'] = pd.to_datetime(results_df['date'], format='%d-%B-%Y')
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,2019-11-01
1,39,Alfred Place,604,2019-11-01
2,37,Lygon St (East),216,2019-11-01
3,40,Lonsdale St-Spring St (West),627,2019-11-01
4,36,Queen St (West),774,2019-11-01
...,...,...,...,...
3391517,67,Flinders Ln -Degraves St (South),0,2020-10-31
3391518,68,Flinders Ln -Degraves St (North),0,2020-10-31
3391519,69,Flinders Ln -Degraves St (Crossing),0,2020-10-31
3391520,70,Errol Street (East),0,2020-10-31


In [10]:
# Confirm that 'date' is now a datetime column.
results_df.dtypes

sensor_id                 int64
sensor_name              object
hourly_counts             int64
date             datetime64[ns]
dtype: object

In [11]:
# Keep only the observations dated strictly after 31 December 2014,
# i.e. the data from 2015 onwards.
is_after_2014 = results_df['date'] > '2014-12-31'
results_df = results_df.loc[is_after_2014]

In [12]:
# Preview the frame after restricting it to dates later than 2014-12-31.
results_df

Unnamed: 0,sensor_id,sensor_name,hourly_counts,date
0,34,Flinders St-Spark La,300,2019-11-01
1,39,Alfred Place,604,2019-11-01
2,37,Lygon St (East),216,2019-11-01
3,40,Lonsdale St-Spring St (West),627,2019-11-01
4,36,Queen St (West),774,2019-11-01
...,...,...,...,...
3391517,67,Flinders Ln -Degraves St (South),0,2020-10-31
3391518,68,Flinders Ln -Degraves St (North),0,2020-10-31
3391519,69,Flinders Ln -Degraves St (Crossing),0,2020-10-31
3391520,70,Errol Street (East),0,2020-10-31


In [13]:
# Sum the hourly counts into one total per (date, sensor_id) pair.
# as_index=False keeps the grouping keys as ordinary columns.
new_results_df = (
    results_df
    .groupby(['date', 'sensor_id'], as_index=False)['hourly_counts']
    .sum()
)
new_results_df

Unnamed: 0,date,sensor_id,hourly_counts
0,2015-01-01,2,21217
1,2015-01-01,3,32695
2,2015-01-01,4,36958
3,2015-01-01,5,31224
4,2015-01-01,6,20457
...,...,...,...
98662,2020-10-31,67,4718
98663,2020-10-31,68,2447
98664,2020-10-31,69,3428
98665,2020-10-31,70,3035


In [14]:
# After aggregation the column holds a daily total per sensor, so give it a
# name that reflects that.
daily_total_names = {"hourly_counts": "Total_Pedestrian_Count"}
new_results_df = new_results_df.rename(columns=daily_total_names)
new_results_df

Unnamed: 0,date,sensor_id,Total_Pedestrian_Count
0,2015-01-01,2,21217
1,2015-01-01,3,32695
2,2015-01-01,4,36958
3,2015-01-01,5,31224
4,2015-01-01,6,20457
...,...,...,...
98662,2020-10-31,67,4718
98663,2020-10-31,68,2447
98664,2020-10-31,69,3428
98665,2020-10-31,70,3035


In [29]:
# Collapse the per-sensor daily totals into a single total per date
# (summed over all sensors) and name the resulting column accordingly.
new_results_df_per_day = (
    new_results_df
    .groupby('date', as_index=False)['Total_Pedestrian_Count']
    .sum()
    .rename(columns={"Total_Pedestrian_Count": "Total_Pedestrian_Count_per_day"})
)
new_results_df_per_day

Unnamed: 0,date,Total_Pedestrian_Count_per_day
0,2015-01-01,471563
1,2015-01-02,389048
2,2015-01-03,291761
3,2015-01-04,382364
4,2015-01-05,487220
...,...,...
2126,2020-10-27,177267
2127,2020-10-28,311278
2128,2020-10-29,263334
2129,2020-10-30,313225


In [260]:
# Load the weather dataset and cast its 'date' column from object to datetime
# in one step. export_df.csv was created by Hung son, see Trello card
# https://trello.com/c/7dcc09P9
dataset = pd.read_csv('export_df.csv').astype({'date': 'datetime64[ns]'})

dataset.dtypes

date                                    datetime64[ns]
Rainfall amount (millimetres)                  float64
Minimum temperature (Degree C)                 float64
Maximum temperature (Degree C)                 float64
Daily global solar exposure (MJ/m*m)           float64
dtype: object

In [261]:
# Count the missing values in each column of the weather dataset.
dataset.isna().sum()

date                                    0
Rainfall amount (millimetres)           8
Minimum temperature (Degree C)          0
Maximum temperature (Degree C)          0
Daily global solar exposure (MJ/m*m)    1
dtype: int64

In [262]:
# Attach the daily pedestrian total to the weather observations. The join is
# an inner join on 'date', so only dates present in both frames are kept.
dataset_wt_weather_features = pd.merge(dataset, new_results_df_per_day, on='date', how='inner')

dataset_wt_weather_features

Unnamed: 0,date,Rainfall amount (millimetres),Minimum temperature (Degree C),Maximum temperature (Degree C),Daily global solar exposure (MJ/m*m),Total_Pedestrian_Count_per_day
0,2015-01-01,0.0,13.3,26.9,23.6,471563
1,2015-01-02,0.0,15.4,38.8,26.8,389048
2,2015-01-03,0.0,20.0,38.2,26.5,291761
3,2015-01-04,4.6,16.3,21.4,25.2,382364
4,2015-01-05,0.0,15.0,22.0,30.7,487220
...,...,...,...,...,...,...
2126,2020-10-27,0.0,11.1,19.6,20.1,177267
2127,2020-10-28,0.0,9.5,20.9,26.7,311278
2128,2020-10-29,0.0,11.2,24.3,17.1,263334
2129,2020-10-30,0.0,12.9,18.6,21.0,313225


In [238]:
# Write the merged weather + pedestrian dataset to a CSV file.
import csv

# Header row: the column names of the frame, in order.
fields = list(dataset_wt_weather_features.columns)

# Data rows: the frame's values as an array (one row per date).
rows = dataset_wt_weather_features.values

# Name of the CSV file.
filename = "dataset_wt_weather_features.csv"

# newline='' is required when handing a file to the csv module: the writer
# emits its own line terminators, and without it an extra blank line appears
# between rows on platforms that translate '\n' to '\r\n'.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)

In [263]:
# Add the day-of-week index. pandas numbers Monday as 0 and Sunday as 6, so the
# values are shifted by one to represent Monday as 1 and Sunday as 7.
dataset_wt_weather_features['day'] = dataset_wt_weather_features['date'].dt.dayofweek + 1

In [264]:
# Add the month number of each date (January = 1 ... December = 12).
dataset_wt_weather_features['monthly_index'] = dataset_wt_weather_features['date'].dt.month

In [265]:
# Load the public holiday dataset and keep only the 'Holiday' date column.
# .copy() makes `holidays` an independent frame: columns are assigned on it in
# a later cell, and assigning into a bare selection of `public` triggers
# SettingWithCopyWarning (currently hidden by the global warnings filter).
public = pd.read_csv('Public_Holidays.csv')
holidays = public[['Holiday']].copy()

In [266]:
# Preview the list of public holiday dates.
holidays

Unnamed: 0,Holiday
0,2015-01-01
1,2015-01-26
2,2015-03-14
3,2015-03-25
4,2015-03-26
...,...
74,2020-04-12
75,2020-04-13
76,2020-04-25
77,2020-06-08


In [267]:
# Cast the 'Holiday' column from object to datetime and tag every row with a
# constant 'Public Holiday' label that is carried into the main dataset later.
holidays = holidays.assign(
    Holiday=holidays['Holiday'].astype('datetime64[ns]'),
    Public_Holiday='Public Holiday',
)

In [268]:
# Confirm the dtypes of the holiday frame after the conversion.
holidays.dtypes

Holiday           datetime64[ns]
Public_Holiday            object
dtype: object

In [269]:
# Bring the public-holiday label into the dataset by joining on the date.
# Both frames are indexed by their date column first; the outer join keeps
# every date from either side, so rows without a holiday get NaN in
# 'Public_Holiday'.
# NOTE(review): a later cell renames a column called 'index' after
# reset_index(), which suggests the joined index ends up unnamed -- confirm
# before changing the join type.
dataset_wt_weather_features = (
    dataset_wt_weather_features
    .set_index("date")
    .join(holidays.set_index("Holiday"), how="outer")
)

In [270]:
# Move the date index produced by the join back into an ordinary column.
dataset_wt_weather_features = dataset_wt_weather_features.reset_index()

In [271]:
# Give the column created by reset_index() a meaningful name.
dataset_wt_weather_features = dataset_wt_weather_features.rename(columns={'index': 'Date'})

In [275]:
# Dates without a matching public holiday have NaN after the join; label them
# explicitly. The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: in-place mutation through a
# column selection is deprecated and does not update the parent frame when
# pandas Copy-on-Write is active.
dataset_wt_weather_features['Public_Holiday'] = (
    dataset_wt_weather_features['Public_Holiday'].fillna('Not a public holiday')
)

In [281]:
# Put the columns in their final order: calendar features first, then the
# weather measurements, and the target (daily pedestrian total) last.
final_column_order = [
    'Date',
    'day',
    'monthly_index',
    'Public_Holiday',
    'Rainfall amount (millimetres)',
    'Minimum temperature (Degree C)',
    'Maximum temperature (Degree C)',
    'Daily global solar exposure (MJ/m*m)',
    'Total_Pedestrian_Count_per_day',
]
dataset_wt_weather_features = dataset_wt_weather_features[final_column_order]

In [283]:
# Save the final dataset (calendar, holiday, weather and pedestrian columns) to CSV.
# NOTE(review): index=False is not passed, so the row index is presumably
# written as an extra unnamed first column -- confirm that this is intended.
dataset_wt_weather_features.to_csv('dataset_with_public_holidays.csv')

In [24]:
# Fetch, through the same API client, the dataset containing status, location
# and directional information for each pedestrian sensor device installed
# throughout the city, and load the records into a DataFrame.
SENSOR_LOCATIONS_DATASET = "h57g-5234"
results2 = client.get(SENSOR_LOCATIONS_DATASET, limit=2000)
results_df2 = pd.DataFrame.from_records(results2)

In [25]:
# Cast 'sensor_id' from object to int and the 'latitude'/'longitude'
# coordinates from object to float; the other columns keep their dtype.
results_df2 = results_df2.astype({
    'sensor_id': 'int',
    'latitude': 'float',
    'longitude': 'float',
})

In [26]:
# Examine the dtypes after the conversion above.
# The frame must NOT be rebuilt from `results2` here: doing so throws away the
# int/float casts, leaving 'sensor_id' as strings, which then cannot match the
# integer 'sensor_id' of the pedestrian counts when the two are joined.
results_df2.dtypes

In [None]:
# Attach the sensor metadata to the per-sensor daily totals. Both frames are
# indexed by 'sensor_id' and joined on that index (a left join, so every
# pedestrian-count row is kept); only the columns needed downstream are retained.
counts_by_sensor = new_results_df.set_index('sensor_id')
locations_by_sensor = results_df2.set_index('sensor_id')
merged_columns = ['date', 'sensor_description', 'latitude', 'longitude', 'Total_Pedestrian_Count']
merged_dataset = counts_by_sensor.join(locations_by_sensor)[merged_columns]

In [None]:
# Preview the merged frame (indexed by sensor_id) to check the join result.
merged_dataset

In [None]:
# Derive the weekday name of each observation from its date.
merged_dataset['day'] = merged_dataset['date'].dt.day_name()

In [None]:
# Bar chart of how many daily observations were recorded for each sensor.
# The frame is indexed by sensor_id, so counting index values gives the number
# of rows per sensor.
z = merged_dataset.index.value_counts().plot(kind = 'bar', figsize = (20,10))
z.set_xlabel("Sensor_ID")
z.set_ylabel("Frequency of observations")
# The trailing semicolon suppresses the text repr of the last call. The former
# bare `z.plot()` was removed: called without data it draws nothing and only
# printed an empty list.
z.set_title("Recorded observations based on each Sensor_ID");