In [66]:
import os
import time
from datetime import datetime
import numpy as np
import pandas as pd
from sodapy import Socrata

apptoken = os.environ.get("SODAPY_APPTOKEN")
domain = "data.melbourne.vic.gov.au"
client = Socrata(domain, apptoken)

In [67]:
# function to get attributes of a column in the df about datasets
def dseries(df, col, attrib, attrib_sub=None):  
    ls = []
    if attrib_sub == None:
        for d in df[col]:
            ls.append(d[attrib])
    else:
        for d in df[col]:
            ls.append(d[attrib][attrib_sub])
    return pd.Series(ls)

# function to make dataframe of interested info
def interesteddf(rdf):
    a = dseries(rdf, 'resource', 'name')
    b = dseries(rdf, 'resource', 'id')
    c = dseries(rdf, 'resource', 'parent_fxf')
    d = dseries(rdf, 'resource', 'description')
    e = dseries(rdf, 'resource', 'data_updated_at')
    f = dseries(rdf, 'resource', 'page_views', 'page_views_last_week')
    g = dseries(rdf, 'resource', 'page_views', 'page_views_last_month')
    h = dseries(rdf, 'resource', 'page_views', 'page_views_total')
    i = dseries(rdf, 'resource', 'download_count')
    j = dseries(rdf, 'classification', 'categories')
    k = dseries(rdf, 'classification', 'domain_category')
    l = dseries(rdf, 'classification', 'domain_tags')
    m = dseries(rdf, 'classification', 'domain_metadata')

    col =  ['name', 'id', 'parent_fxf', 'description', 'data_upd_at', 'pv_last_wk', 'pv_last_mth', 'pv_total',
            'download_count', 'categories', 'domain_category', 'domain_tags', 'domain_metadata']

    df = pd.concat([a,b,c,d,e,f,g,h,i,j,k,l,m], axis='columns')
    df.columns = col

    return df

# function to tokenise description and name columns
def tokenise2(df):
    # tokenize by [1] making all lower case [2] removing some unwanted stop characters [3] splitting string into list of word tokens

    df['description'] = df['description'].str.lower().str.replace(r'\. ', ' ').str.replace('\, ', ' ')
    df['description'] = df['description'].str.replace('– ',' ').str.replace('- ',' ').str.split()

    df['name'] = df['name'].str.replace('(','').str.replace(')','')
    df['name'] = df['name'].str.lower().str.replace(r'\. ', ' ').str.replace('\, ', ' ')
    df['name'] = df['name'].str.replace('– ',' ').str.replace('- ',' ').str.split()

    return df

In [68]:
rds = client.datasets()
rdf = pd.DataFrame.from_dict(rds)

ds_df = interesteddf(rdf)
ds_df.head(2).T

Unnamed: 0,0,1
name,Pedestrian Counting System - Monthly (counts p...,Tree canopies 2011 (Urban Forest)
id,b2ak-trbp,y79a-us3f
parent_fxf,[],[]
description,This dataset contains hourly pedestrian counts...,Tree canopy within City of Melbourne mapped us...
data_upd_at,2021-09-06T01:54:59.000Z,
pv_last_wk,230,23
pv_last_mth,1251,102
pv_total,71864,66774
download_count,8507,3178
categories,[finance],[environment]


In [4]:
ds_df_tok = tokenise2(ds_df)
ds_df_tok.head(2).T



Unnamed: 0,0,1
name,"[pedestrian, counting, system, monthly, counts...","[tree, canopies, 2011, urban, forest]"
id,b2ak-trbp,y79a-us3f
parent_fxf,[],[]
description,"[this, dataset, contains, hourly, pedestrian, ...","[tree, canopy, within, city, of, melbourne, ma..."
data_upd_at,2021-09-06T01:54:59.000Z,
pv_last_wk,229,23
pv_last_mth,1248,102
pv_total,71860,66774
download_count,8505,3178
categories,[finance],[environment]


In [None]:
## 000 ## other interesting info
#
## columns of dataset
# rdf['resource'][0]['columns_name']
# rdf['resource'][0]['columns_field_name']
# rdf['resource'][0]['columns_datatype']
# rdf['resource'][0]['columns_description']
#

In [6]:
ds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             223 non-null    object
 1   id               223 non-null    object
 2   parent_fxf       223 non-null    object
 3   description      223 non-null    object
 4   data_upd_at      201 non-null    object
 5   pv_last_wk       223 non-null    int64 
 6   pv_last_mth      223 non-null    int64 
 7   pv_total         223 non-null    int64 
 8   download_count   223 non-null    int64 
 9   categories       223 non-null    object
 10  domain_category  223 non-null    object
 11  domain_tags      223 non-null    object
 12  domain_metadata  223 non-null    object
dtypes: int64(4), object(9)
memory usage: 22.8+ KB


In [19]:
ds_df['domain_metadata'][2]

[{'key': 'Quality_Known-Issues',
  'value': 'Date and year planted fields may have missing date values. '},
 {'key': "Quality_What's-included",
  'value': 'All data held by the City of Melbourne has been included'},
 {'key': 'Quality_Update-frequency', 'value': 'Daily'},
 {'key': 'Quality_Reliability-level', 'value': 'Reliable and timely'},
 {'key': 'Data-management_Source-data-update-frequency', 'value': 'Monthly'}]

In [23]:
for kv in ds_df['domain_metadata'][2]:
    print(kv['key'])

Quality_Known-Issues
Quality_What's-included
Quality_Update-frequency
Quality_Reliability-level
Data-management_Source-data-update-frequency


In [20]:
for kv in ds_df['domain_metadata'][0]:
    if kv['key']=='Quality_Update-frequency':
        print(f"[{kv['key']}] : {kv['value']}")
    if kv['key']=='Data-management_Source-data-update-frequency':
        print(f"[{kv['key']}] : {kv['value']}")

[Quality_Update-frequency] : Monthly
[Data-management_Source-data-update-frequency] : Hourly


In [78]:
keys = []
for record in ds_df['domain_metadata']:
    for kv in record:
        keys.append(kv['key'])
keys_ds = pd.Series(keys).drop_duplicates().reset_index(drop=True)

In [79]:
keys_ds

0                               Quality_What's-included
1                              Quality_Update-frequency
2                             Quality_Reliability-level
3                                  How-to-use_Linked-to
4          Data-management_Source-data-update-frequency
5                                  Quality_Known-Issues
6                        How-to-use_Further-information
7                  Quality_Source-data-update-frequency
8                        Quality_Data-quality-statement
9                Melbourne-Metadata_Further-Information
10                  Internal-management_IMGF-risk-level
11                 Internal-management_Update-mechanism
12    Internal-management_Source-system-(GIS,-AssetM...
13                 How-to-use_Applicable-standard-(URL)
dtype: object

In [204]:
df1 = ds_df.copy()
for key in keys_ds:
    df1[key] = np.nan
    for i in range(df1.shape[0]):
        for kv in df1['domain_metadata'][i]:
            if kv['key'] == key:
                df1[key][i] = kv['value']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [140]:
for qi in df1['Quality_Known-Issues'].unique():
    print(qi)

nan
Date and year planted fields may have missing date values. 
Parking Sensors are not operational on Public Holidays. Parking Sensors will show car parks as vacant when blocked by construction zones. 
None
No known issues
Note this dataset may not contain a reading for every sensor for every 15 minutes as the sensor devices might not have a reading for each value. There may be situations where no readings are reported for all sensors or only some readings are reported at a particular site. 
Dataset contains 'duration in minutes' field which differs from other parking sensor datasets which contain 'duration in seconds'.
Docks have since been removed after the end of the program in Nov 2019
Dataset contains 'duration in minutes' field which differs from other parking sensor datasets which contain 'duration in seconds'. Dataset was extracted from system so some active readings may not have been received from sensors towards the end of the dataset (May 26th). 
Listed branches will be dif

In [141]:
df1['Quality_Update-frequency'].unique()

array(['Monthly', 'Static (not updated)', 'Daily', '2 Minutes',
       'Quarterly', 'Annually', '15 Minutes', 'Every two years', 'Hourly'],
      dtype=object)

In [152]:
df1['Data-management_Source-data-update-frequency'].unique()

array(['Hourly', 'Monthly', 'Real-time', 'Annually', 'Daily', 'Weekly',
       'Static (not updated)', '15 Minutes', 'Every two years',
       'Quarterly'], dtype=object)

In [155]:
df1['Quality_Source-data-update-frequency'].unique()

array([nan, 'Daily', 'Monthly', 'Annually', 'Quarterly'], dtype=object)

In [145]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 27 columns):
 #   Column                                                     Non-Null Count  Dtype 
---  ------                                                     --------------  ----- 
 0   name                                                       223 non-null    object
 1   id                                                         223 non-null    object
 2   parent_fxf                                                 223 non-null    object
 3   description                                                223 non-null    object
 4   data_upd_at                                                201 non-null    object
 5   pv_last_wk                                                 223 non-null    int64 
 6   pv_last_mth                                                223 non-null    int64 
 7   pv_total                                                   223 non-null    int64 
 8   download_count      

In [None]:
# very wordy and different for each record
for qi in df1["Quality_What's-included"].unique():
    print(qi)

In [158]:
df1['Quality_Reliability-level'].unique()

array(['Reliable and timely', 'Useful with caveats', nan], dtype=object)

In [160]:
for hl in df1['How-to-use_Linked-to'].unique():
    print(hl)

http://www.pedestrian.melbourne.vic.gov.au/
nan
Environmental Data - Assets, Environmental Data - GHG Factors
Road Corridor
http://cityofmelbourne.maps.arcgis.com/apps/MapSeries/index.html?appid=5c6bf1fc3f094e418e55eb1bce03953e
https://data.melbourne.vic.gov.au/Economy/Census-of-Land-Use-and-Employment-Blocks/aia8-ryiq
Environmental Data - Assets, Environmental Data - Billing Data
Environmental Data - Billing, Environmental Data - GHG Factors
https://data.melbourne.vic.gov.au/Economy/Census-Of-Land-Use-And-Employment-CLUE-Small-Areas/gei8-3w86
http://biodiversity.melbourne.vic.gov.au/insects/#/butterflies
https://data.melbourne.vic.gov.au/Economy/Employment-and-floor-space-forecasts-by-suburb/gb88-t7zc
N/A


In [162]:
df1["Quality_Update-frequency"].unique()

array(['Monthly', 'Static (not updated)', 'Daily', '2 Minutes',
       'Quarterly', 'Annually', '15 Minutes', 'Every two years', 'Hourly'],
      dtype=object)

In [161]:
df1['Data-management_Source-data-update-frequency'].unique()

array(['Hourly', 'Monthly', 'Real-time', 'Annually', 'Daily', 'Weekly',
       'Static (not updated)', '15 Minutes', 'Every two years',
       'Quarterly'], dtype=object)

In [199]:
df1['Quality_Source-data-update-frequency'].unique()

array([nan, 'Daily', 'Monthly', 'Annually', 'Quarterly'], dtype=object)

In [202]:
df1[~df1['Quality_Source-data-update-frequency'].isnull()].T

Unnamed: 0,6,11,16,57,58,85,101
name,On-street Parking Bays,On-street Car Park Bay Restrictions,Property boundaries,On-street Car Parking Meters with Location,Public toilets,Building information 2017 map,Melbourne Visitor Contact Stats
id,crvt-b4kt,ntht-5rk7,e56b-j9mj,vdsi-4gtj,ru3z-44we,be3i-empa,e63d-rbu7
parent_fxf,[],[],[],[],[],[pmhb-s6pn],[]
description,This dataset contains spatial polygons which r...,Each row contains information about the restri...,This details the boundaries of all properties ...,This dataset shows the location and informatio...,Public toilets known about or operated by the ...,Data collected as part of the City of Melbourn...,The City of Melbourne provides visitor program...
data_upd_at,2021-08-23T16:04:33.000Z,2021-08-31T16:03:56.000Z,2021-08-23T16:14:32.000Z,2021-09-03T16:01:41.000Z,2021-09-08T16:07:37.000Z,2020-09-11T00:20:26.000Z,2019-01-14T00:47:27.000Z
pv_last_wk,125,62,34,14,22,11,4
pv_last_mth,558,301,178,66,88,32,32
pv_total,24168,17236,12448,3398,3379,2130,1768
download_count,39776,562971,2212262,2349,2495,252,2709
categories,[transportation],"[transportation, recreation, economy]",[],[transportation],[],"[housing & development, demographics, transpor...","[demographics, infrastructure]"


In [171]:
for qs in df1["Quality_Data-quality-statement"].unique():
    print(qs,"\n")

nan 

This data contains all spatially mapped on-street car parking bays. There is a large number of bays which have not yet been mapped. Data has been collected from a number of years, there may be parking bays that no longer exist that still show in dataset. Users are advised to contact council using the email listed to report data errors.  

A team of 4 census officers conduct field interviews which involves visiting every establishment in every building in the Census area (City of Melbourne municipality). Every commercial property is surveyed at least once every two years. 

Building accessibility data is collected to track accessibility for internal City of Melbourne purposes. This data is provided as a community service by the City of Melbourne. It is not and does not purport to be a complete guide. There may be errors or omissions. Data is liable to change. The City of Melbourne accepts no responsibility in respect of any claim arising from use or reliance upon this data. 

Data

In [172]:
for mi in df1["Melbourne-Metadata_Further-Information"].unique():
    print(mi,"\n")

nan 

https://www.melbourne.vic.gov.au/CommunityServices/SocialSupport/Documents/Helping_Out_Booklet.pdf 



In [165]:
for ki in df1['Quality_Known-Issues'].unique():
    print(ki,"\n")

nan 

Date and year planted fields may have missing date values.  

Parking Sensors are not operational on Public Holidays. Parking Sensors will show car parks as vacant when blocked by construction zones.  

None 

No known issues 

Note this dataset may not contain a reading for every sensor for every 15 minutes as the sensor devices might not have a reading for each value. There may be situations where no readings are reported for all sensors or only some readings are reported at a particular site.  

Dataset contains 'duration in minutes' field which differs from other parking sensor datasets which contain 'duration in seconds'. 

Docks have since been removed after the end of the program in Nov 2019 

Dataset contains 'duration in minutes' field which differs from other parking sensor datasets which contain 'duration in seconds'. Dataset was extracted from system so some active readings may not have been received from sensors towards the end of the dataset (May 26th).  

Listed br

In [168]:
for hi in df1['How-to-use_Further-information'].unique():
    print(hi,"\n")

nan 

http://www.melbourne.vic.gov.au/parking-and-transport/parking/Pages/parking-faqs.aspx 

https://data.melbourne.vic.gov.au/api/views/gh7s-qda8/files/51e3db26-a063-4825-ae6c-1a40c0259c75?download=true&filename=DAM_-_Info_-_DAM_May_2016_Meta_Data_Info.pdf 

http://www.melbourne.vic.gov.au/clue 

Data is updated yearly (but refreshed overnight) 

http://www.melbourne.vic.gov.au/ParkingTransportandRoads/Parking/Pages/InGroundSensors.aspx 

To create a spatial dataset use the road_segment and the seg_id fields to join the two datasets 

http://www.melbourne.vic.gov.au/Sustainability/RooftopProject/Pages/Rooftop.aspx 

http://open.dataforcities.org/ 

Data will be availible for duration of Open Innovation Competition only.  

http://www.arborcarbon.com.au/ 

https://www.melbourne.vic.gov.au/CommunityServices/SocialSupport/Documents/Helping_Out_Booklet.pdf 

http://www.melbourne.vic.gov.au/AboutCouncil/financegovernance/Budget/Documents/Annual_Plan_Budget_Document_2014_2015.pdf 

http://

In [178]:
df1['Internal-management_IMGF-risk-level'].unique()

array([nan, 'Low risk'], dtype=object)

In [181]:
df1['Internal-management_IMGF-risk-level'].isnull().sum()

222

In [176]:
df1['Internal-management_Update-mechanism'].unique()

array([nan, 'Scheduled FME job'], dtype=object)

In [177]:
df1['Internal-management_Update-mechanism'].isnull().sum()

222

In [173]:
df1['How-to-use_Applicable-standard-(URL)'].unique()

array([nan, 'http://www.abs.gov.au/ausstats/abs@.nsf/mf/1292.0',
       'http://standards.opencouncildata.org/#/dogzones'], dtype=object)

In [175]:
df1['How-to-use_Applicable-standard-(URL)'].isnull().sum()

221

In [203]:
# disinterested domain_metadata
dis = ['How-to-use_Applicable-standard-(URL)',\
       'Internal-management_Source-system-(GIS,-AssetMaster,-etc)',
       'Internal-management_Update-mechanism',
       'Internal-management_IMGF-risk-level',
       'Melbourne-Metadata_Further-Information',
       'Quality_Source-data-update-frequency']

In [205]:
df1 = df1.drop(columns=dis)

In [206]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 21 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   name                                          223 non-null    object
 1   id                                            223 non-null    object
 2   parent_fxf                                    223 non-null    object
 3   description                                   223 non-null    object
 4   data_upd_at                                   201 non-null    object
 5   pv_last_wk                                    223 non-null    int64 
 6   pv_last_mth                                   223 non-null    int64 
 7   pv_total                                      223 non-null    int64 
 8   download_count                                223 non-null    int64 
 9   categories                                    223 non-null    object
 10  do

In [207]:
df1.head(4).T

Unnamed: 0,0,1,2,3
name,Pedestrian Counting System - Monthly (counts p...,Tree canopies 2011 (Urban Forest),"Trees, with species and dimensions (Urban Forest)",On-street Parking Bay Sensors
id,b2ak-trbp,y79a-us3f,fp38-wiyy,vh2v-4nfs
parent_fxf,[],[],[],[]
description,This dataset contains hourly pedestrian counts...,Tree canopy within City of Melbourne mapped us...,"The City of Melbourne maintains more than 70,0...",Contains information from in-ground car parkin...
data_upd_at,2021-09-06T01:54:59.000Z,,2021-09-08T16:06:40.000Z,2021-09-09T04:15:48.000Z
pv_last_wk,230,23,97,135
pv_last_mth,1251,102,656,699
pv_total,71864,66774,38899,37067
download_count,8507,3178,6233,12079878
categories,[finance],[environment],"[environment, demographics]","[transportation, economy]"


In [None]:
# above extraction and refinement of interested meta data on datasets retrieved
# is consolidated in df1 above

## NEXT

# check Oscar's text processing / NLP / clustering code to try improve deliverable
# try to use my df1 above with Oscar's smarts to produce useful output
# for those users searching / looking / trying to understand CoM Open Data datasets