In [1]:
import os
import time
from datetime import datetime
import pandas as pd
from sodapy import Socrata

## based on 
# https://github.com/xmunoz/sodapy/blob/master/examples/basic_queries.ipynb
# https://github.com/xmunoz/sodapy
# /COM-Team_Project/deakin.py

## useful info from
# https://socratadiscovery.docs.apiary.io/#

apptoken = os.environ.get("SODAPY_APPTOKEN")
domain = "data.melbourne.vic.gov.au"

client = Socrata(domain, apptoken)


In [None]:
## 001 ##
# rds = client.datasets()
# datasets_df = pd.DataFrame.from_dict(rds)
# datasets_df.describe().transpose()

In [5]:
!export SODAPY_APPTOKEN="wMEkdLbVuXIpLiCFVic1PgiZ3"

In [4]:
## 002 ##
#fields_abt_datasets = ['resource','classification','metadata','permalink','link','owner','creator','preview_image_url']

## not interested : metadata, link, owner, creator, preview_image_url

## interested ...
# resource -> [1] name [2] id [3] parent_fxf [4] description [5] metadata_updated_at [6] data_updated_at
#  [7] page_views [8] columns_field_name [9] columns_datatype [10] columns_description [11] publication_date
# classification -> [1] domain_tags [2] domain_metadata
# permalink

In [4]:
## 003 ##
# define function to get interested info from datasets info

def get_interested_df(res):
    rname = []
    rid = []
    rparent_fxf = []
    rdescription = []
    rupdatedAt = []
    rcreatedAt = []
    rmetadata_updated_at = []
    rdata_updated_at = []
    rpage_views = []
    rcolumns_field_name = []
    rcolumns_datatype = []
    rcolumns_description = []
    rpublication_date = []
    cdomain_tags = []
    cdomain_metadata = []
    pl = []

    for ds in res:
        rname.append(ds['resource']['name'])
        rid.append(ds['resource']['id'])
        rparent_fxf.append(ds['resource']['parent_fxf'])
        rdescription.append(ds['resource']['description'])   
        rupdatedAt.append(ds['resource']['updatedAt'])
        rcreatedAt.append(ds['resource']['createdAt'])
        rmetadata_updated_at.append(ds['resource']['metadata_updated_at'])
        rdata_updated_at.append(ds['resource']['data_updated_at'])
        rpage_views.append(ds['resource']['page_views'])
        rcolumns_field_name.append(ds['resource']['columns_field_name'])
        rcolumns_datatype.append(ds['resource']['columns_datatype'])
        rcolumns_description.append(ds['resource']['columns_description'])
        rpublication_date.append(ds['resource']['publication_date'])
        cdomain_tags.append(ds['classification']['domain_tags'])
        cdomain_metadata.append(ds['classification']['domain_metadata'])
        pl.append(ds['permalink'])

    ndf = pd.DataFrame({
        "r.name":rname,
        "r.id":rid,
        "r.parent_fxf":rparent_fxf,
        "r.description":rdescription,
        "r.updatedAt":rupdatedAt,
        "r.createdAt":rcreatedAt,
        "r.metadata_updated_at":rmetadata_updated_at,
        "r.data_updated_at":rdata_updated_at,
        "r.page_views":rpage_views,
        "r.columns_field_name":rcolumns_field_name,
        "r.columns_datatype":rcolumns_datatype,
        "r.columns_description":rcolumns_description,
        "r.publication_date":rpublication_date,
        "c.domain_tags":cdomain_tags,
        "c.domain_metadata":cdomain_metadata,
        "permalink":pl
    })

    ndf["r.metadata_updated_at"] = pd.to_datetime(ndf["r.metadata_updated_at"])
    ndf["r.data_updated_at"] = pd.to_datetime(ndf["r.data_updated_at"])
    ndf["r.publication_date"] = pd.to_datetime(ndf["r.publication_date"])

    return ndf 

In [86]:
## 004 ##
# get interested datasets info and write to csv
rds = client.datasets()
df = get_interested_df(rds)
df.to_csv("com_datasets_info.csv", index=True)  # write to csv   

In [15]:
## 005 ##
# check top 20 most recently updated datasets

df.sort_values(by=["r.updatedAt"],ascending=False).head(20)

# seems there are 11 datasets that update regularly (multiple times within the hour)
# and another 3 probably hourly

Unnamed: 0,r.name,r.id,r.parent_fxf,r.description,r.updatedAt,r.createdAt,r.metadata_updated_at,r.data_updated_at,r.page_views,r.columns_field_name,r.columns_datatype,r.columns_description,r.publication_date,c.domain_tags,c.domain_metadata,permalink
3,On-street Parking Bay Sensors,vh2v-4nfs,[],Contains information from in-ground car parkin...,2021-07-31T16:35:43.000Z,2017-08-10T04:57:59.000Z,2021-07-31 16:35:30+00:00,2021-07-31 16:35:43+00:00,"{'page_views_last_week': 219, 'page_views_last...","[bay_id, st_marker_id, status, location, locat...","[Number, Text, Text, Point, Text, Text, Text, ...",[The unique ID of the parking bay where the pa...,2017-09-25 05:01:31+00:00,"[parking, sensor, near real-time, vacancy, tra...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/vh2v-4nfs
14,"Street furniture including bollards, bicycle r...",8fgn-5q6t,[],The City of Melbourne owns and maintains vario...,2021-07-31T16:34:46.000Z,2014-09-09T01:46:26.000Z,2021-07-31 16:34:04+00:00,2021-07-31 16:34:46+00:00,"{'page_views_last_week': 9, 'page_views_last_m...","[gis_id, description, asset_class, asset_type,...","[Number, Text, Text, Text, Text, Text, Text, T...","[, , , , , , , , , Excellent = 5, Poor = 1, , ...",2015-10-27 06:00:13+00:00,"[barbeques, bicycle rails, bollards, drinking ...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/8fgn-5q6t
116,Stormwater Pits,psq9-yz4x,[],This dataset captures all stormwater pits thro...,2021-07-31T16:32:13.000Z,2017-11-22T03:42:47.000Z,2021-07-31 16:26:39+00:00,2021-07-31 16:32:13+00:00,"{'page_views_last_week': 11, 'page_views_last_...","[asset_number, asset_description, construction...","[Number, Text, Text, Number, Text, Number, Tex...","[The councils unique id of this asset., Descri...",2018-02-05 04:36:20+00:00,"[asset, water, storm, pits]",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/psq9-yz4x
29,Pedestrian Counting System - Past Hour (counts...,d6mv-s43h,[],<b>Current issue 23/09/2020</b>\nPlease note: ...,2021-07-31T16:32:12.000Z,2018-10-19T03:55:44.000Z,2021-07-31 16:32:02+00:00,2021-07-31 16:32:12+00:00,"{'page_views_last_week': 34, 'page_views_last_...","[date, time, sensor_id, direction_1, direction...","[Calendar date, Text, Number, Number, Number, ...","[Date of the reading, Time of the reading in 2...",2019-01-17 04:46:56+00:00,"[pedestrian, sensors, foot traffic, traffic fl...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/d6mv-s43h
195,Soil Sensor Readings,mv4n-8k4v,[],This dataset contains historical readings for ...,2021-07-31T16:30:26.000Z,2020-04-01T01:21:18.000Z,2021-07-31 16:30:10+00:00,2021-07-31 16:30:26+00:00,"{'page_views_last_week': 10, 'page_views_last_...","[id, date, time, site_id, site_name, probe_id,...","[Number, Calendar date, Text, Number, Text, Nu...","[The unique id of the reading, The date in whi...",2020-04-01 01:43:33+00:00,"[soil, sensor, moisture, salinity, temperature...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/mv4n-8k4v
67,Microclimate Sensor Readings,u4vh-84j8,[],This dataset contains environmental readings f...,2021-07-31T16:30:18.000Z,2020-06-10T02:26:44.000Z,2021-07-31 16:30:05+00:00,2021-07-31 16:30:18+00:00,"{'page_views_last_week': 38, 'page_views_last_...","[id, site_id, sensor_id, value, local_time, ty...","[Number, Text, Text, Number, Calendar date, Te...","[Unique id for each record in the dataset, Loc...",2020-06-12 00:02:06+00:00,"[sensor, microclimate, environment, air qualit...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/u4vh-84j8
127,Microclimate Sensor Locations,irqv-hjr4,[],This dataset contains location and location de...,2021-07-31T16:30:15.000Z,2020-06-17T22:22:29.000Z,2021-07-31 16:30:05+00:00,2021-07-31 16:30:15+00:00,"{'page_views_last_week': 19, 'page_views_last_...","[site_id, description, last_data, latitude, lo...","[Text, Text, Calendar date, Number, Number, Po...","[location id , description of the location of ...",2020-06-23 04:59:37+00:00,"[climate, environment, air quality, sensor, en...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/irqv-hjr4
159,Tactile Ground Surface Indicator,v4zv-yqtp,[],This dataset contains tactile ground surface i...,2021-07-31T16:26:30.000Z,2018-01-16T21:38:53.000Z,2021-07-31 16:26:20+00:00,2021-07-31 16:26:30+00:00,"{'page_views_last_week': 4, 'page_views_last_m...","[asset_number, description, road_segment, lat,...","[Number, Text, Text, Number, Number, Point, Te...","[The councils unique id of this asset., Descri...",2018-02-05 05:55:06+00:00,"[accessibility, wayfinding, footpath, walking,...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/v4zv-yqtp
166,Pumping Station Locations,55z9-zqzu,[],A pump station is a holding chamber that pumps...,2021-07-31T16:08:03.000Z,2017-11-22T02:52:09.000Z,2021-07-31 16:07:52+00:00,2021-07-31 16:08:03+00:00,"{'page_views_last_week': 6, 'page_views_last_m...","[asset_number, asset_description, number_of_pe...","[Number, Text, Number, Number, Number, Number,...","[The councils unique id of this asset., Descri...",2018-02-05 04:54:40+00:00,"[pumpingstation, assets, engineering, asset, d...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/55z9-zqzu
58,Public toilets,ru3z-44we,[],Public toilets known about or operated by the ...,2021-07-31T16:07:42.000Z,2016-01-29T03:13:34.000Z,2021-07-31 16:07:32+00:00,2021-07-31 16:07:42+00:00,"{'page_views_last_week': 9, 'page_views_last_m...","[name, female, male, wheelchair, operator, bab...","[Text, Text, Text, Text, Text, Text, Text, Text]","[, , , , , , , ]",2017-09-18 04:19:15+00:00,"[toilets, opencouncildata, ocd-toilets-1.2, ev...",[{'key': 'Data-management_Source-data-update-f...,https://data.melbourne.vic.gov.au/d/ru3z-44we


In [95]:
## 006 ##
r1 = client.get_all('vh2v-4nfs')
df_r1 = pd.DataFrame.from_dict(r1)
df_r1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1043 entries, 0 to 1042
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   bay_id                       1043 non-null   object
 1   st_marker_id                 1043 non-null   object
 2   status                       1043 non-null   object
 3   location                     1043 non-null   object
 4   lat                          1043 non-null   object
 5   lon                          1043 non-null   object
 6   :@computed_region_evbi_jbp8  1043 non-null   object
dtypes: object(7)
memory usage: 57.2+ KB


In [24]:
## 007 ##
# loop to read dataset in below hours and intervals, buffering our dataset in a csv

interval = 5
hours = 1
runs = int(hours*(60/interval))
ds_id = 'vh2v-4nfs'
ds_no = "003"
ds_fname = "datasets/"+ds_no+"_"+ds_id+"__bufferred"+".csv"

for i in range(runs):
    r1 = client.get_all(ds_id)  # get snapshot of dataset using sodapy api 
    df_temp = pd.DataFrame.from_dict(r1)  # read dict to dataframe
    df_temp['db_read_time']=datetime.now()  # add timestampe column
    if i == 0:
        df_temp.to_csv(ds_fname, mode='w', header=True)  # first write
    else:
        df_temp.to_csv(ds_fname, mode='a', header=False)  # subsequent append to csv        
    print(f"run [{i}] at [{datetime.now()}]")   # print loop progress to keep track
    print(f"--> [{df_temp.bay_id.nunique()}] unique bay_ids picked up in this run")
    time.sleep(60*interval)  # sleep 15min before next read and append
    

run [0] at [2021-08-02 21:14:55.177216]
--> [1116] unique bay_ids picked up in this run
run [1] at [2021-08-02 21:19:57.044347]
--> [1116] unique bay_ids picked up in this run
run [2] at [2021-08-02 21:24:59.229583]
--> [1116] unique bay_ids picked up in this run
run [3] at [2021-08-02 21:30:01.112287]
--> [1031] unique bay_ids picked up in this run
run [4] at [2021-08-02 21:35:03.094059]
--> [1031] unique bay_ids picked up in this run
run [5] at [2021-08-02 21:40:05.000949]
--> [1031] unique bay_ids picked up in this run
run [6] at [2021-08-02 21:45:07.015876]
--> [1116] unique bay_ids picked up in this run
run [7] at [2021-08-02 21:50:09.549744]
--> [1116] unique bay_ids picked up in this run
run [8] at [2021-08-02 21:55:11.505166]
--> [1032] unique bay_ids picked up in this run
run [9] at [2021-08-02 22:00:13.501655]
--> [1032] unique bay_ids picked up in this run
run [10] at [2021-08-02 22:05:15.281852]
--> [1116] unique bay_ids picked up in this run
run [11] at [2021-08-02 22:10:1

In [100]:
## 008 ##
df_fetched = pd.read_csv(ds_fname)
df_fetched.db_read_time = pd.to_datetime(df_fetched.db_read_time)
df_fetched.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22991 entries, 0 to 22990
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Unnamed: 0                   22991 non-null  int64         
 1   bay_id                       22991 non-null  int64         
 2   st_marker_id                 22991 non-null  object        
 3   status                       22991 non-null  object        
 4   location                     22991 non-null  object        
 5   lat                          22991 non-null  float64       
 6   lon                          22991 non-null  float64       
 7   :@computed_region_evbi_jbp8  22991 non-null  int64         
 8   db_read_time                 22991 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 1.6+ MB


In [2]:
## 009 ##
# function to download dataset (assume json format) and save it to csv in datasets folder

def download_dataset(client,ds_no,ds_id):
    ds_fname = ds_no+"_"+ds_id+".csv"

    result = client.get_all(ds_id)  # get result of dataset using sodapy api 
    df_temp = pd.DataFrame.from_dict(result)  # read dict to dataframe
    df_temp['db_read_time']=datetime.now()  # add timestampe column
    
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    df_temp.to_csv("datasets/"+ds_no+"_"+ds_id+"_dl_at__"+timestamp+".csv",
                   mode='w', header=True)  # write to csv

In [4]:
## 010 ##
download_dataset(client,"075","wwkr-v8s7")

In [3]:
## 012 ##
download_dataset(client,"056","vdsi-4gtj")

In [18]:
## 013 ##
# more comprehensive than code block ## 007 ##
# this code block to run for some time --> to buffer some parking sensor data to play with

import os
import time
from datetime import datetime
import pandas as pd
from sodapy import Socrata

apptoken = os.environ.get("wMEkdLbVuXIpLiCFVic1PgiZ3")
domain = "data.melbourne.vic.gov.au"
client = Socrata(domain, apptoken)

interval, hours = 15, 1
runs = int(hours*(60/interval))
ds_id, ds_no = 'vh2v-4nfs', "003"
ds_fname = os.path.join('datasets', ds_no+"_"+ds_id+"__bufferred"+".csv")
uq_fname = os.path.join('datasets', ds_no+"_"+ds_id+"__uniqueBays"+".csv")
ds_col1 = ['bay_id', 'st_marker_id', 'status', 'lat', 'lon']
ds_col2 = ['bay_id', 'st_marker_id', 'lat', 'lon', 'location']
log_fname = os.path.join('datasets', ds_no+"_"+ds_id+"__log"+".txt")
for i in range(runs):
    r1 = client.get_all(ds_id)  # get snapshot of dataset using sodapy api
    df_temp = pd.DataFrame.from_dict(r1)  # read dict to dataframe
    df_temp1 = df_temp[ds_col1]
    df_temp2 = df_temp[ds_col2]
    df_temp1['db_read_time'] = datetime.now()  # add timestampe column
    if i == 0:
        df_temp1.to_csv(ds_fname, mode='w', header=True,
                        index=False)  # first write
        df_temp2.to_csv(uq_fname, mode='w', header=True, index=False)
        o = open(log_fname,'w')
        print(f"{datetime.now()}, dataset retrieved and written to csv", file=o)
        print(f"{datetime.now()}, unique parking bays csv written", file=o)
        o.close()
    else:
        df_temp1.to_csv(ds_fname, mode='a', header=False,
                        index=False)  # subsequent append to csv
        df_temp2prev = pd.read_csv(uq_fname)
        df_temp2new = pd.concat(
            [df_temp2prev, df_temp2], axis='index').drop_duplicates()
        df_temp2new.to_csv(uq_fname, mode='w', header=True, index=False)
        o = open(log_fname,'a')
        print(f"{datetime.now()}, dataset retrieved and appended to csv", file=o)
        print(f"{datetime.now()}, unique parking bays csv updated", file=o)
        o.close()
    time.sleep(60*interval)  # sleep 15min before next read and append


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyboardInterrupt: 