In [1]:
import os
import time
from datetime import datetime
import pandas as pd
from sodapy import Socrata

# App token is taken from the environment so it is never hardcoded in the notebook;
# os.environ.get returns None when SODAPY_APPTOKEN is not set.
apptoken = os.environ.get("SODAPY_APPTOKEN")
# Host name handed to the Socrata client.
domain = "data.melbourne.vic.gov.au"
# Shared client object used by the cells below (e.g. client.datasets()).
client = Socrata(domain, apptoken)

In [103]:
# function to get attributes of a column in the df about datasets
def dseries(df, col, attrib, attrib_sub=None):
    """Pull one attribute out of every record stored in a DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``col`` holds subscriptable records (e.g. dicts).
    col : str
        Label of the column that contains the records.
    attrib : key or index
        Looked up in each record as ``record[attrib]``.
    attrib_sub : key or index, optional
        When given, a second lookup is applied: ``record[attrib][attrib_sub]``.

    Returns
    -------
    pandas.Series
        Extracted values in row order. The Series is built from a plain list,
        so it carries a fresh default integer index, not the index of ``df``.

    Raises
    ------
    KeyError
        If the records are dicts and one of them lacks the requested key.
    """
    # Identity test: `== None` can be overridden by a custom __eq__,
    # whereas `is None` always means "argument was not supplied".
    if attrib_sub is None:
        values = [record[attrib] for record in df[col]]
    else:
        values = [record[attrib][attrib_sub] for record in df[col]]
    return pd.Series(values)

# function to make dataframe of interested info
def interesteddf(rdf):
    """Flatten selected nested fields of the raw dataset frame into columns.

    Parameters
    ----------
    rdf : pandas.DataFrame
        Raw frame with a 'resource' and a 'classification' column, each
        holding one record per row that is indexable by the keys listed
        in the field table below.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``rdf`` and one column per entry of the field
        table, in table order, labelled with the short output names.
    """
    # (output label, source column in rdf, key, optional nested key)
    field_table = [
        ('name',            'resource',       'name',            None),
        ('id',              'resource',       'id',              None),
        ('parent_fxf',      'resource',       'parent_fxf',      None),
        ('description',     'resource',       'description',     None),
        ('data_upd_at',     'resource',       'data_updated_at', None),
        ('pv_last_wk',      'resource',       'page_views',      'page_views_last_week'),
        ('pv_last_mth',     'resource',       'page_views',      'page_views_last_month'),
        ('pv_total',        'resource',       'page_views',      'page_views_total'),
        ('download_count',  'resource',       'download_count',  None),
        ('categories',      'classification', 'categories',      None),
        ('domain_category', 'classification', 'domain_category', None),
        ('domain_tags',     'classification', 'domain_tags',     None),
        ('domain_metadata', 'classification', 'domain_metadata', None),
    ]

    # Extract every field with the shared helper, in table order.
    extracted = [
        dseries(rdf, source, key, nested_key)
        for _label, source, key, nested_key in field_table
    ]

    # Place the extracted Series side by side, then apply the output labels.
    flat = pd.concat(extracted, axis='columns')
    flat.columns = [label for label, _source, _key, _nested in field_table]

    return flat

# helper shared by both columns handled in tokenise2
def _tokenise_text(text, drop=()):
    """Lower-case a string Series, strip separators and split into word lists.

    Every replacement is a literal (``regex=False``) substitution, so the
    result does not depend on the pandas version's default for ``regex``.
    """
    text = text.str.lower()
    # Characters removed outright (no space inserted in their place).
    for char in drop:
        text = text.str.replace(char, '', regex=False)
    # Punctuation followed by a space collapses to a single space; a dot,
    # comma or dash NOT followed by a space (e.g. "3.5", "multi-level",
    # a trailing full stop) is left untouched, as in the original patterns.
    for separator in ('. ', ', ', '– ', '- '):
        text = text.str.replace(separator, ' ', regex=False)
    # Split on runs of whitespace into a list of word tokens.
    return text.str.split()


# function to tokenise description and name columns
def tokenise2(df):
    """Tokenise the 'description' and 'name' columns into lists of words.

    Steps per column: [1] lower-case, [2] remove unwanted stop characters
    ('. ', ', ', '– ', '- '; additionally '(' and ')' for 'name'),
    [3] split the string into a list of word tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'description' and 'name'.

    Returns
    -------
    pandas.DataFrame
        The SAME object that was passed in: both columns are overwritten in
        place, so the function is not idempotent. Call it on a fresh frame
        (or on ``df.copy()``) if the original strings are still needed.
    """
    df['description'] = _tokenise_text(df['description'])
    df['name'] = _tokenise_text(df['name'], drop=('(', ')'))

    return df

In [104]:
# Ask the Socrata client for its datasets (presumably the portal's catalogue
# metadata, one record per dataset) and load the result into a DataFrame.
rds = client.datasets()
rdf = pd.DataFrame.from_dict(rds)

# Flatten the nested 'resource' / 'classification' records into one column per field.
ds_df = interesteddf(rdf)
# Preview the first two rows, transposed so each field is shown on its own line.
ds_df.head(2).T

Unnamed: 0,0,1
name,Pedestrian Counting System - Monthly (counts p...,Tree canopies 2011 (Urban Forest)
id,b2ak-trbp,y79a-us3f
parent_fxf,[],[]
description,This dataset contains hourly pedestrian counts...,Tree canopy within City of Melbourne mapped us...
data_upd_at,2021-09-06T01:54:59.000Z,
pv_last_wk,223,21
pv_last_mth,1262,106
pv_total,71776,66769
download_count,8496,3178
categories,[finance],[environment]


In [105]:
# NOTE: tokenise2 overwrites the 'name' and 'description' columns of the frame it
# receives and returns that same frame, so ds_df_tok and ds_df are one object.
# Re-running this cell alone would re-apply the string operations to the
# already-tokenised columns; re-run the cell that builds ds_df first.
ds_df_tok = tokenise2(ds_df)
# Preview the first two rows, transposed so each field is shown on its own line.
ds_df_tok.head(2).T



Unnamed: 0,0,1
name,"[pedestrian, counting, system, monthly, counts...","[tree, canopies, 2011, urban, forest]"
id,b2ak-trbp,y79a-us3f
parent_fxf,[],[]
description,"[this, dataset, contains, hourly, pedestrian, ...","[tree, canopy, within, city, of, melbourne, ma..."
data_upd_at,2021-09-06T01:54:59.000Z,
pv_last_wk,223,21
pv_last_mth,1262,106
pv_total,71776,66769
download_count,8496,3178
categories,[finance],[environment]


In [None]:
## 000 ## other interesting info
#
## columns of dataset
# rdf['resource'][0]['columns_name']
# rdf['resource'][0]['columns_field_name']
# rdf['resource'][0]['columns_datatype']
# rdf['resource'][0]['columns_description']
#

In [5]:
# Summarise ds_df: column names, non-null counts and dtypes.
ds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             223 non-null    object
 1   id               223 non-null    object
 2   parent_fxf       223 non-null    object
 3   description      223 non-null    object
 4   data_upd_at      201 non-null    object
 5   pv_last_wk       223 non-null    int64 
 6   pv_last_mth      223 non-null    int64 
 7   pv_total         223 non-null    int64 
 8   download_count   223 non-null    int64 
 9   categories       223 non-null    object
 10  domain_category  223 non-null    object
 11  domain_tags      223 non-null    object
 12  domain_metadata  223 non-null    object
dtypes: int64(4), object(9)
memory usage: 22.8+ KB
