# Data Workflow

### Python/SQL Setup

In [1]:
from sqlalchemy import create_engine, inspect
import psycopg2
import psycopg2.extras
import json
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import geoalchemy2
from shapely import wkt

def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        host       = db_conn_dict['host']
        db_user    = db_conn_dict['user']
        db_pw      = db_conn_dict['password']
        default_db = db_conn_dict['user']
        try:
            db = create_engine('postgresql+psycopg2://'+db_user+':'+db_pw+'@'+host+'/'+default_db, echo=False)
            conn = db.connect()
            print('Connected successfully.')
        except Exception as e:
            print("Unable to connect to the database.")
            print(e)
            db, conn = None, None
        return db,conn

def query(conn, sqlcmd, args=None, df=True):
    result = pd.DataFrame() if df else None
    try:
        if df:
            result = pd.read_sql_query(sqlcmd, conn, params=args)
        else:
            result = conn.execute(sqlcmd, args).fetchall()
            result = result[0] if len(result) == 1 else result
    except Exception as e:
        print("Error encountered: ", e, sep='\n')
    return result

In [2]:
credentials = "Credentials.json"
db, conn = pgconnect(credentials)

Connected successfully.


In [3]:
# Creating new schema
sql = """
CREATE SCHEMA IF NOT EXISTS sa2;
SET search_path TO sa2;
"""
conn.execute(sql)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fbbd79d45e0>

In [4]:
# Adding PostGIS to sa2 database
conn.execute("CREATE EXTENSION IF NOT EXISTS postgis SCHEMA sa2;")
query(conn, "SELECT PostGIS_version()")

Unnamed: 0,postgis_version
0,3.3 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


In [5]:
# Run if PostGIS is already installed, but not in sa2
sql = """
UPDATE pg_extension
SET extrelocatable = TRUE
WHERE extname = 'postgis';

ALTER EXTENSION postgis
SET SCHEMA sa2;
"""
#conn.execute(sql)

query(conn, "SELECT PostGIS_version()")

Unnamed: 0,postgis_version
0,3.3 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


### Task 1: Cleaning & Importing 

#### SA2 Regions dataset


In [6]:
# Loading and inspecting the dataset
path = "SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp"
regions = gpd.read_file(path)
print(regions.shape)
print(regions.columns)
print(regions.dtypes)

(2473, 17)
Index(['SA2_CODE21', 'SA2_NAME21', 'CHG_FLAG21', 'CHG_LBL21', 'SA3_CODE21',
       'SA3_NAME21', 'SA4_CODE21', 'SA4_NAME21', 'GCC_CODE21', 'GCC_NAME21',
       'STE_CODE21', 'STE_NAME21', 'AUS_CODE21', 'AUS_NAME21', 'AREASQKM21',
       'LOCI_URI21', 'geometry'],
      dtype='object')
SA2_CODE21      object
SA2_NAME21      object
CHG_FLAG21      object
CHG_LBL21       object
SA3_CODE21      object
SA3_NAME21      object
SA4_CODE21      object
SA4_NAME21      object
GCC_CODE21      object
GCC_NAME21      object
STE_CODE21      object
STE_NAME21      object
AUS_CODE21      object
AUS_NAME21      object
AREASQKM21     float64
LOCI_URI21      object
geometry      geometry
dtype: object


In [7]:
# Filter for only the SA2 regions in greater Sydnet
regions = regions[regions["GCC_CODE21"] == "1GSYD"]

In [8]:
# Removing unnecessary columns
to_remove = ["CHG_FLAG21", "CHG_LBL21", "GCC_CODE21", "GCC_NAME21", "STE_CODE21", "STE_NAME21", "AUS_CODE21", "AUS_NAME21", "LOCI_URI21"]
regions.drop(to_remove, axis=1, inplace=True)

In [9]:
# Inspecting the dataset after modifcation
regions.head()

Unnamed: 0,SA2_CODE21,SA2_NAME21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,AREASQKM21,geometry
28,102011028,Avoca Beach - Copacabana,10201,Gosford,102,Central Coast,6.4376,"POLYGON ((151.41373 -33.46558, 151.41362 -33.4..."
29,102011029,Box Head - MacMasters Beach,10201,Gosford,102,Central Coast,32.0802,"POLYGON ((151.37484 -33.50052, 151.37507 -33.5..."
30,102011030,Calga - Kulnura,10201,Gosford,102,Central Coast,767.9512,"MULTIPOLYGON (((151.20449 -33.53280, 151.20448..."
31,102011031,Erina - Green Point,10201,Gosford,102,Central Coast,33.7934,"POLYGON ((151.37194 -33.43698, 151.37288 -33.4..."
32,102011032,Gosford - Springfield,10201,Gosford,102,Central Coast,16.9123,"POLYGON ((151.32349 -33.42779, 151.32342 -33.4..."


In [10]:
# Adding to postgresql database
regions.columns = regions.columns.str.lower()
regions.to_postgis("regions", conn, schema="sa2", if_exists="replace", index=False)

# Setting primary key
sql = """
ALTER TABLE regions
ADD PRIMARY KEY (sa2_code21);
"""
conn.execute(sql);

#### Businesses dataset

In [11]:
# Loading and inspecting the dataset
business = pd.read_csv("Businesses.csv")
print(business.shape)
print(business.columns)
print(business.dtypes)

(12217, 11)
Index(['industry_code', 'industry_name', 'sa2_code', 'sa2_name',
       '0_to_50k_businesses', '50k_to_200k_businesses',
       '200k_to_2m_businesses', '2m_to_5m_businesses', '5m_to_10m_businesses',
       '10m_or_more_businesses', 'total_businesses'],
      dtype='object')
industry_code             object
industry_name             object
sa2_code                   int64
sa2_name                  object
0_to_50k_businesses        int64
50k_to_200k_businesses     int64
200k_to_2m_businesses      int64
2m_to_5m_businesses        int64
5m_to_10m_businesses       int64
10m_or_more_businesses     int64
total_businesses           int64
dtype: object


In [12]:
# View of the particular industries accounted for in each sa2 region
business[["industry_code", "industry_name"]].drop_duplicates()

Unnamed: 0,industry_code,industry_name
0,A,"Agriculture, Forestry and Fishing"
643,B,Mining
1286,C,Manufacturing
1929,D,"Electricity, Gas, Water and Waste Services"
2572,E,Construction
3215,F,Wholesale Trade
3858,G,Retail Trade
4501,H,Accommodation and Food Services
5144,I,"Transport, Postal and Warehousing"
5787,J,Information Media and Telecommunications


In [13]:
# Removing counts of "Other Services" (is this useful?)
#business = business[business["industry_code"] != 'S'] 

In addition, we should also check that the count of the businesses in each size category adds to the total_businesses column:

In [14]:
# Checking if sum of businesses equals total businesses column
accounted_businesses = sum(business.loc[:,"0_to_50k_businesses":"5m_to_10m_businesses":1].sum(axis=1) == business["total_businesses"])
prop_correct_business_sum = accounted_businesses/business.shape[0]
round(prop_correct_business_sum, 2)

0.34

To improve the quality of the data analysis, these total_businesses values will be corrected to follow the sum of the number of businesses in each category:

In [15]:
business["total_businesses"] = business.loc[:,"0_to_50k_businesses":"5m_to_10m_businesses":1].sum(axis=1)

Finally, we need to export this DataFrame from pandas into the postgresql database:

In [16]:
# Adding to postgresql database
business.to_sql("business", conn, schema="sa2", if_exists="replace", index=False)

# Setting primary key
sql = """
ALTER TABLE business
ADD PRIMARY KEY (sa2_code, industry_code);
"""
conn.execute(sql);

#### Stops dataset
Since the stops file provided is in a GTFS format, we will first load it in as a csv file with pandas, then convert it into a GeoDataFrame with geopandas.

Useful link: https://developers.google.com/transit/gtfs/reference#stopstxt

In [17]:
## Loading and inspecting DataFrame
stops_df = pd.read_csv("Stops.txt")
print(stops_df.shape)
print(stops_df.columns)

## Correcting datatypes
# Replace NULL values in location_type to int
stops_df[["location_type"]] = stops_df[["location_type"]].fillna(0)
stops_df = stops_df.astype({"location_type":"int64"})

# Convert wheelchair_boarding to boolean
stops_df[["wheelchair_boarding"]] = stops_df[["wheelchair_boarding"]].replace({0:np.nan, 1:True, 2:False})

print(stops_df.dtypes)

(114718, 9)
Index(['stop_id', 'stop_code', 'stop_name', 'stop_lat', 'stop_lon',
       'location_type', 'parent_station', 'wheelchair_boarding',
       'platform_code'],
      dtype='object')
stop_id                 object
stop_code              float64
stop_name               object
stop_lat               float64
stop_lon               float64
location_type            int64
parent_station          object
wheelchair_boarding     object
platform_code           object
dtype: object


In [18]:
stops_df

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,location_type,parent_station,wheelchair_boarding,platform_code
0,200039,200039.0,"Central Station, Eddy Av, Stand A",-33.882206,151.206665,0,200060,,
1,200054,200054.0,"Central Station, Eddy Av, Stand D",-33.882042,151.206991,0,200060,,
2,200060,,Central Station,-33.884084,151.206292,1,,,
3,201510,,Redfern Station,-33.891690,151.198866,1,,,
4,201646,201646.0,"Redfern Station, Gibbons St, Stand B",-33.893329,151.198882,0,201510,,
...,...,...,...,...,...,...,...,...,...
114713,212753,212753.0,"Sydney Olympic Park Wharf, Side B",-33.822016,151.078797,0,21271,True,B
114714,2137185,2137185.0,"Cabarita Wharf, Side A",-33.840669,151.116926,0,21371,True,1A
114715,2137186,2137186.0,"Cabarita Wharf, Side B",-33.840769,151.116899,0,21371,True,1B
114716,21501,21501.0,Parramatta Wharf,-33.813904,151.010577,0,2150112,True,


Now we need to convert the DataFrame into a GeoDataFrame with geopandas:

In [19]:
# Converting into GeoDataFrame
stops_gdf = gpd.GeoDataFrame(stops_df, geometry=gpd.points_from_xy(stops_df.stop_lon, stops_df.stop_lat)).set_crs(epsg=4326)


# Removing stop_lon and stop_lat columns
stops_gdf.drop(["stop_lon", "stop_lat"], axis=1, inplace=True)
stops_gdf.head()

Unnamed: 0,stop_id,stop_code,stop_name,location_type,parent_station,wheelchair_boarding,platform_code,geometry
0,200039,200039.0,"Central Station, Eddy Av, Stand A",0,200060.0,,,POINT (151.20666 -33.88221)
1,200054,200054.0,"Central Station, Eddy Av, Stand D",0,200060.0,,,POINT (151.20699 -33.88204)
2,200060,,Central Station,1,,,,POINT (151.20629 -33.88408)
3,201510,,Redfern Station,1,,,,POINT (151.19887 -33.89169)
4,201646,201646.0,"Redfern Station, Gibbons St, Stand B",0,201510.0,,,POINT (151.19888 -33.89333)


Finally, we need to export this GeoDataFrame from geopandas into the postgresql database:

In [20]:
# Adding to postgresql database
stops_gdf.to_postgis("stops", conn, schema="sa2", if_exists="replace", index=False)

# Setting primary key
sql = """
ALTER TABLE stops
ADD PRIMARY KEY (stop_id)
"""
conn.execute(sql);

In [21]:
query(conn, "SELECT * FROM stops;")

Unnamed: 0,stop_id,stop_code,stop_name,location_type,parent_station,wheelchair_boarding,platform_code,geometry
0,200039,200039.0,"Central Station, Eddy Av, Stand A",0,200060,,,0101000020E6100000FFA631FF9CE66240A1FF6524ECF0...
1,200054,200054.0,"Central Station, Eddy Av, Stand D",0,200060,,,0101000020E61000002F928BAC9FE66240E33DC7C1E6F0...
2,200060,,Central Station,1,,,,0101000020E6100000817FA2F299E662408FF33DAC29F1...
3,201510,,Redfern Station,1,,,,0101000020E61000009E57611C5DE6624060304CE622F2...
4,201646,201646.0,"Redfern Station, Gibbons St, Stand B",0,201510,,,0101000020E6100000DBF9333D5DE662403DFA6B9D58F2...
...,...,...,...,...,...,...,...,...
114713,212753,212753.0,"Sydney Olympic Park Wharf, Side B",0,21271,True,B,0101000020E6100000AF9B3D8185E262408F52D7D537E9...
114714,2137185,2137185.0,"Cabarita Wharf, Side A",0,21371,True,1A,0101000020E6100000EB409ADCBDE3624089CE4C0B9BEB...
114715,2137186,2137186.0,"Cabarita Wharf, Side B",0,21371,True,1B,0101000020E6100000C4F9BEA2BDE362403EB375529EEB...
114716,21501,21501.0,Parramatta Wharf,0,2150112,True,,0101000020E6100000E443E4A456E0624025C1A4032EE8...


#### Polls dataset
Since the polls dataset already has a sql POINT object already implemented in the table, we will set the column to be the geometry of the GeoDataFrame. Since for the purposes of this assignment we need to assign each polling place a location, we will only use the entries where this column is not NULL.

In [22]:
## Loading and inspecting DataFrame
polls_df = pd.read_csv("PollingPlaces2019.csv")
print(polls_df.shape)
print(polls_df.columns)
print(polls_df.dtypes)

# Updating datatype of the_geom
polls_df = polls_df[polls_df['the_geom'].notna()]
polls_df["the_geom"] = polls_df["the_geom"].apply(wkt.loads)



(2930, 17)
Index(['FID', 'state', 'division_id', 'division_name', 'polling_place_id',
       'polling_place_type_id', 'polling_place_name', 'premises_name',
       'premises_address_1', 'premises_address_2', 'premises_address_3',
       'premises_suburb', 'premises_state_abbreviation', 'premises_post_code',
       'latitude', 'longitude', 'the_geom'],
      dtype='object')
FID                             object
state                           object
division_id                      int64
division_name                   object
polling_place_id                 int64
polling_place_type_id            int64
polling_place_name              object
premises_name                   object
premises_address_1              object
premises_address_2              object
premises_address_3              object
premises_suburb                 object
premises_state_abbreviation     object
premises_post_code             float64
latitude                       float64
longitude                      float64


Now we need to convert the DataFrame into a GeoDataFrame with geopandas:

In [23]:
# Loading into GeoDataFrame
polls_gdf = gpd.GeoDataFrame(polls_df, geometry="the_geom").set_crs(epsg=4283)

# Removing stop_lon and stop_lat columns
polls_gdf.drop(["longitude", "latitude"], axis=1, inplace=True)

polls_gdf.head()

Unnamed: 0,FID,state,division_id,division_name,polling_place_id,polling_place_type_id,polling_place_name,premises_name,premises_address_1,premises_address_2,premises_address_3,premises_suburb,premises_state_abbreviation,premises_post_code,the_geom
13,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,58,1,Oatley,Oatley Public School,51 Letitia St,,,OATLEY,NSW,2223.0,POINT (-33.98470 151.08100)
15,aec_federal_election_polling_places_2019.fid-4...,NSW,111,Chifley,392,1,Dharruk,Dawson Public School,7 Stuart Rd,,,DHARRUK,NSW,2770.0,POINT (-33.74750 150.81700)
16,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,31,1,Allawah,PJ Ferry Reserve Community Hall,147B Bellevue Pde,,,ALLAWAH,NSW,2218.0,POINT (-33.97679 151.11490)
17,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,67,1,Allawah South,St Raphael's Church Hall,84 George St,,,SOUTH HURSTVILLE,NSW,2221.0,POINT (-33.97560 151.11100)
18,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,56500,1,Beverly Hills North (Banks),Beverly Hills North Public School,1-3 Shorter Ave,,,BEVERLY HILLS,NSW,2209.0,POINT (-33.94130 151.07500)


Finally, we need to export this GeoDataFrame from geopandas into the postgresql database:

In [31]:
# Adding to postgresql database
polls_gdf.to_postgis("polls", conn, schema="sa2", if_exists="replace", index=False)

# Setting primary key
sql = """
ALTER TABLE polls
ADD PRIMARY KEY (polling_place_id);
"""
conn.execute(sql);
#query(conn, "select * from polls")

Unnamed: 0,FID,state,division_id,division_name,polling_place_id,polling_place_type_id,polling_place_name,premises_name,premises_address_1,premises_address_2,premises_address_3,premises_suburb,premises_state_abbreviation,premises_post_code,the_geom
0,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,58,1,Oatley,Oatley Public School,51 Letitia St,,,OATLEY,NSW,2223.0,0101000020BB100000832F4CA60AFE40C03BDF4F8D97E2...
1,aec_federal_election_polling_places_2019.fid-4...,NSW,111,Chifley,392,1,Dharruk,Dawson Public School,7 Stuart Rd,,,DHARRUK,NSW,2770.0,0101000020BB10000048E17A14AEDF40C0A01A2FDD24DA...
2,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,31,1,Allawah,PJ Ferry Reserve Community Hall,147B Bellevue Pde,,,ALLAWAH,NSW,2218.0,0101000020BB100000EA48E47107FD40C0A7EC4F3DADE3...
3,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,67,1,Allawah South,St Raphael's Church Hall,84 George St,,,SOUTH HURSTVILLE,NSW,2221.0,0101000020BB10000022FDF675E0FC40C0643BDF4F8DE3...
4,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,56500,1,Beverly Hills North (Banks),Beverly Hills North Public School,1-3 Shorter Ave,,,BEVERLY HILLS,NSW,2209.0,0101000020BB100000C6DCB5847CF840C06666666666E2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2785,aec_federal_election_polling_places_2019.fid-4...,NSW,150,Whitlam,2810,1,Warilla North,Warilla North Community Centre,2-6 Hill St,,,WARILLA,NSW,2528.0,0101000020BB1000001288D7F50B4641C0D8B628B341DB...
2786,aec_federal_election_polling_places_2019.fid-4...,NSW,150,Whitlam,2809,1,Warilla South,Warilla High School,10 Keross Ave,,,BARRACK HEIGHTS,NSW,2528.0,0101000020BB1000009C33A2B4374841C0FA7E6ABC74DB...
2787,aec_federal_election_polling_places_2019.fid-4...,NSW,150,Whitlam,58798,5,Warilla WHITLAM PPVC,2/144 Shellharbour Rd,,,,WARILLA,NSW,2528.0,0101000020BB10000011F28B5C814641C0BD32141C83DB...
2788,aec_federal_election_polling_places_2019.fid-4...,NSW,150,Whitlam,31242,1,Welby,Welby Community Hall,14 Currockbilly St,,,WELBY,NSW,2575.0,0101000020BB100000386744696F3841C021B0726891CD...


#### Schools dataset

In [38]:
# Loading and inspecting the dataset
path = "Catchments/catchments_primary.shp"
primary = gpd.read_file(path)
print(primary.shape)
print(primary.columns)
print(primary.dtypes)
primary.head()

(1662, 19)
Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'PRIORITY', 'geometry'],
      dtype='object')
USE_ID          object
CATCH_TYPE      object
USE_DESC        object
ADD_DATE        object
KINDERGART      object
YEAR1           object
YEAR2           object
YEAR3           object
YEAR4           object
YEAR5           object
YEAR6           object
YEAR7           object
YEAR8           object
YEAR9           object
YEAR10          object
YEAR11          object
YEAR12          object
PRIORITY        object
geometry      geometry
dtype: object


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,PRIORITY,geometry
0,2838,PRIMARY,Parklea PS,20181210,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((150.93564 -33.71612, 150.93715 -33.7..."
1,2404,PRIMARY,Lindfield EPS,20211219,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.18336 -33.74748, 151.18443 -33.7..."
2,4393,PRIMARY,Carlingford WPS,20220223,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.04518 -33.77303, 151.04526 -33.7..."
3,4615,PRIMARY,Caddies Ck PS,20181210,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((150.92567 -33.72960, 150.92602 -33.7..."
4,3918,PRIMARY,Killara PS,20211219,Y,Y,Y,Y,Y,Y,Y,N,N,N,N,N,N,,"POLYGON ((151.15379 -33.75586, 151.15404 -33.7..."


In [39]:
# Loading and inspecting the dataset
path = "Catchments/catchments_secondary.shp"
secondary = gpd.read_file(path)
print(secondary.shape)
print(secondary.columns)
print(secondary.dtypes)
secondary.head()

(436, 19)
Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'PRIORITY', 'geometry'],
      dtype='object')
USE_ID          object
CATCH_TYPE      object
USE_DESC        object
ADD_DATE        object
KINDERGART      object
YEAR1           object
YEAR2           object
YEAR3           object
YEAR4           object
YEAR5           object
YEAR6           object
YEAR7           object
YEAR8           object
YEAR9           object
YEAR10          object
YEAR11          object
YEAR12          object
PRIORITY        object
geometry      geometry
dtype: object


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,PRIORITY,geometry
0,8503,HIGH_COED,Billabong HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((146.67182 -35.31444, 146.68930 -35.3..."
1,8266,HIGH_COED,James Fallon HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((147.08734 -35.86271, 147.10413 -35.8..."
2,8505,HIGH_COED,Murray HS,20200507,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((146.81448 -35.78341, 146.81250 -35.7..."
3,8458,HIGH_COED,Kingswood HS,20201016,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"MULTIPOLYGON (((150.68600 -33.74031, 150.68631..."
4,8559,HIGH_COED,Jamison HS,20201016,N,N,N,N,N,N,N,Y,Y,Y,Y,Y,Y,,"POLYGON ((150.69513 -33.75627, 150.68936 -33.7..."


In [40]:
# Loading and inspecting the dataset
path = "Catchments/catchments_future.shp"
future = gpd.read_file(path)
print(future.shape)
print(future.columns)
print(future.dtypes)
future.head()

(30, 18)
Index(['USE_ID', 'CATCH_TYPE', 'USE_DESC', 'ADD_DATE', 'KINDERGART', 'YEAR1',
       'YEAR2', 'YEAR3', 'YEAR4', 'YEAR5', 'YEAR6', 'YEAR7', 'YEAR8', 'YEAR9',
       'YEAR10', 'YEAR11', 'YEAR12', 'geometry'],
      dtype='object')
USE_ID          object
CATCH_TYPE      object
USE_DESC        object
ADD_DATE        object
KINDERGART       int64
YEAR1            int64
YEAR2            int64
YEAR3            int64
YEAR4            int64
YEAR5            int64
YEAR6            int64
YEAR7            int64
YEAR8            int64
YEAR9            int64
YEAR10           int64
YEAR11           int64
YEAR12           int64
geometry      geometry
dtype: object


Unnamed: 0,USE_ID,CATCH_TYPE,USE_DESC,ADD_DATE,KINDERGART,YEAR1,YEAR2,YEAR3,YEAR4,YEAR5,YEAR6,YEAR7,YEAR8,YEAR9,YEAR10,YEAR11,YEAR12,geometry
0,8416,HIGH_COED,Ku-ring-gai HS,20230114,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.19849 -33.53990, 151.19945 -33.5..."
1,8161,HIGH_BOYS,Randwick BHS,20200220,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.27152 -33.91402, 151.27152 -33.9..."
2,8539,HIGH_COED,SSC Blackwattle Bay,20220609,0,0,0,0,0,0,0,0,0,0,0,2024,2024,"POLYGON ((151.15292 -33.83939, 151.16144 -33.8..."
3,8400,HIGH_COED,St Ives HS,20230114,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.17794 -33.69820, 151.17859 -33.6..."
4,8555,HIGH_COED,Rose Bay SC,20200220,0,0,0,0,0,0,0,2024,2024,2024,2024,2024,2024,"POLYGON ((151.28072 -33.83287, 151.28095 -33.8..."


#### Population dataset
The population dataset provides us with the population of each SA2 region, which is further paritioned by age. For the project, this table will be used to calculate the required metrics that follow a 'per capita' basis. The dataset provides a unique SA2 code for each row, so we will use this field as the primary key of the dataset, so that it can easily join with the SA2 dataset.

In [25]:
# Loading and inspecting the dataset
population = pd.read_csv("Population.csv")
print(population.shape)
print(population.columns)
print(population.dtypes)

(373, 21)
Index(['sa2_code', 'sa2_name', '0-4_people', '5-9_people', '10-14_people',
       '15-19_people', '20-24_people', '25-29_people', '30-34_people',
       '35-39_people', '40-44_people', '45-49_people', '50-54_people',
       '55-59_people', '60-64_people', '65-69_people', '70-74_people',
       '75-79_people', '80-84_people', '85-and-over_people', 'total_people'],
      dtype='object')
sa2_code               int64
sa2_name              object
0-4_people             int64
5-9_people             int64
10-14_people           int64
15-19_people           int64
20-24_people           int64
25-29_people           int64
30-34_people           int64
35-39_people           int64
40-44_people           int64
45-49_people           int64
50-54_people           int64
55-59_people           int64
60-64_people           int64
65-69_people           int64
70-74_people           int64
75-79_people           int64
80-84_people           int64
85-and-over_people     int64
total_people          

Similarly to the Business dataset, we should verify that the sum of the people in each of the columns equals the total_people column:

In [26]:
accounted_population = sum(population.loc[:,"0-4_people":"85-and-over_people":1].sum(axis=1) == population["total_people"])
prop_correct_population_sum = accounted_population/population.shape[0]
round(prop_correct_population_sum, 2)

1.0

As we can see the total_people column correctly adds up each of the rows of people, meaning we can use the field for our analysis. We will also create the young_people column to calculate the schools metric: 

In [27]:
# Adding young_people column
population["young_people"] = population.loc[:,"0-4_people":"15-19_people":1].sum(axis=1)

In [28]:
# Inspecting the dataset after modification
population.head()

Unnamed: 0,sa2_code,sa2_name,0-4_people,5-9_people,10-14_people,15-19_people,20-24_people,25-29_people,30-34_people,35-39_people,...,50-54_people,55-59_people,60-64_people,65-69_people,70-74_people,75-79_people,80-84_people,85-and-over_people,total_people,young_people
0,102011028,Avoca Beach - Copacabana,424,522,623,552,386,222,306,416,...,602,570,520,464,369,226,142,70,7530,2121
1,102011029,Box Head - MacMasters Beach,511,666,702,592,461,347,420,535,...,749,794,895,863,925,603,331,264,11052,2471
2,102011030,Calga - Kulnura,200,225,258,278,274,227,214,286,...,436,422,397,327,264,190,100,75,4748,961
3,102011031,Erina - Green Point,683,804,880,838,661,502,587,757,...,882,901,930,917,1065,976,773,1028,14803,3205
4,102011032,Gosford - Springfield,1164,1044,1084,1072,1499,1864,1750,1520,...,1241,1377,1285,1166,949,664,476,537,21346,4364


Finally, we need to import the DataFrame into the postgresql database:

In [29]:
# Taking only required rows
population = population[["sa2_code", "young_people", "total_people"]]

# Adding to postgresql database
population.to_sql("population", conn, schema="sa2", if_exists="replace", index=False)

# Setting primary key
sql = """
ALTER TABLE population
ADD PRIMARY KEY (sa2_code);
"""
conn.execute(sql);

### Task 2: Computing the "well-resourced" Score

### Task 3: Extending the "well-resourced" Score & Visualisations 

### Task 4:  Additional Analysis

## Extra commands

In [30]:
# Checks that all tables are in sa2 schema
inspect(db).get_table_names(schema="sa2")

['spatial_ref_sys', 'regions', 'business', 'stops', 'polls', 'population']

Todo:

Week 10 (tasks 1 and 2):
- Import each provided table into python, clean them wherever required, add only the necessary columns to sql database
- Work out how we will calculate score (probably discuss this in the tutorial)
- Figure out what to index

Week 11 (task 3):
- Find our own datasets, clean them, add to sql database
- Work out new score calculation

Week 12 (task 4):
