# Data Workflow

### Python/SQL Setup

In [59]:
from sqlalchemy import create_engine, inspect
import psycopg2
import psycopg2.extras
import json
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import geoalchemy2
from shapely import wkt

def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        host       = db_conn_dict['host']
        db_user    = db_conn_dict['user']
        db_pw      = db_conn_dict['password']
        default_db = db_conn_dict['user']
        try:
            db = create_engine('postgresql+psycopg2://'+db_user+':'+db_pw+'@'+host+'/'+default_db, echo=False)
            conn = db.connect()
            print('Connected successfully.')
        except Exception as e:
            print("Unable to connect to the database.")
            print(e)
            db, conn = None, None
        return db,conn

def query(conn, sqlcmd, args=None, df=True):
    result = pd.DataFrame() if df else None
    try:
        if df:
            result = pd.read_sql_query(sqlcmd, conn, params=args)
        else:
            result = conn.execute(sqlcmd, args).fetchall()
            result = result[0] if len(result) == 1 else result
    except Exception as e:
        print("Error encountered: ", e, sep='\n')
    return result

In [2]:
credentials = "Credentials.json"
db, conn = pgconnect(credentials)

Connected successfully.


In [3]:
# Creating new schema
sql = """
CREATE SCHEMA IF NOT EXISTS sa2;
SET search_path TO sa2;
"""
conn.execute(sql)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f3c31330d30>

In [4]:
# Adding PostGIS to sa2 database
conn.execute("CREATE EXTENSION IF NOT EXISTS postgis SCHEMA sa2;")
query(conn, "SELECT PostGIS_version()")

Unnamed: 0,postgis_version
0,3.3 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


In [5]:
# Run if PostGIS is already installed, but not in sa2
sql = """
UPDATE pg_extension
SET extrelocatable = TRUE
WHERE extname = 'postgis';

ALTER EXTENSION postgis
SET SCHEMA sa2;
"""
#conn.execute(sql)

query(conn, "SELECT PostGIS_version()")

Unnamed: 0,postgis_version
0,3.3 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


### Task 1: Cleaning & Importing 

#### Businesses dataset

In [6]:
# Loading and inspecting the dataset
business = pd.read_csv("Businesses.csv")
print(business.shape)
print(business.columns)
print(business.dtypes)

(12217, 11)
Index(['industry_code', 'industry_name', 'sa2_code', 'sa2_name',
       '0_to_50k_businesses', '50k_to_200k_businesses',
       '200k_to_2m_businesses', '2m_to_5m_businesses', '5m_to_10m_businesses',
       '10m_or_more_businesses', 'total_businesses'],
      dtype='object')
industry_code             object
industry_name             object
sa2_code                   int64
sa2_name                  object
0_to_50k_businesses        int64
50k_to_200k_businesses     int64
200k_to_2m_businesses      int64
2m_to_5m_businesses        int64
5m_to_10m_businesses       int64
10m_or_more_businesses     int64
total_businesses           int64
dtype: object


In [7]:
# View of the particular industries accounted for in each sa2 region
business[["industry_code", "industry_name"]].drop_duplicates()

Unnamed: 0,industry_code,industry_name
0,A,"Agriculture, Forestry and Fishing"
643,B,Mining
1286,C,Manufacturing
1929,D,"Electricity, Gas, Water and Waste Services"
2572,E,Construction
3215,F,Wholesale Trade
3858,G,Retail Trade
4501,H,Accommodation and Food Services
5144,I,"Transport, Postal and Warehousing"
5787,J,Information Media and Telecommunications


In [8]:
# Removing counts of "Other Services" (is this useful?)
#business = business[business["industry_code"] != 'S'] 

In addition, we should also check that the count of the businesses in each size category adds to the total_businesses column:

In [9]:
# Checking if sum of businesses equals total businesses column
accounted_businesses = sum(business["0_to_50k_businesses"]+business["50k_to_200k_businesses"]+business["200k_to_2m_businesses"]+business["2m_to_5m_businesses"]+business["5m_to_10m_businesses"] == business["total_businesses"])
prop_correct_business_sum = accounted_businesses/business.shape[0]
round(prop_correct_business_sum, 3)

0.338

To improve the quality of the data analysis, these total_businesses values will be corrected to follow the sum of the number of businesses in each category:

In [10]:
business["total_businesses"] = business["0_to_50k_businesses"]+business["50k_to_200k_businesses"]+business["200k_to_2m_businesses"]+business["2m_to_5m_businesses"]+business["5m_to_10m_businesses"]

Finally, we need to export this DataFrame from pandas into the postgresql database:

In [11]:
# Adding to postgresql database
business.to_sql("business", conn, schema="sa2", if_exists="replace", index=False)

# Setting primary key
sql = """
ALTER TABLE business
ADD PRIMARY KEY (sa2_code, industry_code)
"""
conn.execute(sql);

#### Stops dataset
Since the stops file provided is in a GTFS format, we will first load it in as a csv file with pandas, then convert it into a GeoDataFrame with geopandas.

Useful link: https://developers.google.com/transit/gtfs/reference#stopstxt

In [33]:
## Loading and inspecting DataFrame
stops_df = pd.read_csv("Stops.txt")
print(stops_df.shape)
print(stops_df.columns)

## Correcting datatypes
# Replace NULL values in location_type to int
stops_df[["location_type"]] = stops_df[["location_type"]].fillna(0)
stops_df = stops_df.astype({"location_type":"int64"})

# Convert wheelchair_boarding to boolean
stops_df[["wheelchair_boarding"]] = stops_df[["wheelchair_boarding"]].replace({0:np.nan, 1:True, 2:False})

print(stops_df.dtypes)

(114718, 9)
Index(['stop_id', 'stop_code', 'stop_name', 'stop_lat', 'stop_lon',
       'location_type', 'parent_station', 'wheelchair_boarding',
       'platform_code'],
      dtype='object')
stop_id                 object
stop_code              float64
stop_name               object
stop_lat               float64
stop_lon               float64
location_type            int64
parent_station          object
wheelchair_boarding     object
platform_code           object
dtype: object


In [34]:
stops_df

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,location_type,parent_station,wheelchair_boarding,platform_code
0,200039,200039.0,"Central Station, Eddy Av, Stand A",-33.882206,151.206665,0,200060,,
1,200054,200054.0,"Central Station, Eddy Av, Stand D",-33.882042,151.206991,0,200060,,
2,200060,,Central Station,-33.884084,151.206292,1,,,
3,201510,,Redfern Station,-33.891690,151.198866,1,,,
4,201646,201646.0,"Redfern Station, Gibbons St, Stand B",-33.893329,151.198882,0,201510,,
...,...,...,...,...,...,...,...,...,...
114713,212753,212753.0,"Sydney Olympic Park Wharf, Side B",-33.822016,151.078797,0,21271,True,B
114714,2137185,2137185.0,"Cabarita Wharf, Side A",-33.840669,151.116926,0,21371,True,1A
114715,2137186,2137186.0,"Cabarita Wharf, Side B",-33.840769,151.116899,0,21371,True,1B
114716,21501,21501.0,Parramatta Wharf,-33.813904,151.010577,0,2150112,True,


Now we need to convert the DataFrame into a GeoDataFrame with geopandas:

In [35]:
# Converting into GeoDataFrame
stops_gdf = gpd.GeoDataFrame(stops_df, geometry=gpd.points_from_xy(stops_df.stop_lon, stops_df.stop_lat)).set_crs(epsg=4326)


# Removing stop_lon and stop_lat columns
stops_gdf.drop(["stop_lon", "stop_lat"], axis=1, inplace=True)
stops_gdf.head()

Unnamed: 0,stop_id,stop_code,stop_name,location_type,parent_station,wheelchair_boarding,platform_code,geometry
0,200039,200039.0,"Central Station, Eddy Av, Stand A",0,200060.0,,,POINT (151.20666 -33.88221)
1,200054,200054.0,"Central Station, Eddy Av, Stand D",0,200060.0,,,POINT (151.20699 -33.88204)
2,200060,,Central Station,1,,,,POINT (151.20629 -33.88408)
3,201510,,Redfern Station,1,,,,POINT (151.19887 -33.89169)
4,201646,201646.0,"Redfern Station, Gibbons St, Stand B",0,201510.0,,,POINT (151.19888 -33.89333)


Finally, we need to export this GeoDataFrame from geopandas into the postgresql database:

In [42]:
# Adding to postgresql database
stops_gdf.to_postgis("stops", conn, schema="sa2", if_exists="replace", index=False)

# Setting primary key
sql = """
ALTER TABLE stops
ADD PRIMARY KEY (stop_id)
"""
conn.execute(sql);

In [43]:
query(conn, "SELECT * FROM stops;")

Unnamed: 0,stop_id,stop_code,stop_name,location_type,parent_station,wheelchair_boarding,platform_code,geometry
0,200039,200039.0,"Central Station, Eddy Av, Stand A",0,200060,,,0101000020E6100000FFA631FF9CE66240A1FF6524ECF0...
1,200054,200054.0,"Central Station, Eddy Av, Stand D",0,200060,,,0101000020E61000002F928BAC9FE66240E33DC7C1E6F0...
2,200060,,Central Station,1,,,,0101000020E6100000817FA2F299E662408FF33DAC29F1...
3,201510,,Redfern Station,1,,,,0101000020E61000009E57611C5DE6624060304CE622F2...
4,201646,201646.0,"Redfern Station, Gibbons St, Stand B",0,201510,,,0101000020E6100000DBF9333D5DE662403DFA6B9D58F2...
...,...,...,...,...,...,...,...,...
114713,212753,212753.0,"Sydney Olympic Park Wharf, Side B",0,21271,True,B,0101000020E6100000AF9B3D8185E262408F52D7D537E9...
114714,2137185,2137185.0,"Cabarita Wharf, Side A",0,21371,True,1A,0101000020E6100000EB409ADCBDE3624089CE4C0B9BEB...
114715,2137186,2137186.0,"Cabarita Wharf, Side B",0,21371,True,1B,0101000020E6100000C4F9BEA2BDE362403EB375529EEB...
114716,21501,21501.0,Parramatta Wharf,0,2150112,True,,0101000020E6100000E443E4A456E0624025C1A4032EE8...


#### Polls dataset
Since the polls dataset already has a sql POINT object already implemented in the table, we will set the column to be the geometry of the GeoDataFrame. Since for the purposes of this assignment we need to assign each polling place a location, we will only use the entries where this column is not NULL.

In [67]:
## Loading and inspecting DataFrame
polls_df = pd.read_csv("PollingPlaces2019.csv")
print(polls_df.shape)
print(polls_df.columns)
print(polls_df.dtypes)

# Updating datatype of the_geom
polls_df = polls_df[polls_df['the_geom'].notna()]
polls_df["the_geom"] = polls_df["the_geom"].apply(wkt.loads)



(2930, 17)
Index(['FID', 'state', 'division_id', 'division_name', 'polling_place_id',
       'polling_place_type_id', 'polling_place_name', 'premises_name',
       'premises_address_1', 'premises_address_2', 'premises_address_3',
       'premises_suburb', 'premises_state_abbreviation', 'premises_post_code',
       'latitude', 'longitude', 'the_geom'],
      dtype='object')
FID                             object
state                           object
division_id                      int64
division_name                   object
polling_place_id                 int64
polling_place_type_id            int64
polling_place_name              object
premises_name                   object
premises_address_1              object
premises_address_2              object
premises_address_3              object
premises_suburb                 object
premises_state_abbreviation     object
premises_post_code             float64
latitude                       float64
longitude                      float64


Unnamed: 0,FID,state,division_id,division_name,polling_place_id,polling_place_type_id,polling_place_name,premises_name,premises_address_1,premises_address_2,premises_address_3,premises_suburb,premises_state_abbreviation,premises_post_code,latitude,longitude,the_geom
13,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,58,1,Oatley,Oatley Public School,51 Letitia St,,,OATLEY,NSW,2223.0,-33.984700,151.081000,POINT (-33.98470 151.08100)
15,aec_federal_election_polling_places_2019.fid-4...,NSW,111,Chifley,392,1,Dharruk,Dawson Public School,7 Stuart Rd,,,DHARRUK,NSW,2770.0,-33.747500,150.817000,POINT (-33.74750 150.81700)
16,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,31,1,Allawah,PJ Ferry Reserve Community Hall,147B Bellevue Pde,,,ALLAWAH,NSW,2218.0,-33.976790,151.114897,POINT (-33.97679 151.11490)
17,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,67,1,Allawah South,St Raphael's Church Hall,84 George St,,,SOUTH HURSTVILLE,NSW,2221.0,-33.975600,151.111000,POINT (-33.97560 151.11100)
18,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,56500,1,Beverly Hills North (Banks),Beverly Hills North Public School,1-3 Shorter Ave,,,BEVERLY HILLS,NSW,2209.0,-33.941300,151.075000,POINT (-33.94130 151.07500)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2924,aec_federal_election_polling_places_2019.fid-4...,NSW,150,Whitlam,2810,1,Warilla North,Warilla North Community Centre,2-6 Hill St,,,WARILLA,NSW,2528.0,-34.547240,150.851770,POINT (-34.54724 150.85177)
2925,aec_federal_election_polling_places_2019.fid-4...,NSW,150,Whitlam,2809,1,Warilla South,Warilla High School,10 Keross Ave,,,BARRACK HEIGHTS,NSW,2528.0,-34.564200,150.858000,POINT (-34.56420 150.85800)
2926,aec_federal_election_polling_places_2019.fid-4...,NSW,150,Whitlam,58798,5,Warilla WHITLAM PPVC,2/144 Shellharbour Rd,,,,WARILLA,NSW,2528.0,-34.550823,150.859755,POINT (-34.55082 150.85975)
2927,aec_federal_election_polling_places_2019.fid-4...,NSW,150,Whitlam,31242,1,Welby,Welby Community Hall,14 Currockbilly St,,,WELBY,NSW,2575.0,-34.440900,150.424000,POINT (-34.44090 150.42400)


Now we need to convert the DataFrame into a GeoDataFrame with geopandas:

In [68]:
# Loading into GeoDataFrame
polls_gdf = gpd.GeoDataFrame(polls_df, geometry="the_geom").set_crs(epsg=4283)

# Removing stop_lon and stop_lat columns
polls_gdf.drop(["longitude", "latitude"], axis=1, inplace=True)

polls_gdf.head()

Unnamed: 0,FID,state,division_id,division_name,polling_place_id,polling_place_type_id,polling_place_name,premises_name,premises_address_1,premises_address_2,premises_address_3,premises_suburb,premises_state_abbreviation,premises_post_code,the_geom
13,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,58,1,Oatley,Oatley Public School,51 Letitia St,,,OATLEY,NSW,2223.0,POINT (-33.98470 151.08100)
15,aec_federal_election_polling_places_2019.fid-4...,NSW,111,Chifley,392,1,Dharruk,Dawson Public School,7 Stuart Rd,,,DHARRUK,NSW,2770.0,POINT (-33.74750 150.81700)
16,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,31,1,Allawah,PJ Ferry Reserve Community Hall,147B Bellevue Pde,,,ALLAWAH,NSW,2218.0,POINT (-33.97679 151.11490)
17,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,67,1,Allawah South,St Raphael's Church Hall,84 George St,,,SOUTH HURSTVILLE,NSW,2221.0,POINT (-33.97560 151.11100)
18,aec_federal_election_polling_places_2019.fid-4...,NSW,103,Banks,56500,1,Beverly Hills North (Banks),Beverly Hills North Public School,1-3 Shorter Ave,,,BEVERLY HILLS,NSW,2209.0,POINT (-33.94130 151.07500)


Finally, we need to export this GeoDataFrame from geopandas into the postgresql database:

In [77]:
# Adding to postgresql database
polls_gdf.to_postgis("polls", conn, schema="sa2", if_exists="replace", index=False)

# Setting primary key
sql = """
ALTER TABLE polls
ADD PRIMARY KEY (polling_place_id);
"""
conn.execute(sql);
#query(conn, "select * from polls")

## Extra commands

In [14]:
# Checks that all tables are in sa2 schema
inspect(db).get_table_names(schema="sa2")

['business', 'spatial_ref_sys']

Todo:

Week 10 (tasks 1 and 2):
- Import each provided table into python, clean them wherever required, add to sql database
- Find our own dataset and do the same
- Work out how we will calculate score (probably discuss this in the tutorial)

Week 11 (task 3):

Week 12 (task 4):
