# Data Workflow

### Python/SQL Setup

In [1]:
from sqlalchemy import create_engine, inspect
import psycopg2
import psycopg2.extras
import json
import os
import pandas as pd

def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        host       = db_conn_dict['host']
        db_user    = db_conn_dict['user']
        db_pw      = db_conn_dict['password']
        default_db = db_conn_dict['user']
        try:
            db = create_engine('postgresql+psycopg2://'+db_user+':'+db_pw+'@'+host+'/'+default_db, echo=False)
            conn = db.connect()
            print('Connected successfully.')
        except Exception as e:
            print("Unable to connect to the database.")
            print(e)
            db, conn = None, None
        return db,conn

def query(conn, sqlcmd, args=None, df=True):
    result = pd.DataFrame() if df else None
    try:
        if df:
            result = pd.read_sql_query(sqlcmd, conn, params=args)
        else:
            result = conn.execute(sqlcmd, args).fetchall()
            result = result[0] if len(result) == 1 else result
    except Exception as e:
        print("Error encountered: ", e, sep='\n')
    return result

In [2]:
credentials = "Credentials.json"
db, conn = pgconnect(credentials)

Connected successfully.


In [3]:
# Creating new schema
sql = """
CREATE SCHEMA IF NOT EXISTS SA2;
SET search_path TO SA2;
"""
conn.execute(sql)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fdb8806ae80>

In [5]:
# Adding PostGIS to SA2 database
conn.execute("CREATE EXTENSION IF NOT EXISTS postgis SCHEMA SA2;")
query(conn, "SELECT PostGIS_version()")

Unnamed: 0,postgis_version
0,3.3 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


In [6]:
# Run if PostGIS is already installed, but not in SA2
sql = """
UPDATE pg_extension
SET extrelocatable = TRUE
WHERE extname = 'postgis';

ALTER EXTENSION postgis
SET SCHEMA SA2;
"""
#conn.execute(sql)

query(conn, "SELECT PostGIS_version()")

Unnamed: 0,postgis_version
0,3.3 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


### Task 1: Cleaning & Importing 

#### Businesses dataset

In [23]:
# Loading and inspecting the dataset
business = pd.read_csv("Businesses.csv")
print(business.shape)
print(business.columns)
print(business.dtypes)

(12217, 11)
Index(['industry_code', 'industry_name', 'sa2_code', 'sa2_name',
       '0_to_50k_businesses', '50k_to_200k_businesses',
       '200k_to_2m_businesses', '2m_to_5m_businesses', '5m_to_10m_businesses',
       '10m_or_more_businesses', 'total_businesses'],
      dtype='object')
industry_code             object
industry_name             object
sa2_code                   int64
sa2_name                  object
0_to_50k_businesses        int64
50k_to_200k_businesses     int64
200k_to_2m_businesses      int64
2m_to_5m_businesses        int64
5m_to_10m_businesses       int64
10m_or_more_businesses     int64
total_businesses           int64
dtype: object


In [24]:
# View of the particular industries accounted for in each SA2 region
business[["industry_code", "industry_name"]].drop_duplicates()

Unnamed: 0,industry_code,industry_name
0,A,"Agriculture, Forestry and Fishing"
643,B,Mining
1286,C,Manufacturing
1929,D,"Electricity, Gas, Water and Waste Services"
2572,E,Construction
3215,F,Wholesale Trade
3858,G,Retail Trade
4501,H,Accommodation and Food Services
5144,I,"Transport, Postal and Warehousing"
5787,J,Information Media and Telecommunications


In [18]:
# Removing counts of "Other Services" (is this useful?)
#business = business[business["industry_code"] != 'S'] 

In addition, we should also check that the count of the businesses in each size category adds to the total_businesses column:

In [20]:
# Checking if sum of businesses equals total businesses column
accounted_businesses = sum(business["0_to_50k_businesses"]+business["50k_to_200k_businesses"]+business["200k_to_2m_businesses"]+business["2m_to_5m_businesses"]+business["5m_to_10m_businesses"] == business["total_businesses"])
prop_correct_business_sum = accounted_businesses/business.shape[0]
round(prop_correct_business_sum, 3)

1.0

To improve the quality of the data analysis, these total_businesses values will be corrected to follow the sum of the number of businesses in each category:

In [15]:
business["total_businesses"] = business["0_to_50k_businesses"]+business["50k_to_200k_businesses"]+business["200k_to_2m_businesses"]+business["2m_to_5m_businesses"]+business["5m_to_10m_businesses"]

In [32]:
# Adding to postgresql database
sql = """
DROP TABLE IF EXISTS business;
CREATE TABLE business (
    sa2_code INT,
    sa2_name VARCHAR(100),
    industry_code CHAR(1),
    industry_name VARCHAR(50),
    businesses_0_to_50k INT,
    businesses_50k_to_200k INT,
    businesses_200k_to_2m INT,
    businesses_2m_to_5m INT,
    businesses_5m_to_10m INT,
    businesses_10m_or_more INT,
    total_businesses INT,
    
    PRIMARY KEY (sa2_code, industry_code)
);"""
conn.execute(sql);

## Extra commands

In [12]:
# Checks that all tables are in SA2 schema
inspect(db).get_table_names(schema="SA2")

[]

Todo:

Week 10 (tasks 1 and 2):
- Import each provided table into python, clean them wherever required, add to sql database
- Find our own dataset and do the same
- Work out how we will calculate score (probably discuss this in the tutorial)

Week 11 (task 3):

Week 12 (task 4):
