# Data Workflow

### Python/SQL Setup

In [8]:
from sqlalchemy import create_engine, inspect
import psycopg2
import psycopg2.extras
import json
import os
import pandas as pd

def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        host       = db_conn_dict['host']
        db_user    = db_conn_dict['user']
        db_pw      = db_conn_dict['password']
        default_db = db_conn_dict['user']
        try:
            db = create_engine('postgresql+psycopg2://'+db_user+':'+db_pw+'@'+host+'/'+default_db, echo=False)
            conn = db.connect()
            print('Connected successfully.')
        except Exception as e:
            print("Unable to connect to the database.")
            print(e)
            db, conn = None, None
        return db,conn

In [9]:
credentials = "Credentials.json"
db, conn = pgconnect(credentials)

Connected successfully.


In [12]:
inspect(db).get_table_names(schema=None)

['students', 'subjects', 'a', 'b']

### Cleaning & Importing 

#### Businesses dataset

In [25]:
# Businesses dataset
business = pd.read_csv("Businesses.csv")
print(business.shape)
print(business.columns)
print(business.dtypes)

(12217, 11)
Index(['industry_code', 'industry_name', 'sa2_code', 'sa2_name',
       '0_to_50k_businesses', '50k_to_200k_businesses',
       '200k_to_2m_businesses', '2m_to_5m_businesses', '5m_to_10m_businesses',
       '10m_or_more_businesses', 'total_businesses'],
      dtype='object')
industry_code             object
industry_name             object
sa2_code                   int64
sa2_name                  object
0_to_50k_businesses        int64
50k_to_200k_businesses     int64
200k_to_2m_businesses      int64
2m_to_5m_businesses        int64
5m_to_10m_businesses       int64
10m_or_more_businesses     int64
total_businesses           int64
dtype: object


In [29]:
# Checking if sum of businesses equals total businesses column
sum(business["0_to_50k_businesses"]+business["50k_to_200k_businesses"]+business["200k_to_2m_businesses"]+business["2m_to_5m_businesses"]+business["5m_to_10m_businesses"] == business["total_businesses"])

4130

In [32]:
# View of all distinct sa2 regions
business[["sa2_code", "sa2_name"]].drop_duplicates()

Unnamed: 0,sa2_code,sa2_name
0,101021007,Braidwood
1,101021008,Karabar
2,101021009,Queanbeyan
3,101021010,Queanbeyan - East
4,101021012,Queanbeyan West - Jerrabomberra
...,...,...
638,128021538,Sutherland - Kirrawee
639,128021607,Engadine
640,128021608,Loftus - Yarrawarrah
641,128021609,Woronora Heights
