In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect
import numpy as np
import config

In [2]:
# Paths (and names) of the three input CSV files
ship_csv = "clean.csv"
postcode_csv = "data/australian_postcodes.csv"
sa3_csv = "data/SA3_2021_AUST.csv"

# Load each CSV into its own DataFrame; the shipments file uses its
# first column as the row index
shipments_df = pd.read_csv(ship_csv, index_col=0)
postcode_df = pd.read_csv(postcode_csv)
sa3_df = pd.read_csv(sa3_csv)

### Shipment Data

In [3]:
# Index labels of the rows whose cmlPostCode is "H7E 1A1"
postcode_index_names = shipments_df.index[shipments_df['cmlPostCode'] == "H7E 1A1"]

# Index labels of the rows whose smpPlantID is "P00"
index_names = shipments_df.index[shipments_df['smpPlantID'] == "P00"]

# Drop both sets of rows in ONE call on the union of the labels.
# Dropping them one after the other (as before) raised KeyError whenever a
# row matched both conditions: its label was removed by the first drop and
# was therefore missing when the second drop asked for it.
# The result is re-bound instead of using inplace=True so the cell stays a
# plain "input -> output" step.
shipments_df = shipments_df.drop(postcode_index_names.union(index_names))

In [4]:
# Reset index
# Replace the current row labels with a fresh 0..n-1 range; drop=True
# discards the old labels instead of keeping them as a new column.

shipments_df = shipments_df.reset_index(drop=True)

In [5]:
# Parse the postcode column into a numeric dtype
# (pd.to_numeric raises if a value cannot be parsed)

shipments_df["cmlPostCode"] = pd.to_numeric(shipments_df["cmlPostCode"])

In [6]:
# Cast the (already numeric) postcode column to 64-bit integers

shipments_df = shipments_df.astype({"cmlPostCode": np.int64})

In [7]:
# Convert the ship / created timestamp columns from text to datetime.
#
# The format ends with an AM/PM marker (%p), so the hour has to be parsed
# with the 12-hour directive %I. With %H the marker is matched but has no
# effect on the parsed hour: every PM time is stored 12 hours early and
# "12:xx:xx AM" is stored as noon instead of midnight.
#
# NOTE(review): this assumes the source really uses a 12-hour clock. If any
# value carries an hour above 12 the parse now raises ValueError instead of
# silently storing a wrong time - confirm against the raw CSV.

timestamp_format = '%d/%m/%Y %I:%M:%S %p'
for date_column in ('smpShipDate', 'smlCreatedDate'):
    # Strip surrounding whitespace first so the format matches exactly
    shipments_df[date_column] = pd.to_datetime(
        shipments_df[date_column].str.strip(), format=timestamp_format
    )

In [8]:
# Derive reporting columns from the ship date:
#   ShipPeriod - year and month of the ship date (monthly period as text)
#   ShipYear   - year of the ship date (yearly period as text)
#   ShipMonth  - full month name

ship_dates = shipments_df['smpShipDate']
shipments_df = shipments_df.assign(
    ShipPeriod=ship_dates.dt.to_period('M').astype(str),
    ShipYear=ship_dates.dt.to_period('Y').astype(str),
    ShipMonth=ship_dates.dt.month_name(),
)

In [9]:
# Preview the first five rows of the cleaned shipment data

shipments_df.head(5)

Unnamed: 0,smpPlantID,smlCreatedDate,smlShipmentID,smlShipmentLineID,smlPartID,smlDescription,smlPartRevisionID,smpCustomerOrganizationID,smpShipDate,smpShipOrganizationID,...,cmlOrganizationID,cmlName,cmlAddressLine1,cmlCity,cmlState,cmlPostCode,quantityShipped,ShipPeriod,ShipYear,ShipMonth
0,P01,2021-07-23 04:41:16,129241,1,105689,Cardboard Carton Dc/Rsc 1190mm x 195mm x 700mm...,,FROAUS,2021-07-26 12:00:00,FROAUS,...,FROAUS,Frontline Australasia Pty Ltd,"Door 3, 55 Letcon Drive",Bangholme,VIC,3175,23.0,2021-07,2021,July
1,P01,2021-07-23 04:41:16,129241,2,105689,Cardboard Carton Dc/Rsc 1190mm x 195mm x 700mm...,,FROAUS,2021-07-26 12:00:00,FROAUS,...,FROAUS,Frontline Australasia Pty Ltd,"Door 3, 55 Letcon Drive",Bangholme,VIC,3175,3.0,2021-07,2021,July
2,P09,2021-07-26 06:35:23,129247,1,305834,WESTRO Pallet 1165mm x 1165mm x 138mm,P09,WESOCE,2021-07-26 06:35:17,WESOCE,...,WESOCE,Westrock Oceania Pty Ltd,"Gate 1, Beaumont Avenue",North Richmond,NSW,2754,20.0,2021-07,2021,July
3,P09,2021-07-26 06:36:49,129248,1,306036,FIRES Pallet 570mm x 570mm,P09,FIRPTY,2021-07-26 06:36:44,FIRPTY,...,FIRPTY,Firesense Pty Ltd,18-20 Brookhollow Avenue,Baulkham Hills,NSW,2153,32.0,2021-07,2021,July
4,P09,2021-07-26 06:36:49,129248,2,305915,Std Heavy Duty Second Hand Pallets,HEAVY DUTY,FIRPTY,2021-07-26 06:36:44,FIRPTY,...,FIRPTY,Firesense Pty Ltd,18-20 Brookhollow Avenue,Baulkham Hills,NSW,2153,20.0,2021-07,2021,July


### Cleaning Postcode Data

In [10]:
# Keep only the columns needed downstream, then discard rows that have
# no SA3 code
postcode_columns = ["postcode", "lat", "long", "sa3", "sa3name"]
filtered_postcode_df = postcode_df[postcode_columns].dropna(subset=['sa3'])

# Count how many unique postcodes remain in the df
# (dropna=False so a missing postcode, if any, is counted as one value)
filtered_postcode_df["postcode"].nunique(dropna=False)


3073

In [11]:
# Create a new df that keeps only the first row seen for each postcode
# and uses the postcode as its index
unique_postcode_df = (
    filtered_postcode_df
    .drop_duplicates(subset=['postcode'], keep='first')
    .set_index("postcode")
)

In [12]:
# Confirm the row count still equals the number of unique postcodes
unique_postcode_df.shape[0]

3073

In [13]:
# Preview the first five rows of the de-duplicated postcode data
unique_postcode_df.head()

Unnamed: 0_level_0,lat,long,sa3,sa3name
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
800,-12.458684,130.83668,70101.0,Darwin City
801,-12.458684,130.83668,70101.0,Darwin City
804,-12.428017,130.873315,70102.0,Darwin Suburbs
810,-12.381806,130.866242,70102.0,Darwin Suburbs
811,-12.381806,130.866242,70102.0,Darwin Suburbs


### Cleaning SA3 Data

In [14]:
# Keep only the SA3 code, name and URI columns, rename them to the names
# used in the rest of the notebook, and index the frame by SA3 code
filtered_sa3_df = (
    sa3_df[['SA3_CODE_2021', "SA3_NAME_2021", "ASGS_LOCI_URI_2021"]]
    .rename(columns={
        'SA3_CODE_2021': 'sa3',
        'SA3_NAME_2021': 'sa3name',
        'ASGS_LOCI_URI_2021': 'location_URI',
    })
    .set_index("sa3")
)

# Remove the row whose SA3 code is 'ZZZZZ'
clean_sa3_df = filtered_sa3_df.drop(index='ZZZZZ')

# Preview the first five rows
clean_sa3_df.head()

Unnamed: 0_level_0,sa3name,location_URI
sa3,Unnamed: 1_level_1,Unnamed: 2_level_1
10102,Queanbeyan,http://linked.data.gov.au/dataset/asgsed3/SA3/...
10103,Snowy Mountains,http://linked.data.gov.au/dataset/asgsed3/SA3/...
10104,South Coast,http://linked.data.gov.au/dataset/asgsed3/SA3/...
10105,Goulburn - Mulwaree,http://linked.data.gov.au/dataset/asgsed3/SA3/...
10106,Young - Yass,http://linked.data.gov.au/dataset/asgsed3/SA3/...


### Load Data

In [15]:
# Connect to the local database. All connection details (protocol, user,
# password, host, port, database name) are read from the local `config`
# module so that no personal data is stored in the notebook itself.
# NOTE(review): the values are interpolated into the URL unescaped; a
# password containing URL-reserved characters such as '@', ':' or '/' would
# presumably be mis-parsed by create_engine - confirm and URL-escape if needed.
rds_connection_string = f'{config.protocol}://{config.username}:{config.password}@{config.host}:{config.port}/{config.database_name}'
engine = create_engine(rds_connection_string)
# Inspector used below to list the tables that exist in the database
insp = inspect(engine)

In [16]:
# Check that queries.sql has been run in PGAdmin: list the tables that
# currently exist in the connected database. The load steps below append to
# sa3_table, australian_postcodes and shipment_table.
insp.get_table_names()

['sa3_table', 'australian_postcodes', 'shipment_table']

In [17]:
# Append the cleaned SA3 data frame to the existing sa3_table;
# the index (sa3) is written as a column, which is the to_sql default
clean_sa3_df.to_sql('sa3_table', engine, if_exists='append')

In [18]:
# Read sa3_table back from the database and show the first five rows to
# confirm the load worked
pd.read_sql_query('select * from sa3_table', engine).head(5)

Unnamed: 0,sa3,sa3name,location_URI
0,10102,Queanbeyan,http://linked.data.gov.au/dataset/asgsed3/SA3/...
1,10103,Snowy Mountains,http://linked.data.gov.au/dataset/asgsed3/SA3/...
2,10104,South Coast,http://linked.data.gov.au/dataset/asgsed3/SA3/...
3,10105,Goulburn - Mulwaree,http://linked.data.gov.au/dataset/asgsed3/SA3/...
4,10106,Young - Yass,http://linked.data.gov.au/dataset/asgsed3/SA3/...


In [19]:
# Append the de-duplicated postcode data frame to the existing
# australian_postcodes table; the index (postcode) is written as a column,
# which is the to_sql default
unique_postcode_df.to_sql('australian_postcodes', engine, if_exists='append')

In [20]:
# Read australian_postcodes back from the database and show the first five
# rows to confirm the load worked
pd.read_sql_query('select * from australian_postcodes', engine).head(5)

Unnamed: 0,postcode,lat,long,sa3,sa3name
0,800,-12.458684,130.83668,70101,Darwin City
1,801,-12.458684,130.83668,70101,Darwin City
2,804,-12.428017,130.873315,70102,Darwin Suburbs
3,810,-12.381806,130.866242,70102,Darwin Suburbs
4,811,-12.381806,130.866242,70102,Darwin Suburbs


In [21]:
# Append the cleaned shipments data frame to the existing shipment_table;
# the DataFrame index is not written (index=False)
shipments_df.to_sql('shipment_table', engine, if_exists='append', index=False)

In [22]:
# Read shipment_table back from the database and show the first five rows
# to confirm the load worked
pd.read_sql_query('select * from shipment_table', engine).head(5)

Unnamed: 0,id,smpPlantID,smlCreatedDate,smlShipmentID,smlShipmentLineID,smlPartID,smlDescription,smlPartRevisionID,smpCustomerOrganizationID,smpShipDate,...,cmlOrganizationID,cmlName,cmlAddressLine1,cmlCity,cmlState,cmlPostCode,quantityShipped,ShipPeriod,ShipYear,ShipMonth
0,1,P01,2021-07-23,129241,1,105689,Cardboard Carton Dc/Rsc 1190mm x 195mm x 700mm...,,FROAUS,2021-07-26,...,FROAUS,Frontline Australasia Pty Ltd,"Door 3, 55 Letcon Drive",Bangholme,VIC,3175,23.0,2021-07,2021,July
1,2,P01,2021-07-23,129241,2,105689,Cardboard Carton Dc/Rsc 1190mm x 195mm x 700mm...,,FROAUS,2021-07-26,...,FROAUS,Frontline Australasia Pty Ltd,"Door 3, 55 Letcon Drive",Bangholme,VIC,3175,3.0,2021-07,2021,July
2,3,P09,2021-07-26,129247,1,305834,WESTRO Pallet 1165mm x 1165mm x 138mm,P09,WESOCE,2021-07-26,...,WESOCE,Westrock Oceania Pty Ltd,"Gate 1, Beaumont Avenue",North Richmond,NSW,2754,20.0,2021-07,2021,July
3,4,P09,2021-07-26,129248,1,306036,FIRES Pallet 570mm x 570mm,P09,FIRPTY,2021-07-26,...,FIRPTY,Firesense Pty Ltd,18-20 Brookhollow Avenue,Baulkham Hills,NSW,2153,32.0,2021-07,2021,July
4,5,P09,2021-07-26,129248,2,305915,Std Heavy Duty Second Hand Pallets,HEAVY DUTY,FIRPTY,2021-07-26,...,FIRPTY,Firesense Pty Ltd,18-20 Brookhollow Avenue,Baulkham Hills,NSW,2153,20.0,2021-07,2021,July
