In [1]:
import pandas as pd
import numpy as np
import os
import re

from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql import functions as F
import matplotlib.pyplot as plt

spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/09/08 02:37:30 WARN Utils: Your hostname, DESKTOP-IK201ES resolves to a loopback address: 127.0.1.1; using 172.29.212.84 instead (on interface eth0)
22/09/08 02:37:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/08 02:37:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.3.0
      /_/

Using Python version 3.8.10 (default, Jun 22 2022 20:18:18)
Spark context Web UI available at http://172.29.212.84:4040
Spark context available as 'sc' (master = local[*], app id = local-1662568653237).
SparkSession available as 'spark'.
22/09/08 02:37:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## Loading all datasets
- We will be performing basic analysis on each of the dataset
- Choose certain features that seem to be appropriate for geospatial visualisation

### Customer transaction details

In [2]:
customer_join_transaction = spark.read.parquet("../data/curated/customer_join_transaction.parquet/")

                                                                                

In [3]:
customer_join_transaction.count()

                                                                                

3643266

In [4]:
customer_join_transaction.show()

+-------+------------+------------------+--------------------+--------------+--------+-----+------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|postcode|state|gender|
+-------+------------+------------------+--------------------+--------------+--------+-----+------+
|  14409| 64403598239|116.41150553221357|5474e46e-7073-442...|    2021-08-21|    3874|  VIC|  Male|
|  14409| 45629217853|28.600613410021932|f539493d-175d-48b...|    2021-04-09|    3874|  VIC|  Male|
|  14409| 77505747217|440.68714072131945|99f8e136-0ca2-491...|    2021-08-22|    3874|  VIC|  Male|
|  14409| 68216911708|22.960774428094698|f4ea2269-3509-411...|    2021-04-10|    3874|  VIC|  Male|
|  14409| 24852446429|30.776665837082245|4b1413e3-3ee9-430...|    2021-08-16|    3874|  VIC|  Male|
|  14409| 98072939449| 1118.263275036643|e597537b-d4a9-446...|    2021-04-10|    3874|  VIC|  Male|
|  14409| 22718657980|307.02232595923795|82c8d263-9d95-424...|    2021-08-27|    3874|  VIC|  Male|


### Customer behaviour
- Only notable feature is no_orders of each customer at a particular date

In [5]:
customer_behaviour = spark.read.parquet("../data/curated/customer_purchase_behaviour.parquet/")

In [6]:
customer_behaviour

user_id,order_datetime,dollar_spent,no_orders
18585,2021-08-20,28.66078522665458,2
271,2021-08-20,1851.9709565996084,2
559,2021-08-20,519.619820268801,2
19156,2021-08-20,302.72316980059384,2
756,2021-08-20,169.25922159143067,2
785,2021-08-20,49.4051194612238,1
970,2021-08-20,12.28895749317062,1
997,2021-08-20,57.1290091592229,1
1069,2021-08-20,65.389116401826,1
1256,2021-08-20,337.0946186819374,2


In [7]:
merchant_sales = spark.read.parquet("../data/curated/merchant_sales.parquet/")

### Merchant Sales
- Sales revenue per day
- Number of orders

In [8]:
merchant_sales

merchant_abn,order_datetime,sales_revenue,no_orders
80324045558,2021-08-20,13598.201854339311,369
98072939449,2021-08-20,5281.817739501697,6
97785987616,2021-08-20,953.5840137769342,5
28829519760,2021-08-20,159.9365755216398,3
88795682549,2021-08-20,1120.173012819486,3
26505333735,2021-08-20,2476.215150920252,5
70561019208,2021-08-20,562.9418605668311,2
55778594682,2021-08-20,153.90330586197558,3
71946255432,2021-08-20,1279.8546655899863,7
75242363611,2021-08-20,47.071978784780754,1


### Sales by region
- Information regarding sales made by customers of specific locations in Australia

In [9]:
sales_by_region = pd.read_parquet("../data/curated/sales_by_region.parquet/")

In [10]:
sales_by_region.head()

Unnamed: 0,state,postcode,order_datetime,dollar_spent,no_orders
0,NSW,2323,2021-04-25,629.114612,6
1,WA,6057,2021-03-29,1310.850726,10
2,WA,6057,2021-07-17,812.452904,10
3,WA,6415,2021-03-08,1519.568907,8
4,WA,6034,2021-07-20,451.85365,8


In [11]:
sales_by_region.nunique()

state                  8
postcode            3165
order_datetime       181
dollar_spent      565366
no_orders             31
dtype: int64

In [12]:
revenue_by_region = sales_by_region\
                    .groupby(['state', 'postcode'], as_index=False)\
                    .agg(
                        {
                            'dollar_spent': 'sum'
                        }
                    ).rename(columns={'dollar_spent': 'total_revenue_generated'})


In [13]:
revenue_by_region['postcode'] = revenue_by_region['postcode'].astype(int)

### Checking for unique values of the columns
Mainly just looking out for inconsistencies in state and gender

In [14]:
from pyspark.sql import functions as F

In [15]:
customer_join_transaction.select("gender").distinct().show()

+-----------+
|     gender|
+-----------+
|Undisclosed|
|     Female|
|       Male|
+-----------+



In [16]:
customer_join_transaction.select("state").distinct().show()

+-----+
|state|
+-----+
|   NT|
|  ACT|
|   SA|
|  TAS|
|   WA|
|  QLD|
|  VIC|
|  NSW|
+-----+



## External dataset
- External dataset allows linking respective postcodes to their SA2 level index
- External dataset contains geometry for SA2 level which allows for geospatial analysis


In [17]:
import pandas as pd
import geopandas as gpd
import io
import requests

url = "https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')))



Dataset contains alot of useless features that are not required for our analysis, hence we will be retaining only some of the important features.
- For instance, in this dataset, we have long, lat, long_precise, and lat_precise, we will retain the precise longitude and latitde as they are generated from Google Maps API 

In [18]:
postcode_sdf = spark.createDataFrame(df[['postcode', 'SA2_MAINCODE_2016']])
postcode_df = postcode_sdf.toPandas()

In [19]:
revenue_by_region = revenue_by_region.merge(postcode_df, how='left', left_on='postcode', right_on='postcode')
revenue_by_region.count()

state                      18416
postcode                   18416
total_revenue_generated    18416
SA2_MAINCODE_2016          18239
dtype: int64

In [28]:
revenue_by_region = revenue_by_region.rename({'SA2_MAINCODE_2016': 'SA2_CODE21'}, axis=1)
revenue_by_region 

Unnamed: 0,state,postcode,total_revenue_generated,SA2_CODE21
0,ACT,200,86107.658731,801051049.0
1,ACT,200,86107.658731,801051049.0
2,ACT,2600,141718.545481,801061129.0
3,ACT,2600,141718.545481,801061070.0
4,ACT,2600,141718.545481,801061068.0
...,...,...,...,...
18411,WA,6989,216606.759192,506041136.0
18412,WA,6990,154148.478358,506041134.0
18413,WA,6991,239751.982000,506011114.0
18414,WA,6992,173698.464927,506011110.0


## Statistical Areas Level 2 - 2021 - Shapefile

In [22]:
sf = gpd.read_file("../data/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp")

In [31]:
filter_cols = ['SA2_CODE21', 'geometry']
sf = sf[filter_cols]
sf = sf.dropna(how='any',axis=0) 
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sf['SA2_CODE21'] = sf['SA2_CODE21'].astype(int)

In [32]:
gdf = gpd.GeoDataFrame(
    pd.merge(revenue_by_region, sf, on=	'SA2_CODE21', how='left')
)

In [33]:
geoJSON = gdf[['postcode', 'geometry']].drop_duplicates('postcode').to_json()

In [34]:
import folium

_map = folium.Map(location=[144.96, -37.82], tiles="Stamen Terrain", zoom_start=10)

# refer to the folium documentations on how to plot aggregated data.
_map.add_child(folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
))