In [None]:
import geopandas as gpd
import folium
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [None]:
# Create a Spark session (which will run Spark jobs).
# NOTE: the executor memory key must be spelled "spark.executor.memory";
# a misspelled key is not the executor memory setting, so it would never apply.
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # render DataFrames eagerly when they are the last expression in a cell
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # keep timestamps in UTC regardless of the machine's local time zone
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [None]:
# Load the SA2 (2021) boundary shapefile, keeping only the code, name and
# geometry columns and giving them the names used throughout this notebook.
sf = (
    gpd.read_file("../data/external/SA2_2021/SA2_2021_AUST_GDA2020.shp")
    .loc[:, ["SA2_CODE21", "SA2_NAME21", "geometry"]]
    .rename(columns={"SA2_CODE21": "SA2_code", "SA2_NAME21": "SA2_name"})
)

# Drop rows that have no geometry. `.notna()` is the explicit missing-value
# test (rather than comparing against None), and `.copy()` makes the filtered
# frame independent so the column assignment below does not write to a slice.
sf = sf.loc[sf.geometry.notna()].copy()

# Integer codes so they match the integer SA2 codes of the frames joined later
sf["SA2_code"] = sf["SA2_code"].astype(int)
gdf = gpd.GeoDataFrame(sf)

# create a JSON
# (built before the tuple-valued 'centroid' column is added below)
geoJSON = gdf.drop_duplicates("SA2_code").to_json()

# derive zone centroids as (y, x) tuples, i.e. (latitude, longitude) if the
# geometries are in geographic coordinates -- TODO confirm the CRS
gdf['centroid'] = gdf['geometry'].apply(lambda geom: (geom.centroid.y, geom.centroid.x))
gdf.head()

In [None]:
# Read the curated parquet dataset into a Spark DataFrame and show its row count
sdf_all = spark.read.format("parquet").load("../data/curated/full_data_without_fraud/")
sdf_all.count()

In [None]:
def draw_map(df, columns, legend_name):
    """
    Build a folium choropleth map of the SA2 zones coloured by a data column.

    Parameters:
        df: dataframe holding the values to plot; its index is reset before
            it is handed to folium.
        columns: two column names, [key column, value column]; the key column
            is matched against `SA2_code` in the module-level `geoJSON`.
        legend_name: caption shown on the colour legend.

    Returns:
        The folium.Map with the choropleth layer added.
    """
    # Base map; the start location and zoom are the notebook's fixed defaults
    base_map = folium.Map(
        location=[-30, 144],
        width=800,
        height=500,
        tiles="cartodbpositron",
        zoom_start=4,
    )

    choropleth_layer = folium.Choropleth(
        geo_data=geoJSON,               # zone boundaries (module-level geoJSON)
        name='choropleth',
        data=df.reset_index(),          # data source
        columns=columns,                # [key, value] columns to use
        key_on='properties.SA2_code',   # property of each geoJSON feature to join on
        fill_color='YlOrRd',            # colour scheme
        line_opacity=0.1,
        fill_opacity=0.7,
        legend_name=legend_name,
    )
    choropleth_layer.add_to(base_map)

    return base_map

## Mean Total Income vs. SA2 

In [22]:
income_df = pd.read_csv("../data/curated/processed_income.csv")

# Keep only the rows whose SA2 code consists purely of digits. Casting to str
# first guarantees a plain boolean mask: a missing code becomes "nan" (and is
# rejected) instead of producing a NaN mask entry, and the check still works
# if pandas parsed the column as integers.
has_numeric_code = income_df['SA2_code'].astype(str).str.isnumeric()

# .copy() so the column assignment below does not write to a slice of the raw frame
income_df = income_df.loc[has_numeric_code].copy()
income_df['SA2_code'] = income_df['SA2_code'].astype(int)

# join the computed dataframe with geo dataframe
# (default inner join: SA2 codes absent from gdf are dropped)
income_df = income_df.merge(gdf[['SA2_code', 'SA2_name', 'geometry']], on='SA2_code')
income_df

Unnamed: 0,SA2_code,mean_total_income,SA2_name,geometry
0,101021007,51149,Braidwood,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4..."
1,101021008,66335,Karabar,"POLYGON ((149.21899 -35.36738, 149.21800 -35.3..."
2,101021009,65874,Queanbeyan,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3..."
3,101021010,69860,Queanbeyan - East,"POLYGON ((149.24034 -35.34781, 149.24024 -35.3..."
4,101021012,85607,Queanbeyan West - Jerrabomberra,"POLYGON ((149.19572 -35.36126, 149.19970 -35.3..."
...,...,...,...,...
2142,801101136,98692,Denman Prospect,"POLYGON ((149.01710 -35.30726, 149.01680 -35.3..."
2143,801101137,0,Molonglo,"POLYGON ((149.03732 -35.30891, 149.03704 -35.3..."
2144,801101139,86007,Wright,"POLYGON ((149.03122 -35.32194, 149.03139 -35.3..."
2145,801111140,73435,ACT - South West,"POLYGON ((148.88381 -35.26411, 148.94988 -35.2..."


In [23]:
# Show the districts whose mean total income is above 200,000
income_df[income_df["mean_total_income"].gt(200000)]

Unnamed: 0,SA2_code,mean_total_income,SA2_name,geometry
1640,503011032,281099,Cottesloe,"POLYGON ((115.75101 -31.99292, 115.75101 -31.9..."
1645,503021037,276862,Kings Park (WA),"POLYGON ((115.83968 -31.96616, 115.83964 -31.9..."
1769,507021167,211893,O'Connor (WA),"POLYGON ((115.77964 -32.05654, 115.77959 -32.0..."


In [None]:
# Draw the choropleth of mean total income per SA2 district and save it as HTML
income_map = draw_map(
    df=income_df,
    columns=['SA2_code', 'mean_total_income'],
    legend_name="Mean Total Income ($)",
)
income_map.save('../plots/mean_income_vs_SA2_map.html')

### Number of Consumers vs. SA2

In [None]:
# Count the distinct consumers in each SA2 district (rows with a missing
# SA2 code or consumer id are dropped) and collect the result into pandas
num_consumer = (
    sdf_all.select("SA2_code", "consumer_id")
    .distinct()
    .dropna()
    .groupBy("SA2_code")
    .count()
    .toPandas()
)
# integer codes so they can be joined against the geo dataframe
num_consumer = num_consumer.astype({"SA2_code": int})

In [None]:
# Attach the SA2 name and geometry to the per-district consumer counts
consumer_df = pd.merge(
    num_consumer,
    gdf[['SA2_code', 'SA2_name', 'geometry']],
    on='SA2_code',
)
consumer_df.head()

In [None]:
# Draw the choropleth of the number of consumers per SA2 district and save it as HTML
num_consumer_map = draw_map(
    df=consumer_df,
    columns=['SA2_code', 'count'],
    legend_name="Number of Consumers",
)
num_consumer_map.save('../plots/number_of_consumer_vs_SA2_map.html')

### Number of Transactions vs. SA2

In [None]:
# Count the distinct orders in each SA2 district (rows with a missing
# SA2 code or order id are dropped) and collect the result into pandas
num_transaction = (
    sdf_all.select("SA2_code", "order_id")
    .distinct()
    .dropna()
    .groupBy("SA2_code")
    .count()
    .toPandas()
)
# integer codes so they can be joined against the geo dataframe
num_transaction = num_transaction.astype({"SA2_code": int})

# Attach the SA2 name and geometry to the per-district transaction counts
transaction_df = pd.merge(
    num_transaction,
    gdf[['SA2_code', 'SA2_name', 'geometry']],
    on='SA2_code',
)
transaction_df.head()

In [None]:
# Draw the choropleth of the number of transactions per SA2 district and save it as HTML
num_transaction_map = draw_map(
    df=transaction_df,
    columns=['SA2_code', 'count'],
    legend_name="Number of Transactions",
)
num_transaction_map.save('../plots/number_of_transactions_vs_SA2_map.html')