# ML4CPS Project-2 | NB-5

In [None]:
import os
import sys

# Make the parent of the current working directory importable so that
# project packages living next to this notebook's folder can be imported.
basepath = os.path.abspath("..")
# Guard against duplicates: notebook cells are routinely re-executed.
if basepath not in sys.path:
    sys.path.append(basepath)

# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell execution, so edits to project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [None]:
from utils.dataset import DatasetLoaderXL, DataProcessor, LocationProcessor

## Load dataset

In [None]:
# Build the loader over the raw dataset directory (path is relative to the
# notebook's working directory) with a fixed seed and verbose output enabled.
dsxl = DatasetLoaderXL(dataset_dir="../dataset/raw", seed=42, verbose=True)
# Load everything up front; the cells below query `dsxl` repeatedly.
dsxl.load_all_datasets()

In [None]:
# Inspect the available suburbs. As the last expression of the cell, any
# returned value is echoed as the cell output.
dsxl.list_suburbs()

In [None]:
# Inspect the available categories. As the last expression of the cell, any
# returned value is echoed as the cell output.
dsxl.list_categories()

In [None]:
# Query the subcategories of the three population-related categories.
# The trailing `;` suppresses the notebook's echo of the expression value.
# NOTE(review): with console_print=False and the echo suppressed, this cell
# appears to show nothing unless the loader emits output on its own — confirm
# that is intended.
dsxl.list_subcategories("2012 population", console_print=False);
dsxl.list_subcategories("2007 population", console_print=False);
dsxl.list_subcategories("2007-2012 population change", console_print=False);

In [None]:
# Age bands of the "ERP age <band>, persons" subcategories. They are kept in
# lexicographic label order, which is why "5-9" sits between "45-64" and
# "65-69".
_AGE_BANDS = (
    "0-4",
    "10-14",
    "15-19",
    "20-24",
    "25-44",
    "45-64",
    "5-9",
    "65-69",
    "70-74",
    "75-79",
    "80-84",
    "85+",
)

# For each year ("2012 population" first, then "2007 population"): one entry
# per age band followed by that year's total.
subcategories = []
for _year in (2012, 2007):
    subcategories.extend(
        f"{_year} ERP age {_band}, persons" for _band in _AGE_BANDS
    )
    subcategories.append(f"{_year} ERP, total")
# Fetch the selected subcategories for every suburb, turn the literal 'n/a'
# placeholder into a real missing value, and drop any row that has one.
df = (
    dsxl.get_subcategories_across_all_suburbs(subcategories)
    .replace('n/a', pd.NA)
    .dropna()
)

In [None]:
# Wrap the cleaned frame in a DataProcessor and normalise it before computing
# correlations (the return value of normalize() is not used here).
dp = DataProcessor(df)
dp.normalize()
# Ten variable pairs with the largest absolute correlation.
pairs = dp.get_topk_abs_correlations(k=10)
# Export the same pairs to a LaTeX file.
dp.correlations_to_latex(pairs, filename="correlations.tex")
# Last expression of the cell: echo the pairs as the cell output.
pairs

In [None]:
# Narrow the analysis to the variables that occur in at least one of the
# top correlated pairs (either side), de-duplicated and sorted.
subcategories = sorted({*pairs['Variable1'], *pairs['Variable2']})

# Re-fetch only those columns and clean them the same way as before:
# 'n/a' placeholders become missing values and incomplete rows are dropped.
df = (
    dsxl.get_subcategories_across_all_suburbs(subcategories)
    .replace('n/a', pd.NA)
    .dropna()
)

# Rebuild the processor on the reduced frame and normalise it again.
dp = DataProcessor(df)
dp.normalize()

In [None]:
# Run the ANOVA analysis over the reduced set of subcategories.
anova_results = dp.run_anova_analysis(subcategories)
# Export the results to a LaTeX file.
dp.export_anova_to_latex(anova_results, filename="anova-popul.tex")
# Last expression of the cell: echo the results as the cell output.
anova_results

In [None]:
# Plot a dendrogram from the processor's current (normalised, reduced) data.
dp.plot_dendrogram()

In [None]:
# Similarity matrix computed with the Euclidean metric.
dfs = dp.get_similarity_matrix(metric="euclidean")
# Return value is discarded here.
# NOTE(review): presumably this prints or plots the 3 nearest neighbours per
# suburb itself — confirm, otherwise the result is lost.
dp.get_similar_suburbs(dfs, n_neighbours=3)
# Visualise the similarity matrix; the second positional argument is the title.
dp.plot_heatmap(dfs, "Similarity heatmap")

In [None]:
# Fetch the "Location" subcategory for every suburb and hand it to a
# LocationProcessor.
locations = dsxl.get_subcategory_across_all_suburbs("Location")
lp = LocationProcessor(locations)

# Keep the coordinates in a notebook-level name; the plotting cell reuses them.
coordinates = lp.get_coordinates()

# Wrap the proximity matrix in a DataFrame and coerce every column to numeric;
# values that cannot be parsed become NaN (errors='coerce').
proximity_matrix = pd.DataFrame(lp.calculate_proximity_matrix()).apply(
    pd.to_numeric, errors='coerce'
)
dp.plot_heatmap(proximity_matrix, title="Proximity Matrix")

In [None]:
# Reflect the coordinates, then rotate the reflected result by angle=90,
# before plotting.
# NOTE: `coordinates` is overwritten with the transformed values, so running
# this cell again applies the reflection and rotation a second time.
coordinates = lp.rotate_coordinates(
    lp.reflect_coordinates(coordinates),
    angle=90,
)
lp.plot_coordinates(coordinates)