# ML4CPS Project-2 | NB-2

In [None]:
import os
import sys

# Put the parent of the current working directory on sys.path so that
# project-local packages living there can be imported from this notebook.
basepath = os.path.abspath("..")
# Guard against appending the same entry again when the cell is re-run.
if basepath not in sys.path:
    sys.path.append(basepath)

%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [None]:
from utils.dataset import DatasetLoaderXL

In [None]:
# Fixed seed handed to estimators as `random_state` (e.g. KMeans below) so
# that re-running the notebook gives repeatable results.
SEED = 42

## Load dataset

In [None]:
# Build the project loader against the dataset directory that sits next to
# this notebook's folder, then ask it to load its data.
# NOTE(review): presumably load_all_datasets() reads every file under
# dataset_dir into memory — confirm in utils.dataset.
dsxl = DatasetLoaderXL(dataset_dir="../dataset")
dsxl.load_all_datasets()

In [None]:
# Inspection cell: the call's return value (if any) is shown as cell output.
# NOTE(review): presumably lists the category names the loader knows about.
dsxl.list_categories()

In [None]:
# NOTE(review): the result of the first call is discarded, so it matters only
# if it has a side effect on the loader (presumably selecting "Geography" as
# the active category for the argument-less call below) — confirm in
# utils.dataset.
dsxl.get_category_across_all_suburbs("Geography")
# `df` is the table every later cell works from.
df = dsxl.get_values_for_subcategory_across_all_suburbs()

In [None]:
# Keep three columns of `df`, selected by position (0, 5 and 7).
# Later cells read 'Area (km^2)' and 'Distance to GPO (km)' from this subset,
# so those names must be among the selected positions; a change in the column
# order of `df` would silently break that.
subset_df = df.iloc[:, [0, 5, 7]]
# Display the subset as cell output.
subset_df

In [None]:
# Inspection cell: show the column labels of `df`.
df.columns

In [None]:
# Inspection cell: show the row labels of `df`.
df.index

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Coerce the two geographic features to numbers: values that cannot be parsed
# become NaN and are then replaced by 0, giving euclidean_distances a dense,
# fully numeric table.
# NOTE(review): the features are compared unscaled and missing values count
# as 0 — confirm both choices are intended.
subset_df_numeric = (
    subset_df[['Area (km^2)', 'Distance to GPO (km)']]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Square matrix of pairwise Euclidean distances between the rows of
# subset_df_numeric (entry [i, j] compares row i with row j).
geo_proximity_dist_matrix = euclidean_distances(subset_df_numeric)

# Turn distance d into a similarity 1 / (1 + d): identical rows score 1 and
# the score shrinks towards 0 as rows move apart.
geo_proximity_sim_matrix = 1.0 / (geo_proximity_dist_matrix + 1.0)
geo_proximity_sim_matrix

In [None]:
# Heatmap of the pairwise distance matrix; both axes enumerate its rows/columns.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(geo_proximity_dist_matrix, cmap='viridis', ax=ax)
ax.set_title('Euclidean Distance Matrix')
ax.set_xlabel('Suburbs')
ax.set_ylabel('Suburbs')
plt.show()

In [None]:
# Heatmap of the similarity matrix derived from the pairwise distances.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(geo_proximity_sim_matrix, cmap='viridis', ax=ax)
ax.set_title('Geographic Proximity Similarity Matrix')
ax.set_xlabel('Suburbs')
ax.set_ylabel('Suburbs')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Group the rows of the distance matrix into 5 clusters.
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones, with a FutureWarning
# in between), so leaving it implicit makes the labels version-dependent even
# with a fixed random_state.
# NOTE(review): KMeans is fitted on the pairwise-distance matrix, i.e. each
# row is described by its distances to every other row, not by the two
# raw features — confirm this is the intended representation.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=SEED)
# One integer cluster label per row of geo_proximity_dist_matrix.
clusters = kmeans.fit_predict(geo_proximity_dist_matrix)

In [None]:
# Scatter the two geographic features, coloured by KMeans cluster label.
# The coordinates come from subset_df_numeric — the coerced, NaN-filled values
# the distance matrix (and therefore the clustering) was computed from —
# rather than the raw subset_df columns, which may still hold non-numeric
# entries that would be plotted as categories or dropped.
plt.figure(figsize=(10, 7))
sns.scatterplot(
    x=subset_df_numeric['Area (km^2)'],
    y=subset_df_numeric['Distance to GPO (km)'],
    hue=clusters,
    palette="viridis",
    s=100,
    alpha=0.7,
    edgecolor='k'
)
plt.xlabel("Area (km^2)")
plt.ylabel("Distance to GPO (km)")
plt.title("KMeans Clustering Visualization")
plt.legend(title="Cluster")
plt.show()

In [None]:
# Project the rows of the distance matrix onto their first two principal
# components for a 2-D view of the clustering.
# random_state is set because PCA's default 'auto' solver may choose the
# randomized SVD for larger inputs; seeding it keeps the projection repeatable
# (the seed is ignored by the deterministic solvers).
pca = PCA(n_components=2, random_state=SEED)
pca_result = pca.fit_transform(geo_proximity_dist_matrix)

# Colour each projected point by its KMeans cluster label.
plt.figure(figsize=(10, 8))
plt.scatter(pca_result[:, 0], pca_result[:, 1], c=clusters, cmap='viridis')
plt.title('PCA of Euclidean Distance Matrix')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()