In [None]:
# importing required libraries
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import sklearn

from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the GeoPackage holding the per-feature metrics into a GeoDataFrame.
gdf = gpd.read_file(filename='tabuk_metrics.gpkg')

In [None]:
# Show the first five rows as a quick sanity check of the loaded table.
gdf.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
gdf.info()

In [None]:
# Data types of every attribute column (the geometry column is left out).
gdf.drop(columns=['geometry']).dtypes

In [None]:
# Store the base-10 logarithm of `area_m2` as a new column on the frame.
gdf['log10_area_m2'] = np.log10(gdf.loc[:, 'area_m2'])

In [None]:
# Pairwise scatter plots of all attribute columns (geometry excluded).
sns.pairplot(data=gdf.drop('geometry', axis='columns'))

### Standardise

In [None]:
# Column labels that remain once the raw area and the geometry are removed.
gdf.columns.drop(['area_m2', 'geometry'])

In [None]:
# Feature columns: every column except the raw area and the geometry.
features = gdf.columns.drop(['area_m2', 'geometry'])

# Fit the scaler on the feature columns, then apply it to the same data so
# each feature ends up with zero mean and unit variance.
scaler = StandardScaler().fit(gdf[features])
Z = scaler.transform(gdf[features])

# Wrap the scaled array in a DataFrame that keeps the feature names and the
# original row index of `gdf`.
Z_df = pd.DataFrame(data=Z, index=gdf.index, columns=features)

In [None]:
# Correlation matrix of the standardised features, drawn as an annotated
# heatmap with a diverging palette centred on zero.
sns.heatmap(data=Z_df.corr(), center=0, cmap="coolwarm", annot=True)

In [None]:
# Pairwise scatter plots of the standardised feature table.
sns.pairplot(data=Z_df)

In [None]:
# Feature table: all columns of `gdf` except the raw area and the geometry.
X = gdf.drop(['area_m2', 'geometry'], axis='columns')

In [None]:
# Standardise X with a freshly fitted scaler; the result is a plain array.
Z = StandardScaler().fit(X).transform(X)

In [None]:
# Pairwise scatter plots of the standardised array. The column names and row
# index of X are re-attached so each panel is labelled with its feature name
# instead of a positional integer.
sns.pairplot(
    pd.DataFrame(Z, columns=X.columns, index=X.index)
)

### PCA biplot

In [None]:
# Project the standardised features onto their first two principal components.
# (PCA is imported in the notebook's top import cell.)
pca = PCA(n_components=2)
pcs = pca.fit_transform(Z_df)

# Component scores, indexed like the source frame.
pc_df = pd.DataFrame(pcs, columns=["PC1","PC2"], index=gdf.index)

# Biplot: observations as points, feature loadings as arrows from the origin.
# An explicit figure/axes pair is used so every artist lands on the same axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x="PC1", y="PC2", data=pc_df, s=30, alpha=0.6, ax=ax)

# Loadings: component weights scaled by the square root of the variance
# explained by each component (one row per feature, one column per PC).
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
for col, (load_x, load_y) in zip(Z_df.columns, loadings):
    # length_includes_head=True makes the arrow tip end exactly at the
    # loading coordinate instead of overshooting it by the head length.
    ax.arrow(0, 0, load_x, load_y,
             color="red", alpha=0.5, head_width=0.05,
             length_includes_head=True)
    # Place the label slightly beyond the tip so it does not cover the arrow.
    ax.text(load_x*1.1, load_y*1.1, col, color="red")

# Reference lines through the origin.
ax.axhline(0, color="grey", lw=1)
ax.axvline(0, color="grey", lw=1)
ax.set_title(f"PCA biplot (PC1 {pca.explained_variance_ratio_[0]:.2%}, "
             f"PC2 {pca.explained_variance_ratio_[1]:.2%})")
plt.show()

### Cluster

In [None]:
# Preview the standardised columns used for clustering. The list includes
# 'rect' so it matches the clustering selection, and only the first rows are
# shown instead of the whole frame.
Z_df[['perimeter', 'rect', 'pp','shape_idx','log10_area_m2']].head()

In [None]:
# Subset of standardised features that is fed to the clustering models.
Z_df_sel = Z_df.loc[
    :,
    ['perimeter', 'rect', 'pp', 'shape_idx', 'log10_area_m2'],
]

In [None]:
# Fit DBSCAN (neighbourhood radius 0.3, at least 7 samples per core point)
# on the selected standardised features.
dbscan_clustering = DBSCAN(min_samples=7, eps=0.3).fit(X=Z_df_sel)

In [None]:
# Fit HDBSCAN with its default parameters on the same selected features.
hdbscan_clustering = HDBSCAN().fit(X=Z_df_sel)

In [None]:
# Convert the HDBSCAN labels to strings so Plotly treats them as categories
# and assigns one discrete colour per cluster; integer labels would be mapped
# onto a continuous colour scale. The Series name becomes the legend title.
cluster_labels = pd.Series(
    hdbscan_clustering.labels_.astype(str),
    index=Z_df_sel.index,
    name='cluster',
)

# 3-D scatter of three of the clustering features, coloured by cluster.
fig = px.scatter_3d(
    Z_df_sel,
    x='perimeter',
    y='pp',
    z='shape_idx',
    color=cluster_labels,
    opacity=0.5,            # markers are 50% transparent
    title='HDBSCAN clusters (standardised features)',
)

# No `size=` column is used, so the marker size is set directly on the traces.
fig.update_traces(marker=dict(size=3))

fig.show()

In [None]:
# Rows and columns of the source frame. The previous name `blocks` is not
# defined anywhere in this notebook, so the cell failed on a fresh kernel.
gdf.shape

In [None]:
# Cluster label assigned to each row by the fitted HDBSCAN model. The previous
# name `clustering` is not defined anywhere in this notebook.
hdbscan_clustering.labels_

In [None]:
# Number of labels produced by the fitted HDBSCAN model. The previous name
# `clustering` is not defined anywhere in this notebook.
len(hdbscan_clustering.labels_)

In [None]:
# Map the geometries coloured by HDBSCAN cluster. `categorical=True` makes
# geopandas treat the integer labels as classes, so the legend lists one entry
# per cluster instead of showing a continuous colour bar.
ax = gdf.plot(
    figsize=(10,10),
    column=hdbscan_clustering.labels_,
    categorical=True,
    cmap='rainbow',
    legend=True
)
ax.set_title('HDBSCAN cluster per feature')
ax.set_axis_off()