# ML4CPS Project-2 | NB-4

In [None]:
import os
import sys

# Put the parent directory of the notebook's working directory on sys.path
# (presumably so the project's `utils` package imported below resolves).
# The membership guard keeps re-running this cell from appending duplicates.
basepath = os.path.abspath("..")
if basepath not in sys.path:
    sys.path.append(basepath)

%load_ext autoreload
%autoreload 2

In [None]:
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
from utils.dataset import DatasetLoaderXL

## Load dataset

In [None]:
# Build the project loader over the raw dataset directory with a fixed seed and
# verbose output, then ask it to load all datasets.
dsxl = DatasetLoaderXL(dataset_dir="../dataset/raw", seed=42, verbose=True)
dsxl.load_all_datasets()

In [None]:
# Presumably shows the suburbs known to the loader — see utils.dataset.
dsxl.list_suburbs()

In [None]:
# Presumably shows the top-level data categories — see utils.dataset.
dsxl.list_categories()

In [None]:
# Subcategories under "Geography"; the trailing semicolon suppresses the
# notebook's echo of the call's return value.
dsxl.list_subcategories("Geography");

In [None]:
# Measures selected for the suburb comparison in the rest of this notebook.
subcategories = [
    "Population Density",
    "Travel time to GPO (minutes)",
    "Distance to GPO (km)",
    "Area (km^2)",
]
# NOTE(review): presumably returns a frame with one row per suburb and one
# column per requested subcategory — confirm against utils.dataset.
df = dsxl.get_subcategories_across_all_suburbs(subcategories)
df

In [None]:
from sklearn.preprocessing import minmax_scale

# Min-max scale each column (axis=0) and wrap the resulting array back into a
# DataFrame that carries the row and column labels of `df`.
df_norm = pd.DataFrame(
    minmax_scale(df, axis=0),
    index=df.index,
    columns=df.columns,
)
df_norm

In [None]:
# Pairwise correlation between the selected columns, computed on the raw
# (un-normalised) frame `df`.
correlation_matrix = df.corr()
correlation_matrix

In [None]:
# Flatten the correlation matrix into one row per (Variable1, Variable2) pair,
# drop self-pairs, order by absolute correlation (weakest first) and finally
# keep a single orientation of each symmetric pair (Variable1 < Variable2).
correlation_pairs = (
    correlation_matrix.unstack()
    .reset_index()
    .set_axis(['Variable1', 'Variable2', 'Correlation'], axis=1)
    .assign(AbsCorrelation=lambda pairs: pairs['Correlation'].abs())
    .loc[lambda pairs: pairs['Variable1'] != pairs['Variable2']]
    .sort_values(by='AbsCorrelation', ascending=True)
    .loc[lambda pairs: pairs['Variable1'] < pairs['Variable2']]
)
correlation_pairs

In [None]:
from scipy.stats import f_oneway

# One f_oneway result per selected subcategory, keyed by subcategory name.
# NOTE(review): the comprehension variable `suburb` is never used, so every
# group handed to f_oneway is the same full column `df[subcategory].values`,
# repeated once per row of `df`. The groups are therefore identical and the
# test cannot tell suburbs apart — confirm the intended grouping before
# relying on these results.
anova_results = {}
for subcategory in subcategories:
    anova_results[subcategory] = f_oneway(*[df[subcategory].values for suburb in df.index])
anova_results

In [None]:
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

# Hierarchical clustering (Ward linkage) of the min-max normalised features,
# drawn as a dendrogram.
plt.figure(figsize=(10, 7))
dendrogram = sch.dendrogram(
    sch.linkage(df_norm, method='ward'),
    # Label each leaf with its row label from df_norm instead of SciPy's
    # default integer positions, so the 'Suburbs' axis is readable and matches
    # how the MDS plot annotates its points.
    labels=df_norm.index.tolist(),
    # Row labels can be long; rotate them so they do not overlap.
    leaf_rotation=90,
)
plt.title('Dendrogram')
plt.xlabel('Suburbs')
plt.ylabel('Euclidean distances')
plt.show()

In [None]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity

In [None]:
def get_similarity_matrix(df, metric):
    """Apply a pairwise metric to ``df`` and label the result by row index.

    Args:
        df: DataFrame whose rows are the items to compare.
        metric: Callable taking ``df`` and returning a square array-like with
            one row and one column per row of ``df``.

    Returns:
        A DataFrame holding ``metric(df)``, with ``df.index`` used for both
        the row and the column labels.
    """
    labels = df.index
    scores = metric(df)
    return pd.DataFrame(scores, index=labels, columns=labels)

In [None]:
def euclidean_similarity(x):
    """Turn pairwise Euclidean distances into similarity scores.

    Each distance ``d`` returned by ``euclidean_distances(x)`` is mapped to
    ``1 / (1 + d)``, so a distance of 0 scores 1 and larger distances score
    closer to 0.

    Args:
        x: Data accepted by ``sklearn.metrics.pairwise.euclidean_distances``
            (one row per item).

    Returns:
        The element-wise ``1 / (1 + distance)`` of the pairwise distance
        matrix.
    """
    return 1 / (1 + euclidean_distances(x))

In [None]:
# Similarity between rows of the normalised frame, based on Euclidean distance.
dfs = get_similarity_matrix(df_norm, euclidean_similarity)
dfs

In [None]:
# Cosine similarity between rows of the normalised frame. This rebinds `dfs`,
# so when the cells are run in order, everything below uses the cosine version
# rather than the Euclidean one computed above.
dfs = get_similarity_matrix(df_norm, cosine_similarity)
dfs

In [None]:
def get_similar_suburbs(similarity_matrix, n=5):
    """Return, for every row, the labels of its ``n`` most similar peers.

    A row's own label is removed before ranking. Overwriting the diagonal
    with 0 is not enough: 0 still outranks negative similarities (possible
    with cosine similarity) and re-enters the result when ``n`` is at least
    the number of peers. Dropping the label also avoids writing through
    ``DataFrame.values``, which is not guaranteed to be a writable view.

    Args:
        similarity_matrix: Square DataFrame of similarity scores whose index
            and columns carry the same labels. It is not modified.
        n: Maximum number of peers to return per row.

    Returns:
        A Series indexed like ``similarity_matrix`` whose values are lists of
        column labels ordered from most to least similar. A list is shorter
        than ``n`` when fewer peers exist.
    """
    def _top_peers(row):
        # row.name is the label of the row being ranked; errors="ignore" keeps
        # this safe if that label is absent from the columns.
        peers = row.drop(labels=row.name, errors="ignore")
        return peers.nlargest(n).index.tolist()

    return similarity_matrix.apply(_top_peers, axis=1)

In [None]:
# Five most similar entries for every row of the current `dfs`.
get_similar_suburbs(dfs, 5)

In [None]:
def run_mds_and_plot(similarity_matrix, n_components=2, random_state=42):
    """Embed a similarity matrix with MDS and scatter-plot the first two axes.

    The similarities are turned into dissimilarities as ``1 - similarity`` and
    passed to ``sklearn.manifold.MDS`` as a precomputed matrix.

    Args:
        similarity_matrix: Square DataFrame of similarity scores; its index
            labels the embedded points.
        n_components: Number of MDS dimensions to compute. The plot always
            uses the columns named ``MDS1`` and ``MDS2``.
        random_state: Seed forwarded to ``MDS``.

    Returns:
        A DataFrame indexed like ``similarity_matrix`` with one column per
        component, named ``MDS1`` ... ``MDS<n_components>``.
    """
    component_names = [f"MDS{k+1}" for k in range(n_components)]
    embedder = MDS(
        n_components=n_components,
        dissimilarity="precomputed",
        random_state=random_state,
    )
    coordinates = embedder.fit_transform(1 - similarity_matrix)
    mds_df = pd.DataFrame(
        coordinates,
        index=similarity_matrix.index,
        columns=component_names,
    )

    plt.figure(figsize=(10, 7))
    sns.scatterplot(x="MDS1", y="MDS2", data=mds_df)
    # Annotate each point with its index label.
    for label, x_pos, y_pos in zip(mds_df.index, mds_df["MDS1"], mds_df["MDS2"]):
        plt.text(x_pos, y_pos, label, fontsize=9)
    plt.title('MDS Plot')
    plt.xlabel('MDS1')
    plt.ylabel('MDS2')
    plt.show()

    return mds_df

mds_df = run_mds_and_plot(dfs)