# ML4CPS Project-2 | NB-2

In [None]:
import os
import sys

# Put the notebook's parent directory on the import path so that packages
# living there (e.g. the `utils` package imported below) can be resolved.
basepath = os.path.abspath("..")
if basepath not in sys.path:
    sys.path.append(basepath)

%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [None]:
from utils.dataset import DatasetLoaderXL

In [None]:
# Seed passed as `random_state` to the MDS stress sweep at the end of the notebook.
SEED = 42

## Load dataset

In [None]:
# Create the project's dataset loader, pointed at the dataset directory one
# level above the notebook.
dsxl = DatasetLoaderXL(dataset_dir="../dataset")
# Presumably reads every dataset found under `dataset_dir` — confirm in utils.dataset.
dsxl.load_all_datasets()
# Trailing ";" stops the notebook from echoing the call's return value.
dsxl.list_suburbs();

In [None]:
# Suburb analysed in the cells below; presumably has to match one of the
# names reported by dsxl.list_suburbs() — TODO confirm.
suburb = "Ascot-Vale"

In [None]:
# The return value is discarded, so this call presumably selects the suburb
# as the loader's active data — verify against DatasetLoaderXL.get_data.
dsxl.get_data(suburb)
# Trailing ";" stops the notebook from echoing the call's return value.
dsxl.list_categories();

In [None]:
# Fetch the "Geography" category for the selected suburb.
# NOTE(review): `df` is not referenced again in this notebook, while the PCA
# cells below read `X` and `y`, which are never assigned here — the step that
# derives them (presumably from `df`) appears to be missing.
df = dsxl.get_category("Geography")

## PCA

In [None]:
# Standardise every feature (zero mean, unit variance) so that PCA is not
# dominated by features with large numeric ranges.
# NOTE(review): `X` is not assigned anywhere in this notebook; on a fresh
# kernel this cell raises NameError — confirm where X should come from.
X_scaled = StandardScaler().fit_transform(X)

In [None]:
# Number of principal components to keep.
n_components = 4
pca = PCA(n_components=n_components)
# Fit on the standardised features; as the cell's last expression, the
# fitted estimator is also echoed by the notebook.
pca.fit(X_scaled)

In [None]:
# Display the principal axes found by the fit (one row per component).
pca.components_

In [None]:
# Display the fraction of total variance explained by each kept component.
pca.explained_variance_ratio_

In [None]:
# Scree plot: percentage of the total variance captured by each principal
# component. The bar heights are the percentages (matching the y-axis label),
# and the number of bars follows the fitted PCA instead of a hard-coded 4.
percent_variance = np.round(pca.explained_variance_ratio_ * 100, decimals=2)
component_ids = range(1, len(percent_variance) + 1)
plt.bar(
    x=component_ids,
    height=percent_variance,
    tick_label=[f"PC{i}" for i in component_ids],
)
plt.ylabel("Percentage of variance explained")
plt.xlabel("Principal component")
plt.title("PCA Scree plot")
plt.show()

In [None]:
# Cumulative explained variance as a function of how many components are kept.
# The x values start at 1 so that the axis really is a component count
# (plotting the cumsum alone would place the first component at x = 0).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xticks(component_counts)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [None]:
# Project the samples onto the principal components. The PCA was fitted on the
# standardised features, so the same standardised matrix must be transformed;
# projecting the raw X would mix unscaled values with axes learned on scaled data.
X_pca = pca.transform(X_scaled)

In [None]:
# Scatter of the samples in the plane of the first two principal components,
# coloured by the target values.
# NOTE(review): `y` is not assigned anywhere in this notebook — confirm its origin.
plt.figure(figsize=(10, 7))
first_pc, second_pc = X_pca[:, 0], X_pca[:, 1]
ax = sns.scatterplot(x=first_pc, y=second_pc, hue=y.flatten(), palette="viridis")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
ax.set_title("PCA")
ax.legend(title="Target")
plt.show()

## MDS

In [None]:
from sklearn.manifold import MDS
from sklearn import datasets
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

In [None]:
def mapData(dist_matrix, X, y, metric, title, image_shape=(64, 64)):
    """Embed samples with MDS and draw the embedding twice, side by side.

    The left panel is a scatter plot coloured by label; the right panel shows
    each sample's image as a thumbnail placed at its embedded position.

    Args:
        dist_matrix: Precomputed pairwise dissimilarity matrix of the samples.
        X: Iterable of flattened images, one per row of ``dist_matrix``; each
            must be reshapeable to ``image_shape``.
        y: Labels used to colour the left-hand scatter plot. The palette
            lists four colours, so at most four distinct labels are supported.
        metric: Forwarded to ``MDS(metric=...)``; ``True`` selects metric MDS,
            ``False`` non-metric MDS.
        title: Title placed above the right-hand (thumbnail) panel.
        image_shape: (height, width) of each thumbnail. Defaults to the
            64x64 shape that was previously hard-coded.
    """
    mds = MDS(metric=metric, dissimilarity="precomputed", random_state=0)
    pts = mds.fit_transform(dist_matrix)

    fig = plt.figure(2, (15, 6))

    # Left panel: embedding coloured by label. The axes are passed explicitly
    # rather than relying on pyplot's "current axes" state.
    label_ax = fig.add_subplot(1, 2, 1)
    sns.scatterplot(
        x=pts[:, 0],
        y=pts[:, 1],
        hue=y,
        palette=["r", "g", "b", "c"],
        ax=label_ax,
    )

    # Right panel: the same embedding with each image drawn at its coordinates.
    image_ax = fig.add_subplot(1, 2, 2)
    image_ax.scatter(pts[:, 0], pts[:, 1])
    for sample, (px, py) in zip(X, pts):
        thumbnail = OffsetImage(
            sample.reshape(image_shape), zoom=0.3, cmap=plt.cm.gray
        )
        image_ax.add_artist(AnnotationBbox(thumbnail, (px, py), frameon=False))
    image_ax.set_title(title)
    plt.show()

In [None]:
# Load the Olivetti faces and keep only the samples whose target label is
# below 4, which keeps the MDS plots small and readable.
faces = datasets.fetch_olivetti_faces()
ind = faces.target < 4
X_faces = faces.data[ind]
y_faces = faces.target[ind]

In [None]:
# Pairwise Euclidean distances between all selected face images; used as the
# precomputed dissimilarity matrix for the MDS cells below.
dist_euclid = euclidean_distances(X_faces)

In [None]:
# Metric MDS on the Euclidean distances.
mapData(
    dist_euclid,
    X_faces,
    y_faces,
    metric=True,
    title='Metric MDS with Euclidean',
)

In [None]:
# Non-metric MDS on the same Euclidean distances, for comparison.
mapData(
    dist_euclid,
    X_faces,
    y_faces,
    metric=False,
    title='Non-metric MDS with Euclidean',
)

In [None]:
# Final stress of the MDS embedding for every target dimensionality, to see
# how many dimensions are needed before the fit stops improving.
max_dims = 21  # exclusive upper bound: dimensions 1..20 are evaluated
dims = range(1, max_dims)
# Only the fitted model's stress_ is needed, so the embedded points that
# fit_transform would return are not kept.
stress = [
    MDS(n_components=dim, dissimilarity="precomputed", random_state=SEED)
    .fit(dist_euclid)
    .stress_
    for dim in dims
]

plt.plot(dims, stress)
plt.xticks(range(1, max_dims, 2))
plt.xlabel("n_components")
plt.ylabel("stress")
plt.show()