# Visualization on Vote Dataset

In [None]:
from src.read.processing import Processing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Notebook display: never truncate columns or rows when a frame is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the raw table through the project's dataset reader.
# NOTE(review): the dataset is requested as 'vote', yet the column names
# assigned below and the plot titles further down refer to the Glass
# dataset -- confirm that 'vote' is the intended key.
dataclass = Processing(source_path='input/datasets/')
df = dataclass.read('vote')

# Replace the header wholesale; this works around the problematic name of
# the K column in the raw file.
df.columns = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']

# Coarse class assigned to every raw "Type" value.
d_class = {
    'build_wind_float': 'float',
    'vehic_wind_float': 'float',
    'build_wind_non-float': 'non_float',
    'containers': 'non_wind',
    'tableware': 'non_wind',
    'headlamps': 'non_wind',
}

# Integer code for each of the two classes that are kept.
d_class_num = {"non_float": 0, "float": 1}

# Derive the coarse class, then keep only the float / non_float rows.
df['Class'] = df['Type'].map(d_class)
is_float_or_non_float = df['Class'].isin(['float', 'non_float'])
df = df[is_float_or_non_float]
df.head()

In [None]:
from src.decomposition.PCA import PCA

# Project the feature columns (all but the last two columns of df) onto two
# components using the project's own PCA implementation.
feature_frame = df.iloc[:, :-2]
pca = PCA(n_components=2)
X_transformed = pca.fit_transform(feature_frame)

In [None]:
# Scatter the two projected components, coloured by the numeric class code.
class_codes = df['Class'].map(d_class_num)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_transformed[:, 0], X_transformed[:, 1], c=class_codes)
ax.set_title("PCA on Glass dataset (our implementation)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()

In [None]:
# Same with sklearn PCA
from sklearn.decomposition import PCA as sklearnPCA

# Reference projection: scikit-learn's PCA on the same feature columns.
# Note that this rebinds X_transformed.
sklearn_pca = sklearnPCA(n_components=2)
X_transformed = sklearn_pca.fit_transform(df.iloc[:, :-2])

# Scatter the first component against the second, coloured by class code.
pc1, pc2 = X_transformed[:, 0], X_transformed[:, 1]
plt.figure(figsize=(10, 6))
plt.scatter(pc1, pc2, c=df['Class'].map(d_class_num))
plt.title("PCA with sklearn")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

In [None]:
from sklearn.decomposition import IncrementalPCA

# Same projection once more, this time with scikit-learn's IncrementalPCA.
# X_transformed is rebound again and keeps this value for the cells below.
ipca = IncrementalPCA(n_components=2)
X_transformed = ipca.fit_transform(df.iloc[:, :-2])

# Draw on an explicit Axes and set title and axis labels in one call.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    X_transformed[:, 0],
    X_transformed[:, 1],
    c=df['Class'].map(d_class_num),
)
ax.set(title="PCA with IncrementalPCA", xlabel="PC1", ylabel="PC2")
plt.show()

In [None]:
from sklearn.cluster import Birch, KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, v_measure_score, confusion_matrix, classification_report

In [None]:
# Show which raw "Type" values are still present in the filtered frame.
remaining_types = df['Type'].unique()
print(remaining_types)

In [None]:
# Feature matrix: every column of df except the last two.
X = df.iloc[:, :-2]
# Ground-truth class codes as an (n_samples, 1) column.
y = df['Class'].map(d_class_num).values.reshape(-1, 1)

# Ask each clusterer for as many clusters as there are ground-truth classes.
n_clusters = len(np.unique(y))

# Both pipelines standardise the features before clustering.
birch = Pipeline([
    ('scaler', StandardScaler()),
    # Birch defaults to n_clusters=3; pass the class count explicitly so it
    # is compared with KMeans on the same number of clusters.
    ('model', Birch(n_clusters=n_clusters)),
])

kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KMeans(n_clusters=n_clusters)),
])

# Legacy alias: this cell has always left ``model_dbs`` bound to the KMeans
# pipeline; keep that binding in case another cell still reads it.
model_dbs = kmeans

In [None]:
def evaluate_model(model, X, y):
    """Fit a clustering pipeline on ``X`` and print two quality scores.

    Parameters
    ----------
    model : pipeline-like object
        Must support ``fit(X)`` and expose its clustering step as
        ``model['model']``, which carries a ``labels_`` attribute once fitted.
    X : array-like
        Samples to cluster. The same ``X`` is passed to the silhouette score.
    y : array-like
        Ground-truth class per sample. It is flattened before scoring, so
        both ``(n,)`` and ``(n, 1)`` layouts are accepted.

    Returns
    -------
    None
        The silhouette score and the V-measure are printed, not returned.
    """
    model.fit(X)

    # Keep labels one-dimensional: scikit-learn's metrics expect 1-D label
    # arrays and warn (DataConversionWarning) when handed an (n, 1) column.
    y_pred = np.ravel(model['model'].labels_)
    y_true = np.ravel(y)

    # NOTE(review): the silhouette is measured on X exactly as passed in,
    # not on the output of the pipeline's preprocessing steps -- confirm
    # that this is the intended feature space.
    print(f"Silhouette score: {silhouette_score(X, y_pred)}")
    print(f"V-measure score: {v_measure_score(y_true, y_pred)}")

# Cluster the transformed Data using BIRCH

In [None]:
# BIRCH on the raw feature matrix.
print("Evaluation results on BIRCH using the original dataset")
evaluate_model(birch, X, y)

print("-"*50)

# BIRCH again, on the 2-D projection currently held in X_transformed.
# The banner names BIRCH because that is the pipeline being evaluated here.
print("Evaluation results on BIRCH using the transformed dataset")
evaluate_model(birch, X_transformed, y)

# Cluster the transformed Data using K-Means

In [None]:
# KMeans on the raw feature matrix.
# The banner names KMeans because that is the pipeline being evaluated here.
print("Evaluation results on KMeans using the original dataset")
evaluate_model(kmeans, X, y)

print("-"*50)

# KMeans again, on the 2-D projection currently held in X_transformed.
print("Evaluation results on KMeans using the transformed dataset")
evaluate_model(kmeans, X_transformed, y)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the feature columns (all but the last two columns of df) to two
# components with TruncatedSVD.
svd_input = df.iloc[:, :-2]
svd = TruncatedSVD(n_components=2)
X_transformed_svd = svd.fit_transform(svd_input)

# Rebuild the ground-truth class codes as an (n_samples, 1) column.
class_codes = df['Class'].map(d_class_num)
y = class_codes.values.reshape(-1, 1)

# Cluster the TruncatedSVD-transformed data using BIRCH and K-Means

In [None]:
# BIRCH on the two TruncatedSVD components; the banner names the input that
# is actually passed in.
print("Evaluation results on BIRCH using the TruncatedSVD-transformed dataset")
evaluate_model(birch, X_transformed_svd, y)

In [None]:
# KMeans on the two TruncatedSVD components; the banner names both the
# pipeline and the input that are actually used.
print("Evaluation results on KMeans using the TruncatedSVD-transformed dataset")
evaluate_model(kmeans, X_transformed_svd, y)

In [None]:
# Plot the first two raw feature columns against each other, coloured by the
# numeric class code.
raw_values = X.to_numpy()
first_feature = raw_values[:, 0]
second_feature = raw_values[:, 1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(first_feature, second_feature, c=df['Class'].map(d_class_num))
ax.set_title("First 2 dimensions from Original Dataset")
ax.set_xlabel("X1")
ax.set_ylabel("X2")
plt.show()

In [None]:
# Refit both pipelines on the raw features so that each model's labels_
# describes the rows of X.
for pipeline in (birch, kmeans):
    pipeline.fit(X)

# Side-by-side view of the BIRCH and KMeans assignments, drawn over the
# first two raw feature columns.
raw_values = X.to_numpy()
figure, axs = plt.subplots(1, 2, figsize=(15, 6))
panels = (("BIRCH", birch), ("KMeans", kmeans))
for ax, (algo_name, fitted) in zip(axs, panels):
    ax.scatter(raw_values[:, 0], raw_values[:, 1], c=fitted['model'].labels_)
    ax.set_title(f"{algo_name} results on the original Dataset")
    ax.set_xlabel("X1")
    ax.set_ylabel("X2")
plt.show()

In [None]:
from sklearn.manifold import Isomap

isomap = Isomap(n_components=2)

# Two 2-D embeddings of the raw features: the project's PCA (this rebinds
# X_transformed) and Isomap.
X_transformed = pca.fit_transform(X)
X_transformed_isomap = isomap.fit_transform(X)

# 2x2 grid: one row per embedding, one column per clustering pipeline.
# Points are coloured by the labels_ each pipeline currently holds.
# Isomap coordinates are not principal components, so that row gets its own
# axis labels instead of "PC1"/"PC2".
embeddings = (
    ("PCA", X_transformed, ("PC1", "PC2")),
    ("ISOMAP", X_transformed_isomap, ("Isomap 1", "Isomap 2")),
)
clusterings = (("BIRCH", birch), ("KMeans", kmeans))

figure, axs = plt.subplots(2, 2, figsize=(15, 12))
for row, (embedding_name, points, (x_label, y_label)) in enumerate(embeddings):
    for col, (algo_name, fitted) in enumerate(clusterings):
        ax = axs[row, col]
        ax.scatter(points[:, 0], points[:, 1], c=fitted['model'].labels_)
        ax.set_title(f"{algo_name} on the original Dataset using {embedding_name}")
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
plt.show()
