<h1 align="center">Trabalho 2 - Aprendizagem de Máquina</h1>

## Andre Brun
### Daniel Boll & Mateus Karvat
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib import style, cm
import matplotlib.tri as mtri
from mpl_toolkits.mplot3d import Axes3D

import time

# Analytics
from sklearn.metrics import accuracy_score, silhouette_score, pairwise_distances
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance

# Clusters
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

# Configurations
# Apply the 'ggplot' style sheet to every matplotlib figure created below.
style.use('ggplot')
# Select the Qt backend for matplotlib.
# NOTE(review): presumably needs a Qt binding installed in the kernel environment — confirm.
%matplotlib qt
# Print NumPy floats with 3 decimals and suppress scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Display pandas floats with 3 decimals.
pd.set_option("display.precision", 3)

In [None]:
# Load the dataset from a CSV file located next to the notebook.
data = pd.read_csv('./Base9.csv')

# Keep an untouched copy of the original data
# as a safeguard.
raw_data = data.copy()

Separando as coordenadas x e y das labels nas variáveis ***X*** e ***label***

In [None]:
# First two columns of the frame: the coordinates of each sample.
X = np.array(data.values[:, :2])
# Third column: the label of each sample (used as an integer colour index and as the reference labelling below).
label = np.array(data.values[:, 2])

Plota o *dataset* com as cores verde para a primeira classe e vermelha para a segunda.

In [None]:
# Marker style per class id; supports up to 4 classes.
colors = ["g.", "r.", "c.", "b."]

# Draw one line artist per class instead of one per sample: the picture is the
# same, but the figure holds a handful of artists rather than len(X) of them.
for class_id in np.unique(label):
    in_class = label == class_id
    plt.plot(X[in_class, 0], X[in_class, 1], colors[int(class_id)])
plt.show()

---
## Métodos de avaliação (non-built-in)

In [None]:
def cohesion_score(X, labels):
    """
    Intra-cluster cohesion of a labelling.

    For every distinct value found in ``labels``, the squared Euclidean
    distances between all points carrying that label are computed as a full
    pairwise matrix and summed. The square root of the grand total over all
    labels is returned.

    Parameters
    ----------
    X : array-like
        Samples, one per row.
    labels : ndarray
        One label per row of ``X``; every distinct value is treated as a
        cluster of its own.

    Returns
    -------
    float
        Square root of the summed within-label squared distances.
    """
    def within_label_total(lab):
        # Row positions of the samples carrying this label.
        member_rows = np.nonzero(labels == lab)[0]
        members = np.take(X, member_rows, axis=0)
        return np.sum(pairwise_distances(members, metric='sqeuclidean', n_jobs=-1))

    return np.sqrt(sum(within_label_total(lab) for lab in np.unique(labels)))

In [None]:
def separation_score(X, labels):
    """
    Inter-cluster separation of a labelling.

    For every distinct value found in ``labels``, the squared Euclidean
    distances between the points carrying that label and all points carrying
    any other label are summed. The square root of the grand total over all
    labels is returned.

    Parameters
    ----------
    X : array-like
        Samples, one per row.
    labels : ndarray
        One label per row of ``X``.

    Returns
    -------
    float
        Square root of the summed between-label squared distances.
    """
    def across_label_total(lab):
        belongs = labels == lab
        # Samples with this label versus every remaining sample.
        inside = np.take(X, np.nonzero(belongs)[0], axis=0)
        outside = np.take(X, np.nonzero(labels != lab)[0], axis=0)
        return np.sum(pairwise_distances(inside, outside, metric='sqeuclidean', n_jobs=-1))

    return np.sqrt(sum(across_label_total(lab) for lab in np.unique(labels)))

In [None]:
def entropy_score(X, label_class, label_dataset):
    """
    Mean Shannon entropy (in bits) of ``label_class`` inside each group of
    ``label_dataset``.

    Samples are grouped by their value in ``label_dataset``. Inside each group
    the relative frequency ``p`` of every ``label_class`` value is turned into
    the entropy ``-sum(p * log2(p))``. The result is the unweighted mean of
    those per-group entropies: 0 means every group holds a single
    ``label_class`` value.

    Parameters
    ----------
    X : any
        Unused; kept so the signature matches the other score functions.
    label_class : array-like
        One label per sample; its distribution inside each group is measured.
    label_dataset : array-like
        One label per sample, same length; defines the groups.

    Returns
    -------
    float
        Mean per-group entropy, or 0.0 when there are no samples.
    """
    label_class = np.asarray(label_class)
    label_dataset = np.asarray(label_dataset)

    group_ids = np.unique(label_dataset)
    if group_ids.size == 0:
        return 0.0

    total_entropy = 0.0
    for group in group_ids:
        # label_class values of the samples that fall in the current group.
        classes_in_group = label_class[label_dataset == group]
        counts = np.unique(classes_in_group, return_counts=True)[1]
        probs = counts / np.sum(counts)
        total_entropy -= np.sum(probs * np.log2(probs))

    # Average once, after all groups were accumulated. Dividing inside the
    # loop would rescale (and sign-flip) the partial sum on every iteration.
    return total_entropy / len(group_ids)

---

In [None]:
def plotScore3D(xp, yp, zp, title):
    """
    Show a score matrix as a 3-D surface with a colour bar.

    Parameters
    ----------
    xp : sequence
        Values along the columns of ``zp``.
    yp : sequence
        Values along the rows of ``zp``.
    zp : 2-D array
        Scores; must match the ``(len(yp), len(xp))`` grid built by
        ``np.meshgrid(xp, yp)``.
    title : str
        Figure title.
    """
    grid_x, grid_y = np.meshgrid(xp, yp)

    figure = plt.figure()
    axes3d = figure.add_subplot(111, projection='3d')

    # The yp grid is passed as the first coordinate and the xp grid as the second.
    surface = axes3d.plot_surface(
        grid_y, grid_x, zp,
        cmap=cm.coolwarm, linewidth=0, antialiased=True,
    )

    # Ten evenly spaced ticks on the score axis, printed with two decimals.
    score_axis = axes3d.zaxis
    score_axis.set_major_locator(LinearLocator(10))
    score_axis.set_major_formatter(FormatStrFormatter('%.02f'))

    # Colour bar mapping surface colours back to score values.
    figure.colorbar(surface, shrink=0.5, aspect=5)
    plt.title(title)
    plt.show()

---

## KMeans

In [None]:
# Hyper-parameter grid explored for KMeans.
kmc_parameters = {
    "n_clusters": list(range(2, 9)),
    "max_iter": list(range(1, 11))
}

# Fixed seed so the grid is reproducible across runs and so that, for a given
# n_clusters, every max_iter value starts from the same initialisation.
KMEANS_RANDOM_STATE = 42

cluster_size = len(kmc_parameters['n_clusters'])
iter_size    = len(kmc_parameters['max_iter'])

# One score matrix per metric: rows follow n_clusters, columns follow max_iter.
kmc_score_matrix_cohesion   = np.zeros((cluster_size, iter_size))
kmc_score_matrix_separation = np.zeros((cluster_size, iter_size))
kmc_score_matrix_entropy    = np.zeros((cluster_size, iter_size))
kmc_score_matrix_silhouette = np.zeros((cluster_size, iter_size))

for i, n_cluster in enumerate(kmc_parameters["n_clusters"]):
    for j, iteration in enumerate(kmc_parameters["max_iter"]):
        kmc = KMeans(
            n_clusters=n_cluster,
            max_iter=iteration,
            random_state=KMEANS_RANDOM_STATE,
        ).fit(X)

        kmc_labels = kmc.labels_

        cohesion    = cohesion_score(X, kmc_labels)
        separation  = separation_score(X, kmc_labels)
        # Groups the samples by the reference `label` and measures how the
        # KMeans labels are distributed inside each group.
        entropy     = entropy_score(X, kmc_labels, label)
        silhouette  = silhouette_score(X, kmc_labels, metric='euclidean')

        kmc_score_matrix_cohesion[i, j]     = cohesion
        kmc_score_matrix_separation[i, j]   = separation
        kmc_score_matrix_entropy[i, j]      = entropy
        kmc_score_matrix_silhouette[i, j]   = silhouette

In [None]:
# One surface per evaluation metric over the (max_iter, n_clusters) grid.
kmc_surfaces = (
    (kmc_score_matrix_cohesion, "Cohesion"),
    (kmc_score_matrix_separation, "Separation"),
    (kmc_score_matrix_entropy, "Entropy"),
    (kmc_score_matrix_silhouette, "Silhouette"),
)
for score_matrix, metric_title in kmc_surfaces:
    plotScore3D(kmc_parameters['max_iter'], kmc_parameters['n_clusters'], score_matrix, metric_title)

In [None]:
# All four KMeans score matrices share this (n_clusters, max_iter) shape.
template_shape      = kmc_score_matrix_cohesion.shape

# Extreme value of each matrix, reported below as the "best" score.
# NOTE(review): cohesion is selected with max and separation with min; given
# that cohesion_score sums intra-cluster distances and separation_score sums
# inter-cluster distances, confirm these are the intended directions.
maxScore_cohesion_kmeans   = kmc_score_matrix_cohesion.max()
minScore_separation_kmeans = kmc_score_matrix_separation.min()
minScore_entropy_kmeans    = kmc_score_matrix_entropy.min()
maxScore_silhouette_kmeans = kmc_score_matrix_silhouette.max()

# (row, column) position of each extreme value inside its matrix.
index_cohesion      = np.unravel_index(kmc_score_matrix_cohesion.argmax(), template_shape)
index_separation    = np.unravel_index(kmc_score_matrix_separation.argmin(), template_shape)
index_entropy       = np.unravel_index(kmc_score_matrix_entropy.argmin(), template_shape)
index_silhouette    = np.unravel_index(kmc_score_matrix_silhouette.argmax(), template_shape)

# -------------------------------------------------------------------
#           GET THE BEST PARAMETERS FOR EACH EVALUATION METHOD
# -------------------------------------------------------------------
# Row index selects n_clusters, column index selects max_iter.
bestN_cohesion, bestIter_cohesion = (
    kmc_parameters['n_clusters'][index_cohesion[0]],
    kmc_parameters['max_iter'][index_cohesion[1]],
)
bestN_separation, bestIter_separation = (
    kmc_parameters['n_clusters'][index_separation[0]],
    kmc_parameters['max_iter'][index_separation[1]],
)
bestN_entropy, bestIter_entropy = (
    kmc_parameters['n_clusters'][index_entropy[0]],
    kmc_parameters['max_iter'][index_entropy[1]],
)
bestN_silhouette, bestIter_silhouette = (
    kmc_parameters['n_clusters'][index_silhouette[0]],
    kmc_parameters['max_iter'][index_silhouette[1]],
)

# -------------------------------------------------------------------
# Report, one metric at a time.
# -------------------------------------------------------------------
print(f"COHESION:\nThe best score ({maxScore_cohesion_kmeans}) was acquired by the parameters:\n {bestIter_cohesion} iterations and {bestN_cohesion} clusters.")

print(f"SEPARATION:\nThe best score ({minScore_separation_kmeans}) was acquired by the parameters:\n {bestIter_separation} iterations and {bestN_separation} clusters.")

print(f"ENTROPY:\nThe best score ({minScore_entropy_kmeans}) was acquired by the parameters:\n {bestIter_entropy} iterations and {bestN_entropy} clusters.")

print(f"SILHOUETTE:\nThe best score ({maxScore_silhouette_kmeans}) was acquired by the parameters:\n {bestIter_silhouette} iterations and {bestN_silhouette} clusters.")

---
## DBScan

In [None]:
# Row index of the sample in X closest to the point (0.0, 1.0).
# NOTE(review): presumably an anchor inside the upper "banana" of the dataset — confirm against the plot.
banana_sup_index = distance.cdist(X, [[.0, 1.0]]).argmin()
# Row index of the sample in X closest to the point (1.0, -0.5).
# NOTE(review): presumably an anchor inside the lower "banana" — confirm against the plot.
banana_inf_index = distance.cdist(X, [[1.0, -0.5]]).argmin()

In [None]:
# Hyper-parameter grid explored for DBSCAN.
dbs_parameters = {
    "eps": [i/100 for i in range(1, 16)],
    "min_samples": list(range(3, 14))
}

# Upper bound on the number of distinct values in DBSCAN's labels_
# (the noise label -1, when present, counts as one of them).
MAX_DBS_LABELS = 8

# [eps, min_samples] pairs that pass the sanity check below.
validated_params = []

for epsx in dbs_parameters["eps"]:
    for min_sample in dbs_parameters["min_samples"]:
        dbs = DBSCAN(eps=epsx, min_samples=min_sample).fit(X)
        dbs_labels = dbs.labels_

        # The two anchor samples must not share a label.
        # NOTE(review): an anchor flagged as noise (-1) next to a clustered
        # anchor also passes this test — confirm that is acceptable.
        anchors_split = dbs_labels[banana_sup_index] != dbs_labels[banana_inf_index]
        few_enough_labels = len(np.unique(dbs_labels)) <= MAX_DBS_LABELS

        if anchors_split and few_enough_labels:
            validated_params.append([epsx, min_sample])

In [None]:
# Scatter every accepted pair: first element on the x axis, second on the y axis.
for eps_value, min_samples_value in validated_params:
    plt.plot(eps_value, min_samples_value, "k.")
plt.show()

In [None]:
# Convert the list of [eps, min_samples] pairs into a 2-D array so the
# following cell can slice it by column.
validated_params = np.array(validated_params)

In [None]:
# Distinct eps / min_samples values that appear in at least one validated pair.
unique_eps = np.unique(validated_params[:, 0])
unique_samples = np.unique(validated_params[:, 1])

matrix_size = (len(unique_eps), len(unique_samples))

# Rows follow unique_eps, columns follow unique_samples. Every cell starts at
# +inf, the placeholder for "this combination was not validated"; only the
# validated combinations are overwritten with real scores.
dbs_score_matrix_cohesion   = np.full(matrix_size, np.inf)
dbs_score_matrix_separation = np.full(matrix_size, np.inf)
dbs_score_matrix_entropy    = np.full(matrix_size, np.inf)
dbs_score_matrix_silhouette = np.full(matrix_size, np.inf)

# Build the lookup once instead of rebuilding and scanning a list of lists on
# every inner iteration.
validated_pairs = {tuple(pair) for pair in validated_params.tolist()}

for i, eps in enumerate(unique_eps):
    for j, sample in enumerate(unique_samples):
        if (float(eps), float(sample)) not in validated_pairs:
            continue

        # validated_params holds eps and min_samples in the same array, so
        # `sample` comes back with the array's shared (float) dtype; cast to
        # int because DBSCAN expects an integer min_samples.
        dbs = DBSCAN(eps=float(eps), min_samples=int(sample), n_jobs=-1).fit(X)

        # NOTE(review): the original author flagged this line as the source
        # of a problem without saying which. labels_ may contain -1 (noise),
        # and the score functions below treat it as an ordinary cluster.
        dbs_labels = dbs.labels_

        cohesion    = cohesion_score(X, dbs_labels)
        separation  = separation_score(X, dbs_labels)
        entropy     = entropy_score(X, dbs_labels, label)
        silhouette  = silhouette_score(X, dbs_labels, metric='euclidean')

        dbs_score_matrix_cohesion[i, j]     = cohesion
        dbs_score_matrix_separation[i, j]   = separation
        dbs_score_matrix_entropy[i, j]      = entropy
        dbs_score_matrix_silhouette[i, j]   = silhouette

In [None]:
# Replace the +inf placeholders of non-validated DBSCAN parameter pairs before plotting and ranking.


def _fill_invalid(score_matrix, fill_value=None):
    """
    Return a copy of ``score_matrix`` with its non-finite cells replaced.

    Non-finite cells are the placeholders of parameter combinations that were
    not validated. With ``fill_value=None`` they receive 0.99 times the
    smallest finite score, which keeps the plotted surface continuous.
    """
    valid = np.isfinite(score_matrix)
    if fill_value is None:
        if not valid.any():
            # Nothing to derive a fill value from; leave the matrix as it is.
            return score_matrix.copy()
        fill_value = np.min(score_matrix[valid]) * .99
    return np.where(valid, score_matrix, fill_value)


# Plot-only versions (trailing underscore): placeholders sit just under the
# smallest real score. Testing for "non-finite" rather than "== +inf" keeps
# this cell idempotent when it is re-run after the rebinding below.
dbs_score_matrix_cohesion_   = _fill_invalid(dbs_score_matrix_cohesion)
dbs_score_matrix_separation_ = _fill_invalid(dbs_score_matrix_separation)
dbs_score_matrix_entropy_    = _fill_invalid(dbs_score_matrix_entropy)
dbs_score_matrix_silhouette_ = _fill_invalid(dbs_score_matrix_silhouette)

# Cohesion and silhouette are ranked with max later on, so their placeholders
# must lose every comparison: -inf instead of 0, because a silhouette can be
# negative and a 0 placeholder would then beat real scores. Separation and
# entropy are ranked with min and keep +inf.
dbs_score_matrix_cohesion   = _fill_invalid(dbs_score_matrix_cohesion, -np.inf)
dbs_score_matrix_silhouette = _fill_invalid(dbs_score_matrix_silhouette, -np.inf)

In [None]:
# One surface per evaluation metric over the (min_samples, eps) grid, using
# the plot-only matrices (trailing underscore).
dbs_surfaces = (
    (dbs_score_matrix_cohesion_, "Cohesion"),
    (dbs_score_matrix_separation_, "Separation"),
    (dbs_score_matrix_entropy_, "Entropy"),
    (dbs_score_matrix_silhouette_, "Silhouette"),
)
for score_matrix, metric_title in dbs_surfaces:
    plotScore3D(unique_samples, unique_eps, score_matrix, metric_title)

In [None]:
# All four DBSCAN score matrices share this (eps, min_samples) shape.
template_shape      = dbs_score_matrix_cohesion.shape

# Extreme value of each matrix, reported below as the "best" score.
maxScore_cohesion_dbs   = dbs_score_matrix_cohesion.max()
minScore_separation_dbs = dbs_score_matrix_separation.min()
minScore_entropy_dbs    = dbs_score_matrix_entropy.min()
maxScore_silhouette_dbs = dbs_score_matrix_silhouette.max()

# (row, column) position of each extreme value inside its matrix.
index_cohesion      = np.unravel_index(dbs_score_matrix_cohesion.argmax(), template_shape)
index_separation    = np.unravel_index(dbs_score_matrix_separation.argmin(), template_shape)
index_entropy       = np.unravel_index(dbs_score_matrix_entropy.argmin(), template_shape)
index_silhouette    = np.unravel_index(dbs_score_matrix_silhouette.argmax(), template_shape)

# -------------------------------------------------------------------
#           GET THE BEST PARAMETERS FOR EACH EVALUATION METHOD
# -------------------------------------------------------------------
# Row index selects eps, column index selects min_samples.
bestEps_cohesion, bestMinSamples_cohesion = (
    unique_eps[index_cohesion[0]],
    unique_samples[index_cohesion[1]],
)
bestEps_separation, bestMinSamples_separation = (
    unique_eps[index_separation[0]],
    unique_samples[index_separation[1]],
)
bestEps_entropy, bestMinSamples_entropy = (
    unique_eps[index_entropy[0]],
    unique_samples[index_entropy[1]],
)
bestEps_silhouette, bestMinSamples_silhouette = (
    unique_eps[index_silhouette[0]],
    unique_samples[index_silhouette[1]],
)

# -------------------------------------------------------------------
# Report, one metric at a time.
# -------------------------------------------------------------------
print(f"COHESION:\nThe best score ({maxScore_cohesion_dbs}) was acquired by the parameters:\n {bestEps_cohesion} eps value and {bestMinSamples_cohesion} samples.\n")

print(f"SEPARATION:\nThe best score ({minScore_separation_dbs}) was acquired by the parameters:\n {bestEps_separation} eps value and {bestMinSamples_separation} samples.\n")

print(f"ENTROPY:\nThe best score ({minScore_entropy_dbs}) was acquired by the parameters:\n {bestEps_entropy} eps value and {bestMinSamples_entropy} samples.\n")

print(f"SILHOUETTE:\nThe best score ({maxScore_silhouette_dbs}) was acquired by the parameters:\n {bestEps_silhouette} eps value and {bestMinSamples_silhouette} samples.\n")

---
## AGNES

In [None]:
# Integer code -> linkage name. The codes (not the names) are stored in the
# grid so they can be used as numeric axis values when plotting.
linkage_dict = {
    0: "ward",
    1: "complete",
    2: "single"
}

# Hyper-parameter grid explored for agglomerative clustering.
agc_parameters = {
    'n_clusters': list(range(2, 9)),
    'linkage': [0, 1, 2]
}

cluster_size = len(agc_parameters['n_clusters'])
linkage_size = len(agc_parameters['linkage'])

# One score matrix per metric: rows follow n_clusters, columns follow linkage.
agc_score_matrix_cohesion   = np.zeros((cluster_size, linkage_size))
agc_score_matrix_separation = np.zeros((cluster_size, linkage_size))
agc_score_matrix_entropy    = np.zeros((cluster_size, linkage_size))
agc_score_matrix_silhouette = np.zeros((cluster_size, linkage_size))

for i, n_cluster in enumerate(agc_parameters['n_clusters']):
    for j, linkage in enumerate(agc_parameters['linkage']):
        agc = AgglomerativeClustering(linkage=linkage_dict[linkage], n_clusters=n_cluster).fit(X)

        agc_labels = agc.labels_

        cohesion    = cohesion_score(X, agc_labels)
        separation  = separation_score(X, agc_labels)
        entropy     = entropy_score(X, agc_labels, label)
        silhouette  = silhouette_score(X, agc_labels, metric='euclidean')

        agc_score_matrix_cohesion[i, j]     = cohesion
        agc_score_matrix_separation[i, j]   = separation
        agc_score_matrix_entropy[i, j]      = entropy
        agc_score_matrix_silhouette[i, j]   = silhouette

In [None]:
# One surface per evaluation metric over the (linkage code, n_clusters) grid.
agc_surfaces = (
    (agc_score_matrix_cohesion, "Cohesion"),
    (agc_score_matrix_separation, "Separation"),
    (agc_score_matrix_entropy, "Entropy"),
    (agc_score_matrix_silhouette, "Silhouette"),
)
for score_matrix, metric_title in agc_surfaces:
    plotScore3D(agc_parameters['linkage'], agc_parameters['n_clusters'], score_matrix, metric_title)

In [None]:
# All four agglomerative score matrices share this (n_clusters, linkage) shape.
template_shape      = agc_score_matrix_cohesion.shape

# Extreme value of each matrix, reported below as the "best" score.
maxScore_cohesion_ag   = agc_score_matrix_cohesion.max()
minScore_separation_ag = agc_score_matrix_separation.min()
minScore_entropy_ag    = agc_score_matrix_entropy.min()
maxScore_silhouette_ag = agc_score_matrix_silhouette.max()

# (row, column) position of each extreme value inside its matrix.
index_cohesion      = np.unravel_index(agc_score_matrix_cohesion.argmax(), template_shape)
index_separation    = np.unravel_index(agc_score_matrix_separation.argmin(), template_shape)
index_entropy       = np.unravel_index(agc_score_matrix_entropy.argmin(), template_shape)
index_silhouette    = np.unravel_index(agc_score_matrix_silhouette.argmax(), template_shape)

# -------------------------------------------------------------------
#           GET THE BEST PARAMETERS FOR EACH EVALUATION METHOD
# -------------------------------------------------------------------
# Row index selects n_clusters, column index selects the integer linkage code
# (translated to its name through linkage_dict when printed).
bestNAg_cohesion, bestLinkage_cohesion = (
    agc_parameters['n_clusters'][index_cohesion[0]],
    agc_parameters['linkage'][index_cohesion[1]],
)
bestNAg_separation, bestLinkage_separation = (
    agc_parameters['n_clusters'][index_separation[0]],
    agc_parameters['linkage'][index_separation[1]],
)
bestNAg_entropy, bestLinkage_entropy = (
    agc_parameters['n_clusters'][index_entropy[0]],
    agc_parameters['linkage'][index_entropy[1]],
)
bestNAg_silhouette, bestLinkage_silhouette = (
    agc_parameters['n_clusters'][index_silhouette[0]],
    agc_parameters['linkage'][index_silhouette[1]],
)

# -------------------------------------------------------------------
# Report, one metric at a time.
# -------------------------------------------------------------------
print(f"COHESION:\nThe best score ({maxScore_cohesion_ag}) was acquired by the parameters:\n {bestNAg_cohesion} clusters number and {linkage_dict[bestLinkage_cohesion]} method.\n")

print(f"SEPARATION:\nThe best score ({minScore_separation_ag}) was acquired by the parameters:\n {bestNAg_separation} clusters number and {linkage_dict[bestLinkage_separation]} method.\n")

print(f"ENTROPY:\nThe best score ({minScore_entropy_ag}) was acquired by the parameters:\n {bestNAg_entropy} clusters number and {linkage_dict[bestLinkage_entropy]} method.\n")

print(f"SILHOUETTE:\nThe best score ({maxScore_silhouette_ag}) was acquired by the parameters:\n {bestNAg_silhouette} clusters number and {linkage_dict[bestLinkage_silhouette]} method.\n")

In [None]:
# Single-linkage agglomerative clustering with two clusters, scored with the
# same four metrics used in the grids above.
agc = AgglomerativeClustering(linkage="single", n_clusters=2).fit(X)
agc_labels = agc.labels_

print(cohesion_score(X, agc_labels))
print(separation_score(X, agc_labels))
print(entropy_score(X, agc_labels, label))
print(silhouette_score(X, agc_labels))

# Marker style per cluster id; supports up to 8 clusters.
color = ["r.", "g.", "b.", "y.", "m.", "c.", "w.", "k."]

# Draw one line artist per cluster instead of one per sample.
for cluster_id in np.unique(agc_labels):
    in_cluster = agc_labels == cluster_id
    plt.plot(X[in_cluster, 0], X[in_cluster, 1], color[cluster_id])

# NOTE(review): cluster ids are arbitrary, so this accuracy depends on whether
# the ids happen to line up with the values in `label` — confirm before
# quoting the number.
plt.title(accuracy_score(label, agc_labels))
plt.show()

---

### Comparação dos melhores scores por algoritmo

In [None]:

# One row per metric; within a row the values are ordered KMeans, DBSCAN, AGNES.
cohesion_row   = [maxScore_cohesion_kmeans, maxScore_cohesion_dbs, maxScore_cohesion_ag]
separation_row = [minScore_separation_kmeans, minScore_separation_dbs, minScore_separation_ag]
entropy_row    = [minScore_entropy_kmeans, minScore_entropy_dbs, minScore_entropy_ag]
silhouette_row = [maxScore_silhouette_kmeans, maxScore_silhouette_dbs, maxScore_silhouette_ag]

storage = [cohesion_row, separation_row, entropy_row, silhouette_row]

metric_names = ["Cohesion", "Separation", "Entropy", "Silhouette"]
algorithm_names = ["KMeans", "DBScan", "Agnes"]

# Last expression of the cell: rendered as the comparison table.
pd.DataFrame(storage, index=metric_names, columns=algorithm_names)