In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn_extra.cluster import KMedoids
import json
import seaborn as sns
import pandas as pd
import numpy as np
from preparo import pca_scaled,df,scaled_data,LISTA, n_comp
# Number of clusters requested from the clustering models in this notebook.
N_CLUSTER = 3

## Funções auxiliares

In [None]:
def faz_dic_cluster(tipo,df=df):
    """Build a mapping from cluster label to the countries assigned to it.

    Args:
        tipo: Name of the column in ``df`` that holds the cluster labels.
        df: DataFrame containing a ``"country"`` column and the ``tipo``
            column. Defaults to the module-level ``df`` that is in scope
            when this function is defined.

    Returns:
        dict: ``{label: [country, ...]}``. Keys are produced in the order
        of ``df[tipo].value_counts().index``.
    """
    rotulos = df[tipo].value_counts().index
    return {
        rotulo: list(df.loc[df[tipo] == rotulo, "country"].values)
        for rotulo in rotulos
    }

In [None]:
def faz_boxplots(df,cluster,modelo,LISTA=LISTA):
    """Draw one boxplot figure per column in ``LISTA`` for a single cluster.

    Args:
        df: DataFrame holding the columns named in ``LISTA`` and the label
            column ``modelo``.
        cluster: Cluster label whose rows are plotted.
        modelo: Name of the column in ``df`` containing the cluster labels.
        LISTA: Iterable of column names to plot. Defaults to the
            module-level ``LISTA`` in scope when this function is defined.

    Returns:
        None. A new 10x6 matplotlib figure is created for every column.
    """
    for coluna in LISTA:
        # A fresh figure per column so the plots do not overwrite each other.
        _, eixo = plt.subplots(1, 1, figsize=(10, 6))
        valores = df.loc[df[modelo] == cluster, coluna]
        eixo.boxplot(valores)
        eixo.set_title(f"cluster {cluster}, {coluna}")
        eixo.set_ylabel(f"{coluna}")

## Clusterização K-means

In [None]:
# Fit K-means on the PCA-transformed data; `fit` returns the estimator itself.
model_means = KMeans(
    n_clusters=N_CLUSTER,
    n_init=100,
    max_iter=1_000,
    random_state=42,
).fit(pca_scaled)
# Store the predicted label of every row next to the original data.
df["cluster_meanpca"] = model_means.predict(pca_scaled)
# Cluster sizes (displayed as the cell output).
df["cluster_meanpca"].value_counts()

### Dimensões

In [None]:
# Boxplots of each column in LISTA for K-means cluster 0
faz_boxplots(df, cluster=0, modelo="cluster_meanpca")

In [None]:
# Boxplots of each column in LISTA for K-means cluster 1
faz_boxplots(df, cluster=1, modelo="cluster_meanpca")

In [None]:
# Boxplots of each column in LISTA for K-means cluster 2
faz_boxplots(df, cluster=2, modelo="cluster_meanpca")

### Centro em país

In [None]:
# Centroids of the fitted K-means model; indexed below as center[label, component].
center = model_means.cluster_centers_

In [None]:
# Principal-component scores, one column per retained component (PC1..PCn).
df_pca = pd.DataFrame(pca_scaled, columns=[f'PC{i+1}' for i in range(n_comp)])
# Attach country and K-means label BY POSITION. Assigning the Series directly
# aligns on index labels, so if `df` does not carry the same index as `df_pca`
# (presumably a fresh 0..n-1 index, assuming `pca_scaled` is a plain array —
# TODO confirm) the result would silently contain NaN or mismatched rows.
# Positional assignment matches how the labels were stored in `df` (an array
# returned by `predict`), and raises on a length mismatch instead of hiding it.
df_pca['country'] = df['country'].to_numpy()
df_pca['KM3'] = df['cluster_meanpca'].to_numpy()

In [None]:
# Squared Euclidean distance (no square root is taken) between every row and
# the centroid of the K-means cluster it was assigned to, accumulated one
# principal component at a time.
km_labels = df_pca['KM3'].to_numpy()
summing = np.zeros(len(df_pca))
for col in range(n_comp):
    offset = df_pca.iloc[:, col].to_numpy() - center[km_labels, col]
    summing += offset ** 2
df_pca['dist'] = summing

In [None]:
# For each K-means cluster, record the country whose 'dist' value is the
# smallest, i.e. the observation closest to that cluster's centroid.
countries = {campo: [] for campo in ('country', 'cluster', 'dist')}
for cluster in range(N_CLUSTER):
    members = df_pca.loc[df_pca['KM3'] == cluster]
    smallest = members['dist'].min()
    # If several rows tie for the minimum, the first one is kept.
    nearest = members.loc[members['dist'] == smallest, 'country'].values[0]
    for campo, valor in zip(countries, (nearest, cluster, smallest)):
        countries[campo].append(valor)
df_countries = pd.DataFrame(countries, columns=list(countries))

In [None]:
# Display the country closest to each K-means centroid.
df_countries

## Clusterização Hierárquica

In [None]:
# Agglomerative (Ward) clustering on the PCA-transformed data.
# The number of clusters comes from N_CLUSTER, like the K-means and K-medoids
# cells, so all models stay comparable when the constant is changed.
model_hi = AgglomerativeClustering(
    distance_threshold=None,  # cut by cluster count, not by merge distance
    n_clusters=N_CLUSTER,
    linkage="ward",
)
# Store the label of every row next to the original data.
df["cluster_hierarquicopca"] = model_hi.fit_predict(pca_scaled)
# Cluster sizes (displayed as the cell output).
df["cluster_hierarquicopca"].value_counts()

### Dendrograma

In [None]:
# Candidate linkage methods: "ward", "average", "centroid", "complete", "weighted", "median", "single"
# NOTE(review): the tree is built from `scaled_data`, while the
# AgglomerativeClustering model is fit on `pca_scaled` — confirm this is intended.
distance_threshold = 17  # height at which the dotted horizontal line is drawn
paises = df["country"].tolist()
_, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 8))
plt.style.use('tableau-colorblind10')
linkage = sch.linkage(scaled_data, method="ward")
# Draw on the axes created above, with country names as leaf labels.
dendrogram = sch.dendrogram(linkage, labels=paises, leaf_rotation=90, ax=ax)
ax.set(xlabel='Paises', ylabel='Distances', title="ward")
ax.axhline(distance_threshold, color='black', ls=":")

### Dimensões

In [None]:
# Boxplots of each column in LISTA for hierarchical cluster 0
faz_boxplots(df, cluster=0, modelo="cluster_hierarquicopca")

In [None]:
# Boxplots of each column in LISTA for hierarchical cluster 1
faz_boxplots(df, cluster=1, modelo="cluster_hierarquicopca")

In [None]:
# Boxplots of each column in LISTA for hierarchical cluster 2
faz_boxplots(df, cluster=2, modelo="cluster_hierarquicopca")

## Clusterização K-medoids

In [None]:
# Fit K-medoids on the PCA-transformed data; `fit` returns the estimator itself.
model_medoid = KMedoids(
    n_clusters=N_CLUSTER,
    random_state=42,
).fit(pca_scaled)
# Store the predicted label of every row next to the original data.
df["cluster_medoidpca"] = model_medoid.predict(pca_scaled)
# Cluster sizes (displayed as the cell output).
df["cluster_medoidpca"].value_counts()

## Comparação de resultados de clusterização

In [None]:
# Cluster -> countries mapping per model, exported to JSON in the next cell.
# NOTE(review): the K-medoids labels ("cluster_medoidpca") are not included
# here — confirm whether that omission is intended.
dic = {
    "K-means": faz_dic_cluster("cluster_meanpca"),
    "dic_hierearquico": faz_dic_cluster("cluster_hierarquicopca"),
}

In [None]:
# Persist the cluster -> countries mapping as indented JSON.
with open("dic_clusterizacao.json", mode="w") as arquivo:
    arquivo.write(json.dumps(dic, indent=4))

In [None]:
# Pairwise scatter plots of the LISTA columns, coloured by K-means cluster.
lista = [*LISTA, "cluster_meanpca"]
titulo = sns.pairplot(df[lista], hue="cluster_meanpca")
titulo.fig.suptitle("Clusters K-means")

In [None]:
# Pairwise scatter plots of the LISTA columns, coloured by hierarchical cluster.
lista = [*LISTA, "cluster_hierarquicopca"]
titulo = sns.pairplot(df[lista], hue="cluster_hierarquicopca")
titulo.fig.suptitle("Clusters Hierarquico")

In [None]:
# Pairwise scatter plots of the LISTA columns, coloured by K-medoids cluster.
lista = [*LISTA, "cluster_medoidpca"]
titulo = sns.pairplot(df[lista], hue="cluster_medoidpca")
titulo.fig.suptitle("Clusters K-medoids")