# **Aprendendo sobre clusters com Alura**
<center>
<img src="../img/andreson.png" />
</center>

**Autor:** Andreson Almeida Azevedo

In [63]:
# dados
import pandas as pd
import numpy as np

# pre-processamento
from sklearn.preprocessing import Normalizer

# algoritmos cluster
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import DBSCAN
# k-vizinhos mais próximos
from sklearn.neighbors import NearestNeighbors

# métricas de validação
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

# Visualizações
import matplotlib.pyplot as plt
import seaborn as sns

# salvar e carregar modelo

import pickle

In [100]:
def clustering_algorithm(n_clusters, df_ref, n_init = 10, max_iter = 300, random_state = 101):
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter, random_state=random_state)
    labels = kmeans.fit_predict(df_ref)
    s = silhouette_score(df_ref, labels, metric="euclidean")
    dbs = davies_bouldin_score(df_ref, labels)
    calinsky = calinski_harabasz_score(df_ref, labels)
    return [n_clusters, s, dbs, calinsky]


def Validar_cluster(n_groups, df, n_init=10,
    max_iter=300,
    random_state=101):
    df_metricas = pd.DataFrame(columns=['k','silhoutte', 'davies_bouldin', 'calinski_harabaz'])
    i = 0
    for k in n_groups:
        print(i)
        df_metricas.loc[i] = clustering_algorithm(k, df, n_init=n_init,
    max_iter=max_iter,
    random_state=random_state)
        i = i+1  
    return df_metricas

link para a base de dados do [Titanic]([https://www.kaggle.com/competitions/titanic/data) disponivel no Kaggle

In [90]:
df = pd.read_csv('../data/titanic_train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [91]:
df_ref = df.copy()
df.drop(columns=['PassengerId','Survived','Name','Ticket','Cabin'], inplace=True)

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


In [94]:
df.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [93]:
df['Age'].fillna(df.Age.median(), inplace=True)

df = df[df['Embarked'].isna() == False]

df['Pclass'] = df['Pclass'].astype('category')

In [98]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [99]:
# normalizando os dados
df_normalize = Normalizer().fit_transform(df.values)
print(df_normalize)

[[0.94623656 0.04301075 0.         ... 0.         0.         0.04301075]
 [0.47027294 0.0123756  0.         ... 0.0123756  0.         0.        ]
 [0.95461507 0.         0.         ... 0.         0.         0.03671596]
 ...
 [0.76435929 0.02729855 0.05459709 ... 0.         0.         0.02729855]
 [0.65430808 0.         0.         ... 0.0251657  0.         0.        ]
 [0.97056074 0.         0.         ... 0.         0.03033002 0.        ]]


In [130]:
df_a = Validar_cluster([2,3,4,5,6,7,8,9,10,20,30,40,50], df_normalize, n_init=100,
    max_iter=3000,
    random_state=101)
df_a

0
1
2
3
4
5
6
7
8
9
10
11
12


Unnamed: 0,k,silhoutte,davies_bouldin,calinski_harabaz
0,2.0,0.677883,0.454656,2973.451323
1,3.0,0.582785,0.547937,3396.896142
2,4.0,0.516815,0.613983,3525.453288
3,5.0,0.477937,0.687301,3775.085489
4,6.0,0.455228,0.711618,3678.972516
5,7.0,0.434911,0.724085,3815.691054
6,8.0,0.421,0.770099,3852.713178
7,9.0,0.392655,0.808357,3836.020909
8,10.0,0.391703,0.781794,3708.936078
9,20.0,0.293508,1.117154,2912.930728


In [131]:
df_a['ref'] = 'medidas'
print(df_a.groupby('ref').agg({'silhoutte': 'max', 'davies_bouldin': 'min', 'calinski_harabaz': 'max'}))
del df_a['ref']

         silhoutte  davies_bouldin  calinski_harabaz
ref                                                 
medidas   0.677883        0.454656       3852.713178


In [108]:
df_a['ref'] = 'medidas'
print(df_a.groupby('ref').agg({'silhoutte': 'max', 'davies_bouldin': 'min', 'calinski_harabaz': 'max'}))
del df_a['ref']

         silhoutte  davies_bouldin  calinski_harabaz
ref                                                 
medidas   0.582785        0.547937       3852.713178


Pelas métricas de silhouette e davies bouldin, k = 3 é melhor número de grupos

In [132]:
# setando o modelo
model = KMeans(n_clusters = 2, n_init=100,
    max_iter=3000,
    random_state=101)

# aplicando o predict

y_pred = model.fit_predict(df_normalize)

In [133]:
df_normalize.shape

(889, 12)

## Validar a estabilidade dos clusters

Separamos a base em parte diferente, para avaliar se as métricas irão ter mudanças
se as metricas não tiverem grandes mudanças temos evidências que os cluster estão estáveis

In [136]:

random_data = np.random.rand(889, 12)

df_metricas = pd.DataFrame()
for k in [2]:
    df_metricas= df_metricas.append(pd.DataFrame(clustering_algorithm(k, random_data)).T)
df_metricas = df_metricas.rename(columns={0:'k', 1:'silhoutte', 2: 'davies_bouldin', 3:'calinski_harabaz'})
df_metricas.head()

  df_metricas= df_metricas.append(pd.DataFrame(clustering_algorithm(k, random_data)).T)


Unnamed: 0,k,silhoutte,davies_bouldin,calinski_harabaz
0,2.0,0.070508,3.543279,69.504734


## Validar a estabilidade dos clusters

Separamos a base em parte diferente, para avaliar se as métricas irão ter mudanças
se as metricas não tiverem grandes mudanças temos evidências que os cluster estão estáveis

In [137]:
set1, set2, set3 = np.array_split(df_normalize, 3)
print(clustering_algorithm(2, set1,n_init=100,
    max_iter=3000,
    random_state=101))
print(clustering_algorithm(2, set2,n_init=100,
    max_iter=3000,
    random_state=101))
print(clustering_algorithm(2, set3,n_init=100,
    max_iter=3000,
    random_state=101))

[2, 0.697455883012028, 0.4248914876841068, 1047.4366828246302]
[2, 0.6796180592570722, 0.44757328664488344, 1011.9039550542167]
[2, 0.666870477881215, 0.47751636105152284, 961.4550302315964]


Os dados não apresentam uma grande variação nas métricas, indicios de que os grupos estão bem definidos

In [138]:
df_centroids = pd.DataFrame(columns=['VARIAVEL', 'VARIANCIA'])
max = len(model.cluster_centers_[0])
for i in range(max):
    df_centroids.loc[i] = [df.columns.values[i], "{:4f}".format(model.cluster_centers_[:, i].var())]
    #print(df.columns.values[i], "\n{:4f}".format(kmeans.cluster_centers_[:, i].var()))

In [139]:
df['cluster'] = model.labels_
df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,cluster,Survived
0,22.0,1,0,7.25,0,0,1,0,1,0,0,1,0,0
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0,1,1
2,26.0,0,0,7.925,0,0,1,1,0,0,0,1,0,1
3,35.0,1,0,53.1,1,0,0,1,0,0,0,1,1,1
4,35.0,0,0,8.05,0,0,1,0,1,0,0,1,0,0


In [140]:
df_centroids.sort_values('VARIANCIA', ascending=False)

Unnamed: 0,VARIAVEL,VARIANCIA
0,Age,0.077057
3,Fare,0.070805
1,SibSp,8.6e-05
2,Parch,5.1e-05
6,Pclass_3,4.8e-05
8,Sex_male,3.7e-05
11,Embarked_S,1.6e-05
4,Pclass_1,6e-06
10,Embarked_Q,3e-06
5,Pclass_2,1e-06


In [141]:
description = df[['Age','Fare','SibSp','Parch','Pclass_3','cluster','Sex_male']].groupby('cluster')
n_clientes = description.size()
description = description.mean()
description['n_cliente'] = n_clientes
print(description.reset_index())

   cluster        Age       Fare     SibSp     Parch  Pclass_3  Sex_male  \
0        0  31.378422  13.307360  0.246377  0.204509  0.676329  0.716586   
1        1  24.534216  75.634623  1.167910  0.794776  0.264925  0.492537   

   n_cliente  
0        621  
1        268  


CLUSTER 0: Maior grupo de passageiros, oriundos da 3° Classe, pagaram menos na passagem, prodominantemente do sexo masculino

CLUSTER 1: Passageiros mais jovens, Pagaram maiores valores em passagem, e viajando com irmãos ou conjugue e/ou filhos, com menos pessoas da 3° classe

In [126]:
df

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,cluster
0,22.0,1,0,7.2500,0,0,1,0,1,0,0,1,0
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0,1
2,26.0,0,0,7.9250,0,0,1,1,0,0,0,1,0
3,35.0,1,0,53.1000,1,0,0,1,0,0,0,1,2
4,35.0,0,0,8.0500,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,0,1,0,0,1,0,0,1,0
887,19.0,0,0,30.0000,1,0,0,1,0,0,0,1,2
888,28.0,1,2,23.4500,0,0,1,1,0,0,0,1,2
889,26.0,0,0,30.0000,1,0,0,0,1,1,0,0,2


In [142]:
df['Survived'] = df_ref[df_ref['Embarked'].isna() == False]['Survived']
df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,cluster,Survived
0,22.0,1,0,7.25,0,0,1,0,1,0,0,1,0,0
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0,1,1
2,26.0,0,0,7.925,0,0,1,1,0,0,0,1,0,1
3,35.0,1,0,53.1,1,0,0,1,0,0,0,1,1,1
4,35.0,0,0,8.05,0,0,1,0,1,0,0,1,0,0


In [143]:
df[['Survived','cluster']].groupby('cluster').mean()

Unnamed: 0_level_0,Survived
cluster,Unnamed: 1_level_1
0,0.278583
1,0.623134


# 62% das pessoas que sobreviveram estavam no cluster 1: De pessoas mais jovens, oriunda das classes 1 e 2

In [166]:
df_t = pd.read_csv('../data/titanic_test.csv')
df_t.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [163]:
df_t.isna().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked    418
dtype: int64

In [164]:
def preprocessing_df(df):
    
    df.drop(columns=['PassengerId','Name','Ticket','Cabin'], inplace=True)

    # tratando dado faltantes
    df['Age'].fillna(df.Age.median(), inplace=True)
    df['Fare'].fillna(df.Fare.median(), inplace=True)
    df['Embarked'] = df['Embarked'].fillna(df_ref['Embarked'].mode(), inplace=True)
    df['Pclass'] = df['Pclass'].astype('category')

    # criando variáveis dummy
    df = pd.get_dummies(df)

    # normalizando os dados
    df_normalize = Normalizer().fit_transform(df.values)
    
    return df_normalize

In [167]:
df_normalize = preprocessing_df(df_t)

In [168]:
df_normalize

array([[0.97442607, 0.        , 0.        , ..., 0.02824423, 0.        ,
        0.02824423],
       [0.98843377, 0.02103051, 0.        , ..., 0.02103051, 0.02103051,
        0.        ],
       [0.98776123, 0.        , 0.        , ..., 0.        , 0.        ,
        0.01593163],
       ...,
       [0.98208774, 0.        , 0.        , ..., 0.02550877, 0.        ,
        0.02550877],
       [0.95710837, 0.        , 0.        , ..., 0.03544846, 0.        ,
        0.03544846],
       [0.76895508, 0.02847982, 0.02847982, ..., 0.02847982, 0.        ,
        0.02847982]])

In [170]:
df_t['cluster'] = model.fit_predict(df_normalize)
df_t.head() 

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cluster
0,3,male,34.5,0,0,7.8292,,0
1,3,female,47.0,1,0,7.0,,0
2,2,male,62.0,0,0,9.6875,,0
3,3,male,27.0,0,0,8.6625,,0
4,3,female,22.0,1,1,12.2875,,0
