In [54]:
import pandas as pd
import numpy as np
import itertools
from kmodes.kmodes import KModes
import plotly.offline as py
import plotly.express as px

In [None]:
# Load the PHQ item scores (semicolon-separated CSV)
phq = pd.read_csv(
    'DATOS/phq_num.csv',
    sep=';',
)

In [None]:

# Inspect the raw table before cleaning. Only the last expression of a cell is
# rendered, and this cell ends with an assignment, so the checks are printed
# explicitly instead of being silently discarded.
print(phq.shape)
print(phq.isnull().sum())

# Keep complete cases only
phq = phq.dropna()

In [None]:
### Select the variables: drop the global score, item s10 and the category column
phq_ft = phq.drop(columns=['phq_global', 'phq_s10', 'phq_cat'])
### Recode numeric scores as categorical labels
def reco_s00(val):
    """Translate a numeric item score into its frequency label.

    Returns 'Nunca' for 0, 'Algunos' for 1, 'Varios' for 2 and 'Todos' for 3.
    Any other value falls through and yields None.
    """
    equivalencias = (
        (0, 'Nunca'),
        (1, 'Algunos'),
        (2, 'Varios'),
        (3, 'Todos'),
    )
    # Codes are tested in ascending order with ==, so 2.0 matches 2.
    for codigo, etiqueta in equivalencias:
        if val == codigo:
            return etiqueta
    return None
# Recode every selected column, one Series per symptom
lista_rec = [phq_ft[columna].apply(reco_s00) for columna in phq_ft.columns]

# Each Series becomes a row, so transpose to get observations back as rows
phq_rec = pd.DataFrame(lista_rec).T

# Visualización de los datos

In [None]:
# Percentage of observations in each answer category, one Series per column
lista = [
    (100 * phq_rec.value_counts(columna) / len(phq_rec)).round(2)
    for columna in phq_rec.columns
]

In [None]:
# One row per symptom, then wide -> long so each (symptom, answer) pair is a row
phq_gf = (
    pd.DataFrame(lista, index=phq_rec.columns)
    .reset_index()
    .rename(columns={'index': 'sintoma'})
    .melt(
        id_vars='sintoma',
        value_vars=['Nunca', 'Algunos', 'Todos', 'Varios'],
    )
)


In [None]:
# Preview the first rows of the long-format frequency table
phq_gf.head()

In [None]:
# Grouped bar chart: percentage of each answer category per symptom.
# category_orders pins the ordinal order Nunca < Algunos < Varios < Todos.
# Without it the colours follow the melt order (Nunca, Algunos, Todos, Varios),
# so 'Varios' and 'Todos' would swap colours relative to the per-cluster
# symptom chart (fg_sint_cl), which already fixes this order.
fig = px.bar(
    phq_gf,
    x='sintoma',
    y='value',
    color='variable',
    barmode='group',
    labels=dict(sintoma='Síntoma', value='Proporción', variable='Frecuencia'),
    color_discrete_sequence=['#440154', '#3b528b', '#21918c', '#fde725'],
    category_orders={'variable': ['Nunca', 'Algunos', 'Varios', 'Todos']},
)
# Horizontal legend placed above the plot area, right-aligned.
# update_layout returns the figure, so the cell still renders it.
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
    )
)



In [None]:
# Save the chart as an HTML file; auto_open=False keeps it from being opened after writing
py.plot(fig,filename='GRAFICOS/freq-sinto.html',auto_open=False)

### Algoritmo K-modes

In [None]:
# Seed passed as random_state to every KModes model below
num_random_state=1992

In [None]:

# K-modes with Cao initialisation and 4 clusters
km_cao = KModes(
    n_clusters=4,
    init="Cao",
    n_init=1,
    verbose=1,
    random_state=num_random_state,
)
# Fit the model and keep the cluster label of every observation
fitClusters_cao = km_cao.fit_predict(phq_rec)

In [None]:
# Predicted cluster labels (Cao initialisation)
fitClusters_cao

In [None]:
# Cluster modes: one row per cluster, one column per symptom
cluster_cao = pd.DataFrame(
    km_cao.cluster_centroids_,
    columns=phq_ft.columns,
)
cluster_cao

In [None]:
# K-modes with Huang initialisation and 4 clusters
km_huang = KModes(
    n_clusters=4,
    init="Huang",
    n_init=1,
    verbose=1,
    random_state=num_random_state,
)
# Fit the model and keep the cluster label of every observation
fitClusters_huang = km_huang.fit_predict(phq_rec)

In [None]:
# Predicted cluster labels (Huang initialisation)
fitClusters_huang

In [None]:
# Cluster modes: one row per cluster, one column per symptom
cluster_huang = pd.DataFrame(
    km_huang.cluster_centroids_,
    columns=phq_ft.columns,
)
cluster_huang

## Número óptimo de clusters

In [None]:
# Cost of the fitted model for 1 to 9 clusters
cost = []
for num_clusters in range(1, 10):
    kmode = KModes(
        n_clusters=num_clusters,
        init="Cao",
        n_init=1,
        verbose=0,
        random_state=num_random_state,
    )
    kmode.fit_predict(phq_rec)
    cost += [kmode.cost_]

In [None]:
# Numbers of clusters tried (1..9); used as the x axis of the cost curve
y = np.arange(1, 10)

In [None]:
# Displays the same 1..9 sequence that was assigned to `y`; the result is not stored
np.array([i for i in range(1,10,1)])

In [None]:
# Cost curve: model cost against number of clusters
fig_cost = px.line(
    x=y,
    y=cost,
    markers=True,
    labels=dict(y='Pérdida', x='Número de Clusters'),
)
# update_traces returns the same figure; re-assigning keeps the cell silent
fig_cost = fig_cost.update_traces(line_color='#3b528b')
      


In [None]:
# Save the chart as an HTML file; auto_open=False keeps it from being opened after writing
py.plot(fig_cost,filename='GRAFICOS/costo_cluster.html',auto_open=False)

### Número de clusters elegido

In [None]:
### Three clusters are chosen
km_cao = KModes(
    n_clusters=3,
    init='Cao',
    n_init=1,
    verbose=0,
    random_state=num_random_state,
)
# A later cell stores the labels in phq_rec['cluster'] in place. Dropping that
# column here (ignored when absent) keeps this cell idempotent: re-running it
# never feeds the previous labels back in as a feature.
fit_kmcao = km_cao.fit_predict(phq_rec.drop(columns='cluster', errors='ignore'))

In [None]:
# Add a column with the cluster assigned to each observation.
# NOTE: this mutates phq_rec in place; any earlier cell that reads every column of
# phq_rec will also see 'cluster' if it is re-run after this one.
phq_rec['cluster']= fit_kmcao

In [None]:
# Preview the recoded data together with the new 'cluster' column
phq_rec.head()

In [None]:
# Cluster modes of the 3-cluster model: one row per cluster, one column per symptom
cluster_cao3 = pd.DataFrame(
    km_cao.cluster_centroids_,
    columns=phq_rec.columns.drop('cluster'),
)
cluster_cao3

In [None]:
# Mode of every symptom for the first cluster (row 0 of the centroid table)
cluster_cao3.iloc[0]

In [41]:
# Distance of every observation to the mode of its own cluster:
# listas_dis[j][i] is the number of symptoms in which the i-th member of
# cluster j differs from row j of cluster_cao3.
# The members of each cluster are selected once (not once per row), and the
# number of clusters comes from the centroid table instead of a literal 3.
listas_dis = [
    phq_rec.loc[phq_rec['cluster'] == j]
    .drop('cluster', axis=1)
    .ne(cluster_cao3.iloc[j], axis='columns')  # aligns the centroid with the symptom columns
    .sum(axis=1)
    .tolist()
    for j in range(cluster_cao3.shape[0])
]

### Distancias promedio al interior del cluster

In [75]:
# Mean distance to the cluster mode within cluster 0
round(np.mean(listas_dis[0]), 2)

1.91

In [76]:
# Mean distance to the cluster mode within cluster 1
round(np.mean(listas_dis[1]), 2)

2.78

In [77]:
# Mean distance to the cluster mode within cluster 2
round(np.mean(listas_dis[2]), 2)

4.09

### Distancias entre clusters

In [71]:
# Every unordered pair of cluster indices
combs = list(itertools.combinations(range(len(cluster_cao3)), 2))
# For each pair, count the symptoms whose modal category differs between the two centroids
distancias_clusters = [
    sum(cluster_cao3.iloc[primero] != cluster_cao3.iloc[segundo])
    for primero, segundo in combs
]

In [74]:

# Table of between-cluster distances, one row per pair of clusters.
# The original iterated over range(len(comb)), but `comb` is never defined in
# the notebook (the list is called `combs`), so the cell raised NameError on a
# fresh kernel.
pd.DataFrame({'Distancias': distancias_clusters,
              'Comparación': [str(par) for par in combs]})

Unnamed: 0,Distancias,Comparación
0,6,"(0, 1)"
1,8,"(0, 2)"
2,6,"(1, 2)"


# Proporción de Clusters

In [None]:
# Percentage of observations per cluster.
# value_counts sorts by frequency, so its index (the real cluster label) must be
# kept. The original wrapped the result in list(), which discarded the labels
# and left the 'cluster' column holding the frequency rank (0, 1, 2, ...).
datos_cl = (
    (100 * phq_rec.value_counts('cluster') / phq_rec.shape[0])
    .round(2)
    .rename_axis('cluster')
    .reset_index(name='prop')
)
# Convert the label from int to string so it is plotted as a category
datos_cl['cluster'] = datos_cl['cluster'].apply(str)

In [None]:
# Bar chart with the percentage of observations in each cluster
fg_prop_cl = px.bar(
    datos_cl,
    x='cluster',
    y='prop',
    labels=dict(prop='Proporción', cluster='Cluster'),
    color_discrete_sequence=['#440154'],
)
fg_prop_cl

In [None]:
# Save the chart as an HTML file; auto_open=False keeps it from being opened after writing
py.plot(fg_prop_cl,filename='GRAFICOS/props_cluster.html',auto_open=False)

In [None]:
# For each symptom, compute the relative frequency (in %) of every answer within each cluster:
# per-cluster answer counts divided by the cluster size, rounded to 2 decimals.
lista_cl_sin=[round(100*phq_rec.groupby('cluster')[i].value_counts()/phq_rec.value_counts('cluster'),2) for i in phq_rec.drop('cluster',axis=1).columns]
# Turn the list into a DataFrame and transpose it (one column per symptom);
# reset_index moves the index levels into regular columns.
df_cl_sin=pd.DataFrame(lista_cl_sin).transpose().reset_index()
# Rename the columns positionally.
# NOTE(review): assumes exactly two index columns followed by nine symptom columns
# in s01..s09 order — confirm against phq_rec's column order.
df_cl_sin.columns=['cluster','frecuencia','s01','s02','s03','s04','s05','s06','s07','s08','s09']
# Wide -> long: a single 'value' column with the percentages and a 'variable' column with the symptom
gf_cl_sin=df_cl_sin.melt(id_vars=['cluster','frecuencia'])
gf_cl_sin.head()

In [None]:
# Per-symptom answer percentages, one facet per cluster, coloured by answer category
fg_sint_cl = px.bar(
    gf_cl_sin,
    x='variable',
    y='value',
    color='frecuencia',
    facet_col='cluster',
    color_discrete_sequence=['#440154', '#3b528b', '#21918c', '#fde725'],
    labels=dict(
        value='Proporción',
        variable='Síntoma',
        frecuencia='Frecuencia',
        cluster='Cluster',
    ),
    # Fixed order of the answer categories in the legend and colour assignment
    category_orders={'frecuencia': ['Nunca', 'Algunos', 'Varios', 'Todos']},
)
fg_sint_cl

In [None]:
# Save the chart as an HTML file; auto_open=False keeps it from being opened after writing
py.plot(fg_sint_cl,filename='GRAFICOS/sintomas_por_cluster.html',auto_open=False)