# Additional analysis for project

---

## Imports

#### Python libraries

In [1]:
import pandas as pd

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

import sys

#### Ancillary modules

In [2]:
sys.path.append("../")

In [3]:
%load_ext autoreload
%autoreload 2

from utils.functions import (
    json_dump_dict
)

from utils.parameters import (
    gm_rename,
    yr,
    original_features
)

#### Data

In [4]:
df_ime_cmp = pd.read_csv("../base.csv")
df_ime_cmp["GM"] = df_ime_cmp["GM"].map(gm_rename)
df_ime = df_ime_cmp.loc[df_ime_cmp["A.O"] == yr, :].copy()
df_ime

Unnamed: 0.1,Unnamed: 0,CVE_ENT,NOM_ENT,POB_TOT,ANALF,SPRIM,OVSDE,OVSEE,OVSAE,VHAC,OVPT,PL.5000,PO2SM,IM,GM,LUGAR,A.O
128,129,1,Aguascalientes,1184996,3.27,14.75,1.06,0.62,0.99,30.33,1.76,25.16,33.65,-0.91,1_bajo,28,2010
129,130,2,Baja California,3155070,2.6,12.99,0.43,0.95,3.56,29.06,3.4,10.35,21.87,-1.14,0_muy_bajo,30,2010
130,131,3,Baja California Sur,637026,3.23,14.27,0.94,2.84,7.09,31.74,5.81,15.62,23.3,-0.68,1_bajo,23,2010
131,132,4,Campeche,822441,8.37,22.54,6.42,2.59,9.74,45.97,4.5,30.88,45.51,0.43,3_alto,10,2010
132,133,5,Coahuila de Zaragoza,2748391,2.65,12.17,1.09,0.54,1.39,30.27,1.42,12.15,30.04,-1.14,0_muy_bajo,29,2010
133,134,6,Colima,650555,5.16,18.48,0.69,0.59,1.17,31.32,4.69,14.48,32.04,-0.78,1_bajo,26,2010
134,135,7,Chiapas,4796580,17.91,37.13,5.06,3.82,22.37,53.9,15.66,57.86,69.85,2.32,4_muy_alto,2,2010
135,136,8,Chihuahua,3406465,3.7,16.07,2.64,3.78,4.95,28.39,3.55,17.05,35.93,-0.52,1_bajo,21,2010
136,137,9,Distrito Federal,8851080,2.11,8.72,0.08,0.08,1.79,26.08,1.08,0.67,28.51,-1.48,0_muy_bajo,32,2010
137,138,10,Durango,1632934,3.84,18.76,5.85,4.19,5.73,32.6,7.01,36.19,40.61,0.05,2_medio,15,2010


---

## Clustering exercise

In [5]:
feats = [feat for feat in original_features if
        (original_features[feat]["feature"] == True) &
        (original_features[feat]["selected"] == True)
        ]
feats

['ANALF',
 'SPRIM',
 'OVSDE',
 'OVSEE',
 'OVSAE',
 'VHAC',
 'OVPT',
 'PL.5000',
 'PO2SM']

In [6]:
df_ime_feats = df_ime.loc[:, feats]

In [7]:
df_ime_feats

Unnamed: 0,ANALF,SPRIM,OVSDE,OVSEE,OVSAE,VHAC,OVPT,PL.5000,PO2SM
128,3.27,14.75,1.06,0.62,0.99,30.33,1.76,25.16,33.65
129,2.6,12.99,0.43,0.95,3.56,29.06,3.4,10.35,21.87
130,3.23,14.27,0.94,2.84,7.09,31.74,5.81,15.62,23.3
131,8.37,22.54,6.42,2.59,9.74,45.97,4.5,30.88,45.51
132,2.65,12.17,1.09,0.54,1.39,30.27,1.42,12.15,30.04
133,5.16,18.48,0.69,0.59,1.17,31.32,4.69,14.48,32.04
134,17.91,37.13,5.06,3.82,22.37,53.9,15.66,57.86,69.85
135,3.7,16.07,2.64,3.78,4.95,28.39,3.55,17.05,35.93
136,2.11,8.72,0.08,0.08,1.79,26.08,1.08,0.67,28.51
137,3.84,18.76,5.85,4.19,5.73,32.6,7.01,36.19,40.61


In [8]:
kmeans = KMeans(n_clusters=5)

kmeans.fit(df_ime_feats)

y_km = kmeans.fit_predict(df_ime_feats)

In [9]:
df_ime["cluster"] = y_km

In [10]:
df_ime

Unnamed: 0.1,Unnamed: 0,CVE_ENT,NOM_ENT,POB_TOT,ANALF,SPRIM,OVSDE,OVSEE,OVSAE,VHAC,OVPT,PL.5000,PO2SM,IM,GM,LUGAR,A.O,cluster
128,129,1,Aguascalientes,1184996,3.27,14.75,1.06,0.62,0.99,30.33,1.76,25.16,33.65,-0.91,1_bajo,28,2010,1
129,130,2,Baja California,3155070,2.6,12.99,0.43,0.95,3.56,29.06,3.4,10.35,21.87,-1.14,0_muy_bajo,30,2010,1
130,131,3,Baja California Sur,637026,3.23,14.27,0.94,2.84,7.09,31.74,5.81,15.62,23.3,-0.68,1_bajo,23,2010,1
131,132,4,Campeche,822441,8.37,22.54,6.42,2.59,9.74,45.97,4.5,30.88,45.51,0.43,3_alto,10,2010,3
132,133,5,Coahuila de Zaragoza,2748391,2.65,12.17,1.09,0.54,1.39,30.27,1.42,12.15,30.04,-1.14,0_muy_bajo,29,2010,1
133,134,6,Colima,650555,5.16,18.48,0.69,0.59,1.17,31.32,4.69,14.48,32.04,-0.78,1_bajo,26,2010,1
134,135,7,Chiapas,4796580,17.91,37.13,5.06,3.82,22.37,53.9,15.66,57.86,69.85,2.32,4_muy_alto,2,2010,0
135,136,8,Chihuahua,3406465,3.7,16.07,2.64,3.78,4.95,28.39,3.55,17.05,35.93,-0.52,1_bajo,21,2010,1
136,137,9,Distrito Federal,8851080,2.11,8.72,0.08,0.08,1.79,26.08,1.08,0.67,28.51,-1.48,0_muy_bajo,32,2010,1
137,138,10,Durango,1632934,3.84,18.76,5.85,4.19,5.73,32.6,7.01,36.19,40.61,0.05,2_medio,15,2010,4


In [11]:
gm_vals = df_ime["GM"].unique()
gm_vals

array(['1_bajo', '0_muy_bajo', '3_alto', '4_muy_alto', '2_medio'],
      dtype=object)

In [12]:
c_vals = df_ime["cluster"].unique()
c_vals

array([1, 3, 0, 4, 2], dtype=int32)

In [13]:
gm_dict = {gm_v: list(df_ime.loc[df_ime["GM"] == gm_v, "NOM_ENT"]) for gm_v in gm_vals}
c_dict = {c_v: list(df_ime.loc[df_ime["cluster"] == c_v, "NOM_ENT"]) for c_v in c_vals}

In [14]:
def finding_clusters_match(c_dict, gm_dict):
    
    orig_c_key = [c_key for c_key in c_dict]
    for c_key in orig_c_key:

        match_num = 1
        best_match = "not_found"
        for gm_key in gm_dict:
            delta_check = len(set(gm_dict[gm_key]) - set(c_dict[c_key]))/len(set(gm_dict[gm_key]))

            if delta_check < match_num:
                match_num = delta_check
                best_match = gm_key

        c_dict[best_match] = c_dict.pop(c_key)
        
    return c_dict

In [15]:
c_dict = finding_clusters_match(c_dict, gm_dict)

In [18]:
gm_df = pd.DataFrame.from_dict(gm_dict, orient="index").transpose().fillna("-")
gm_df.sort_index(axis=1, inplace=True)
gm_df.index = pd.MultiIndex.from_tuples([(yr, i) for i in gm_df.index])
gm_df

Unnamed: 0,Unnamed: 1,0_muy_bajo,1_bajo,2_medio,3_alto,4_muy_alto
2010,0,Baja California,Aguascalientes,Durango,Campeche,Chiapas
2010,1,Coahuila de Zaragoza,Baja California Sur,Guanajuato,Hidalgo,Guerrero
2010,2,Distrito Federal,Colima,Morelos,Michoacán de Ocampo,Oaxaca
2010,3,Nuevo León,Chihuahua,Nayarit,Puebla,-
2010,4,-,Jalisco,Querétaro,San Luis Potosí,-
2010,5,-,México,Quintana Roo,Tabasco,-
2010,6,-,Sonora,Sinaloa,Veracruz de Ignacio de la Llave,-
2010,7,-,Tamaulipas,Tlaxcala,Yucatán,-
2010,8,-,-,Zacatecas,-,-


In [20]:
def cluster_metrics(c_dict, gm_dict):
    """
    """
    
    c_dict_res = {}
    for key in c_dict:
        c_dict_res[key] = {
            "Correct": len(set(gm_dict[key])) - len((set(gm_dict[key]) - set(c_dict[key]))),
            "Incorrect": len(set(c_dict[key]) - set(gm_dict[key]))
        }
        
    return c_dict_res

In [21]:
c_df = pd.DataFrame.from_dict(c_dict, orient="index").transpose().fillna("-")
c_df.sort_index(axis=1, inplace=True)
c_df = c_df.append(pd.DataFrame.from_dict(cluster_metrics(c_dict, gm_dict)))
c_df.index = pd.MultiIndex.from_tuples([(yr, i) for i in c_df.index])
c_df

Unnamed: 0,Unnamed: 1,1_bajo,2_medio,3_alto,4_muy_alto
2010,0,Aguascalientes,Durango,Hidalgo,Chiapas
2010,1,Baja California,Guanajuato,Michoacán de Ocampo,Guerrero
2010,2,Baja California Sur,Morelos,Puebla,Oaxaca
2010,3,Coahuila de Zaragoza,Nayarit,San Luis Potosí,-
2010,4,Colima,Querétaro,Tabasco,-
2010,5,Chihuahua,Sinaloa,Veracruz de Ignacio de la Llave,-
2010,6,Distrito Federal,-,Zacatecas,-
2010,7,Jalisco,-,-,-
2010,8,México,-,-,-
2010,9,Nuevo León,-,-,-


---

## *Notes*

### Clustering tutorial
- Sources
    - [An Introduction to Clustering Algorithms in Python](https://towardsdatascience.com/an-introduction-to-clustering-algorithms-in-python-123438574097)

#### Importing data

In [None]:
raw_sample_data = make_blobs(n_samples=200, n_features=2, centers=4, cluster_std=1.6, random_state=50)

In [None]:
smp_data = raw_sample_data[0]
print(smp_data.shape)
smp_data

In [None]:
smp_labs = raw_sample_data[1]
print(smp_labs.shape)
smp_labs

In [None]:
plt.scatter(
    x=smp_data[:, 0],
    y=smp_data[:, 1],
    c=smp_labs
)

#### Applying k-means ++

In [None]:
# create kmeans object

kmeans = KMeans(n_clusters=4)# fit kmeans object to data

kmeans.fit(smp_data)# print location of clusters learned by kmeans object

print(kmeans.cluster_centers_)# save new clusters for chart

y_km = kmeans.fit_predict(smp_data)

In [None]:
print(y_km.shape)
y_km

---

### Creating features dicts

In [None]:
feat_dict = {}
for col in df_ime.columns:
    feat_dict[col] = {
        "feature": True,
        "selected": True
    }
    
json_dump_dict(feat_dict)

---

---
---