In [154]:
import os
import json
import numpy as np
import pandas as pd
import compress_pickle as cpickle

from scripts import utils

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import plotly.express as px

In [140]:
model = 'test'
dataset = 'saludmental'

basefolder = f'models/{model}/{dataset}/'

infolder = basefolder + 'input/data/'
configfolder = basefolder + 'config/'

CARGAR DATOS PREPARADOS

In [141]:
filename1 = utils.GET_LAST_FILE(infolder, 'prepared')
data = cpickle.load(filename1)

# cargo dataframes de trabajo
dataframe = data['dataset']
data['dataset'] = None

# cargo dataframe de informacion
infoframe = data['information']
data['information'] = None

# cargo diccionario de variables
variables = data['variables']
data['variables'] = None

ESTANDARIZACION DE DATOS

In [142]:
zscaler = StandardScaler()
dfz = pd.DataFrame(zscaler.fit_transform(dataframe), columns=dataframe.columns, index=dataframe.index)

EVALUACION DE COMPONENTES

In [143]:
def optimize_components_expvar(dataframe, threshold=0.95, seed=16):
    
    pca = PCA(n_components=dataframe.shape[1]-1, whiten=True, random_state=seed)
    pca.fit(dataframe)

    test = pd.Series(np.round(np.cumsum(pca.explained_variance_ratio_), 4), index=range(1, dataframe.shape[1])).to_frame().reset_index()
    test.columns = ['comps', 'exp']

    check = test.loc[test.exp >= threshold, :]
    
    comps = int(check.iloc[0, 0])
    expvar = check.iloc[0, 1]
    redu = np.round(check.iloc[0, 0] / (dataframe.shape[1]), 4)

    return comps, expvar, redu

In [144]:
print('Buscando componentes optimos ...')
componentes_optimos, explained_variance, reduccion_dimensional = optimize_components_expvar(dataframe=dfz, threshold=0.95, seed=16)
print(f'     ... componentes encontrados: {componentes_optimos} ({explained_variance} varianza explicada) ({reduccion_dimensional} dimension reducida)')

Buscando componentes optimos ...
     ... componentes encontrados: 9 (0.9691 varianza explicada) (0.5625 dimension reducida)


ENTRENAMIENTO MODELO PCA

In [145]:
pca = PCA(n_components=componentes_optimos, whiten=True, random_state=16)
pca.fit(dfz)

rd = pd.DataFrame(pca.transform(dfz), index=dfz.index)
drz = pd.DataFrame(pca.inverse_transform(rd).values, index=dfz.index, columns=dfz.columns)

In [146]:
# datos originales
sample = dfz.sample(3)
sample

Unnamed: 0_level_0,cantidad,valorprestacionclp,valorprestacionuf,costounitarioclp,valorreclamadoclp,valorpagouf,valorpagoclp,fonasa,clasif_psicologia,clasif_psicopedagogia,clasif_psiquiatria,day_sin,day_cos,month_sin,month_cos,titular
uniqueid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
id_1549,4.326399,3.300229,3.298561,-0.075156,3.276107,1.286901,1.287786,-0.992805,0.517545,-0.065172,-0.51097,0.852063,-0.795272,-0.083975,-0.074528,-0.974334
id_7447,-0.207504,-0.804086,-0.804616,-1.132511,-0.823395,-0.504296,-0.503626,1.007247,0.517545,-0.065172,-0.51097,-1.804914,-0.523175,-0.083975,-0.074528,-0.974334
id_0025,-0.207504,0.40732,0.407408,0.985258,0.219155,-0.006043,-0.006066,-0.992805,0.517545,-0.065172,-0.51097,-0.563357,-1.411894,-0.083975,-0.074528,1.026342


In [147]:
# datos dimension reducida
rd.loc[sample.index, :]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
uniqueid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
id_1549,2.491869,-2.276437,-0.28585,-0.193727,-0.350439,0.797288,-0.543548,1.070485,4.088281
id_7447,-0.878698,-0.375635,0.030281,-1.446328,1.117837,0.045852,-0.880844,-0.726658,0.112912
id_0025,0.196124,0.006303,-0.107053,-1.288172,-1.366676,0.393323,0.888584,0.204153,-0.321705


In [148]:
# datos reconstruidos
drz.loc[sample.index, :]

Unnamed: 0_level_0,cantidad,valorprestacionclp,valorprestacionuf,costounitarioclp,valorreclamadoclp,valorpagouf,valorpagoclp,fonasa,clasif_psicologia,clasif_psicopedagogia,clasif_psiquiatria,day_sin,day_cos,month_sin,month_cos,titular
uniqueid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
id_1549,4.509105,3.405904,3.405871,-0.177209,2.381911,1.524098,1.524028,-0.962721,0.341282,-0.112838,-0.325757,0.840612,-0.766289,-0.087337,-0.068719,-0.991513
id_7447,-0.239613,-0.813716,-0.814574,-0.974267,-0.628858,-0.586682,-0.585961,1.050791,0.614233,-0.036094,-0.613037,-1.799592,-0.53534,-0.078569,-0.08065,-0.966697
id_0025,-0.01808,0.23698,0.236162,0.355596,-0.008489,0.23432,0.235035,-1.266233,0.1528,-0.18697,-0.123995,-0.5818,-1.371166,-0.08184,-0.07481,0.999536


MEDICION ERROR DE RECONSTRUCCION

[DISTANCIA EUCLIDEANA] D^2 = (a2 - a1)^2 + (b2 - b1)^2 + (c2 - c1)^2

In [149]:
# calcular el error de reconstruccion entre dfz (original) y drz (reconstruido)
anomalyscore = np.sqrt(((drz-dfz) ** 2).sum(axis=1)).to_frame().rename(columns={0:'anomalyscore'})
anomalyscore.loc[sample.index, :]

Unnamed: 0_level_0,anomalyscore
uniqueid,Unnamed: 1_level_1
id_1549,1.023718
id_7447,0.316993
id_0025,1.016626


CONCATENACION DE RESULTADOS

In [150]:
result = pd.concat([anomalyscore, infoframe, dataframe], axis=1)

### VISUALIZACION RESULTADOS

In [151]:
dft.columns

Index(['anomalyscore', 'ruttitular', 'rutbeneficiario', 'numerosolicitud',
       'idgrupoprestacion', 'clasificaciongrupo', 'idsubgrupoprestacion',
       'clasificacionsubgrupo', 'idaperturaprestacion',
       'clasificacionapertura', 'fechaprestacion', 'fecharecepcionliquidacion',
       'prevision', 'rutprestador', 'nombreprestador', 'cantidad',
       'valorprestacionclp', 'valorprestacionuf', 'costounitarioclp',
       'valorreclamadoclp', 'valorpagouf', 'valorpagoclp', 'fonasa',
       'clasif_psicologia', 'clasif_psicopedagogia', 'clasif_psiquiatria',
       'slaliquidacion', 'day_sin', 'day_cos', 'month_sin', 'month_cos',
       'titular', 'registro'],
      dtype='object')

In [None]:
dft = result.copy()
dft['registro'] = np.arange(dft.shape[0])

fig = px.line(dft, x='registro', y='anomalyscore', hover_data=['ruttitular', 'numerosolicitud', 'clasificaciongrupo', 'clasificacionsubgrupo', 'clasificacionapertura', 'fechaprestacion', 'fecharecepcionliquidacion', 'rutprestador', 'nombreprestador', 'cantidad', 'valorprestacionclp', 'valorpagoclp', 'costounitarioclp'])
fig.show()