In [1]:
import os
import sys
import pandas as pd
from tqdm import tqdm
import re
import unidecode
from datetime import datetime
import seaborn as sns
from matplotlib import pyplot as plt

In [27]:
# Register tqdm's pandas integration so DataFrame/Series gain the
# `progress_apply` family of methods (progress bars for `apply`).
tqdm.pandas()

In [34]:
# Enable IPython's autoreload extension in mode 2: every imported module is
# re-imported before each cell executes, so edits to the project's .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
# Resolve the directory two levels above the current working directory and
# expose it on sys.path (once), so that the `src` package imported in the
# next cell can be found from this notebook.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [36]:
from src.data import make_dataset
from src.modeling import predict_model

## Configuración

In [54]:
# --- Base directories (relative to the notebook's working directory). -------
# All three end with a trailing '/', so file names are appended WITHOUT a
# leading slash to avoid building paths such as '../../data/interim//stats/..'.
DATA_PATH_CLEAN = '../../data/interim/'
DATA_PATH_OUTPUT = '../../data/final/'
MODEL_PATH = '../../models/'

# --- Cleaned input datasets. -------------------------------------------------
# Each template carries a '{}' placeholder that is filled in later
# (presumably with a year by make_dataset.load_data — confirm in src.data).
file_name_tramites_clean = DATA_PATH_CLEAN + 'tramites_fraudes/tramites_clean_{}.parquet'
file_name_consumo_clean = DATA_PATH_CLEAN + 'facturacion/consumo_clean_{}.parquet'
file_name_notas_clean = DATA_PATH_CLEAN + 'notas/notas_clean_{}.parquet'
file_name_hist_puntos_clean = DATA_PATH_CLEAN + 'puntos/hist_puntos_clean_{}.parquet'

# --- Feature-engineering settings forwarded to make_dataset. -----------------
# NOTE(review): the meaning of each tuple position in config_caidas and of the
# integers in config_constantes is defined in src.data.make_dataset, not here.
config_caidas = [
    (1, 4, 90), (1, 3, 90), (2, 3, 90), (1, 6, 90), (3, 3, 90),
    (6, 5, 10), (6, 6, 10), (6, 4, 10), (6, 1, 10), (5, 5, 10),
]
config_constantes = [8, 9, 10, 3, 4, 5]

# Columns requested when loading the historical "puntos" table.
cols_hist_ptos = ['localizacion', 'fecha', 'estado_medidor', 'estado_contrato', 'id_tarifa']

# Pairs of (list of column names, parquet path template) for the
# "consumption below group" statistics
# (presumably the grouping keys and where each group's stats are stored —
# confirm in make_dataset.prepare_stat_consumo_below_group).
config_below_g = [
    (['control_manzana'], DATA_PATH_CLEAN + 'stats/stat_control_manzana_{}.parquet'),
    (['id_distrito', 'id_tarifa'], DATA_PATH_CLEAN + 'stats/stat_distrito_tarifa_{}.parquet'),
    (['id_distrito', 'id_cant_fases'], DATA_PATH_CLEAN + 'stats/stat_distrito_fases_{}.parquet'),
]

# Column names forwarded to make_dataset.create_predict_data as `var_puntos`.
var_puntos = [
    'localizacion', 'id_provincia', 'id_canton', 'id_distrito', 'id_tipo_medidor',
    'tipo_conexion', 'area_peligrosa', 'id_tarifa', 'id_estado_contrato',
    'tip_instalacion', 'id_cant_fases', 'id_ciu', 'marca_medidor', 'nic',
    'tipo_cliente', 'medidor', 'ind_fraude', 'estado_medidor', 'id_clase',
    'clasif_contrato', 'nise', 'fecha_instalacion',
]

# --- Inference outputs and model artefacts. ----------------------------------
# Column names forwarded to predict_model.make_predictions.
cols_to_send = ['id_seleccion', 'localizacion', 'medidor', 'nise', 'id_sucursal', 'pred_score']
# '{}' is filled with a stage suffix (this notebook reads the 'final_tsfel' one).
file_name_inference = DATA_PATH_OUTPUT + 'inference_data_{}.parquet'
file_name_inference_to_send = DATA_PATH_OUTPUT + 'inference_to_send.csv'
file_stacking_model_name = MODEL_PATH + 'combination_final_model.pkl'
file_cols_for_model = MODEL_PATH + 'features.pkl'

# --- Run parameters. ---------------------------------------------------------
cant_periodos = 12            # number of periods passed to the dataset builders
fecha_fraud = '2023-09-01'    # reference date for this inference run
inference_version = '001'     # version tag forwarded to make_predictions

## Load data sets
---

In [55]:
# Years to load, derived from the reference date and the number of periods.
# The exact return shape is defined in src.data.make_dataset
# (presumably a list of calendar years — confirm there).
lista_anios = make_dataset.get_start_end_years_and_years_range(
    [fecha_fraud],
    cant_periodos,
)

In [56]:
# Load every cleaned dataset for the years in `lista_anios`.
df_ordenes = make_dataset.load_data(file_name_tramites_clean, lista_anios)
df_consumo = make_dataset.load_data(file_name_consumo_clean, lista_anios)
df_notas = make_dataset.load_data(file_name_notas_clean, lista_anios)

# Historical "puntos": same files, passing `cols_hist_ptos` as a third argument
# (presumably a column subset — confirm in make_dataset.load_data).
df_hist_puntos = make_dataset.load_data(
    file_name_hist_puntos_clean,
    lista_anios,
    cols_hist_ptos,
)

# Static "puntos": the same file template, a single year, no extra argument.
# NOTE(review): 2023 is hard-coded here; it coincides with the year of
# `fecha_fraud` — confirm whether it should follow that setting.
df_static_puntos = make_dataset.load_data(file_name_hist_puntos_clean, [2023])

100%|██████████| 2/2 [00:00<00:00, 63.36it/s]
100%|██████████| 2/2 [00:05<00:00,  3.00s/it]
100%|██████████| 2/2 [00:00<00:00,  4.21it/s]
100%|██████████| 2/2 [00:03<00:00,  1.73s/it]
100%|██████████| 1/1 [00:10<00:00, 10.26s/it]


In [57]:
%%time
# Run the shared preparation step over the five frames; results are unpacked
# back into the same names, in the same order as they are passed in.
# NOTE(review): this overwrites the cell's own inputs, so re-running it
# without re-running the load cell feeds already-prepared frames back in.
(
    df_ordenes,
    df_consumo,
    df_notas,
    df_hist_puntos,
    df_static_puntos,
) = make_dataset.prepare_data(
    df_ordenes,
    df_consumo,
    df_notas,
    df_hist_puntos,
    df_static_puntos,
)

CPU times: user 45.2 s, sys: 9.34 s, total: 54.6 s
Wall time: 54.6 s


In [58]:
%%time
# Build the per-group consumption statistics described by `config_below_g`
# for the reference date. The return value is not captured here
# (presumably results are written to the parquet templates in the config —
# confirm in src.data.make_dataset).
make_dataset.prepare_stat_consumo_below_group(
    [fecha_fraud],
    config_below_g,
    df_consumo,
    cant_periodos,
)

100%|██████████| 1/1 [01:04<00:00, 64.92s/it]
100%|██████████| 1/1 [00:19<00:00, 19.62s/it]
100%|██████████| 1/1 [00:19<00:00, 19.66s/it]

CPU times: user 1min 35s, sys: 9.07 s, total: 1min 44s
Wall time: 1min 44s





## Create Prediction Data Sets

In [59]:
# Tariffs to score in this run. Other values previously listed here:
# 'tmt', 'tmtb', 'tg', 'tprom'.
select_id_tarifa = ['tin']

# Unique `localizacion` values whose `id_tarifa` is one of the selected tariffs.
tarifa_mask = df_static_puntos['id_tarifa'].isin(select_id_tarifa)
select_localizacion = (
    df_static_puntos.loc[tarifa_mask, 'localizacion']
    .unique()
    .tolist()
)

# Displayed as the cell output: how many locations will be scored.
len(select_localizacion)

1588

In [60]:
%%time
# Build the inference feature set for the selected locations.
# `file_name_inference` is handed over as the output path template; the next
# section reads its 'final_tsfel' variant back from disk.
make_dataset.create_predict_data(
    fecha_fraud,
    select_localizacion,
    df_consumo,
    df_ordenes,
    df_hist_puntos,
    df_notas,
    df_static_puntos,
    cant_periodos,
    config_below_g,
    config_caidas,
    config_constantes,
    var_puntos,
    file_name_inference,
)

ENTRO PARA CREAR
ENTRI a CREAR TSFEL
*** Feature extraction started ***


  df_result = tsfel.time_series_features_extractor(cfg, df[cols].values.tolist(),verbose=1,n_jobs=-1)



*** Feature extraction finished ***
df_result index: (11,)
*** Feature extraction started ***


  df_result = tsfel.time_series_features_extractor(cfg, df[cols].values.tolist(),verbose=1,n_jobs=-1)



*** Feature extraction finished ***
df_result index: (11,)
*** Feature extraction started ***


  df_result = tsfel.time_series_features_extractor(cfg, df[cols].values.tolist(),verbose=1,n_jobs=-1)



*** Feature extraction finished ***
df_result index: (11,)
temp_vars 14
stat_vars 40
spec_vars 86
df_tsfel (11, 141)
CPU times: user 14.6 s, sys: 7.35 s, total: 21.9 s
Wall time: 22.8 s


## Make Predictions

In [61]:
# Read the 'final_tsfel' variant of the inference feature table from disk
# (presumably the file produced by make_dataset.create_predict_data above).
# The variable name `df_infernce` (sic) is kept because the next cell uses it.
df_infernce = pd.read_parquet(file_name_inference.format('final_tsfel'))

# Attach the branch id (`id_sucursal`) to each location.
# The lookup is de-duplicated first: a left merge repeats a left-hand row once
# per matching right-hand row, so any repeated (localizacion, id_sucursal)
# pair in `df_static_puntos` would otherwise duplicate rows to be scored.
# NOTE(review): a location mapped to several *different* branches would still
# fan out — confirm `localizacion` is unique in `df_static_puntos`.
sucursal_lookup = df_static_puntos[['localizacion', 'id_sucursal']].drop_duplicates()
df_infernce = df_infernce.merge(sucursal_lookup, on='localizacion', how='left')

In [62]:
# Score the inference table with the stored model artefacts.
# Inputs: feature table, pickled feature list, pickled stacking model,
# version tag, column names in `cols_to_send`, and the CSV path from the config.
predict_model.make_predictions(
    df_infernce,
    file_cols_for_model,
    file_stacking_model_name,
    inference_version,
    cols_to_send,
    file_name_inference_to_send,
)