## PRUEBA DE MODELOS SIMPLES

In [None]:
#!pip install -r ../../requirements.txt
!pip install pyarrow
!pip install tsfel
!pip install imblearn
!pip install lightgbm
!pip install tensorflow
!pip install catboost
!pip install boruta

In [2]:
import os
import pandas as pd
import numpy as np
import warnings
import boto3
import io
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm
from sklearn.pipeline import Pipeline

warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.5f}'.format #evita que muestre notacion cientifica
# pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
nombre_carpeta = '../../data/2022'

module_path = os.path.abspath(os.path.join('../../'))
import sys
if module_path not in sys.path:
    sys.path.append(module_path)
from src.preprocessing.preprocessing  import llenar_val_vacios_str,llenar_val_vacios_ciclo,TsfelVars, ExtraVars,ToDummy, TeEncoder, CardinalityReducer
from src.modeling.simple_models import ChangeTrendPercentajeIdentifierWide,ConstantConsumptionClassifierWide,PearsonIdentifierWide
from src.modeling.supervised_models import LGBMModel, NNModel, LSTMNNModel
from src.modeling.feature_selection import feature_selection_by_constant, feature_selection_by_boruta, feature_selection_by_correlation
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, f1_score, precision_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import _pickle as pickle
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin

Matplotlib is building the font cache; this may take a moment.


In [6]:
#Particionar por fecha: Quedan muy desbalanceados los conjuntos
#df_train = df[df.date_fiscalizacion<'2022-09-01'].copy()
df = pd.read_parquet(os.path.join(nombre_carpeta, 'data_normal_and_frauds_wide.parquet'))
df = df.rename(columns={'is_fraud':'target','id':'index'})
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

Proportion of target class in train set: 23.231285988483684
Proportion of target class in validation set: 23.75588235294118
Proportion of target class in test set: 24.661585365853657


In [11]:
variables_consumo = [x for x in X.columns if '_anterior' in x and x!='0_anterior']
last_base_value,last_eval_value,threshold = 3,1,60
trend_perc_model = ChangeTrendPercentajeIdentifierWide(last_base_value,last_eval_value,threshold)
trend_pred = trend_perc_model.predict(df_test[variables_consumo])
print(100*trend_pred.is_fraud_trend_perc.value_counts(normalize=True))
min_count_constante =7
const_model = ConstantConsumptionClassifierWide(min_count_constante)
y_test_pred = const_model.predict(df_test[variables_consumo])
y_test = df_test.target.copy()
print(100*y_test_pred.value_counts(normalize=True))

0   77.89947
1   22.10053
Name: is_fraud_trend_perc, dtype: float64
0   89.06708
1   10.93292
dtype: float64


In [14]:
trend_pred.isnull().sum()

trend_perc             7
is_fraud_trend_perc    0
dtype: int64

In [15]:
print('Reporte para Trend Perc:')
print(classification_report(y_test, trend_pred['is_fraud_trend_perc']))
print('Reporte para Constant Consumption:')
print(classification_report(y_test, y_test_pred))

Reporte para Trend Perc:
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      5075
           1       0.00      0.13      0.01        38

    accuracy                           0.77      5113
   macro avg       0.50      0.45      0.44      5113
weighted avg       0.98      0.77      0.87      5113

Reporte para Constant Consumption:
              precision    recall  f1-score   support

           0       0.99      0.89      0.94      5075
           1       0.02      0.26      0.03        38

    accuracy                           0.89      5113
   macro avg       0.51      0.58      0.49      5113
weighted avg       0.99      0.89      0.93      5113



In [5]:
## CARGAR DATATASETS PREPROCESADOS PARA LOS MODELOS
carpeta=nombre_carpeta+'/dataset_nuevo/'#_extendido
feat_selec = pd.read_parquet(carpeta+'feat_selec.parquet')
X_train = pd.read_parquet(carpeta+'X_train.parquet')
y_train_aux = pd.read_parquet(carpeta+'y_train.parquet')
y_train = y_train_aux['target']
X_val = pd.read_parquet(carpeta+'X_val.parquet')
y_val_aux = pd.read_parquet(carpeta+'y_val.parquet')
y_val = y_val_aux['target']
X_test = pd.read_parquet(carpeta+'X_test.parquet')
y_test_aux = pd.read_parquet(carpeta+'y_test.parquet')
y_test = y_test_aux['target']

variables_consumo = [x for x in X_train.columns if '_anterior' in x]# and x!='0_anterior'
variables_categoricas = ['cant_consumo_est','cant_estado_0','cant_estado_1','cant_estado_2','cant_estado_3','cant_estado_4','mes','bimestre','trimestre','cuatrimestre','semestre','cant_categorias','ult_categoria','categ_mas_frecuente','cambios_categoria']
cols_excluir = ['index','instalacion','date_fiscalizacion'] + variables_consumo + variables_categoricas
cols_for_feature_sel = [x for x in X_train.columns if x not in cols_excluir]

feauture_selected = select_by_boruta = feat_selec['boruta'][0].tolist()
select_by_corr = feat_selec['corr'][0].tolist()
select_by_constant = feat_selec['constant'][0].tolist()
cols_for_model = variables_categoricas+variables_consumo+feauture_selected

In [39]:
%%time
#PEARSON, TENDENCIA A LA BAJA EN EL CONSUMO, DIFERENTES UMBRALES
for x in np.arange(0, 1, 0.1):
    obj=PearsonIdentifierWide(-x)
    predicciones_pearson=obj.predict(X_test)
    print(f'TH: {-x}')
    #print(classification_report(y_test, predicciones_pearson['is_fraud_pearson']))
    y_pred = predicciones_pearson['is_fraud_pearson']
    print(f'\tRecall: {recall_score(y_test,y_pred):.2f}, Precision: {precision_score(y_test,y_pred):.2f}, F-1: {f1_score(y_test,y_pred):.2f}')

TH: -0.0
	Recall: 0.41, Precision: 0.04, F-1: 0.06
TH: -0.1
	Recall: 0.38, Precision: 0.04, F-1: 0.07
TH: -0.2
	Recall: 0.34, Precision: 0.04, F-1: 0.07
TH: -0.30000000000000004
	Recall: 0.30, Precision: 0.04, F-1: 0.07
TH: -0.4
	Recall: 0.26, Precision: 0.04, F-1: 0.07
TH: -0.5
	Recall: 0.21, Precision: 0.04, F-1: 0.07
TH: -0.6000000000000001
	Recall: 0.17, Precision: 0.04, F-1: 0.07
TH: -0.7000000000000001
	Recall: 0.13, Precision: 0.05, F-1: 0.07
TH: -0.8
	Recall: 0.08, Precision: 0.07, F-1: 0.07
TH: -0.9
	Recall: 0.01, Precision: 0.07, F-1: 0.02
CPU times: user 19.5 s, sys: 3.96 ms, total: 19.5 s
Wall time: 19.5 s
