In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

In [2]:

# Load the dataset. The CSV is semicolon-delimited with double-quoted fields.
file_path = 'dataset/itesal.csv'

# Columns removed from the raw frame before the train/test split.
columns_to_drop = [
    'EMPRESA',
    'NUMERO_FACTURA',
    'NUMERO_ASIENTO_BORRADOR',
    'TIPO_FACTURA',
    'DIARIO',
    'IMPORTE_COBRADO_FRA',
    'CUENTA_CONTABLE',
    'NUM_EFECTOS_COBRADOS',
    'NUM_EFECTOS_PARCIAL',
    'NUM_EFECTOS_IMPAGADO',
    'NUM_EFECTOS_FUERA_PLAZO',
    'NUM_EFECTOS_PDTE_EN_PLAZO',
]

# Read from `file_path` (previously assigned but unused, with the literal
# duplicated in the call) and drop the columns without `inplace=True`, so
# `data` is produced by a single expression.
data = pd.read_csv(file_path, quotechar='"', delimiter=";").drop(columns=columns_to_drop)

In [3]:

# Hold out 20% of the rows for testing: draw a reproducible 80% sample for
# training, then keep every row whose index label was not drawn.
train_data = data.sample(frac=0.8, random_state=0)
in_train = data.index.isin(train_data.index)
test_data = data.loc[~in_train]

In [4]:
# Wrap the training split in an AutoGluon TabularDataset; this is the object
# that is later passed to TabularPredictor.fit().
dataset = TabularDataset(train_data)

In [5]:
# Preview the first rows of the training dataset as a sanity check.
dataset.head()

Unnamed: 0,EJERCICIO,NUMERO_SERIE,ORGANIZACION_COMERCIAL,CLIENTE,FECHA_FACTURA,MES_FACTURA,DIVISA,FORMA_COBRO,CENTRO_CONTABLE,LIQUIDO_FACTURA,ALBARAN_FACTURA,ENVIO_ELECTRONICO,FECHA_CONTABILIZACION,TIENE_DESCUENTO,COUNT_TIPOS_TRANSACCION,MAX_FECHA_COBRO,NUM_EFECTOS_FACTURA,CATEGORIA_FACTURA
47759,2021,F01,101,100123,20/11/2021,11,EUR,TR10,1,13432407,N,SI,20/11/2021,1,1.0,07/12/2021,1,1
53583,2021,F73,703,700072,15/09/2021,9,EUR,PG15,7,24679,N,SI,15/09/2021,0,1.0,24/09/2021,1,2
20900,2019,F01,101,100158,31/01/2019,1,EUR,PG60,1,611274,N,SI,31/01/2019,0,1.0,,1,-1
73780,2023,F75,901,900000,03/01/2023,1,EUR,CON,7,11914,S,SI,03/01/2023,0,1.0,04/01/2023,1,1
3046,2018,F61,601,600670,15/11/2018,11,EUR,RN60,6,196834,N,NO,15/11/2018,0,1.0,28/12/2018,1,1


In [6]:
# Name of the target column the predictor is trained on.
label = 'CATEGORIA_FACTURA'
# Summary statistics of the target in the training split.
# NOTE(review): the target looks like a small set of integer class codes, so
# mean/std are presumably not meaningful; a per-class count may be more
# informative -- confirm.
dataset[label].describe()

count    63439.000000
mean         1.418843
std          0.901235
min         -2.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          2.000000
Name: CATEGORIA_FACTURA, dtype: float64

In [7]:
# Train an AutoGluon TabularPredictor for `label` on the training dataset,
# using the 'best_quality' preset and a 120-second time limit. No `path` is
# passed, so the model directory is chosen by AutoGluon.
# NOTE(review): the leaderboard output further down reports fit times well
# above 120 s, so the saved outputs may come from runs with different
# settings -- confirm with Restart & Run All.
predictor = TabularPredictor(label=label).fit(dataset, time_limit=120, presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels/ag-20240626_105526"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.0
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.5.0: Wed May  1 20:13:18 PDT 2024; root:xnu-10063.121.3~5/RELEASE_ARM64_T6030
CPU Count:          11
Memory Avail:       13.76 GB / 36.00 GB (38.2%)
Disk Space Avail:   128.41 GB / 460.43 GB (27.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be

In [8]:
# Show the summary of the fit (per-model validation score and timing) and
# request the accompanying plot.
predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                    model  score_val eval_metric  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       LightGBMXT_BAG_L2   0.901795    accuracy      31.539004  53.019026                2.280306           5.235901            2       True          7
1     WeightedEnsemble_L3   0.901795    accuracy      31.541865  53.490664                0.002861           0.471638            3       True          8
2  NeuralNetFastAI_BAG_L2   0.893031    accuracy      29.921015  65.771923                0.662316          17.988798            2       True          6
3     WeightedEnsemble_L2   0.888870    accuracy      26.916129  48.200179                0.003044           0.437035            2       True          5
4       LightGBMXT_BAG_L1   0.880704    accuracy      24.032576  10.663714               24.032576          10.663714            1       True          4
5  NeuralNetFastAI_B



{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'NeuralNetFastAI_BAG_L2': 'StackerEnsembleModel_NNFastAiTabular',
  'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
  'WeightedEnsemble_L3': 'WeightedEnsembleModel'},
 'model_performance': {'KNeighborsUnif_BAG_L1': 0.8098645943347152,
  'KNeighborsDist_BAG_L1': 0.8230741342076641,
  'NeuralNetFastAI_BAG_L1': 0.8477277384574158,
  'LightGBMXT_BAG_L1': 0.8807042986175696,
  'WeightedEnsemble_L2': 0.8888696227872445,
  'NeuralNetFastAI_BAG_L2': 0.8930311007424455,
  'LightGBMXT_BAG_L2': 0.9017954255268841,
  'WeightedEnsemble_L3': 0.9017954255268841},
 'model_best': 'WeightedEnsemble_L3',
 'model_paths': {'KNeighborsUnif_BAG_L1': ['KNeighborsUnif_BAG_L1'],
  'KNeighborsDist_BAG_L1': ['KNeighborsDi

In [9]:
# Predict on the held-out rows with the target column removed, then preview
# the first few predictions.
test_features = test_data.drop(columns=[label])
y_pred = predictor.predict(test_features)
y_pred.head()

2     1
10    2
13   -1
20    2
21    1
Name: CATEGORIA_FACTURA, dtype: int64

In [10]:
# Score the predictor on the held-out split (which still contains the label
# column) and display the resulting metrics dictionary.
# NOTE(review): silent=True presumably suppresses the printed report so only
# the returned dict is shown -- confirm against the AutoGluon docs.
predictor.evaluate(test_data, silent=True)

{'accuracy': 0.9059899117276167,
 'balanced_accuracy': 0.6489736429920778,
 'mcc': 0.8230132562939034}

In [11]:
# Per-model leaderboard; passing the held-out split adds test-set columns
# (score_test, pred_time_test) next to the validation ones.
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI_BAG_L2,0.908071,0.901764,accuracy,5.629897,32.511357,302.778045,0.7208,0.779316,68.594108,2,True,6
1,WeightedEnsemble_L3,0.90599,0.904381,accuracy,5.631608,32.514757,303.284977,0.001711,0.0034,0.506932,3,True,7
2,WeightedEnsemble_L2,0.898676,0.897177,accuracy,4.271829,27.716147,234.879062,0.001905,0.004233,0.729232,2,True,5
3,NeuralNetFastAI_BAG_L1,0.896658,0.880625,accuracy,0.789692,0.727567,187.199307,0.789692,0.727567,187.199307,1,True,3
4,LightGBMXT_BAG_L1,0.884111,0.882927,accuracy,2.881481,23.130464,46.89936,2.881481,23.130464,46.89936,1,True,4
5,KNeighborsDist_BAG_L1,0.815637,0.823074,accuracy,0.598751,3.853883,0.051163,0.598751,3.853883,0.051163,1,True,2
6,KNeighborsUnif_BAG_L1,0.801955,0.809865,accuracy,0.639173,4.020127,0.034107,0.639173,4.020127,0.034107,1,True,1


In [12]:
# Permutation-based feature importance computed on the held-out split.
# NOTE(review): MAX_FECHA_COBRO ranks far above every other feature; if that
# date is only known after the invoice outcome it would be target leakage --
# confirm with the data owner.
predictor.feature_importance(test_data)

Computing feature importance via permutation shuffling for 17 features using 5000 rows with 5 shuffle sets...
	197.7s	= Expected runtime (39.54s per shuffle set)
	150.95s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
MAX_FECHA_COBRO,0.38564,0.007516,1.730396e-08,5,0.401115,0.370165
FECHA_FACTURA,0.17504,0.006845,2.799814e-07,5,0.189133,0.160947
FORMA_COBRO,0.1404,0.007559,1.004423e-06,5,0.155964,0.124836
NUMERO_SERIE,0.02884,0.00217,3.815949e-06,5,0.033308,0.024372
MES_FACTURA,0.00856,0.002161,0.00044826,5,0.013009,0.004111
NUM_EFECTOS_FACTURA,0.00424,0.001381,0.001179743,5,0.007084,0.001396
ALBARAN_FACTURA,0.00364,0.000518,4.779941e-05,5,0.004706,0.002574
CENTRO_CONTABLE,0.00272,0.000832,0.0009306966,5,0.004433,0.001007
ENVIO_ELECTRONICO,0.00264,0.00122,0.004202007,5,0.005152,0.000128
EJERCICIO,0.00168,0.002419,0.09770286,5,0.006661,-0.003301


In [None]:
predictor.plo