In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

In [2]:

# Load the dataset from dataset/s_alimentacion.csv, two folders above this notebook.
file_path = '../../dataset/s_alimentacion.csv'
data = pd.read_csv(file_path, delimiter=';')

# Parse each date column (values that cannot be parsed become NaT because of
# errors='coerce') and derive year / month / day / weekday features from it.
date_columns = ['FECHA_FACTURA', 'MAX_FECHA_COBRO', 'FECHA_CONTABILIZACION']
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')
    for suffix, part in (('_YEAR', 'year'), ('_MONTH', 'month'),
                         ('_DAY', 'day'), ('_WEEKDAY', 'weekday')):
        data[col + suffix] = getattr(data[col].dt, part)

# Remove the raw date columns (replaced by the derived features above) together
# with the columns that are excluded from the feature set.
columns_to_drop = [
    'EMPRESA', 'NUMERO_FACTURA', 'NUMERO_ASIENTO_BORRADOR', 'TIPO_FACTURA',
    'DIARIO', 'IMPORTE_COBRADO_FRA', 'CUENTA_CONTABLE', 'NUM_EFECTOS_COBRADOS',
    'NUM_EFECTOS_PARCIAL', 'NUM_EFECTOS_IMPAGADO', 'NUM_EFECTOS_FUERA_PLAZO',
    'NUM_EFECTOS_PDTE_EN_PLAZO',
]
data = data.drop(columns=date_columns + columns_to_drop)

# Every remaining missing value (including date parts derived from NaT) becomes 0.
data = data.fillna(0)

# Encode each object-typed column as integers. The values are cast to str first
# because the fillna above can leave a numeric 0 mixed in with the strings.
# The fitted encoders are kept per column in `label_encoders`.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column].astype(str))
    label_encoders[column] = encoder

# Feature matrix: everything except the target column.
X = data.drop(columns=['CATEGORIA_FACTURA'])

# Target: class labels re-coded to consecutive integers; `label_encoder_y`
# is kept so predictions can be mapped back to the original labels.
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(data['CATEGORIA_FACTURA'])

# Split into train and test sets BEFORE any resampling, so the test set holds
# only real rows. Running SMOTE on the full dataset first would put synthetic
# samples (interpolated from rows that may end up in the training set) into
# the test set and make every reported metric optimistic.
# stratify=y keeps the class proportions in both partitions, which matters
# here because one class is very rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the features with statistics learned from the real training
# rows only; the same transformation is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the minority classes with SMOTE on the training partition only.
# It runs after scaling because SMOTE picks neighbours by distance, so all
# features should contribute on a comparable scale.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Helper that evaluates a fitted model on held-out data
def evaluate_model(name, model, X_test, y_test):
    """Print an evaluation report for ``model`` and return its summary scores.

    Parameters
    ----------
    name
        Label printed above the report and stored as the first result element.
    model
        Fitted object exposing ``predict``.
    X_test
        Features passed to ``model.predict``.
    y_test
        True labels the predictions are compared against.

    Returns
    -------
    list
        ``[name, accuracy, precision, recall, f1]``; precision, recall and F1
        are computed with ``average='weighted'``.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    # The three class-wise metrics share the same call signature.
    precision, recall, f1 = (
        metric(y_test, predictions, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"\n{name}:")
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

    return [name, accuracy, precision, recall, f1]

# Accumulates one [name, accuracy, precision, recall, f1] row per evaluated model.
results = []

# Hyper-parameter grids explored by GridSearchCV, one per model family.

# Decision tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 20, 50],
    'min_samples_split': [2, 5, 10]
}

# Random forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 20],
    'criterion': ['gini', 'entropy']
}

# Support vector machine
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

# K-nearest neighbours.
# 'minkowski' is intentionally not listed: with KNeighborsClassifier's default
# p=2 it is the same distance as 'euclidean', so including both would fit every
# (n_neighbors, weights) combination twice for identical scores.
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# XGBoost
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0]
}

# Stratified 5-fold cross-validation, shuffled with a fixed seed; shared by
# every grid search below.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

  data = pd.read_csv(file_path, delimiter=';')
  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')


In [3]:
# Show how many records each class has in `data`, i.e. without SMOTE applied
print("Clases sin smote")
class_counts = data['CATEGORIA_FACTURA'].value_counts()
print(class_counts)

Clases sin smote
CATEGORIA_FACTURA
 1    160429
 2     92679
-1     24541
 0     19118
-2        55
Name: count, dtype: int64


In [None]:

# Decision tree: grid search over param_grid_dt with the shared stratified folds.
# The estimator is seeded like the other random components of this notebook
# (SMOTE, the train/test split, the CV folds) so repeated runs are comparable.
dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid_dt,
    cv=cv,
    refit=True,  # refit the best configuration on the whole training set
    verbose=2,
)
dt.fit(X_train, y_train)
results.append(evaluate_model("Árboles de Decisión", dt, X_test, y_test))


In [None]:
# Random forest: grid search over param_grid_rf with the shared stratified folds.
# The forest draws bootstrap samples and feature subsets at random, so it is
# seeded like the other random components of this notebook to make the scores
# repeatable from run to run.
rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=cv,
    refit=True,  # refit the best configuration on the whole training set
    verbose=2,
)
rf.fit(X_train, y_train)

# Test-set predictions mapped back to the original class labels.
y_pred_rf = label_encoder_y.inverse_transform(rf.predict(X_test))
results.append(evaluate_model("Random Forest", rf, X_test, y_test))

In [None]:
# XGBoost: grid search over param_grid_xgb with the shared stratified folds
xgboost = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=param_grid_xgb,
    cv=cv,
    refit=True,
    verbose=2,
)
xgboost.fit(X_train, y_train)

# Test-set predictions mapped back to the original class labels
y_pred_xgb = label_encoder_y.inverse_transform(
    xgboost.predict(X_test)
)

# Record the summary scores for the comparison table
results.append(
    evaluate_model("XGBoost", xgboost, X_test, y_test)
)

In [None]:
# K-nearest neighbours: grid search over param_grid_knn with the shared folds
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=cv,
    refit=True,
    verbose=2,
)
knn.fit(X_train, y_train)

# Test-set predictions mapped back to the original class labels
y_pred_knn = label_encoder_y.inverse_transform(
    knn.predict(X_test)
)

# Record the summary scores for the comparison table
results.append(
    evaluate_model("K-Nearest Neighbors", knn, X_test, y_test)
)

In [None]:
# SVM with hyper-parameter search over param_grid_svm and the shared folds.
# NOTE(review): a later cell rebinds `svm` to a fixed-parameter SVC and appends
# a result under the same "Support Vector Machine" label - confirm which of the
# two cells is meant to be run.
svm = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid_svm,
    cv=cv,
    refit=True,
    verbose=2,
)
svm.fit(X_train, y_train)

# Test-set predictions mapped back to the original class labels
y_pred_svm = label_encoder_y.inverse_transform(
    svm.predict(X_test)
)

# Record the summary scores for the comparison table
results.append(
    evaluate_model("Support Vector Machine", svm, X_test, y_test)
)

In [None]:
# SVM with fixed hyper-parameters (no grid search): RBF kernel, C=10, gamma=0.1
svm = SVC(
    kernel='rbf',
    C=10,
    gamma=0.1,
    random_state=42,
)
svm.fit(X_train, y_train)

# Predictions stay in the encoded label space here (no inverse_transform)
y_pred_svm = svm.predict(X_test)

# Per-class report followed by the confusion matrix
print("\nSupport Vector Machine:")
for report in (classification_report, confusion_matrix):
    print(report(y_test, y_pred_svm))


In [None]:
# Record the summary scores of the current `svm` model for the comparison table
results.append(
    evaluate_model(
        name="Support Vector Machine",
        model=svm,
        X_test=X_test,
        y_test=y_test,
    )
)

In [None]:
# Stacking: five base learners with default hyper-parameters, combined by a
# logistic-regression meta-learner.
# The decision tree and random forest are seeded like the other random
# components of this notebook (SMOTE, the split, the CV folds) so the stacked
# model's scores are repeatable from run to run.
estimators = [
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('svm', SVC()),
    ('knn', KNeighborsClassifier()),
    ('xgb', xgb.XGBClassifier())
]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking.fit(X_train, y_train)
results.append(evaluate_model("Stacking", stacking, X_test, y_test))

stacking: 1700

svm: 463

knn: 121

xgb: 63

rf: 193

dt: 6

In [None]:

# Build a DataFrame with one row per evaluated model
results_df = pd.DataFrame(results, columns=["Modelo", "Exactitud", "Precisión", "Recall", "F1-Score"])

# Show the comparison table
print("\nTabla de comparación de modelos:")
print(results_df)

# Plot the comparison as grouped bars.
# figsize and the tick rotation are passed to DataFrame.plot itself: it creates
# its own figure, so a preceding plt.figure(figsize=...) would only open an
# empty extra figure and the size would not apply to the chart.
ax = results_df.set_index("Modelo").plot(kind='bar', figsize=(12, 8), rot=45)
ax.set_title('Comparación de rendimiento de modelos')
ax.set_ylabel('Puntuación')
ax.legend(loc='best')
# plt.show must be called; the bare attribute `plt.show` does nothing.
plt.show()

In [None]:
# XGBoost (repeat of the earlier XGBoost cell).
# The same grid search is already run earlier in this notebook; repeating it
# costs a full search for an equivalent model. Reuse the fitted search when it
# is available and only fit when this cell is run on its own.
# NOTE(review): if X_train / y_train were changed after the earlier fit, delete
# `xgboost` (or re-run the earlier cell) to force a fresh search.
if 'xgboost' not in globals() or not hasattr(xgboost, 'best_estimator_'):
    xgboost = GridSearchCV(xgb.XGBClassifier(), param_grid_xgb, cv=cv, refit=True, verbose=2)
    xgboost.fit(X_train, y_train)

# Test-set predictions mapped back to the original class labels
y_pred_xgb = label_encoder_y.inverse_transform(xgboost.predict(X_test))

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Hard-coded confusion matrices, one per entry.
# NOTE(review): the second matrix is 4x4 while the others are 5x5. Below it is
# added into the top-left corner of the total, which is only right if the class
# it lacks is the last one - confirm against the run that produced it.
confusion_matrices = [
    np.array([
        [16577, 0, 0, 13, 2],
        [1, 16470, 0, 4, 3],
        [1, 0, 16352, 352, 108],
        [4, 4, 54, 15328, 1280],
        [19, 0, 17, 1237, 15286],
    ]),
    np.array([
        [21016, 0, 3, 0],
        [0, 69590, 850, 21],
        [5, 1287, 67764, 755],
        [0, 60, 1156, 9985],
    ]),
    np.array([
        [18003, 0, 0, 0, 1],
        [0, 17816, 39, 1, 22],
        [0, 12, 17998, 53, 17],
        [0, 0, 153, 16955, 802],
        [0, 0, 66, 889, 16940],
    ]),
    np.array([
        [9685, 0, 0, 8, 17],
        [10, 8993, 54, 458, 162],
        [1, 45, 9626, 68, 15],
        [19, 548, 80, 8020, 946],
        [5, 238, 26, 976, 8259],
    ]),
    np.array([
        [31937, 0, 0, 0, 0],
        [2, 28028, 2658, 1060, 439],
        [2, 2319, 27865, 26, 1961],
        [0, 956, 23, 31161, 105],
        [0, 906, 3305, 124, 27552],
    ]),
]

# The unified matrix is square, sized to the largest number of rows found
max_size = max(len(cm) for cm in confusion_matrices)
unified_confusion_matrix = np.zeros((max_size, max_size), dtype=int)

# Accumulate every matrix into the top-left corner of the unified one
for matrix in confusion_matrices:
    n_rows, n_cols = matrix.shape
    unified_confusion_matrix[:n_rows, :n_cols] += matrix

# Draw the unified confusion matrix as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    unified_confusion_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
)
plt.title('Matriz de confusión unificada')
plt.xlabel('Predicción')
plt.ylabel('Actual')
plt.show()