In [None]:
#Importaciones de librerias
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier #Algoritmo del árbol de decisión
from sklearn.metrics import f1_score
from sklearn import tree
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import accuracy_score


In [None]:
# Load the dataset.
# In addition to pandas' default NA markers, the tokens '?', 'None' and 'nan'
# are parsed as missing values.
df = pd.read_csv(
    'model/company_data.csv',
    na_values=['?', 'None', 'nan'],
)
# Preview the first three rows (displayed as the cell output).
df.head(3)

In [None]:
# --- Missing-value handling ---
# Treat infinite values as missing so they are counted and imputed as well.
df = df.replace([np.inf, -np.inf], np.nan)

# Show the per-column count of missing values (only columns that have any).
# It is printed explicitly because a bare expression in the middle of a cell
# is evaluated but never displayed.
nulos_por_columna = df.isnull().sum()
print(nulos_por_columna[nulos_por_columna > 0])

# Drop the columns where more than 50% of the values are missing:
# `thresh` is the minimum number of non-NA values a column needs to be kept.
umbral = 0.5 * len(df)
df = df.dropna(axis=1, thresh=umbral)

# Fill the remaining gaps with each column's median. `numeric_only=True` keeps
# this working on pandas >= 2.0 when a non-numeric column is present; such
# columns are simply left untouched by the fill.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split, and the fill also applies to the target column if it has
# NaN values — confirm both are acceptable.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Class balance of the target column 'Y': absolute counts per class, followed
# by the column mean scaled to a percentage.
# NOTE(review): the mean is the share of positives only if Y is coded 0/1 — confirm.
target_col = df['Y']
print("Distribución original de clases:")
print(target_col.value_counts())
print(f"Porcentaje de Y: {target_col.mean()*100:.2f}%")

In [None]:
# --- Correlation analysis ---
# Pairwise correlation between columns (pandas' default method).
# Only numeric/bool columns are used: on pandas >= 2.0 `DataFrame.corr()` raises
# if a non-numeric column is present. On an all-numeric frame the result is
# the same as calling `df.corr()` directly.
correlation_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# Annotated heatmap, colour scale centred on 0 so positive and negative
# correlations are visually symmetric.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', center=0, ax=ax)
ax.set_title("Matriz de Correlación")
plt.show()

# Text version of the same matrix.
print("\nMatriz de Correlación Numérica:")
print(correlation_matrix)

In [None]:
 # --- PREDICTOR VARIABLES (features) ---
# Financial ratios used as model inputs. Each line gives the ratio definition
# and, on the right, how it is built from the Spanish-named accounting fields
# (presumably the raw inputs used elsewhere in the project — verify).
#   X1:  net profit / total assets              = utilidad_neta / total_activos
#   X2:  total liabilities / total assets       = total_pasivos / total_activos
#   X3:  working capital / total assets         = (activo_corriente - pasivo_corriente) / total_activos
#   X4:  current assets / short-term liabilities = activo_corriente / pasivo_corriente
#   X6:  retained earnings / total assets       = utilidad_neta / total_activos
#        NOTE(review): same field mapping as X1 although the ratio is defined
#        on retained earnings — confirm this is intended.
#   X8:  book value of equity / total liabilities = patrimonio / total_pasivos
#   X9:  sales / total assets                   = ventas_totales / total_activos
#   X10: equity / total assets                  = patrimonio / total_activos
#   X17: total assets / total liabilities       = total_activos / total_pasivos
#   X18: gross profit / total assets            = (ventas_totales - costo_ventas) / total_activos
#   X19: gross profit / sales                   = (ventas_totales - costo_ventas) / ventas_totales
#   X23: net profit / sales                     = utilidad_neta / ventas_totales
#   X44: (receivables * 365) / sales            = (cuentas_por_cobrar * 365) / ventas_totales
#   X50: current assets / total liabilities     = activo_corriente / total_pasivos
#   X51: short-term liabilities / total assets  = pasivo_corriente / total_activos
#   X60: sales / inventory                      = ventas_totales / inventario_final
#   X61: sales / receivables                    = ventas_totales / cuentas_por_cobrar
columnas_predictoras = [
    "X1", "X2", "X3", "X4", "X6", "X8", "X9", "X10", "X17",
    "X18", "X19", "X23", "X44", "X50", "X51", "X60", "X61",
]
# Feature matrix as a NumPy array, columns in the order listed above.
variablex = df[columnas_predictoras].values
# Target as a NumPy array. The double brackets select a one-column DataFrame,
# so this is two-dimensional (one column), not a flat vector.
variabley = df[["Y"]].values

In [None]:
    # --- TRAIN / TEST SPLIT ---
# 67% of the rows are used for training and 33% for testing.
# Stratifying on the target keeps the class proportions approximately equal in
# both partitions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    variablex,
    variabley,
    test_size=0.33,
    random_state=42,
    stratify=variabley,
)

In [None]:
  # --- CLASS BALANCING WITH SMOTE ---
# SMOTE creates synthetic samples of the minority class to balance the data.
# Only the training partition is resampled here; X_test / y_test are not touched.
smote = SMOTE(random_state=42)

# y_train is a column vector (it comes from `df[["Y"]].values`). It is flattened
# to 1-D before resampling because passing a column-vector target makes
# scikit-learn's input validation emit a DataConversionWarning. The resampled
# target is a 1-D array either way.
X_resampled, y_resampled = smote.fit_resample(X_train, np.ravel(y_train))


In [None]:
# --- MODEL CREATION AND TRAINING ---
# Decision tree with log-loss split criterion and depth capped at 10; the
# fixed random_state makes the fitted tree repeatable.
# NOTE(review): the training data was already balanced with SMOTE, so
# class_weight='balanced' probably has no additional effect — confirm that
# using both is intended.
model = DecisionTreeClassifier(
    criterion="log_loss",
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
# Fit on the resampled training data (the estimator is shown as cell output).
model.fit(X_resampled, y_resampled)

In [None]:
# --- PREDICTION (inputs for the evaluation cells) ---
# Predicted class labels for the test set.
y_pred = model.predict(X_test)

# Per-class probabilities; column 1 is the probability of model.classes_[1]
# (the positive class, assuming labels are 0/1 — TODO confirm).
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

In [None]:

    # --- RESULTS REPORT ---
print("===DECISION TREE===")
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Matriz de confusión:\n", conf_matrix)
    

In [None]:
# --- SUMMARY METRICS ON THE TEST SET ---
# `model.score` on a classifier returns the mean accuracy, so the message
# labels it as accuracy ("exactitud") rather than precision, which is a
# different metric.
porcent = model.score(X_test, y_test)
print(f"El modelo obtuvo {porcent*100} % de exactitud (accuracy) para clasificar")

# F1 score computed from the predicted labels.
f1 = f1_score(y_test, y_pred)
print(f"El modelo obtuvo un indice F1 Score de: {f1}")

# ROC AUC uses the predicted probabilities, not the hard labels.
print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob):.4f}")

# The criterion name is read from the fitted model so the message cannot
# disagree with the estimator's actual configuration.
print(f"Model accuracy score with criterion {model.criterion}: {accuracy_score(y_test, y_pred):0.4f}")
    

In [None]:
# Full per-class report on the test set (precision, recall, F1-score, support).
report = classification_report(y_test, y_pred)
print(report)