In [1]:
import numpy as np
import pandas as pd

def naive_bayes_train(X, y):
    """Fit a categorical Naive Bayes model by relative-frequency counting.

    Args:
        X: DataFrame of features; every column is treated as discrete.
        y: Class labels, one per row of ``X`` (used as a boolean mask on ``X``).

    Returns:
        A ``(priors, cond_probs)`` tuple where ``priors`` maps each class label
        to its relative frequency in ``y`` and ``cond_probs`` maps each class
        label to ``{column: {value: P(value | class)}}``. Only values actually
        observed for a class appear in its table (no smoothing is applied).
    """

    def _value_distribution(column):
        # Relative frequency of every distinct value in a single column.
        values, freqs = np.unique(column, return_counts=True)
        return dict(zip(values, freqs / freqs.sum()))

    # Class priors: share of training rows that belong to each class.
    classes, class_counts = np.unique(y, return_counts=True)
    priors = dict(zip(classes, class_counts / len(y)))

    # Per-class conditional tables, one distribution per feature column.
    cond_probs = {}
    for cls in classes:
        class_rows = X[y == cls]  # rows of the current class only
        cond_probs[cls] = {
            name: _value_distribution(class_rows[name]) for name in X.columns
        }

    return priors, cond_probs

def naive_bayes_predict(X, priors, cond_probs, unseen_prob=1e-6):
    """Predict a class and a posterior distribution for every row of ``X``.

    Args:
        X: DataFrame of discrete features; each column name is looked up in
            the per-class tables of ``cond_probs``.
        priors: Mapping ``class label -> prior probability``.
        cond_probs: Mapping ``class label -> {feature: {value: P(value | class)}}``.
        unseen_prob: Probability used when a feature or value has no entry in
            a class table. This is a fixed floor, not Laplace smoothing: the
            stored probabilities are not re-normalised.

    Returns:
        A ``(predictions, probabilities)`` tuple: the predicted label for each
        row, and for each row a dict ``class label -> normalised posterior``.
        Both lists follow the row order of ``X``.
    """
    labels = list(cond_probs)
    predictions = []
    probabilities = []  # one posterior dict per row

    for _, row in X.iterrows():
        # Unnormalised log-posterior of every class for this row.
        log_scores = []
        for label in labels:
            feature_tables = cond_probs[label]
            log_score = np.log(priors[label])
            for feature, value in row.items():
                prob = feature_tables.get(feature, {}).get(value, unseen_prob)
                log_score += np.log(prob)
            log_scores.append(log_score)
        log_scores = np.asarray(log_scores, dtype=float)

        # Normalise in log space: subtracting the maximum before exponentiating
        # keeps the best class at exp(0) == 1, so the sum can never underflow
        # to zero even when there are many features. Exponentiating the raw
        # scores would give 0/0 = NaN posteriors in that situation.
        weights = np.exp(log_scores - log_scores.max())
        posterior = dict(zip(labels, weights / weights.sum()))

        probabilities.append(posterior)
        predictions.append(max(posterior, key=posterior.get))

    return predictions, probabilities


In [2]:

def calculate_cll(y_true, probabilities, all_classes):
    """Return the mean conditional log-likelihood (CLL) of the true labels.

    Args:
        y_true: Sequence of true class labels.
        probabilities: Sequence of dicts ``class label -> probability``, paired
            positionally with ``y_true``.
        all_classes: Accepted for interface compatibility; not used here.

    Returns:
        The sum of ``log P(true label)`` over the paired instances divided by
        ``len(y_true)``. A true label missing from an instance's dict
        contributes ``log(1e-6)``.
    """
    per_instance_log_probs = [
        np.log(class_probs.get(actual, 1e-6))
        for actual, class_probs in zip(y_true, probabilities)
    ]
    return sum(per_instance_log_probs) / len(y_true)  # average per instance

def simple_cross_validate(data, target, k=10, random_state=42):
    """Run shuffled k-fold cross-validation of the Naive Bayes model.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target: Name of the target (class) column in ``data``.
        k: Number of folds; must satisfy ``2 <= k <= len(data)``.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        Dict with keys ``mean_accuracy``, ``std_accuracy``, ``mean_cll`` and
        ``std_cll``: mean and ``np.std`` of the per-fold accuracy and of the
        per-fold average conditional log-likelihood.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the row count.
    """
    n_rows = len(data)
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k}")
    if k > n_rows:
        raise ValueError(f"k={k} exceeds the number of rows ({n_rows})")

    # Shuffle once, then split *positions* instead of the DataFrame itself:
    # np.array_split on a DataFrame relies on the deprecated
    # DataFrame.swapaxes and emits a FutureWarning. The folds are identical.
    shuffled = data.sample(frac=1, random_state=random_state)
    fold_positions = np.array_split(np.arange(n_rows), k)

    all_classes = np.unique(data[target])  # every class present in the data

    accuracies = []
    clls = []
    for i, test_pos in enumerate(fold_positions):
        train_pos = np.concatenate(
            [pos for j, pos in enumerate(fold_positions) if j != i]
        )
        train = shuffled.iloc[train_pos]
        test = shuffled.iloc[test_pos]

        # Fit on the k-1 training folds, predict the held-out fold.
        priors, cond_probs = naive_bayes_train(
            train.drop(columns=[target]), train[target]
        )
        predictions, probabilities = naive_bayes_predict(
            test.drop(columns=[target]), priors, cond_probs
        )

        # Metrics for this fold.
        y_test = test[target].to_numpy()
        accuracies.append(np.mean(np.asarray(predictions) == y_test))
        clls.append(calculate_cll(y_test, probabilities, all_classes))

    return {
        "mean_accuracy": np.mean(accuracies),
        "std_accuracy": np.std(accuracies),
        "mean_cll": np.mean(clls),
        "std_cll": np.std(clls)
    }

In [3]:
# Location of the discretized soybean dataset, kept in a single named constant
# so the notebook can be pointed at another copy of the file in one place.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# will only run where this exact file exists.
DATA_PATH = "C:\\Users\\Carlo\\Desktop\\IA\\tercer semestre\\teoria de la info\\bases de datos discretizadas bien\\soybean_dis.csv"

# Load the data and use the last column as the target variable.
data = pd.read_csv(DATA_PATH)
class_column = data.columns[-1]  # last column is treated as the class label

# Cross-validate with the automatically selected target column.
results = simple_cross_validate(data, target=class_column)

# Report the results.
print(f"Simple CV - Mean Accuracy: {results['mean_accuracy']:.6f} ± {results['std_accuracy']:.6f}")
print(f"Simple CV - Mean CLL: {results['mean_cll']:.6f} ± {results['std_cll']:.6f}")

  return bound(*args, **kwds)


Simple CV - Mean Accuracy: 0.926726 ± 0.018774
Simple CV - Mean CLL: -0.399293 ± 0.115535
