In [37]:
import numpy as np
import pandas as pd

def naive_bayes_train(X, y):
    """Fit a categorical Naive Bayes model by simple frequency counting.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; every column is treated as a discrete attribute.
    y : array-like
        Class label for each row of ``X`` (used as a boolean-mask source,
        so it must line up with ``X``).

    Returns
    -------
    priors : dict
        Maps each class label to its relative frequency in ``y``.
    cond_probs : dict
        ``cond_probs[label][column][value]`` is the relative frequency of
        ``value`` in ``column`` among the rows whose class is ``label``.
        Values never observed for a class are simply absent (no smoothing
        is applied here).
    """

    def _relative_frequencies(values):
        # Distinct values of one attribute and their share within the class.
        distinct, freqs = np.unique(values, return_counts=True)
        return dict(zip(distinct, freqs / freqs.sum()))

    # Prior probability of each class.
    classes, class_sizes = np.unique(y, return_counts=True)
    n_samples = len(y)
    priors = {cls: size / n_samples for cls, size in zip(classes, class_sizes)}

    # Per-class conditional distribution of every attribute.
    cond_probs = {}
    for cls in classes:
        class_rows = X[y == cls]  # only the rows belonging to this class
        cond_probs[cls] = {
            column: _relative_frequencies(class_rows[column])
            for column in X.columns
        }

    return priors, cond_probs

def naive_bayes_predict(X, priors, cond_probs, unseen_prob=1e-6):
    """Predict a class label for every row of ``X`` with a trained model.

    For each row, the score of a class is the log prior plus the sum of the
    log conditional probabilities of the row's attribute values; the class
    with the highest score wins (ties go to the class that appears first in
    ``cond_probs``).

    Parameters
    ----------
    X : pandas.DataFrame
        Rows to classify; column names are looked up in ``cond_probs``.
    priors : dict
        Class label -> prior probability.
    cond_probs : dict
        ``cond_probs[label][column][value]`` -> conditional probability.
    unseen_prob : float, default 1e-6
        Fallback probability used when a (column, value) pair was not
        recorded for a class. This is a fixed floor, not Laplace smoothing:
        the stored probabilities are not re-normalised.

    Returns
    -------
    list
        One predicted label per row of ``X``, in row order.

    Raises
    ------
    ValueError
        If ``unseen_prob`` is not in the interval (0, 1].
    """
    if not 0 < unseen_prob <= 1:
        raise ValueError("unseen_prob must be in the interval (0, 1]")

    # Take logarithms once up front instead of once per row/class/feature.
    log_unseen = np.log(unseen_prob)
    log_model = {
        label: (
            np.log(priors[label]),
            {
                feature: {value: np.log(p) for value, p in dist.items()}
                for feature, dist in features_probs.items()
            },
        )
        for label, features_probs in cond_probs.items()
    }

    predictions = []
    for _, row in X.iterrows():  # classify one instance at a time
        label_scores = {}
        for label, (log_prior, log_features) in log_model.items():
            score = log_prior
            for feature, value in row.items():
                # Unknown feature or unseen value -> fall back to the floor.
                score += log_features.get(feature, {}).get(value, log_unseen)
            label_scores[label] = score
        # Pick the class with the largest log-posterior score.
        predictions.append(max(label_scores, key=label_scores.get))
    return predictions

In [38]:
def simple_cross_validate(data, target, k=10):
    """Estimate Naive Bayes accuracy with plain (unstratified) k-fold CV.

    The rows of ``data`` are shuffled with a fixed seed (42) and cut into
    ``k`` consecutive folds; each fold serves once as the test set while the
    remaining folds form the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, features plus the target column.
    target : str
        Name of the class-label column in ``data``.
    k : int, default 10
        Number of folds.

    Returns
    -------
    tuple
        ``(mean_accuracy, std_accuracy)`` over the ``k`` folds.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 (no training data would remain).
    """
    if k < 2:
        raise ValueError("k must be at least 2 for cross-validation")

    shuffled = data.sample(frac=1, random_state=42)  # random row order
    # Split row *positions* rather than the DataFrame itself: calling
    # np.array_split directly on a DataFrame goes through the deprecated
    # DataFrame.swapaxes and emits a FutureWarning on recent pandas.
    fold_positions = np.array_split(np.arange(len(shuffled)), k)
    folds = [shuffled.iloc[positions] for positions in fold_positions]

    accuracies = []
    for i, test in enumerate(folds):
        train = pd.concat(folds[:i] + folds[i + 1:])
        priors, cond_probs = naive_bayes_train(train.drop(target, axis=1), train[target])  # fit model
        predictions = naive_bayes_predict(test.drop(target, axis=1), priors, cond_probs)
        # Compare as plain arrays so the result does not depend on index
        # alignment or on list-vs-Series reflected comparison.
        hits = np.asarray(predictions) == test[target].to_numpy()
        accuracies.append(np.mean(hits))

    return np.mean(accuracies), np.std(accuracies)

In [39]:
def stratified_cross_validate(data, target, k=10):
    """Estimate Naive Bayes accuracy with stratified k-fold CV.

    Rows are shuffled with a fixed seed (42); then each class is cut into
    ``k`` chunks and the i-th chunk of every class is placed in fold ``i``,
    so every fold receives a share of each class.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, features plus the target column.
    target : str
        Name of the class-label column in ``data``.
    k : int, default 10
        Number of folds.

    Returns
    -------
    tuple
        ``(mean_accuracy, std_accuracy)`` over the ``k`` folds.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 (no training data would remain).
    """
    if k < 2:
        raise ValueError("k must be at least 2 for cross-validation")

    shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle rows

    # Stratify: collect the i-th chunk of every class for fold i. Chunks are
    # gathered in lists and concatenated once per fold, and positions (not
    # the DataFrame) are split, which avoids np.array_split's deprecated
    # DataFrame.swapaxes path.
    fold_parts = [[] for _ in range(k)]
    for _, group_data in shuffled.groupby(target):
        chunk_positions = np.array_split(np.arange(len(group_data)), k)
        for i, positions in enumerate(chunk_positions):
            fold_parts[i].append(group_data.iloc[positions])
    folds = [pd.concat(parts, ignore_index=True) for parts in fold_parts]

    accuracies = []
    for i, test in enumerate(folds):
        train = pd.concat(folds[:i] + folds[i + 1:], ignore_index=True)
        priors, cond_probs = naive_bayes_train(train.drop(target, axis=1), train[target])  # fit model
        predictions = naive_bayes_predict(test.drop(target, axis=1), priors, cond_probs)
        # Compare as plain arrays so the result does not depend on index
        # alignment or on list-vs-Series reflected comparison.
        hits = np.asarray(predictions) == test[target].to_numpy()
        accuracies.append(np.mean(hits))

    return np.mean(accuracies), np.std(accuracies)

In [40]:
# Load the discretized iris dataset from CSV.
# NOTE(review): this is an absolute, machine-specific Windows path — the
# notebook will not run on another machine unless the path is adjusted.
data = pd.read_csv("C:\\Users\\Carlo\\Desktop\\IA\\segundo semestre\\apredizaje automatico\\tarea\\base de datos discretizadas\\discretized_iris.csv")

# Run simple k-fold cross-validation using 'class' as the target column
mean_accuracy, std_deviation = simple_cross_validate(data, 'class')
print(f"Simple CV - Mean Accuracy: {mean_accuracy:.4f}, Std Deviation: {std_deviation:.4f}")

# Run stratified k-fold cross-validation (overwrites mean_accuracy / std_deviation from the simple run)
mean_accuracy, std_deviation = stratified_cross_validate(data, 'class')
print(f"Stratified CV - Mean Accuracy: {mean_accuracy:.4f}, Std Deviation: {std_deviation:.4f}")


Simple CV - Mean Accuracy: 0.9400, Std Deviation: 0.0554
Stratified CV - Mean Accuracy: 0.9333, Std Deviation: 0.0667


  return bound(*args, **kwds)
  return bound(*args, **kwds)
