In [36]:
import pandas as pd
import numpy as np

def calculate_caim(quanta_matrix):
    max_values = np.max(quanta_matrix, axis=0)
    sums = np.sum(quanta_matrix, axis=0)
    # Evitar la división por cero y situaciones de indeterminación matemática
    caim = np.sum((max_values ** 2) / np.where(sums == 0, 1, sums)) / len(sums) if np.sum(sums) != 0 else 0
    return caim


def create_quanta_matrix(data, attribute, intervals, classes, class_label):
    quanta_matrix = np.zeros((len(classes), len(intervals) - 1))
    for idx, cl in enumerate(classes):
        class_data = data[data[class_label] == cl][attribute]
        for i in range(1, len(intervals)):
            quanta_matrix[idx, i - 1] = class_data[(class_data >= intervals[i - 1]) & (class_data < intervals[i])].count()
    return quanta_matrix

def caim_discretization(data, attribute, class_label):
    values = data[attribute].dropna().unique()
    classes = data[class_label].unique()
    print(f'Classes found: {classes}')  # Imprimir las clases encontradas

    min_value = np.min(values)
    max_value = np.max(values)
    values_sorted = np.sort(values)
    mid_points = (values_sorted[:-1] + values_sorted[1:]) / 2
    boundaries = np.concatenate(([min_value], mid_points, [max_value]))
    intervals = [min_value, max_value]

    global_caim = 0

    while True:
        best_caim = global_caim
        best_interval = None

        for boundary in boundaries:
            if boundary not in intervals:
                test_intervals = sorted(intervals + [boundary])
                quanta_matrix = create_quanta_matrix(data, attribute, test_intervals, classes, class_label)
                caim_value = calculate_caim(quanta_matrix)

                if caim_value > best_caim:
                    best_caim = caim_value
                    best_interval = boundary

        if best_interval is not None and best_caim > global_caim:
            intervals.append(best_interval)
            intervals = sorted(intervals)
            global_caim = best_caim
        else:
            break

    data[attribute] = pd.cut(data[attribute], bins=intervals, labels=range(len(intervals) - 1), include_lowest=True, right=True)
    return intervals

In [56]:
data = pd.read_csv("C:\\Users\\Carlo\\Desktop\\IA\\segundo semestre\\apredizaje automatico\\tarea\\bases de datos\\winequality-white.csv")
print(data.columns)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [57]:
# Especifica manualmente las columnas de atributos y la columna de clase
attributes = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']
class_label = 'quality'
# Se eliminan las filas que contengan valores NaN en la columna 'class' y se actualiza
data = data.dropna(subset=[class_label]) 

# Aplicar la discretización CAIM a cada atributo especificado
discretization_results = {}
for attribute in attributes:
    intervals = caim_discretization(data, attribute, class_label)
    discretization_results[attribute] = intervals

# Imprimir los resultados
for attribute, intervals in discretization_results.items():
    print(f"Intervalos para {attribute}: {intervals}")

Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Classes found: [6 5 7 8 4 3 9]
Intervalos para fixed acidity: [3.8, 8.55, 14.2]
Intervalos para volatile acidity: [0.08, 0.3025, 1.1]
Intervalos para citric acid: [0.0, 0.195, 1.66]
Intervalos para residual sugar: [0.6, 17.65, 65.8]
Intervalos para chlorides: [0.009, 0.0495, 0.346]
Intervalos para free sulfur dioxide: [2.0, 13.5, 289.0]
Intervalos para total sulfur dioxide: [9.0, 172.5, 440.0]
Intervalos para density: [0.98711, 0.995735, 1.03898]
Intervalos para pH: [2.72, 3.1950000000000003, 3.82]
Intervalos para sulphates: [0.22, 0.325, 1.08]
Intervalos para alcohol: [8.0, 9.5166666665, 14.2]


In [58]:
data.to_csv('discretized_winequality-white.csv', index=False)