In [385]:
from ucimlrepo import fetch_ucirepo
from pandas import concat

Instrucciones.

Diseña una clase que implemente un clasificador bayesiano ingenuo (sin usar funciones de sklearn)


La clase debe construirse recibiendo un único parámetro que indica si la distribución de variables continuas se estimará usando la normal o KDE.


Un metodo ".fit()" que debe recibir como parámetros de entrada:


Los datos de entrenamiento X, y. Donde "X" guarda los valores de las variables y "y" la clase a la que pertenece cada dato.


Un diccionario que indique las variables (indices o nombres de columnas) que son "continuas", "enteras" y "categóricas".


El método ".fit()" debe calcular:
    Las probabilidades a priori de cada clase en Y. Es decir, P(Y=y_i)

    Las funciones de masa o densidad de probabilidad condicionales p(X_j = x | Y = y_i), para cada j=1,...,L y y_i para i=1,...,M, donde L es la dimensión de los datos y M el número de clases.


Para las variables categóricas debe usar la distribución Bernoulli (o en general distribución categórica).


Para las variables enteras calcular la media y estimar el pdf de una distribución Poisson.


Para las variables continuas calcular la media y la varianza de los datos para estimar el pdf de la normal, o estimar el KDE, de acuerdo con el parámetro de la clase.


El método ".predict()" el cual debe:
    Recibir un vector X_pred que debe tener el mismo número de columnas que X.
    Calcular el clasificador bayesiano ingenuo \PI_{j=1}^L p(X_j=x_j | Y=y_i)P(Y=y_i) para cada clase i=1,...,M y devolver el valor de y_i que lo maximiza.


Prueba tu clasificador usando los datos de Heart Disease.


Divide tus datos de entrenamiento y prueba de forma aleatoria. Usa proporciones 70-30 u 80-20.


Elige entre 5 y 7 variables para tu clasificación. Debe incluir por lo menos una categórica, una entera y una continua.


Calcula la exactitud global de tu clasificación.




Recomendaciones:

Para la normal se sugiere utilizar el método ".pdf()" de la función "scipy.stats.norm"


Para la poisson se sugiere utilizar el método ".pdf()" de la función "scipy.stats.poisson"


Para el KDE se sugiere utilizar el método ".evaluate()" de la función "scipy.stats.gaussian_kde"

NOTA: algunos comentarios están en inglés y otros en español dependiendo de quién hizo esa parte, una disculpa la molestia

In [386]:
class Classification:
    def __init__(self, classification_data):
        self.__data = classification_data
        self.__vector = self.create_vector()
        self.__classes = self.create_classes()
        self.__matrix = self.create_matrix()
        self.__set = self.create_set()
        self.__classes = self.create_classes()
        self.__elements_number = len(self.__vector)
    
    def create_vector(self):
        vector = []
        
        for key, value in self.__data.iterrows():
            vector.append(value[0])
            
        return vector
    
    def create_classes(self):
        # Creation of the index set, this indicates the classes exist within the
        # objects passed through the vector representation of the objects
        
        return set(self.__vector)    

    def create_matrix(self):
        # To get the indexes and initialize the matrix
        
        index = self.get_classes()
        matrix = []

        # For each of the classes, we identify from the vector what class
        # they belong to

        for i in self.__vector:
            row = []
            for j in index:
                row.append(1 if i == j else 0)
            matrix.append(row)
            
        return matrix
        
    def create_set(self):
        # Creation of the dictionary that indicates the classes and its objects
        
        set_rep = {i: set() for i in self.get_classes()}
        
        # For each element of the vector representation, we take the element as
        # the set it will be sent to and the index as the element label
        
        for i in range(len(self.__vector)):
            set_rep[self.__vector[i]].add(i)
        
        return set_rep

    def get_vector(self):
        return self.__vector
    
    def get_matrix(self):
        return self.__matrix
        
    def get_set(self):
        return self.__set
    
    def get_elements_number(self):
        return self.__elements_number
    
    def get_classes(self):
        return self.__classes

    def get_class_probability(self, class_id):
        # If the class ID is not in the classifier, the probability is 0,
        # otherwise, the probability is computed as the proportion of
        # instances
        
        if class_id not in self.get_classes():
            return 0
        
        return len(self.get_set()[class_id]) / self.get_elements_number()


In [387]:
from scipy.stats import norm, poisson, gaussian_kde
import numpy as np
from collections import Counter

class BayesianClassifier():
    def __init__(self, normal = True):
        self.__normal = normal
        self.__classification = None
        self.__data_type = None
        self.__p_apriori = None
        self.__x = None
        self.__y = None
        self.__pdfs = {}  # Dictionary to store PDFs for each class and feature
        
    def fit(self, x, y, data_type):
        """
        Fit the Bayesian classifier with training data.

        :params:
        x : Pandas.DataFrame
        A DataFrame containing the attributes of the patterns that will be used to learn the classes. Will have l columns and n rows.

        y : Pandas.DataFrame
        A DataFrame containing a single column that indicates the class of the patterns. Must have n rows.

        data_type : dict
        A dictionary whose length has to be the same as the attributes vector of the training dataframe (l entries). It indicates wether the data of a column is "continous", "integer" or "categoric".
        """
        
        # Obtener las clases únicas
        
        self.__classification = Classification(y)
        self.__data_type = data_type
        
        self.__x = x
        self.__y = y
        
        # Reset indexes to avoid mismatch between y and x
        
        self.__x = x.reset_index(drop=True)
        self.__y = y.reset_index(drop=True)
        
        # Calculate a priori probabilities
        
        self.__p_apriori = {i: self.__classification.get_class_probability(i) for i in self.__classification.get_classes()}
        
        for class_name in self.__classification.get_classes():
            # For each attribute, we calculate the probability density function
            
            self.__pdfs[class_name] = {}
            class_data = self.__x.iloc[list(self.__y.index[self.__y.iloc[:, 0] == class_name]), :]

            for feature_name in self.__x.columns:
                # For each class, we separate the attributes and save them so it doesn't calculate it each time it runs
                
                feature_data = class_data[feature_name]

                if self.__data_type[feature_name] == 'continous':
                    if self.__normal:
                        # Use Gaussian distribution for continuous data
                        mean = np.mean(feature_data)
                        std = np.std(feature_data, ddof=1)
                        self.__pdfs[class_name][feature_name] = {'type': 'normal', 'mean': mean, 'std': std}
                    else:
                        # Use Kernel Density Estimation for non-normal continuous data
                        kde = gaussian_kde(feature_data)
                        self.__pdfs[class_name][feature_name] = {'type': 'kde', 'kde': kde}
                elif self.__data_type[feature_name] == 'integer':
                    # Use Poisson distribution for integer data
                    lambda_ = np.mean(feature_data)
                    self.__pdfs[class_name][feature_name] = {'type': 'poisson', 'lambda': lambda_}
                else:
                    # Use frequency for categorical data
                    feature_counts = Counter(feature_data)
                    total = sum(feature_counts.values())
                    self.__pdfs[class_name][feature_name] = {'type': 'categorical', 'counts': feature_counts, 'total': total}
    
    def conditional_probability(self, class_name, feature_name, feature_value):
        """
        Calculate the conditional probability P(feature_value | class_name).

        :params:
        class_name: str
        The class for which to calculate the conditional probability.

        feature_name: str
        The name of the feature.

        feature_value: float or int or str
        The value of the feature.

        :returns: float
        The conditional probability.
        """
    
        pdf_info = self.__pdfs[class_name][feature_name]

        if pdf_info['type'] == 'normal':
            # Gaussian distribution
            mean = pdf_info['mean']
            std = pdf_info['std']
            return norm.pdf(feature_value, loc=mean, scale=std)
        elif pdf_info['type'] == 'kde':
            # Kernel Density Estimation
            kde = pdf_info['kde']
            return kde.evaluate(feature_value)[0]
        elif pdf_info['type'] == 'poisson':
            # Poisson distribution
            lambda_ = pdf_info['lambda']
            return poisson.pmf(feature_value, mu=lambda_)
        elif pdf_info['type'] == 'categorical':
            # Categorical frequency
            counts = pdf_info['counts']
            total = pdf_info['total']
            if total == 0:
                # Return a small probability if total is zero
                return 1e-10
            return counts.get(feature_value, 0) / total
        else:
            raise ValueError(f"Unknown PDF type: {pdf_info['type']}")    
    
    def predict(self, x_pred):
        """
        Predict the class for a new observation using Naive Bayes

        Arguments:
            x_pred: list or array with feature values in the same order as training data

        :returns: str
            The predicted class
        """
        
        if isinstance(x_pred, list):
            x_pred = np.array(x_pred)

        probs = {}
        for class_value in self.__classification.get_classes():
            # Start with prior probability P(class)
            prob = self.__p_apriori[class_value]

            # Multiply by each conditional probability P(feature|class)
            # This returns the corresponding pseudo-probability that will be compared for each class without having to calculate P[x]
            for i, feature_value in enumerate(x_pred):
                feature_name = self.__x.columns[i]
                prob *= self.conditional_probability(class_value, feature_name, feature_value)

            probs[class_value] = prob

        # Return the class with the highest probability
        return max(probs, key=probs.get)

In [388]:
# fetch dataset

heart_disease = fetch_ucirepo(id = 45) 
  
# data (as pandas dataframes)

x = heart_disease.data.features 
y = heart_disease.data.targets 
  
data = concat([y, x], axis = 1)
data

Unnamed: 0,num,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,2,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,1,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,0,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,0,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,1,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0
299,2,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0
300,3,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0
301,1,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0


In [389]:
# Create train-test splits
split = 0.8

# Dividir el DataFrame aleatoriamente
train_data = data.sample(n=int(split*len(data)), random_state=42)
test_data = data.drop(train_data.index)

class_dict = {'age': 'integer',
                      'sex': 'categorical',
                      'cp': 'categorical',
                      'trestbps': 'integer',
                      'chol': 'integer',
                      'fbs': 'categorical',
                      'restecg': 'categorical',
                      'thalach': 'integer',
                      'exang': 'categorical',
                      'oldpeak': 'continous',
                      'slope': 'categorical',
                      'ca': 'integer',
                      'thal': 'integer'}

In [390]:
X_test = test_data.drop("num", axis = 1)
y_test = test_data.loc[:, ["num"]]
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
13,44,1,2,120,263,0,0,173,0,0.0,1,0.0,7.0
20,64,1,1,110,211,0,2,144,1,1.8,2,0.0,3.0
21,58,0,1,150,283,1,2,162,0,1.0,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,58,0,4,170,225,1,2,146,1,2.8,2,2.0,6.0
290,67,1,3,152,212,0,2,150,0,0.8,2,0.0,7.0
297,57,0,4,140,241,0,0,123,1,0.2,2,0.0,7.0
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0


In [391]:
# Create our class iterations
classifier = BayesianClassifier(normal = True)
classifier.fit(X_test, y_test, class_dict)

  vector.append(value[0])


In [392]:
# Predict over the top element
classifier.predict([63, 1, 1, 145, 233, 1, 2, 150, 0, 2.3, 3, 0.0, 6.0])

0

In [393]:
# Create predictions over the entire test dataset
y_preds = []

for key, value in X_test.iterrows():
    x_vals = list(value)
    y = classifier.predict(x_vals)
    y_preds.append(y)
                   
y_preds

[4,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 1,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 2,
 2,
 1,
 2,
 0,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 3,
 0,
 2,
 3,
 0,
 0,
 2,
 0,
 2,
 0,
 2,
 1,
 2,
 0,
 0]

In [394]:
# We now measure the global precision

correct_preds = sum(1 for pred, real in zip(y_preds, y_test.values) if pred == real)
precision = correct_preds / len(X_test)

print("Global precision:", precision)

Global precision: 0.7704918032786885
