In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Read the raw table from disk.
data_path = 'water_potability.csv'
df = pd.read_csv(data_path)

# Fill missing entries with the mean of their column. The imputer is applied to
# the whole frame, so the 'Potability' target column goes through it as well.
# NOTE(review): the imputer is fit on every row, including the rows that are
# later held out for testing — consider fitting it on the training split only.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed_values, columns=df.columns)

# Separate the target vector from the feature matrix.
y = df_imputed['Potability']
X = df_imputed.drop(columns='Potability')


In [None]:

def entropy(labels):
    """Return the Shannon entropy, in bits, of the values in ``labels``.

    The class distribution is estimated from the relative frequency of each
    distinct value found by ``np.unique``.
    """
    class_counts = np.unique(labels, return_counts=True)[1]
    freqs = class_counts / class_counts.sum()
    # Every frequency is strictly positive, so log2 never sees a zero here.
    return -(freqs * np.log2(freqs)).sum()

def information_gain(data, split_attribute, target_attribute):
    """Return the entropy reduction obtained by splitting on ``split_attribute``.

    The gain is the entropy of ``data[target_attribute]`` minus the
    size-weighted entropy of the target within each distinct value of
    ``data[split_attribute]``.
    """
    targets = data[target_attribute]
    column = data[split_attribute]
    parent_entropy = entropy(targets)

    distinct_values, occurrences = np.unique(column, return_counts=True)
    n_rows = np.sum(occurrences)

    # One term per distinct value: (share of rows) * (entropy of that partition).
    partition_terms = [
        (count / n_rows) * entropy(targets[column == value])
        for value, count in zip(distinct_values, occurrences)
    ]
    children_entropy = np.sum(partition_terms)

    return parent_entropy - children_entropy


In [None]:

def best_attribute(data, attributes, target_attribute):
    """Return the attribute with the highest information gain.

    When several attributes share the highest gain, the one that comes first
    in ``attributes`` is returned. Raises ``ValueError`` if ``attributes`` is
    empty.
    """
    def gain_of(attr):
        return information_gain(data, attr, target_attribute)

    return max(attributes, key=gain_of)

def _majority_class(labels):
    """Return the most frequent value in ``labels``; ties go to the smallest value."""
    classes, counts = np.unique(labels, return_counts=True)
    return classes[np.argmax(counts)]


def id3(data, original_data, attributes, target_attribute="Potability", parent_node_class=None):
    """Recursively build an ID3 decision tree.

    Parameters
    ----------
    data : table of the examples that reach the current node.
    original_data : the full table the top-level call started from; used as a
        fallback source of labels when ``data`` is empty.
    attributes : list of column names still available for splitting.
    target_attribute : name of the label column.
    parent_node_class : majority label of the parent node; only used as a last
        resort when neither ``data`` nor ``original_data`` has any rows.

    Returns
    -------
    Either a leaf label, or a nested dict of the form
    ``{attribute: {attribute_value: subtree_or_label, ...}}``.
    """
    # Emptiness must be tested before purity: np.unique of an empty column is
    # itself empty, so indexing its first element would raise IndexError.
    if len(data) == 0:
        if len(original_data) == 0:
            return parent_node_class
        return _majority_class(original_data[target_attribute])

    classes = np.unique(data[target_attribute])
    if len(classes) == 1:
        # Pure node: every remaining example carries the same label.
        return classes[0]

    node_class = _majority_class(data[target_attribute])

    if len(attributes) == 0:
        # No attribute left to split on: label the leaf with the majority class
        # of the examples at THIS node (not the parent's majority, which is
        # None at the root and may disagree with the examples that got here).
        return node_class

    best_attr = best_attribute(data, attributes, target_attribute)
    remaining_attributes = [attr for attr in attributes if attr != best_attr]

    branches = {}
    for value in np.unique(data[best_attr]):
        subset = data[data[best_attr] == value]
        branches[value] = id3(subset, original_data, remaining_attributes, target_attribute, node_class)

    return {best_attr: branches}

# Every feature column is a candidate split; the tree is grown on the complete
# imputed table and shown as the cell output.
attributes = list(X.columns)
tree = id3(df_imputed, df_imputed, attributes, target_attribute="Potability")
tree


In [None]:

def predict_with_tree(tree, sample, default=None):
    """Walk ``tree`` for one ``sample`` and return the label of the leaf reached.

    Parameters
    ----------
    tree : nested dict ``{attribute: {value: subtree_or_label}}`` or a bare
        leaf label.
    sample : mapping-like object exposing ``.get(attribute)`` (for example a
        dict or a pandas Series).
    default : value returned when the walk cannot continue, i.e. the sample's
        value for the tested attribute has no branch in the tree (this includes
        a missing attribute, for which ``.get`` yields None) or a node is an
        empty dict. Defaults to None.

    Returns
    -------
    The leaf label, or ``default`` when no matching branch exists.
    """
    node = tree
    while isinstance(node, dict):
        if not node:
            # An empty dict has no attribute to test; treat it as "no answer"
            # instead of letting StopIteration escape from next().
            return default
        attribute = next(iter(node))
        branches = node[attribute]
        value = sample.get(attribute)
        if value not in branches:
            return default
        node = branches[value]
    return node


In [None]:

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a separate tree on the training rows only. Scoring a tree that was grown
# on the full table would evaluate it on rows it has already seen, which
# reports training accuracy instead of generalisation.
train_df = X_train.join(y_train)
train_tree = id3(train_df, train_df, X_train.columns.tolist(), target_attribute=y_train.name)

y_pred = [predict_with_tree(train_tree, row) for _, row in X_test.iterrows()]

# The tree answers None for attribute values it has no branch for. Those rows
# are left out of the metrics below, so the share of rows actually scored is
# reported alongside them.
valid_indexes = [i for i, pred in enumerate(y_pred) if pred is not None]
y_test_filtered = y_test.iloc[valid_indexes]
y_pred_filtered = [y_pred[i] for i in valid_indexes]
coverage = len(valid_indexes) / len(y_pred) if y_pred else 0.0

if valid_indexes:
    accuracy = accuracy_score(y_test_filtered, y_pred_filtered)
    classification_report_text = classification_report(y_test_filtered, y_pred_filtered)
else:
    # Nothing could be scored: avoid calling the metric functions on empty input.
    accuracy = float("nan")
    classification_report_text = "No hay predicciones válidas para evaluar."

print(f"Cobertura: {coverage * 100:.2f}% ({len(valid_indexes)}/{len(y_pred)} muestras con predicción)")
print(f"Precisión: {accuracy * 100:.2f}%")
print("Reporte de Clasificación:")
print(classification_report_text)
