In [29]:
import numpy as np
import pandas as pd
from graphviz import Digraph

# 1. Read the insurance dataset (path is relative to the notebook's working directory)
file_path = "insurance.csv"
df = pd.read_csv(file_path)

# 2. Pull the 'bmi' column out of the frame as a plain array of values
bmi_values = df.loc[:, "bmi"].values

# 3. Modified Isolation Tree class
class IsolationTree:
    """Binary tree that recursively partitions 1-D numeric data at random thresholds.

    Each internal node draws a threshold uniformly between the minimum and
    maximum of the data it receives, sends values below the threshold to the
    left child and the remaining values to the right child. Nodes that do not
    split (leaves) keep ``split_value`` and ``split_point`` set to ``None``.

    Parameters
    ----------
    depth : int, default 0
        Depth of this node in the tree; children are created with ``depth + 1``.
    max_depth : int or None, default None
        Depth at which nodes stop splitting. ``None`` means no depth limit.
    """

    def __init__(self, depth=0, max_depth=None):
        self.depth = depth
        self.max_depth = max_depth
        self.split_value = None  # Random threshold that actually partitions the data
        self.split_point = None  # Randomly sampled data value stored with the node; NOT used to partition
        self.left = None  # Subtree for values < split_value (None if there are none)
        self.right = None  # Subtree for values >= split_value (None if there are none)

    def fit(self, data):
        """Recursively grow the tree from ``data``.

        Parameters
        ----------
        data : array-like
            1-D numeric values to partition. Converted with ``np.asarray``.

        Returns
        -------
        None
            The tree is built in place. The node is left as a leaf when the
            depth limit is reached, when it holds at most one value, or when
            all of its values are identical.
        """
        data = np.asarray(data)

        depth_exhausted = self.max_depth is not None and self.depth >= self.max_depth
        if depth_exhausted or len(data) <= 1:
            return

        low, high = data.min(), data.max()
        # Identical values cannot be separated by any threshold. Splitting here
        # would push the same data to the right child over and over, which
        # never terminates when max_depth is None.
        if low == high:
            return

        # Sample one of the data values; it is only stored, the partition below
        # is driven by split_value.
        self.split_point = np.random.choice(data)
        self.split_value = np.random.uniform(low, high)

        left_data = data[data < self.split_value]
        right_data = data[data >= self.split_value]

        if len(left_data) > 0:
            self.left = IsolationTree(depth=self.depth + 1, max_depth=self.max_depth)
            self.left.fit(left_data)
        if len(right_data) > 0:
            self.right = IsolationTree(depth=self.depth + 1, max_depth=self.max_depth)
            self.right.fit(right_data)

# 4. Build a tree with maximum depth 8
# IsolationTree.fit draws its splits from NumPy's global RNG, so seed it first:
# otherwise every run of this cell produces a different tree and diagram.
np.random.seed(42)
tree = IsolationTree(max_depth=8)
tree.fit(bmi_values)

# 5. Build the Graphviz diagram, labelling each split node with its stored split_point
def build_graphviz(tree, graph=None, name="Raíz"):
    """Recursively add ``tree`` and its descendants to a Graphviz digraph.

    Parameters
    ----------
    tree : node object
        Node exposing ``split_point``, ``split_value``, ``left`` and ``right``.
    graph : graph object or None, default None
        Graph to draw into. When ``None`` (the top-level call) a new
        ``Digraph`` with the default node/edge styling is created.
    name : str, default "Raíz"
        Identifier of the node being drawn; children get ``_Izq`` / ``_Der``
        appended, so identifiers encode the path from the root.

    Returns
    -------
    The graph that was drawn into (the one passed in, or the newly created one).
    """
    is_top_level_call = graph is None
    if is_top_level_call:
        graph = Digraph()
        graph.attr('node', shape='box', style='rounded', color='lightblue2')
        graph.attr('edge', fontsize='10')

    if tree.split_point is not None:
        # Split node: show the data value stored with the node
        node_label = f"BMI: {tree.split_point:.1f}"
    elif is_top_level_call:
        # The tree never split, so its only node is the root
        node_label = "Raíz"
    else:
        # Node without a split below the root is a leaf, not another root
        node_label = "Hoja"
    graph.node(name, label=node_label)

    # Edge labels carry the threshold that actually routed the data
    if tree.left is not None:
        left_name = f"{name}_Izq"
        graph.edge(name, left_name, label=f"< {tree.split_value:.2f}")
        build_graphviz(tree.left, graph, left_name)

    if tree.right is not None:
        right_name = f"{name}_Der"
        graph.edge(name, right_name, label=f"≥ {tree.split_value:.2f}")
        build_graphviz(tree.right, graph, right_name)

    return graph

# 6. Build the diagram and place a title above it
graph = build_graphviz(tree)
graph.attr(
    label="Árbol de Aislamiento - Valores de BMI\n(Se muestra el valor concreto usado en cada partición)",
    labelloc="t",
    fontsize="12",
    fontname="Arial",
)

# Layout tweaks: spacing between sibling nodes and between ranks
graph.graph_attr.update(nodesep='0.4', ranksep='0.4')

# Write the PNG, open it in the default viewer and remove the intermediate source file
graph.render('isolation_tree_bmi_particion', view=True, format='png', cleanup=True)

'isolation_tree_bmi_particion.png'

## Inciso 2

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

In [None]:
# Outliers are only looked for in the numeric variables
numericas = ['age', 'bmi', 'children', 'charges']
df_numeric = df[numericas]

# Standardize the numeric columns and wrap the result back into a DataFrame
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_numeric), columns=numericas)

# Fit the forest; fit_predict marks outliers with -1, stored here as a boolean flag
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df['outlier_iso'] = iso_forest.fit_predict(df_scaled) == -1

Introducimos el valor de BMI que queremos buscar, tomado de un nodo del árbol, para comprobar si aparece entre los outliers detectados.

In [27]:
# BMI of the rows flagged as outliers by the Isolation Forest
resultado_iso = df.loc[df['outlier_iso'], ['bmi']]

# Look up the BMI value of interest among those outliers (exact match)
busqueda = resultado_iso.loc[resultado_iso['bmi'] == 28.6]
busqueda

Unnamed: 0,bmi
32,28.6
