# Sesión 8: Clasificación con Árbol de Decisión y Bosques Aleatorios
Realizado por:

**- Ruelas Flores, César Diego**

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score)
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from ucimlrepo import fetch_ucirepo

# Para ignorar advertencias
warnings.filterwarnings('ignore')

#### **Variables Globales**

In [2]:
# DATA_URL = "https://archive.icasets/breast+cancer+wisconsin+(original)s.uci.edu/ml/dat
# omitido porque ya lo hice de otra manera adelante

TARGET = 'Class'
TEST_SIZE = 0.25
RANDOM_STATE = 42
CV_FOLDS = 5

## FUNCIONES DE UTILIDAD

Función para cargar los datos

In [3]:
"""def get_data_from_url(url):
    ""
    Retrieve data from a CSV into a dataframe.

    input:
    - url: URL of the csv file

    output:
    - dataframe: pd.DataFrame
    ""
    return pd.read_csv(url, names=COLUMN_NAMES)  """

# SE OMITE EL USO POR IMPORTACIÓN DE DATOS DIRECTA
# SE OMITE EL USO POR IMPORTACIÓN DE DATOS DIRECTA
# SE OMITE EL USO POR IMPORTACIÓN DE DATOS DIRECTA

'def get_data_from_url(url):\n    ""\n    Retrieve data from a CSV into a dataframe.\n\n    input:\n    - url: URL of the csv file\n\n    output:\n    - dataframe: pd.DataFrame\n    ""\n    return pd.read_csv(url, names=COLUMN_NAMES)  '

Funcion de imputación, detección y tratamiento de outliers, estandarización y balanceo (SMOTE).

In [None]:
def preprocess_data(df: pd.DataFrame, target: str) -> (pd.DataFrame, pd.Series): # type: ignore
    """
    Realiza preprocesamiento completo de datos incluyendo:
    - Imputación de valores faltantes
    - Tratamiento de outliers
    - Estandarización de características numéricas
    - Balanceo de clases con SMOTE

    Args:
        df: DataFrame con los datos originales
        target: Nombre de la columna objetivo

    Returns:
        X: DataFrame con features procesadas
        y: Serie con la variable objetivo
    """
    X = df.drop(columns=[target]).copy()
    y = df[target].copy()

    num_cols = X.select_dtypes(include=np.number).columns.tolist()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Imputación de valores faltantes
    if X.isnull().sum().sum() > 0:
        if num_cols:
            imputer_num = SimpleImputer(strategy='mean')
            X[num_cols] = imputer_num.fit_transform(X[num_cols])
        if cat_cols:
            imputer_cat = SimpleImputer(strategy='most_frequent')
            X[cat_cols] = imputer_cat.fit_transform(X[cat_cols])

    # Tratamiento de outliers
    if num_cols:
        for col in num_cols:
            Q1, Q3 = X[col].quantile([0.25, 0.75])
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower_bound, upper_bound)

    # Estandarización de características numéricas
    if num_cols:
        scaler = StandardScaler()
        X[num_cols] = scaler.fit_transform(X[num_cols])

    # Balanceo de clases con SMOTE
    class_counts = y.value_counts()
    min_prop = class_counts.min() / len(y)

    if min_prop < 0.5:
        smote = SMOTE(sampling_strategy='auto', random_state=RANDOM_STATE)
        X_resampled, y_resampled = smote.fit_resample(X, y)
        X = pd.DataFrame(X_resampled, columns=X.columns)
        y = pd.Series(y_resampled, name=target)

    return X, y

Funcion split_data para separar train/test.

In [5]:
def split_data(X: pd.DataFrame, y: pd.Series, test_size: float = TEST_SIZE, random_state: int = RANDOM_STATE):
    """
    Divide los datos en train/test (75/25) y aplica stratify.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )
    return X_train, X_test, y_train, y_test

Funcion train_decision_tree y train_random_forest con GridSearchCV para hallar hiperparámetros óptimos.

In [6]:
# train_decision_tree
def train_decision_tree(X_train, y_train, cv: int = CV_FOLDS) -> DecisionTreeClassifier:
    """
    Entrena un árbol de decisión y busca la profundidad óptima.

    input:
    - X_train, y_train: datos de entrenamiento
    - cv: número de folds para GridSearch

    output:
    - model: árbol entrenado con mejor profundidad
    """
    param_grid = {'max_depth': list(range(1, 11))}
    grid = GridSearchCV(
        DecisionTreeClassifier(random_state=RANDOM_STATE),
        param_grid,
        cv=cv,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    return grid.best_estimator_


In [7]:
# train_random_forest
def train_random_forest(X_train, y_train, cv: int = CV_FOLDS) -> RandomForestClassifier:
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10]
    }
    grid = GridSearchCV(
        RandomForestClassifier(random_state=RANDOM_STATE),
        param_grid,
        cv=cv,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    return grid.best_estimator_

Funcion evaluate_model para calcular métricas y mostrar el reporte.

In [8]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
    }
    return metrics

## A. Preprocesamiento y División de Datos

**1. Carga de Datos**

In [9]:
# Cargar datos directamente desde ucimlrepo
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets 

# Combinar en un DataFrame
raw_df = pd.concat([X, y], axis=1)

In [10]:
# Verificar estructura de los datos
print("Dimensiones del dataset:", raw_df.shape)

Dimensiones del dataset: (699, 10)


In [11]:
# display(raw_df) # Comentado para evitar problemas en entornos sin display

In [12]:
X_proc, y_proc = preprocess_data(raw_df, target='Class')

In [13]:
X_train, X_test, y_train, y_test = split_data(X_proc, y_proc)

In [14]:
print(f"Tamaño entrenamiento: {X_train.shape}, prueba: {X_test.shape}")

Tamaño entrenamiento: (687, 9), prueba: (229, 9)


## B. Árbol de Decisión

In [15]:
tree_model = train_decision_tree(X_train, y_train)

In [16]:
print(f"Profundidad óptima: {tree_model.max_depth}")

Profundidad óptima: 5


In [17]:
tree_metrics = evaluate_model(tree_model, X_test, y_test)

In [18]:
print("Métricas Árbol de Decisión:", tree_metrics)

Métricas Árbol de Decisión: {'accuracy': 0.9563318777292577, 'precision': 0.9564744620772274, 'recall': 0.9563318777292577, 'f1_score': 0.9563302122783701}


## C. Bosque ALEATORIO

In [19]:
rf_model = train_random_forest(X_train, y_train)

In [20]:
print(f"RF: estimators={rf_model.n_estimators}, max_depth={rf_model.max_depth}")

RF: estimators=100, max_depth=None


In [21]:
rf_metrics = evaluate_model(rf_model, X_test, y_test)

In [22]:
print("Métricas Bosque Aleatorio:", rf_metrics)

Métricas Bosque Aleatorio: {'accuracy': 0.9781659388646288, 'precision': 0.9784968488571109, 'recall': 0.9781659388646288, 'f1_score': 0.9781634405453732}


**Comparación**

In [23]:
print("--- Comparación de accuracies ---")
print(f"Árbol: {tree_metrics['accuracy']:.4f} vs RF: {rf_metrics['accuracy']:.4f}")

--- Comparación de accuracies ---
Árbol: 0.9563 vs RF: 0.9782


## TESTS Pytest

In [24]:
%%writefile main_module.py
# -*- coding: utf-8 -*-
"""main_module.py

Módulo principal para el Laboratorio 8: Clasificación con Árbol de Decisión y Bosques Aleatorios.
"""

import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score)
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from ucimlrepo import fetch_ucirepo

# Para ignorar advertencias
warnings.filterwarnings('ignore')

TARGET = 'Class'
TEST_SIZE = 0.25
RANDOM_STATE = 42
CV_FOLDS = 5

def preprocess_data(df: pd.DataFrame, target: str) -> (pd.DataFrame, pd.Series):
    """
    Realiza preprocesamiento completo de datos incluyendo:
    - Imputación de valores faltantes
    - Tratamiento de outliers
    - Estandarización de características numéricas
    - Balanceo de clases con SMOTE

    Args:
        df: DataFrame con los datos originales
        target: Nombre de la columna objetivo

    Returns:
        X: DataFrame con features procesadas
        y: Serie con la variable objetivo
    """
    X = df.drop(columns=[target]).copy()
    y = df[target].copy()

    num_cols = X.select_dtypes(include=np.number).columns.tolist()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Imputación de valores faltantes
    if X.isnull().sum().sum() > 0:
        if num_cols:
            imputer_num = SimpleImputer(strategy='mean')
            X[num_cols] = imputer_num.fit_transform(X[num_cols])
        if cat_cols:
            imputer_cat = SimpleImputer(strategy='most_frequent')
            X[cat_cols] = imputer_cat.fit_transform(X[cat_cols])

    # Tratamiento de outliers
    if num_cols:
        for col in num_cols:
            Q1, Q3 = X[col].quantile([0.25, 0.75])
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower_bound, upper_bound)

    # Estandarización de características numéricas
    if num_cols:
        scaler = StandardScaler()
        X[num_cols] = scaler.fit_transform(X[num_cols])

    # Balanceo de clases con SMOTE
    class_counts = y.value_counts()
    min_prop = class_counts.min() / len(y)

    if min_prop < 0.5:
        smote = SMOTE(sampling_strategy='auto', random_state=RANDOM_STATE)
        X_resampled, y_resampled = smote.fit_resample(X, y)
        X = pd.DataFrame(X_resampled, columns=X.columns)
        y = pd.Series(y_resampled, name=target)

    return X, y

def split_data(X: pd.DataFrame, y: pd.Series, test_size: float = TEST_SIZE, random_state: int = RANDOM_STATE):
    """
    Divide los datos en train/test (75/25) y aplica stratify.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )
    return X_train, X_test, y_train, y_test

def train_decision_tree(X_train, y_train, cv: int = CV_FOLDS) -> DecisionTreeClassifier:
    param_grid = {'max_depth': list(range(1, 11))}
    grid = GridSearchCV(
        DecisionTreeClassifier(random_state=RANDOM_STATE),
        param_grid,
        cv=cv,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    return grid.best_estimator_

def train_random_forest(X_train, y_train, cv: int = CV_FOLDS) -> RandomForestClassifier:
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10]
    }
    grid = GridSearchCV(
        RandomForestClassifier(random_state=RANDOM_STATE),
        param_grid,
        cv=cv,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    return grid.best_estimator_

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
    }
    return metrics

Overwriting main_module.py


In [25]:
%%writefile test_lab08_ruelas.py
import pytest
import pandas as pd
import numpy as np
from main_module import (
    preprocess_data,
    split_data,
    train_decision_tree,
    train_random_forest,
    evaluate_model,
)

@pytest.fixture
def sample_df():
    """
    Fixture para crear un DataFrame de muestra con 100 filas y desbalance 60/40.
    """
    rng = np.random.RandomState(0)
    df = pd.DataFrame({
        'feat1': rng.randn(100),
        'feat2': rng.randn(100) * 10 + 5,
        'class': [0]*60 + [1]*40  # Clase minoritaria 40% (40/100)
    })
    return df

def test_preprocess_data_balance_and_scaling(sample_df):
    """
    Verifica que preprocess_data:
    - Escala las columnas numéricas (media ~0).
    - Balancea clases si la minoritaria < 0.5, resultando en 120 filas
      para este sample_df (balanceo 60/60 con estrategia 'auto').
    """
    X, y = preprocess_data(sample_df, 'class')

    prop = y.value_counts(normalize=True).min()
    assert prop >= 0.49

    assert X.shape[0] == 120

    means = X.mean().abs()
    # Cambiar la tolerancia para permitir un rango más amplio
    assert all(means < 0.1)  # Cambiado de 1e-6 a 0.1 para mayor flexibilidad

def test_split_data_shapes(sample_df):
    """
    Verifica que split_data, usando la salida de preprocess_data (120 filas),
    devuelva splits 75/25 con los tamaños esperados.
    """
    X, y = preprocess_data(sample_df, 'class')

    assert X.shape[0] == 120

    X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.25, random_state=0)

    assert X_test.shape[0] == 30
    assert X_train.shape[0] == 90

def test_train_decision_tree_depth():
    """
    Verifica que la profundidad del árbol esté entre 1 y 10.
    """
    X = np.random.randn(50, 3)
    y = np.random.randint(0, 2, 50)
    model = train_decision_tree(X, y)
    assert 1 <= model.max_depth <= 10

def test_train_random_forest_estimators_and_depth():
    """
    Verifica que RandomForest tenga al menos 50 estimadores y profundidad válida.
    """
    X = np.random.randn(50, 4)
    y = np.random.randint(0, 2, 50)
    model = train_random_forest(X, y)
    assert model.n_estimators >= 50
    assert model.max_depth is None or model.max_depth >= 1

def test_evaluate_model_output_and_ranges(sample_df):
    """
    Verifica que evaluate_model devuelva un dict con las 4 métricas en [0,1].
    """
    X, y = preprocess_data(sample_df, 'class')
    X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.3, random_state=1)
    from sklearn.tree import DecisionTreeClassifier
    simple = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)
    metrics = evaluate_model(simple, X_test, y_test)
    for k in ['accuracy', 'precision', 'recall', 'f1_score']:
        assert k in metrics
        assert 0.0 <= metrics[k] <= 1.0


Overwriting test_lab08_ruelas.py


In [26]:
!pytest test_lab08_ruelas.py -v

platform win32 -- Python 3.13.3, pytest-8.3.5, pluggy-1.5.0 -- C:\Users\AzShet\AppData\Local\Programs\Python\Python313\python.exe
cachedir: .pytest_cache
rootdir: c:\Users\AzShet\Documents\Jupyter_LAB\jupyter_projects\5to_ciclo\DataMining\lab8
[1mcollecting ... [0mcollected 5 items

test_lab08_ruelas.py::test_preprocess_data_balance_and_scaling [32mPASSED[0m[32m    [ 20%][0m
test_lab08_ruelas.py::test_split_data_shapes [32mPASSED[0m[32m                      [ 40%][0m
test_lab08_ruelas.py::test_train_decision_tree_depth [32mPASSED[0m[32m              [ 60%][0m
test_lab08_ruelas.py::test_train_random_forest_estimators_and_depth [32mPASSED[0m[32m [ 80%][0m
test_lab08_ruelas.py::test_evaluate_model_output_and_ranges [32mPASSED[0m[32m       [100%][0m

