# Laboratorio 8: Random Forest y despliegues

**Duración:** 2 horas  
**Formato:** Implementación, despliegue y competencia  

---

## Portada del equipo

**Integrantes:**
- Nombre 1 (Usuario GitHub)
- Nombre 2 (Usuario GitHub)
- Nombre 3 (Usuario GitHub)

**Repositorio del equipo:**  
<https://github.com/usuario/equipoX>

**Fecha de entrega:**  
__/__/____

## Elemento 1 - Implementación del Random Forest

In [5]:
!pip -q install scikit-learn joblib numpy pandas


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import numpy as np
from typing import List, Optional, Union
from sklearn.tree import DecisionTreeClassifier
from collections import Counter

class SimpleRandomForest:
    """
    Random Forest 'desde cero' usando DecisionTreeClassifier como base.
    - Bootstrap por árbol
    - Submuestreo aleatorio de features (max_features)
    - Votación mayoritaria en predict
    """
    def __init__(
        self,
        n_estimators: int = 100,
        max_features: Union[int, float, str, None] = "sqrt",
        max_depth: Optional[int] = None,
        criterion: str = "gini",
        random_state: Optional[int] = 42,
    ):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.criterion = criterion
        self.random_state = random_state
        self.trees: List[DecisionTreeClassifier] = []
        self._rng = np.random.default_rng(random_state)

    def _bootstrap_sample(self, X: np.ndarray, y: np.ndarray):
        n = X.shape[0]
        idx = self._rng.integers(0, n, size=n)
        return X[idx], y[idx]

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.trees = []
        seeds = self._rng.integers(0, 10_000_000, size=self.n_estimators)
        for s in seeds:
            Xi, yi = self._bootstrap_sample(X, y)
            clf = DecisionTreeClassifier(
                criterion=self.criterion,
                max_depth=self.max_depth,
                max_features=self.max_features,
                random_state=int(s),
            )
            clf.fit(Xi, yi)
            self.trees.append(clf)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        preds = np.column_stack([t.predict(X) for t in self.trees])
        maj = []
        for row in preds:
            c = Counter(row)
            maj.append(c.most_common(1)[0][0])
        return np.array(maj)


In [11]:
import os, joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv("iris_train.csv")
y_col = "target"
X_df = df.drop(columns=[y_col])
y = df[y_col].to_numpy()

X = X_df.select_dtypes(include=[np.number]).to_numpy()
feature_names = X_df.columns.tolist()

class_names = ["setosa", "versicolor", "virginica"]

# 3) Split reproducible
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

rf = SimpleRandomForest(
    n_estimators=100,
    max_features="sqrt",
    max_depth=None,
    random_state=42,
    criterion="gini",
).fit(Xtr, ytr)

yp = rf.predict(Xte)
print("Accuracy:", round(accuracy_score(yte, yp), 4))
print("Matriz de confusión:\n", confusion_matrix(yte, yp))
print(classification_report(yte, yp, target_names=class_names))

# 6) Guardar PKL
os.makedirs("model", exist_ok=True)
bundle = {
    "model": rf,
    "feature_names": feature_names,
    "class_names": class_names,
}
joblib.dump(bundle,"../model/model.pkl")
print("Modelo guardado en 'model/model.pkl'")


Accuracy: 0.84
Matriz de confusión:
 [[9 0 0]
 [0 6 2]
 [0 2 6]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       0.75      0.75      0.75         8
   virginica       0.75      0.75      0.75         8

    accuracy                           0.84        25
   macro avg       0.83      0.83      0.83        25
weighted avg       0.84      0.84      0.84        25

Modelo guardado en 'model/model.pkl'


### Elemento 1 - Preguntas teóricas

## Elemento 2 - Comparativa con scikit-learn

In [None]:
# Comparativa SimpleRandomForest vs scikit-learn RandomForestClassifier

import time, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
)
from sklearn.ensemble import RandomForestClassifier

# 1) Datos
df = pd.read_csv("iris_train.csv")
y = df["target"].to_numpy()
X = df.drop(columns=["target"]).to_numpy()
class_names = ["setosa", "versicolor", "virginica"]  # si tu target es 0/1/2

Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

def train_and_eval(model, Xtr, ytr, Xte, yte, name="model"):
    t0 = time.time()
    model.fit(Xtr, ytr)
    fit_sec = time.time() - t0

    yp = model.predict(Xte)
    acc = accuracy_score(yte, yp)
    pr, rc, f1, _ = precision_recall_fscore_support(
        yte, yp, average="macro", zero_division=0
    )
    cm = confusion_matrix(yte, yp)

    print(f"\n=== {name} ===")
    print(f"Tiempo de entrenamiento: {fit_sec:.4f} s")
    # OOB si existe
    if hasattr(model, "oob_score_") and model.oob_score_ is not None:
        print(f"OOB score: {model.oob_score_:.4f}")
    print(f"Accuracy: {acc:.4f} | Macro P: {pr:.4f}  R: {rc:.4f}  F1: {f1:.4f}")
    print("Matriz de confusión:\n", cm)
    try:
        print(classification_report(yte, yp, target_names=class_names))
    except Exception:
        print(classification_report(yte, yp))
    return {
        "name": name, "fit_sec": fit_sec, "acc": acc, "precision_macro": pr,
        "recall_macro": rc, "f1_macro": f1, "cm": cm
    }

# 2) Tu implementación "desde cero"
res_custom = train_and_eval(
    SimpleRandomForest(
        n_estimators=1000,
        max_features="sqrt",
        max_depth=None,
        random_state=42,
        # Si tu versión mejorada soporta OOB y paralelismo, actívalos:
        # bootstrap=True, max_samples=None, n_jobs=-1
    ),
    Xtr, ytr, Xte, yte, name="SimpleRandomForest (custom)"
)

# 3) Implementación scikit-learn
res_sklearn = train_and_eval(
    RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        max_features="sqrt",
        random_state=42,
        oob_score=True,           # calcula OOB (requiere bootstrap=True por defecto)
        n_jobs=-1
    ),
    Xtr, ytr, Xte, yte, name="RandomForestClassifier (sklearn)"
)

# 4) Resumen en un DataFrame (bonito para ver/ordenar)
import pandas as pd
summary = pd.DataFrame([res_custom, res_sklearn])[
    ["name", "fit_sec", "acc", "precision_macro", "recall_macro", "f1_macro"]
].sort_values(by="acc", ascending=False)
summary.reset_index(drop=True, inplace=True)
summary



=== SimpleRandomForest (custom) ===
Tiempo de entrenamiento: 0.8797 s
Accuracy: 0.8400 | Macro P: 0.8333  R: 0.8333  F1: 0.8333
Matriz de confusión:
 [[9 0 0]
 [0 6 2]
 [0 2 6]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       0.75      0.75      0.75         8
   virginica       0.75      0.75      0.75         8

    accuracy                           0.84        25
   macro avg       0.83      0.83      0.83        25
weighted avg       0.84      0.84      0.84        25


=== RandomForestClassifier (sklearn) ===
Tiempo de entrenamiento: 0.1812 s
OOB score: 0.9700
Accuracy: 0.8400 | Macro P: 0.8333  R: 0.8333  F1: 0.8333
Matriz de confusión:
 [[9 0 0]
 [0 6 2]
 [0 2 6]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       0.75      0.75      0.75         8
   virginica       0.75      0.75      0.75         8

    accuracy        

Unnamed: 0,name,fit_sec,acc,precision_macro,recall_macro,f1_macro
0,SimpleRandomForest (custom),0.879725,0.84,0.833333,0.833333,0.833333
1,RandomForestClassifier (sklearn),0.181208,0.84,0.833333,0.833333,0.833333


### Elemento 2 - Preguntas teóricas

## Elemento 3 - Creación y despliegue de la API

### Elemento 3 - Preguntas teóricas