In [15]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pickle

# Laboratorio 8: Random Forest y despliegues

**Duración:** 2 horas  
**Formato:** Implementación, despliegue y competencia  

---

## Portada del equipo

**Integrantes:**
- Nombre 1 (Usuario GitHub)
- Nombre 2 (Usuario GitHub)
- Nombre 3 (Usuario GitHub)

**Repositorio del equipo:**  
<https://github.com/usuario/equipoX>

**Fecha de entrega:**  
`__/__/____`

## Elemento 1 - Implementación del Random Forest

In [16]:
# Load the training set: every column except the last is a feature,
# the last column is the class label.
df = pd.read_csv('iris_train.csv')
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [17]:
class RandomForest:
  """Bagged ensemble of decision trees combined by majority vote.

  Every tree is a ``DecisionTreeClassifier`` fitted on a bootstrap resample
  (sampling rows with replacement) of the training data.

  Parameters
  ----------
  n_estimators : int, default=100
      Number of trees to fit.
  max_depth : int, None, 'sqrt' or 'log2', default=None
      Depth limit of each tree. The strings 'sqrt' and 'log2' are resolved
      at fit time against the number of features (minimum 1).
  max_features : default='sqrt'
      Forwarded unchanged to every ``DecisionTreeClassifier``.
  random_state : int or None, default=17
      Seeds the bootstrap sampling and, offset by the tree index, each tree.
      ``None`` leaves both unseeded.
  """

  def __init__(self, n_estimators=100, max_depth=None, max_features='sqrt', random_state=17):
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.max_features = max_features
    self.random_state = random_state
    self.trees = []

  def bootstrap(self, X, y, rng=None):
    """Return a resample of ``(X, y)`` drawn with replacement, same length as ``X``.

    ``rng`` may be any object exposing ``choice`` (e.g. ``np.random.RandomState``);
    when omitted the global NumPy random state is used.
    """
    X, y = np.asarray(X), np.asarray(y)
    sampler = np.random if rng is None else rng
    n_samples = len(X)
    idxs = sampler.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]

  def fit(self, X, y):
    """Fit ``n_estimators`` trees on bootstrap resamples and return ``self``."""
    X, y = np.asarray(X), np.asarray(y)
    self.trees = []
    n_features = X.shape[1]

    # Resolve the string shortcuts for max_depth; anything else is passed as-is.
    if self.max_depth == 'sqrt':
      depth = max(1, int(np.sqrt(n_features)))
    elif self.max_depth == 'log2':
      depth = max(1, int(np.log2(n_features)))
    else:
      depth = self.max_depth

    # A dedicated generator makes the bootstrap draws follow random_state
    # instead of whatever state the global NumPy generator happens to be in.
    seeded = self.random_state is not None
    rng = np.random.RandomState(self.random_state) if seeded else None

    for i in range(self.n_estimators):
      # Offset the seed per tree so the trees do not all make identical random choices.
      tree_seed = self.random_state + i if seeded else None
      tree = DecisionTreeClassifier(max_depth=depth, max_features=self.max_features, random_state=tree_seed)

      X_sample, y_sample = self.bootstrap(X, y, rng=rng)
      tree.fit(X_sample, y_sample)
      self.trees.append(tree)

    return self

  def predict(self, X):
    """Return the majority-vote label for each row of ``X``.

    Ties are resolved in favour of the smallest label (in sorted order).

    Raises
    ------
    ValueError
        If the forest has no fitted trees.
    """
    if not self.trees:
      raise ValueError("RandomForest has no fitted trees; call fit() before predict().")

    # Shape (n_trees, n_samples): one row of predictions per tree.
    tree_preds = np.array([tree.predict(X) for tree in self.trees])

    # Encode labels as 0..k-1 so that counting works for any label type
    # (strings, floats, negative ints), not only non-negative integers.
    classes, encoded = np.unique(tree_preds, return_inverse=True)
    encoded = encoded.reshape(tree_preds.shape)

    winners = np.array(
      [np.bincount(encoded[:, i], minlength=len(classes)).argmax() for i in range(encoded.shape[1])],
      dtype=int,
    )
    return classes[winners]

  def fit_predict(self, X, y):
    """Fit on ``(X, y)`` and return the predictions for that same ``X``."""
    self.fit(X, y)
    return self.predict(X)

  def get_params(self, deep=True):
    """Return the constructor parameters as a dict (``deep`` is accepted but unused)."""
    return {'n_estimators': self.n_estimators, 'max_depth': self.max_depth, 'max_features': self.max_features, 'random_state': self.random_state}

  def set_params(self, **params):
    """Set each given keyword as an attribute and return ``self``."""
    for key, value in params.items():
      setattr(self, key, value)
    return self

In [18]:
# Summary statistics of the raw training frame read from 'iris_train.csv'.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,123.0,123.0,124.0,120.0,125.0
mean,5.821221,2.764442,3.909994,1.186667,0.984
std,2.428445,2.174626,2.484749,0.758474,0.822898
min,-11.601111,-14.870849,1.1,0.1,0.0
25%,5.1,2.7,1.6,0.3,0.0
50%,5.7,3.0,4.25,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,24.111271,4.4,23.439238,2.5,2.0


In [19]:
# Show the rows whose sepal length is above 10 or below 0.
df.loc[lambda frame: frame["sepal length (cm)"].gt(10) | frame["sepal length (cm)"].lt(0)]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
22,-11.601111,3.8,6.4,2.0,2
97,24.111271,2.3,4.4,1.3,1


In [20]:
# Import the KNN utilities
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer

# Reload the raw CSV, then blank out feature values outside [0, 10] so they
# are imputed together with the values that were already missing.
df = pd.read_csv('iris_train.csv')

# Only the feature columns are screened; the last column is the class label
# and must never be turned into a missing value.
feature_cols = df.columns[:-1]
outliers = (df[feature_cols] > 10) | (df[feature_cols] < 0)
df[feature_cols] = df[feature_cols].mask(outliers)

# Fill the missing values with a KNN imputer fitted separately on each class,
# so neighbours are always rows with the same label.
df_filled = df.copy()
labels = df.iloc[:, -1]

for clase in labels.unique():
    mascara = labels == clase
    imputer = KNNImputer(n_neighbors=5)
    df_filled.loc[mascara, feature_cols] = imputer.fit_transform(df.loc[mascara, feature_cols])


In [21]:
# Summary statistics of the imputed frame, to compare against the raw data.
df_filled.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,125.0,125.0,125.0,125.0,125.0
mean,5.82096,3.04176,3.75312,1.19616,0.984
std,0.817912,0.446658,1.766489,0.754991,0.822898
min,4.3,2.0,1.1,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.7,3.0,4.2,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.7,4.4,6.9,2.5,2.0


In [22]:
# Features are all columns but the last of the imputed frame; the last is the label.
X = df_filled.iloc[:, :-1].values
y = df_filled.iloc[:, -1].values

# Fit the custom forest and predict on the same (training) data in one call.
rf = RandomForest(n_estimators=100, max_depth='sqrt', random_state=17)
y_hat = rf.fit_predict(X, y)

# Accuracy on the training data.
accuracy_score(y, y_hat)

0.96

In [23]:
from pathlib import Path

# Persist the fitted custom forest. The target directory is created first so
# the dump does not fail with FileNotFoundError on a fresh checkout.
# NOTE(review): pickle stores classes by reference, so whatever loads this file
# must be able to resolve `RandomForest` under the same module path — confirm
# for the API that consumes the model.
model_path = Path("../models/modelo.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump(rf, f)

### Elemento 1 - Preguntas teóricas

## Elemento 2 - Comparativa con scikit-learn

In [24]:
# Compare against scikit-learn's own random forest
from sklearn.ensemble import RandomForestClassifier

rf_sklearn = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=17,
)
rf_sklearn.fit(X, y)

# Accuracy on the same data the model was fitted on.
y_hat = rf_sklearn.predict(X)
accuracy_score(y, y_hat)

1.0

In [25]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: number of trees and per-tree depth limit.
param_grid = dict(
    n_estimators=[50, 100, 150, 200, 400],
    max_depth=[None, 5, 10, 15],
)

# 5-fold cross-validated search over the custom RandomForest class, scored by accuracy.
grid_search = GridSearchCV(
    estimator=RandomForest(random_state=17),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X, y)

# Report the winning combination and its mean cross-validated accuracy.
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)

Best parameters:  {'max_depth': None, 'n_estimators': 50}
Best accuracy:  0.9359999999999999


In [26]:
from sklearn.metrics import confusion_matrix, classification_report

# Predictions of the scikit-learn forest on the data it was fitted on.
y_pred_sklearn = rf_sklearn.predict(X)

# Confusion matrix: header line followed by the matrix itself.
conf_matrix_sklearn = confusion_matrix(y, y_pred_sklearn)
print("Matriz de Confusión (scikit-learn):", conf_matrix_sklearn, sep="\n")

# Per-class precision / recall / f1 report.
class_report_sklearn = classification_report(y, y_pred_sklearn)
print("\nReporte de Clasificación (scikit-learn):", class_report_sklearn, sep="\n")

Matriz de Confusión (scikit-learn):
[[43  0  0]
 [ 0 41  0]
 [ 0  0 41]]

Reporte de Clasificación (scikit-learn):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        41
           2       1.00      1.00      1.00        41

    accuracy                           1.00       125
   macro avg       1.00      1.00      1.00       125
weighted avg       1.00      1.00      1.00       125



In [27]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Predictions of the custom forest on the training data (no held-out set is used here).
y_pred = rf.predict(X)

# Confusion matrix: header line followed by the matrix itself.
conf_matrix = confusion_matrix(y, y_pred)
print("Matriz de Confusión:", conf_matrix, sep="\n")

# Per-class precision / recall / f1 report.
class_report = classification_report(y, y_pred)
print("\nReporte de Clasificación:", class_report, sep="\n")

Matriz de Confusión:
[[42  1  0]
 [ 0 39  2]
 [ 0  2 39]]

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.93      0.95      0.94        41
           2       0.95      0.95      0.95        41

    accuracy                           0.96       125
   macro avg       0.96      0.96      0.96       125
weighted avg       0.96      0.96      0.96       125



### Elemento 2 - Preguntas teóricas

## Elemento 3 - Creación y despliegue de la API

### Elemento 3 - Preguntas teóricas