# Custom random forest on the diabetes dataset

In [2]:
# Imports used throughout the notebook
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
from numpy.random import seed, choice
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.metrics import *
from imblearn.metrics import specificity_score
import pandas as pd

In [3]:
# Download the diabetes CSV and preview its first rows.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Swap the target labels (1 -> 0 and 0 -> 1) exactly once per loaded frame.
# The swap overwrites df['Outcome'] in place and is its own inverse, so without
# this guard a second run of the cell would silently swap the labels back.
# The flag is stored on the DataFrame itself, so a freshly re-read `df` is
# swapped again as expected.
if not df.attrs.get('outcome_labels_swapped', False):
    df['Outcome'] = df['Outcome'].map({1: 0, 0: 1})
    df.attrs['outcome_labels_swapped'] = True

In [5]:
# Separate the target column from the predictor columns.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
class RandomForestCustom:
  """Small random forest assembled from scikit-learn decision trees.

  Every tree is trained on a bootstrap sample of the rows (drawn with
  replacement) restricted to ``max_features`` columns drawn without
  replacement once per tree. Predictions are combined by majority vote.

  Parameters
  ----------
  n_estimators : int
      Number of trees in the forest.
  random_state : int or None
      Seed for the row/column sampling; also forwarded to every tree.
  max_depth, min_samples_leaf
      Forwarded unchanged to ``DecisionTreeClassifier``.
  max_features : int
      Number of columns drawn for each tree. It is also forwarded to the
      tree, which already receives exactly that many columns.
  X, y : array-like
      Training features (2-D) and labels (1-D). NumPy arrays and pandas
      objects are both accepted; they are always indexed by position.
  """

  def __init__(self, n_estimators, random_state, max_depth, min_samples_leaf, max_features, X, y):
    self.n_estimators = n_estimators
    self.random_state = random_state
    self.max_depth = max_depth
    self.min_samples_leaf = min_samples_leaf
    self.max_features = max_features
    self.X = X
    self.y = y
    # List of (column indices, fitted tree) pairs, filled in by fit().
    self.models = []

  def get_bootstrap_datasets(self):
    """Draw the per-tree row and column samples.

    Returns
    -------
    feature_idxs : list of ndarray
        Column indices selected for each tree.
    data_sets : list of (ndarray, ndarray)
        ``(X_i, y_i)`` training pair for each tree.
    """
    # Positional indexing below needs plain arrays; a pandas Series would be
    # looked up by label instead of by position.
    X = np.asarray(self.X)
    y = np.asarray(self.y)
    n_rows, n_cols = X.shape

    # A private generator yields the same draws as seeding the global NumPy
    # RNG, but leaves the global random state of the caller untouched.
    rng = np.random.RandomState(self.random_state)

    # Randomly pick the rows used to train each tree (all row draws first,
    # then all column draws, so the random stream is consumed in a fixed order).
    row_idxs = [rng.choice(n_rows, n_rows, replace=True) for _ in range(self.n_estimators)]
    # Randomly pick the columns used to train each tree.
    feature_idxs = [rng.choice(np.arange(n_cols), self.max_features, replace=False) for _ in range(self.n_estimators)]

    data_sets = [(X[rows][:, cols], y[rows]) for rows, cols in zip(row_idxs, feature_idxs)]
    return feature_idxs, data_sets

  def fit(self):
    """Train one decision tree per bootstrap dataset.

    The forest is rebuilt from scratch on every call, so calling ``fit``
    again (for example when a notebook cell is re-run) does not accumulate
    duplicate trees.
    """
    feature_idxs, data_sets = self.get_bootstrap_datasets()
    models = []
    for cols, (X_i, y_i) in zip(feature_idxs, data_sets):
      tree = DecisionTreeClassifier(
        max_depth=self.max_depth,
        min_samples_leaf=self.min_samples_leaf,
        max_features=self.max_features,
        random_state=self.random_state,
      )
      tree.fit(X_i, y_i)
      # Keep the column indices so predict() can feed the tree the same columns.
      models.append((cols, tree))
    # Assign only once every tree has trained, so a failure cannot leave a partial forest.
    self.models = models

  def predict(self, X):
    """Return the majority-vote label for every row of ``X`` as a list.

    Ties go to the label that was voted first, in tree order.

    Raises
    ------
    ValueError
        If the forest has not been fitted yet.
    """
    if not self.models:
      raise ValueError("RandomForestCustom is not fitted yet; call fit() before predict().")
    X = np.asarray(X)
    # One row of votes per tree, one column per sample.
    votes = np.vstack([tree.predict(X[:, cols]) for cols, tree in self.models])
    return [Counter(votes[:, i]).most_common(1)[0][0] for i in range(votes.shape[1])]

In [7]:
# Instantiate the forest class twice: a small, shallow forest and a large, deep one.
n_features = X_train.shape[1]

rf_simple = RandomForestCustom(
    n_estimators=5,
    random_state=42,
    max_depth=2,
    min_samples_leaf=20,
    max_features=n_features // 2,
    X=X_train.values,
    y=y_train.values,
)

rf_complex = RandomForestCustom(
    n_estimators=100,
    random_state=42,
    max_depth=20,
    min_samples_leaf=1,
    max_features=int(n_features * 0.8),
    X=X_train.values,
    y=y_train.values,
)

In [8]:
# Train both forests on the training data they were constructed with.
for forest in (rf_simple, rf_complex):
    forest.fit()

In [9]:
# Predict with the complex forest on the training and test splits
train_pred_complex, test_pred_complex = (
    rf_complex.predict(split.values) for split in (X_train, X_test)
)

# Predict with the simple forest on the training and test splits
train_pred_simple, test_pred_simple = (
    rf_simple.predict(split.values) for split in (X_train, X_test)
)

In [10]:
def get_metrics(y_train, y_test, y_pred_train, y_pred_test):
    """Tabulate six classification scores for train and test predictions.

    Each scorer is called as ``scorer(y_true, y_pred)``; the same prediction
    array is passed to every scorer, including ``roc_auc_score``.

    Returns
    -------
    pandas.DataFrame
        Rows 'Train', 'Test' and 'Diferencia' (train score minus test score);
        columns 'Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'.
    """
    # Column label -> scoring function, in the order the columns are reported.
    scorers = {
        'Accuracy': accuracy_score,
        'F1': f1_score,
        'AUC': roc_auc_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'Specificity': specificity_score,
    }

    # Scores on the training split
    train_row = [score(y_train, y_pred_train) for score in scorers.values()]
    # Scores on the test split
    test_row = [score(y_test, y_pred_test) for score in scorers.values()]
    # Gap between training and test scores
    gap_row = [on_train - on_test for on_train, on_test in zip(train_row, test_row)]

    return pd.DataFrame(
        [train_row, test_row, gap_row],
        columns=list(scorers),
        index=['Train', 'Test', 'Diferencia'],
    )

In [11]:
# Train/test score comparison for the complex forest.
get_metrics(
    y_train=y_train,
    y_test=y_test,
    y_pred_train=train_pred_complex,
    y_pred_test=test_pred_complex,
)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,1.0,1.0,1.0,1.0,1.0,1.0
Test,0.766234,0.8125,0.757576,0.83871,0.787879,0.727273
Diferencia,0.233766,0.1875,0.242424,0.16129,0.212121,0.272727


In [12]:
# Train/test score comparison for the simple forest.
get_metrics(
    y_train=y_train,
    y_test=y_test,
    y_pred_train=train_pred_simple,
    y_pred_test=test_pred_simple,
)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.771987,0.845475,0.691171,0.758416,0.955112,0.42723
Test,0.766234,0.836364,0.70101,0.760331,0.929293,0.472727
Diferencia,0.005753,0.009111,-0.009839,-0.001915,0.025819,-0.045497
