### En esta oportunidad trabajaremos con la misma base de datos de diabetes, por eso importaremos la misma data; como pudimos evidenciar, dicha data no tiene duplicados ni datos faltantes.

- Procedemos a dividir los datos y aplicar el modelo de Random Forest.
- Validar si buscando mejores hiperparámetros mejoramos los resultados.

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from imblearn.metrics import specificity_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Read the diabetes CSV straight from the course repository and preview the first rows.
data_original = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv")
data_original.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
def get_metrics(y_train, y_test, y_pred_train, y_pred_test, y_score_train=None, y_score_test=None):
    """Build a train/test comparison table of binary-classification metrics.

    Parameters
    ----------
    y_train, y_test : array-like
        True labels of the training and test splits.
    y_pred_train, y_pred_test : array-like
        Hard class predictions for the training and test splits.
    y_score_train, y_score_test : array-like, optional
        Continuous scores for the positive class (for example
        ``model.predict_proba(X)[:, 1]``). When given, the AUC of that split is
        computed from these scores. When omitted, AUC falls back to the hard
        predictions, which only evaluates a single operating point of the ROC
        curve.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Train'``, ``'Test'`` and ``'Diferencia'`` (train minus test);
        columns ``'Accuracy'``, ``'F1'``, ``'AUC'``, ``'Precision'``,
        ``'Recall'`` and ``'Specificity'``.
    """
    # Column label -> scoring function; insertion order fixes the column order.
    metric_functions = {
        'Accuracy': accuracy_score,
        'F1': f1_score,
        'AUC': roc_auc_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'Specificity': specificity_score,
    }

    def _evaluate_split(y_true, y_pred, y_score):
        """Score one split with every metric in ``metric_functions``."""
        scores = {}
        for label, metric in metric_functions.items():
            # Only AUC can take advantage of continuous scores.
            if label == 'AUC' and y_score is not None:
                scores[label] = metric(y_true, y_score)
            else:
                scores[label] = metric(y_true, y_pred)
        return scores

    train_scores = _evaluate_split(y_train, y_pred_train, y_score_train)
    test_scores = _evaluate_split(y_test, y_pred_test, y_score_test)

    # A positive gap means the model scores better on the data it was trained on.
    gap = {label: train_scores[label] - test_scores[label] for label in metric_functions}

    metrics_df = pd.DataFrame([train_scores, test_scores, gap],
                              columns = list(metric_functions),
                              index = ['Train','Test', 'Diferencia'])

    return metrics_df

In [11]:
# Target column first, then every remaining column as a predictor.
y = data_original['Outcome']
X = data_original.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Baseline: a 100-tree random forest with a fixed seed, fitted on the training split.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out test split.
predictions = rf_classifier.predict(X_test)

# Report the accuracy (share of correctly classified rows) on the test split.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7207792207792207


In [13]:
# Hyperparameter grid searched exhaustively (3 * 4 * 3 = 36 combinations).
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

# Fresh, unfitted forest; the seed keeps the search reproducible.
rf_classifier = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search on the training split, ranked by accuracy.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Show the winning combination.
print("Mejores parámetros:", grid_search.best_params_)

# Accuracy of the best estimator on the held-out test split.
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy:", accuracy)

Mejores parámetros: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 20}
Accuracy: 0.7662337662337663


In [14]:
# Half of the predictor count, used as the centre of the max_features candidates.
half_features = X_train.shape[1] // 2

# Candidate values the randomized search samples from.
param_dist = {
    'n_estimators': [5, 10, 20],
    'max_depth': [1, 3, 4, 5, 8, 10],
    # A fixed 8 plus three values starting at half of the predictor count.
    'max_features': [8, half_features, half_features + 1, half_features + 2],
    'min_samples_leaf': [3, 5, 10, 15, 20],
}

# Fresh, unfitted forest; the seed keeps the search reproducible.
rf_classifier = RandomForestClassifier(random_state=42)

# Sample 100 of the 3 * 6 * 4 * 5 = 360 combinations, each scored by
# 5-fold cross-validated accuracy on the training split.
random_search = RandomizedSearchCV(
    rf_classifier,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=42,
    scoring='accuracy',
)
random_search.fit(X_train, y_train)

# Show the winning combination.
print("Mejores parámetros:", random_search.best_params_)

# Accuracy of the best estimator on the held-out test split.
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy:", accuracy)


Mejores parámetros: {'n_estimators': 20, 'min_samples_leaf': 3, 'max_features': 5, 'max_depth': 4}
Accuracy: 0.7922077922077922


In [15]:
# Predict both splits with the fitted randomized-search object so that
# train and test metrics can be compared.
train_forest_predict = random_search.predict(X_train)
test_forest_predict = random_search.predict(X_test)

In [16]:
# Display the train/test metric table (and their difference) for the randomized-search forest.
get_metrics(y_train, y_test, train_forest_predict, test_forest_predict)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.820847,0.707447,0.7748,0.815951,0.624413,0.925187
Test,0.792208,0.698113,0.765657,0.72549,0.672727,0.858586
Diferencia,0.028639,0.009334,0.009144,0.090461,-0.048314,0.066601


### El método Random Forest presenta una mejora con respecto al árbol de decisión, disminuyendo el porcentaje de diferencia entre los conjuntos de entrenamiento y testeo.