# Explore here

In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import *
from imblearn.metrics import specificity_score

from pickle import dump
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

from sklearn import *

## CARGA DE DATOS


In [167]:
# Read the diabetes CSV from the 4GeeksAcademy tutorial repository.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
df = pd.read_csv(DATA_URL)

In [168]:
# 'Outcome' is the label; every other column is used as a feature.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## DEFINICIÓN DE MÉTRICAS

In [169]:
def _score_split(y_true, y_pred, y_score=None):
    """Return [accuracy, F1, AUC, precision, recall, specificity] for one data split."""
    # AUC is a ranking metric: use continuous scores when the caller supplies
    # them, otherwise fall back to the hard labels (original behaviour).
    auc_input = y_pred if y_score is None else y_score
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, auc_input),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        specificity_score(y_true, y_pred),
    ]


def get_metrics(y_train, y_test, y_pred_train, y_pred_test,
                y_score_train=None, y_score_test=None):
    """Summarise binary-classification metrics for the train and test splits.

    Parameters
    ----------
    y_train, y_test
        True labels of the training and test splits.
    y_pred_train, y_pred_test
        Predicted class labels for the same splits.
    y_score_train, y_score_test : optional
        Continuous scores for the positive class (for example
        ``estimator.predict_proba(X)[:, 1]``). When given, they are used for
        the AUC column instead of the hard labels. With hard labels only, the
        ROC curve has a single operating point and the AUC equals
        (recall + specificity) / 2.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Train'``, ``'Test'`` and ``'Diferencia'`` (train minus test);
        columns ``'Accuracy'``, ``'F1'``, ``'AUC'``, ``'Precision'``,
        ``'Recall'`` and ``'Specificity'``.
    """
    train_scores = _score_split(y_train, y_pred_train, y_score_train)
    test_scores = _score_split(y_test, y_pred_test, y_score_test)

    # A positive gap means the model scores better on train than on test.
    gaps = [train - test for train, test in zip(train_scores, test_scores)]

    return pd.DataFrame(
        [train_scores, test_scores, gaps],
        columns=['Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'],
        index=['Train', 'Test', 'Diferencia'],
    )

## DECISION TREE


In [170]:
# Shallow entropy-based tree. random_state is pinned because scikit-learn
# permutes the candidate features at every split, so ties between equally good
# splits could otherwise be resolved differently from one run to the next.
decission_tree_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=42,
)
decission_tree_model.fit(X_train, y_train)

In [171]:
# Predict on both splits with the fitted tree, then compare train vs. test metrics.
y_pred_train, y_pred_test = (
    decission_tree_model.predict(split) for split in (X_train, X_test)
)

get_metrics(y_train, y_test, y_pred_train, y_pred_test)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.771987,0.62766,0.720886,0.723926,0.553991,0.887781
Test,0.772727,0.646465,0.730303,0.727273,0.581818,0.878788
Diferencia,-0.00074,-0.018805,-0.009417,-0.003346,-0.027828,0.008993


## RANDOM FOREST

In [172]:
# Random forest of depth-2 trees; every split considers all features
# (max_features=None) and each tree is trained on a bootstrap sample.
RF_model = RandomForestClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=10,
    max_features=None,
    bootstrap=True,
    random_state=42,
)
RF_model.fit(X_train, y_train)

In [173]:
# Predict on both splits with the fitted forest, then compare train vs. test metrics.
y_pred_train, y_pred_test = (
    RF_model.predict(split) for split in (X_train, X_test)
)

get_metrics(y_train, y_test, y_pred_train, y_pred_test)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.771987,0.593023,0.703277,0.778626,0.478873,0.927681
Test,0.772727,0.646465,0.730303,0.727273,0.581818,0.878788
Diferencia,-0.00074,-0.053441,-0.027026,0.051353,-0.102945,0.048893


## PRIMERA APROXIMACIÓN A BOOST

In [174]:
# Baseline gradient boosting with default hyperparameters (only the seed is set).
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred_train, y_pred_test = (
    model.predict(split) for split in (X_train, X_test)
)

print(get_metrics(y_train, y_test, y_pred_train, y_pred_test))

            Accuracy        F1       AUC  Precision    Recall  Specificity
Train       0.938111  0.905941  0.919602   0.958115  0.859155     0.980050
Test        0.740260  0.649123  0.725253   0.627119  0.672727     0.777778
Diferencia  0.197851  0.256818  0.194350   0.330997  0.186428     0.202272


In [175]:
# Candidate hyperparameter values to explore during the optimisation search.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 4],
    min_samples_split=[5, 10],
    min_samples_leaf=[2, 4],
    subsample=[0.8],
    max_features=['sqrt'],
)

In [176]:
# Exhaustive 5-fold grid search over param_grid, scored by accuracy; used only
# to guide the choice of hyperparameters.
# n_jobs=-1 spreads the candidate fits over all available cores. Every
# candidate is fitted from the same seeded base estimator, so parallelising
# should shorten the run without changing which parameters are selected.
grid = GridSearchCV(model, param_grid, scoring='accuracy', cv=5, n_jobs=-1)

grid.fit(X_train, y_train)

grid.best_params_

{'learning_rate': 0.05,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 100,
 'subsample': 0.8}

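Los hiperparámetros del modelo definitivo los he calculado en local, con una rejilla distinta, porque aquí la búsqueda tardaba mucho; por eso no coinciden con el `grid.best_params_` de la celda anterior. El concepto es el mismo. He visto que el parámetro `learning_rate` es clave.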

## MODELO DEFINITIVO BOOST

In [177]:
# Final gradient-boosting model: many estimators combined with a very small
# learning rate.
model2 = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.003,
    max_depth=4,
    max_features='log2',
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=300,
    subsample=1,
)
model2.fit(X_train, y_train)

y_pred_train, y_pred_test = (
    model2.predict(split) for split in (X_train, X_test)
)

print(get_metrics(y_train, y_test, y_pred_train, y_pred_test))

            Accuracy        F1       AUC  Precision    Recall  Specificity
Train       0.820847  0.668675  0.750588   0.932773  0.521127     0.980050
Test        0.811688  0.688172  0.760606   0.842105  0.581818     0.939394
Diferencia  0.009159 -0.019497 -0.010018   0.090668 -0.060691     0.040656


El modelo resultante queda claramente mejor que los otros: en test obtiene mayor accuracy, F1 y AUC, y la diferencia entre train y test sigue siendo pequeña.

## CONCLUSIONES


In [178]:
# Print the train/test metric table of each fitted model, one after another.
# Titles after the first start with a newline so the tables are separated.
fitted_models = (
    ('DecissionTree metrics', decission_tree_model),
    ('\nRandomTree metrics', RF_model),
    ('\nBoostingClassifier metrics', model2),
)

for title, estimator in fitted_models:
    y_pred_test = estimator.predict(X_test)
    y_pred_train = estimator.predict(X_train)
    print(title)
    print(get_metrics(y_train, y_test, y_pred_train, y_pred_test))

DecissionTree metrics
            Accuracy        F1       AUC  Precision    Recall  Specificity
Train       0.771987  0.627660  0.720886   0.723926  0.553991     0.887781
Test        0.772727  0.646465  0.730303   0.727273  0.581818     0.878788
Diferencia -0.000740 -0.018805 -0.009417  -0.003346 -0.027828     0.008993

RandomTree metrics
            Accuracy        F1       AUC  Precision    Recall  Specificity
Train       0.771987  0.593023  0.703277   0.778626  0.478873     0.927681
Test        0.772727  0.646465  0.730303   0.727273  0.581818     0.878788
Diferencia -0.000740 -0.053441 -0.027026   0.051353 -0.102945     0.048893

BoostingClassifier metrics
            Accuracy        F1       AUC  Precision    Recall  Specificity
Train       0.820847  0.668675  0.750588   0.932773  0.521127     0.980050
Test        0.811688  0.688172  0.760606   0.842105  0.581818     0.939394
Diferencia  0.009159 -0.019497 -0.010018   0.090668 -0.060691     0.040656


Sorprendentemente, con DecisionTree y con RandomForest hemos terminado con un modelo prácticamente igual. Podríamos decir que el modelo de DecisionTree es ligeramente más consistente entre los datos de test y train en las métricas de F1 y AUC. Además, el de RandomForest sí que tiene un poco de sobreajuste en cuanto a la Specificity y la precisión.

En cuanto al BoostingClassifier, obtiene un resultado muy bueno. Tiene un poquito más de sobreajuste en accuracy que los otros modelos, y también adolece de sobreajuste en specificity y precision.

Por último, cabe decir que el Recall es tan bajo con respecto a la Especificidad porque el dataset está desbalanceado (35 % de positivos). Tenemos muchos menos casos positivos, lo que perjudica al recall. Para eso están el F1 y el AUC, que compensan esa diferencia.

