In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

from sklearn.metrics import mean_squared_error, r2_score

from pickle import dump

In [5]:
# Descargamos el archivo CSV localmente:
csv_url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
local_csv = "diabetes.csv"

# Descargamos el archivo usando requests:
import requests
response = requests.get(csv_url)
with open(local_csv, 'wb') as f:
    f.write(response.content)

# Leemos el archivo CSV localmente:
df = pd.read_csv(local_csv)
print(df)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [13]:
print(f"La dimensión de nuestro DataFrame es: {df.shape} (filas, columnas)")

La dimensión de nuestro DataFrame es: (768, 9) (filas, columnas)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Como podemos comprobar, todas las filas de nuestro DataFrame tienen valores no nulos, y son de tipo numérico: enteros y decimales.

In [15]:
# Usamos el método 'duplicated()' para detectar duplicados en un DataFrame y 'sum()' para contar el número de duplicados: 
duplicados = df.duplicated()
num_duplicados = duplicados.sum()

print(duplicados)
print(f"El número de duplicados es: {num_duplicados}.")

0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Length: 768, dtype: bool
El número de duplicados es: 0.


In [17]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Variables numéricas: 

- Pregnancies 
- Glucose 
- BloodPressure 
- SkinThickness
- Insulin
- BMI
- DiabetesPedigreeFunction
- Age
- Outcome: variable objetivo. 

In [20]:
# Selección de variables: 
# Dividimos el conjunto de datos en muestras de train y test:
X = df.drop("Outcome", axis = 1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
60,2,84,0,0,0,0.0,0.304,21
618,9,112,82,24,0,28.2,1.282,50
346,1,139,46,19,83,28.7,0.654,22
294,0,161,50,0,0,21.9,0.254,65
231,6,134,80,37,370,46.2,0.238,46


In [21]:
# Split:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [22]:
y_train

60     0
618    1
346    0
294    0
231    1
      ..
71     0
106    0
270    1
435    1
102    0
Name: Outcome, Length: 614, dtype: int64

In [23]:
X_train.shape

(614, 8)

In [24]:
X_test.shape

(154, 8)

In [25]:
len(y_train)

614

In [26]:
len(y_test)

154

In [31]:
# Escalado
from sklearn.preprocessing import StandardScaler

# Creamos una lista con las variables independientes para su normalización, dado que detectamos una alta desproporcionalidad en los datos, 
# debido a la amplia variación en los rangos, desviaciones estándar y la presencia de valores cero. 

num_variables = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']

# Instanciamos el escalador:
scaler = StandardScaler()

# Entrenamos el escalador con los datos de entrenamiento:
scaler.fit(X_train[num_variables])

# Aplicamos el escalador en ambos y creamos un df independiente respectivo a la división de los datos:
X_train_num_scal = scaler.transform(X_train[num_variables])
X_train_num_scal = pd.DataFrame(X_train_num_scal, index = X_train.index, columns = num_variables)

X_test_num_scal = scaler.transform(X_test[num_variables])
X_test_num_scal = pd.DataFrame(X_test_num_scal, index = X_test.index, columns = num_variables)

X_train_num_scal.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
60,-0.526397,-1.151398,-3.752683,-1.322774,-0.701206,-4.135256,-0.490735,-1.03594
618,1.588046,-0.276643,0.680345,0.233505,-0.701206,-0.489169,2.41503,1.487101
346,-0.82846,0.566871,-1.265862,-0.09072,0.013448,-0.424522,0.549161,-0.948939
294,-1.130523,1.254179,-1.049617,-1.322774,-0.701206,-1.30372,-0.639291,2.792122
231,0.681856,0.410665,0.572222,1.07649,2.484601,1.838121,-0.686829,1.139095


## Modelado:

In [44]:
# Modelos:
ada = AdaBoostClassifier(n_estimators=1000, learning_rate= 00.1, random_state=42)
gb = GradientBoostingClassifier(n_estimators=1000, learning_rate= 00.1, random_state=42)
xgb = XGBClassifier(n_estimators=1000, learning_rate= 00.1, random_state = 42, use_label_encoder=False, eval_metric='mlogloss')
lgb = LGBMClassifier(n_estimators=1000, learning_rate= 00.1, random_state = 42)

# Entrenamiento:
ada.fit(X_train, y_train)
gb.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgb.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 213, number of negative: 401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 664
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.346906 -> initscore=-0.632669
[LightGBM] [Info] Start training from score -0.632669


## Predicción:

In [45]:
# Predicción:
ada_y_pred_test = ada.predict(X_test)
gb_y_pred_test = gb.predict(X_test)
xgb_y_pred_test = xgb.predict(X_test)
lgb_y_pred_test = lgb.predict(X_test)

ada_y_pred_train = ada.predict(X_train)
gb_y_pred_train = gb.predict(X_train)
xgb_y_pred_train = xgb.predict(X_train)
lgb_y_pred_train = lgb.predict(X_train)

## Métricas:

In [46]:
# Métricas:
ada_accuracy_test = accuracy_score(y_test, ada_y_pred_test)
ada_accuracy_train = accuracy_score(y_train, ada_y_pred_train)

gb_accuracy_test = accuracy_score(y_test, gb_y_pred_test)
gb_accuracy_train = accuracy_score(y_train, gb_y_pred_train)

xgb_accuracy_test = accuracy_score(y_test, xgb_y_pred_test)
xgb_accuracy_train = accuracy_score(y_train, xgb_y_pred_train)

lgb_accuracy_test = accuracy_score(y_test, lgb_y_pred_test)
lgb_accuracy_train = accuracy_score(y_train, lgb_y_pred_train)

print('AdaBoost')
print("Accuracy Test: ", ada_accuracy_test)
print("Accuracy Train: ", ada_accuracy_train)

print('Gradient Boosting')
print("Accuracy Test: ", gb_accuracy_test)
print("Accuracy Train: ", gb_accuracy_train)

print('XGBoost')
print("Accuracy Test: ", xgb_accuracy_test)
print("Accuracy Train: ", xgb_accuracy_train)

print('LightGBM')
print("Accuracy Test: ", lgb_accuracy_test)
print("Accuracy Train: ", lgb_accuracy_train)


AdaBoost
Accuracy Test:  0.7467532467532467
Accuracy Train:  0.8469055374592834
Gradient Boosting
Accuracy Test:  0.7337662337662337
Accuracy Train:  1.0
XGBoost
Accuracy Test:  0.7012987012987013
Accuracy Train:  1.0
LightGBM
Accuracy Test:  0.7207792207792207
Accuracy Train:  1.0


- AdaBoost:

    - Accuracy Test: 0.7468
    - Accuracy Train: 0.8469

Análisis: Este modelo parece tener un buen equilibrio, aunque hay una caída en la precisión del conjunto de prueba, lo cual es esperado, pero la diferencia no es extremadamente grande.

- Gradient Boosting:

    - Accuracy Test: 0.7338
    - Accuracy Train: 1.0

Análisis: La precisión perfecta en el conjunto de entrenamiento sugiere sobreajuste. La caída en la precisión del conjunto de prueba indica que el modelo no generaliza bien.

- XGBoost:

    - Accuracy Test: 0.7013
    - Accuracy Train: 1.0

Análisis: Similar a Gradient Boosting, la precisión perfecta en el conjunto de entrenamiento indica sobreajuste. La precisión del conjunto de prueba es la más baja entre los modelos, lo que sugiere que este modelo particular de XGBoost no generaliza bien en tu conjunto de datos.

- LightGBM:

    - Accuracy Test: 0.7208
    - Accuracy Train: 1.0

Análisis: Nuevamente, la precisión perfecta en el conjunto de entrenamiento sugiere sobreajuste. La precisión del conjunto de prueba es mejor que XGBoost pero sigue mostrando signos de sobreajuste.
