<a href="https://colab.research.google.com/github/DajeanArcila/DajeanArcila/blob/main/Pronostico_con_GridsearchCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics


In [3]:

# Configurar la semilla para reproducibilidad
np.random.seed(42)

# Generar datos ficticios
num_samples = 500
data = {
    'age': np.random.randint(20, 80, size=num_samples),
    'gender': np.random.choice(['male', 'female'], size=num_samples),
    'bmi': np.round(np.random.uniform(18.5, 40, size=num_samples), 1),
    'smoker': np.random.choice(['yes', 'no'], size=num_samples),
    'exercise_level': np.random.choice(['low', 'medium', 'high'], size=num_samples),
    'cholesterol': np.round(np.random.uniform(150, 300, size=num_samples), 1)
}



In [4]:
# Crear el DataFrame
df_health = pd.DataFrame(data)

# Mostrar las primeras filas del DataFrame
print(df_health.head())



   age  gender   bmi smoker exercise_level  cholesterol
0   58  female  36.5    yes         medium        203.2
1   71    male  28.6    yes           high        160.4
2   48    male  27.4     no            low        227.9
3   34  female  24.4    yes            low        160.1
4   62    male  19.7     no            low        270.1


In [5]:
# Guardar el DataFrame en un archivo Excel
df_health.to_excel('health_data.xlsx', index=False)

In [6]:
# Cargar los datos
df_health = pd.read_excel('health_data.xlsx')

In [7]:
# Mostrar resumen estadístico
print(df_health.describe())

              age         bmi  cholesterol
count  500.000000  500.000000    500.00000
mean    50.396000   28.854600    224.99040
std     17.335982    6.147199     43.37577
min     20.000000   18.600000    150.50000
25%     36.750000   23.375000    187.00000
50%     51.000000   28.650000    226.60000
75%     65.000000   34.050000    263.42500
max     79.000000   40.000000    299.80000


In [8]:
# One-hot encode the data using pandas get_dummies
df_health = pd.get_dummies(df_health)
print(df_health.head())

   age   bmi  cholesterol  gender_female  gender_male  smoker_no  smoker_yes  \
0   58  36.5        203.2           True        False      False        True   
1   71  28.6        160.4          False         True      False        True   
2   48  27.4        227.9          False         True       True       False   
3   34  24.4        160.1           True        False      False        True   
4   62  19.7        270.1          False         True       True       False   

   exercise_level_high  exercise_level_low  exercise_level_medium  
0                False               False                   True  
1                 True               False                  False  
2                False                True                  False  
3                False                True                  False  
4                False                True                  False  


In [9]:
# Labels are the values we want to predict
labels = np.array(df_health['cholesterol'])
# Remove the labels from the features
# axis 1 refers to the columns
features = df_health.drop('cholesterol', axis=1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

In [10]:
# Using Skicit-learn to split data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=42)


In [11]:
# Entrenamiento del modelo GradientBoostingRegressor
grb = GradientBoostingRegressor(n_estimators=100)
grb.fit(train_features, train_labels)
gbr_pred = grb.predict(test_features)


In [12]:
# Evaluación del modelo
r_sq = grb.score(test_features, test_labels)
print('Coeficiente de Determinación (R²):', r_sq)
print('MAE:', metrics.mean_absolute_error(test_labels, gbr_pred))
print('MSE:', metrics.mean_squared_error(test_labels, gbr_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test_labels, gbr_pred)))


Coeficiente de Determinación (R²): -0.06370855672178743
MAE: 39.608878704027674
MSE: 2113.4433868756323
RMSE: 45.97220232788106


In [13]:
# GridSearch para optimización de hiperparámetros
parameters = {
    'learning_rate': [0.03],
    'subsample': [0.2],
    'n_estimators': [100, 500, 1000, 1500],
    'max_depth': [8]
}
grid_search = GridSearchCV(grb, parameters, scoring='r2', cv=2, n_jobs=-1)
grid_search.fit(train_features, train_labels)

In [14]:
print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_search.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_search.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_search.best_params_)


 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.03, max_depth=8, subsample=0.2)

 The best score across ALL searched params:
 -0.10538456013715802

 The best parameters across ALL searched params:
 {'learning_rate': 0.03, 'max_depth': 8, 'n_estimators': 100, 'subsample': 0.2}


In [15]:
# Entrenamiento del modelo optimizado
best_model = grid_search.best_estimator_
best_model.fit(train_features, train_labels)
gbr_tunned_pred = best_model.predict(test_features)

In [16]:

# Evaluación del modelo optimizado
r_sq = best_model.score(test_features, test_labels)
print('Coeficiente de Determinación (R²):', r_sq)
print('MAE:', metrics.mean_absolute_error(test_labels, gbr_tunned_pred))
print('MSE:', metrics.mean_squared_error(test_labels, gbr_tunned_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test_labels, gbr_tunned_pred)))

Coeficiente de Determinación (R²): 0.01598100605826691
MAE: 38.84658706067769
MSE: 1955.1111271638524
RMSE: 44.216638578298245


In [19]:
# Generar un nuevo dataset ficticio de salud para predicciones
np.random.seed(100)
num_new_samples = 10

new_data = {
    'age': np.random.randint(20, 70, num_new_samples),
    'weight': np.random.randint(50, 100, num_new_samples),
    'height': np.random.randint(150, 200, num_new_samples),
    'systolic_bp': np.random.randint(90, 160, num_new_samples),
    'diastolic_bp': np.random.randint(60, 100, num_new_samples),
    'bmi': np.round(np.random.uniform(18.5, 35, num_new_samples), 2),
    'exercise_freq': np.random.randint(0, 7, num_new_samples),
    'smoking': np.random.randint(0, 2, num_new_samples),
    'alcohol_consumption': np.random.randint(0, 2, num_new_samples)
}

new_features = pd.DataFrame(new_data)

# Predecir niveles de colesterol en el nuevo dataset
new_predictions = grb.predict(new_features)

# Añadir las predicciones al nuevo dataset
new_features['predicted_cholesterol_level'] = new_predictions

# Mostrar el nuevo dataset con las predicciones
print(new_features)


   age  weight  height  systolic_bp  diastolic_bp    bmi  exercise_freq  \
0   28      52     166          103            67  19.00              5   
1   44      84     159           94            76  31.30              6   
2   23      64     179          149            62  27.59              0   
3   59      84     172          157            90  24.79              5   
4   43      99     152           97            79  27.23              1   
5   35      98     177          139            94  34.29              4   
6   68      74     194          137            87  21.40              2   
7   30      65     154          155            90  20.45              3   
8   50      86     181          151            99  32.74              6   
9   54      93     151          104            98  19.74              3   

   smoking  alcohol_consumption  predicted_cholesterol_level  
0        0                    0                   284.887946  
1        1                    1                 

