# Optimización de hiperparámetros con Optuna



XGBoost

In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE


from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import optuna
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import joblib



In [64]:
from xgboost import XGBClassifier

# Load the scaler object stored in the joblib file.
scaler = joblib.load('scaler_model.joblib')

# Create an empty classifier, then populate it from the saved XGBoost model file.
loaded_model = XGBClassifier()
loaded_model.load_model("xgboost_stroke_model_final.bin")

In [66]:
# Read the stroke dataset from disk.
df = pd.read_csv('Data/stroke_dataset.csv')

# One-hot encode the categorical columns; each one becomes a group of
# "<column>_<value>" indicator columns.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df = pd.get_dummies(df, columns=categorical_columns)

# Feature list: the numeric/binary columns first, followed by every indicator
# column produced by the encoding step above (in DataFrame column order).
numeric_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
dummy_prefixes = tuple(f"{name}_" for name in categorical_columns)
dummy_features = [col for col in df.columns if col.startswith(dummy_prefixes)]
features = numeric_features + dummy_features

y = df['stroke']
X = df[features]

# 80/20 split, stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Apply the previously loaded scaler to both splits (transform only, no refit).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [67]:
# Columns passed to the loaded model; the list order is the column order of
# the filtered frames below.
important_features = [
    "age", "smoking_status_never smoked", "hypertension",
    "work_type_Private", "Residence_type_Rural", "heart_disease",
]

# Keep only those columns in the train and test splits.
X_train_important = X_train.loc[:, important_features]
X_test_important = X_test.loc[:, important_features]

In [68]:
# Display the reduced training frame as this cell's output.
X_train_important

Unnamed: 0,age,smoking_status_never smoked,hypertension,work_type_Private,Residence_type_Rural,heart_disease
4428,24.0,False,0,True,True,0
1135,3.0,False,0,False,True,0
2417,69.0,False,0,True,False,0
1173,46.0,True,0,True,False,0
3696,52.0,True,1,True,True,0
...,...,...,...,...,...,...
4049,29.0,True,0,True,False,0
1053,56.0,False,0,True,False,0
3526,53.0,False,0,True,True,0
3051,44.0,True,0,True,True,0


In [69]:
# Column 1 of predict_proba: the predicted probability of class 1 per test row.
y_prob_xgb = loaded_model.predict_proba(X_test_important)[:, 1]

# Decision threshold applied to that probability; adjust as needed.
new_threshold = 0.165

# A row is labelled 1 when its probability reaches the threshold, otherwise 0.
y_pred_loaded_model = np.greater_equal(y_prob_xgb, new_threshold).astype(int)

# Accuracy of the thresholded predictions on the test split.
accuracy = accuracy_score(y_test, y_pred_loaded_model)
print(f"Accuracy del modelo cargado: {accuracy:.4f}")

# Per-class precision / recall / F1 for the same predictions.
print(
    "\nClassification Report del modelo cargado:",
    classification_report(y_test, y_pred_loaded_model),
    sep="\n",
)



Accuracy del modelo cargado: 0.9137

Classification Report del modelo cargado:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       947
           1       0.26      0.40      0.32        50

    accuracy                           0.91       997
   macro avg       0.62      0.67      0.64       997
weighted avg       0.93      0.91      0.92       997



# Lo importante: predicción para un nuevo paciente

In [61]:
def transform_input_data(input_data):
    """Convert one raw patient record into a one-row, one-hot encoded DataFrame.

    Parameters
    ----------
    input_data : mapping
        Must provide the keys 'age', 'hypertension', 'heart_disease',
        'avg_glucose_level' and 'bmi' (copied through unchanged) and the
        categorical keys 'gender', 'ever_married', 'work_type',
        'Residence_type' and 'smoking_status' (one-hot encoded).

    Returns
    -------
    pandas.DataFrame
        A single row that always has the same 19 columns in the same order:
        the five pass-through columns followed by the indicator columns
        "<field>_<level>" for every level listed below.

    Raises
    ------
    KeyError
        If any of the required keys is missing from ``input_data``.

    Notes
    -----
    A categorical value that has no indicator column (for example a level not
    listed below, or a differently-cased spelling) leaves every indicator of
    that field at 0 and emits a ``UserWarning``. Previously such a value
    silently appended an extra column, so the output schema depended on the
    input.
    """
    import warnings

    passthrough_columns = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

    # Known levels per categorical field; the order here fixes the column order.
    categorical_levels = {
        'gender': ['Female', 'Male'],
        'ever_married': ['No', 'Yes'],
        'work_type': ['Govt_job', 'Private', 'Self-employed', 'children'],
        'Residence_type': ['Rural', 'Urban'],
        'smoking_status': ['Unknown', 'formerly smoked', 'never smoked', 'smokes'],
    }

    # Pass-through values first, so they lead the column order.
    data = {col: input_data[col] for col in passthrough_columns}

    for field, levels in categorical_levels.items():
        value = input_data[field]

        # Start the whole group at 0, then switch on the matching indicator.
        for level in levels:
            data[f"{field}_{level}"] = 0

        if value in levels:
            data[f"{field}_{value}"] = 1
        else:
            # Keep the schema fixed instead of creating a new column.
            warnings.warn(
                f"Unknown {field} value {value!r}; "
                f"all {field}_* indicators are left at 0.",
                stacklevel=2,
            )

    return pd.DataFrame([data])


In [81]:
# Example input data
sample_input_data = {
    'gender': 'Male',
    'age': 44.0,
    'hypertension': 0,
    'heart_disease': 0,
    'ever_married': 'Yes',
    'work_type': 'Private',
    'Residence_type': 'Urban',
    'avg_glucose_level': 105.5,
    'bmi': 60.6,
    'smoking_status': 'never smoked'
}

# Transform the data
transformed_df = transform_input_data(sample_input_data)


In [82]:
# Columns handed to the model, in the order they are passed.
important_columns = [
    'age',
    'smoking_status_never smoked',
    'hypertension',
    'work_type_Private',
    'Residence_type_Rural',
    'heart_disease',
]

# Reduce the encoded record to those columns.
selected_df = transformed_df.loc[:, important_columns]
X_input = selected_df

# Column 1 of predict_proba: the predicted probability of class 1.
y_prob_xgb_optimized = loaded_model.predict_proba(X_input)[:, 1]

# Decision threshold applied to that probability; adjust as needed.
new_threshold = 0.165

# 1 when the probability reaches the threshold, otherwise 0.
y_pred_xgb_adjusted = (y_prob_xgb_optimized >= new_threshold).astype(int)

# Raw arrays first: predicted class, then probability.
print("\nPredicted class (0: No stroke, 1: Stroke):")
print(y_pred_xgb_adjusted)

print("Probability of stroke:")
print(y_prob_xgb_optimized)

# Human-readable interpretation of the single prediction.
risk_message = (
    "\nThe model predicts a high risk of stroke."
    if y_pred_xgb_adjusted[0] == 1
    else "\nThe model predicts a low risk of stroke."
)
print(risk_message)

print(f"\nPredicted probability: {y_prob_xgb_optimized[0]:.4f}")
print(f"Threshold used: {new_threshold}")


Predicted class (0: No stroke, 1: Stroke):
[0]
Probability of stroke:
[0.00941098]

The model predicts a low risk of stroke.

Predicted probability: 0.0094
Threshold used: 0.165


In [5]:
# Print the version string of the installed xgboost package.
import xgboost as xgb
print(xgb.__version__)

1.6.2
