In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

full_data_path = '/workspaces/Health-Indicators/notebooks/diabetes_012_health_indicators_BRFSS2015.csv'
df_full = pd.read_csv(full_data_path)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
missing_values_full = df_full.isnull().sum()
print(missing_values_full)

Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


In [3]:
X = df_full.drop('Diabetes_012', axis=1)
y = df_full['Diabetes_012']
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X, y)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)


In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train_scaled, y_train)

In [9]:
from sklearn.metrics import roc_auc_score

y_pred_rf_proba = rf.predict_proba(X_test_scaled)


In [10]:
y_pred_rf_proba = rf.predict_proba(X_test_scaled)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf_proba, multi_class='ovr', average='weighted')
print(f'RandomForest ROC AUC: {roc_auc_rf}')


RandomForest ROC AUC: 0.9832095942650049


In [11]:
# Import XGBoost and train the model
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_scaled, y_train)

In [18]:
# Predict probabilities for the test set
y_pred_xgb_proba = xgb.predict_proba(X_test_scaled)


In [19]:
# Calculate the ROC AUC score using the predicted probabilities
roc_auc_xgb = roc_auc_score(y_test, y_pred_xgb_proba, multi_class='ovr', average='weighted')
print(f'XGBoost ROC AUC: {roc_auc_xgb}')


XGBoost ROC AUC: 0.9438301377471376


In [None]:
from joblib import dump

dump(rf, "random_forest_diabetes_model.joblib")
dump(xgb, "xgboost_diabetes_model.joblib")


['xgboost_diabetes_model.joblib']