In [8]:


#THE ML_MODEL IS BEING USED TO SAVE A .PKL FILE


import pickle
import numpy as np
from pathlib import Path
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd
from sqlalchemy import create_engine

db_url = 'postgresql://francis:1234@localhost/Ozempic_ML'
engine = create_engine(db_url)


query = "SELECT * FROM cardio_train"
df = pd.read_sql(query, engine)
df = df.drop(columns=['id'])  

df['BMI'] = df['weight'] / ((df['height'] / 100) ** 2)  
df['bp_range'] = df['ap_hi'] - df['ap_lo']  
df['age_years'] = df['age'] // 365  

df['bmi_category'] = pd.cut(df['BMI'],
                             bins=[0, 18.5, 24.9, 29.9, float('inf')],
                             labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

df['bp_category'] = pd.cut(df['bp_range'],
                            bins=[-float('inf'), 40, 60, 80, float('inf')],
                            labels=['Low', 'Normal', 'Elevated', 'Hypertensive'])

df['age_group'] = pd.cut(df['age_years'],
                          bins=[0, 18, 45, 60, float('inf')],
                          labels=['Child', 'Adult', 'Middle-aged', 'Senior'])


df['risk_score'] = (
    (df['BMI'] * 0.4) +
    (df['cholesterol'] * 0.3) +
    (df['gluc'] * 0.3) +
    (df['smoke'] * 0.2) +
    (df['alco'] * 0.2) +
    (df['active'] * -0.2) +  
    (df['cardio'] * 0.5)    
)


df['comorbidity_flag'] = ((df['BMI'] >= 30) & (df['bp_range'] > 80)).astype(int)


df['chol_gluc_interaction'] = df['cholesterol'] * df['gluc']  
df['smoke_alco_interaction'] = df['smoke'] * df['alco']     

df.drop(columns=['weight', 'height', 'age'], inplace=True)

df['qualifies_for_ozempic'] = (
    (df['gluc'] >= 1) & 
    (df['cholesterol'] == 1) &  
    (df['BMI'] >= 25)
).astype(int)

df['qualifies_for_ozempic'] = df['qualifies_for_ozempic'] | (
    ((df['smoke'] == 1) | (df['alco'] == 1) | (df['cardio'] == 1)) &  
    (df['risk_score'] > 0.5)  
).astype(int)

df['qualifies_for_ozempic'] = df['qualifies_for_ozempic'] | df['comorbidity_flag']
df['qualifies_for_ozempic'] = df['qualifies_for_ozempic'].astype(int)

y = df['qualifies_for_ozempic']
x = df.drop(columns=['qualifies_for_ozempic', 'gluc', 'cardio'])


x = pd.get_dummies(x, drop_first=True)


smote = SMOTE(random_state=1, k_neighbors=5)
x_resampled, y_resampled = smote.fit_resample(x, y)


x_train, x_test, y_train, y_test = train_test_split(
    x_resampled, y_resampled, test_size=0.2, random_state=1, stratify=y_resampled
)


param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf_model = GridSearchCV(
    RandomForestClassifier(random_state=1, class_weight='balanced'), 
    param_grid, 
    cv=3, 
    scoring='roc_auc'
)
rf_model.fit(x_train, y_train)

pickle_filename = 'optimized_model.pkl'
with open(pickle_filename, 'wb') as file:
    pickle.dump(rf_model, file)


y_pred = rf_model.best_estimator_.predict(x_test)
print(classification_report(y_test, y_pred))

model_filename = 'optimized_model.pkl'
model_data = Path(model_filename).read_bytes()



              precision    recall  f1-score   support

           0       0.95      0.97      0.96     10548
           1       0.97      0.95      0.96     10549

    accuracy                           0.96     21097
   macro avg       0.96      0.96      0.96     21097
weighted avg       0.96      0.96      0.96     21097



In [9]:
df

Unnamed: 0,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI,bp_range,age_years,bmi_category,bp_category,age_group,risk_score,comorbidity_flag,chol_gluc_interaction,smoke_alco_interaction,qualifies_for_ozempic
0,2,110,80,1,1,0,0,1,0,21.967120,30,50,Normal,Low,Middle-aged,9.186848,0,1,0,0
1,1,140,90,3,1,0,0,1,1,34.927679,50,55,Obese,Normal,Middle-aged,15.471072,0,3,0,1
2,1,130,70,3,1,0,0,0,1,23.507805,60,51,Normal,Normal,Middle-aged,11.103122,0,3,0,1
3,2,150,100,1,1,0,0,1,1,28.710479,50,48,Overweight,Normal,Middle-aged,12.384192,0,1,0,1
4,1,100,60,1,1,0,0,0,0,23.011177,40,47,Normal,Low,Middle-aged,9.804471,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,2,120,80,1,1,1,0,1,0,26.927438,40,52,Overweight,Low,Middle-aged,11.370975,0,1,0,1
69996,1,140,90,2,2,0,0,1,1,50.472681,50,61,Obese,Normal,Senior,21.689072,0,4,0,1
69997,2,180,90,3,1,0,1,0,1,31.353579,90,52,Obese,Hypertensive,Middle-aged,14.441432,1,3,0,1
69998,1,135,80,1,2,0,0,0,1,27.099251,55,61,Overweight,Normal,Senior,12.239700,0,2,0,1


In [10]:
feature_importances = rf_model.best_estimator_.feature_importances_

for feature, importance in zip(x.columns, feature_importances):
    print(f"Feature: {feature}, Importance: {importance}")


Feature: gender, Importance: 0.010958838000598608
Feature: ap_hi, Importance: 0.05257292277034961
Feature: ap_lo, Importance: 0.025810389127561638
Feature: cholesterol, Importance: 0.08751862574639677
Feature: smoke, Importance: 0.04704621017949272
Feature: alco, Importance: 0.01882693169920689
Feature: active, Importance: 0.009531026606727683
Feature: BMI, Importance: 0.1835198416001341
Feature: bp_range, Importance: 0.020387925050876377
Feature: age_years, Importance: 0.025796825634832255
Feature: risk_score, Importance: 0.28315797875957993
Feature: comorbidity_flag, Importance: 0.00017229526758374522
Feature: chol_gluc_interaction, Importance: 0.06643951926111102
Feature: smoke_alco_interaction, Importance: 0.003278666118482975
Feature: bmi_category_Normal, Importance: 0.0944157437269239
Feature: bmi_category_Overweight, Importance: 0.034422566213970475
Feature: bmi_category_Obese, Importance: 0.01833538981214082
Feature: bp_category_Normal, Importance: 0.005361798552149034
Feature: