In [None]:
from sqlalchemy import create_engine
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np
from pathlib import Path
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE


db_url = 'postgresql://francis:1234@localhost/Ozempic_ML'
engine = create_engine(db_url)

query = "SELECT * FROM cardio_train"
df = pd.read_sql(query, engine)
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [2]:
# Basic feature adjustments
df['BMI'] = df['weight'] / ((df['height'] / 100) ** 2)  # Calculate BMI
df['bp_range'] = df['ap_hi'] - df['ap_lo']             # Blood pressure range
df['age_years'] = df['age'] // 365                    # Convert age from days to years

# Advanced features
df['bmi_category'] = pd.cut(df['BMI'], 
                             bins=[0, 18.5, 24.9, 29.9, float('inf')], 
                             labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

df['bp_category'] = pd.cut(df['bp_range'], 
                            bins=[-float('inf'), 40, 60, 80, float('inf')], 
                            labels=['Low', 'Normal', 'Elevated', 'Hypertensive'])

df['age_group'] = pd.cut(df['age_years'], 
                          bins=[0, 18, 45, 60, float('inf')], 
                          labels=['Child', 'Adult', 'Middle-aged', 'Senior'])

# Risk score
df['risk_score'] = (
    (df['BMI'] * 0.4) + 
    (df['cholesterol'] * 0.3) + 
    (df['gluc'] * 0.3) + 
    (df['smoke'] * 0.2) + 
    (df['alco'] * 0.2) + 
    (df['active'] * -0.2) +  # Negative weight for active lifestyle
    (df['cardio'] * 0.5)     # Strong weight for existing cardiovascular disease
)

# Feature for comorbidities 
df['comorbidity_flag'] = ((df['BMI'] >= 30) & (df['bp_range'] > 80)).astype(int)

# Interaction features
df['chol_gluc_interaction'] = df['cholesterol'] * df['gluc']  # Cholesterol and glucose interaction
df['smoke_alco_interaction'] = df['smoke'] * df['alco']      # Smoking and alcohol interaction

scaler = StandardScaler()
df[['BMI', 'ap_hi', 'ap_lo', 'bp_range', 'age_years', 'risk_score']] = scaler.fit_transform(
    df[['BMI', 'ap_hi', 'ap_lo', 'bp_range', 'age_years', 'risk_score']]
)

# Drop columns
df.drop(columns=['weight', 'height', 'age'], inplace=True)

df.head()


Unnamed: 0,id,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI,bp_range,age_years,bmi_category,bp_category,age_group,risk_score,comorbidity_flag,chol_gluc_interaction,smoke_alco_interaction
0,0,2,-0.122182,-0.088238,1,1,0,0,1,0,-0.917577,-0.009057,-0.4198,Normal,Low,Middle-aged,-1.064729,0,1,0
1,1,1,0.07261,-0.03518,3,1,0,0,1,1,1.210081,0.07377,0.31911,Obese,Normal,Middle-aged,1.385073,0,3,0
2,2,1,0.007679,-0.141297,3,1,0,0,0,1,-0.664652,0.115184,-0.272018,Normal,Normal,Middle-aged,-0.317701,0,3,0
3,3,2,0.137541,0.017879,1,1,0,0,1,1,0.18944,0.07377,-0.715364,Overweight,Normal,Middle-aged,0.181703,0,1,0
4,4,1,-0.187113,-0.194356,1,1,0,0,0,0,-0.746181,0.032357,-0.863146,Normal,Low,Middle-aged,-0.823959,0,1,0


In [3]:
# Step 1: Base Qualification Check (Glucose, Cholesterol, and BMI)
df['qualifies_for_ozempic'] = (
    (df['gluc'] >= 2) & 
    (df['cholesterol'] == 2) &  
    (df['BMI'] >= 30)
).astype(int)

# Step 2: Enhanced Qualification with Additional Risk Factors
# Check if smoking, drinking, or cardio conditions heighten the likelihood
df['qualifies_for_ozempic'] = df['qualifies_for_ozempic'] | (
    ((df['smoke'] == 1) | (df['alco'] == 1) | (df['cardio'] == 1)) &  
    (df['risk_score'] > 0.5)  
).astype(int)

# Step 3: Incorporate Advanced Features for Comorbidities
# If comorbidity (e.g., high BMI + hypertension) is present, increase qualification chances
df['qualifies_for_ozempic'] = df['qualifies_for_ozempic'] | df['comorbidity_flag']

# Ensure the column is binary (1: qualifies, 0: does not qualify)
df['qualifies_for_ozempic'] = df['qualifies_for_ozempic'].astype(int)


In [4]:

# Split data into features (X) and target (y)
y = df['qualifies_for_ozempic']
x = df.drop(columns=['qualifies_for_ozempic', 'id', 'gluc', 'cardio'])

In [5]:
# One-hot encode categorical variables
x = pd.get_dummies(x, drop_first=True)

In [6]:
print(df['qualifies_for_ozempic'].value_counts())


qualifies_for_ozempic
0    57436
1    12564
Name: count, dtype: int64


In [7]:
# Check the class distribution before SMOTE
print("Class distribution before SMOTE:")
print(y.value_counts())

# Apply SMOTE to address class imbalance
smote = SMOTE(random_state=1, k_neighbors=5)  # Use 5 neighbors for better resampling
x_resampled, y_resampled = smote.fit_resample(x, y)

# Check the class distribution after SMOTE
print("Class distribution after SMOTE:")
print(pd.Series(y_resampled).value_counts())


Class distribution before SMOTE:
qualifies_for_ozempic
0    57436
1    12564
Name: count, dtype: int64
Class distribution after SMOTE:
qualifies_for_ozempic
0    57436
1    57436
Name: count, dtype: int64


In [8]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled, y_resampled, test_size=0.2, random_state=1, stratify=y_resampled
)


In [9]:
# Define and train the model with hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10]
}
rf_model = GridSearchCV(
    RandomForestClassifier(random_state=1, class_weight='balanced'), 
    param_grid, 
    cv=3, 
    scoring='roc_auc'  # Focus on ROC AUC for healthcare relevance
)
rf_model.fit(x_train, y_train)


In [10]:
# Evaluate the model
y_pred = rf_model.best_estimator_.predict(x_test)

In [11]:
# Print evaluation metrics
print("\nBest Parameters:", rf_model.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Does Not Qualify', 'Qualifies']))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, rf_model.best_estimator_.predict_proba(x_test)[:, 1]))



Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}

Classification Report:
                  precision    recall  f1-score   support

Does Not Qualify       1.00      0.95      0.97     11487
       Qualifies       0.96      1.00      0.98     11488

        accuracy                           0.98     22975
       macro avg       0.98      0.98      0.98     22975
    weighted avg       0.98      0.98      0.98     22975


Confusion Matrix:
[[10956   531]
 [   32 11456]]

ROC AUC Score: 0.9972766689388246


In [12]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation to evaluate the model
scores = cross_val_score(rf_model.best_estimator_, x_resampled, y_resampled, cv=5, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", scores.mean())


Cross-validation scores: [0.95978237 0.97754081 0.9752764  0.97627753 0.97662575]
Mean cross-validation score: 0.9731005717491306
