In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from xgboost import XGBClassifier as XGBC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer as CTR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder , StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import  pandas as pd 


df_raw = pd.read_csv('../artifacts/raw.csv')

df = df_raw.sample(random_state=2, n= 175000)

gender_map = {'Female': 0, 'Male': 1, 'Other': 2}
ethnicity_map = {'Asian': 0, 'Black': 1, 'Hispanic': 2, 'White': 3, 'Other': 4}
education_level_map = {'No formal': 0, 'Highschool': 1, 'Graduate': 2, 'Postgraduate': 3}
income_level_map = {'Low': 0, 'Lower-Middle': 1, 'Middle': 2, 'Upper-Middle': 3, 'High': 4}
smoking_status_map = {'Never': 0, 'Former': 1, 'Current': 2}
employment_status_map = {'Unemployed': 0, 'Student': 1, 'Employed': 2, 'Retired': 3}

df['gender'] = df['gender'].map(gender_map)
df['ethnicity'] = df['ethnicity'].map(ethnicity_map)
df['education_level'] = df['education_level'].map(education_level_map)
df['income_level'] = df['income_level'].map(income_level_map)
df['smoking_status'] = df['smoking_status'].map(smoking_status_map)
df['employment_status'] = df['employment_status'].map(employment_status_map)


In [2]:
X = df.drop(columns=['id','diagnosed_diabetes'], axis=1)
y = df['diagnosed_diabetes']

X_train , X_test , y_train , y_test = train_test_split(X,y, test_size=0.2, random_state=42)

num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

num_pipeline = Pipeline([('imputer',SimpleImputer(strategy='mean')),
                         ('scaler', StandardScaler())])
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                         ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output= False))])
preprocessing = CTR([('num', num_pipeline, num_cols),
                     ('cat', cat_pipeline, cat_cols)])

lr =XGBC()
model = Pipeline([('pre', preprocessing),
                  ('lr', lr)])


model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

print(accuracy_score(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))
print(precision_score(y_pred, y_test))
print(recall_score(y_pred, y_test))

0.7239285714285715
0.6728285714285714
0.6728285714285714
[[ 5424  3699]
 [ 7752 18125]]
0.8305076979472141
0.700428952351509


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from xgboost import XGBClassifier

X = df.drop(columns=['id', 'diagnosed_diabetes'])
y = df['diagnosed_diabetes']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessing = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

model = Pipeline([
    ('pre', preprocessing),
    ('lr', XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss'
    ))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

pre = model.named_steps['pre']
xgb_model = model.named_steps['lr']

feature_names = pre.get_feature_names_out()

importance = pd.Series(
    xgb_model.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

importance_df = importance.reset_index()
importance_df.columns = ['feature', 'importance']

importance_df['base_feature'] = (
    importance_df['feature']
)

final_importance = (
    importance_df
    .groupby('base_feature')['importance']
    .sum()
    .sort_values(ascending=False)
)

print("\nTop contributing ORIGINAL features:\n")
print(final_importance.head(10))


Train accuracy: 0.6866
Test accuracy: 0.6731714285714285
Accuracy: 0.6731714285714285
Confusion matrix:
 [[ 4982  8194]
 [ 3245 18579]]
Precision: 0.6939453927464236
Recall: 0.8513104838709677

Top contributing ORIGINAL features:

base_feature
num__family_history_diabetes               0.670452
num__physical_activity_minutes_per_week    0.058064
num__age                                   0.056363
num__bmi                                   0.021675
num__triglycerides                         0.020716
num__ldl_cholesterol                       0.017458
num__systolic_bp                           0.015489
num__diet_score                            0.011872
num__waist_to_hip_ratio                    0.011267
num__hdl_cholesterol                       0.011228
Name: importance, dtype: float32


In [4]:
df.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [5]:
keep_cols = [
    'family_history_diabetes',
    'physical_activity_minutes_per_week',
    'age',
    'bmi',
    'triglycerides',
    'ldl_cholesterol',
    'systolic_bp',
    'diet_score',
    'waist_to_hip_ratio',
    'hdl_cholesterol',
    'diagnosed_diabetes'
]
df = df[keep_cols]


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from xgboost import XGBClassifier

X = df.drop(columns=['diagnosed_diabetes'])
y = df['diagnosed_diabetes']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessing = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

model = Pipeline([
    ('pre', preprocessing),
    ('lr', XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss'
    ))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

pre = model.named_steps['pre']
xgb_model = model.named_steps['lr']

feature_names = pre.get_feature_names_out()

importance = pd.Series(
    xgb_model.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

importance_df = importance.reset_index()
importance_df.columns = ['feature', 'importance']

importance_df['base_feature'] = (
    importance_df['feature']
)

final_importance = (
    importance_df
    .groupby('base_feature')['importance']
    .sum()
    .sort_values(ascending=False)
)

print("\nTop contributing ORIGINAL features:\n")
print(final_importance.head(10))


Train accuracy: 0.6850785714285714
Test accuracy: 0.6756857142857143
Accuracy: 0.6756857142857143
Confusion matrix:
 [[ 5082  8094]
 [ 3257 18567]]
Precision: 0.696410487228536
Recall: 0.8507606304985337

Top contributing ORIGINAL features:

base_feature
num__family_history_diabetes               0.751458
num__physical_activity_minutes_per_week    0.070504
num__age                                   0.063630
num__bmi                                   0.023506
num__triglycerides                         0.021675
num__ldl_cholesterol                       0.018135
num__systolic_bp                           0.014173
num__diet_score                            0.013000
num__hdl_cholesterol                       0.011993
num__waist_to_hip_ratio                    0.011924
Name: importance, dtype: float32


In [10]:
df.describe

<bound method NDFrame.describe of         family_history_diabetes  physical_activity_minutes_per_week  age  \
541241                        0                                  51   28   
27812                         0                                  52   63   
36028                         0                                  44   47   
341525                        0                                  21   32   
548183                        1                                  49   59   
...                         ...                                 ...  ...   
306307                        0                                  74   40   
214761                        0                                  76   34   
176542                        1                                 102   64   
466533                        1                                  80   51   
495212                        0                                 308   45   

         bmi  triglycerides  ldl_cholesterol  systoli