# kaggle competition on diabetic classification

## 1. Import Libraries
Load required dependencies for data processing and modeling.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from lightgbm import LGBMClassifier
import xgboost as xgb


## 2. Load Data
Read training data and preview the structure.

In [3]:
train= 'train.csv'
test= 'test.csv'

df= pd.read_csv(train)

df.head(10)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0
5,5,42,1,100,4.4,6.4,5.3,25.5,0.84,111,...,Female,White,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
6,6,41,2,148,3.4,5.6,3.7,27.9,0.89,130,...,Female,White,Graduate,Lower-Middle,Current,Employed,0,0,0,1.0
7,7,51,3,102,4.0,7.3,5.5,27.1,0.83,125,...,Male,Asian,Highschool,Low,Never,Employed,1,0,0,1.0
8,8,34,2,44,2.7,7.0,7.9,22.6,0.81,120,...,Male,White,Highschool,Lower-Middle,Never,Employed,0,0,0,0.0
9,9,44,1,36,5.8,5.7,6.6,29.3,0.88,110,...,Male,Hispanic,Highschool,Middle,Never,Employed,1,0,0,1.0


## 3. Feature Engineering
Encode categorical variables using one-hot and ordinal encoding.

In [4]:
encoder= OrdinalEncoder()
ohe= OneHotEncoder(sparse_output=False, drop='first')

categorical_cols= [
    'ethnicity',
    'education_level',
    'income_level',
    'smoking_status',
    'employment_status'
]

encoded_array = ohe.fit_transform(df[categorical_cols])
encoded_cols= ohe.get_feature_names_out(categorical_cols)

encoded_df= pd.DataFrame(encoded_array, columns=encoded_cols,index=df.index)
df= pd.concat([df.drop(categorical_cols, axis=1), encoded_df], axis=1)


df['gender']= encoder.fit_transform(df[['gender']])

df.head(10)


Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,education_level_Postgraduate,income_level_Low,income_level_Lower-Middle,income_level_Middle,income_level_Upper-Middle,smoking_status_Former,smoking_status_Never,employment_status_Retired,employment_status_Student,employment_status_Unemployed
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
5,5,42,1,100,4.4,6.4,5.3,25.5,0.84,111,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
6,6,41,2,148,3.4,5.6,3.7,27.9,0.89,130,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,51,3,102,4.0,7.3,5.5,27.1,0.83,125,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,8,34,2,44,2.7,7.0,7.9,22.6,0.81,120,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,9,44,1,36,5.8,5.7,6.6,29.3,0.88,110,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


## 4. Model Training & Tuning
Train LGBMClassifier with hyperparameter optimization via RandomizedSearchCV.

In [10]:
x= df.drop(columns=['id','diagnosed_diabetes'])
y= df['diagnosed_diabetes']

X_train, X_test, y_train, y_test= train_test_split(x,y, test_size=0.2, random_state=42)

model= LGBMClassifier()

param_grid={
    'n_estimators': [700,800],
    'learning_rate': [0.05,0.1],
    'max_depth': [9,10],
    'min_child_weight':[1,2]
}

grid = RandomizedSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train,y_train)

print("best cv score", grid.best_score_)
print("best params:", grid.best_params_)

best_score= grid.best_estimator_
test_score_pred= best_score.predict(X_test)

test_score_pred = best_score.predict_proba(X_test)[:,1]
print("test ROC-AUC score:", roc_auc_score(y_test, test_score_pred))






[LightGBM] [Info] Number of positive: 279148, number of negative: 168852
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1652
[LightGBM] [Info] Number of data points in the train set: 448000, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623098 -> initscore=0.502720
[LightGBM] [Info] Start training from score 0.502720
[LightGBM] [Info] Number of positive: 279149, number of negative: 168851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1653
[LightGBM] [Info] Number of data points in the train set: 448000, number of used features: 35
[LightGBM] [In

## 5. Generate Submission
Prepare test data and create submission file with predictions.

In [8]:
test_df= pd.read_csv(test)
X_test_final = test_df.drop(columns=['id'])

cat_cols = X_test_final.select_dtypes(include='object').columns

# Fill missing with 'missing' if any
X_test_final[cat_cols] = X_test_final[cat_cols].fillna('missing')

# One-hot encode
X_test_final = pd.get_dummies(X_test_final, columns=cat_cols)

missing_cols = set(X_train.columns) - set(X_test_final.columns)
for c in missing_cols:
    X_test_final[c] = 0

X_test_final = X_test_final[X_train.columns]

submission_probs = best_score.predict_proba(X_test_final)[:,1]

submission = pd.DataFrame({
    'id': test_df['id'],
    'diagnosed_diabetes': submission_probs
})

submission.to_csv('submission.csv', index=False)




## 6. Model Comparison
Compare performance of different models on test set.

In [None]:
models= {
'RandomForest': RandomForestClassifier(n_estimators=300, max_depth=10),
'LightGBM': LGBMClassifier(n_estimators=300, learning_rate=0.05)
}

for name,model in models.items():
    model.fit(X_train,y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    print(f"{name}: {auc:.5f}")

