In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils import resample
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import joblib
import os

# 1. Load dataset

In [2]:
df = pd.read_csv('Diabetes_Final_Data_V2.csv')

# 2. Clean column names (remove spaces etc.)

In [4]:
df.columns = df.columns.str.strip()

In [6]:
df.head()

Unnamed: 0,age,gender,pulse_rate,systolic_bp,diastolic_bp,glucose,height,weight,bmi,family_diabetes,hypertensive,family_hypertension,cardiovascular_disease,stroke,diabetic
0,42,Female,66,110,73,5.88,1.65,70.2,25.75,0,0,0,0,0,No
1,35,Female,60,125,68,5.71,1.47,42.5,19.58,0,0,0,0,0,No
2,62,Female,57,127,74,6.85,1.52,47.0,20.24,0,0,0,0,0,No
3,73,Male,55,193,112,6.28,1.63,57.4,21.72,0,0,0,0,0,No
4,68,Female,71,150,81,5.71,1.42,36.0,17.79,0,0,0,0,0,No


# 3. Define categorical and numerical columns

In [8]:
categorical_features = [
    "gender", "family_diabetes", "hypertensive",
    "family_hypertension", "cardiovascular_disease",
    "stroke", "age_group"
]
target_col = 'diabetic'
numerical_features = [col for col in df.columns if col not in categorical_features + [target_col]]


# 4. Handle age_group if not already categorized


In [10]:
if df['age_group'].isnull().any():
    df['age_group'] = pd.cut(
        df['age'],
        bins=[0, 30, 45, 60, 100],
        labels=['Young', 'Mid', 'Senior', 'Elderly']
    ).astype(str)


KeyError: 'age_group'

# 5. Balance the dataset

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils import resample
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import joblib
import os

# 1. Load dataset
df = pd.read_csv('Diabetes_Final_Data_V2.csv')  # Replace with your file path

# 2. Clean column names (remove spaces etc.)
df.columns = df.columns.str.strip()

# 3. Define categorical and numerical columns
categorical_features = [
    "gender", "family_diabetes", "hypertensive",
    "family_hypertension", "cardiovascular_disease",
    "stroke", "age_group"
]
target_col = 'diabetic'  # Adjust if yours is different

numerical_features = [col for col in df.columns if col not in categorical_features + [target_col]]

# 4. Handle age_group if not already categorized

# 5. Balance the dataset
df_majority = df[df[target_col] == 0]
df_minority = df[df[target_col] == 1]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)
balanced_df = pd.concat([df_majority, df_minority_upsampled])

# 6. Split features and label
X = balanced_df.drop(columns=target_col)
y = balanced_df[target_col]

# 7. Preprocessing: ColumnTransformer
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('num', scaler, numerical_features),
    ('cat', encoder, categorical_features)
])

# Fit the preprocessor
preprocessor.fit(X)

# Save preprocessor and feature list
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(X.columns.tolist(), 'feature_order.pkl')

# Transform features
X_transformed = preprocessor.transform(X)

# 8. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Create models directory
os.makedirs('models', exist_ok=True)

# 9. Train and Save: Logistic Regression
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
joblib.dump(lr, 'models/lr.pkl')

# 10. Train and Save: Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
joblib.dump(rf, 'models/rf.pkl')

# 11. Train and Save: XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
joblib.dump(xgb, 'models/xgb.pkl')

# 12. Train and Save: LightGBM
lgb = LGBMClassifier(random_state=42)
lgb.fit(X_train, y_train)
joblib.dump(lgb, 'models/lgb.pkl')

# 13. Train and Save: Neural Network
nn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
nn.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
nn.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)
nn.save('models/nn.h5')

print("âœ… All models and the preprocessor saved successfully.")


InvalidParameterError: The 'n_samples' parameter of resample must be an int in the range [1, inf) or None. Got 0 instead.