In [1]:
# ================================================
# 1. Import Libraries
# ================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report




In [2]:
# ================================================
# 2. Load Dataset
# ================================================
df = pd.read_csv("/content/diabetes.csv")   # <-- change filename

print(df.head())   # preview
print(df.info())   # datatypes
print(df.describe())  # quick stats

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [4]:
# ================================================
# 3. Identify Features and Target
# ================================================
target_col = "Outcome"  # <-- change to your target
X = df.drop(columns=[target_col])
y = df[target_col]

In [5]:
# ================================================
# 4. Train-Test Split
# ================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() > 1 and y.value_counts().min() >= 2 else None
)

In [6]:
# ================================================
# 5. Preprocessing Pipelines
# ================================================
# Separate numerical and categorical columns
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object", "category"]).columns

# Numerical pipeline
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical pipeline
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine both
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features)
    ]
)

In [7]:
print(preprocessor)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index([], dtype='object'))])


In [8]:
# ================================================
# 6. Add Feature Selection
# ================================================
# Select top K features using ANOVA F-test
feature_selector = SelectKBest(score_func=f_classif, k=20)  # <-- adjust "k"
print(feature_selector)

# Fit the feature selector on the processed training data
# This requires X_train_processed to be generated by running the cell below this one first.
X_train_processed = preprocessor.fit_transform(X_train, y_train)
feature_selector.fit(X_train_processed, y_train)

all_features = preprocessor.get_feature_names_out()
selected_features = all_features[feature_selector.get_support()]

print("Selected features:")
print(selected_features)

SelectKBest(k=20)
Selected features:
['num__Pregnancies' 'num__Glucose' 'num__BloodPressure'
 'num__SkinThickness' 'num__Insulin' 'num__BMI'
 'num__DiabetesPedigreeFunction' 'num__Age']




In [9]:
# ================================================
# 7. Candidate Models
# ================================================
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier()
}

results = {}
best_pipelines = {}

for name, clf in models.items():
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("feature_select", feature_selector),   # <-- feature selection
        ("classifier", clf)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    best_pipelines[name] = pipe

    print(f"Model: {name}")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print("-"*50)



Model: Logistic Regression
Accuracy: 0.7337662337662337
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       100
           1       0.60      0.70      0.65        54

    accuracy                           0.73       154
   macro avg       0.71      0.73      0.72       154
weighted avg       0.75      0.73      0.74       154

--------------------------------------------------
Model: Random Forest
Accuracy: 0.7597402597402597
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       100
           1       0.68      0.59      0.63        54

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154

--------------------------------------------------




Model: Gradient Boosting
Accuracy: 0.7532467532467533
              precision    recall  f1-score   support

           0       0.79      0.84      0.82       100
           1       0.67      0.59      0.63        54

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.72       154
weighted avg       0.75      0.75      0.75       154

--------------------------------------------------
Model: SVM
Accuracy: 0.7532467532467533
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       100
           1       0.66      0.61      0.63        54

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.72       154
weighted avg       0.75      0.75      0.75       154

--------------------------------------------------
Model: KNN
Accuracy: 0.7012987012987013
              precision    recall  f1-score   support

           0       0.75      0.80      0.78       100
 



In [10]:
# ================================================
# 8. Pick Best Model
# ================================================
best_model_name = max(results, key=results.get)
best_pipe = best_pipelines[best_model_name]

print("Best Model:", best_model_name, "with Accuracy:", results[best_model_name])

Best Model: Random Forest with Accuracy: 0.7597402597402597


In [11]:
# ================================================
# 9. Hyperparameter Tuning on Best Model
# ================================================
if best_model_name == "Random Forest":
    param_grid = {
        "classifier__n_estimators": [100, 200, 300],
        "classifier__max_depth": [None, 5, 10, 20],
        "feature_select__k": [5, 10, 15]  # also tune feature selection
    }
elif best_model_name == "Logistic Regression":
    param_grid = {
        "classifier__C": [0.01, 0.1, 1, 10],
        "feature_select__k": [5, 10, 15]
    }
elif best_model_name == "SVM":
    param_grid = {
        "classifier__C": [0.1, 1, 10],
        "classifier__kernel": ["linear", "rbf"],
        "feature_select__k": [5, 10, 15]
    }
elif best_model_name == "Gradient Boosting":
    param_grid = {
        "classifier__n_estimators": [100, 200],
        "classifier__learning_rate": [0.01, 0.1, 0.2],
        "feature_select__k": [5, 10, 15]
    }
elif best_model_name == "KNN":
    param_grid = {
        "classifier__n_neighbors": [3, 5, 7, 9],
        "feature_select__k": [5, 10, 15]
    }

grid = GridSearchCV(
    estimator=best_pipe,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best CV Score:", grid.best_score_)
print("Test Accuracy:", grid.score(X_test, y_test))



Best Params: {'classifier__max_depth': 10, 'classifier__n_estimators': 100, 'feature_select__k': 10}
Best CV Score: 0.7768892443022791
Test Accuracy: 0.7532467532467533


In [12]:
import joblib
joblib.dump(best_pipe, "/content/best_pipeline.joblib")

['/content/best_pipeline.joblib']