In [53]:
# Standard libraries
import os
import glob
import numpy as np
from numpy import asarray
import cv2
from PIL import Image
from matplotlib import pyplot as plt
import seaborn as sns

# Scikit-learn libraries for preprocessing, modeling, and evaluation
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# TensorFlow/Keras libraries for building and training models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical


In [35]:
import os
from pathlib import Path

import pandas as pd

# Locate the Pima Indians Diabetes CSV without baking one user's profile
# directory into the notebook. The default is the same kagglehub cache layout
# as before, but rooted at the current user's home directory; set the
# DIABETES_CSV environment variable to read a copy stored somewhere else.
_default_diabetes_csv = (
    Path.home() / ".cache" / "kagglehub" / "datasets" / "uciml"
    / "pima-indians-diabetes-database" / "versions" / "1" / "diabetes.csv"
)
diabetes_csv_path = Path(os.environ.get("DIABETES_CSV", _default_diabetes_csv))

df = pd.read_csv(diabetes_csv_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [54]:
import joblib

# Create BMI_category based on BMI values
def categorize_bmi(bmi):
    """Map a BMI value to a weight-status label.

    The bands are contiguous half-open intervals, so every number falls into
    exactly one of them:

        bmi < 18.5          -> 'Underweight'
        18.5 <= bmi < 25    -> 'Normal'
        25   <= bmi < 30    -> 'Overweight'
        bmi >= 30           -> 'Obese'

    Earlier upper bounds of 24.9 and 29.9 left gaps between bands, which made
    values such as 24.9 or 29.9 fall through to 'Obese'.

    Parameters
    ----------
    bmi : int or float
        Body-mass index value.

    Returns
    -------
    str
        One of 'Underweight', 'Normal', 'Overweight', 'Obese'. A NaN input
        fails every comparison and therefore yields 'Obese'.
    """
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

# Derive the BMI band for every row (this adds a column to df in place)
df['BMI_category'] = df['BMI'].map(categorize_bmi)

# The label is the 'Outcome' column; every other column is a candidate feature
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns that are standardised vs. one-hot encoded
numeric_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
categorical_features = ['BMI_category']

# Transformers applied to each column group.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising at predict time.
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Preprocessing step shared by every model pipeline below.
# ColumnTransformer comes from sklearn.compose and has to be imported with the
# other scikit-learn names at the top of the notebook; without that import this
# statement raises NameError on a fresh kernel.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features),
        ('cat', encoder, categorical_features),
    ],
    # Same as the default, spelled out: any column of X that is not listed
    # above is discarded rather than passed through to the model.
    remainder='drop',
)

# Create and fit a pipeline for KNN Classifier (will be repeated for DecisionTree)
def knn_model(k):
    """Build and fit a preprocessing + k-nearest-neighbours pipeline.

    The pipeline is trained on the module-level X_train / y_train and uses the
    module-level `preprocessor` object directly as its first step.

    Parameters
    ----------
    k : int
        Number of neighbours passed to KNeighborsClassifier.

    Returns
    -------
    Pipeline
        The fitted pipeline with steps 'preprocessor' and 'knn'.
    """
    pipeline_steps = [
        ('preprocessor', preprocessor),
        ('knn', KNeighborsClassifier(n_neighbors=k)),
    ]
    fitted_pipeline = Pipeline(steps=pipeline_steps)
    fitted_pipeline.fit(X_train, y_train)
    return fitted_pipeline

# Evaluate different KNN models on the validation split and keep the best k.
# The running best starts below the lowest possible F1 (0.0) so the first
# candidate is always accepted. Starting at 0 with a strict '>' would leave
# best_k at the invalid value 0 whenever every candidate scores exactly 0.0.
best_f1 = -1.0
best_k = None
for k in [3, 5, 7]:
    model = knn_model(k)
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    print(f"K={k}, F1 Score={f1:.4f}")
    if f1 > best_f1:  # strict '>' keeps the earlier candidate on ties
        best_f1 = f1
        best_k = k

# Build and evaluate Decision Tree model
def decision_tree_model(max_depth, random_state=42):
    """Build and fit a preprocessing + decision-tree pipeline.

    The pipeline is trained on the module-level X_train / y_train and uses the
    module-level `preprocessor` object directly as its first step.

    Parameters
    ----------
    max_depth : int
        Maximum depth passed to DecisionTreeClassifier.
    random_state : int, optional
        Seed passed to DecisionTreeClassifier. It is fixed by default so that
        calling this function again with the same max_depth (as is done when
        the selected model is refit before saving) rebuilds the same tree
        instead of one that may differ where candidate splits tie.

    Returns
    -------
    Pipeline
        The fitted pipeline with steps 'preprocessor' and 'tree'.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    model = Pipeline(steps=[('preprocessor', preprocessor), ('tree', tree)])
    model.fit(X_train, y_train)
    return model

# Evaluate different tree depths on the validation split and keep the best one.
# The running best starts below the lowest possible F1 (0.0) so the first
# candidate is always accepted. Starting at 0 with a strict '>' would leave
# best_depth at the invalid value 0 whenever every candidate scores exactly 0.0.
best_f1_tree = -1.0
best_depth = None
for depth in [3, 5, 7]:
    model = decision_tree_model(depth)
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    print(f"Max Depth={depth}, F1 Score={f1:.4f}")
    if f1 > best_f1_tree:  # strict '>' keeps the earlier candidate on ties
        best_f1_tree = f1
        best_depth = depth

# Refit one pipeline per model family with the hyperparameter selected above
best_knn_model = knn_model(best_k)
best_tree_model = decision_tree_model(best_depth)

# Persist the preprocessor and both fitted pipelines with joblib.
# Each pipeline already carries the preprocessor as its first step; the
# standalone preprocessor file is written in addition to that.
for _filename, _artifact in (
    ('preprocessor.pkl', preprocessor),
    ('best_knn_model.pkl', best_knn_model),
    ('best_tree_model.pkl', best_tree_model),
):
    joblib.dump(_artifact, _filename)

print("Best KNN model saved with k=", best_k)
print("Best Decision Tree model saved with max_depth=", best_depth)


K=3, F1 Score=0.6182
K=5, F1 Score=0.5556
K=7, F1 Score=0.5849
Max Depth=3, F1 Score=0.6476
Max Depth=5, F1 Score=0.6931
Max Depth=7, F1 Score=0.6609
Best KNN model saved with k= 3
Best Decision Tree model saved with max_depth= 5
