In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import warnings

# Silence every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so deprecation and pandas
# copy warnings raised further down are hidden as well.
warnings.simplefilter("ignore")

# Load the diabetes dataset straight from the plotly datasets repository.
df = pd.read_csv(
    'https://github.com/plotly/datasets/raw/refs/heads/master/diabetes.csv'
)

def bmi_category(bmi):
    """Map a body-mass-index value to its weight category.

    The bands follow the standard adult BMI classification:

    * below 18.5          -> 'Underweight'
    * 18.5 to below 25    -> 'Normal'
    * 25 to below 30      -> 'Overweight'
    * 30 and above        -> 'Obese'

    Parameters
    ----------
    bmi : float
        Body-mass index in kg/m^2.

    Returns
    -------
    str
        One of 'Underweight', 'Normal', 'Overweight', 'Obese'.

    Notes
    -----
    A NaN input fails every ``<`` comparison and therefore falls through
    to 'Obese'; clean missing values before calling this function.
    """
    # The upper bounds are exclusive, so the cut-offs are 25 and 30 rather
    # than 24.9 and 29.9: with the latter, a BMI of exactly 24.9 (or 24.95)
    # would be labelled 'Overweight' and 29.9 would be labelled 'Obese'.
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

# Deriving a categorical BMI band from the numeric BMI column
df['BMI_category'] = df['BMI'].apply(bmi_category)

# Splitting dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Defining feature types
numeric_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']
categorical_features = ['BMI_category']

# Standardizing numeric features (the scaler is fitted on the training split
# only and then reused for the validation split).
# The scaled values go into new frames instead of being written back into
# train_df / val_df. The splits therefore keep their original measurements,
# so rows taken from them can be handed to code that applies `scaler` itself
# without being standardized twice, and no assignment is made into a frame
# that was sliced out of `df`.
scaler = StandardScaler()
train_numeric_df = pd.DataFrame(
    scaler.fit_transform(train_df[numeric_features]),
    columns=numeric_features,
)
val_numeric_df = pd.DataFrame(
    scaler.transform(val_df[numeric_features]),
    columns=numeric_features,
)

# One-hot encoding categorical features.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
# old keyword; try the current spelling first and fall back for old releases.
# handle_unknown='ignore' encodes a category that was absent from the
# training split as an all-zero row instead of raising during transform.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
train_encoded = encoder.fit_transform(train_df[categorical_features])
val_encoded = encoder.transform(val_df[categorical_features])

# Creating encoded DataFrames
encoded_columns = encoder.get_feature_names_out(categorical_features)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_columns)
val_encoded_df = pd.DataFrame(val_encoded, columns=encoded_columns)

# Combining numeric and categorical features (both parts carry a fresh
# 0..n-1 index, so the column-wise concat lines the rows up positionally)
X_train = pd.concat([train_numeric_df, train_encoded_df], axis=1)
X_val = pd.concat([val_numeric_df, val_encoded_df], axis=1)

y_train = train_df['Outcome']
y_val = val_df['Outcome']

# Training KNN model with different values of k.
# The fitted estimator that achieved the best validation F1 is remembered in
# best_knn; after the loop `knn` is merely the last candidate that was tried.
best_knn, best_knn_f1, best_k = None, 0, 0
for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    preds = knn.predict(X_val)
    f1 = f1_score(y_val, preds)
    print(f"K={k}, F1 Score: {f1}")
    # `best_knn is None` guarantees a winner is recorded even if every F1 is 0
    if best_knn is None or f1 > best_knn_f1:
        best_knn, best_knn_f1, best_k = knn, f1, k

print(f"Best K: {best_k}, Best F1 Score: {best_knn_f1}")

# Training Decision Tree model with different max_depth values.
# As above, best_dt keeps the best-scoring fitted tree.
best_dt, best_dt_f1, best_depth = None, 0, 0
for depth in [3, 5, 7]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    preds = dt.predict(X_val)
    f1 = f1_score(y_val, preds)
    print(f"max_depth={depth}, F1 Score: {f1}")
    if best_dt is None or f1 > best_dt_f1:
        best_dt, best_dt_f1, best_depth = dt, f1, depth

print(f"Best max_depth: {best_depth}, Best F1 Score: {best_dt_f1}")

# Selecting the best model: compare the two winners and keep the estimator
# that actually produced the winning score (not the last one fitted).
best_model = best_knn if best_knn_f1 > best_dt_f1 else best_dt

def inference(sample):
    """Predict the outcome for one or more samples with ``best_model``.

    Parameters
    ----------
    sample : pandas.DataFrame
        Rows containing the columns listed in ``numeric_features`` and
        ``categorical_features``. ``scaler.transform`` is applied here, so
        the numeric columns must not already be standardized.

    Returns
    -------
    The value returned by ``best_model.predict`` for the transformed rows.
    """
    # Build the frames with the same column names the models were fitted on,
    # so scikit-learn can validate feature names and order instead of
    # receiving unnamed (and duplicated) integer column labels.
    sample_numeric = pd.DataFrame(
        scaler.transform(sample[numeric_features]),
        columns=numeric_features,
    )
    sample_categorical = pd.DataFrame(
        encoder.transform(sample[categorical_features]),
        columns=encoder.get_feature_names_out(categorical_features),
    )
    # Both frames carry a fresh 0..n-1 index, so the concat aligns row by row.
    sample_transformed = pd.concat([sample_numeric, sample_categorical], axis=1)
    return best_model.predict(sample_transformed)

# Running inference on sample validation data.
# The rows are looked up in `df` by the validation split's index labels:
# `df` holds the measurements as loaded, and inference() applies the scaler
# itself, so passing it values that were already standardized would scale
# them a second time.
for i, row_label in enumerate(val_df.index[:5], start=1):
    sample = df.loc[[row_label]]
    print(f"Sample {i} Prediction: {inference(sample)}")


K=3, F1 Score: 0.5454545454545454
K=5, F1 Score: 0.6037735849056604
K=7, F1 Score: 0.6605504587155963
Best K: 7, Best F1 Score: 0.6605504587155963
max_depth=3, F1 Score: 0.6476190476190475
max_depth=5, F1 Score: 0.6379310344827586
max_depth=7, F1 Score: 0.5535714285714286
Best max_depth: 3, Best F1 Score: 0.6476190476190475
Sample 1 Prediction: [0]
Sample 2 Prediction: [0]
Sample 3 Prediction: [0]
Sample 4 Prediction: [0]
Sample 5 Prediction: [0]
