In [47]:
import pandas as pd
import zipfile

zip_path = "./data_processed.zip"

with zipfile.ZipFile(zip_path, 'r') as z:
    X_train = pd.read_csv(z.open('X_train_std.csv'))
    Y_train = pd.read_csv(z.open('Y_train.csv'))
    X_test  = pd.read_csv(z.open('X_test_std.csv'))
    Y_test  = pd.read_csv(z.open('Y_test.csv'))

unique_values = Y_train['diabetes_stage'].unique()
print("Unique values:", unique_values)
print(Y_train['diabetes_stage'].value_counts())

Unique values: ['Type 2' 'Pre-Diabetes' 'No Diabetes' 'Gestational' 'Type 1']
diabetes_stage
Type 2          44916
Pre-Diabetes    23740
No Diabetes      6044
Gestational       209
Type 1             91
Name: count, dtype: int64


In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns (dtype == object)
categorical_cols = X_train.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

# Apply one-hot encoding only to categorical columns
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure train and test have the same columns
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

le = LabelEncoder()

# Fit on training labels and transform both train/test
Y_train_encoded = le.fit_transform(Y_train.values.ravel())
Y_test_encoded = le.transform(Y_test.values.ravel())

Categorical columns: Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'employment_status', 'smoking_status'],
      dtype='object')


In [36]:
import time
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.base import clone
from itertools import combinations

def run_grid_search_classification(model_pipeline, param_grid, X_train, y_train, X_test, y_test, cv=5, scoring='accuracy'):
    """
    Run GridSearchCV for SVM or Naive Bayes classification.

    Args:
        model_pipeline : sklearn pipeline or estimator
        param_grid : dict of parameters for GridSearchCV
        X_train, y_train, X_test, y_test : training and test data
        cv : number of cross-validation folds
        scoring : metric for GridSearchCV ('accuracy', 'f1_macro', etc.)
        doJaccard : only works if pipeline has a linear model with coef_ attribute

    Returns:
        best_model, best_params
    """
    start_time = time.time()

    # Grid Search
    grid_search = GridSearchCV(
        estimator=model_pipeline,
        param_grid=param_grid,
        scoring=scoring,
        refit=True,
        cv=cv,
        n_jobs=-1,
        verbose=2
    )

    # Fit GridSearch
    grid_search.fit(X_train, y_train.ravel() if len(y_train.shape) > 1 else y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # CV results
    cv_results = grid_search.cv_results_
    best_index = grid_search.best_index_
    mean_score = cv_results['mean_test_score'][best_index]
    std_score = cv_results['std_test_score'][best_index]

    # Evaluate on test set
    y_test_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("\nClassification report on test set:\n")
    print(classification_report(y_test, y_test_pred))

    elapsed_time = time.time() - start_time

    print("Best params:", best_params)
    print(f"CV {scoring}: {mean_score:.4f} ± {std_score:.4f}")
    print(f"Test accuracy: {test_accuracy:.4f}")
    print(f"Elapsed time: {elapsed_time:.2f} s")

    return best_model, best_params


In [50]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipe_svm = Pipeline([
    ('svc', SVC(class_weight='balanced'))
])

param_grid_svm = {
    'svc__kernel': ['linear', 'rbf'],
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 'auto']
}

best_svm, best_params_svm = run_grid_search_classification(
    pipe_svm, param_grid_svm, X_train_encoded, Y_train_encoded, X_test_encoded, Y_test_encoded
)


Fitting 5 folds for each of 12 candidates, totalling 60 fits

Classification report on test set:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        69
           1       0.41      0.64      0.50      1937
           2       0.59      0.70      0.64      8105
           3       0.00      0.00      0.00        31
           4       0.91      0.75      0.82     14858

    accuracy                           0.72     25000
   macro avg       0.38      0.42      0.39     25000
weighted avg       0.76      0.72      0.73     25000

Best params: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
CV accuracy: 0.7205 ± 0.0041
Test accuracy: 0.7205
Elapsed time: 12064.20 s


In [48]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Define the pipeline with fixed parameters (no grid search)
pipe_svm = Pipeline([
    ('svm', SVC(
        kernel='rbf',
        C=10,
        gamma='scale',
        class_weight='balanced'
    ))
])

# Fit the model
pipe_svm.fit(X_train_encoded, Y_train_encoded)

# Predict
y_pred = pipe_svm.predict(X_test_encoded)

# Evaluate
print("\nClassification report:\n")
print(classification_report(Y_test_encoded, y_pred))
print("Test accuracy:", accuracy_score(Y_test_encoded, y_pred))



Classification report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        69
           1       0.41      0.64      0.50      1937
           2       0.59      0.70      0.64      8105
           3       0.00      0.00      0.00        31
           4       0.91      0.75      0.82     14858

    accuracy                           0.72     25000
   macro avg       0.38      0.42      0.39     25000
weighted avg       0.76      0.72      0.73     25000

Test accuracy: 0.72052


In [41]:
from sklearn.naive_bayes import ComplementNB

pipe_nb = Pipeline([
    ('nb', ComplementNB(class_prior=None))
])

param_grid_nb = {
    'nb__alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
    'nb__fit_prior': [True, False],
    'nb__norm': [True, False],
}


best_nb, best_params_nb = run_grid_search_classification(
    pipe_nb, param_grid_nb, X_train_encoded, Y_train_encoded, X_test_encoded, Y_test_encoded
)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\rylee\anaconda3\envs\stats\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\rylee\anaconda3\envs\stats\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rylee\anaconda3\envs\stats\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\rylee\anaconda3\envs\stats\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rylee\anaconda3\envs\stats\Lib\site-packages\sklearn\naive_bayes.py", line 762, in fit
    self._count(X, Y)
  File "c:\Users\rylee\anaconda3\envs\stats\Lib\site-packages\sklearn\naive_bayes.py", line 1037, in _count
    check_non_negative(X, "ComplementNB (input X)")
  File "c:\Users\rylee\anaconda3\envs\stats\Lib\site-packages\sklearn\utils\validation.py", line 1824, in check_non_negative
    raise ValueError(f"Negative values in data passed to {whom}.")
ValueError: Negative values in data passed to ComplementNB (input X).


In [27]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Train
model = GaussianNB()
model.fit(X_train_encoded, Y_train_encoded)  # use encoded labels

# Predict
y_pred = model.predict(X_test_encoded)

# Evaluate
print("Accuracy:", accuracy_score(Y_test_encoded, y_pred))
print(classification_report(Y_test_encoded, y_pred))


Accuracy: 0.41064
              precision    recall  f1-score   support

           0       0.01      0.99      0.01        69
           1       0.48      0.26      0.34      1937
           2       0.61      0.32      0.42      8105
           3       0.01      0.06      0.01        31
           4       0.84      0.48      0.61     14858

    accuracy                           0.41     25000
   macro avg       0.39      0.42      0.28     25000
weighted avg       0.73      0.41      0.52     25000

