In [1]:
# Dataframe
import pandas as pd
import numpy as np

# Data Scaler and labelencoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Data Split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# ML model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC

import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 
from tensorflow.keras.layers import Dropout
# from scikeras.wrappers import KerasClassifier # if it is showing this error then try to import this library
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Evaluation of the model
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Do load and unload the model
import joblib

In [None]:
# Define the data path
raw_data = r'C:\Users\Alpesh\Desktop\RawData\Urine.xlsx'
df = pd.read_excel(raw_data)

# Encoding the categorial column into numerical value
label_dict = {'NU': 0,
              'CC': 1,
              'ECG': 2,
              'HTP': 3,
              'NRT': 4,
              'OT': 5}

df['Group_encoded'] = df['Group'].map(label_dict)
df.head()
df.tail()

# Split the data into features (X) and target variable (y)
X = df.drop(['Group', 'Group_encoded'], axis=1)
y = df['Group_encoded']

# View the dataset
X.head()
y.head()
print(X.shape)
print(y.shape)

# Split the data into training and testing sets in stratified way
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True, random_state=42, stratify=y)

# Check the splitted data
print(f'X_train.shape: {X_train.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'X_test.shape: {X_test.shape}')
print(f'y_test.shape: {y_test.shape}')

In [None]:
# Initializing Classifiers
rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
xgb_model = GradientBoostingClassifier()
knn_model = KNeighborsClassifier()
svm_model = OneVsOneClassifier(SVC())

# Create a function to build the Keras model
def create_model(layers=1, units=64, activation='relu', alpha=0.0001, dropout_rate=0.2, learning_rate=0.001):
    model = Sequential()
    for _ in range(layers):
        model.add(Dense(units, activation=activation, kernel_regularizer=tf.keras.regularizers.l2(alpha)))
        model.add(Dropout(dropout_rate))
    model.add(Dense(len(np.unique(y)), activation='softmax'))  # Assuming 'Group_encoded' is a categorical variable
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Create a KerasClassifier
mlp_model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=create_model, verbose=0)

# Building the pipelines
knn_model_std = Pipeline([('std', StandardScaler()),
                  ('knn_model', knn_model)])

svm_model_std = Pipeline([('std', StandardScaler()),
                  ('svm_model', svm_model)])

mlp_model_std = Pipeline([('std', StandardScaler()),
                  ('mlp_model', mlp_model)])

In [56]:
# Setting up the parameter grids

param_grid_rf = {
    'n_estimators': [100, 200, 300, 500, 1000], # NL2, NL5 Optimised
    'max_depth': [None, 2, 3, 4, 5, 10], # NL2 Optimised
    'min_samples_split': [2, 3, 5, 10, 15], # Optimised
    'min_samples_leaf': [1, 2, 3, 5, 10], # NL5 Optimised
    'max_features': ['auto', 'sqrt', 'log2', 0.2, 0.5], # NL5 Optimised
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50]} # Optimised

param_grid_dt = {
    'max_depth': [5, 10, 20, 30], # NL2, NL4 Optimised
    'min_samples_split': [2, 3, 5, 10, 15], # Optimised
    'min_samples_leaf': [1, 2, 4], # NL4 Optimised
    'max_features': ['auto', 'sqrt', 'log2', 0.2, 0.5], # Optimised
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50]}

param_grid_xgb = {
    'min_samples_split': [3],
    'min_samples_leaf': [1, 2, 5],
    'subsample': [0.8, 1.0],
    'max_features': ['auto', 'sqrt', 'log2']}

param_grid_knn = {
    'knn_model__n_neighbors': [10],
    'knn_model__weights': ['distance'],
    'knn_model__algorithm': ['auto'],
    'knn_model__leaf_size': [20],
    'knn_model__p': [2],
    'knn_model__metric': ['manhattan']}

param_grid_svm = [
    {'svm_model__estimator__kernel': ['rbf'],
     'svm_model__estimator__C': [1.0],
     'svm_model__estimator__gamma': [0.001]},
    {'svm_model__estimator__kernel': ['linear'],
     'svm_model__estimator__C': [1.0]}]

param_grid_mlp = {
    'mlp_model__layers': [2],
    'mlp_model__units': [64],
    'mlp_model__activation': ['relu'],
    'mlp_model__alpha': [0.001],
    'mlp_model__dropout_rate': [0.4],
    'mlp_model__learning_rate': [0.01],
    'mlp_model__batch_size': [16],
    'mlp_model__epochs': [100]}

# Create a list of classifiers, corresponding parameter grids and classifier short name
classifiers = [rf_model, dt_model]
param_grids = [param_grid_rf, param_grid_dt]
name = ['RForest', 'DTree']


In [None]:
# Setting up GridSearchCV objects

gridcvs = {}
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

for model, param_grid, name in zip(classifiers, param_grids, name):
    gcv = GridSearchCV(estimator=model,
                       param_grid=param_grid,
                       scoring='accuracy',
                       n_jobs=-1,
                       cv=inner_cv,
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv
    
# Outer cross-validation loop
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for name, gs_est in sorted(gridcvs.items()):
    print(50 * '-', '\n')
    print(f'Algorithm: {name}')
    print('    Inner loop:')

    # Inner cross-validation loop
    for i, (train_index, test_index) in enumerate(outer_cv.split(X_train, y_train), 1):
        X_train_inner, X_test_inner = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_inner, y_test_inner = y_train.iloc[train_index], y_train.iloc[test_index]

        gs_est.fit(X_train_inner, y_train_inner)

        # Assuming `gs_est.best_params_` and `gs_est.best_score_` are accessible
        print(f'\n        Best ACC (avg. of inner test folds) {gs_est.best_score_ * 100:.2f}%')
        print(f'        Best parameters: {gs_est.best_params_}')
        
        # Assuming you have a function or code to calculate ACC for the outer test fold
        acc_outer = gs_est.score(X_test_inner, y_test_inner)
        print(f'        ACC (on outer test fold) {acc_outer * 100:.2f}%')

    # Outer cross-validation loop
    nested_scores = cross_val_score(gs_est, X_train, y_train, cv=outer_cv, scoring='accuracy')
    
    print('\n    Outer Loop:')
    for i, score in enumerate(nested_scores, 1):
        print(f'        ACC {score * 100:.2f}%')

    print(f'\n{name} | outer ACC {nested_scores.mean() * 100:.2f}% +/- {nested_scores.std() * 100:.2f}')
    

-------------------------------------------------- 

Algorithm: DTree
    Inner loop:

        Best ACC (avg. of inner test folds) 76.85%
        Best parameters: {'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
        ACC (on outer test fold) 59.26%

        Best ACC (avg. of inner test folds) 87.04%
        Best parameters: {'max_depth': 10, 'max_features': 0.5, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
        ACC (on outer test fold) 85.19%

        Best ACC (avg. of inner test folds) 80.56%
        Best parameters: {'max_depth': 10, 'max_features': 0.5, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
        ACC (on outer test fold) 85.19%

        Best ACC (avg. of inner test folds) 79.63%
        Best parameters: {'max_depth': 5, 'max_features': 0.2, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
        ACC (on outer test fold) 70.37%

   