<a href="https://colab.research.google.com/github/Alpesh202/FSFW-U-ML/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the necessary packages




In [2]:
!pip install scikit-learn tensorflow pandas numpy openpyxl matplotlib joblib scikeras mlxtend

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


## Import the necessary libraries

In [3]:
# Data Handling
import pandas as pd
import numpy as np

# Data Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# multiclass classification problem using an artificial neural network in Python
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier

# Model Evaluation
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Model Persistence - load and unload the model
import joblib

## Data handling and preprocessing

In [6]:
# Path of the Excel workbook that holds the dataset
file_path = "/content/Urine.xlsx"

# Load the workbook into a DataFrame and print it
df = pd.read_excel(io=file_path)
print(df)

    Group  HIPI_2.336_94.05351_CC_Spe  RPPI_1.43_96.06918_CC+HTP+OT_Spe  \
0      NU                1.019047e+06                      4.369687e+06   
1      NU                1.251668e+06                      3.461698e+06   
2      NU                1.826197e+06                      4.281905e+06   
3      NU                6.082321e+05                      1.511828e+07   
4      NU                2.545110e+06                      3.993454e+06   
..    ...                         ...                               ...   
175    OT                4.313545e+06                      3.042152e+07   
176    OT                2.890789e+06                      2.167816e+07   
177    OT                2.406921e+06                      7.114720e+07   
178    OT                1.670501e+06                      2.404034e+07   
179    OT                1.414180e+06                      1.774169e+07   

     HIPI_2.265_98.04838_CC_Spe  RPPI_2.289_100.10043_CC_Spe  \
0                  7.889221e+05    

In [7]:
# Encoding the categorical column into numerical values
label_dict = {'NU': 0, 'CC': 1, 'ECG': 2, 'HTP': 3, 'NRT': 4, 'OT': 5}
group_encoded = df['Group'].map(label_dict)

# Series.map yields NaN for any label missing from label_dict. Fail loudly
# here instead of letting NaN targets reach the stratified split below.
unmapped_labels = df.loc[group_encoded.isna(), 'Group'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"'Group' contains labels missing from label_dict: {list(unmapped_labels)}")
df['Group_encoded'] = group_encoded.astype(int)

# Split the data into features (X) and target variable (y).
# (Bare `.head()` / `.tail()` expressions in the middle of a cell are not
# displayed by Jupyter, so they have been removed.)
X = df.drop(['Group', 'Group_encoded'], axis=1)
y = df['Group_encoded']

# Display the shape of the dataset
print(X.shape)
print(y.shape)

# Split the data into training and testing sets in a stratified way
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, shuffle=True, random_state=42, stratify=y)

# Display the shapes of the split data
print(f'X_train.shape: {X_train.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'X_test.shape: {X_test.shape}')
print(f'y_test.shape: {y_test.shape}')


(180, 208)
(180,)
X_train.shape: (126, 208)
y_train.shape: (126,)
X_test.shape: (54, 208)
y_test.shape: (54,)


In [79]:
# Initializing Classifiers
# Every estimator with a stochastic fit gets the same fixed seed so that
# grid-search results are reproducible across runs.
rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
knn_model = KNeighborsClassifier()
svm_model = SVC()
mlp_model = MLPClassifier(random_state=42)

# Building the pipelines (the distance- and gradient-based models are fed standardised features)
knn_model_std = Pipeline([('std', StandardScaler()), ('knn_model', knn_model)])
svm_model_std = Pipeline([('std', StandardScaler()), ('svm_model', svm_model)])
mlp_model_std = Pipeline([('std', StandardScaler()), ('mlp_model', mlp_model)])

# Setting up the parameter grids

param_grid_rf = {
    'n_estimators': [50, 100, 200, 300], # NL2, NL5 Optimised
    'max_depth': [None, 2, 3, 5], # NL2 Optimised
    'min_samples_split': [2, 3, 5, 10], # Optimised
    'min_samples_leaf': [1, 2, 3, 5], # NL5 Optimised
    'max_leaf_nodes': [None, 10, 20, 30]} # Optimised

param_grid_dt = {
    'max_depth': [5, 10, 20, 30], # NL2, NL4 Optimised
    'min_samples_split': [2, 3, 5, 10, 15], # Optimised
    'min_samples_leaf': [1, 2, 3, 5], # NL4 Optimised
    'max_leaf_nodes': [None, 10, 20, 30, 40]}

param_grid_gb = {
    'learning_rate': [0.01, 0.1, 0.2, 0.4, 0.8], # in NL5
    'n_estimators': [50, 100, 150, 200, 300], # in NL5
    'subsample': [0.2, 0.4, 0.6, 0.8], # in NL5
    'max_depth': [2, 3, 5, 10], # in 3S, NL5
    'min_samples_leaf': [4, 6, 8, 10]}

# `p` is only used by the 'minkowski' metric. With the metric named explicitly
# as 'euclidean' / 'manhattan', searching `p` as well fitted every candidate
# twice, so it is dropped; the set of distinct models is unchanged.
# NOTE(review): `leaf_size` is documented as a speed/memory knob for the tree
# algorithms - confirm whether it is worth keeping in the search.
param_grid_knn = {
    'knn_model__n_neighbors': np.arange(1, 10), # in 20S
    'knn_model__weights': ['uniform', 'distance'],
    'knn_model__leaf_size': [5, 10, 20, 40],
    'knn_model__metric': ['euclidean', 'manhattan']}

param_grid_svm = [{
    'svm_model__kernel': ['rbf'],
    'svm_model__C': np.power(10., np.arange(-5, 5)),
    'svm_model__gamma': np.power(10., np.arange(-6, 0))},
    {'svm_model__kernel': ['linear'],
    'svm_model__C': np.power(10., np.arange(-5, 5))}]

param_grid_mlp = {
    'mlp_model__hidden_layer_sizes': [(50,), (100,), (150,), (50, 50), (100, 50), (100, 100)],
    'mlp_model__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
    'mlp_model__learning_rate_init': [0.0001, 0.001, 0.01, 0.1],
    'mlp_model__max_iter': [100, 200, 300, 500, 1000],
    'mlp_model__batch_size': [16, 32, 64, 128]}

# Create a list of classifiers, corresponding parameter grids and classifier short name
classifiers = [rf_model, dt_model, gb_model, knn_model_std, svm_model_std, mlp_model_std]
param_grids = [param_grid_rf, param_grid_dt, param_grid_gb, param_grid_knn, param_grid_svm, param_grid_mlp]
name = ['RForest', 'DTree', 'GradientBoosting', 'KNN', 'SVM', 'MLP']

In [None]:
import tensorflow as tf
from tensorflow.keras.regularizers import l1, l2

# Model factory handed to KerasClassifier
def create_model(num_layers=1, units=64, activation='relu', kernel_regularizer=None, activity_regularizer=None, dropout_rate=0.3, learning_rate=0.001):
    """Build and compile a dense softmax classifier.

    The input width is read from the global ``X_train`` and the number of
    output units from the distinct values in the global ``y_train``.

    Parameters
    ----------
    num_layers : int
        Total number of ``units``-wide Dense layers before the output layer.
    units : int
        Width of each of those Dense layers.
    activation : str
        Activation used by those Dense layers.
    kernel_regularizer, activity_regularizer
        Passed unchanged to every Dense layer except the output layer.
    dropout_rate : float
        Rate of the Dropout layer that follows each Dense layer after the first.
    learning_rate : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_features = X_train.shape[1]
    n_classes = len(np.unique(y_train))
    dense_options = dict(
        activation=activation,
        kernel_regularizer=kernel_regularizer,
        activity_regularizer=activity_regularizer,
    )

    network = Sequential()

    # First Dense layer also declares the input width
    network.add(Dense(units, input_dim=n_features, **dense_options))

    # Remaining Dense layers, each followed by dropout.
    # NOTE(review): with num_layers=1 no Dropout layer is added, so
    # dropout_rate has no effect in that case - confirm this is intended.
    for _ in range(1, num_layers):
        network.add(Dense(units, **dense_options))
        network.add(Dropout(dropout_rate))

    # Softmax output, one unit per class
    network.add(Dense(n_classes, activation='softmax'))

    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return network

# Create KerasClassifier
# `model=` is the scikeras name for the build function; `build_fn=` is its
# deprecated alias.
ann_model = KerasClassifier(model=create_model, epochs=10, batch_size=32, verbose=0)

# Building the pipelines
ann_model_std = Pipeline([('std', StandardScaler()), ('ann_model', ann_model)])

# Key layout: '<pipeline step>__<wrapper param>'.
# Arguments of create_model are not constructor parameters of KerasClassifier,
# so they must be routed with the extra 'model__' prefix; without it
# set_params rejects them as invalid parameters.
# `epochs` and `batch_size` are wrapper parameters and need no routing prefix.
# The step is named 'ann_model' (there is no 'mlp_model' step in this pipeline).
param_grid_ann = {
    'ann_model__model__num_layers': [1, 2],
    'ann_model__model__units': [32, 64, 128],
    'ann_model__model__activation': ['tanh', 'relu'],
    'ann_model__model__kernel_regularizer': [None, 'l1', 'l2'],  # Regularization for weights
    'ann_model__model__activity_regularizer': [None, 'l1', 'l2'],  # Regularization for layer outputs
    'ann_model__model__dropout_rate': [0.2, 0.3, 0.4, 0.5, 0.6],
    'ann_model__model__learning_rate': [0.1, 0.01, 0.001, 0.0001],
    'ann_model__batch_size': [8, 16, 32], # can also remove, if yes add into model - epochs=50, batch_size=8
    'ann_model__epochs': [50, 100, 150]} # can also remove, if yes add into model - epochs=50, batch_size=8

# Create a list of classifiers, corresponding parameter grids and classifier short name
classifiers1 = [ann_model_std]
param_grids1 = [param_grid_ann]
name1 = ['MLP']


In [None]:

from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical  # used by model.fit below

# NOTE(review): this cell reads like a pasted tutorial snippet. `x`, `x_test`,
# `sns`, `plt` and `faces` are not defined in the visible part of the
# notebook, and input_dim=2 / 4 output units do not match the dataset shape
# printed earlier - confirm whether the cell should be adapted or deleted.
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=2))
model.add(Dense(4, activation='softmax'))
# The loss identifier must not contain a leading space, otherwise Keras
# cannot resolve it.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

hist = model.fit(x, to_categorical(y), epochs=40, batch_size=10, validation_split=0.2)

from sklearn.metrics import confusion_matrix

y_predicted = model.predict(x_test)
mat = confusion_matrix(y_test.argmax(axis=1), y_predicted.argmax(axis=1))

sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False, cmap='Blues',
            xticklabels=faces.target_names,
            yticklabels=faces.target_names)

plt.xlabel('Predicted label')
plt.ylabel('Actual label')

In [1]:
# multi-class classification with Keras
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier
from keras.utils import to_categorical

# define baseline model
def baseline_model():
    """Build and compile the fixed baseline network (64 -> 128 -> dropout -> softmax).

    The input width and the number of output units are taken from the global
    training data (``X_train`` / ``y_train``) instead of being hard-coded, so
    the network matches the feature matrix it is fitted on.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_features = X_train.shape[1]
    n_classes = len(np.unique(y_train))

    model = Sequential()
    model.add(Dense(64, input_dim=n_features, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(n_classes, activation='softmax'))
    # Targets are integer class codes, not one-hot vectors, so the sparse
    # variant of the loss is required.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

ann_model = KerasClassifier(model=baseline_model, epochs=200, batch_size=5, verbose=0)

# Rebuild the pipeline around the classifier created just above. Without this,
# `ann_model_std` is either undefined (fresh kernel) or still wraps the
# classifier from an earlier cell, and the baseline model is never evaluated.
ann_model_std = Pipeline([('std', StandardScaler()), ('ann_model', ann_model)])

# Empty grid: nothing is tuned, the search evaluates this single fixed configuration
param_grid_ann = {}

# Create a list of classifiers, corresponding parameter grids and classifier short name
classifiers1 = [ann_model_std]
param_grids1 = [param_grid_ann]
name1 = ['MLP']


ModuleNotFoundError: No module named 'scikeras'

In [None]:
# Setting up GridSearchCV objects (inner loop of the nested cross-validation)

gridcvs = {}
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Loop variables are deliberately not called `model` / `name`: those names are
# already bound at module level (a Keras model and the list of classifier
# short names) and would be silently overwritten.
for estimator, grid, label in zip(classifiers1, param_grids1, name1):
    gridcvs[label] = GridSearchCV(estimator=estimator,
                                  param_grid=grid,
                                  scoring='accuracy',
                                  cv=inner_cv,
                                  n_jobs=-1,
                                  verbose=1,
                                  refit=True)

# Outer cross-validation splitter
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for label, gs_est in sorted(gridcvs.items()):
    print(50 * '-', '\n')
    print(f'Algorithm: {label}')
    print('    Inner loop:')

    # One grid search per outer fold. The score on the held-out outer fold is
    # kept, so the whole nested search does not have to be repeated through
    # cross_val_score afterwards.
    outer_fold_scores = []
    for train_index, test_index in outer_cv.split(X_train, y_train):
        X_train_inner, X_test_inner = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_inner, y_test_inner = y_train.iloc[train_index], y_train.iloc[test_index]

        # Hyper-parameter search on the outer-training part only
        gs_est.fit(X_train_inner, y_train_inner)

        print(f'\n        Best ACC (avg. of inner test folds) {gs_est.best_score_ * 100:.2f}%')
        print(f'        Best parameters: {gs_est.best_params_}')

        # Accuracy of the refitted best estimator on the held-out outer fold
        acc_outer = gs_est.score(X_test_inner, y_test_inner)
        print(f'        ACC (on outer test fold) {acc_outer * 100:.2f}%')
        outer_fold_scores.append(acc_outer)

    nested_scores = np.asarray(outer_fold_scores)

    print('\n    Outer Loop:')
    for score in nested_scores:
        print(f'        ACC {score * 100:.2f}%')

    print(f'\n{label} | outer ACC {nested_scores.mean() * 100:.2f}% +/- {nested_scores.std() * 100:.2f}')


-------------------------------------------------- 

Algorithm: MLP
    Inner loop:
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  X, y = self._initialize(X, y)



        Best ACC (avg. of inner test folds) 78.00%
        Best parameters: {}
        ACC (on outer test fold) 80.77%
Fitting 5 folds for each of 1 candidates, totalling 5 fits


### Versions of all the libraries used in the project

In [None]:
import sys
from importlib import metadata  # stdlib replacement for the deprecated pkg_resources

# Distribution names as published on PyPI (each listed once)
packages_to_check = ['scikit-learn',
                     'tensorflow',
                     'pandas',
                     'numpy',
                     'openpyxl',
                     'matplotlib',
                     'joblib',
                     'scikeras',
                     'mlxtend']

# Print the Python version
python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
print(f"Python version: {python_version}")

# Print the versions; a missing package is reported instead of aborting the loop
for package in packages_to_check:
    try:
        version = metadata.version(package)
    except metadata.PackageNotFoundError:
        version = 'not installed'
    print(f"{package.capitalize()} version: {version}")