In [1]:
#Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold,cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, classification_report, make_scorer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
import optuna
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from datetime import datetime
import time

In [2]:
#Load the dataset
df = pd.read_csv('complete_decimal_dataset.csv')

In [3]:
#Data Preprocessing
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

X = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y = df['specific_class_encoded']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
#Model Definition
def create_dnn_model(input_dim, layers, units, dropout_rate, learning_rate):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))  
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(dropout_rate))
    
    for _ in range(layers - 1):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout_rate))
    
    model.add(Dense(len(np.unique(y)), activation='softmax'))  
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [5]:
#Optimization Function
def optimize_dnn(trial):
    layers = trial.suggest_int('layers', 1, 5)
    units = trial.suggest_int('units', 16, 128, step=16)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_int('batch_size', 16, 128, step=16)
    epochs = trial.suggest_int('epochs', 10, 50)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []

    for train_idx, val_idx in kfold.split(X_scaled, y):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        #Build model
        model = create_dnn_model(X_train.shape[1], layers, units, dropout_rate, learning_rate)

        #Train model
        model.fit(
            X_train, y_train,
            epochs=epochs, batch_size=batch_size,
            validation_data=(X_val, y_val),
            verbose=0
        )

        #Predictions
        y_pred = np.argmax(model.predict(X_val), axis=1)
        fold_f1 = f1_score(y_val, y_pred, average='macro')
        fold_scores.append(fold_f1)

    return np.mean(fold_scores)


In [6]:
#Optuna study with stratified cross-validation
study = optuna.create_study(direction='maximize')
study.optimize(optimize_dnn, n_trials=10)

[I 2025-01-28 18:14:10,045] A new study created in memory with name: no-name-4ec9c8d7-0ee3-401c-9f31-928ca1466c3d


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 968us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 928us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 904us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step


[I 2025-01-28 19:37:06,201] Trial 0 finished with value: 0.9999435289256897 and parameters: {'layers': 3, 'units': 112, 'dropout_rate': 0.3240283761223979, 'learning_rate': 0.005016195137692198, 'batch_size': 80, 'epochs': 40}. Best is trial 0 with value: 0.9999435289256897.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 951us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 943us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 938us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 944us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 950us/step


[I 2025-01-28 19:56:36,326] Trial 1 finished with value: 0.5882158143768952 and parameters: {'layers': 3, 'units': 16, 'dropout_rate': 0.49261931910299894, 'learning_rate': 0.00010358991013751943, 'batch_size': 80, 'epochs': 12}. Best is trial 0 with value: 0.9999435289256897.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 920us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 954us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 927us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 938us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 941us/step


[I 2025-01-28 20:53:34,898] Trial 2 finished with value: 0.9999682681041617 and parameters: {'layers': 2, 'units': 112, 'dropout_rate': 0.41537543029431667, 'learning_rate': 0.004479647803917762, 'batch_size': 128, 'epochs': 47}. Best is trial 2 with value: 0.9999682681041617.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 981us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 988us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 979us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 993us/step


[I 2025-01-28 21:57:50,217] Trial 3 finished with value: 0.9999622355306673 and parameters: {'layers': 4, 'units': 64, 'dropout_rate': 0.4934957041195418, 'learning_rate': 0.0010678746138302968, 'batch_size': 80, 'epochs': 32}. Best is trial 2 with value: 0.9999682681041617.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 995us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 973us/step


[I 2025-01-28 22:53:15,889] Trial 4 finished with value: 1.0 and parameters: {'layers': 4, 'units': 96, 'dropout_rate': 0.24552040101739658, 'learning_rate': 0.00010682901273334459, 'batch_size': 128, 'epochs': 34}. Best is trial 4 with value: 1.0.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 959us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 938us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 941us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 925us/step


[I 2025-01-28 23:25:25,354] Trial 5 finished with value: 0.7928541294944024 and parameters: {'layers': 3, 'units': 16, 'dropout_rate': 0.30930864021264504, 'learning_rate': 0.005311893651972563, 'batch_size': 96, 'epochs': 23}. Best is trial 4 with value: 1.0.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 900us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 890us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 879us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 873us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 891us/step


[I 2025-01-29 00:13:15,886] Trial 6 finished with value: 1.0 and parameters: {'layers': 2, 'units': 48, 'dropout_rate': 0.1696089970273834, 'learning_rate': 0.002894261858747309, 'batch_size': 128, 'epochs': 47}. Best is trial 4 with value: 1.0.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 877us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 878us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 877us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 870us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 861us/step


[I 2025-01-29 01:31:54,160] Trial 7 finished with value: 0.9999587510855334 and parameters: {'layers': 1, 'units': 96, 'dropout_rate': 0.13474469204454215, 'learning_rate': 0.0002546799043578438, 'batch_size': 32, 'epochs': 24}. Best is trial 4 with value: 1.0.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 985us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 976us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 986us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 978us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 979us/step


[I 2025-01-29 04:39:21,474] Trial 8 finished with value: 0.9999534413326417 and parameters: {'layers': 4, 'units': 80, 'dropout_rate': 0.4149153228696153, 'learning_rate': 0.0011577163846692724, 'batch_size': 32, 'epochs': 39}. Best is trial 4 with value: 1.0.


[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 949us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 934us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 928us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 944us/step
[1m8802/8802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step


[I 2025-01-29 05:27:17,429] Trial 9 finished with value: 0.9999381056547467 and parameters: {'layers': 3, 'units': 96, 'dropout_rate': 0.22774230359927441, 'learning_rate': 0.0066453775036853695, 'batch_size': 64, 'epochs': 20}. Best is trial 4 with value: 1.0.


In [7]:
#Best parameters
print("Best hyperparameters:", study.best_params)

Best hyperparameters: {'layers': 4, 'units': 96, 'dropout_rate': 0.24552040101739658, 'learning_rate': 0.00010682901273334459, 'batch_size': 128, 'epochs': 34}


In [8]:
#Final model training with optimized hyperparameters

best_params = study.best_params
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
final_predictions = np.zeros_like(y)
final_true_labels = y

fold_no = 1
start_time = time.time()

for train_idx, val_idx in kfold.split(X_scaled, y):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = create_dnn_model(
        X_train.shape[1],
        layers=best_params['layers'],
        units=best_params['units'],
        dropout_rate=best_params['dropout_rate'],
        learning_rate=best_params['learning_rate']
    )

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True
    )

    print(f"Training fold {fold_no}...")
    model.fit(
        X_train,
        y_train,
        epochs=best_params['epochs'],
        batch_size=best_params['batch_size'],
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=0
    )

    fold_predictions = np.argmax(model.predict(X_val), axis=1)
    final_predictions[val_idx] = fold_predictions

    #Print classification report for each fold
    #print(f"Fold {fold_no} Classification Report:\n")
    #print(classification_report(y_val, fold_predictions))

    fold_no += 1
end_time = time.time()

Training fold 1...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 907us/step
Training fold 2...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 941us/step
Training fold 3...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 912us/step
Training fold 4...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 926us/step
Training fold 5...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 938us/step
Training fold 6...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 892us/step
Training fold 7...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 931us/step
Training fold 8...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 917us/step
Training fold 9...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 931us/step
Training fold 10...
[1m4401/4401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 913us/step


In [9]:
training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

Model training time: 5104.42 seconds


In [10]:
print("\nFinal Classification Report:\n")
print(classification_report(final_true_labels, final_predictions))


Final Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1223737
           1       1.00      1.00      1.00     74663
           2       1.00      1.00      1.00      9991
           3       1.00      1.00      1.00     54900
           4       1.00      1.00      1.00     24951
           5       1.00      1.00      1.00     19977

    accuracy                           1.00   1408219
   macro avg       1.00      1.00      1.00   1408219
weighted avg       1.00      1.00      1.00   1408219

