In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time

In [2]:
#Load the dataset
df = pd.read_csv('complete_decimal_dataset.csv')

In [3]:
#Data Preprocessing
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

X = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y = df['specific_class_encoded']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [4]:
#Optimization function
def optimize_logistic_regression(trial):
    C = trial.suggest_float('C', 1e-4, 10.0, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga', 'lbfgs'])

    #l1_ratio if elasticnet is selected
    l1_ratio = None
    if penalty == 'elasticnet':
        if solver != 'saga':  # Elasticnet only works with saga solver
            raise optuna.TrialPruned()
        l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)

    #compatibility between penalty and solver
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        raise optuna.TrialPruned()
    if penalty == 'l2' and solver not in ['lbfgs', 'liblinear', 'saga']:
        raise optuna.TrialPruned()

    #Logistic Regression model
    model = LogisticRegression(
        C=C, penalty=penalty, solver=solver, random_state=42, max_iter=1000, l1_ratio=l1_ratio
    )

    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(model, X_scaled, y, cv=stratified_kfold, scoring=scorer, n_jobs=-1).mean()

    return scores


In [5]:
#Optuna study with stratified cross-validation
study_lr = optuna.create_study(direction="maximize")
study_lr.optimize(optimize_logistic_regression, n_trials=10, n_jobs=-1)


[I 2025-01-29 13:58:15,001] A new study created in memory with name: no-name-7ddeac39-922e-444f-94fe-2f9746e0c508
[I 2025-01-29 13:58:15,031] Trial 6 pruned. 
[I 2025-01-29 14:02:12,939] Trial 7 finished with value: 0.8356915191511065 and parameters: {'C': 0.13124366465765952, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 7 with value: 0.8356915191511065.
[I 2025-01-29 14:08:05,233] Trial 9 finished with value: 0.8402619364467958 and parameters: {'C': 2.471662902691288, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 9 with value: 0.8402619364467958.
[I 2025-01-29 14:09:57,238] Trial 2 finished with value: 0.8591338467263402 and parameters: {'C': 0.3416727358467226, 'penalty': 'l2', 'solver': 'saga'}. Best is trial 2 with value: 0.8591338467263402.
[I 2025-01-29 14:18:12,283] Trial 3 finished with value: 0.24624570212915353 and parameters: {'C': 0.00012738196640488703, 'penalty': 'l2', 'solver': 'lbfgs'}. Best is trial 2 with value: 0.8591338467263402.
[I 2025-01-29 14:

In [6]:
#Best parameters
print(f"Best parameters for Random Forest: {study_lr.best_params}")

Best parameters for Random Forest: {'C': 3.2376253329526534, 'penalty': 'l1', 'solver': 'saga'}


In [7]:
#optimized hyperparameters
best_params = study_lr.best_params
model = LogisticRegression(**best_params, random_state=42, max_iter=1000)

In [8]:
#10-fold cross-validation 
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
start_time = time.time()
y_pred = cross_val_predict(model, X_scaled, y, cv=stratified_kfold, n_jobs=-1)
end_time = time.time()

In [9]:
#Evaluate the time it takes to train the model
training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

Model training time: 3647.24 seconds


In [10]:
#Classification report
report = classification_report(y, y_pred)
print("Classification Report with Tuned Hyperparameters and 10-Fold CV:\n")
print(report)


Classification Report with Tuned Hyperparameters and 10-Fold CV:

              precision    recall  f1-score   support

           0       0.96      1.00      0.98   1223737
           1       1.00      1.00      1.00     74663
           2       1.00      1.00      1.00      9991
           3       0.92      0.18      0.30     54900
           4       1.00      1.00      1.00     24951
           5       1.00      1.00      1.00     19977

    accuracy                           0.97   1408219
   macro avg       0.98      0.86      0.88   1408219
weighted avg       0.97      0.97      0.96   1408219

