In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna
from datetime import datetime
import time


In [2]:
notebook_start_time = datetime.now()
print(f"Notebook started at: {notebook_start_time}")

#Load dataset
df = pd.read_csv('complete_decimal_dataset.csv')

#Encode target variable
label_encoder = LabelEncoder()
df['specific_class_encoded'] = label_encoder.fit_transform(df['specific_class'])

#Prepare features and target
X = df.drop(columns=['label', 'category', 'specific_class', 'specific_class_encoded'])
y = df['specific_class_encoded']

#Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Notebook started at: 2025-01-20 08:24:30.127569


In [3]:
#Training and testing sets Splits (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

def optimize_logistic_regression(trial):
    
    C = trial.suggest_float('C', 1e-4, 10.0, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga', 'lbfgs'])

    #l1_ratio if elasticnet is selected
    l1_ratio = None
    if penalty == 'elasticnet':
        if solver != 'saga':  # Elasticnet only works with saga solver
            raise optuna.TrialPruned()
        l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)

    #compatibility between penalty and solver
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        raise optuna.TrialPruned()
    if penalty == 'l2' and solver not in ['lbfgs', 'liblinear', 'saga']:
        raise optuna.TrialPruned()

    #Logistic Regression model
    model = LogisticRegression(
        C=C, penalty=penalty, solver=solver, random_state=42, max_iter=1000, l1_ratio=l1_ratio
    )

    #Cross-validation
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average='macro')
    scores = cross_val_score(model, X_train, y_train, cv=stratified_kfold, scoring=scorer, n_jobs=-1)

    return scores.mean()


In [4]:
#Optuna study
study_lr = optuna.create_study(direction="maximize")
study_lr.optimize(optimize_logistic_regression, n_trials=10, n_jobs=-1)


[I 2025-01-20 08:24:31,745] A new study created in memory with name: no-name-ce38b78e-e364-4d1c-8fc9-a6709abb9d2c
[I 2025-01-20 08:24:31,751] Trial 1 pruned. 
[I 2025-01-20 08:24:31,770] Trial 8 pruned. 
[I 2025-01-20 08:26:08,371] Trial 0 finished with value: 0.20671388102931956 and parameters: {'C': 0.00013104504885704289, 'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio': 0.057622501421336514}. Best is trial 0 with value: 0.20671388102931956.
[I 2025-01-20 08:27:25,924] Trial 6 finished with value: 0.20669709721325907 and parameters: {'C': 0.0003877552880547607, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 0 with value: 0.20671388102931956.
[I 2025-01-20 08:27:48,852] Trial 7 finished with value: 0.6555050642700521 and parameters: {'C': 0.001196162324265911, 'penalty': 'l2', 'solver': 'lbfgs'}. Best is trial 7 with value: 0.6555050642700521.
[I 2025-01-20 08:28:46,977] Trial 4 finished with value: 0.8354180888797542 and parameters: {'C': 0.3552017555114897, 'penalty': 

In [5]:
print(f"Best parameters for Random Forest: {study_lr.best_params}")

Best parameters for Random Forest: {'C': 0.438732755172357, 'penalty': 'l1', 'solver': 'saga'}


In [6]:
#best parameters
best_params = study_lr.best_params
model = LogisticRegression(**best_params, random_state=42, max_iter=1000)

In [7]:
start_time = time.time()

model.fit(X_train, y_train)
end_time = time.time()

training_duration = end_time - start_time
print(f"Model training time: {training_duration:.2f} seconds")

y_pred = model.predict(X_test)

#Classification report on test data
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))


Model training time: 1116.33 seconds
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    244790
           1       1.00      1.00      1.00     15021
           2       1.00      1.00      1.00      1974
           3       0.92      0.18      0.31     10902
           4       1.00      1.00      1.00      4968
           5       1.00      1.00      1.00      3989

    accuracy                           0.97    281644
   macro avg       0.98      0.86      0.88    281644
weighted avg       0.97      0.97      0.96    281644



In [8]:
# Log the end time
notebook_end_time = datetime.now()
print(f"Notebook ended at: {notebook_end_time}")

# Calculate the total duration
notebook_duration = notebook_end_time - notebook_start_time
print(f"Total notebook runtime: {notebook_duration}")

Notebook ended at: 2025-01-20 09:12:13.849884
Total notebook runtime: 0:47:43.722315


In [11]:
from sklearn.metrics import classification_report,accuracy_score, make_scorer, precision_score, recall_score, f1_score

#Evaluate on training set
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average="macro")

#Evaluate on test set
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average="macro")


#Compare results
print(f"Training Accuracy: {train_accuracy:.2f}, Test Accuracy: {test_accuracy:.2f}")
print(f"Training F1 Score: {train_f1:.2f}, Test F1 Score: {test_f1:.2f}")

if train_accuracy > test_accuracy:
    print("The model might be overfitting.")
elif train_accuracy < test_accuracy:
    print("Test set performs better than training.")
else:
    print("Training and test performance are comparable.")

#Classification reports
print("\nClassification Report (Training):")
print(classification_report(y_train, y_train_pred))

print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 0.97, Test Accuracy: 0.97
Training F1 Score: 0.88, Test F1 Score: 0.88
Test set performs better than training.

Classification Report (Training):
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    978947
           1       1.00      1.00      1.00     59642
           2       1.00      1.00      1.00      8017
           3       0.92      0.18      0.30     43998
           4       1.00      1.00      1.00     19983
           5       1.00      1.00      1.00     15988

    accuracy                           0.97   1126575
   macro avg       0.98      0.86      0.88   1126575
weighted avg       0.97      0.97      0.96   1126575

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    244790
           1       1.00      1.00      1.00     15021
           2       1.00      1.00      1.00      1974
           3       0.92      0.18      0.31  