In [1]:
# Importing Necessary Libraries
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Reading the Dataset
# The path is a raw string: in a normal literal the Windows backslashes form
# invalid escape sequences (\M, \I, \T, ...), which raise a SyntaxWarning on
# recent Python versions. The resulting path value is unchanged.
df = pd.read_csv(r"G:\My Drive\ITS\Tugas\Semester_5\Biomedical Engineering\Final Project Req\Dataset.csv")

In [4]:
# Handling Missing Values
# Discard every row that contains at least one missing entry, then renumber
# the surviving rows from 0 so the index has no gaps.
df = df.dropna().reset_index(drop=True)

In [5]:
# Label Targeting and Feature Selection
# The last column is taken as the target. Features are the float64/int64
# columns, with the target removed in case it is numeric itself.
label = df.columns[-1]
numeric_cols = (
    df.select_dtypes(include=['float64', 'int64'])
      .columns
      .tolist()
)
feature_cols = [name for name in numeric_cols if name != label]

# Label Encoding the Target Variable (class labels -> integer codes)
y_raw = df[label].values
le = LabelEncoder()
y = le.fit_transform(df[label])

# Preparing Feature Matrix
X = df[feature_cols].values

In [7]:
# Splitting the Dataset: 70% for training, 30% held out (stratified by class)
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Halving the held-out part into Validation and Test Sets (again stratified)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test,
    test_size=0.5,
    stratify=y_val_test,
    random_state=42,
)

In [8]:
# Report the shape of every split: one heading plus the X and y shapes each.
shape_report = (
    ("Training set shape:", "Shape of X_Train:", X_train, "Shape of y_Train:", y_train),
    ("\nValidation set shape:", "Shape of X_Val:", X_val, "Shape of y_Val:", y_val),
    ("\nTest set shape:", "Shape of X_Test:", X_test, "Shape of y_Test:", y_test),
)
for heading, x_caption, x_part, y_caption, y_part in shape_report:
    print(heading)
    print(x_caption, x_part.shape)
    print(y_caption, y_part.shape)


Training set shape:
Shape of X_Train: (669, 20530)
Shape of y_Train: (669,)

Validation set shape:
Shape of X_Val: (143, 20530)
Shape of y_Val: (143,)

Test set shape:
Shape of X_Test: (144, 20530)
Shape of y_Test: (144,)


In [None]:
from tqdm import tqdm
import time

# L1-penalised ("lasso") logistic regression; its coefficients are used below
# to rank the features. verbose=1 makes the solver log its own progress.
# NOTE(review): no scaling step is visible before this fit although
# StandardScaler is imported; saga is documented to converge quickly only on
# features with similar scale -- confirm whether skipping it is intentional.
lasso = LogisticRegression(
    penalty="l1",
    solver="saga",
    C=0.1,
    max_iter=5000,
    n_jobs=-1,
    verbose=1,
    random_state=42,  # saga is stochastic: seed it, like the splits, so the ranking is reproducible
)

# Fit the model and report the wall-clock training time
print("Training Lasso Logistic Regression...")
start_time = time.time()
lasso.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print(f"Training completed in {elapsed_time:.2f} seconds")

# Importance of a feature = mean absolute coefficient over the rows of coef_
lasso_coef = np.mean(np.abs(lasso.coef_), axis=0)
# Feature indices ordered from largest to smallest importance
lasso_rank_idx = np.argsort(lasso_coef)[::-1]
# Feature names in that same order
lasso_genes = [feature_cols[i] for i in tqdm(lasso_rank_idx, desc="Ranking features")]


Training Lasso Logistic Regression...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


In [None]:
# Save the model to the specified path
# joblib.dump does not create missing directories, so make sure the target
# folder exists first instead of failing with FileNotFoundError.
model_path = 'White-Box Model/Saved Model/lasso_logistic_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(lasso, model_path)

In [None]:
# Predict on the training and validation splits
y_train_pred = lasso.predict(X_train)
y_val_pred = lasso.predict(X_val)

print("\nTraining Accuracy:", accuracy_score(y_train, y_train_pred))
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report (Validation):")
# classification_report needs string display names; le.classes_ holds the
# original label values, which are not strings when the label column is numeric.
class_names = [str(c) for c in le.classes_]
print(classification_report(y_val, y_val_pred, target_names=class_names))