In [9]:
# Importing Necesasary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import joblib
from tqdm import tqdm
import time

In [2]:
# Reading the Dataset
df = pd.read_csv("G:\My Drive\ITS\Tugas\Semester_5\Biomedical Engineering\Final Project Req\Dataset.csv")

In [3]:
# Handling Missing Values
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [4]:
# Label Targeting and Feature Selection
label = df.columns[-1]
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
feature_cols = [c for c in numeric_cols if c != label]

# Preparing Feature Matrix and Target Vector
X = df[feature_cols].values
y_raw = df[label].values

# Label Encoding the Target Variable
le = LabelEncoder()
y = le.fit_transform(df[label])

In [5]:
# Splitting the Dataset into Training and Val-Test Sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=42)

# Further Splitting Val-Test Set into Validation and Test Sets
X_val, X_test, y_val, y_test = train_test_split(X_val_test,
                                                y_val_test,
                                                test_size=0.5,
                                                stratify=y_val_test,
                                                random_state=42)

In [7]:
# Checking class distribution of each set
from collections import Counter

print("Training set class distribution:", Counter(y_train))
print("\nValidation set class distribution:", Counter(y_val))
print("\nTest set class distribution:", Counter(y_test))

Training set class distribution: Counter({np.int64(2): 304, np.int64(3): 136, np.int64(0): 99, np.int64(4): 83, np.int64(1): 47})

Validation set class distribution: Counter({np.int64(2): 65, np.int64(3): 29, np.int64(0): 21, np.int64(4): 18, np.int64(1): 10})

Test set class distribution: Counter({np.int64(2): 65, np.int64(3): 29, np.int64(0): 22, np.int64(4): 18, np.int64(1): 10})


In [6]:
# Feature Scaling
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [8]:
print("Training set shape:")
print("Shape of X_Train:", X_train.shape)
print("Shape of y_Train:", y_train.shape)

print("\nValidation set shape:")
print("Shape of X_Val:", X_val.shape)
print("Shape of y_Val:", y_val.shape)

print("\nTest set shape:")
print("Shape of X_Test:", X_test.shape)
print("Shape of y_Test:", y_test.shape)


Training set shape:
Shape of X_Train: (669, 20530)
Shape of y_Train: (669,)

Validation set shape:
Shape of X_Val: (143, 20530)
Shape of y_Val: (143,)

Test set shape:
Shape of X_Test: (144, 20530)
Shape of y_Test: (144,)


In [10]:
elastic_net = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    C=0.1,
    l1_ratio=0.5,  # 0.5 = equal L1 and L2
    max_iter=5000,
    n_jobs=-1,
    verbose=1
)

# Wrap the fit method with tqdm for progress bar
print("Training Lasso Logistic Regression...")
start_time = time.time()
elastic_net.fit(X_train_scaled, y_train)
elapsed_time = time.time() - start_time
print(f"Training completed in {elapsed_time / 60:.2f} minutes.")

elastic_net_coef = np.mean(np.abs(elastic_net.coef_), axis=0)
elastic_net_rank_idx = np.argsort(elastic_net_coef)[::-1]
elastic_net_genes = []
for i in tqdm(elastic_net_rank_idx, desc="Ranking features"):
    elastic_net_genes.append(feature_cols[i])

Training Lasso Logistic Regression...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


convergence after 3448 epochs took 1231 seconds
Training completed in 20.53 minutes.


Ranking features: 100%|██████████| 20530/20530 [00:00<00:00, 2617455.81it/s]


In [11]:
# Save the model to the specified path
joblib.dump(elastic_net, 'Saved Model/elastic_net_logistic_model.pkl')

['Saved Model/elastic_net_logistic_model.pkl']

In [12]:
y_train_pred = elastic_net.predict(X_train_scaled)
y_val_pred = elastic_net.predict(X_val)

print("\nTraining Accuracy:", accuracy_score(y_train, y_train_pred))
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report (Validation):")
print(classification_report(y_val, y_val_pred, target_names=le.classes_))


Training Accuracy: 1.0
Validation Accuracy: 0.7202797202797203

Classification Report (Validation):
              precision    recall  f1-score   support

       Basal       1.00      0.95      0.98        21
        Her2       1.00      0.70      0.82        10
        LumA       0.62      1.00      0.76        65
        LumB       1.00      0.10      0.19        29
      Normal       1.00      0.44      0.62        18

    accuracy                           0.72       143
   macro avg       0.92      0.64      0.67       143
weighted avg       0.83      0.72      0.66       143



In [13]:
# Analyze top contributing features for each class
n_top_features = 10

for class_idx, class_name in enumerate(le.classes_):
    print(f"\n{'='*60}")
    print(f"Class: {class_name}")
    print('='*60)

    # Get coefficients for this class
    class_coef = elastic_net.coef_[class_idx]

    # Create DataFrame with features and their coefficients
    class_features = pd.DataFrame({
        'feature': feature_cols,
        'coefficient': class_coef,
        'abs_coefficient': np.abs(class_coef)
    }).sort_values('abs_coefficient', ascending=False)

    print(f"\nTop {n_top_features} Contributing Features:")
    print(class_features.head(n_top_features).to_string(index=False))

    # Show positive and negative contributors separately
    positive_features = class_features[class_features['coefficient'] > 0].head(5)
    negative_features = class_features[class_features['coefficient'] < 0].head(5)

    print(f"\nTop 5 Positive Contributors (increase probability):")
    print(positive_features[['feature', 'coefficient']].to_string(index=False))

    print(f"\nTop 5 Negative Contributors (decrease probability):")
    print(negative_features[['feature', 'coefficient']].to_string(index=False))


Class: Basal

Top 10 Contributing Features:
 feature  coefficient  abs_coefficient
 CXorf61     0.139225         0.139225
   FOXC1     0.103079         0.103079
 HORMAD1     0.099002         0.099002
      AR    -0.081959         0.081959
 SFRS13B     0.077822         0.077822
C11orf86     0.073533         0.073533
    MLPH    -0.067761         0.067761
   LEMD1     0.065674         0.065674
  TRIM15     0.062971         0.062971
  GPR160    -0.059804         0.059804

Top 5 Positive Contributors (increase probability):
 feature  coefficient
 CXorf61     0.139225
   FOXC1     0.103079
 HORMAD1     0.099002
 SFRS13B     0.077822
C11orf86     0.073533

Top 5 Negative Contributors (decrease probability):
feature  coefficient
     AR    -0.081959
   MLPH    -0.067761
 GPR160    -0.059804
   TFF3    -0.049747
  ENPP1    -0.040979

Class: Her2

Top 10 Contributing Features:
 feature  coefficient  abs_coefficient
   KRT20     0.250402         0.250402
   ERBB2     0.127655         0.127655
C