In [1]:
# Standard library imports
import os
import random
import gc
import copy

# Third-party library imports
import numpy as np
import pandas as pd

# PyTorch and related libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

# einops library for tensor operations
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
# Custom TINTO library imports
from TINTOlib.tinto import TINTO
from TINTOlib.supertml import SuperTML
from TINTOlib.igtd import IGTD
from TINTOlib.refined import REFINED
from TINTOlib.barGraph import BarGraph
from TINTOlib.distanceMatrix import DistanceMatrix
from TINTOlib.combination import Combination
from TINTOlib.featureWrap import FeatureWrap
from TINTOlib.bie import BIE

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
import warnings
# Suppress every UserWarning for the rest of the session so library notices
# do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=UserWarning) 

In [3]:
import torch

# Environment report: print the CUDA, cuDNN and PyTorch versions PyTorch exposes.
cuda_version = torch.version.cuda
print(f"CUDA Version: {cuda_version}")

cudnn_version = torch.backends.cudnn.version()
print(f"cuDNN Version: {cudnn_version}")

pytorch_version = torch.__version__
print(f"PyTorch Version: {pytorch_version}")

# Check if CUDA is available
if torch.cuda.is_available():
    print("CUDA is available. PyTorch can use GPU.")

    # Get the name of the current GPU
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")

    # Create a random tensor and move it to GPU to verify
    x = torch.rand(5, 3)
    print(f"Is this tensor on GPU? {x.cuda().is_cuda}")
else:
    print("CUDA is not available. PyTorch will use CPU.")

# Additional check: is CUDA initialized?
print(f"Is CUDA initialized? {torch.cuda.is_initialized()}")

# Number of available GPUs
print(f"Number of available GPUs: {torch.cuda.device_count()}")

# Current device index. Querying it requires a usable CUDA device, so only ask
# when one is available; otherwise the cell would raise on CPU-only machines
# even though the branch above explicitly supports running on CPU.
if torch.cuda.is_available():
    print(f"Current device index: {torch.cuda.current_device()}")
else:
    print("Current device index: N/A (no CUDA device)")


CUDA Version: 12.1
cuDNN Version: 90100
PyTorch Version: 2.5.1+cu121
CUDA is available. PyTorch can use GPU.
Current GPU: NVIDIA A100-PCIE-40GB MIG 7g.40gb
Is this tensor on GPU? True
Is CUDA initialized? True
Number of available GPUs: 1
Current device index: 0


## DATASET

In [4]:
SEED = 64
# Reproducibility: feed the same seed to PyTorch, Python's `random` and NumPy.
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(SEED)

In [5]:
# Name of the dataset under study; it is interpolated into the CSV path and the results path.
dataset_name = "preprocessed_heloc"
results_path = f"logs/Binary/{dataset_name}/MLP_Binary"

In [6]:
df = pd.read_csv(f"Datasets/Binary/{dataset_name}.csv")

In [7]:
df.shape

(9871, 24)

In [8]:
# Drop the second-to-last column if MIMO
# df = df.drop(df.columns[-1], axis=1)

In [9]:
df.head()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
0,55,144,4,84,20,3,0,83,2,3,...,0,0,0,33,-8,8,1,1,69,Bad
1,61,58,15,41,2,4,4,100,-7,0,...,0,0,0,0,-8,0,-8,-8,0,Bad
2,67,66,5,24,9,0,0,100,-7,7,...,0,4,4,53,66,4,2,1,86,Bad
3,66,169,1,73,28,1,1,93,76,6,...,0,5,4,72,83,6,4,3,91,Bad
4,81,333,27,132,12,0,0,100,-7,7,...,0,1,1,51,89,3,1,0,80,Bad


## LOAD AND PREPROCESS

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import os
import cv2

# Function to load and preprocess only numerical data
def load_and_preprocess_numerical_data(batch_size=32):
    """Split the global ``df`` 80/10/10, min-max scale the features and build DataLoaders.

    The last column of ``df`` is treated as the target and every other column
    as a feature. The scaler is fitted on the training split only and then
    applied to the validation and test splits. Both splits are seeded with the
    global ``SEED``.

    A non-numeric target (e.g. string class names) is mapped to integer codes
    in sorted label order before conversion to a float tensor; without this the
    tensor conversion fails on string labels. A numeric target is used as is.

    Args:
        batch_size: Number of samples per batch for all three loaders.

    Returns:
        ``(train_loader, val_loader, test_loader, attributes)`` where
        ``attributes`` is the number of feature columns. Only the training
        loader shuffles; targets are shaped ``(n_samples, 1)``.
    """
    # Split data
    df_x = df.drop(df.columns[-1], axis=1)  # All features except the target
    df_y = df[df.columns[-1]]  # Target column

    # String/categorical labels cannot be turned into a float32 tensor directly.
    if not pd.api.types.is_numeric_dtype(df_y):
        codes, _ = pd.factorize(df_y, sort=True)
        df_y = pd.Series(codes, index=df_y.index, name=df_y.name)

    # 80% train, then the remaining 20% is halved into validation and test.
    X_train, X_val, y_train, y_val = train_test_split(df_x, df_y, test_size=0.20, random_state=SEED)
    X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.50, random_state=SEED)

    # Fit the scaler on the training split only, then reuse it for val/test.
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    attributes = len(X_train.columns)

    print("Attributes: ", attributes)

    # Convert data to PyTorch tensors
    X_train_tensor = torch.as_tensor(X_train.values, dtype=torch.float32)
    X_val_tensor = torch.as_tensor(X_val.values, dtype=torch.float32)
    X_test_tensor = torch.as_tensor(X_test.values, dtype=torch.float32)
    y_train_tensor = torch.as_tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
    y_val_tensor = torch.as_tensor(y_val.values, dtype=torch.float32).reshape(-1, 1)
    y_test_tensor = torch.as_tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

    # Create DataLoaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

    return train_loader, val_loader, test_loader, attributes

## EXPERIMENTS

In [11]:
def calculate_iterations_per_epoch(dataset_size, batch_size):
    """Return how many batches of ``batch_size`` are needed to cover ``dataset_size`` samples.

    A trailing partial batch counts as one additional iteration.
    """
    full_batches, leftover = divmod(dataset_size, batch_size)
    return full_batches + 1 if leftover else full_batches

In [12]:
batch_size = 32

In [13]:
num_epochs = calculate_iterations_per_epoch(df.shape[0], batch_size)
# For the Boston dataset, the number of samples is too small for a range test, so the number of epochs is tripled.
#num_epochs = num_epochs*3

In [14]:
num_epochs

309

### EXPERIMENT 1: LIGHTGBM

In [31]:
import lightgbm as lgb

In [32]:
# LabelEncoder is not imported anywhere else in the notebook; import it here so
# this cell runs on a fresh kernel.
from sklearn.preprocessing import LabelEncoder

# Preprocess data
# Split features and target
df_x = df.iloc[:, :-1]  # All columns except the last one (features)
df_y = df.iloc[:, -1]   # The last column (target)

# Encode target labels as integers
label_encoder = LabelEncoder()
df_y = label_encoder.fit_transform(df_y)  # Converts string labels to integers

# Create train, validation, and test sets (80% / 10% / 10%)
X_train, X_temp, y_train, y_temp = train_test_split(df_x, df_y, test_size=0.2, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)

scaler = MinMaxScaler()

# Fit the scaler on the training data only, then apply it to validation and test
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Prepare data in LightGBM Dataset format
train_data = lgb.Dataset(X_train_scaled, label=y_train)
val_data = lgb.Dataset(X_val_scaled, label=y_val, reference=train_data)

In [53]:
# Hyper-parameters passed to lgb.train for the binary-classification run.
lgb_params = dict(
    objective="binary",
    num_leaves=34,
    lambda_l2=1.3e-3,
    lambda_l1=2.6e-5,
    learning_rate=7.1e-2,
    num_iterations=1000,
)

In [54]:
# Fit the booster on the training Dataset; the validation Dataset is supplied as valid_sets.
print("Training LightGBM model...")
lgb_model = lgb.train(params=lgb_params, train_set=train_data, valid_sets=val_data)

Training LightGBM model...
[LightGBM] [Info] Number of positive: 3838, number of negative: 4058
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1465
[LightGBM] [Info] Number of data points in the train set: 7896, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.486069 -> initscore=-0.055739
[LightGBM] [Info] Start training from score -0.055739


In [55]:
# accuracy_score / roc_auc_score are not imported anywhere else in the notebook;
# import them here so this cell runs on a fresh kernel.
from sklearn.metrics import accuracy_score, roc_auc_score

# Predict on train, validation, and test data
y_train_prob = lgb_model.predict(X_train_scaled)
y_val_prob = lgb_model.predict(X_val_scaled)
y_test_prob = lgb_model.predict(X_test_scaled)

# Convert probabilities to binary predictions (threshold = 0.5)
y_train_pred = (y_train_prob > 0.5).astype(int)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_test_pred = (y_test_prob > 0.5).astype(int)

# Calculate accuracy for each dataset (uses the thresholded predictions)
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_val = accuracy_score(y_val, y_val_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

# Calculate AUC for each dataset (uses the raw predicted scores, not the thresholded labels)
auc_train = roc_auc_score(y_train, y_train_prob)
auc_val = roc_auc_score(y_val, y_val_prob)
auc_test = roc_auc_score(y_test, y_test_prob)

# Print results
print(f"Train Accuracy: {accuracy_train:.4f}, AUC: {auc_train:.4f}")
print(f"Validation Accuracy: {accuracy_val:.4f}, AUC: {auc_val:.4f}")
print(f"Test Accuracy: {accuracy_test:.4f}, AUC: {auc_test:.4f}")

Train Accuracy: 1.0000, AUC: 1.0000
Validation Accuracy: 0.7416, AUC: 0.7974
Test Accuracy: 0.7136, AUC: 0.7692


## FINAL METRICS AND BEST MODEL

In [None]:
def find_best_model(base_path):
    """Find the folder under ``base_path`` whose ``metrics.txt`` has the lowest ``test_rmse``.

    Every directory below ``base_path`` is walked. Each ``metrics.txt`` is
    parsed as ``key: value`` lines with float values; blank lines are ignored.
    Files that do not contain a ``test_rmse`` entry are skipped.

    Args:
        base_path: Root directory to search recursively.

    Returns:
        ``(best_folder, best_rmse)``. When no metrics file with a ``test_rmse``
        entry is found the result is ``(None, float('inf'))``.

    Raises:
        ValueError: If a non-blank line is not of the form ``key: value`` or
            its value is not a valid float.
    """
    best_rmse = float('inf')
    best_folder = None

    for root, _dirs, files in os.walk(base_path):
        if 'metrics.txt' not in files:
            continue
        file_path = os.path.join(root, 'metrics.txt')

        # Parse the metrics file into a dictionary
        metrics_dict = {}
        with open(file_path, 'r') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. a trailing newline
                key, sep, value = line.partition(': ')
                if not sep:
                    raise ValueError(
                        f"{file_path}: expected 'key: value', got {line.rstrip()!r}"
                    )
                metrics_dict[key.strip()] = float(value.strip())

        # Keep the folder with the lowest test RMSE seen so far
        test_rmse = metrics_dict.get('test_rmse')
        if test_rmse is not None and test_rmse < best_rmse:
            best_rmse = test_rmse
            best_folder = root

    return best_folder, best_rmse

In [None]:
import os

def read_metrics(file_path):
    """Parse a ``key: value`` metrics file into a dict of floats.

    Blank and whitespace-only lines are ignored.

    Args:
        file_path: Path of the metrics text file.

    Returns:
        Dict mapping each stripped key to its value converted with ``float``.

    Raises:
        ValueError: If a non-blank line is not of the form ``key: value`` or
            its value is not a valid float.
    """
    metrics = {}
    with open(file_path, 'r') as file:
        for line_no, line in enumerate(file, start=1):
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            key, sep, value = line.partition(': ')
            if not sep:
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'key: value', got {line.rstrip()!r}"
                )
            metrics[key.strip()] = float(value.strip())
    return metrics

def rename_folder(old_folder_path, prefix):
    """Rename a folder in place by prepending ``"{prefix}_"`` to its name.

    Args:
        old_folder_path: Path of the folder to rename. A trailing path
            separator is tolerated.
        prefix: Text placed, followed by an underscore, in front of the
            folder's current name.

    Returns:
        The new path of the folder (same parent directory, prefixed name).
    """
    parent_dir, folder_name = os.path.split(old_folder_path)
    if not folder_name:
        # The path ended with a separator ("runs/model_1/"), so the first split
        # produced an empty name; split once more to get the real folder name
        # instead of trying to move the folder into itself as "<prefix>_".
        parent_dir, folder_name = os.path.split(parent_dir)
    new_folder_path = os.path.join(parent_dir, f"{prefix}_{folder_name}")
    os.rename(old_folder_path, new_folder_path)
    return new_folder_path

def process_folders(root_dir):
    """Tag the best-scoring model folder(s) in ``root_dir`` by renaming them on disk.

    For each prefix in the internal prefix list (currently only ``"_Model"``),
    the immediate sub-folders of ``root_dir`` whose name starts with that
    prefix are compared by the ``test_rmse`` value in their ``metrics.txt``.
    The folder with the lowest value is renamed with a ``"TOP_"`` prefix.
    Among those winners, the one with the lowest ``test_rmse`` is renamed
    again with a ``"BEST_"`` prefix.

    Folders without a ``metrics.txt``, or whose metrics lack ``test_rmse``,
    are ignored. Candidates are visited in sorted name order so ties are
    resolved the same way on every run.

    Note: this renames folders, so calling it again on the same directory
    operates on the already-renamed layout.

    Args:
        root_dir: Directory containing the model folders.

    Returns:
        List with the current on-disk path of every folder that was tagged,
        i.e. the entry that received the ``"BEST_"`` prefix is reported under
        its final name rather than its intermediate ``"TOP_"`` name.
    """
    prefixes = ["_Model"]
    best_folders = []

    for prefix in prefixes:
        matching_folders = sorted(
            f for f in os.listdir(root_dir)
            if f.startswith(prefix) and os.path.isdir(os.path.join(root_dir, f))
        )
        best_folder = None
        best_test_rmse = float('inf')
        for folder in matching_folders:
            metrics_file = os.path.join(root_dir, folder, 'metrics.txt')
            if not os.path.exists(metrics_file):
                continue
            test_rmse = read_metrics(metrics_file).get('test_rmse')
            if test_rmse is not None and test_rmse < best_test_rmse:
                best_test_rmse = test_rmse
                best_folder = folder
        if best_folder:
            new_path = rename_folder(os.path.join(root_dir, best_folder), "TOP")
            best_folders.append(new_path)

    # Pick the overall winner among the per-prefix winners.
    overall_best_index = None
    overall_best_test_rmse = float('inf')
    for index, folder in enumerate(best_folders):
        metrics_file = os.path.join(folder, 'metrics.txt')
        if not os.path.exists(metrics_file):
            continue
        test_rmse = read_metrics(metrics_file).get('test_rmse')
        if test_rmse is not None and test_rmse < overall_best_test_rmse:
            overall_best_test_rmse = test_rmse
            overall_best_index = index

    if overall_best_index is not None:
        # Store the post-rename path so the returned list never points at a
        # folder name that no longer exists.
        best_folders[overall_best_index] = rename_folder(best_folders[overall_best_index], "BEST")

    return best_folders

In [None]:
# Usage
# Runs process_folders on the MLP log directory for this dataset and prints the
# list of folder paths it returns.
# NOTE(review): results_path (defined near the top of the notebook) points at
# logs/Binary/{dataset_name}/MLP_Binary, whereas this cell scans
# logs/Regression/{dataset_name}/MLP/ — confirm which directory is intended
# before running, because process_folders renames folders on disk.
base_path = f"logs/Regression/{dataset_name}/MLP/"
best_folders = process_folders(base_path)
print(f"Best model folder: {best_folders}")