In [1]:

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_rcv1
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import torch
import torch.nn as nn
import torch.optim as optim
import lightgbm as lgb
import optuna
import os

# Load RCV1 dataset
print("Loading RCV1 dataset...")
data = fetch_rcv1()
X, y = data.data[:50000], data.target[:50000]  # Limit dataset size for performance

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.toarray(), dtype=torch.float32)


Loading RCV1 dataset...


In [2]:
class NeuralNetwork(nn.Module):
        def __init__(self, input_dim, output_dim):
            super(NeuralNetwork, self).__init__()
            self.fc1 = nn.Linear(input_dim, 128)
            self.dropout = nn.Dropout(0.2)
            self.fc2 = nn.Linear(128, 64)
            self.fc3 = nn.Linear(64, output_dim)

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = self.dropout(x)
            x = torch.relu(self.fc2(x))
            x = torch.sigmoid(self.fc3(x))
            return x

In [3]:

# Define PyTorch Neural Network
def train_neural_network(X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor):
    print("Training neural network with PyTorch...")

    input_dim = X_train_tensor.shape[1]
    output_dim = y_train_tensor.shape[1]

    model = NeuralNetwork(input_dim, output_dim)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    best_model_path = 'best_nn_model.pth'
    best_precision = 0

    for epoch in range(10):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            y_pred = (model(X_test_tensor) > 0.5).float()
            precision = precision_score(y_test_tensor.numpy(), y_pred.numpy(), average='micro')
            recall = recall_score(y_test_tensor.numpy(), y_pred.numpy(), average='micro')

            if precision > best_precision:
                best_precision = precision
                torch.save(model.state_dict(), best_model_path)

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    print("Best model saved with precision:", best_precision)

# train_neural_network(X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor)


In [4]:

# LightGBM with Optuna
def objective(trial):
    param = {
        'objective': 'multiclass',
        'num_class': y_train.shape[1],
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0)
    }

    d_train = lgb.Dataset(X_train, label=np.argmax(y_train.toarray(), axis=1))
    d_valid = lgb.Dataset(X_test, label=np.argmax(y_test.toarray(), axis=1), reference=d_train)

    gbm = lgb.train(param, d_train, valid_sets=[d_valid])

    y_pred = np.argmax(gbm.predict(X_test), axis=1)
    accuracy = accuracy_score(np.argmax(y_test.toarray(), axis=1), y_pred)

    return accuracy

def train_lightgbm():
    print("Training LightGBM with Optuna...")
    global study
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=5)

    print("Best parameters:", study.best_params)

    best_params = study.best_params
    d_train = lgb.Dataset(X_train, label=np.argmax(y_train.toarray(), axis=1))
    best_gbm = lgb.train(best_params, d_train, num_boost_round=100, valid_sets=[d_train])

    predictions = best_gbm.predict(X_test)
    if predictions.ndim > 1:  # Multiclass output
        y_pred = np.argmax(predictions, axis=1)
    else:  # Binary classification output
        y_pred = (predictions > 0.5).astype(int)
        
    accuracy = accuracy_score(np.argmax(y_test.toarray(), axis=1), y_pred)
    precision = precision_score(np.argmax(y_test.toarray(), axis=1), y_pred, average='micro')
    recall = recall_score(np.argmax(y_test.toarray(), axis=1), y_pred, average='micro')

    print(f"LightGBM Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

# train_lightgbm()


In [5]:
# Compare PyTorch and LightGBM results
def compare_results():
    print("\nComparing PyTorch and LightGBM results...")

    # Load the best PyTorch model
    input_dim = X_train_tensor.shape[1]
    output_dim = y_train_tensor.shape[1]
    model = NeuralNetwork(input_dim, output_dim)
    model.load_state_dict(torch.load('best_nn_model.pth'))
    model.eval()

    with torch.no_grad():
        y_pred_nn = (model(X_test_tensor) > 0.5).float()
        accuracy_nn = accuracy_score(y_test_tensor.numpy(), y_pred_nn.numpy())
        precision_nn = precision_score(y_test_tensor.numpy(), y_pred_nn.numpy(), average='micro')
        recall_nn = recall_score(y_test_tensor.numpy(), y_pred_nn.numpy(), average='micro')

    print(f"PyTorch Neural Network -> Accuracy: {accuracy_nn:.4f}, Precision: {precision_nn:.4f}, Recall: {recall_nn:.4f}")

    # Evaluate the best LightGBM model
    best_params = study.best_params
    d_train = lgb.Dataset(X_train, label=np.argmax(y_train.toarray(), axis=1))
    best_gbm = lgb.train(best_params, d_train, num_boost_round=100, valid_sets=[d_train])

    # Handle binary and multiclass cases
    predictions = best_gbm.predict(X_test)
    if predictions.ndim > 1:  # Multiclass output
        y_pred_gbm = np.argmax(predictions, axis=1)
    else:  # Binary classification output
        y_pred_gbm = (predictions > 0.5).astype(int)

    accuracy_gbm = accuracy_score(np.argmax(y_test.toarray(), axis=1), y_pred_gbm)
    precision_gbm = precision_score(np.argmax(y_test.toarray(), axis=1), y_pred_gbm, average='micro')
    recall_gbm = recall_score(np.argmax(y_test.toarray(), axis=1), y_pred_gbm, average='micro')

    print(f"LightGBM -> Accuracy: {accuracy_gbm:.4f}, Precision: {precision_gbm:.4f}, Recall: {recall_gbm:.4f}")

    # Create a comparison table
    results = pd.DataFrame({
        'Model': ['PyTorch Neural Network', 'LightGBM'],
        'Accuracy': [accuracy_nn, accuracy_gbm],
        'Precision': [precision_nn, precision_gbm],
        'Recall': [recall_nn, recall_gbm]
    })

    print("\nResults Comparison:")
    print(results)

# Train both models and compare results
train_neural_network(X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor)
train_lightgbm()
compare_results()


Training neural network with PyTorch...
Epoch 1, Loss: 0.6980, Precision: 0.0363, Recall: 0.5797
Epoch 2, Loss: 0.6964, Precision: 0.0361, Recall: 0.5610
Epoch 3, Loss: 0.6948, Precision: 0.0321, Recall: 0.4779
Epoch 4, Loss: 0.6932, Precision: 0.0277, Recall: 0.3976
Epoch 5, Loss: 0.6916, Precision: 0.0280, Recall: 0.3965
Epoch 6, Loss: 0.6899, Precision: 0.0281, Recall: 0.3956
Epoch 7, Loss: 0.6880, Precision: 0.0281, Recall: 0.3885
Epoch 8, Loss: 0.6860, Precision: 0.0285, Recall: 0.3826
Epoch 9, Loss: 0.6837, Precision: 0.0292, Recall: 0.3790


[I 2025-01-18 09:05:03,532] A new study created in memory with name: no-name-54f36c7b-d78b-446e-8766-fc7fef2a60f6


Epoch 10, Loss: 0.6810, Precision: 0.0294, Recall: 0.3675
Best model saved with precision: 0.036263322844782744
Training LightGBM with Optuna...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.172190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 338979
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 2677
[LightGBM] [Info] Start training from score -3.459674
[LightGBM] [Info] Start training from score -4.268698
[LightGBM] [Info] Start training from score -3.224894
[LightGBM] [Info] Start training from score -4.917145
[LightGBM] [Info] Start training from score -1.852019
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -6.660895
[LightGBM

[I 2025-01-18 09:09:03,744] Trial 0 finished with value: 0.79428 and parameters: {'learning_rate': 0.0972184099484716, 'num_leaves': 34, 'max_depth': 15, 'min_data_in_leaf': 99, 'feature_fraction': 0.8383084745140217}. Best is trial 0 with value: 0.79428.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.214472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 374180
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 4175
[LightGBM] [Info] Start training from score -3.459674
[LightGBM] [Info] Start training from score -4.268698
[LightGBM] [Info] Start training from score -3.224894
[LightGBM] [Info] Start training from score -4.917145
[LightGBM] [Info] Start training from score -1.852019
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -6.660895
[LightGBM] [Info] Start training from score -3.133616
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -

[I 2025-01-18 09:12:57,920] Trial 1 finished with value: 0.7904 and parameters: {'learning_rate': 0.025610273873913982, 'num_leaves': 60, 'max_depth': 8, 'min_data_in_leaf': 46, 'feature_fraction': 0.6550074026846638}. Best is trial 0 with value: 0.79428.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.160432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 344921
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 2862
[LightGBM] [Info] Start training from score -3.459674
[LightGBM] [Info] Start training from score -4.268698
[LightGBM] [Info] Start training from score -3.224894
[LightGBM] [Info] Start training from score -4.917145
[LightGBM] [Info] Start training from score -1.852019
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -6.660895
[LightGBM] [Info] Start training from score -3.133616
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -

[I 2025-01-18 09:15:25,992] Trial 2 finished with value: 0.73356 and parameters: {'learning_rate': 0.010972303116613527, 'num_leaves': 101, 'max_depth': 5, 'min_data_in_leaf': 89, 'feature_fraction': 0.6379356409706949}. Best is trial 0 with value: 0.79428.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.182731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 369459
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 3904
[LightGBM] [Info] Start training from score -3.459674
[LightGBM] [Info] Start training from score -4.268698
[LightGBM] [Info] Start training from score -3.224894
[LightGBM] [Info] Start training from score -4.917145
[LightGBM] [Info] Start training from score -1.852019
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -6.660895
[LightGBM] [Info] Start training from score -3.133616
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -

[I 2025-01-18 09:20:20,341] Trial 3 finished with value: 0.79748 and parameters: {'learning_rate': 0.04004340165450762, 'num_leaves': 106, 'max_depth': 9, 'min_data_in_leaf': 52, 'feature_fraction': 0.655541341285485}. Best is trial 3 with value: 0.79748.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.157103 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 362672
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 3560
[LightGBM] [Info] Start training from score -3.459674
[LightGBM] [Info] Start training from score -4.268698
[LightGBM] [Info] Start training from score -3.224894
[LightGBM] [Info] Start training from score -4.917145
[LightGBM] [Info] Start training from score -1.852019
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -6.660895
[LightGBM] [Info] Start training from score -3.133616
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -

[I 2025-01-18 09:24:19,272] Trial 4 finished with value: 0.79924 and parameters: {'learning_rate': 0.03653184908880311, 'num_leaves': 30, 'max_depth': 15, 'min_data_in_leaf': 61, 'feature_fraction': 0.8994811769260107}. Best is trial 4 with value: 0.79924.


Best parameters: {'learning_rate': 0.03653184908880311, 'num_leaves': 30, 'max_depth': 15, 'min_data_in_leaf': 61, 'feature_fraction': 0.8994811769260107}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.191862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 362672
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 3560
[LightGBM] [Info] Start training from score 43.928640
LightGBM Accuracy: 0.0134, Precision: 0.0134, Recall: 0.0134

Comparing PyTorch and LightGBM results...
PyTorch Neural Network -> Accuracy: 0.0000, Precision: 0.0363, Recall: 0.5797
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.223335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi