In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import pandas as pd

# Define the neural network architecture
class ParallelNN(nn.Module):
    def __init__(self, input1_size, input2_size, hidden_size, num_classes):
        super(ParallelNN, self).__init__()

        # Define the first pipeline
        self.pipeline1 = nn.Sequential(
            nn.Linear(input1_size, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(2048, hidden_size)
        )

        # Define the second pipeline
        self.pipeline2 = nn.Sequential(
            nn.Linear(input2_size, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(2048, hidden_size)
        )

        # Define the final classification layer
        self.classification = nn.Sequential(
            nn.Linear(2 * hidden_size, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )

    def forward(self, x1, x2):
        out1 = self.pipeline1(x1)
        out2 = self.pipeline2(x2)

        # Concatenate the outputs
        merged_out = torch.cat((out1, out2), dim=1)

        # Apply the final classification layer
        final_out = self.classification(merged_out)
        return final_out
    
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = None
        self.counter = 0

    def should_stop(self, current_loss):
        if self.best_loss is None or current_loss < self.best_loss - self.min_delta:
            self.best_loss = current_loss
            self.counter = 0
        else:
            self.counter += 1
        
        return self.counter >= self.patience

early_stopping = EarlyStopping(patience=5, min_delta=0.001)


# Define the input sizes, hidden size, and number of classes
#input1_size = 10  # Change this to match the number of features in your first dataset
#input2_size = 8   # Change this to match the number of features in your second dataset
#hidden_size = 64
#num_classes = 3   # Change this to match the number of classes in your classification task

# Initialize the model
#model = ParallelNN(input1_size, input2_size, hidden_size, num_classes)

# Define your loss function and optimizer
#criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Assuming you have data1 and data2 as torch Tensors
# Replace this with your actual data
# data1 = torch.randn(10000, 10)  # Example: 100 samples, 10 features
# data2 = torch.randn(10000, 8)   # Example: 100 samples, 8 features
# labels = torch.randint(0, 3, (10000,))  # Example labels (0, 1, or 2)

device = "cpu"

# Load the data from a CSV file
csv_name = '../data/preprocessed_data.csv'
data = pd.read_csv(csv_name)

# Normalize the data within each column except the first
data.iloc[:, 1:] = (data.iloc[:, 1:] - data.iloc[:, 1:].mean()) / data.iloc[:, 1:].std()


# The first column is irrelevant, the second column is the label
# Divide the remaining columns into two datasets
NMR_FIRST_FEATURE = 60
data1 = torch.tensor(data.iloc[:, 1:NMR_FIRST_FEATURE - 1].values, dtype=torch.float32)
data2 = torch.tensor(data.iloc[:, NMR_FIRST_FEATURE - 1:].values, dtype=torch.float32)
labels = torch.tensor(data.iloc[:, 0].values, dtype=torch.long)

# Define model, loss function, and optimizer
model = ParallelNN(input1_size=data1.shape[1], input2_size=data2.shape[1], hidden_size=128, num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
k_folds = 5

# Prepare data loaders
dataset = TensorDataset(data1, data2, labels)
train_set, val_set = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), len(dataset) - int(0.8 * len(dataset))])
sampler = torch.utils.data.RandomSampler(train_set, replacement=True, num_samples=1000)
train_loader = DataLoader(train_set, batch_size=32, sampler=sampler)
val_loader = DataLoader(val_set, batch_size=16, shuffle=True)

# Training loop
def train(model, dataloader, criterion, optimizer):
    pbar = tqdm(dataloader)
    correct = 0

    for (data1, data2, labels) in pbar:
        model.train()

        data1, data2, labels = data1.to(device), data2.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(data1, data2)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        correct += (outputs.argmax(dim=1) == labels).float().sum()
        accuracy = correct / 1000 # Used in sampling

        
        # Validation loop
        val_loss = 0
        val_accuracy = 0
        model.eval()

        with torch.no_grad():
            for (data1, data2, labels) in val_loader:
                data1, data2, labels = data1.to(device), data2.to(device), labels.to(device)

                outputs = model(data1, data2)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_accuracy += (outputs.argmax(dim=1) == labels).float().sum()

        val_loss /= len(val_loader)
        val_accuracy /= len(val_set)
         
        pbar.set_description(f"Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    return val_loss, val_accuracy
        

In [105]:
# Main function
def main():
    num_epochs = 30

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        val_loss, val_accuracy = train(model, train_loader, criterion, optimizer)
        scheduler.step()

        if early_stopping.should_stop(val_loss):
            print("Early stopping")
            break

    print("Finished training")

if __name__ == '__main__':
    main()

Epoch 1/30


Loss: 0.9494, Accuracy: 0.5970, Val Loss: 1.0434, Val Accuracy: 0.4691: 100%|██████████| 32/32 [00:00<00:00, 40.91it/s]


Epoch 2/30


Loss: 0.9804, Accuracy: 0.7400, Val Loss: 0.9926, Val Accuracy: 0.5679: 100%|██████████| 32/32 [00:00<00:00, 41.49it/s]


Epoch 3/30


Loss: 0.6714, Accuracy: 0.8520, Val Loss: 1.0096, Val Accuracy: 0.4691: 100%|██████████| 32/32 [00:00<00:00, 53.86it/s]


Epoch 4/30


Loss: 1.5901, Accuracy: 0.8820, Val Loss: 1.2019, Val Accuracy: 0.4815: 100%|██████████| 32/32 [00:00<00:00, 52.86it/s]


Epoch 5/30


Loss: 1.2447, Accuracy: 0.9110, Val Loss: 1.1533, Val Accuracy: 0.4938: 100%|██████████| 32/32 [00:00<00:00, 58.65it/s]


Epoch 6/30


Loss: 0.6941, Accuracy: 0.9290, Val Loss: 1.1947, Val Accuracy: 0.4568: 100%|██████████| 32/32 [00:00<00:00, 56.26it/s]


Epoch 7/30


Loss: 0.3470, Accuracy: 0.9230, Val Loss: 1.1249, Val Accuracy: 0.4938: 100%|██████████| 32/32 [00:00<00:00, 47.37it/s]

Early stopping
Finished training



