In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import KFold

def reset_weights(m):
    '''
    Try resetting model weights to avoid weight leakage.
    '''
    for layer in m.children():
        if hasattr(layer, 'reset_parameters'):
            layer.reset_parameters()

# Define the neural network architecture
class ParallelNN(nn.Module):
    def __init__(self, input1_size, input2_size, hidden_size, num_classes):
        super(ParallelNN, self).__init__()

        # Define the first pipeline
        self.pipeline1 = nn.Sequential(
            nn.Linear(input1_size, 8192),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(8192, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, hidden_size)
        )

        # Define the second pipeline
        self.pipeline2 = nn.Sequential(
            nn.Linear(input2_size, 8192),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(8192, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, hidden_size)
        )

        # Define the final classification layer
        self.classification = nn.Sequential(
            nn.Linear(2 * hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )

    def forward(self, x1, x2):
        out1 = self.pipeline1(x1)
        out2 = self.pipeline2(x2)

        # Concatenate the outputs
        merged_out = torch.cat((out1, out2), dim=1)

        # Apply the final classification layer
        final_out = self.classification(merged_out)
        return final_out

# Define the input sizes, hidden size, and number of classes
#input1_size = 10  # Change this to match the number of features in your first dataset
#input2_size = 8   # Change this to match the number of features in your second dataset
#hidden_size = 64
#num_classes = 3   # Change this to match the number of classes in your classification task

# Initialize the model
#model = ParallelNN(input1_size, input2_size, hidden_size, num_classes)

# Define your loss function and optimizer
#criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001)


In [12]:

def reset_weights(m):
  '''
    Try resetting model weights to avoid
    weight leakage.
  '''
  for layer in m.children():
   if hasattr(layer, 'reset_parameters'):
    layer.reset_parameters()
  
  
if __name__ == '__main__':
  
  # Configuration options
  k_folds = 5
  num_epochs = 10
  loss_function = nn.CrossEntropyLoss()
  
  # For fold results
  results = {}
  
  
  # Load the data from a CSV file
  csv_name = 'preprocessed_data.csv'
  data = pd.read_csv(csv_name)

  # Normalize the data within each column except the first
  data.iloc[:, 1:] = (data.iloc[:, 1:] - data.iloc[:, 1:].mean()) / data.iloc[:, 1:].std()


  # The first column is irrelevant, the second column is the label
  # Divide the remaining columns into two datasets
  NMR_FIRST_FEATURE = 60
  data1 = torch.tensor(data.iloc[:, 1:NMR_FIRST_FEATURE - 1].values, dtype=torch.float32)
  data2 = torch.tensor(data.iloc[:, NMR_FIRST_FEATURE - 1:].values, dtype=torch.float32)
  labels = torch.tensor(data.iloc[:, 0].values, dtype=torch.long)
  dataset = TensorDataset(data1, data2, labels)
  
  # Define the K-fold Cross Validator
  kfold = KFold(n_splits=k_folds, shuffle=True)
    
  # Start print
  print('--------------------------------')

  # K-fold Cross Validation model evaluation
  for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
    
    # Print
    print(f'FOLD {fold}')
    print('--------------------------------')
    
    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    
    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
                      dataset, 
                      batch_size=10, sampler=train_subsampler)
    testloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=10, sampler=test_subsampler)
    
    # Init the neural network
    model = ParallelNN(data1.shape[1], data2.shape[1], 64, 3)
    model.apply(reset_weights)
    
    # Initialize optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    
    # Run the training loop for defined number of epochs
    for epoch in range(0, num_epochs):

      # Print epoch
      print(f'Starting epoch {epoch+1}')

      # Set current loss value
      current_loss = 0.0

      # Iterate over the DataLoader for training data
      for (data1, data2, labels) in tqdm(trainloader):
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Perform forward pass
        outputs = model(data1, data2)
        
        # Compute loss
        loss = loss_function(outputs, labels)
        
        # Perform backward pass
        loss.backward()
        
        # Perform optimization
        optimizer.step()
        
        # Print statistics
        current_loss += loss.item()
        if i % 50 == 49:
            print('Loss after mini-batch %5d: %.3f' %
                  (i + 1, current_loss / 50))
            current_loss = 0.0
        
        
            
    # Process is complete.
    print('Training process has finished. Saving trained model.')

    # Print about testing
    print('Starting testing')
    
    # Saving the model
    # save_path = f'./model-fold-{fold}.pth'
    # torch.save(model.state_dict(), save_path)

    # Evaluationfor this fold
    correct, total = 0, 0
    with torch.no_grad():

      # Iterate over the test data and generate predictions
      for i, data in enumerate(testloader, 0):

        # Get inputs
        data1, data2, labels = data

        # Generate outputs
        outputs = model(data1, data2)

        # Set total and correct
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

      # Print accuracy
      print('Accuracy for fold %d: %d %%' % (fold, 100.0 * correct / total))
      print('--------------------------------')
      results[fold] = 100.0 * (correct / total)
    
  # Print fold results
  print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
  print('--------------------------------')
  sum = 0.0
  for key, value in results.items():
    print(f'Fold {key}: {value} %')
    sum += value
  print(f'Average: {sum/len(results.items())} %')

--------------------------------
FOLD 0
--------------------------------
Starting epoch 1


100%|██████████| 33/33 [00:14<00:00,  2.25it/s]


Starting epoch 2


100%|██████████| 33/33 [00:12<00:00,  2.61it/s]


Starting epoch 3


100%|██████████| 33/33 [00:22<00:00,  1.48it/s]


Starting epoch 4


100%|██████████| 33/33 [00:13<00:00,  2.51it/s]


Starting epoch 5


100%|██████████| 33/33 [00:13<00:00,  2.52it/s]


Starting epoch 6


100%|██████████| 33/33 [00:17<00:00,  1.93it/s]


Starting epoch 7


100%|██████████| 33/33 [00:14<00:00,  2.23it/s]


Starting epoch 8


100%|██████████| 33/33 [00:19<00:00,  1.69it/s]


Starting epoch 9


100%|██████████| 33/33 [00:17<00:00,  1.94it/s]


Starting epoch 10


100%|██████████| 33/33 [00:19<00:00,  1.68it/s]


Training process has finished. Saving trained model.
Starting testing
Accuracy for fold 0: 40 %
--------------------------------
FOLD 1
--------------------------------
Starting epoch 1


100%|██████████| 33/33 [00:17<00:00,  1.89it/s]


Starting epoch 2


100%|██████████| 33/33 [00:14<00:00,  2.26it/s]


Starting epoch 3


100%|██████████| 33/33 [00:15<00:00,  2.12it/s]


Starting epoch 4


100%|██████████| 33/33 [00:14<00:00,  2.30it/s]


Starting epoch 5


100%|██████████| 33/33 [00:19<00:00,  1.67it/s]


Starting epoch 6


100%|██████████| 33/33 [00:19<00:00,  1.66it/s]


Starting epoch 7


100%|██████████| 33/33 [00:18<00:00,  1.75it/s]


Starting epoch 8


100%|██████████| 33/33 [00:16<00:00,  2.02it/s]


Starting epoch 9


100%|██████████| 33/33 [00:15<00:00,  2.09it/s]


Starting epoch 10


100%|██████████| 33/33 [00:11<00:00,  2.79it/s]


Training process has finished. Saving trained model.
Starting testing
Accuracy for fold 1: 44 %
--------------------------------
FOLD 2
--------------------------------
Starting epoch 1


100%|██████████| 33/33 [00:11<00:00,  2.82it/s]


Starting epoch 2


100%|██████████| 33/33 [00:11<00:00,  2.79it/s]


Starting epoch 3


100%|██████████| 33/33 [00:11<00:00,  2.85it/s]


Starting epoch 4


100%|██████████| 33/33 [00:11<00:00,  2.80it/s]


Starting epoch 5


100%|██████████| 33/33 [00:12<00:00,  2.67it/s]


Starting epoch 6


100%|██████████| 33/33 [00:12<00:00,  2.58it/s]


Starting epoch 7


100%|██████████| 33/33 [00:11<00:00,  2.76it/s]


Starting epoch 8


100%|██████████| 33/33 [00:11<00:00,  2.77it/s]


Starting epoch 9


100%|██████████| 33/33 [00:11<00:00,  2.76it/s]


Starting epoch 10


100%|██████████| 33/33 [00:19<00:00,  1.67it/s]


Training process has finished. Saving trained model.
Starting testing
Accuracy for fold 2: 48 %
--------------------------------
FOLD 3
--------------------------------
Starting epoch 1


100%|██████████| 33/33 [00:34<00:00,  1.04s/it]


Starting epoch 2


100%|██████████| 33/33 [00:14<00:00,  2.25it/s]


Starting epoch 3


100%|██████████| 33/33 [00:24<00:00,  1.35it/s]


Starting epoch 4


100%|██████████| 33/33 [00:21<00:00,  1.55it/s]


Starting epoch 5


100%|██████████| 33/33 [00:22<00:00,  1.44it/s]


Starting epoch 6


100%|██████████| 33/33 [00:14<00:00,  2.31it/s]


Starting epoch 7


100%|██████████| 33/33 [00:20<00:00,  1.63it/s]


Starting epoch 8


100%|██████████| 33/33 [00:23<00:00,  1.42it/s]


Starting epoch 9


100%|██████████| 33/33 [00:16<00:00,  2.01it/s]


Starting epoch 10


100%|██████████| 33/33 [00:17<00:00,  1.86it/s]


Training process has finished. Saving trained model.
Starting testing
Accuracy for fold 3: 52 %
--------------------------------
FOLD 4
--------------------------------
Starting epoch 1


100%|██████████| 33/33 [00:15<00:00,  2.11it/s]


Starting epoch 2


100%|██████████| 33/33 [00:14<00:00,  2.22it/s]


Starting epoch 3


100%|██████████| 33/33 [00:13<00:00,  2.50it/s]


Starting epoch 4


100%|██████████| 33/33 [00:34<00:00,  1.04s/it]


Starting epoch 5


100%|██████████| 33/33 [00:23<00:00,  1.39it/s]


Starting epoch 6


100%|██████████| 33/33 [00:24<00:00,  1.36it/s]


Starting epoch 7


100%|██████████| 33/33 [00:24<00:00,  1.34it/s]


Starting epoch 8


100%|██████████| 33/33 [00:19<00:00,  1.72it/s]


Starting epoch 9


100%|██████████| 33/33 [00:22<00:00,  1.50it/s]


Starting epoch 10


100%|██████████| 33/33 [00:19<00:00,  1.72it/s]


Training process has finished. Saving trained model.
Starting testing
Accuracy for fold 4: 47 %
--------------------------------
K-FOLD CROSS VALIDATION RESULTS FOR 5 FOLDS
--------------------------------
Fold 0: 40.74074074074074 %
Fold 1: 44.44444444444444 %
Fold 2: 48.75 %
Fold 3: 52.5 %
Fold 4: 47.5 %
Average: 46.78703703703704 %
