# Load data

In [None]:
import sys
sys.path.append('../flwrapp')

In [None]:
import datasets
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from dentalData.Pipelines import entire_data_processing_pipeline

In [None]:
# Pass the data directory explicitly. The previous `sys.path[-1]` only resolved
# to '../flwrapp' as long as nothing appended to sys.path after the notebook's
# own `sys.path.append('../flwrapp')`.
ds = entire_data_processing_pipeline('../flwrapp')

In [None]:
# Materialise features and labels as float arrays (one conversion per array;
# np.array always copies, so X and y do not alias the pipeline output).
X = np.array(ds['features'], dtype=float)
y = np.array(ds['labels'], dtype=float)

In [None]:
def kFoldCrossValidationSplit(X, y, k = 5, shuffle = False, seed = None):
    """Split ``X`` and ``y`` into ``k`` train/test folds for cross-validation.

    Every sample lands in exactly one test fold. When the number of samples is
    not divisible by ``k`` the first ``n % k`` folds receive one extra sample,
    so no trailing samples are silently left out of testing. For sample counts
    divisible by ``k`` the folds are contiguous, equally sized segments.

    Args:
        X: Array of features, one entry per sample along the first axis.
        y: Array of labels with the same number of samples as ``X``.
        k: Number of folds; must satisfy ``1 <= k <= number of samples``.
        shuffle: If True, permute the samples once before splitting.
        seed: Seed for the permutation; only used when ``shuffle`` is True.

    Returns:
        A list of ``k`` dicts with the keys ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``k`` is out of range.
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must hold the same number of samples, got {n_samples} and {len(y)}"
        )
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and {n_samples}, got {k}")

    if shuffle:
        order = np.random.default_rng(seed).permutation(n_samples)
        X, y = X[order], y[order]

    # Spread the remainder over the first folds instead of dropping it.
    base_size, extra = divmod(n_samples, k)
    segmentations = []
    start = 0
    for i in range(k):
        end = start + base_size + (1 if i < extra else 0)
        segmentations.append({
            "X_train": np.concatenate([X[:start], X[end:]]),
            "y_train": np.concatenate([y[:start], y[end:]]),
            "X_test": X[start:end],
            "y_test": y[start:end],
        })
        start = end
    return segmentations
        
# Set k for the k-fold cross-validation

# Number of cross-validation folds
k = 5

# One train/test dict per fold
segmentations = kFoldCrossValidationSplit(X, y, k=k)

# Notebook display: shape of the first fold's training labels
np.shape(segmentations[0]["y_train"])

In [None]:
def createSubTrainsets(segmentations: list):
    """Derive 10 %, 50 % and 100 % training subsets for every fold.

    The 10 % and 50 % subsets are prefixes of the fold's training arrays (no
    reshuffling), with the prefix length truncated to a whole number of
    samples. The fold's test split is passed through unchanged.

    Args:
        segmentations: List of dicts with the keys ``"X_train"``,
            ``"y_train"``, ``"X_test"`` and ``"y_test"``.

    Returns:
        A list with one dict per fold, keyed ``X_train_10p``, ``y_train_10p``,
        ``X_train_50p``, ``y_train_50p``, ``X_train_100p``, ``y_train_100p``,
        ``X_test_total`` and ``y_test_total``.
    """
    def _with_subsets(fold):
        features, targets = fold["X_train"], fold["y_train"]
        n_train = features.shape[0]
        tenth = int(0.1 * n_train)
        half = int(0.5 * n_train)
        return {
            "X_train_10p": features[:tenth],
            "y_train_10p": targets[:tenth],
            "X_train_50p": features[:half],
            "y_train_50p": targets[:half],
            "X_train_100p": features,
            "y_train_100p": targets,
            "X_test_total": fold["X_test"],
            "y_test_total": fold["y_test"],
        }

    return [_with_subsets(fold) for fold in segmentations]

# Attach the 10 % / 50 % / 100 % training subsets to every fold
splitSegmentations = createSubTrainsets(segmentations)

# Notebook display: size of the first fold's 10 % subset and the mean label of
# its full training split
(
    splitSegmentations[0]['X_train_10p'].shape,
    np.mean(splitSegmentations[0]['y_train_100p']),
)

# Function for creating dataloaders

In [None]:
def convertToDataloaders(trainSets, testSets, batch_size=32):
    """Wrap train and test datasets in PyTorch DataLoaders.

    Training loaders are reshuffled every epoch. Test loaders keep the dataset
    order so that evaluation is deterministic: the test loss is a mean of
    per-batch means, so with a partial final batch a reshuffle would change it
    from run to run.

    Args:
        trainSets: Iterable of map-style datasets whose items are dicts with
            the keys ``'features'`` and ``'labels'``.
        testSets: Iterable of datasets with the same item layout.
        batch_size: Number of samples per batch for every loader.

    Returns:
        ``(trainloaders, testloaders)``: two lists of DataLoaders, in the same
        order as the inputs. Each batch is a dict with float32 tensors under
        ``'features'`` and ``'labels'``.
    """
    def collate_fn(batch):
        # Stack the per-item lists into tensors. float32 is requested
        # explicitly so integer-valued inputs still match the float dtype
        # that nn.Linear and BCELoss expect.
        features = torch.tensor([item['features'] for item in batch], dtype=torch.float32)
        labels = torch.tensor([item['labels'] for item in batch], dtype=torch.float32)
        return {'features': features, 'labels': labels}

    trainloaders = [
        DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        for trainset in trainSets
    ]
    testloaders = [
        DataLoader(testset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
        for testset in testSets
    ]
    return trainloaders, testloaders

In [None]:
def createDataLoader(segmentation):
    """Build the train and test DataLoaders for a single fold.

    Args:
        segmentation: Dict holding the ``X_train_*p`` / ``y_train_*p`` subsets
            and the ``X_test_total`` / ``y_test_total`` split of one fold.

    Returns:
        Dict with the loaders under ``"train_10p"``, ``"train_50p"``,
        ``"train_100p"`` and ``"test"``.
    """
    def _as_dataset(feature_key, label_key):
        # One Hugging Face dataset with the two columns the collate step reads.
        return datasets.Dataset.from_dict(
            {"features": segmentation[feature_key], "labels": segmentation[label_key]}
        )

    train_keys = (
        ("X_train_10p", "y_train_10p"),
        ("X_train_50p", "y_train_50p"),
        ("X_train_100p", "y_train_100p"),
    )
    trainsets = [_as_dataset(x_key, y_key) for x_key, y_key in train_keys]
    testSets = [_as_dataset("X_test_total", "y_test_total")]

    trainloaders, testloaders = convertToDataloaders(trainsets, testSets)

    return {
        "train_10p": trainloaders[0],
        "train_50p": trainloaders[1],
        "train_100p": trainloaders[2],
        "test": testloaders[0],
    }

# One dict of DataLoaders per cross-validation fold
dataloaderSegmentations = [createDataLoader(fold) for fold in splitSegmentations]

dataloaderSegmentations

# Define MLP to train and classify

In [None]:
import torch
from torch import nn


class Net(nn.Module):
    """Four-layer MLP for binary classification.

    Layer widths are ``in_features -> 40 -> 24 -> 6 -> 1`` with ReLU between
    the layers and a sigmoid on the output, so the result can be fed directly
    to ``BCELoss``.

    Args:
        in_features: Width of the input feature vector. Defaults to 115, the
            value the network was originally hard-coded to.
    """

    def __init__(self, in_features=115):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 40)
        self.fc2 = nn.Linear(40, 24)
        self.fc3 = nn.Linear(24, 6)
        self.fc4 = nn.Linear(6, 1)

    def forward(self, x):
        """Return one sigmoid output per input row, shape ``(..., 1)``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        return torch.sigmoid(self.fc4(x))

In [None]:
def train(net, trainloader, lr, device, optimizer=None):
    """Train the net for one pass over the training set.

    Args:
        net: Model whose output has shape ``(batch, 1)`` with values in
            ``[0, 1]``, as required by ``BCELoss``.
        trainloader: Sized iterable of batches; each batch is a dict with
            tensors under ``'features'`` and ``'labels'``.
        lr: Learning rate for the Adam optimizer that is created when
            ``optimizer`` is None.
        device: Device the model and every batch are moved to.
        optimizer: Optional optimizer to reuse. Without it a fresh Adam is
            built on every call, which discards Adam's running moment
            estimates; pass the same instance on each call to keep that state
            across epochs.

    Returns:
        The mean of the per-batch losses.
    """
    net.to(device)
    criterion = torch.nn.BCELoss()
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    net.train()
    epoch_loss = 0.0
    for batch in trainloader:
        data = batch['features'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        # Drop the trailing singleton axis so outputs align with the 1-D targets.
        outputs = net(data)[:, 0]

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    return epoch_loss / len(trainloader)
    
def test(net, testloader, device):
    """Validate the net on the test set."""
    net.to(device)
    criterion = torch.nn.BCELoss()
    correct, loss = 0, 0.0 
    with torch.no_grad():
        for batch in testloader:
            data = batch["features"]
            labels = batch["labels"]
            outputs = net(data.to(device))[:, 0]
            loss += criterion(outputs, labels.to(device)).item()
            correct += (outputs.data >= 0.5).eq(labels.to(device)).sum().item()
    accuracy = correct / len(testloader.dataset)
    loss = loss / len(testloader)
    
    return loss, accuracy

In [None]:
def trainAndEvaluateNetwork(net, epochs, trainloader, testloaders, lr, device):
    """Train ``net`` for ``epochs`` epochs, evaluating on every test loader.

    Each test loader is evaluated once before training and once after every
    epoch, so a test history holds ``epochs + 1`` entries while the training
    history holds ``epochs`` entries.

    Args:
        net: Model to train in place.
        epochs: Number of passes over ``trainloader``.
        trainloader: Batches used for training.
        testloaders: Dict mapping a name to a test loader. Any key is
            accepted.
        lr: Learning rate forwarded to ``train``.
        device: Device forwarded to ``train`` and ``test``.

    Returns:
        ``(trainloss, testloss, testaccuracy)``: the list of per-epoch training
        losses, and two dicts mapping each test-loader name to its loss and
        accuracy history. The ``'global'`` key is always present, even when
        ``testloaders`` has no such entry.
    """
    trainloss = []
    # 'global' is pre-seeded for backward compatibility; other keys are
    # created on first use instead of raising KeyError.
    testloss = {'global': []}
    testaccuracy = {'global': []}

    for key, tl in testloaders.items():
        # Get initial loss and accuracy on all test sets
        initTestLoss, initAccuracy = test(net, tl, device)
        print(f"Initial {key} - Test Loss: {initTestLoss}, Accuracy: {initAccuracy}")
        testloss.setdefault(key, []).append(initTestLoss)
        testaccuracy.setdefault(key, []).append(initAccuracy)

    # Run through the given number of epochs
    for epoch in range(epochs):
        trainingLoss = train(net, trainloader, lr, device)
        print(f"Epoch {epoch+1}/{epochs} - Trainloss: {trainingLoss:.4f}")
        trainloss.append(trainingLoss)

        # Evaluate the updated model on the test sets
        for key, tl in testloaders.items():
            testLoss, accuracy = test(net, tl, device)
            print(f"{key} - Test Loss: {testLoss}, Accuracy: {accuracy}")
            testloss.setdefault(key, []).append(testLoss)
            testaccuracy.setdefault(key, []).append(accuracy)

    return trainloss, testloss, testaccuracy
    

# Experiment 1 tests on ML

In [None]:
# Hyperparameters shared by every experiment below
lr = 1e-4
epochs = 500

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Train 10%

In [None]:
# Train a fresh network on every fold's 10 % subset and keep the per-fold
# metric histories; they are averaged over the folds in the next cell.
# After the loop `net` stays bound to the model of the last fold.
trainloss10pLIST, testloss10pLIST, testaccuracy10pLIST = [], [], []
for segmentation in dataloaderSegmentations:
    net = Net()

    trainloss10p, testloss10p, testaccuracy10p = trainAndEvaluateNetwork(
        net=net,
        epochs=epochs,
        trainloader=segmentation['train_10p'],
        testloaders={"global": segmentation['test']},
        lr=lr,
        device=device,
    )
    trainloss10pLIST.append(trainloss10p)
    testloss10pLIST.append(testloss10p)
    testaccuracy10pLIST.append(testaccuracy10p)

In [None]:
# Pull the 'global' history out of every fold's result dict
testloss10pLISTglobal = [history['global'] for history in testloss10pLIST]
testaccuracy10pLISTglobal = [history['global'] for history in testaccuracy10pLIST]

# Average each metric over the folds, epoch by epoch (axis 0 indexes the folds)
trainloss10pMEAN = np.array(trainloss10pLIST).sum(axis=0) / len(trainloss10pLIST)
testloss10pMEAN = {
    "global": np.array(testloss10pLISTglobal).sum(axis=0) / len(testloss10pLISTglobal)
}
testaccuracy10pMEAN = {
    "global": np.array(testaccuracy10pLISTglobal).sum(axis=0) / len(testaccuracy10pLISTglobal)
}

### Plot of results

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Fold-averaged train and test loss for the 10 % subset.
# The test curve has one extra leading point: the evaluation before training.
plt.xlabel('Epochs')
plt.plot(trainloss10pMEAN, label='10% Train Loss')
plt.plot(testloss10pMEAN['global'], label='10% Test Loss')
plt.legend()

In [None]:
# Fold-averaged test accuracy for the 10 % subset
plt.plot(testaccuracy10pMEAN['global'], label='Global Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.grid()

# Fix the y-axis to the full accuracy range
plt.ylim(0, 1)

plt.legend()

### Confusion matrices

In [None]:
# Make a confusion matrix for the test set
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, title='Confusion Matrix', labels=None):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Title shown above the heatmap.
        labels: Label values forwarded to ``confusion_matrix`` and used as the
            row and column names of the heatmap.
    """
    counts = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=labels),
        index=labels,
        columns=labels,
    )
    plt.figure(figsize=(8, 6))
    # fmt='d' prints the cell counts as integers
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.ylabel('Known to be true')
    plt.xlabel('Predicted')
    plt.show()

In [None]:
# `net` is the model trained on the LAST fold only. The test splits of the
# other folds overlap that model's training data, so only the last fold's
# test split is a valid hold-out set for it.
segmentation = splitSegmentations[-1]
X_test_total_tensor = torch.tensor(segmentation['X_test_total'], dtype=torch.float32)

net.eval()
with torch.no_grad():
    # Training may have moved the model to the GPU; run the inputs there too.
    probabilities = net(X_test_total_tensor.to(next(net.parameters()).device))[:, 0]
# Hand sklearn a plain 1-D array of 0/1 predictions.
predictions = (probabilities >= 0.5).cpu().numpy().astype(int)

plot_confusion_matrix(segmentation['y_test_total'], predictions, title='10% Test Set Confusion Matrix', labels=[0, 1])

## Train 50%

In [None]:
# Train a fresh network on every fold's 50 % subset and keep the per-fold
# metric histories; they are averaged over the folds in the next cell.
# After the loop `net` stays bound to the model of the last fold.
trainloss50pLIST, testloss50pLIST, testaccuracy50pLIST = [], [], []
for segmentation in dataloaderSegmentations:
    net = Net()

    trainloss50p, testloss50p, testaccuracy50p = trainAndEvaluateNetwork(
        net=net,
        epochs=epochs,
        trainloader=segmentation['train_50p'],
        testloaders={"global": segmentation['test']},
        lr=lr,
        device=device,
    )
    trainloss50pLIST.append(trainloss50p)
    testloss50pLIST.append(testloss50p)
    testaccuracy50pLIST.append(testaccuracy50p)

In [None]:
# Pull the 'global' history out of every fold's result dict
testloss50pLISTglobal = [history['global'] for history in testloss50pLIST]
testaccuracy50pLISTglobal = [history['global'] for history in testaccuracy50pLIST]

# Average each metric over the folds, epoch by epoch (axis 0 indexes the folds)
trainloss50pMEAN = np.array(trainloss50pLIST).sum(axis=0) / len(trainloss50pLIST)
testloss50pMEAN = {
    "global": np.array(testloss50pLISTglobal).sum(axis=0) / len(testloss50pLISTglobal)
}
testaccuracy50pMEAN = {
    "global": np.array(testaccuracy50pLISTglobal).sum(axis=0) / len(testaccuracy50pLISTglobal)
}

### Plot the data

In [None]:
# Fold-averaged train and test loss for the 50 % subset.
# The test curve has one extra leading point: the evaluation before training.
plt.xlabel('Epochs')
plt.plot(trainloss50pMEAN, label='50% Train Loss')
plt.plot(testloss50pMEAN['global'], label='50% Test Loss')
plt.legend()

In [None]:
# Fold-averaged test accuracy for the 50 % subset
plt.plot(testaccuracy50pMEAN['global'], label='50% Test Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.grid()

# Fix the y-axis to the full accuracy range
plt.ylim(0, 1)

plt.legend()

In [None]:
# splitSegmentations is a list of per-fold dicts, so it has to be indexed by
# fold first. `net` is the model trained on the LAST fold, and the last fold's
# test split is the only data that model never trained on.
segmentation = splitSegmentations[-1]
X_test_total_tensor = torch.tensor(segmentation['X_test_total'], dtype=torch.float32)

net.eval()
with torch.no_grad():
    # Training may have moved the model to the GPU; run the inputs there too.
    probabilities = net(X_test_total_tensor.to(next(net.parameters()).device))[:, 0]
# Hand sklearn a plain 1-D array of 0/1 predictions.
predictions = (probabilities >= 0.5).cpu().numpy().astype(int)

plot_confusion_matrix(segmentation['y_test_total'], predictions, title='50% Test Set Confusion Matrix', labels=[0, 1])

## Train 100%

In [None]:
# Train a fresh network on every fold's full training split and keep the
# per-fold metric histories; they are averaged over the folds in the next cell.
# After the loop `net` stays bound to the model of the last fold.
trainloss100pLIST, testloss100pLIST, testaccuracy100pLIST = [], [], []
for segmentation in dataloaderSegmentations:
    net = Net()

    trainloss100p, testloss100p, testaccuracy100p = trainAndEvaluateNetwork(
        net=net,
        epochs=epochs,
        trainloader=segmentation['train_100p'],
        testloaders={"global": segmentation['test']},
        lr=lr,
        device=device,
    )
    trainloss100pLIST.append(trainloss100p)
    testloss100pLIST.append(testloss100p)
    testaccuracy100pLIST.append(testaccuracy100p)

In [None]:
# Pull the 'global' history out of every fold's result dict
testloss100pLISTglobal = [history['global'] for history in testloss100pLIST]
testaccuracy100pLISTglobal = [history['global'] for history in testaccuracy100pLIST]

# Average each metric over the folds, epoch by epoch (axis 0 indexes the folds)
trainloss100pMEAN = np.array(trainloss100pLIST).sum(axis=0) / len(trainloss100pLIST)
testloss100pMEAN = {
    "global": np.array(testloss100pLISTglobal).sum(axis=0) / len(testloss100pLISTglobal)
}
testaccuracy100pMEAN = {
    "global": np.array(testaccuracy100pLISTglobal).sum(axis=0) / len(testaccuracy100pLISTglobal)
}

### Plot the data

In [None]:
# Fold-averaged train and test loss for the full training split.
# The test curve has one extra leading point: the evaluation before training.
plt.xlabel('Epochs')
plt.plot(trainloss100pMEAN, label='100% Train Loss')
plt.plot(testloss100pMEAN['global'], label='100% Test Loss')
plt.legend()

# Mean test loss after the final epoch
print(f"{testloss100pMEAN['global'][-1]=}")

In [None]:
# Fold-averaged test accuracy for the full training split
plt.plot(testaccuracy100pMEAN['global'], label='100% Test Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.grid()

# Fix the y-axis to the full accuracy range
plt.ylim(0, 1)

plt.legend()

In [None]:
# splitSegmentations is a list of per-fold dicts, so it has to be indexed by
# fold first. `net` is the model trained on the LAST fold, and the last fold's
# test split is the only data that model never trained on.
segmentation = splitSegmentations[-1]
X_test_total_tensor = torch.tensor(segmentation['X_test_total'], dtype=torch.float32)

net.eval()
with torch.no_grad():
    # Training may have moved the model to the GPU; run the inputs there too.
    probabilities = net(X_test_total_tensor.to(next(net.parameters()).device))[:, 0]
# Hand sklearn a plain 1-D array of 0/1 predictions.
predictions = (probabilities >= 0.5).cpu().numpy().astype(int)

# The ground truth is the label vector (y), not the feature matrix (X).
plot_confusion_matrix(segmentation['y_test_total'], predictions, title='100% Test Set Confusion Matrix', labels=[0, 1])

# All Accuracies plotted together:

In [None]:
# Fold-averaged test accuracy of all three training-set sizes in one figure
plt.plot(testaccuracy100pMEAN['global'], label='100% Test Accuracy')
plt.plot(testaccuracy50pMEAN['global'], label='50% Test Accuracy')
plt.plot(testaccuracy10pMEAN['global'], label='10% Test Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.grid()

# Fix the y-axis to the full accuracy range
plt.ylim(0, 1)

plt.legend()

In [None]:
# Fold-averaged test accuracy after the final epoch, per training-set size.
# The f-string `=` specifier prints the expression text next to its value.
print(f"{testaccuracy100pMEAN['global'][-1]=}")
print(f"{testaccuracy50pMEAN['global'][-1]=}")
print(f"{testaccuracy10pMEAN['global'][-1]=}")
# Best fold-averaged test accuracy reached at any evaluation point
print(f"{np.max(testaccuracy100pMEAN['global'])=}")
print(f"{np.max(testaccuracy50pMEAN['global'])=}")
print(f"{np.max(testaccuracy10pMEAN['global'])=}")