# Load data

In [None]:
import sys
sys.path.append('../flwrapp')

In [None]:
import datasets
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from dentalData.Pipelines import entire_data_processing_pipeline

In [None]:
# Run the project's data-processing pipeline on the last entry of sys.path.
# NOTE(review): this assumes the last entry is still the '../flwrapp' directory
# appended at the top of the notebook, i.e. nothing else extended sys.path since.
# The result is later indexed with 'features' and 'labels'.
ds = entire_data_processing_pipeline(sys.path[-1])

In [None]:
# Materialise features and labels as NumPy arrays and cast both to float
# (per the original note, the labels would otherwise be int64).
X = np.array(ds['features']).astype(float)
y = np.array(ds['labels']).astype(float)

In [None]:
def kFoldCrossValidationSplit(X, y, k = 5):
    """Split ``X`` and ``y`` into ``k`` contiguous train/test folds.

    The samples are cut, in their existing order, into ``k`` consecutive
    segments. Each segment serves once as the test set while the remaining
    samples form the training set.

    When the number of samples is not divisible by ``k`` the first
    ``n_samples % k`` folds receive one extra sample, so every sample is
    part of exactly one test set (no trailing samples are left untested).

    Args:
        X: 2-D array of features with one row per sample.
        y: Array of labels whose first axis matches the rows of ``X``.
        k: Number of folds.

    Returns:
        A list of ``k`` dictionaries with the keys ``"X_train"``,
        ``"y_train"``, ``"X_test"`` and ``"y_test"``.
    """
    # TODO: Maybe add shuffling before splitting
    n_samples = X.shape[0]
    base_size, remainder = divmod(n_samples, k)
    segmentations = []
    start = 0
    for i in range(k):
        # Spread the remainder over the first folds instead of dropping it.
        end = start + base_size + (1 if i < remainder else 0)
        segmentations.append({
            "X_train": np.concatenate([X[:start, :], X[end:, :]]),
            "y_train": np.concatenate([y[:start], y[end:]]),
            "X_test": X[start:end],
            "y_test": y[start:end],
        })
        start = end
    return segmentations
        
# Set k for the k-fold cross-validation

# Number of folds used for the cross-validation
k = 5

# One dictionary per fold, as returned by kFoldCrossValidationSplit
segmentations = kFoldCrossValidationSplit(X, y, k)

# Sanity check: shape of the training labels of the first fold
segmentations[0]["y_train"].shape

In [None]:
def createSubTrainsets(segmentations: list):
    """Derive 10 %, 50 % and 100 % training subsets for every fold.

    The subsets are leading slices of the fold's training data, so the
    10 % subset is contained in the 50 % subset, which is contained in the
    full training set. The test data is passed through unchanged.

    Args:
        segmentations: List of fold dictionaries with the keys
            ``"X_train"``, ``"y_train"``, ``"X_test"`` and ``"y_test"``.

    Returns:
        A list with one dictionary per fold holding the keys
        ``X_train_10p``/``y_train_10p``, ``X_train_50p``/``y_train_50p``,
        ``X_train_100p``/``y_train_100p`` and ``X_test_total``/``y_test_total``.
    """
    def _subsets(fold):
        features, targets = fold["X_train"], fold["y_train"]
        n_rows = features.shape[0]
        # Row counts of the partial subsets (truncated towards zero).
        n_tenth = int(0.1*n_rows)
        n_half = int(0.5*n_rows)
        return {
            "X_train_10p": features[:n_tenth],
            "y_train_10p": targets[:n_tenth],
            "X_train_50p": features[:n_half],
            "y_train_50p": targets[:n_half],
            "X_train_100p": features,
            "y_train_100p": targets,
            "X_test_total": fold["X_test"],
            "y_test_total": fold["y_test"],
        }

    return [_subsets(fold) for fold in segmentations]

# Derive the 10% / 50% / 100% training subsets for every fold
splitSegmentations = createSubTrainsets(segmentations)

# Sanity check on the first fold: shape of the 10% feature subset and the
# mean of the full training labels
splitSegmentations[0]['X_train_10p'].shape, np.mean(splitSegmentations[0]['y_train_100p'])

# Define dmatrix for XGBoost

In [None]:
import xgboost as xgb

def convertToDMatrix(segmentation):
    """Wrap the subsets of one fold in ``xgb.DMatrix`` objects.

    Args:
        segmentation: Fold dictionary with the ``X_train_*``/``y_train_*``
            subsets (10p, 50p, 100p) and ``X_test_total``/``y_test_total``.

    Returns:
        Dictionary with the keys ``dmatrix_train_10p``,
        ``dmatrix_train_50p``, ``dmatrix_train_100p`` and ``dmatrix_test``.
    """
    # (output key, feature key, label key), in the order the matrices are built
    layout = (
        ("dmatrix_train_10p", "X_train_10p", "y_train_10p"),
        ("dmatrix_train_50p", "X_train_50p", "y_train_50p"),
        ("dmatrix_train_100p", "X_train_100p", "y_train_100p"),
        ("dmatrix_test", "X_test_total", "y_test_total"),
    )
    return {
        out_key: xgb.DMatrix(
            segmentation[features_key],
            label=segmentation[labels_key],
            enable_categorical=True,
        )
        for out_key, features_key, labels_key in layout
    }

# Build the DMatrix bundle of every fold
dmatrixSegmentations = [convertToDMatrix(fold) for fold in splitSegmentations]

# Display the result in the notebook
dmatrixSegmentations

In [None]:
def createAndTrainModel(dmatrix_train, test_dmatrices, params, num_boost_round=50):
    """Train a booster one round at a time and evaluate it after every round.

    The first round is trained through ``xgb.train``; every further round is
    added with ``Booster.update``. After each round the booster is evaluated
    on all matrices in ``test_dmatrices``. At least one round is always
    trained, even if ``num_boost_round`` is smaller than 1.

    Args:
        dmatrix_train: Training ``xgb.DMatrix``.
        test_dmatrices: Dictionary of evaluation ``xgb.DMatrix`` objects;
            only its values are used.
        params: XGBoost parameter dictionary.
        num_boost_round: Total number of boosting rounds.

    Returns:
        Tuple ``(bst, evals)`` where ``evals`` holds, per round, the list of
        evaluation strings returned by ``Booster.eval`` (one per test matrix).
    """
    def evaluate_round(iteration):
        # Every test matrix is reported under the same name "Test".
        return [bst.eval(test_matrix, "Test", iteration) for test_matrix in test_dmatrices.values()]

    bst = xgb.train(params, dmatrix_train, num_boost_round=1)
    evals = [evaluate_round(0)]

    for iteration in range(1, num_boost_round):
        bst.update(dmatrix_train, iteration)
        evals.append(evaluate_round(iteration))

    return bst, evals

In [None]:
# XGBoost training parameters, handed to xgb.train through createAndTrainModel.
# See the XGBoost parameter documentation for the exact meaning of each key.
params = {
    "objective": "binary:logistic",  # binary classification; predictions are thresholded at 0.5 below
    "eta": 0.1, 
    "max_depth": 8,
    "eval_metric":["error"],  # error rate; the plots below report 1 - error as accuracy
    "nthread": 16,  # NOTE(review): hard-coded thread count - confirm it suits the target machine
    "num_parallel_tree": 1,
    "subsample": 1,
    "tree_method": "hist"
}

def trainModels(dmatrices, params):
    """Train one model per training-subset size for a single fold.

    Args:
        dmatrices: Dictionary with the keys ``dmatrix_train_10p``,
            ``dmatrix_train_50p``, ``dmatrix_train_100p`` and ``dmatrix_test``.
        params: XGBoost parameter dictionary forwarded to
            ``createAndTrainModel``.

    Returns:
        Dictionary with the trained boosters (``bst_10p``, ``bst_50p``,
        ``bst_100p``) and their evaluation histories (``evals_10p``,
        ``evals_50p``, ``evals_100p``).
    """
    # All models are evaluated on the fold's single test matrix.
    test_dmatrices = {"Global": dmatrices['dmatrix_test']}

    # (training matrix key, booster key, history key), trained in this order
    runs = (
        ("dmatrix_train_10p", "bst_10p", "evals_10p"),
        ("dmatrix_train_50p", "bst_50p", "evals_50p"),
        ("dmatrix_train_100p", "bst_100p", "evals_100p"),
    )
    results = {}
    for train_key, bst_key, evals_key in runs:
        booster, history = createAndTrainModel(dmatrices[train_key], test_dmatrices, params)
        results[bst_key] = booster
        results[evals_key] = history
    return results

# Train the 10% / 50% / 100% models of every fold
segmentationsResults = [trainModels(fold_matrices, params) for fold_matrices in dmatrixSegmentations]

# Plot results from 10%-train

In [None]:
import matplotlib.pyplot as plt

def getValuesFromEvals(evals):
    """Extract the per-round metric value of the first test set.

    Each entry of ``evals`` is the list of evaluation strings of one boosting
    round (one string per test matrix), e.g. ``"[3]\\tTest-error:0.125000"``.
    For every string the value of the first reported metric is parsed; only
    the column belonging to the first test matrix is returned.

    Parsing takes the token that follows the first ``":"``, so strings that
    report several metrics (``"...error:0.1\\tTest-auc:0.9"``) are handled
    as well.

    Args:
        evals: List (one item per round) of lists of evaluation strings.

    Returns:
        1-D float array with one value per round; empty if ``evals`` is empty.
    """
    def _first_metric_value(eval_string):
        # Text after the first colon, cut at the next whitespace.
        return float(eval_string.split(":", 1)[1].split()[0])

    if len(evals) == 0:
        return np.empty(0, dtype=float)

    values = np.array(
        [[_first_metric_value(inner_eval) for inner_eval in inner_evals] for inner_evals in evals],
        dtype=float,
    )
    # Keep only the first test matrix
    return values[:, 0]

In [None]:
# Test-error curve of the 10% model of every fold, averaged over the folds
eval_values_10p = [getValuesFromEvals(result['evals_10p']) for result in segmentationsResults]
eval_values_10pMEAN = np.mean(eval_values_10p, axis=0)

# The evaluation metric is the error rate, so accuracy = 1 - error
accuracy = 1 - eval_values_10pMEAN
print(f"End of training 10% accuracy: {accuracy[-1]}")

plt.plot(accuracy, label='10% Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # fixed y-axis limits
plt.grid(True)

plt.legend()

### Confusion matrix

In [None]:
# Make a confusion matrix for the test set
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, title='Confusion Matrix', labels=None):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Title shown above the heatmap.
        labels: Optional label list; forwarded to ``confusion_matrix`` and
            used as row/column names of the plotted table.
    """
    counts = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=labels),
        index=labels,
        columns=labels,
    )
    plt.figure(figsize=(8, 6))
    # Rows are the true classes, columns the predicted ones.
    ax = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Known to be true')
    plt.show()

In [None]:
# Confusion matrix of the 10% models, pooled over the held-out part of every fold.
# Labels, boosters and test matrices are stored per fold in splitSegmentations,
# segmentationsResults and dmatrixSegmentations, so they are gathered from there.
y_true_10p = np.concatenate(
    [fold["y_test_total"] for fold in splitSegmentations]
).astype(int)
y_pred_10p = np.concatenate([
    # Threshold the predicted probability at 0.5 to obtain the class
    result["bst_10p"].predict(matrices["dmatrix_test"]) >= 0.5
    for result, matrices in zip(segmentationsResults, dmatrixSegmentations)
]).astype(int)
plot_confusion_matrix(y_true_10p, y_pred_10p, title='Confusion Matrix for Test Set', labels=[0, 1])

# Plot results from 50%-train

In [None]:
# Test-error curve of the 50% model of every fold, averaged over the folds
eval_values_50p = [getValuesFromEvals(result['evals_50p']) for result in segmentationsResults]
eval_values_50pMEAN = np.mean(eval_values_50p, axis=0)

# The evaluation metric is the error rate, so accuracy = 1 - error
accuracy = 1 - eval_values_50pMEAN
print(f"End of training 50% accuracy: {accuracy[-1]}")

plt.plot(accuracy, label='50% Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # fixed y-axis limits
plt.grid(True)

plt.legend()

### Confusion matrix

In [None]:
# Confusion matrix of the 50% models, pooled over the held-out part of every fold.
# Labels, boosters and test matrices are stored per fold in splitSegmentations,
# segmentationsResults and dmatrixSegmentations, so they are gathered from there.
y_true_50p = np.concatenate(
    [fold["y_test_total"] for fold in splitSegmentations]
).astype(int)
y_pred_50p = np.concatenate([
    # Threshold the predicted probability at 0.5 to obtain the class
    result["bst_50p"].predict(matrices["dmatrix_test"]) >= 0.5
    for result, matrices in zip(segmentationsResults, dmatrixSegmentations)
]).astype(int)
plot_confusion_matrix(y_true_50p, y_pred_50p, title='Confusion Matrix for Test Set', labels=[0, 1])

# Plot results from 100%-train

In [None]:
# Test-error curve of the 100% model of every fold, averaged over the folds
eval_values_100p = [getValuesFromEvals(result['evals_100p']) for result in segmentationsResults]
eval_values_100pMEAN = np.mean(eval_values_100p, axis=0)

# The evaluation metric is the error rate, so accuracy = 1 - error
accuracy = 1 - eval_values_100pMEAN
print(f"End of training 100% accuracy: {accuracy[-1]}")

plt.plot(accuracy, label='100% Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # fixed y-axis limits
plt.grid(True)

plt.legend()

### Confusion matrix

In [None]:
# Confusion matrix of the 100% models, pooled over the held-out part of every fold.
# Labels, boosters and test matrices are stored per fold in splitSegmentations,
# segmentationsResults and dmatrixSegmentations, so they are gathered from there.
y_true_100p = np.concatenate(
    [fold["y_test_total"] for fold in splitSegmentations]
).astype(int)
y_pred_100p = np.concatenate([
    # Threshold the predicted probability at 0.5 to obtain the class
    result["bst_100p"].predict(matrices["dmatrix_test"]) >= 0.5
    for result, matrices in zip(segmentationsResults, dmatrixSegmentations)
]).astype(int)
plot_confusion_matrix(y_true_100p, y_pred_100p, title='Confusion Matrix for Test Set', labels=[0, 1])

# Plot of all accuracies

In [None]:
# Fold-averaged test accuracy (1 - error rate) for each training-subset size
accuracy_100p = 1 - eval_values_100pMEAN
accuracy_50p = 1 - eval_values_50pMEAN
accuracy_10p = 1 - eval_values_10pMEAN

# Accuracy after the last boosting round
print(f"End of training 100% accuracy: {accuracy_100p[-1]}")
print(f"End of training 50% accuracy: {accuracy_50p[-1]}")
print(f"End of training 10% accuracy: {accuracy_10p[-1]}")

# All three curves share one set of axes
plt.plot(accuracy_100p, label='100% Test Accuracy')
plt.plot(accuracy_50p, label='50% Test Accuracy')
plt.plot(accuracy_10p, label='10% Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # fixed y-axis limits
plt.grid(True)

plt.legend()