# Set up

In [None]:
import warnings


def warn(*args, **kwargs):
    """Accept any warning arguments and do nothing with them."""
    return None


# Route every later ``warnings.warn`` call to the no-op above.
warnings.warn = warn

In [None]:
import logging
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import SimpleITK as sitk
from tqdm import tqdm

from radiomics import imageoperations

# Global matplotlib font settings applied to every figure drawn below.
plt.rcParams["font.family"] = "DeJavu Serif"
plt.rcParams["font.serif"] = ["Times New Roman"]

# Shared five-colour palette, passed to plotly (px.bar) and yellowbrick (set_palette) later on.
colors = ["#663171", "#ea7428", "#0c7156", "#cf3a36", "#e2998a"]

In [None]:
# Absolute path of the parent of the current working directory; every data path below is built from it.
root_dir = pathlib.Path("..").resolve()

In [None]:
def build_dataframe_from_csv(
    rater: str = "900", from_image: str = "t2w"
) -> pd.DataFrame:
    """
    Join a rater's disc grades with the image listing and the radiomic features.

    :param rater: Suffix of the ``midasdisclabels{rater}.csv`` grade file. Default is "900".
    :type rater: str
    :param from_image: Infix of the ``filtered_midas900_{from_image}_radiomics.csv`` feature file. Default is "t2w".
    :type from_image: str
    :return: Rows that matched in both merges, with the grade columns renamed to "1".."5".
    :rtype: pd.DataFrame
    """
    data_dir = root_dir.joinpath("data")

    # Rater grades: discard incomplete rows and switch to the XNAT key names.
    grades = pd.read_csv(data_dir.joinpath(f"midasdisclabels{rater}.csv"), sep=",")
    grades.dropna(inplace=True)
    grades.rename(
        columns={"subject_ID": "Subject_XNAT", "ID": "Session_XNAT"}, inplace=True
    )

    # Image listing: subject and session are read from fixed positions of the
    # "/"-separated image path and converted to the XNAT naming of the grades.
    images = pd.read_csv(data_dir.joinpath("filtered_midas900_t2w.csv"), sep=",")
    path_parts = images["Image"].map(lambda path: path.split("/"))
    images["Subject_MIDS"] = path_parts.map(lambda parts: parts[8])
    images["Session_MIDS"] = path_parts.map(lambda parts: parts[9])
    images["Subject_XNAT"] = images["Subject_MIDS"].map(
        lambda subject: f"ceibcs_S{int(subject.split('sub-S')[1])}"
    )
    images["Session_XNAT"] = images["Session_MIDS"].map(
        lambda session: f"ceibcs_E{int(session.split('ses-E')[1])}"
    )

    graded_images = grades.merge(images, on=["Subject_XNAT", "Session_XNAT"])

    # Disc levels become the numeric column names "1" (L5-S) .. "5" (L1-L2).
    disc_levels = ("L5-S", "L4-L5", "L3-L4", "L2-L3", "L1-L2")
    graded_images.rename(
        columns={level: str(number) for number, level in enumerate(disc_levels, start=1)},
        inplace=True,
    )

    radiomics = pd.read_csv(
        data_dir.joinpath(f"filtered_midas900_{from_image}_radiomics.csv"),
        sep=",",
    ).rename(columns={"Unnamed: 0": "ID"})

    return graded_images.merge(radiomics, on="ID")


def get_labels_and_features(
    rater: str = "900", label: int = 1, from_image: str = "t2w"
) -> tuple:
    """
    Build the merged table for one rater and return the grades and numeric features of a single disc.

    Rows with any missing value or with a grade of 0 are discarded. The index of both
    results is the row "ID" with the disc number appended.

    :param rater: The rater identifier. Default is "900".
    :type rater: str
    :param label: A number from 1 to 5 indicating the disc of interest. Default is 1.
    :type label: int
    :param from_image: Which radiomics file to read (``filtered_midas900_{from_image}_radiomics.csv``). Default is "t2w".
    :type from_image: str
    :return: A tuple ``(labels, features)``: the grade Series and the numeric feature DataFrame.
    :rtype: tuple
    """

    data = build_dataframe_from_csv(rater=rater, from_image=from_image)

    # Temporarily give the grade and ID columns the "label<N>" prefix so that one
    # substring mask selects them together with this disc's radiomic columns.
    prefix = f"label{label}"
    data = data.rename(columns={str(label): prefix, "ID": f"{prefix}ID"})
    columns_mask = data.columns.str.contains(prefix) & ~data.columns.str.contains(
        "Configuration"
    )
    data = data.loc[:, columns_mask]
    data = data.rename(columns={prefix: "label", f"{prefix}ID": "ID"})

    label_data = data.dropna(axis=0, how="any")
    # Explicit copy: the ID column is rewritten below, which must not happen on a
    # filtered slice of ``data`` (chained assignment).
    label_data = label_data.loc[label_data["label"] != 0].copy()
    label_data["ID"] = label_data["ID"].map(lambda x: x + str(label))
    label_data = label_data.set_index("ID")

    labels = label_data["label"]
    features = label_data.select_dtypes(include="number").drop(columns="label")
    return labels, features


def get_labels_and_features_all_discs(
    rater: str = "900", verbose: bool = False, from_image: str = "t1w_t2w"
) -> tuple:
    """
    Stack the labels and features of discs 1 to 5 into one data set.

    The per-disc ``label<N>_`` prefix is stripped from the feature names so that the
    five feature tables share their columns before being concatenated row-wise.

    :param rater: The rater identifier. Default is "900".
    :type rater: str
    :param verbose: Whether to print the resulting shapes and plot the label distribution. Default is False.
    :type verbose: bool
    :param from_image: Radiomics source forwarded to ``get_labels_and_features``. Default is "t1w_t2w".
    :type from_image: str
    :return: A tuple containing the labels and features.
    :rtype: tuple
    """
    per_disc_labels = []
    per_disc_features = []
    for disc in range(1, 6):
        disc_labels, disc_features = get_labels_and_features(
            rater=rater, label=disc, from_image=from_image
        )
        prefix = f"label{disc}_"
        per_disc_labels.append(disc_labels)
        per_disc_features.append(
            disc_features.rename(columns=lambda name: name.replace(prefix, ""))
        )

    features = pd.concat(per_disc_features, axis=0)
    labels = pd.concat(per_disc_labels, axis=0)

    if not verbose:
        return labels, features

    print(f"Labels shape: {labels.shape}, Features shape: {features.shape}")
    labels.plot(kind="hist", xticks=[1, 2, 3, 4, 5], title="Label distribution")
    plt.show()
    return labels, features

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import VarianceThreshold


class VarianceFeatureReduction(BaseEstimator, TransformerMixin):
    """
    VarianceFeatureReduction is a transformer that reduces the feature space by removing features with low variance.

    It wraps ``sklearn.feature_selection.VarianceThreshold`` but, for DataFrame input,
    returns a DataFrame so that column names survive the step.

    Parameters:
    -----------
    threshold : float, optional (default=0.05)
        The threshold below which features will be removed.
    """

    def __init__(self, threshold=0.05):
        self.threshold = threshold
        # Fitted VarianceThreshold; stays None until fit() is called.
        self.selector = None

    def fit(self, X, y=None):
        """
        Fit the VarianceFeatureReduction transformer to the input data.

        Parameters:
        -----------
        X : array-like, shape (n_samples, n_features)
            The input data.
        y : array-like, shape (n_samples,), optional (default=None)
            Ignored; present for pipeline compatibility.

        Returns:
        --------
        self : object
            Returns self.
        """
        self.selector = VarianceThreshold(threshold=self.threshold)
        self.selector.fit(X)
        return self

    def transform(self, X, y=None):
        """
        Transform the input data by removing features with low variance.

        Parameters:
        -----------
        X : pandas DataFrame or array-like, shape (n_samples, n_features)
            The input data, with the same columns (and order) as seen in fit().
        y : array-like, shape (n_samples,), optional (default=None)
            Ignored; present for pipeline compatibility.

        Returns:
        --------
        X_ : same container kind as X, shape (n_samples, n_selected_features)
            A new object holding only the retained features; X is left untouched.

        Raises:
        -------
        NotFittedError
            If transform() is called before fit().
        """
        if self.selector is None:
            # NotFittedError derives from both ValueError and AttributeError, so
            # callers that caught the previous AttributeError keep working.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "VarianceFeatureReduction must be fitted before calling transform()."
            )

        support = self.selector.get_support()
        if hasattr(X, "loc"):
            # Boolean-mask selection already returns a new DataFrame.
            return X.loc[:, support]
        # Plain arrays / nested lists: select the columns positionally.
        return np.asarray(X)[:, support]


class CorrelationFeatureReduction(BaseEstimator, TransformerMixin):
    """
    A transformer class for reducing features based on correlation.

    Features are scanned in column order; every feature that is still kept removes all
    later features whose absolute Spearman correlation with it reaches the threshold.

    Parameters:
    -----------
    threshold : float, optional (default=0.8)
        The threshold at or above which the later feature of a correlated pair is removed.
    """

    def __init__(self, threshold=0.8):
        self.threshold = threshold
        # Both attributes are populated by fit().
        self.corr_matrix_var = None  # absolute Spearman correlation matrix
        self.to_keep = None  # boolean mask over the columns seen in fit()

    def fit(self, X, y=None):
        """
        Fit the transformer to the input data.

        Parameters:
        -----------
        X : pandas DataFrame
            The input data.
        y : array-like, optional (default=None)
            Ignored; present for pipeline compatibility.

        Returns:
        --------
        self : CorrelationFeatureReduction
            The fitted transformer object.

        """
        self.corr_matrix_var = X.corr(
            method="spearman"
        ).abs()  # absolute correlation matrix

        corr_values = self.corr_matrix_var.to_numpy()
        n_features = corr_values.shape[1]

        # Initialize the flag vector with True values
        to_keep = np.full(n_features, True, dtype=bool)

        for i in range(n_features):
            if not to_keep[i]:
                # An already-removed feature must not eliminate others.
                continue
            # One vectorised comparison per row replaces the scalar inner loop.
            # NaN correlations compare as False and therefore never remove a feature.
            to_keep[i + 1 :] &= ~(corr_values[i, i + 1 :] >= self.threshold)

        self.to_keep = to_keep
        return self

    def transform(self, X, y=None):
        """
        Transform the input data by removing highly correlated features.

        Parameters:
        -----------
        X : pandas DataFrame
            The input data, with the same column order as seen in fit().
        y : array-like, optional (default=None)
            Ignored; present for pipeline compatibility.

        Returns:
        --------
        X_ : pandas DataFrame
            A new DataFrame with highly correlated features removed.

        """
        # Boolean-mask selection already yields a new DataFrame, so no prior copy is needed.
        return X.iloc[:, self.to_keep]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def test_multiple_models(features, labels):
    """
    Fit ten default-configured classifiers and report the best weighted F1 on a hold-out split.

    Each classifier is placed behind the variance filter, correlation filter and
    standard scaler, fitted on 75% of the data (stratified, ``random_state=0``) and
    scored on the remaining 25%. Only the best name and its score are printed.
    """
    candidates = {
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "SVM": SVC(),
        "Logistic Regression": LogisticRegression(),
        "Stochastic Gradient Descent": SGDClassifier(),
        "Naive Bayes": GaussianNB(),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Multilayer Perceptron": MLPClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "ExtraTrees": ExtraTreesClassifier(),
    }

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=0, stratify=labels
    )

    def _holdout_f1(estimator):
        # Train one pipeline around ``estimator`` and score it on the test part.
        model = Pipeline(
            [
                ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
                ("correlationreduction", CorrelationFeatureReduction()),
                ("scaler", StandardScaler()),
                ("classifier", estimator),
            ]
        )
        model.fit(X_train, y_train)
        return f1_score(y_test, model.predict(X_test), average="weighted")

    f1_scores = {name: _holdout_f1(estimator) for name, estimator in candidates.items()}

    # First classifier reaching the highest score wins ties.
    best_classifier, best_score = max(f1_scores.items(), key=lambda item: item[1])
    print("Best classifier:", best_classifier)
    print("F1 score:", best_score)

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


def cv(clf, features, labels):
    """Print mean and standard deviation of the weighted F1 of ``clf`` over a stratified 5-fold CV."""
    steps = [
        ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
        ("correlationreduction", CorrelationFeatureReduction()),
        ("scaler", StandardScaler()),
        ("classifier", clf),
    ]
    # Feature reduction and scaling sit inside the pipeline, so they are refitted on every fold.
    fold_scores = cross_val_score(
        Pipeline(steps),
        features,
        labels,
        cv=StratifiedKFold(n_splits=5),
        scoring="f1_weighted",
    )
    mean_f1 = fold_scores.mean()
    std_f1 = fold_scores.std()
    print(f"Cross Validation F1 Score: {mean_f1:0.4f} +/- {std_f1:0.2f}")

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.under_sampling import RandomUnderSampler


def imbalanced_learning_suite(features, labels):
    """Cross-validate four imbalanced-learn ensemble classifiers and print their weighted F1."""
    ensembles = [
        (
            "Balanced Bagging Classifier",
            BalancedBaggingClassifier(sampler=RandomUnderSampler()),
        ),
        ("Balanced RandomForest Classifier", BalancedRandomForestClassifier()),
        ("RUS Boost Classifier", RUSBoostClassifier()),
        ("Easy Ensemble Classifier", EasyEnsembleClassifier()),
    ]

    # One stratified 5-fold splitter shared by every candidate.
    folds = StratifiedKFold(n_splits=5)

    for name, estimator in ensembles:
        model = Pipeline(
            [
                ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
                ("correlationreduction", CorrelationFeatureReduction()),
                ("scaler", StandardScaler()),
                ("classifier", estimator),
            ]
        )
        scores = cross_val_score(
            model, features, labels, cv=folds, scoring="f1_weighted"
        )
        mean_f1, std_f1 = scores.mean(), scores.std()
        print(f"{name}: {mean_f1:0.2f} f1 with a standard deviation of {std_f1:0.2f}")

In [None]:
import yellowbrick.classifier as viz

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
)
from yellowbrick.style import set_palette

# Make yellowbrick visualizers draw with the shared ``colors`` palette.
set_palette(colors)


def visual_metrics(clf, features, labels, classes=None):
    """
    Fit ``clf`` inside the feature-reduction pipeline and plot/print its hold-out metrics.

    The labels are shifted so that the smallest one becomes 0, the data is split 75/25
    (stratified, ``random_state=0``) and three panels are drawn: the histogram of the
    original labels, a yellowbrick classification report and a class prediction error
    chart. Accuracy within one grade, balanced accuracy and the scikit-learn
    classification report are printed afterwards.

    :param clf: Unfitted scikit-learn compatible classifier.
    :param features: Feature DataFrame.
    :param labels: Grade Series aligned with ``features``.
    :param classes: Display names of the classes in ascending label order.
        Defaults to ``["1", "2", "3", "4", "5"]``.
    :type classes: list | None
    """
    # Build the default per call instead of sharing one mutable default list.
    classes = ["1", "2", "3", "4", "5"] if classes is None else list(classes)

    labels_ = labels.copy()
    if min(labels_) != 0:
        labels_ = labels_ - min(labels_)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels_, test_size=0.25, random_state=0, stratify=labels_
    )

    _, axes = plt.subplots(1, 3, figsize=(15, 5))
    # The histogram shows the original (unshifted) grades.
    labels.plot(
        kind="hist",
        title="Pfirrmann grade distribution",
        ax=axes[0],
        xticks=[1, 2, 3, 4, 5],
        align="mid",
    )

    pipeline_clf = Pipeline(
        [
            ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
            ("correlationreduction", CorrelationFeatureReduction()),
            ("scaler", StandardScaler()),
            ("classifier", clf),
        ]
    )
    pipeline_clf.fit(X_train, y_train)
    axes[1].set_title("Classification Report")
    axes[1].set_ylabel("Class")
    # NOTE(review): the class names are passed reversed here only; confirm this
    # matches the row order yellowbrick uses for the report.
    visualizer_class = viz.ClassificationReport(
        pipeline_clf, classes=classes[::-1], support=True, ax=axes[1], cmap="Blues"
    )
    visualizer_class.score(X_test, y_test)

    axes[2].set_title("Classification Prediction Error")
    axes[2].set_xlabel("Class")
    axes[2].set_ylabel("Number of Predictions")
    visualizer_pred = viz.ClassPredictionError(
        pipeline_clf, classes=classes, ax=axes[2]
    )
    visualizer_pred.score(X_test, y_test)

    plt.tight_layout()
    plt.show()

    predictions = pipeline_clf.predict(X_test)
    print(f"Accuracy within one grade: {accuracy_within_one(y_test, predictions):0.2f}")
    print(f"Balanced accuracy: {balanced_accuracy_score(y_test, predictions):0.2f}")
    print(classification_report(y_test, predictions, target_names=classes))

In [None]:
from sklearn.model_selection import RandomizedSearchCV


def random_search(clf, distribution, features, labels):
    """
    Run a 10-iteration randomized hyper-parameter search for ``clf`` inside the pipeline.

    The search uses stratified 5-fold CV, weighted F1 scoring and ``random_state=0``.
    The best score and parameters are printed.

    :return: The best parameters addressed to the "classifier" step, with the
        ``classifier__`` prefix removed from their names.
    :rtype: dict
    """
    prefix = "classifier__"

    search = RandomizedSearchCV(
        Pipeline(
            [
                ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
                ("correlationreduction", CorrelationFeatureReduction()),
                ("scaler", StandardScaler()),
                ("classifier", clf),
            ]
        ),
        distribution,
        cv=StratifiedKFold(n_splits=5),
        scoring="f1_weighted",
        n_iter=10,
        random_state=0,
    ).fit(features, labels)

    print(f"Best parameter (CV score={search.best_score_:0.3f}): {search.best_params_}")

    classifier_params = {}
    for key, value in search.best_params_.items():
        if key.startswith(prefix):
            classifier_params[key.replace(prefix, "")] = value
    return classifier_params

In [None]:
def checkMaskVol(image, mask, label):
    """
    Return ``label`` when the mask passes the pyradiomics ROI check, otherwise ``None``.

    The check requires a 3-dimensional ROI of at least 1000 voxels for ``label``.
    Any exception raised by the check is treated as "not valid".
    """
    try:
        imageoperations.checkMask(
            image, mask, minimumROIDimensions=3, minimumROISize=1000, label=label
        )
    except Exception:
        return None
    return label

In [None]:
def accuracy_within_one(labels, predictions):
    """
    Return the fraction of predictions that differ from the true label by at most one.

    Both inputs are compared element by element in positional order, so pandas
    objects are never re-aligned on their index.

    :param labels: True grades (list, NumPy array or pandas Series of numbers).
    :param predictions: Predicted grades, same length as ``labels``.
    :return: Share of samples with ``|label - prediction| <= 1``.
    :rtype: float
    :raises ValueError: If the inputs are empty or differ in shape.
    """
    # Floats avoid wrap-around when the inputs are unsigned integers.
    true_values = np.asarray(labels, dtype=float)
    predicted_values = np.asarray(predictions, dtype=float)

    if true_values.shape != predicted_values.shape:
        raise ValueError(
            f"labels and predictions must have the same shape, "
            f"got {true_values.shape} and {predicted_values.shape}"
        )
    if true_values.size == 0:
        raise ValueError("accuracy_within_one requires at least one sample")

    return np.mean(np.abs(true_values - predicted_values) <= 1)

# EDA

## Data Sources

In [None]:
# Image listing of the t2w data set; its "Image" column holds the file path used
# below to locate each scan's metadata file.
midas_img_relation = pd.read_csv(
    root_dir.joinpath("data", "filtered_midas900_t2w.csv"), sep=","
)

In [None]:
import json

# Collect scanner information for every listed image from its JSON metadata file.
devices = []
for _, row in midas_img_relation.iterrows():
    img_path = pathlib.Path(row["Image"])
    # Drop the last suffix, then swap the remaining one for ".json"
    # (e.g. "scan.nii.gz" -> "scan.json").
    metadata_path = img_path.with_suffix("").with_suffix(".json")
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
        # Entries are keyed by DICOM tag; a missing tag falls back to "N/A".
        # NOTE(review): a tag present with an empty "Value" list would raise IndexError.
        manufacturer = metadata.get("00080070", {}).get("Value", ["N/A"])[0]
        model = metadata.get("00081090", {}).get("Value", ["N/A"])[0]
        field_strength = metadata.get("00180087", {}).get("Value", ["N/A"])[0]
    devices.append(
        {
            "Manufacturer": manufacturer,
            "Model name": model,
            "Field Strength": field_strength,
        }
    )

In [None]:
# One row per image: manufacturer, model name and field strength.
df = pd.DataFrame(devices)

In [None]:
# Count every (model name, field strength) combination within each manufacturer.
df.groupby("Manufacturer").value_counts()

## Labels distribution

In [None]:
# Rebinds ``df`` (previously the scanner table) to the merged grade/feature table of rater "JDCarlos".
df = build_dataframe_from_csv(rater="JDCarlos", from_image="t2w")

In [None]:
# Grade frequencies per disc: one value_counts Series per disc, named after its spinal level.
a = []
# Maps the numeric grade columns of ``df`` back to the disc level names.
discs = {
    "1": "L5-S",
    "2": "L4-L5",
    "3": "L3-L4",
    "4": "L2-L3",
    "5": "L1-L2",
}
for i in range(1, 6):
    s = df[f"{i}"].value_counts()
    s.name = discs[f"{i}"]
    a.append(s)

In [None]:
# Rows: grade values; columns: disc levels (each Series in ``a`` becomes one column after the transpose).
df1 = pd.DataFrame(a).T

fig = px.bar(
    df1, title="Pfirrmann Grade Distribution", color_discrete_sequence=colors
)  # one bar series per disc level
# Total number of discs per grade, written as text above each bar group.
total_count = df1.sum(axis=1)
fig.add_trace(
    go.Scatter(
        x=df1.index,
        y=total_count,
        mode="text",
        text=total_count,
        textposition="top center",
        showlegend=False,
    )
)
fig.update_traces(textfont_size=12)
fig.update_xaxes(title_text="Pfirrmann Grade")
fig.update_yaxes(
    title_text="Frequency", showgrid=True, gridcolor="rgba(184, 184, 184, 0.3)"
)
# Transparent plot and paper backgrounds.
fig.update_layout(
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
    legend_title_text="Intervertebral Disc",
    grid_rows=1,
)
fig.show()
# pio.write_image(fig, root_dir.joinpath("figures", "pfirrmann_grade_distribution.pdf"))

## Histogram

In [None]:
import monai.transforms as transforms
import torch


class CheckMaskVol(transforms.MapTransform):
    """
    Dictionary transform that records which mask labels pass the pyradiomics ROI check.

    The image and mask are read from the file paths stored under the first and second
    key. At most the first five accepted non-zero labels (in ascending order) are
    stored under ``"valid_labels"``; all other entries of the dictionary are untouched.
    """

    def __init__(
        self,
        keys=["image", "mask"],
        minimum_roi_dimensions: int = 3,
        minimum_roi_size: int = 1000,
    ):
        super().__init__(keys)
        self.minimum_roi_dimensions = minimum_roi_dimensions
        self.minimum_roi_size = minimum_roi_size

    def __call__(self, x):
        image = sitk.ReadImage(x[self.keys[0]])
        mask = sitk.ReadImage(x[self.keys[1]])

        accepted = []
        for label in np.unique(sitk.GetArrayFromImage(mask).ravel()):
            if label == 0:
                # Label 0 is never checked.
                continue
            try:
                imageoperations.checkMask(
                    image,
                    mask,
                    minimumROIDimensions=self.minimum_roi_dimensions,
                    minimumROISize=self.minimum_roi_size,
                    label=label,
                )
            except Exception:
                # Any failure of the check simply excludes this label.
                continue
            accepted.append(label)

        x["valid_labels"] = accepted[:5]
        return x


class CropForegroundd(transforms.MapTransform):
    """
    Dictionary transform that turns one loaded image/mask pair into one crop per valid disc.

    The image is first restricted to the voxels of the valid labels, then cropped to the
    bounding box of each label and reduced to the central slice along the last spatial axis.

    Expects ``x`` to contain the loaded image under the first key, the loaded mask under
    ``source_key``, the list ``"valid_labels"`` and the grades under the keys ``"1"``, ``"2"``, ...

    Returns a list of ``{"image": crop, "label": grade}`` dictionaries; the list is empty
    when there are no valid labels.
    """

    def __init__(
        self, keys=["image"], source_key="mask", margin=0, k_divisible=(64, 64, 1)
    ):
        super().__init__(keys)
        self.k_divisible = k_divisible
        self.margin = margin
        self.source_key = source_key

    def __call__(self, x):
        valid_labels = x["valid_labels"]
        if len(valid_labels) == 0:
            # Nothing to crop; previously this raised IndexError.
            return []

        mask = x[self.source_key]

        # True where the mask holds any valid label. A real boolean mask keeps the
        # image intensities unchanged (the label value itself must not scale them).
        keep = mask == valid_labels[0]
        for disc in valid_labels[1:]:
            keep = keep | (mask == disc)

        # The inner transforms work on this private dictionary, so they always use
        # its fixed "image"/"mask" names regardless of the keys configured on self.
        input_data = {"image": x[self.keys[0]] * keep, "mask": mask}

        # Loop-invariant: keeps the full in-plane extent and the central slice only.
        center_slice = transforms.CenterSpatialCropd(
            keys=["image"], roi_size=(-1, -1, 1)
        )

        samples = []
        # NOTE(review): the grade is looked up by the 1-based position of the label in
        # valid_labels, not by the label value; if an earlier label failed the ROI
        # check the grades shift by one. Confirm this is intended.
        for position, disc in enumerate(valid_labels, start=1):
            crop = transforms.CropForegroundd(
                keys=["image"],
                source_key="mask",
                # Bind ``disc`` now so the selector never sees a later loop value.
                select_fn=lambda voxels, disc=disc: voxels == disc,
                margin=self.margin,
                k_divisible=self.k_divisible,
            )(input_data)
            samples.append(
                {"image": center_slice(crop)["image"], "label": x[str(position)]}
            )
        return samples

In [None]:
# Per-study preprocessing chain. Order matters: CheckMaskVol reads the files from the
# paths stored in the dictionary, so it has to run before LoadImaged loads the data
# stored under those keys.
transforms_ = transforms.Compose(
    [
        CheckMaskVol(
            keys=["image", "mask"], minimum_roi_dimensions=3, minimum_roi_size=1000
        ),
        transforms.LoadImaged(
            keys=["image", "mask"], image_only=True, ensure_channel_first=True
        ),
        # k_divisible=(1, 1, 1) overrides the class default of (64, 64, 1).
        CropForegroundd(
            keys=["image"], source_key="mask", margin=0, k_divisible=(1, 1, 1)
        ),
    ]
)

In [None]:
# Run the preprocessing chain on every study and flatten its per-disc output.
# "Discs" is the 1-based position of a crop in the transform output, "Pfirmann" its
# grade and "Array" the cropped image as a NumPy array.
results = {"Discs": [], "Pfirmann": [], "Array": []}
for _, row in tqdm(df.iterrows()):
    # File paths plus the five grades, keyed as the transforms expect them.
    data = {
        "image": row["Image"],
        "mask": row["Mask"],
        "1": row["1"],
        "2": row["2"],
        "3": row["3"],
        "4": row["4"],
        "5": row["5"],
    }
    result_row = transforms_(data)
    for idx, result in enumerate(result_row, start=1):
        results["Discs"].append(idx)
        results["Pfirmann"].append(result["label"])
        results["Array"].append(result["image"].numpy())

In [None]:
# One row per cropped disc.
histograms_df = pd.DataFrame(results)
histograms_df.head()

In [None]:
process = histograms_df.copy()
# Min-max scale every crop to [0, 1] so the histograms share their bin range.
# NOTE(review): a crop with constant intensity (max == min) divides by zero here.
process["Normalized Array"] = process["Array"].map(
    lambda x: (x - x.min()) / (x.max() - x.min())
)
# 10-bin voxel counts per crop.
process["Histogram"] = process["Normalized Array"].map(
    lambda x: np.histogram(x, bins=10)[0]
)
# Despite its name, this groups by Pfirrmann grade (column "Pfirmann"), not by disc.
grouped_by_disc = process.groupby("Pfirmann")

In [None]:
# Eleven bin edges from 0 to 1 in steps of 0.1.
x_ = np.arange(0, 1.01, 1 / 10)
plt.figure(figsize=(10, 6))
# Overlay the mean histogram of each group, iterating from the last group to the first.
# NOTE(review): the legend label i + 1 assumes exactly five groups, ordered as grades 1..5.
for i in range(4, -1, -1):
    plt.bar(
        x=x_[:-1],
        height=grouped_by_disc["Histogram"].mean().iloc[i],
        width=np.diff(x_),
        label=str(i + 1),
    )
plt.legend()
# plt.ylim(0, 100)
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Plotting
# Plotting: the same mean histograms, one per group, staggered along the y axis of a 3D plot.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection="3d")

# X, Y positions for the bars
x_positions = x_[:-1]
y_positions = [0, 1, 2, 3, 4]  # Different datasets on different y-positions

# Width of each bar and yz-space between bars
width = np.diff(x_)
y_space = 0.8  # not used by the plotting calls below

# Adding the histograms to the plot
# NOTE(review): y_positions has five entries, so more than five groups would raise IndexError.
for i, hist in enumerate(grouped_by_disc["Histogram"].mean()):
    ax.bar(
        x_positions,
        hist,
        zs=y_positions[i],
        zdir="y",
        width=width,
        align="center",
        alpha=0.7,
    )

ax.set_xlabel("X-axis (Value)")
# ax.set_ylabel('Y-axis (Dataset)')
ax.set_zlabel("Z-axis (Frequency)")

# Setting the y-ticks to correspond to different datasets
ax.set_yticks(y_positions)
ax.set_yticklabels(
    ["Pfirrmann 1", "Pfirrmann 2", "Pfirrmann 3", "Pfirrmann 4", "Pfirrmann 5"]
)

plt.title("3D Histograms")
plt.show()

## Feature Selection

### Intraclass correlation coefficient

In [None]:
def ICC_feature_reduction(
    input_dataframe,
    input_dataframe_ICC1,
    input_dataframe_ICC2,
    input_dataframe_ICC3,
    ICC_thresh,
):
    """
    Keep only the features whose ICC2 across four repeated extractions exceeds ``ICC_thresh``.

    For every column of ``input_dataframe`` the values from the four frames are stacked,
    the frame number acts as rater ("Repetition") and the row index acts as target
    ("Patients"). The second row of pingouin's result table (ICC2) is compared against
    the threshold; a NaN ICC drops the feature.

    :param input_dataframe: Reference feature table; its columns define the features tested.
    :param input_dataframe_ICC1: First repeated extraction; must contain the same columns.
    :param input_dataframe_ICC2: Second repeated extraction.
    :param input_dataframe_ICC3: Third repeated extraction.
    :param ICC_thresh: A feature is kept only when its ICC2 is strictly greater than this value.
    :return: ``input_dataframe`` restricted to the retained columns.
    """
    import pandas as pd
    import pingouin as pg
    import numpy as np

    frames = [
        input_dataframe,
        input_dataframe_ICC1,
        input_dataframe_ICC2,
        input_dataframe_ICC3,
    ]

    # Rater and target codes only depend on the row layout, which is identical for
    # every feature, so they are built once instead of once per feature. Each frame
    # contributes its own length rather than assuming the length of the first one.
    repetition = np.repeat([1, 2, 3, 4], [len(frame) for frame in frames])
    stacked_index = frames[0].index.append([frame.index for frame in frames[1:]])
    patients = pd.factorize(stacked_index)[0]

    # One keep/drop flag per feature, in column order.
    to_keep = []
    for feature in input_dataframe.columns:
        # Concatenate feature values vertically
        feature_data = pd.concat(
            [frame[[feature]] for frame in frames],
            axis=0,
            ignore_index=False,
        )
        feature_data["Repetition"] = repetition
        feature_data["Patients"] = patients
        feature_data = feature_data.rename(columns={feature: "FeatureValue"})

        # Compute ICC
        icc_result = pg.intraclass_corr(
            data=feature_data,
            targets="Patients",
            raters="Repetition",
            ratings="FeatureValue",
        )
        # ICC2: A random sample of raters rate each target. Measure of absolute agreement.
        icc_value = icc_result["ICC"].iloc[1]

        to_keep.append(bool(icc_value > ICC_thresh))

    return input_dataframe.loc[:, to_keep]

In [None]:
def compute_ICC_reduced_per_disc(disc: int):
    """
    Return the t2w radiomic features of one disc that are stable under mask perturbation.

    The original feature table and its shifted, eroded and dilated counterparts are
    loaded, restricted to the numeric ``label{disc}`` columns and passed to
    ``ICC_feature_reduction`` with a threshold of 0.85. The number of retained
    features is printed.

    :param disc: Disc number used to select the ``label{disc}`` columns.
    :type disc: int
    :return: The original t2w table restricted to the retained columns.
    :rtype: pd.DataFrame
    """

    def _load_disc_features(*path_parts):
        # Read one CSV below data/ and keep the numeric columns of this disc.
        table = pd.read_csv(root_dir.joinpath("data", *path_parts), sep=",")
        table = table.loc[:, table.columns.str.contains(f"label{disc}")]
        return table[table.select_dtypes(include="number").columns.tolist()]

    tables = [
        _load_disc_features("filtered_midas900_t2w_radiomics.csv"),
        _load_disc_features(
            "mask_perturbation", "filtered_midas900_t2w_radiomics_shifted.csv"
        ),
        _load_disc_features(
            "mask_perturbation", "filtered_midas900_t2w_radiomics_eroded.csv"
        ),
        _load_disc_features(
            "mask_perturbation", "filtered_midas900_t2w_radiomics_dilated.csv"
        ),
    ]

    if disc == 1:
        # NOTE(review): row 317 is excluded for disc 1 only; the reason is not
        # recorded in this file.
        tables = [table.drop(index=317) for table in tables]

    t2, t2_shifted, t2_eroded, t2_dilated = tables

    t2_reduced_ICC = ICC_feature_reduction(t2, t2_shifted, t2_eroded, t2_dilated, 0.85)
    # The message used to say "t1." although only t2w data is processed here.
    print(f"t2.     {t2_reduced_ICC.shape[1]} features retained from {t2.shape[1]}")

    return t2_reduced_ICC

In [None]:
# ICC-stable t2w features, computed separately for each of the five discs (one cell per disc).
disc_1_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=1)

In [None]:
disc_2_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=2)

In [None]:
disc_3_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=3)

In [None]:
disc_4_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=4)

In [None]:
disc_5_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=5)

In [None]:
# Place the five per-disc tables side by side (aligned on the row index) and save them as one CSV.
# NOTE(review): row 317 was dropped for disc 1 only, so that row will hold NaN in the disc-1 columns.
pd.concat(
    [
        disc_1_t2_reduced_ICC,
        disc_2_t2_reduced_ICC,
        disc_3_t2_reduced_ICC,
        disc_4_t2_reduced_ICC,
        disc_5_t2_reduced_ICC,
    ],
    axis=1,
).to_csv(
    root_dir.joinpath(
        "data",
        "mask_perturbation",
        f"filtered_midas900_t2w_radiomics_reduced_ICC_per_disc.csv",
    )
)

### Near-zero variance

In [None]:
import numpy as np

# Feature reduction based on variance thresholding (remove features with variance smaller than 0.05)
from sklearn.feature_selection import VarianceThreshold

# Initialize selector based on VarianceThreshold
# Initialize selector based on VarianceThreshold
selector = VarianceThreshold(threshold=0.05)

# Estimate variances and reduce features.
# ``labels`` and ``radiomic_features`` pool all five discs (default from_image="t1w_t2w").
labels, radiomic_features = get_labels_and_features_all_discs(rater="JDCarlos")
# The transformed array returned here is discarded; only the fitted support mask is used below.
selector.fit_transform(radiomic_features)

# Get the selected feature labels and reduce the Radiomic_Feature dataframe
radiomic_features_var = radiomic_features.loc[:, selector.get_support()]

# Display the number of features removed
print(
    f"{np.count_nonzero(~selector.get_support())}/{radiomic_features.shape[1]} features were removed due to near-zero variance."
)

# The selector is no longer needed once the reduced frame exists.
del selector

### Correlation

In [None]:
corr_matrix_var = radiomic_features_var.corr(
    method="spearman"
).abs()  # absolute correlation matrix

# Initialize the flag vector with True values
to_keep = np.full((corr_matrix_var.shape[1]), True, dtype=bool)

# Greedy filter in column order: every feature that is still kept removes all later
# features correlated with it at 0.8 or above. One vectorised comparison per row
# replaces the scalar double loop; NaN correlations never remove a feature.
corr_values = corr_matrix_var.to_numpy()
for i in range(corr_values.shape[1]):
    if to_keep[i]:
        to_keep[i + 1 :] &= ~(corr_values[i, i + 1 :] >= 0.8)

# Retain features that are not highly correlated
radiomic_features_corr = radiomic_features_var.iloc[:, to_keep]

print(
    f"{np.count_nonzero(~to_keep)}/{radiomic_features_var.shape[1]} features were removed due to high correlation. {radiomic_features_corr.shape[1]} features remaining."
)

# Clean up the temporaries of this cell.
del to_keep, i, corr_values

In [None]:
import seaborn as sns

# Calculate the correlation matrix of the original feature set
# Calculate the (signed) Spearman correlation matrix of the original feature set
corr_matrix = radiomic_features.corr(method="spearman")
# Display the correlation matrix
plt.figure(figsize=(8, 6.5))
sns.heatmap(corr_matrix, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation of Radiomic features")
plt.show()

# Calculate the correlation matrix of the reduced feature set
# (after both the variance filter and the correlation filter)
corr_matrix_red = radiomic_features_corr.corr(method="spearman")
# Display the correlation matrix
plt.figure(figsize=(8, 6.5))
sns.heatmap(corr_matrix_red, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation of reduced radiomic features")
plt.show()

# Clustering

In [None]:
import copy
from scipy.stats import zscore
from scipy.spatial import distance
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, fcluster

# Work on a copy so the reduced feature table itself is not modified.
radiomic_features_clus = copy.deepcopy(radiomic_features_corr)
# Normalize the data (z-score per column)
# NOTE(review): the code below calls DataFrame methods on the result; confirm that the
# installed SciPy returns a DataFrame from zscore when given one.
radiomic_features_clus = zscore(radiomic_features_clus, axis=0)

# Calculate and plot the clustergram
# Ward linkage over samples (rows) and over features (columns).
row_linkage = hierarchy.linkage(
    distance.pdist(radiomic_features_clus.to_numpy()), method="ward"
)
col_linkage = hierarchy.linkage(
    distance.pdist(radiomic_features_clus.T.to_numpy()), method="ward"
)
g = sns.clustermap(
    radiomic_features_clus,
    row_linkage=row_linkage,
    col_linkage=col_linkage,
    method="ward",
    vmin=-3,
    vmax=3,
    figsize=(8, 10),
    cmap="viridis",
)
# Reposition the colour bar.
g.ax_cbar.set_position((0.90, 0.2, 0.03, 0.3))

# Extract 5 disc degeneration clusters and append the "Clusters" variable to the DataFrame
n_clusters = 5
radiomic_features_clus["Clusters"] = fcluster(
    row_linkage, n_clusters, criterion="maxclust"
)

# Print the cluster assignments
print("Cluster Assignments:", radiomic_features_clus["Clusters"])

del g, n_clusters

In [None]:
from scipy.stats import chi2_contingency

# Concatenate the target clinical variable to the radiomic DataFrame
# Concatenate the target clinical variable to the radiomic DataFrame
# (``labels`` is the global grade Series; this assumes it still holds the all-disc
# labels loaded in the near-zero-variance cell, i.e. the cells ran top to bottom).
radiomic_features_clus = pd.concat([radiomic_features_clus, labels], axis=1)

# Barplot clusters/grades
plt.figure(figsize=(8, 8))
sns.countplot(
    x="Clusters", hue="label", data=radiomic_features_clus, palette="coolwarm"
)
plt.title("Distribution of Pfirmann grade in each Cluster")
plt.xlabel("Cluster")
plt.ylabel("Count")
plt.legend(title="Pfirmann grade", loc="upper right")
plt.show()

# Perform chi-squared test of independence on the grade x cluster contingency table
chi2, p, _, _ = chi2_contingency(
    pd.crosstab(radiomic_features_clus["label"], radiomic_features_clus["Clusters"])
)

# Print the results
print(f"Chi-squared statistic: {chi2}")
print(f"P-value: {p}")

# Classification

## Per disc

In [None]:
def disc_results(clf, disc: int = 1):
    """Run ``cv`` and ``visual_metrics`` for ``clf`` on the data of one disc.

    Args:
        clf: Estimator instance, forwarded unchanged to both helpers.
        disc: Disc identifier passed as ``label`` to
            ``get_labels_and_features``; defaults to 1.
    """
    # The loader returns the labels first and the features second.
    targets, predictors = get_labels_and_features(rater="JDCarlos", label=disc)
    cv(clf, predictors, targets)
    visual_metrics(clf, predictors, targets)

In [None]:
# For each disc identifier 1..5, load that disc's data for rater "JDCarlos"
# and pass it to `test_multiple_models`. `label`, `labels` and `features`
# remain bound to the last disc (5) after the loop.
for label in range(1, 6):
    print(f"Disc: {label}")
    labels, features = get_labels_and_features(rater="JDCarlos", label=label)
    test_multiple_models(features, labels)

### Disc 1

In [None]:
# Disc 1 (the default `disc` of `disc_results`) with a gradient boosting
# classifier left at its default settings.
clf = GradientBoostingClassifier()
disc_results(clf)

### Disc 2

In [None]:
# Disc 2 with a random forest classifier left at its default settings.
clf = RandomForestClassifier()
disc_results(clf, disc=2)

### Disc 3

In [None]:
# Disc 3 with a multi-layer perceptron classifier left at its default settings.
clf = MLPClassifier()
disc_results(clf, disc=3)

### Disc 4

In [None]:
# Disc 4 with a gradient boosting classifier left at its default settings.
clf = GradientBoostingClassifier()
disc_results(clf, disc=4)

### Disc 5

In [None]:
# Disc 5 with a support vector classifier left at its default settings.
clf = SVC()
disc_results(clf, disc=5)

## All discs

In [None]:
def all_discs_results(clf):
    """Run ``visual_metrics`` for ``clf`` on the data of all discs together.

    Args:
        clf: Estimator instance, forwarded unchanged to ``visual_metrics``.
    """
    # The loader returns the labels first and the features second.
    targets, predictors = get_labels_and_features_all_discs(rater="JDCarlos")
    # NOTE(review): the `cv` call is disabled here, unlike in the per-disc
    # helper — confirm this is intentional.
    # cv(clf, predictors, targets)
    visual_metrics(clf, predictors, targets)

In [None]:
# Load the all-discs data for rater "JDCarlos" and pass it to
# `test_multiple_models`.
labels, features = get_labels_and_features_all_discs(rater="JDCarlos")
test_multiple_models(features, labels)

In [None]:
# All discs with a support vector classifier left at its default settings.
clf = SVC()
all_discs_results(clf)

In [None]:
labels, features = get_labels_and_features_all_discs(rater="JDCarlos", from_image="t2w")
# Shift the grades so that the smallest class becomes 0 (a no-op when it
# already is). `Series.min()` replaces the builtin `min`, which walked the
# Series element by element, twice. `labels` itself is left untouched.
labels_ = labels - labels.min()
# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels_, test_size=0.25, random_state=0, stratify=labels_
)

# NOTE(review): the classifier is created without `random_state`, so the
# fitted model can differ between runs even though the split is seeded —
# confirm whether that is intended.
clf = ExtraTreesClassifier()
# Feature reduction and scaling live inside the pipeline, so they are fitted
# on the training split only.
pipeline_clf = Pipeline(
    [
        ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
        ("correlationreduction", CorrelationFeatureReduction()),
        ("scaler", StandardScaler()),
        ("classifier", clf),
    ]
)
pipeline_clf.fit(X_train, y_train)
# Predict the values for the test set
y_pred = pipeline_clf.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Convert the report to a DataFrame (one row per report entry)
df = pd.DataFrame(report).transpose()

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted pipeline on the held-out split:
# 10 repeats per feature, fixed seed.
r = permutation_importance(pipeline_clf, X_test, y_test, n_repeats=10, random_state=0)

In [None]:
# List the features whose mean permutation importance stays above zero after
# subtracting two standard deviations, most important first. The column Index
# is indexed directly instead of being copied into a list on every iteration.
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(
            f"{X_test.columns[i]:<8} "
            f"{r.importances_mean[i]:.3f}"
            f" +/- {r.importances_std[i]:.3f}"
        )

In [None]:
def float_int(x):
    """Format one numeric cell as text for the heatmap annotations.

    Values strictly greater than 1 are rendered as integers (any fractional
    part is truncated); all other values are rendered with two decimals.
    A value of exactly 1 therefore comes out as ``"1.00"``.

    Args:
        x: Number to format.

    Returns:
        The formatted string.
    """
    if x > 1:
        return str(int(x))
    return f"{x:.2f}"

In [None]:
# First five rows of the classification report table.
df1 = df.iloc[:5]
# Colour source: the same table with `support` pinned to 0.1 so that column
# stays inside the 0-1 colour range. The annotations still come from `df1`.
df1_ = df1.assign(support=0.1)

fig = go.Figure(
    go.Heatmap(
        x=df1.columns,
        y=["1", "2", "3", "4", "5"],
        z=df1_.to_numpy().tolist(),
        text=[list(map(float_int, row)) for row in df1.to_numpy().tolist()],
        texttemplate="%{text}",
        colorscale=["#ffffff", colors[1]],
        zmin=0,
        zmax=1,
        showscale=True,
        xgap=1,
        ygap=1,
        hoverinfo="none",
        hoverongaps=False,
    )
)
fig.update_layout(
    title="Per grade classification results for the five level grading task",
    legend_title_text="Intervertebral Disc",
    plot_bgcolor="rgba(0,0,0,0)",
    autosize=False,
    margin={"pad": 10},
    grid_rows=1,
)
# Capitalized column names as tick labels, one per column position.
fig.update_xaxes(
    automargin=True,
    showgrid=False,
    tickvals=list(range(len(df1.columns))),
    ticktext=[col.capitalize() for col in df1.columns],
)
fig.update_yaxes(automargin=True, showgrid=False, title_text="Pfirrmann Grade")
fig.update_traces(textfont_size=16)
fig.show()
# fig.write_image(root_dir.joinpath("figures", "per_grade_5_levels.pdf"))

In [None]:
# Wrap the test-set predictions in a Series named "Predicted" and reset the
# index of `y_test` so both share a 0..n-1 index. The reset is in place, so
# `y_test` itself is modified.
pred_series = pd.Series(y_pred, name="Predicted")
y_test.reset_index(drop=True, inplace=True)

In [None]:
# Stack predictions and true labels as two rows and add 1 to undo the
# zero-based shift applied before training (assumes the original minimum
# grade was 1 — TODO confirm). Then transpose, group by predicted grade and
# count the true labels, one column per true label.
# NOTE: this rebinds `df`, which previously held the classification report.
df = pd.DataFrame([pred_series, y_test]).add(1).astype(int)
df = df.T.groupby("Predicted").value_counts().unstack()

In [None]:
# Bar chart of the prediction counts: one bar trace per column of `df`.
fig = px.bar(df, title="Class Prediction Error", color_discrete_sequence=colors)

# Row totals, drawn as text labels above each predicted-grade position.
total_count = df.sum(axis=1)
fig.add_scatter(
    x=df.index,
    y=total_count,
    text=total_count,
    mode="text",
    textposition="top center",
    showlegend=False,
)
# Applied after the text trace is added so it is affected as well.
fig.update_traces(textfont_size=12)

fig.update_layout(
    plot_bgcolor="rgba(0,0,0,0)",
    legend_title_text="True Pfirrmann Grade",
    grid_rows=1,
)
fig.update_xaxes(title_text="Predicted Pfirrmann Grade")
fig.update_yaxes(
    title_text="Frequency", showgrid=True, gridcolor="rgba(184, 184, 184, 0.3)"
)
fig.show()
# fig.write_image(root_dir.joinpath("figures", "class_prediction_error_5level.pdf"))

In [None]:
# Reload the all-discs data for rater "JDCarlos" and pass it to
# `imbalanced_learning_suite`.
labels, features = get_labels_and_features_all_discs(rater="JDCarlos")
imbalanced_learning_suite(features, labels)

## Combining classes 1 and 2

### Per disc

In [None]:
def disc_results_combining_1_and_2(clf, disc: int = 1):
    """Run ``cv`` and ``visual_metrics`` on one disc with grade 1 relabelled as 2.

    Args:
        clf: Estimator instance, forwarded unchanged to both helpers.
        disc: Disc identifier passed as ``label`` to
            ``get_labels_and_features``; defaults to 1.
    """
    merged_class_names = ["1 and 2", "3", "4", "5"]
    # The loader returns the labels first and the features second.
    targets, predictors = get_labels_and_features(rater="JDCarlos", label=disc)
    # Relabel every grade-1 entry as grade 2, in place.
    targets.loc[targets == 1] = 2
    cv(clf, predictors, targets)
    visual_metrics(clf, predictors, targets, classes=merged_class_names)

In [None]:
# For each disc identifier 1..5, load that disc's data for rater "JDCarlos",
# relabel grade 1 as grade 2 in place, and pass the result to
# `test_multiple_models`.
for label in range(1, 6):
    print(f"Disc: {label}")
    labels, features = get_labels_and_features(rater="JDCarlos", label=label)
    labels.loc[labels == 1] = 2
    test_multiple_models(features, labels)

#### Disc 1

In [None]:
# Disc 1 (the default `disc`) with merged grades 1 and 2, using a multi-layer
# perceptron classifier left at its default settings.
clf = MLPClassifier()
disc_results_combining_1_and_2(clf)

#### Disc 2

In [None]:
# Disc 2 with merged grades 1 and 2, using an extra-trees classifier left at
# its default settings.
clf = ExtraTreesClassifier()
disc_results_combining_1_and_2(clf, disc=2)

#### Disc 3

In [None]:
# Disc 3 with merged grades 1 and 2, using an extra-trees classifier left at
# its default settings.
clf = ExtraTreesClassifier()
disc_results_combining_1_and_2(clf, disc=3)

#### Disc 4

In [None]:
# Disc 4 with merged grades 1 and 2, using a random forest classifier left at
# its default settings.
clf = RandomForestClassifier()
disc_results_combining_1_and_2(clf, disc=4)

#### Disc 5

In [None]:
# Disc 5 with merged grades 1 and 2, using a gradient boosting classifier left
# at its default settings.
clf = GradientBoostingClassifier()
disc_results_combining_1_and_2(clf, disc=5)

### All discs

In [None]:
def all_discs_results_combining_1_and_2(clf):
    """Run ``cv`` and ``visual_metrics`` on all discs with grade 1 relabelled as 2.

    Args:
        clf: Estimator instance, forwarded unchanged to both helpers.
    """
    merged_class_names = ["1 and 2", "3", "4", "5"]
    # The loader returns the labels first and the features second.
    targets, predictors = get_labels_and_features_all_discs(rater="JDCarlos")
    # Relabel every grade-1 entry as grade 2, in place.
    targets.loc[targets == 1] = 2
    cv(clf, predictors, targets)
    visual_metrics(clf, predictors, targets, classes=merged_class_names)

In [None]:
# Load the all-discs data for rater "JDCarlos", relabel grade 1 as grade 2 in
# place, and pass the result to `test_multiple_models`.
labels, features = get_labels_and_features_all_discs(rater="JDCarlos")
labels.loc[labels == 1] = 2
test_multiple_models(features, labels)

In [None]:
# All discs with merged grades 1 and 2, using a random forest classifier left
# at its default settings.
clf = RandomForestClassifier()
all_discs_results_combining_1_and_2(clf)

In [None]:
# Reload the all-discs data for rater "JDCarlos", relabel grade 1 as grade 2
# in place, and pass the result to `imbalanced_learning_suite`.
labels, features = get_labels_and_features_all_discs(rater="JDCarlos")
labels.loc[labels == 1] = 2
imbalanced_learning_suite(features, labels)

In [None]:
labels, features = get_labels_and_features_all_discs(rater="JDCarlos", from_image="t2w")
# Work on a copy so `labels` keeps the original grades, then relabel grade 1
# as grade 2.
labels_ = labels.copy()
labels_.loc[labels_ == 1] = 2
# Shift the grades so that the smallest class becomes 0 (a no-op when it
# already is). `Series.min()` replaces the builtin `min`, which walked the
# Series element by element, twice.
labels_ = labels_ - labels_.min()
# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels_, test_size=0.25, random_state=0, stratify=labels_
)

# NOTE(review): the classifier is created without `random_state`, so the
# fitted model can differ between runs even though the split is seeded —
# confirm whether that is intended.
clf = RandomForestClassifier()
# Feature reduction and scaling live inside the pipeline, so they are fitted
# on the training split only.
pipeline_clf = Pipeline(
    [
        ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
        ("correlationreduction", CorrelationFeatureReduction()),
        ("scaler", StandardScaler()),
        ("classifier", clf),
    ]
)
pipeline_clf.fit(X_train, y_train)
# Predict the values for the test set
y_pred = pipeline_clf.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Convert the report to a DataFrame (one row per report entry)
df = pd.DataFrame(report).transpose()

In [None]:
# Permutation importance of the four-level pipeline on the held-out split:
# 10 repeats per feature, fixed seed. Relies on `permutation_importance`
# having been imported by an earlier cell.
r = permutation_importance(pipeline_clf, X_test, y_test, n_repeats=10, random_state=0)

In [None]:
# List the features whose mean permutation importance stays above zero after
# subtracting two standard deviations, most important first. The column Index
# is indexed directly instead of being copied into a list on every iteration.
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(
            f"{X_test.columns[i]:<8} "
            f"{r.importances_mean[i]:.3f}"
            f" +/- {r.importances_std[i]:.3f}"
        )

In [None]:
# First four rows of the classification report table, with `support` pinned
# to 0.1 so that column stays inside the 0-1 colour range. The annotations
# below still show the real values from `df`.
df1_ = df.iloc[:4].copy()
df1_["support"] = 0.1
fig = go.Figure(
    data=go.Heatmap(
        z=df1_.values.tolist(),
        # Column labels come from this cell's own table; the previous code read
        # `df1`, a leftover from the five-level heatmap cell, so this cell
        # failed (or used stale data) unless that cell had been run first.
        x=df1_.columns,
        y=["1 and 2", "3", "4", "5"],
        colorscale=["#ffffff", colors[4]],
        zmin=0,
        zmax=1,
        showscale=True,
        xgap=1,
        ygap=1,
        text=[[float_int(val) for val in row] for row in df.iloc[:4].values.tolist()],
        texttemplate="%{text}",
    )
)
fig.update_layout(
    autosize=False,
    margin=dict(pad=10),  # padding
    plot_bgcolor="rgba(0,0,0,0)",
    legend_title_text="Intervertebral Disc",
    grid_rows=1,
    title="Per grade classification results for the four level grading task",
)
# Capitalized column names as tick labels, one per column position.
fig.update_xaxes(
    automargin=True,
    tickvals=list(range(len(df1_.columns))),
    ticktext=[col.capitalize() for col in df1_.columns],
    showgrid=False,
)
fig.update_yaxes(automargin=True, title_text="Pfirrmann Grade", showgrid=False)
fig.update_traces(
    textfont_size=16,
)
fig.show()
# fig.write_image(root_dir.joinpath("figures", "per_grade_4_levels.pdf"))

In [None]:
# Replace the index of `y_test` with 0..n-1, in place, discarding the old index.
y_test.reset_index(drop=True, inplace=True)

In [None]:
# Wrap the test-set predictions in a Series named "Predicted".
pred_series = pd.Series(y_pred, name="Predicted")

In [None]:
# Stack predictions and true labels as two rows and add 2 to undo the
# zero-based shift applied before training (assumes the minimum grade after
# merging 1 into 2 was 2 — TODO confirm). Then transpose, group by predicted
# grade and count the true labels, one column per true label.
# NOTE: this rebinds `df`, which previously held the classification report.
df = pd.DataFrame([pred_series, y_test]).add(2).astype(int)
df = df.T.groupby("Predicted").value_counts().unstack()

In [None]:
# Bar chart of the prediction counts: one bar trace per column of `df`.
fig = px.bar(df, title="Class Prediction Error", color_discrete_sequence=colors)

# Rename the traces, in order, with the merged-grade display names.
for name, trace in zip(("1 and 2", "3", "4", "5"), fig.data):
    trace.name = name

# Row totals, drawn as text labels above each predicted-grade position.
total_count = df.sum(axis=1)
fig.add_scatter(
    x=df.index,
    y=total_count,
    text=total_count,
    mode="text",
    textposition="top center",
    showlegend=False,
)
# Applied after the text trace is added so it is affected as well.
fig.update_traces(textfont_size=12)

fig.update_layout(
    plot_bgcolor="rgba(0,0,0,0)",
    legend_title_text="True Pfirrmann Grade",
    grid_rows=1,
)
# Tick positions 2..5 carry the merged-grade display names.
fig.update_xaxes(
    title_text="Predicted Pfirrmann Grade",
    tickvals=[2, 3, 4, 5],
    ticktext=["1 and 2", "3", "4", "5"],
)
fig.update_yaxes(
    title_text="Frequency", showgrid=True, gridcolor="rgba(184, 184, 184, 0.3)"
)
fig.show()
# fig.write_image(root_dir.joinpath("figures", "class_prediction_error_4level.pdf"))