# Set up

In [None]:
import pathlib
import sys

# Absolute path of the parent of the current working directory; every data,
# figure and label path below is built from it.
root_dir = pathlib.Path("..").resolve()

# Make that directory importable so the `src.*` imports further down resolve.
sys.path.append(str(root_dir))

In [None]:
import warnings

def warn(*args, **kwargs):
    """Accept any arguments and do nothing."""
    return None


# Swap the stdlib entry point for the no-op above, so every later call to
# `warnings.warn` is silently discarded.
warnings.warn = warn

In [None]:
import logging
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import SimpleITK as sitk
from irrCAC.raw import CAC
from radiomics import imageoperations
from tqdm import tqdm

from src.ml.transforms import CorrelationFeatureReduction, ICCFeatureReduction, mRMRFeatureReduction, VarianceFeatureReduction
from src.ml.utils import build_dataframe_from_csv, get_labels_and_features, get_labels_and_features_all_discs

# Global matplotlib font settings applied to every matplotlib figure below.
# NOTE(review): the family is spelled "DeJavu Serif"; the font bundled with
# matplotlib is named "DejaVu Serif" — confirm the intended font resolves.
plt.rcParams["font.family"] = "DeJavu Serif"
plt.rcParams["font.serif"] = ["Times New Roman"]

# Five-colour palette shared by the yellowbrick and plotly figures.
colors = ["#663171", "#ea7428", "#0c7156", "#cf3a36", "#e2998a"]

In [None]:
# CSV handed to the data-loading helpers and read with pandas further down.
t2_img_relation_path = root_dir / "data" / "filtered_midas900_t2w.csv"


def label_path(rater):
    """Return the path of the disc-label CSV whose file name ends in `rater`."""
    return root_dir / "data" / "labels" / f"midasdisclabels{rater}.csv"

# Functions

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def test_multiple_models(features, labels):
    """Compare default-configured classifiers on a single hold-out split.

    The data is split 75/25 (stratified, ``random_state=0``). Each candidate
    is wrapped in the same variance -> correlation -> scaling pipeline,
    fitted on the training part and scored with the weighted F1 on the
    held-out part. The name and score of the best candidate are printed;
    nothing is returned.
    """
    candidates = {
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "SVM": SVC(),
        "Logistic Regression": LogisticRegression(),
        "Stochastic Gradient Descent": SGDClassifier(),
        "Naive Bayes": GaussianNB(),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Multilayer Perceptron": MLPClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "ExtraTrees": ExtraTreesClassifier(),
    }

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=0, stratify=labels
    )

    def build_pipeline(estimator):
        # Same preprocessing for every candidate so scores are comparable.
        return Pipeline(
            [
                ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
                ("correlationreduction", CorrelationFeatureReduction()),
                ("scaler", StandardScaler()),
                ("classifier", estimator),
            ]
        )

    f1_by_model = {}
    for model_name, estimator in candidates.items():
        fitted = build_pipeline(estimator).fit(X_train, y_train)
        f1_by_model[model_name] = f1_score(
            y_test, fitted.predict(X_test), average="weighted"
        )

    # Report the candidate with the highest weighted F1.
    winner = max(f1_by_model, key=f1_by_model.get)  # type: ignore
    print("Best classifier:", winner)
    print("F1 score:", f1_by_model[winner])

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


def cv(clf, features, labels):
    """Print the stratified 5-fold weighted-F1 of `clf` inside the shared pipeline.

    The classifier is preceded by variance filtering, correlation filtering
    and standard scaling. The mean and standard deviation of the fold scores
    are printed; nothing is returned.
    """
    steps = [
        ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
        ("correlationreduction", CorrelationFeatureReduction()),
        ("scaler", StandardScaler()),
        ("classifier", clf),
    ]
    fold_scores = cross_val_score(
        Pipeline(steps),
        features,
        labels,
        cv=StratifiedKFold(n_splits=5),
        scoring="f1_weighted",
    )
    mean_f1 = fold_scores.mean()
    std_f1 = fold_scores.std()
    print(f"Cross Validation F1 Score: {mean_f1:0.4f} +/- {std_f1:0.2f}")

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.under_sampling import RandomUnderSampler


def imbalanced_learning_suite(features, labels):
    """Cross-validate four imbalanced-learn ensembles and print their F1.

    Every ensemble is evaluated with stratified 5-fold cross-validation
    (weighted F1) behind the shared variance -> correlation -> scaling
    preprocessing. One line per ensemble is printed; nothing is returned.
    """
    ensembles = {
        "Balanced Bagging Classifier": BalancedBaggingClassifier(
            sampler=RandomUnderSampler()
        ),
        "Balanced RandomForest Classifier": BalancedRandomForestClassifier(),
        "RUS Boost Classifier": RUSBoostClassifier(),
        "Easy Ensemble Classifier": EasyEnsembleClassifier(),
    }

    # One splitter object is reused for every ensemble.
    folds = StratifiedKFold(n_splits=5)

    for ensemble_name, ensemble in ensembles.items():
        steps = [
            ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
            ("correlationreduction", CorrelationFeatureReduction()),
            ("scaler", StandardScaler()),
            ("classifier", ensemble),
        ]
        fold_scores = cross_val_score(
            Pipeline(steps), features, labels, cv=folds, scoring="f1_weighted"
        )
        mean_f1, std_f1 = fold_scores.mean(), fold_scores.std()
        print(
            f"{ensemble_name}: {mean_f1:0.2f} f1 with a standard deviation of {std_f1:0.2f}"
        )

In [None]:
import yellowbrick.classifier as viz

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
)
from yellowbrick.style import set_palette

# Make yellowbrick visualizers use the shared palette.
set_palette(colors)


def visual_metrics(clf, features, labels, classes=("1", "2", "3", "4", "5")):
    """Fit `clf` on a hold-out split and show three diagnostic panels.

    NOTE: a second function with the same name is defined further down in
    this file; once that cell has run, it replaces this definition.

    Args:
        clf: Unfitted classifier placed at the end of the shared
            variance -> correlation -> scaling pipeline.
        features: Feature table accepted by ``train_test_split``.
        labels: Grades; must support ``.copy()`` and ``.plot(kind="hist")``
            (e.g. a pandas Series).
        classes: Display names of the classes, in ascending label order.
            The default is immutable so it cannot be shared and modified
            between calls.

    The panels are: histogram of the original labels, yellowbrick
    classification report, and yellowbrick class-prediction error.
    Accuracy-within-one, balanced accuracy and the sklearn classification
    report for the 25 % test split are printed. Nothing is returned.
    """
    class_names = list(classes)

    # Shift the labels so the smallest one is 0; the original `labels` stay
    # untouched and are used for the histogram.
    labels_ = labels.copy()
    lowest_label = min(labels_)
    if lowest_label != 0:
        labels_ = labels_ - lowest_label
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels_, test_size=0.25, random_state=0, stratify=labels_
    )

    _, axes = plt.subplots(1, 3, figsize=(15, 5))
    labels.plot(
        kind="hist",
        title="Pfirmann grade distribution",
        ax=axes[0],
        xticks=[1, 2, 3, 4, 5],
        align="mid",
    )

    pipeline_clf = Pipeline(
        [
            ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
            ("correlationreduction", CorrelationFeatureReduction()),
            ("scaler", StandardScaler()),
            ("classifier", clf),
        ]
    )
    pipeline_clf.fit(X_train, y_train)

    axes[1].set_title("Classification Report")
    axes[1].set_ylabel("Class")
    # The report receives the class names in reversed order (as in the
    # original code); the slice creates a new list.
    visualizer_class = viz.ClassificationReport(
        pipeline_clf,
        classes=class_names[::-1],
        support=True,
        ax=axes[1],
        cmap="Blues",
    )
    visualizer_class.score(X_test, y_test)

    axes[2].set_title("Classification Prediction Error")
    axes[2].set_xlabel("Class")
    axes[2].set_ylabel("Number of Predictions")
    visualizer_pred = viz.ClassPredictionError(
        pipeline_clf, classes=class_names, ax=axes[2]
    )
    visualizer_pred.score(X_test, y_test)

    plt.tight_layout()
    plt.show()

    predictions = pipeline_clf.predict(X_test)
    print(f"Accuracy within one grade: {accuracy_within_one(y_test, predictions):0.2f}")
    print(f"Balanced accuracy: {balanced_accuracy_score(y_test, predictions):0.2f}")
    print(classification_report(y_test, predictions, target_names=class_names))

In [None]:
from sklearn.model_selection import RandomizedSearchCV


def random_search(clf, distribution, features, labels):
    """Run a 10-iteration randomized search and return the classifier settings.

    The search optimises the weighted F1 with stratified 5-fold
    cross-validation over the shared preprocessing pipeline. The best score
    and the full best parameter set are printed.

    Returns:
        dict: the best parameters whose key starts with ``classifier__``,
        with that text removed from the key.
    """
    steps = [
        ("variancethreshold", VarianceFeatureReduction(threshold=0.05)),
        ("correlationreduction", CorrelationFeatureReduction()),
        ("scaler", StandardScaler()),
        ("classifier", clf),
    ]
    searcher = RandomizedSearchCV(
        Pipeline(steps),
        distribution,
        cv=StratifiedKFold(n_splits=5),
        scoring="f1_weighted",
        n_iter=10,
        random_state=0,
    )
    search = searcher.fit(features, labels)

    print(f"Best parameter (CV score={search.best_score_:0.3f}): {search.best_params_}")

    prefix = "classifier__"
    classifier_params = {}
    for key, value in search.best_params_.items():
        if key.startswith(prefix):
            classifier_params[key.replace(prefix, "")] = value
    return classifier_params

In [None]:
def checkMaskVol(image, mask, label):
    """Return `label` if the mask region passes the checks, else ``None``.

    The check is delegated to ``imageoperations.checkMask`` with a minimum
    of 3 ROI dimensions and a minimum ROI size of 1000. Any exception raised
    by that call is treated as "not valid".
    """
    try:
        imageoperations.checkMask(
            image, mask, minimumROIDimensions=3, minimumROISize=1000, label=label
        )
    except Exception:
        return None
    return label

In [None]:
def accuracy_within_one(labels, predictions):
    """Return the fraction of predictions at most one grade away from the label.

    The two inputs are compared position by position. They are converted to
    flat NumPy arrays first, so pandas objects are never aligned on their
    index (index alignment would turn non-matching indices into NaN and
    silently count those samples as wrong).

    Args:
        labels: Array-like of numeric true grades.
        predictions: Array-like of numeric predicted grades, same length.

    Returns:
        float: share of samples with ``|label - prediction| <= 1``.

    Raises:
        ValueError: if the inputs differ in length or are empty.
    """
    # Float conversion also avoids wrap-around when subtracting unsigned ints.
    true_grades = np.asarray(labels, dtype=float).ravel()
    predicted_grades = np.asarray(predictions, dtype=float).ravel()

    if true_grades.shape != predicted_grades.shape:
        raise ValueError(
            "labels and predictions must have the same length "
            f"({true_grades.size} != {predicted_grades.size})"
        )
    if true_grades.size == 0:
        raise ValueError("accuracy_within_one needs at least one sample")

    within_one = np.abs(true_grades - predicted_grades) <= 1
    return float(np.mean(within_one))

# EDA

In [None]:
# Feature CSV passed to the data-loading helpers in the cells below.
features_path = root_dir.joinpath("data", "features", "t2w_improved_params.csv")

## Data Sources

In [None]:
# Load the relation table; its "Image" column is read in the next cell.
midas_img_relation = pd.read_csv(t2_img_relation_path, sep=",")

In [None]:
import json

def _first_dicom_value(metadata, tag, default="N/A"):
    """Return the first item of ``metadata[tag]["Value"]``.

    `default` is returned when the tag is absent, has no "Value" entry, or
    the "Value" list is empty (the last case previously raised IndexError).
    """
    values = metadata.get(tag, {}).get("Value") or [default]
    return values[0]


# Collect scanner information from the JSON file stored next to each image.
devices = []
for _, row in midas_img_relation.iterrows():
    img_path = pathlib.Path(row["Image"])
    # Drop the last suffix, then replace whatever suffix remains, so a
    # double extension such as ".nii.gz" maps to ".json".
    metadata_path = img_path.with_suffix("").with_suffix(".json")
    # JSON is read as UTF-8 regardless of the platform's default encoding.
    with open(metadata_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    devices.append(
        {
            "Manufacturer": _first_dicom_value(metadata, "00080070"),
            "Model name": _first_dicom_value(metadata, "00081090"),
            "Field Strength": _first_dicom_value(metadata, "00180087"),
        }
    )

In [None]:
# One row per image with the columns "Manufacturer", "Model name" and
# "Field Strength" collected above.
df = pd.DataFrame(devices)

In [None]:
# Count how often each combination of the remaining columns occurs within
# each manufacturer (shown as the cell output).
df.groupby("Manufacturer").value_counts()

## Labels

### Distribution

In [None]:
# Rebinds `df` (previously the device table) to the frame built from the
# relation CSV, the "MODE" label file and the feature CSV.
df = build_dataframe_from_csv(t2_img_relation_path, label_path("MODE"), features_path)

In [None]:
# Label column of `df` -> disc name used as the series name.
discs = {
    "1": "L5-S",
    "2": "L4-L5",
    "3": "L3-L4",
    "4": "L2-L3",
    "5": "L1-L2",
}
# One value-count Series per disc column, named after the disc.
a = [
    df[column].value_counts().rename(disc_name)
    for column, disc_name in discs.items()
]

In [None]:
# Each Series in `a` becomes a row; transposing gives one column per disc
# and one row per grade value.
df1 = pd.DataFrame(a).T

# Bar chart of the counts in df1, coloured with the shared palette.
fig = px.bar(
    df1, title="Pfirrmann Grade Distribution", color_discrete_sequence=colors
)
# Row totals (sum over all discs per grade), drawn as text above the bars.
total_count = df1.sum(axis=1)
fig.add_trace(
    go.Scatter(
        x=df1.index,
        y=total_count,
        mode="text",
        text=total_count,
        textposition="top center",
        showlegend=False,
    )
)
fig.update_traces(textfont_size=12)
fig.update_xaxes(title_text="Pfirrmann Grade")
fig.update_yaxes(
    title_text="Frequency", showgrid=True, gridcolor="rgba(184, 184, 184, 0.3)"
)
# Transparent plot and paper backgrounds.
fig.update_layout(
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
    legend_title_text="Intervertebral Disc",
    grid_rows=1,
)
fig.show()
# Export the figure as a PDF under <root>/figures.
pio.write_image(fig, root_dir.joinpath("figures", "pfirrmann_grade_distribution_v2.pdf"))

### Inter-rater agreement

In [None]:
# Labels from three label files (one per rater); the features returned by
# the helper are discarded here.
labels_jd, _ = get_labels_and_features_all_discs(t2_img_relation_path, label_path("JDCarlos"), features_path)
labels_rafa, _ = get_labels_and_features_all_discs(t2_img_relation_path, label_path("Rafa"), features_path)
labels_rodro, _ = get_labels_and_features_all_discs(t2_img_relation_path, label_path("Rodro"), features_path)

# Side-by-side table (one column per rater); rows with any missing rating
# are dropped, then the table is converted to a NumPy array. `arr` is used
# by the sensitivity analysis further down.
arr = pd.concat([labels_jd, labels_rodro, labels_rafa], axis=1)
arr.dropna(inplace=True)
arr = arr.values

In [None]:
# Agreement between the three raters on the labels as loaded. Unlike `arr`,
# this concatenation is passed on without dropping incomplete rows.
cac3raters = CAC(pd.concat([labels_jd, labels_rodro, labels_rafa], axis=1))
print(cac3raters.fleiss()["est"])
print(cac3raters.gwet()["est"])  

In [None]:
# Overwrite every label equal to 1 with 2, in place, for each rater.
# `arr` was built before this cell and is not affected.
labels_jd.loc[(labels_jd==1).values] = 2
labels_rodro.loc[(labels_rodro==1).values] = 2
labels_rafa.loc[(labels_rafa==1).values] = 2

In [None]:
# Same agreement statistics, recomputed after labels 1 and 2 were merged.
cac3raters = CAC(pd.concat([labels_jd, labels_rodro, labels_rafa], axis=1))
print(cac3raters.fleiss()["est"])
print(cac3raters.gwet()["est"])  

In [None]:
from statsmodels.stats import inter_rater as irr

def calculate_fleiss_kappa(ratings):
    """Return Fleiss' kappa for `ratings` (one row per item, one column per rater)."""
    # aggregate_raters returns the per-item category counts first; the
    # second element is not needed here.
    category_counts = irr.aggregate_raters(ratings)[0]
    return irr.fleiss_kappa(category_counts, method="fleiss")

def sensitivity_analysis(ratings):
    """Recompute Fleiss' kappa with each row of `ratings` left out in turn.

    The kappa of the complete table is printed first.

    Returns:
        list: one kappa per removed row, in row order.
    """
    item_count = ratings.shape[0]
    baseline_kappa = calculate_fleiss_kappa(ratings)
    print(f"Original Fleiss' Kappa: {baseline_kappa:.4f}")

    return [
        calculate_fleiss_kappa(np.delete(ratings, row, axis=0))
        for row in range(item_count)
    ]

# Run the leave-one-out analysis on `arr` (the three-rater array with
# incomplete rows removed).
sensitivity_kappas = sensitivity_analysis(arr)

# Spread of the leave-one-out kappas.
print(f"Standard deviation: {np.std(sensitivity_kappas):.4f}")

# Kappa after excluding each item (1-based on the x axis), with the
# full-data kappa drawn as a dashed reference line.
plt.plot(range(1, len(sensitivity_kappas) + 1), sensitivity_kappas, marker='o')
plt.axhline(y=calculate_fleiss_kappa(arr), color='r', linestyle='--', label='Original Kappa')
plt.xlabel('Excluded Item Index')
plt.ylabel('Fleiss\' Kappa')
plt.title('Sensitivity Analysis of Fleiss\' Kappa')
plt.legend()
plt.show()

## Histogram

In [None]:
def get_cropped_discs(image_path, mask_path):
    """Return one masked 2D crop per valid mask label on the central slice.

    Steps:
      1. Read image and mask and collect up to five mask labels accepted by
         ``checkMaskVol`` (checked on the volumes as read).
      2. Reorient both volumes to "LPI" and take the middle index along the
         first axis of the reoriented image.
      3. Replace every pixel whose mask value is 0 with NaN.
      4. Crop the masked slice to the bounding box of each accepted label
         that is present on that slice.

    Args:
        image_path: Path of the image readable by ``sitk.ReadImage``.
        mask_path: Path of the label mask readable by ``sitk.ReadImage``.

    Returns:
        list: SimpleITK images, one per accepted label found on the slice,
        in the order reported by ``LabelShapeStatisticsImageFilter``.
    """
    image = sitk.ReadImage(image_path)
    mask = sitk.ReadImage(mask_path)

    # Labels are tested in ascending order; the loop stops at five accepted
    # labels. A falsy result (None, or label 0) is skipped.
    valid_disc_segmentations = set()
    for disc_segmentation in np.unique(sitk.GetArrayFromImage(mask).ravel()):
        if result := checkMaskVol(image, mask, disc_segmentation):
            valid_disc_segmentations.add(int(result))
        if len(valid_disc_segmentations) == 5:
            break

    orient = sitk.DICOMOrientImageFilter()
    orient.SetDesiredCoordinateOrientation("LPI")  # Left Posterior Inferior
    image = orient.Execute(image)
    mask = orient.Execute(mask)

    # image = imageoperations.normalizeImage(image, scale=100)

    center_slice = image.GetSize()[0] // 2
    image = image[center_slice, ...]
    mask = mask[center_slice, ...]

    # NaN can only be stored in floating-point pixels. Integer images are
    # cast first so the NaN marker survives and callers can filter on it;
    # images that are already floating point are left untouched.
    if image.GetPixelID() not in (sitk.sitkFloat32, sitk.sitkFloat64):
        image = sitk.Cast(image, sitk.sitkFloat64)

    maskfilter = sitk.MaskImageFilter()
    maskfilter.SetMaskingValue(0.0)
    maskfilter.SetOutsideValue(np.nan)
    masked_image = maskfilter.Execute(image, mask)

    labelimfilter = sitk.LabelShapeStatisticsImageFilter()
    labelimfilter.Execute(mask)

    cropped_discs = []
    for label in labelimfilter.GetLabels():
        if label not in valid_disc_segmentations:
            continue
        roifilter = sitk.RegionOfInterestImageFilter()
        roifilter.SetRegionOfInterest(labelimfilter.GetBoundingBox(label))
        cropped_discs.append(roifilter.Execute(masked_image))
    return cropped_discs

In [None]:
# Gather, for every cropped disc, its position in the crop list, the grade
# stored in the matching column of `df`, and its non-NaN pixel values.
results = {"Discs": [], "Pfirmann": [], "Array": []}
# `total` lets tqdm show a real progress bar; iterrows() has no length.
for _, row in tqdm(df.iterrows(), total=len(df)):
    cropped_discs = get_cropped_discs(row["Image"], row["Mask"])
    # NOTE(review): `idx` is the 1-based position among the crops that were
    # returned, not the mask label. If a disc is rejected, later crops are
    # paired with an earlier grade column — confirm this is intended.
    for idx, disc in enumerate(cropped_discs, start=1):
        results["Discs"].append(idx)
        results["Pfirmann"].append(row[str(idx)])
        disc_array = sitk.GetArrayFromImage(disc)
        # Keep only the pixels that were not replaced by NaN.
        results["Array"].append(disc_array[~np.isnan(disc_array)])

In [None]:
# One row per cropped disc: "Discs", "Pfirmann" and "Array" columns.
histograms_df = pd.DataFrame(results)
histograms_df.head()

In [None]:
bins = 10


def _min_max_scale(values):
    """Rescale `values` linearly so its minimum maps to 0 and maximum to 1."""
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest)


def _bin_counts(values):
    """Return the counts of `values` in `bins` equal-width bins."""
    return np.histogram(values, bins=bins)[0]


process = histograms_df.copy()
process["Scaled Array"] = process["Array"].map(_min_max_scale)
process["Histogram"] = process["Scaled Array"].map(_bin_counts)
# Despite its name, this groups the rows by the "Pfirmann" column.
grouped_by_disc = process.groupby("Pfirmann")

In [None]:
# Bin edges on [0, 1]; every edge but the last is used as a bar position.
x_ = np.arange(0, 1.01, 1 / bins)
plt.figure(figsize=(10, 6))
# Mean histogram per group, computed once and reused for line and bars.
mean_histograms = grouped_by_disc["Histogram"].mean()
left_edges = x_[:-1]
# Groups are drawn from the last one to the first one.
for position in reversed(range(5)):
    mean_counts = mean_histograms.iloc[position]
    plt.plot(left_edges, mean_counts)
    plt.bar(
        x=left_edges,
        height=mean_counts,
        width=np.diff(x_),
        label=str(position + 1),
    )
plt.legend()
# plt.ylim(0, 100)
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D view of the same mean histograms: one row of bars per group, placed
# at successive positions along the y axis.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection="3d")

# X, Y positions for the bars
x_positions = x_[:-1]
y_positions = [0, 1, 2, 3, 4]  # Different datasets on different y-positions

# Width of each bar and yz-space between bars
width = np.diff(x_)
# NOTE(review): `y_space` is not used anywhere in this cell.
y_space = 0.8

# Adding the histograms to the plot
for i, hist in enumerate(grouped_by_disc["Histogram"].mean()):
    ax.bar(
        x_positions,
        hist,
        zs=y_positions[i],
        zdir="y",
        width=width,
        align="center",
        alpha=0.7,
    )

ax.set_xlabel("X-axis (Value)")
# ax.set_ylabel('Y-axis (Dataset)')
ax.set_zlabel("Z-axis (Frequency)")

# Setting the y-ticks to correspond to different datasets
ax.set_yticks(y_positions)
ax.set_yticklabels(
    ["Pfirrmann 1", "Pfirrmann 2", "Pfirrmann 3", "Pfirrmann 4", "Pfirrmann 5"]
)

plt.title("3D Histograms")
plt.show()

## Feature Selection

### Intraclass correlation coefficient

In [None]:
def ICC_feature_reduction(
    input_dataframe,
    input_dataframe_ICC1,
    input_dataframe_ICC2,
    input_dataframe_ICC3,
    ICC_thresh,
):
    """Keep the columns of `input_dataframe` whose ICC exceeds `ICC_thresh`.

    For every feature the four frames are stacked vertically; each frame is
    treated as one rater ("Repetition" 1-4) and each distinct index value as
    one target ("Patients"). The second row of pingouin's
    ``intraclass_corr`` result is compared with the threshold.

    Args:
        input_dataframe: Reference feature table; its columns define the
            features that are tested and returned.
        input_dataframe_ICC1: Repeated extraction with the same columns and
            the same number of rows.
        input_dataframe_ICC2: Repeated extraction, as above.
        input_dataframe_ICC3: Repeated extraction, as above.
        ICC_thresh: A feature is kept only if its ICC is strictly greater.

    Returns:
        pandas.DataFrame: `input_dataframe` restricted to the kept columns.

    Raises:
        ValueError: if the four frames do not have the same number of rows.
    """
    frames = [
        input_dataframe,
        input_dataframe_ICC1,
        input_dataframe_ICC2,
        input_dataframe_ICC3,
    ]
    n_rows = len(input_dataframe)
    row_counts = [len(frame) for frame in frames]
    # Fail early with a readable message instead of a length-mismatch error
    # raised from inside the per-feature loop.
    if any(count != n_rows for count in row_counts):
        raise ValueError(
            f"All four frames must have the same number of rows, got {row_counts}"
        )

    # Imported here because nothing else in the file needs pingouin.
    import pingouin as pg

    # Rater id of every stacked row. It is identical for all features, so
    # it is built once outside the loop.
    repetition = np.repeat([1, 2, 3, 4], n_rows)

    to_keep = []
    for feature in input_dataframe.columns:
        # Stack the four measurements of this feature, keeping the index.
        feature_data = pd.concat(
            [frame[[feature]] for frame in frames],
            axis=0,
            ignore_index=False,
        )
        feature_data["Repetition"] = repetition
        # Equal index values across the frames get the same target id.
        feature_data["Patients"] = pd.factorize(feature_data.index)[0]
        feature_data = feature_data.rename(columns={feature: "FeatureValue"})

        icc_result = pg.intraclass_corr(
            data=feature_data,
            targets="Patients",
            raters="Repetition",
            ratings="FeatureValue",
        )
        # ICC2: A random sample of raters rate each target. Measure of
        # absolute agreement.
        icc_value = icc_result["ICC"].iloc[1]

        # A NaN ICC compares False and therefore drops the feature.
        to_keep.append(bool(icc_value > ICC_thresh))

    return input_dataframe.loc[:, to_keep]

In [None]:
# Reference features plus the three mask-perturbation variants, all loaded
# through the same helper (the labels it returns are discarded).
_, features_t2 = get_labels_and_features_all_discs(
    t2_img_relation_path, label_path("AVG"), features_path
)
_, features_t2_shift = get_labels_and_features_all_discs(
    t2_img_relation_path,
    label_path("AVG"),
    root_dir.joinpath(
        "data", "mask_perturbation", "t2w_improved_params_shift.csv"
    ),
)
_, features_t2_erode = get_labels_and_features_all_discs(
    t2_img_relation_path,
    label_path("AVG"),
    root_dir.joinpath(
        "data", "mask_perturbation", "t2w_improved_params_erode.csv"
    ),
)
_, features_t2_dilate = get_labels_and_features_all_discs(
    t2_img_relation_path,
    label_path("AVG"),
    root_dir.joinpath(
        "data", "mask_perturbation", "t2w_improved_params_dilate.csv"
    ),
)

# Sample 's_8141' is excluded from the comparison.
features_t2.drop(index='s_8141', inplace=True)
features_t2_shift.drop(index='s_8141', inplace=True)
features_t2_dilate.drop(index='s_8141', inplace=True)
# The eroded table was previously never filtered. If it contains the
# sample, it ends up one row longer than the others and the ICC computation
# fails. errors="ignore" keeps the earlier behaviour when the sample is
# simply absent from this table.
features_t2_erode.drop(index='s_8141', inplace=True, errors="ignore")

t2_reduced_ICC = ICC_feature_reduction(features_t2, features_t2_shift, features_t2_erode, features_t2_dilate, 0.85)
print(f"t2.     {t2_reduced_ICC.shape[1]} features retained from {features_t2.shape[1]}")

In [None]:
# Persist the ICC-filtered feature table next to the perturbation CSVs.
t2_reduced_ICC.to_csv(
    root_dir.joinpath(
        "data",
        "mask_perturbation",
        "t2w_improved_params_reduced_ICC.csv",
    )
)

In [None]:
def compute_ICC_reduced_per_disc(disc: int):
    """Run the ICC filter on the numeric columns belonging to one disc.

    The reference CSV and the shift / erode / dilate perturbation CSVs are
    each reduced to the numeric columns whose name contains
    ``label{disc}``. For disc 1, row 317 is removed from all four tables
    before the filter runs. The number of retained features is printed.

    Returns:
        pandas.DataFrame: the reference columns that pass the 0.85 threshold.
    """
    perturbation_dir = root_dir.joinpath("data", "mask_perturbation")
    csv_paths = [
        features_path,
        perturbation_dir.joinpath("t2w_improved_params_shift.csv"),
        perturbation_dir.joinpath("t2w_improved_params_erode.csv"),
        perturbation_dir.joinpath("t2w_improved_params_dilate.csv"),
    ]

    def load_numeric_disc_columns(csv_path):
        # Columns of this disc only, then the numeric ones among them.
        table = pd.read_csv(csv_path, sep=",")
        table = table.loc[:, table.columns.str.contains(f"label{disc}")]
        return table[table.select_dtypes(include="number").columns.tolist()]

    t2, t2_shifted, t2_eroded, t2_dilated = [
        load_numeric_disc_columns(csv_path) for csv_path in csv_paths
    ]

    if disc == 1:
        for table in (t2, t2_shifted, t2_eroded, t2_dilated):
            table.drop(index=317, inplace=True)

    t2_reduced_ICC = ICC_feature_reduction(t2, t2_shifted, t2_eroded, t2_dilated, 0.85)
    print(f"t2.     {t2_reduced_ICC.shape[1]} features retained from {t2.shape[1]}")

    return t2_reduced_ICC

In [None]:
# ICC-filtered features for the columns containing "label1".
disc_1_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=1)

In [None]:
# ICC-filtered features for the columns containing "label2".
disc_2_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=2)

In [None]:
# ICC-filtered features for the columns containing "label3".
disc_3_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=3)

In [None]:
# ICC-filtered features for the columns containing "label4".
disc_4_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=4)

In [None]:
# ICC-filtered features for the columns containing "label5".
disc_5_t2_reduced_ICC = compute_ICC_reduced_per_disc(disc=5)

In [None]:
# Join the five per-disc results column-wise. This rebinds `t2_reduced_ICC`
# (previously the all-disc result), and the feature-selection cells below
# work on this per-disc table.
# NOTE(review): concat with axis=1 aligns on the row index. The disc-1 table
# has row 317 removed, so its columns are presumably NaN on that row —
# confirm the downstream steps handle this.
t2_reduced_ICC = pd.concat(
    [
        disc_1_t2_reduced_ICC,
        disc_2_t2_reduced_ICC,
        disc_3_t2_reduced_ICC,
        disc_4_t2_reduced_ICC,
        disc_5_t2_reduced_ICC,
    ],
    axis=1,
)
# Persist the per-disc ICC-filtered table.
t2_reduced_ICC.to_csv(
    root_dir.joinpath(
        "data",
        "mask_perturbation",
        "t2w_improved_params_reduced_ICC_per_disc.csv",
    )
)

### Near-zero variance

In [None]:
import numpy as np

# Feature reduction based on variance thresholding (remove features with variance smaller than 0.05)
from sklearn.feature_selection import VarianceThreshold

# Selector that flags features whose variance is not above 0.05.
selector = VarianceThreshold(threshold=0.05)

# Only the fitted support mask is needed, so the transformed array is not
# requested.
selector.fit(t2_reduced_ICC)
support_mask = selector.get_support()

# Keep the supported columns of the ICC-filtered table.
radiomic_features_var = t2_reduced_ICC.loc[:, support_mask]

# Report how many features were dropped.
n_removed = np.count_nonzero(~support_mask)
print(
    f"{n_removed}/{t2_reduced_ICC.shape[1]} features were removed due to near-zero variance."
)

del selector

### Correlation

In [None]:
# Absolute Spearman correlation between every pair of remaining features.
corr_matrix_var = radiomic_features_var.corr(method="spearman").abs()

n_features = corr_matrix_var.shape[1]
# Every feature starts as "kept".
to_keep = np.full(n_features, True, dtype=bool)

# Greedy pass in column order: a feature that is still kept removes every
# later feature correlated with it at 0.8 or above.
for i in range(n_features):
    if not to_keep[i]:
        continue
    for j in range(i + 1, n_features):
        if corr_matrix_var.iloc[i, j] >= 0.8:
            to_keep[j] = False

# Retain features that are not highly correlated
radiomic_features_corr = radiomic_features_var.iloc[:, to_keep]

n_dropped = np.count_nonzero(~to_keep)
print(
    f"{n_dropped}/{radiomic_features_var.shape[1]} features were removed due to high correlation. {radiomic_features_corr.shape[1]} features remaining."
)

del to_keep, i, j

In [None]:
import seaborn as sns

# Spearman correlation matrix of the ICC-filtered feature set (before the
# variance and correlation filters).
corr_matrix = t2_reduced_ICC.corr(method="spearman")
# Display the correlation matrix on a fixed [-1, 1] colour scale
plt.figure(figsize=(8, 6.5))
sns.heatmap(corr_matrix, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation of Radiomic features")
plt.show()

# Spearman correlation matrix of the reduced feature set
corr_matrix_red = radiomic_features_corr.corr(method="spearman")
# Display the correlation matrix of the reduced set on the same scale
plt.figure(figsize=(8, 6.5))
sns.heatmap(corr_matrix_red, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation of reduced radiomic features")
plt.show()

In [None]:
# List the feature names that survived all three filters.
print(f"Remaining features: {list(radiomic_features_corr.columns)}")

# Clustering

In [None]:
import copy
from scipy.stats import zscore
from scipy.spatial import distance
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, fcluster

# Work on a copy so the reduced feature table itself is not modified.
radiomic_features_clus = copy.deepcopy(radiomic_features_corr)
# Normalize the data (z-score per column)
# NOTE(review): the calls below use `.to_numpy()` and column assignment, so
# zscore is expected to return a DataFrame here — confirm with the
# installed scipy version.
radiomic_features_clus = zscore(radiomic_features_clus, axis=0)

# Calculate and plot the clustergram
# Ward linkage over the rows (samples) ...
row_linkage = hierarchy.linkage(
    distance.pdist(radiomic_features_clus.to_numpy()), method="ward"
)
# ... and over the columns (features).
col_linkage = hierarchy.linkage(
    distance.pdist(radiomic_features_clus.T.to_numpy()), method="ward"
)
g = sns.clustermap(
    radiomic_features_clus,
    row_linkage=row_linkage,
    col_linkage=col_linkage,
    method="ward",
    vmin=-3,
    vmax=3,
    figsize=(8, 10),
    cmap="viridis",
)
# Move the colour bar to the right-hand side of the figure.
g.ax_cbar.set_position((0.90, 0.2, 0.03, 0.3))

# Extract 5 disc degeneration clusters and append the "Clusters" variable to the DataFrame
n_clusters = 5
# Cut the row dendrogram into at most `n_clusters` flat clusters.
radiomic_features_clus["Clusters"] = fcluster(
    row_linkage, n_clusters, criterion="maxclust"
)

# Print the cluster assignments
print("Cluster Assignments:", radiomic_features_clus["Clusters"])

del g, n_clusters

In [None]:
from scipy.stats import chi2_contingency

# Concatenate the target clinical variable to the radiomic DataFrame
# NOTE(review): no module-level `labels` is assigned in the visible cells,
# and the code below expects it to provide a column named "label" —
# confirm where it is defined.
radiomic_features_clus = pd.concat([radiomic_features_clus, labels], axis=1)

# Barplot clusters/grades: grade counts within each cluster
plt.figure(figsize=(8, 8))
sns.countplot(
    x="Clusters", hue="label", data=radiomic_features_clus, palette="coolwarm"
)
plt.title("Distribution of Pfirmann grade in each Cluster")
plt.xlabel("Cluster")
plt.ylabel("Count")
plt.legend(title="Pfirmann grade", loc="upper right")
plt.show()

# Perform chi-squared test on the grade x cluster contingency table
chi2, p, _, _ = chi2_contingency(
    pd.crosstab(radiomic_features_clus["label"], radiomic_features_clus["Clusters"])
)

# Print the results
print(f"Chi-squared statistic: {chi2}")
print(f"P-value: {p}")

# Classification

In [None]:
# Train and test tables, all indexed by the "ID" column. The per-disc cells
# below select rows by the last characters of this index.
train_labels = pd.read_csv(root_dir.joinpath("data", "labels", "train.csv"), index_col="ID")
train_features = pd.read_csv(root_dir.joinpath("data", "features", "scaled_train.csv"), index_col="ID")
test_labels = pd.read_csv(root_dir.joinpath("data", "labels", "test.csv"), index_col="ID")
test_features = pd.read_csv(root_dir.joinpath("data", "features", "scaled_test.csv"), index_col="ID")

In [None]:
def plot_classification_report_5_classes(df, save_fig=False):
    """Draw the first five rows of a classification-report table as a heatmap.

    Args:
        df: Table with one row per class and a "support" column; only the
            first five rows are shown, labelled "1" to "5".
        save_fig: When true, also write the figure to
            ``figures/per_grade_5_levels_v2.pdf``.

    Cell colour follows the values on a 0-1 scale, with "support" pinned to
    0.1 so it stays faint. Cell text shows the real values.
    """

    def float_int(x):
        # Values above 1 are printed as integers, everything else with two
        # decimals.
        if x > 1:
            return str(int(x))
        return f"{x:.2f}"

    per_grade = df.iloc[:5]
    # Copy used only for colouring.
    shading = per_grade.copy()
    shading["support"] = 0.1
    cell_text = [
        [float_int(value) for value in row_values]
        for row_values in per_grade.values.tolist()
    ]

    heatmap = go.Heatmap(
        z=shading.values.tolist(),
        x=per_grade.columns,
        y=[str(grade) for grade in range(1, 6)],
        colorscale=["#ffffff", colors[1]],
        showscale=True,
        xgap=1,
        ygap=1,
        zmin=0,
        zmax=1,
        hoverinfo="none",
        hoverongaps=False,
        text=cell_text,
        texttemplate="%{text}",
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        autosize=False,
        margin=dict(pad=10),
        plot_bgcolor="rgba(0,0,0,0)",
        legend_title_text="Intervertebral Disc",
        grid_rows=1,
        title="Per grade classification results for the five level grading task",
    )
    column_count = len(per_grade.columns)
    fig.update_xaxes(
        automargin=True,
        tickvals=list(range(column_count)),
        ticktext=[column.capitalize() for column in per_grade.columns],
        showgrid=False,
    )
    fig.update_yaxes(automargin=True, title_text="Pfirrmann Grade", showgrid=False)
    fig.update_traces(
        textfont_size=16,
    )
    fig.show()
    if save_fig:
        fig.write_image(root_dir.joinpath("figures", "per_grade_5_levels_v2.pdf"))

In [None]:
def plot_class_prediction_error_5_classes(df, save_fig=False):
    """Draw a bar chart of `df` with the row totals written above the bars.

    Args:
        df: Count table. Per the axis and legend titles used here, rows are
            predicted grades and columns are true grades.
        save_fig: When true, also write the figure to
            ``figures/class_prediction_error_5level_v2.pdf``.
    """
    fig = px.bar(df, title="Class Prediction Error", color_discrete_sequence=colors)

    # Text-only trace carrying the total of each row.
    row_totals = df.sum(axis=1)
    totals_trace = go.Scatter(
        x=df.index,
        y=row_totals,
        mode="text",
        text=row_totals,
        textposition="top center",
        showlegend=False,
    )
    fig.add_trace(totals_trace)

    fig.update_traces(textfont_size=12)
    fig.update_xaxes(title_text="Predicted Pfirrmann Grade")
    fig.update_yaxes(
        title_text="Frequency", showgrid=True, gridcolor="rgba(184, 184, 184, 0.3)"
    )
    fig.update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        legend_title_text="True Pfirrmann Grade",
        grid_rows=1,
    )
    fig.show()
    if save_fig:
        fig.write_image(root_dir.joinpath("figures", "class_prediction_error_5level_v2.pdf"))

In [None]:
def plot_lr_feature_importance(lr_model, feature_names, save_fig=False):
    """Plot the ten largest and ten smallest coefficients of a linear model.

    Args:
        lr_model: Fitted estimator exposing ``coef_``; only its first row
            (``coef_[0]``) is plotted.
        feature_names: Sequence of names, indexable by coefficient position.
        save_fig: When true, also write the figure to
            ``figures/feature_importance_5level.pdf``.

    With 20 coefficients or fewer, every coefficient is shown exactly once.
    Previously the top-10 and bottom-10 slices overlapped in that case and
    the same feature was drawn twice.
    """
    feature_importances = lr_model.coef_[0]

    # Positions sorted by coefficient, largest first.
    sorted_indices = feature_importances.argsort()[::-1]
    if sorted_indices.size > 20:
        sorted_indices = np.concatenate([sorted_indices[:10], sorted_indices[-10:]])
    # Trailing spaces keep the tick labels away from the axis.
    sorted_features = [feature_names[i] + "   " for i in sorted_indices]
    sorted_importances = feature_importances[sorted_indices]

    # Horizontal bars coloured by coefficient value.
    trace = go.Bar(
        x=sorted_importances,
        y=sorted_features,
        orientation='h',
        marker=dict(
            color=sorted_importances,
            colorscale=colors,
            reversescale=True
        ),
        text=sorted_importances.round(2),
        textposition='outside',
        hoverinfo='text',
        opacity=0.8
    )

    layout = go.Layout(
        title='Feature Importance (Top 10 positive and negative)',
        xaxis=dict(title='Importance'),
        yaxis=dict(title='Feature', tickangle=-35),
        width=1000,
        height=600,
        plot_bgcolor="rgba(0,0,0,0)",
        grid_rows=1,
    )

    fig = go.Figure(data=[trace], layout=layout)
    fig.show()
    if save_fig:
        fig.write_image(root_dir.joinpath("figures", "feature_importance_5level.pdf"))

In [None]:
def visual_metrics(
    clf, 
    params, 
    train_features, 
    train_labels, 
    test_features, 
    test_labels, 
    plot_classification_report=plot_classification_report_5_classes,
    plot_class_prediction_error=plot_class_prediction_error_5_classes,
    save_fig=False
):
    """Fit ``reduce_dim -> classifier`` and plot the classifier's coefficients.

    This definition replaces the earlier `visual_metrics` with a different
    signature that is defined higher up in the file.

    Args:
        clf: Unfitted classifier used as the "classifier" step.
        params: Pipeline parameters passed to ``set_params``; may replace
            the default "passthrough" of the "reduce_dim" step.
        train_features: Training feature table (a DataFrame; its column
            names label the plot).
        train_labels: Training targets.
        test_features: Only referenced by the disabled evaluation code.
        test_labels: Only referenced by the disabled evaluation code.
        plot_classification_report: Only referenced by the disabled code.
        plot_class_prediction_error: Only referenced by the disabled code.
        save_fig: Forwarded to `plot_lr_feature_importance`.

    The coefficient plot is drawn only for classifiers that expose
    ``coef_``. Other models (for example tree ensembles) are reported and
    skipped instead of failing with AttributeError.
    """
    pipeline_clf = Pipeline(
        [
            ("reduce_dim", "passthrough"),
            ("classifier", clf),
        ]
    )
    pipeline_clf.set_params(**params).fit(train_features, train_labels)

    classifier = pipeline_clf.named_steps["classifier"]
    if hasattr(classifier, "coef_"):
        feature_names = train_features.columns.values
        n_coefficients = np.atleast_2d(classifier.coef_).shape[1]
        if n_coefficients != len(feature_names):
            # A reduce_dim step changed the feature space, so the input
            # column names no longer describe the coefficients.
            feature_names = np.array(
                [f"reduced_feature_{i}" for i in range(n_coefficients)]
            )
        plot_lr_feature_importance(classifier, feature_names, save_fig)
    else:
        print(
            f"Skipping feature-importance plot: {type(classifier).__name__} has no `coef_`."
        )

    # Test-set evaluation, currently disabled:

    # # Predict the values for the test set
    # y_pred = pipeline_clf.predict(test_features)

    # # Generate a classification report
    # report = classification_report(test_labels, y_pred, output_dict=True)

    # # Convert the report to a DataFrame
    # df_classification_report = pd.DataFrame(report).transpose()

    # test_labels_ = test_labels.reset_index(drop=True)
    # test_labels_["Predicted"] = pd.Series(y_pred, name="Predicted")
    # df_class_prediction_error = test_labels_.astype(int)
    # df_class_prediction_error = df_class_prediction_error.groupby("Predicted").value_counts().unstack()

    # predictions = [y_pred]
    # for image_type in ["original", "log", "wavelet"]:
    #     X_train = train_features.loc[:,train_features.columns.str.contains(image_type) & ~train_features.columns.str.contains("diagnostics")].copy()
    #     X_test = test_features.loc[:,test_features.columns.str.contains(image_type) & ~test_features.columns.str.contains("diagnostics")].copy()
    #     pipeline_clf.set_params(**params).fit(X_train, train_labels)
    #     predictions.append(pipeline_clf.predict(X_test))
    
    # plot_classification_report(df_classification_report, save_fig)
    # plot_class_prediction_error(df_class_prediction_error, save_fig)
    # print(f"Accuracy within one grade: {accuracy_within_one(test_labels['label'], y_pred):0.2f}")
    # print(f"Balanced accuracy: {balanced_accuracy_score(test_labels['label'], y_pred):0.2f}")
    # cac_image_type = CAC(pd.DataFrame(np.vstack(predictions).T))
    # print(cac_image_type.fleiss()["est"])

## All discs

In [None]:
# Logistic regression on all discs together. The "classifier__" keys are
# applied to the pipeline's classifier step; no "reduce_dim" override is
# given, so that step stays "passthrough". Figures are saved.
clf = LogisticRegression()
params = {
    "classifier__solver": "liblinear",
    "classifier__penalty": "l2",
    "classifier__max_iter": 300,
    "classifier__C": 10.0,
}
visual_metrics(clf, params, train_features, train_labels, test_features, test_labels, save_fig=True)

## Per disc

In [None]:
def visual_metrics_per_disc(clf, params, disc, train_features, train_labels, test_features, test_labels):
    """Run `visual_metrics` on the rows that belong to a single disc.

    A row is kept when its index label ends with `disc`; the four inputs are
    filtered independently with that same rule.

    Args:
        clf: Classifier instance, forwarded unchanged to `visual_metrics`.
        params: Parameter dict, forwarded unchanged to `visual_metrics`.
        disc: String suffix identifying the disc (e.g. "1").
        train_features: Training features with a string index.
        train_labels: Training labels with a string index.
        test_features: Test features with a string index.
        test_labels: Test labels with a string index.
    """
    def only_this_disc(table):
        return table.loc[table.index.str.endswith(disc)]

    y_train, X_train, y_test, X_test = (
        only_this_disc(table)
        for table in (train_labels, train_features, test_labels, test_features)
    )
    visual_metrics(clf, params, X_train, y_train, X_test, y_test)

 ### Disc 1

In [None]:
# Disc 1, original grades: logistic regression (SAG solver) with a PCA
# reducer that keeps enough components for 95% of the variance.
disc = "1"
clf = LogisticRegression()
params = dict(
    reduce_dim=PCA(n_components=0.95, random_state=0),
    classifier__solver="sag",
    classifier__penalty="l2",
    classifier__max_iter=200,
    classifier__C=1.0,
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

### Disc 2

In [None]:
# Disc 2, original grades: gradient boosting, no "reduce_dim" override.
disc = "2"
clf = GradientBoostingClassifier()
params = dict(
    classifier__subsample=1.0,
    classifier__n_estimators=200,
    classifier__max_features="sqrt",
    classifier__max_depth=4,
    classifier__learning_rate=0.1,
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

### Disc 3

In [None]:
# Disc 3, original grades: MLP (two hidden layers of 50 units) with mRMR
# feature reduction, K=20.
disc = "3"
clf = MLPClassifier()
params = dict(
    reduce_dim=mRMRFeatureReduction(K=20),
    classifier__solver="adam",
    classifier__learning_rate="adaptive",
    classifier__hidden_layer_sizes=(50, 50),
    classifier__alpha=0.01,
    classifier__activation="relu",
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

### Disc 4

In [None]:
# Disc 4, original grades: gradient boosting with mRMR feature reduction, K=20.
disc = "4"
clf = GradientBoostingClassifier()
params = dict(
    reduce_dim=mRMRFeatureReduction(K=20),
    classifier__subsample=0.7,
    classifier__n_estimators=100,
    classifier__max_features="log2",
    classifier__max_depth=2,
    classifier__learning_rate=0.1,
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

### Disc 5

In [None]:
# Disc 5, original grades: MLP (two hidden layers of 50 units, tanh) with
# mRMR feature reduction, K=20.
disc = "5"
clf = MLPClassifier()
params = dict(
    reduce_dim=mRMRFeatureReduction(K=20),
    classifier__solver="adam",
    classifier__learning_rate="constant",
    classifier__hidden_layer_sizes=(50, 50),
    classifier__alpha=0.01,
    classifier__activation="tanh",
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

## Combining classes 1 and 2

In [None]:
# Relabel every grade-1 entry as grade 2 so that the task has four classes
# ("1 and 2", 3, 4, 5).
# The assignment happens in place on the existing label objects: any cell
# executed after this one (including a re-run of an earlier cell) sees the
# merged labels until the labels are loaded again.
train_labels.loc[(train_labels==1).values] = 2
test_labels.loc[(test_labels==1).values] = 2

In [None]:
def plot_classification_report_4_classes(df, save_fig=False):
    """Show the per-grade classification report of the four-level task as a heatmap.

    Args:
        df: Classification report as a DataFrame with one row per class and a
            "support" column. Only the first four rows are drawn, labelled
            "1 and 2", "3", "4" and "5" in that order.
            NOTE(review): this assumes the first four rows are exactly those
            classes in ascending order -- confirm for splits where a grade
            is missing from the report.
        save_fig: When True, the figure is additionally written to
            figures/per_grade_4_levels_v2.pdf below `root_dir`.
    """
    def cell_text(column, value):
        # Support is a count, every other column is a score. Choosing the
        # format per column (instead of by magnitude) keeps a support of 0
        # or 1 from being printed as "0.00" / "1.00".
        return f"{value:.0f}" if column == "support" else f"{value:.2f}"

    report = df.iloc[:4]

    # Copy used only for the cell colours: support is pinned to a constant so
    # that it does not saturate the 0-1 colour scale shared with the scores.
    shaded = report.copy()
    shaded["support"] = 0.1

    fig = go.Figure(
        data=go.Heatmap(
            z=shaded.values.tolist(),
            x=shaded.columns,
            y=["1 and 2", "3", "4", "5"],
            colorscale=["#ffffff", colors[4]],
            zmin=0,
            zmax=1,
            showscale=True,
            xgap=1,
            ygap=1,
            # The annotations come from the untouched report, not the shaded copy.
            text=[
                [cell_text(column, value) for column, value in zip(report.columns, row)]
                for row in report.values.tolist()
            ],
            texttemplate="%{text}",
        )
    )
    fig.update_layout(
        autosize=False,
        margin=dict(pad=10),  # padding
        plot_bgcolor="rgba(0,0,0,0)",
        legend_title_text="Intervertebral Disc",
        grid_rows=1,
        title="Per grade classification results for the four level grading task",
    )
    fig.update_xaxes(
        automargin=True,
        tickvals=list(range(len(shaded.columns))),
        ticktext=[col.capitalize() for col in shaded.columns],
        showgrid=False,
    )
    fig.update_yaxes(automargin=True, title_text="Pfirrmann Grade", showgrid=False)
    fig.update_traces(
        textfont_size=16,
    )
    fig.show()
    if save_fig:
        fig.write_image(root_dir.joinpath("figures", "per_grade_4_levels_v2.pdf"))

In [None]:
def plot_class_prediction_error_4_classes(df, save_fig=False):
    """Draw the class prediction error bar chart for the four-level grading task.

    Args:
        df: Wide-form table of counts. Its index is used for the x axis
            (titled "Predicted Pfirrmann Grade") and its columns for the bar
            traces (legend titled "True Pfirrmann Grade").
        save_fig: When True, the figure is additionally written to
            figures/class_prediction_error_4level_v2.pdf below `root_dir`.
    """
    grade_names = ["1 and 2", "3", "4", "5"]

    fig = px.bar(df, title="Class Prediction Error", color_discrete_sequence=colors)

    # Only the bar traces exist at this point; they are renamed by position.
    for bar, grade_name in zip(fig.data, grade_names):
        bar.name = grade_name

    # Text-only trace that writes the row total at the height of that total.
    row_totals = df.sum(axis=1)
    totals_label = go.Scatter(
        x=df.index,
        y=row_totals,
        mode="text",
        text=row_totals,
        textposition="top center",
        showlegend=False,
    )
    fig.add_trace(totals_label)
    fig.update_traces(textfont_size=12)

    fig.update_xaxes(
        title_text="Predicted Pfirrmann Grade",
        tickvals=[2, 3, 4, 5],
        ticktext=grade_names,
    )
    fig.update_yaxes(
        title_text="Frequency",
        showgrid=True,
        gridcolor="rgba(184, 184, 184, 0.3)",
    )
    fig.update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        legend_title_text="True Pfirrmann Grade",
        grid_rows=1,
    )

    fig.show()
    if not save_fig:
        return
    fig.write_image(root_dir.joinpath("figures", "class_prediction_error_4level_v2.pdf"))

### All discs

In [None]:
# All discs pooled, merged grades (1 and 2 combined): linear-kernel SVC,
# reported with the four-class plotting functions.
clf = SVC(kernel="linear")
params = dict(classifier__gamma="scale", classifier__C=1)
visual_metrics(
    clf,
    params,
    train_features,
    train_labels,
    test_features,
    test_labels,
    plot_classification_report_4_classes,
    plot_class_prediction_error_4_classes,
    save_fig=True,
)

### Per disc

In [None]:
def visual_metrics_per_disc(clf, params, disc, train_features, train_labels, test_features, test_labels):
    """Run `visual_metrics` with the four-class plots on the rows of a single disc.

    A row is kept when its index label ends with `disc`; the four inputs are
    filtered independently with that same rule. The filtered data is passed
    on together with `plot_classification_report_4_classes` and
    `plot_class_prediction_error_4_classes`.

    Args:
        clf: Classifier instance, forwarded unchanged to `visual_metrics`.
        params: Parameter dict, forwarded unchanged to `visual_metrics`.
        disc: String suffix identifying the disc (e.g. "1").
        train_features: Training features with a string index.
        train_labels: Training labels with a string index.
        test_features: Test features with a string index.
        test_labels: Test labels with a string index.
    """
    def only_this_disc(table):
        return table.loc[table.index.str.endswith(disc)]

    y_train, X_train, y_test, X_test = (
        only_this_disc(table)
        for table in (train_labels, train_features, test_labels, test_features)
    )
    visual_metrics(
        clf,
        params,
        X_train,
        y_train,
        X_test,
        y_test,
        plot_classification_report_4_classes,
        plot_class_prediction_error_4_classes,
    )

#### Disc 1

In [None]:
# Disc 1, merged grades: linear-kernel SVC with a PCA reducer that keeps
# enough components for 95% of the variance.
disc = "1"
clf = SVC(kernel="linear")
params = dict(
    reduce_dim=PCA(n_components=0.95, random_state=0),
    classifier__gamma="scale",
    classifier__C=1,
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

#### Disc 2

In [None]:
# Disc 2, merged grades: gradient boosting, no "reduce_dim" override.
disc = "2"
clf = GradientBoostingClassifier()
params = dict(
    classifier__subsample=1.0,
    classifier__n_estimators=500,
    classifier__max_features="sqrt",
    classifier__max_depth=2,
    classifier__learning_rate=0.2,
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

#### Disc 3

In [None]:
# Disc 3, merged grades: MLP (two hidden layers of 50 units) with mRMR
# feature reduction, K=20.
disc = "3"
clf = MLPClassifier()
params = dict(
    reduce_dim=mRMRFeatureReduction(K=20),
    classifier__solver="adam",
    classifier__learning_rate="constant",
    classifier__hidden_layer_sizes=(50, 50),
    classifier__alpha=0.0001,
    classifier__activation="relu",
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

#### Disc 4

In [None]:
# Disc 4, merged grades: gradient boosting with mRMR feature reduction, K=10.
disc = "4"
clf = GradientBoostingClassifier()
params = dict(
    reduce_dim=mRMRFeatureReduction(K=10),
    classifier__subsample=0.9,
    classifier__n_estimators=500,
    classifier__max_features="sqrt",
    classifier__max_depth=4,
    classifier__learning_rate=0.05,
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)

#### Disc 5

In [None]:
# Disc 5, merged grades: MLP (one hidden layer of 50 units, tanh) with a PCA
# reducer that keeps enough components for 99% of the variance.
disc = "5"
clf = MLPClassifier()
params = dict(
    reduce_dim=PCA(n_components=0.99, random_state=0),
    classifier__solver="adam",
    classifier__learning_rate="invscaling",
    classifier__hidden_layer_sizes=(50,),
    classifier__alpha=0.0001,
    classifier__activation="tanh",
)
visual_metrics_per_disc(
    clf=clf,
    params=params,
    disc=disc,
    train_features=train_features,
    train_labels=train_labels,
    test_features=test_features,
    test_labels=test_labels,
)