In [26]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, accuracy_score,balanced_accuracy_score
from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Select the matplotlib "seaborn-v0_8-paper" style sheet for all figures.
plt.style.use("seaborn-v0_8-paper")
import seaborn as sns

# Apply seaborn's theme with the "paper" context and unscaled fonts.
# NOTE(review): set_theme also applies seaborn's own default style, which may
# override parts of the style sheet selected above -- confirm the intended look.
sns.set_theme(context="paper", font_scale=1)

In [27]:
# ----------------------------------------- Configuration -----------------------------------------
# Every path below is relative, so it resolves against the notebook's working directory.
BASE_DIR = "../../dataset/"

# Balancing strategy. It selects the feature sub-directory and tags every output file,
# so it is defined once here instead of being repeated as a literal.
# NOTE(review): assumes the sub-directories of features/balanced/ are named after the
# balancing type -- confirm before switching to another value.
BALANCING_TYPE = "both"

# features
FEATURES_BASE = "../../features/"
FEATURES = FEATURES_BASE + f"balanced/{BALANCING_TYPE}/"

# Models
# NOTE: a later cell rebinds MODELS to a dict of classifiers. MODELS_RESULTS is derived
# right here, before that happens, so it keeps pointing at the results folder.
MODELS = "../../models/"
MODELS_RESULTS = MODELS + "results/"

# report
PAPER = "../../paper/"
IMAGES_PATH = PAPER + "images/"

# HYPERPARAMETERS
SEED = 42  # passed as random_state to every classifier
INTERVAL = 2  # appears as the "<INTERVAL>s" tag in the features file name

# Output artifacts, all tagged with the balancing type.
RESULT_NAME = f"results_models_comparison_best_features_{BALANCING_TYPE}.csv"
CM_ARTIFACT_NAME = f"confusion_matrix_models_comparison_artifact_recognition_{BALANCING_TYPE}.npy"
CM_DISEASE_NAME = (
    f"confusion_matrix_models_comparison_disease_recognition_{BALANCING_TYPE}.npy"
)

In [28]:
# -----------------------------------------Constants-----------------------------------------
full_data_dict_keys = ["artifacts", "extrahls", "murmurs", "normals", "extrastoles"]
interval = INTERVAL  # INTERVAL is defined in the configuration cell
sample_rates = [4000]
# Keys are human-readable labels; only the values are used, as tags in the features file name.
num_feats = {
    "30 MFCC": "25mfcc",
    "12  Chroma": "12chroma",
    "70 CQT": "1cqt",
    "40 RMS": "0rms",
    "40 Zero Crossing Rates": "1zcr",
    "40 Spectral Centroid": "0sc",
    "60 Spectral Bandwidth": "0sb",
    "40 Spectral Rolloff": "0sr",
}

# File name: fixed prefix + interval + sample-rate tag + feature tags joined in dict order.
COMPLETE_DATA_PRIOR_CORR_NAME = (
    f"full_data_filtered_{INTERVAL}s_4000hz_" + "_".join(num_feats.values()) + ".npy"
)

# load the data
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only load
# feature files produced by this project.
full_data = np.load(FEATURES + COMPLETE_DATA_PRIOR_CORR_NAME, allow_pickle=True).item()
n_cols = full_data["train"]["X"].shape[1]

# Extract data from the dictionary
X_train = full_data["train"]["X"]
X_test = full_data["test"]["X"]

# Binarize the labels for artifact recognition: 0 stays 0, every other class becomes 1.
# Presumably label 0 is "artifacts" (first entry of full_data_dict_keys) -- confirm.
# Work on copies so the arrays stored inside full_data keep their original labels.
y_train = full_data["train"]["y"].copy()
y_test = full_data["test"]["y"].copy()
y_train[y_train != 0] = 1
y_test[y_test != 0] = 1


### Artifact recognition model

In [29]:
# ----------------------------------------- Models and metrics to compare -----------------------------------------
# NOTE: this rebinds MODELS, which the configuration cell defined as the models
# directory path. MODELS_RESULTS was derived before this point and is unaffected.
MODELS = {
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'XGBoost': XGBClassifier(random_state=SEED),
    'MLP': MLPClassifier(hidden_layer_sizes=(128, 64, 32,), activation='relu', solver='adam', random_state=SEED),
    'CatBoost': CatBoostClassifier(random_state=SEED, verbose=0),
}

# Metric name -> scoring function. Names containing "Macro" are called with average='macro'.
METRICS = {
    "Test Acc": accuracy_score,
    "Macro F1": f1_score,
    "Balanced Accuracy": balanced_accuracy_score,
}

# The disease-recognition cell later writes its scores to RESULT_NAME, so the artifact
# scores get their own file (named like CM_ARTIFACT_NAME) and are not overwritten.
ARTIFACT_RESULT_NAME = f"results_models_comparison_artifact_recognition_{BALANCING_TYPE}.csv"

# ----------------------------------------- Create DataFrame to store results-----------------------------------------
# One row per model, one column per metric.
result_df = pd.DataFrame(columns=list(METRICS.keys()), index=list(MODELS.keys()))
cm_dict = {}  # model name -> confusion matrix on the test split

# ----------------------------------------- Loop over each model-----------------------------------------
for model_name, clf in MODELS.items():
    print(f"\nTraining {model_name}")

    # fit the model
    clf.fit(X_train, y_train)

    # Predict once per model; every metric and the confusion matrix reuse these predictions.
    y_pred = clf.predict(X_test)

    for metric, funct in METRICS.items():
        print(f"\tCalculating {metric}")

        if "Macro" in metric:
            result_df.loc[model_name, metric] = funct(y_test, y_pred, average='macro')
        else:
            result_df.loc[model_name, metric] = funct(y_test, y_pred)

    cm_dict[model_name] = confusion_matrix(y_test, y_pred)

# ----------------------------------------- Save the results-----------------------------------------
result_df.to_csv(MODELS_RESULTS + ARTIFACT_RESULT_NAME)
np.save(MODELS_RESULTS + CM_ARTIFACT_NAME, cm_dict)
result_df


Training Random Forest
	Calculating Test Acc
	Calculating Macro F1
	Calculating Balanced Accuracy

Training XGBoost
	Calculating Test Acc
	Calculating Macro F1
	Calculating Balanced Accuracy

Training MLP
	Calculating Test Acc
	Calculating Macro F1
	Calculating Balanced Accuracy

Training CatBoost
	Calculating Test Acc
	Calculating Macro F1
	Calculating Balanced Accuracy


Unnamed: 0,Test Acc,Macro F1,Balanced Accuracy
Random Forest,0.985915,0.977451,0.964912
XGBoost,0.985915,0.977451,0.964912
MLP,0.984155,0.97472,0.963811
CatBoost,0.987676,0.980338,0.969298


In [30]:
# Fit a fresh CatBoost classifier for the artifact task; CatBoost has the highest score
# on all three metrics in the comparison table above. This is a separate instance from
# MODELS['CatBoost'], whose estimator object is fitted again in the disease section.
# NOTE(review): relies on X_train / y_train still holding the binarized labels from the
# data-loading cell -- run the cells in order.
best_artifact_model = CatBoostClassifier(random_state=SEED, verbose=0)
best_artifact_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7e4cdcf7ee70>

## Heart disease prediction

In [31]:
# Reload the data from disk so the labels are the original multi-class ones again
# (the artifact section worked with labels collapsed to 0 / 1).
full_data = np.load(FEATURES + COMPLETE_DATA_PRIOR_CORR_NAME, allow_pickle=True).item()
n_cols = full_data["train"]["X"].shape[1]

# Extract data from the dictionary; both label vectors are flattened to 1-D so the
# boolean masks below always index along the sample axis.
X_train = full_data["train"]["X"]
y_train = full_data["train"]["y"].reshape(-1)
X_test = full_data["test"]["X"]
y_test = full_data["test"]["y"].reshape(-1)

# Drop every sample labelled 0 (the class the artifact model separates out).
train_mask = y_train != 0
test_mask = y_test != 0

X_train = X_train[train_mask]
y_train = y_train[train_mask]
X_test = X_test[test_mask]
y_test = y_test[test_mask]

# Shift the remaining labels down by one so they start at 0.
y_train = y_train - 1
y_test = y_test - 1

# One row per model, one column per metric.
result_df = pd.DataFrame(columns=list(METRICS.keys()), index=list(MODELS.keys()))
cm_dict = {}  # model name -> confusion matrix on the filtered test split

# ----------------------------------------- Loop over each model-----------------------------------------
for model_name, clf in MODELS.items():
    print(f"\nTraining {model_name}")

    # fit the model
    clf.fit(X_train, y_train)

    # Predict once per model; every metric and the confusion matrix reuse these predictions.
    y_pred = clf.predict(X_test)

    for metric, funct in METRICS.items():
        print(f"\tCalculating {metric}")

        if "Macro" in metric:
            result_df.loc[model_name, metric] = funct(y_test, y_pred, average="macro")
        else:
            result_df.loc[model_name, metric] = funct(y_test, y_pred)

    cm_dict[model_name] = confusion_matrix(y_test, y_pred)

# ----------------------------------------- Save the results-----------------------------------------
result_df.to_csv(MODELS_RESULTS + RESULT_NAME)
np.save(MODELS_RESULTS + CM_DISEASE_NAME, cm_dict)
result_df


Training Random Forest
	Calculating Test Acc
	Calculating Macro F1
	Calculating Balanced Accuracy

Training XGBoost
	Calculating Test Acc
	Calculating Macro F1
	Calculating Balanced Accuracy

Training MLP
	Calculating Test Acc
	Calculating Macro F1
	Calculating Balanced Accuracy

Training CatBoost
	Calculating Test Acc
	Calculating Macro F1
	Calculating Balanced Accuracy


Unnamed: 0,Test Acc,Macro F1,Balanced Accuracy
Random Forest,0.843612,0.844835,0.816424
XGBoost,0.854626,0.857045,0.838387
MLP,0.861233,0.86682,0.865041
CatBoost,0.865639,0.870926,0.848684


In [32]:
# Fit a fresh CatBoost classifier for the disease task. In the table above CatBoost has
# the highest Test Acc and Macro F1, while MLP has the higher Balanced Accuracy.
# NOTE(review): relies on X_train / y_train being the filtered, shifted split built in
# the cell above -- run the cells in order.
best_disease_model = CatBoostClassifier(random_state=SEED, verbose=0)
best_disease_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7e4cddf52120>

## Simple solution

In [33]:
def predict(artifact_model, disease_model, X):
    """Combine the artifact and disease classifiers into one label vector.

    Both models are run on every row of ``X``. The disease prediction is shifted
    up by one and multiplied by the artifact prediction, so a row whose artifact
    prediction is 0 gets label 0 and every other row gets ``disease class + 1``.

    Parameters
    ----------
    artifact_model : fitted estimator
        Its ``predict(X)`` must return 0 / 1 per row (0 = artifact, 1 = not an artifact).
    disease_model : fitted estimator
        Its ``predict(X)`` must return an integer class label per row, starting at 0.
    X : array-like
        Feature matrix passed unchanged to both models.

    Returns
    -------
    numpy.ndarray
        1-D array with one combined label per row of ``X``.
    """
    # Some estimators return an (n, 1) column instead of a flat (n,) vector. Ravel BOTH
    # outputs: multiplying a column by a flat vector would broadcast to an (n, n) matrix.
    artifact_pred = np.ravel(artifact_model.predict(X))
    disease_pred = np.ravel(disease_model.predict(X))

    # Shift disease classes to start at 1, then zero out the rows predicted as artifacts.
    y_pred = (disease_pred + 1) * artifact_pred

    print(f"artifact_pred shape: {artifact_pred.shape}")
    print(f"disease_pred shape: {disease_pred.shape}")
    print(f"y_pred shape: {y_pred.shape}")

    return y_pred


# Reload the split from disk so the labels are the original multi-class ones
# (earlier cells filtered and shifted the in-memory copies).
full_data = np.load(FEATURES + COMPLETE_DATA_PRIOR_CORR_NAME, allow_pickle=True).item()

X_train, y_train = full_data["train"]["X"], full_data["train"]["y"]
X_test, y_test = full_data["test"]["X"], full_data["test"]["y"]
n_cols = X_train.shape[1]

# Combined prediction: label 0 where the artifact model predicts 0, else disease class + 1.
y_pred = predict(best_artifact_model, best_disease_model, X_test)

# Score the combined prediction against the full multi-class test labels.
f1, accuracy, balanced_accuracy = (
    f1_score(y_test, y_pred, average="macro"),
    accuracy_score(y_test, y_pred),
    balanced_accuracy_score(y_test, y_pred),
)

print(
    f"F1 Score: {f1}",
    f"Accuracy Score: {accuracy}",
    f"Balanced Accuracy Score: {balanced_accuracy}",
    sep="\n",
)

artifact_pred shape: (568,)
disease_pred shape: (568,)
y_pred shape: (568,)
F1 Score: 0.8845618939562814
Accuracy Score: 0.8802816901408451
Balanced Accuracy Score: 0.8666662415234828


## Using a third model to combine the predictions

In [34]:
# Reload the split from disk so the labels are the original multi-class ones.
full_data = np.load(FEATURES + COMPLETE_DATA_PRIOR_CORR_NAME, allow_pickle=True).item()

X_train, y_train = full_data["train"]["X"], full_data["train"]["y"]
X_test, y_test = full_data["test"]["X"], full_data["test"]["y"]
n_cols = X_train.shape[1]

# Base-model predictions on both splits.
# NOTE(review): the training-side predictions come from models that were fitted on
# (part of) this same training data, so they are in-sample predictions.
pred_artifact_train = best_artifact_model.predict(X_train)
pred_disease_train = best_disease_model.predict(X_train)
pred_artifact_test = best_artifact_model.predict(X_test)
pred_disease_test = best_disease_model.predict(X_test)

# New two-column feature set: [artifact prediction, disease prediction] per sample.
X_train_new = np.column_stack(
    (pred_artifact_train.reshape(-1), pred_disease_train.reshape(-1))
)
X_test_new = np.column_stack(
    (pred_artifact_test.reshape(-1), pred_disease_test.reshape(-1))
)

# Train the final model on the stacked predictions against the multi-class labels.
last_model = CatBoostClassifier(random_state=SEED, verbose=0)
last_model.fit(X_train_new, y_train)

# Predict using the final model
y_pred = last_model.predict(X_test_new)

# Calculate and print metrics
f1, accuracy, balanced_accuracy = (
    f1_score(y_test, y_pred, average="macro"),
    accuracy_score(y_test, y_pred),
    balanced_accuracy_score(y_test, y_pred),
)

print(
    f"F1 Score: {f1}",
    f"Accuracy Score: {accuracy}",
    f"Balanced Accuracy Score: {balanced_accuracy}",
    sep="\n",
)

F1 Score: 0.8845618939562814
Accuracy Score: 0.8802816901408451
Balanced Accuracy Score: 0.8666662415234828
