In [1]:
import os
from typing import Union

import numpy as np
import scipy.io
from scipy.stats import mode
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, \
    confusion_matrix
from sklearn.svm import SVC

In [10]:
train_path = "CoordinateData\\train"
validation_path = "CoordinateData\\validation"

In [11]:
def segment_data(data: np.ndarray, window_size: int = 180, overlap_ratio: float = 0.75, by_type: bool = True, min_frame: int = 12) -> np.ndarray:
    """
    Segment the input data into smaller windows based on the given parameters.

    Args:
        data (np.ndarray): Input data with shape (num_samples, num_features).
        window_size (int, optional): The length of each window. Defaults to 180.
        overlap_ratio (float, optional): The ratio of overlap between consecutive windows. Defaults to 0.75.
        by_type (bool, optional): Whether to segment the data by type (assuming the 71st feature is the type). Defaults to True.
        min_frame (int, optional): The minimum number of frames required to create a new instance. Defaults to 12.

    Returns:
        np.ndarray: The segmented data with shape (num_windows, window_size, num_features).
    """

    # Check the input constraints
    assert data.shape[0] > 0
    assert window_size > 0
    assert 0 <= overlap_ratio < 1
    assert 0 <= min_frame < window_size

    dim = data.shape[1]
    instances = []

    if not by_type:
        instances.append(data)
    else:
        assert data.shape[1] >= 71

        num_data = data.shape[0]
        left, right = 0, 1
        pre_type = -1
        cur_type = data[left, 70]

        # Segment the data by exercise type
        while right < num_data:
            if data[right, 70] == cur_type:
                right += 1
                continue

            if right - left <= min_frame:
                left = right
                cur_type = data[left, 70]
                right += 1
                continue

            new_instance = np.take(data, range(left, right), axis=0)
            if pre_type == new_instance[0, 70]:
                instances[-1] = np.vstack([instances[-1], new_instance])
            else:
                instances.append(new_instance)

            left = right
            pre_type = cur_type
            cur_type = data[left, 70]
            right += 1

        # Handle the remaining data
        new_instance = np.take(data, range(left, right), axis=0)
        last = instances[-1]
        if last[0, 70] == new_instance[0, 70]:
            instances[-1] = np.vstack([last, new_instance])
        else:
            instances.append(new_instance)

    # print(len(instances))

    step_size = int(window_size * (1 - overlap_ratio))
    windows = []

    # Create windows for each instance
    for instance in instances:
        if instance.shape[0] < window_size:
            instance = np.vstack([instance, np.zeros((window_size - instance.shape[0], dim))])
            windows.append(instance)
            continue

        if (instance.shape[0] - window_size) % step_size != 0:
            pad_size = step_size - (instance.shape[0] - window_size) % step_size
            instance = np.vstack([instance, np.zeros((pad_size, dim))])

        for i in range(0, instance.shape[0] - window_size + 1, step_size):
            windows.append(np.take(instance, range(i, i + window_size), axis=0))

    return np.array(windows)

In [2]:
def ensemble_predict(y_predicts: np.ndarray, scores: np.ndarray, ensemble_method: int = 0) -> np.ndarray:
    """
    Perform ensemble prediction using different methods based on multiple models' predictions and their confusion matrices.

    Parameters:
    y_predicts (np.ndarray): A 3D array of shape (num_modalities, num_windows, 2) containing prediction probabilities for each modality.
    scores (np.ndarray): A 2D array of shape (num_modalities, 4) containing metrics (accuracy, precision, recall, and F1-score) for each modality.
    ensemble_method (int): An integer (0-7) indicating the ensemble method to be used. Default is 0 (Simple Average).

    Returns:
    np.ndarray: A 1D array of shape (num_windows,) containing the ensemble predictions for each window.
    """

    # Check input assertions
    assert y_predicts.shape[0] > 1
    assert y_predicts.shape[2] == 2
    assert y_predicts.shape[0] == scores.shape[0]
    assert 0 <= ensemble_method < 8

    num_modalities = y_predicts.shape[0]
    num_windows = y_predicts.shape[1]
    print(f'{num_windows} data are predicted using combined results of {num_modalities} modalities')

    # Perform ensemble prediction based on the selected method
    if ensemble_method == 0:                                            # Simple Average
        final_predict = np.argmax(np.mean(y_predicts, axis=0), axis=1)
    elif ensemble_method in [1, 2, 3, 4]:                               # Weighted Average (using different metrics)
        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []

        for m in scores:
            accuracy, precision, recall, f1 = m.ravel()

            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            f1_scores.append(f1)

        # Normalize the metrics
        accuracies = np.array(accuracies) / np.sum(accuracies)
        precisions = np.array(precisions) if np.sum(precisions) == 0 else np.array(precisions) / np.sum(precisions)
        recalls = np.array(recalls) if np.sum(recalls) == 0 else np.array(recalls) / np.sum(recalls)
        f1_scores = np.array(f1_scores) if np.sum(f1_scores) == 0 else np.array(f1_scores) / np.sum(f1_scores)

        metrics = {
                1: accuracies,
                2: precisions,
                3: recalls,
                4: f1_scores
        }
        # Choose the appropriate metric for weighted averaging
        weights = metrics[ensemble_method]

        # Perform weighted averaging
        y_predicts = weights[:, np.newaxis, np.newaxis] * y_predicts
        final_predict = np.argmax(np.sum(y_predicts, axis=0), axis=1)
    elif ensemble_method == 5:                                          # Maximum Rule
        final_predict = np.argmax(np.max(y_predicts, axis=0), axis=1)
    elif ensemble_method == 6:                                          # Product Rule
        final_predict = np.argmax(np.prod(y_predicts, axis=0), axis=1)
    else:                                                               # Voting
        final_predict, _ = mode(np.argmax(y_predicts, axis=2), axis=0)
        final_predict = final_predict.flatten()

    return final_predict

In [13]:
def load_data(path: str, downsampling: bool, concat: bool, seg_parameters: dict = None) -> (Union[list, np.ndarray], Union[list, np.ndarray]):
    X_list = []
    y_list = []

    selected_data_list = []

    # Iterate through the files in the provided path
    for file in os.listdir(path):
        mat = scipy.io.loadmat(os.path.join(path, file))

        # If downsampling is True, ignore data with only one unique value in column 72
        if downsampling and np.unique(mat['data'][:, 72]).size == 1:
            continue
        else:
            selected_data_list.append(mat['data'])

    # Process and segment the selected data files
    for data in selected_data_list:

        # If seg_parameters are provided, segment the data using those parameters
        processed_data = segment_data(data, **seg_parameters) if seg_parameters else segment_data(data)

        # Extract feature data (columns 0 to 69) and labels (column 72)
        X_segmented = processed_data[:, :, 0:70]
        y_segmented = processed_data[:, :, 72]
        y_segmented = np.apply_along_axis(lambda x: mode(x)[0], 1, y_segmented)

        X_list.append(X_segmented)
        y_list.append(y_segmented.flatten())

    # If concat is True, concatenate the lists into numpy arrays
    if concat:
        return np.concatenate(X_list, axis=0), np.concatenate(y_list, axis=0)

    return X_list, y_list

The code below feeds coordinate information as a modality to the Random Forest for training and provides surface electromyography data to the SVM for training. The two models make predictions separately and then perform fusion. Additionally, the code attempts various fusion methods.

In [14]:
def rf_svm_late_fusion(SVC_parameters: dict, ensemble_method: int, num_validation_indices: int = 2) -> None:
    assert 0 <= ensemble_method < 8

    # Calculate evaluation metrics for the given true and predicted labels
    def cal_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> list:
        ac = accuracy_score(y_true, y_pred)
        pr = precision_score(y_true, y_pred, average='macro', zero_division=0)
        re = recall_score(y_true, y_pred, average='macro')
        f1 = f1_score(y_true, y_pred, average='macro')
        return [ac, pr, re, f1]

    # Load training data
    X, y = load_data(path=train_path, downsampling=True, concat=False)

    rf_scores = []
    svm_scores = []

    # Calculate metrics if ensemble method is weighted averaging (1 to 4)
    if 1 <= ensemble_method <= 4:
        validation_indices = np.random.choice(range(len(X)), num_validation_indices, replace=False)
        for i in validation_indices:
            X_t = np.concatenate(X[:i] + X[i+1:], axis=0)
            X_t_rf = np.take(X_t, range(0, 66), axis=2).reshape(X_t.shape[0], -1)
            X_t_svm = np.take(X_t, range(66, 70), axis=2).reshape(X_t.shape[0], -1)
            y_t = np.concatenate(y[:i] + y[i+1:])

            X_v = X[i]
            X_v_rf = np.take(X_v, range(0, 66), axis=2).reshape(X_v.shape[0], -1)
            X_v_svm = np.take(X_v, range(66, 70), axis=2).reshape(X_v.shape[0], -1)
            y_v = y[i]

            rf = RandomForestClassifier(random_state=42)
            rf.fit(X_t_rf, y_t)

            svm = SVC(**SVC_parameters)
            svm.fit(X_t_svm, y_t)

            rf_pred = rf.predict(X_v_rf)
            svm_pred = svm.predict(X_v_svm)

            rf_scores.append(cal_metrics(y_v, rf_pred))
            svm_scores.append(cal_metrics(y_v, svm_pred))

        rf_scores = np.mean(rf_scores, axis=0)
        svm_scores = np.mean(svm_scores, axis=0)

        print(rf_scores, svm_scores)

    X, y = np.concatenate(X, axis=0), np.concatenate(y, axis=0)
    X_rf = np.take(X, range(0, 66), axis=2).reshape(X.shape[0], -1)
    X_svm = np.take(X, range(66, 70), axis=2).reshape(X.shape[0], -1)

    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_rf, y)

    svm = SVC(**SVC_parameters)
    svm.fit(X_svm, y)

    X_v, y_v = load_data(path=validation_path, downsampling=False, concat=True, seg_parameters={'min_frame': 0})
    X_v_rf = np.take(X_v, range(0, 66), axis=2).reshape(X_v.shape[0], -1)
    X_v_svm = np.take(X_v, range(66, 70), axis=2).reshape(X_v.shape[0], -1)

    rf_pred = rf.predict_proba(X_v_rf)
    print(classification_report(y_v, np.argmax(rf_pred, axis=1)))
    svm_pred = svm.predict_proba(X_v_svm)
    print(classification_report(y_v, np.argmax(svm_pred, axis=1)))

    final_pred = ensemble_predict(np.array([rf_pred, svm_pred]), np.array([rf_scores, svm_scores]), ensemble_method)

    print(confusion_matrix(y_v, final_pred))
    print(classification_report(y_v, final_pred))

In [18]:
rf_svm_late_fusion({'kernel': 'rbf', 'gamma': 'auto', 'probability': True, 'random_state': 42}, 5)

              precision    recall  f1-score   support

         0.0       0.94      0.73      0.83      2706
         1.0       0.06      0.28      0.10       166

    accuracy                           0.71      2872
   macro avg       0.50      0.51      0.46      2872
weighted avg       0.89      0.71      0.78      2872

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.48      0.48      0.48       166

    accuracy                           0.94      2872
   macro avg       0.72      0.72      0.72      2872
weighted avg       0.94      0.94      0.94      2872

2872 data are predicted using combined results of 2 modalities
[[2600  106]
 [ 103   63]]
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      2706
         1.0       0.37      0.38      0.38       166

    accuracy                           0.93      2872
   macro avg       0.67      0.67     

In [19]:
rf_svm_late_fusion({'kernel': 'rbf', 'gamma': 'auto', 'probability': True, 'random_state': 42}, 1, 3)

[0.89588407 0.79132017 0.73788667 0.75598024] [0.88534111 0.44267056 0.5        0.46817063]
              precision    recall  f1-score   support

         0.0       0.94      0.73      0.83      2706
         1.0       0.06      0.28      0.10       166

    accuracy                           0.71      2872
   macro avg       0.50      0.51      0.46      2872
weighted avg       0.89      0.71      0.78      2872

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.48      0.48      0.48       166

    accuracy                           0.94      2872
   macro avg       0.72      0.72      0.72      2872
weighted avg       0.94      0.94      0.94      2872

2872 data are predicted using combined results of 2 modalities
[[2600  106]
 [ 103   63]]
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      2706
         1.0       0.37      0.38      0.38       166



In [20]:
rf_svm_late_fusion({'kernel': 'rbf', 'gamma': 'auto', 'probability': True, 'random_state': 42}, 2, 3)

[0.80829213 0.67149338 0.6391855  0.64771677] [0.83922226 0.41961113 0.5        0.45514884]
              precision    recall  f1-score   support

         0.0       0.94      0.73      0.83      2706
         1.0       0.06      0.28      0.10       166

    accuracy                           0.71      2872
   macro avg       0.50      0.51      0.46      2872
weighted avg       0.89      0.71      0.78      2872

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.48      0.48      0.48       166

    accuracy                           0.94      2872
   macro avg       0.72      0.72      0.72      2872
weighted avg       0.94      0.94      0.94      2872

2872 data are predicted using combined results of 2 modalities
[[2392  314]
 [ 109   57]]
              precision    recall  f1-score   support

         0.0       0.96      0.88      0.92      2706
         1.0       0.15      0.34      0.21       166



In [21]:
rf_svm_late_fusion({'kernel': 'rbf', 'gamma': 'auto', 'probability': True, 'random_state': 42}, 3, 3)

[0.9521202  0.91784361 0.73905229 0.76893646] [0.92210324 0.46105162 0.5        0.47965816]
              precision    recall  f1-score   support

         0.0       0.94      0.73      0.83      2706
         1.0       0.06      0.28      0.10       166

    accuracy                           0.71      2872
   macro avg       0.50      0.51      0.46      2872
weighted avg       0.89      0.71      0.78      2872

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.48      0.48      0.48       166

    accuracy                           0.94      2872
   macro avg       0.72      0.72      0.72      2872
weighted avg       0.94      0.94      0.94      2872

2872 data are predicted using combined results of 2 modalities
[[2472  234]
 [ 107   59]]
              precision    recall  f1-score   support

         0.0       0.96      0.91      0.94      2706
         1.0       0.20      0.36      0.26       166



In [22]:
rf_svm_late_fusion({'kernel': 'rbf', 'gamma': 'auto', 'probability': True, 'random_state': 42}, 4, 3)

[0.77580811 0.72715685 0.58813526 0.59629894] [0.81865369 0.40932684 0.5        0.44519793]
              precision    recall  f1-score   support

         0.0       0.94      0.73      0.83      2706
         1.0       0.06      0.28      0.10       166

    accuracy                           0.71      2872
   macro avg       0.50      0.51      0.46      2872
weighted avg       0.89      0.71      0.78      2872

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.48      0.48      0.48       166

    accuracy                           0.94      2872
   macro avg       0.72      0.72      0.72      2872
weighted avg       0.94      0.94      0.94      2872

2872 data are predicted using combined results of 2 modalities
[[2517  189]
 [ 104   62]]
              precision    recall  f1-score   support

         0.0       0.96      0.93      0.94      2706
         1.0       0.25      0.37      0.30       166



Since the performance of the Random Forest was found to be poor, the following code replaces the Random Forest Classifier with the AdaBoostClassifier for experimentation.

In [15]:
def ada_svm_late_fusion(SVC_parameters: dict, ensemble_method: int, num_validation_indices: int = 2) -> None:
    assert 0 <= ensemble_method < 8

    # Calculate evaluation metrics for the given true and predicted labels
    def cal_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> list:
        ac = accuracy_score(y_true, y_pred)
        pr = precision_score(y_true, y_pred, average='macro', zero_division=0)
        re = recall_score(y_true, y_pred, average='macro')
        f1 = f1_score(y_true, y_pred, average='macro')
        return [ac, pr, re, f1]

    # Load training data
    X, y = load_data(path=train_path, downsampling=True, concat=False)

    ada_scores = []
    svm_scores = []

    # Calculate metrics if ensemble method is weighted averaging (1 to 4)
    if 1 <= ensemble_method <= 4:
        validation_indices = np.random.choice(range(len(X)), num_validation_indices, replace=False)
        for i in validation_indices:
            X_t = np.concatenate(X[:i] + X[i+1:], axis=0)
            X_t_ada = np.take(X_t, range(0, 66), axis=2).reshape(X_t.shape[0], -1)
            X_t_svm = np.take(X_t, range(66, 70), axis=2).reshape(X_t.shape[0], -1)
            y_t = np.concatenate(y[:i] + y[i+1:])

            X_v = X[i]
            X_v_ada = np.take(X_v, range(0, 66), axis=2).reshape(X_v.shape[0], -1)
            X_v_svm = np.take(X_v, range(66, 70), axis=2).reshape(X_v.shape[0], -1)
            y_v = y[i]

            ada = AdaBoostClassifier(random_state=42)
            ada.fit(X_t_ada, y_t)

            svm = SVC(**SVC_parameters)
            svm.fit(X_t_svm, y_t)

            ada_pred = ada.predict(X_v_ada)
            svm_pred = svm.predict(X_v_svm)

            ada_scores.append(cal_metrics(y_v, ada_pred))
            svm_scores.append(cal_metrics(y_v, svm_pred))

        ada_scores = np.mean(ada_scores, axis=0)
        svm_scores = np.mean(svm_scores, axis=0)

        print(ada_scores, svm_scores)

    X, y = np.concatenate(X, axis=0), np.concatenate(y, axis=0)
    X_ada = np.take(X, range(0, 66), axis=2).reshape(X.shape[0], -1)
    X_svm = np.take(X, range(66, 70), axis=2).reshape(X.shape[0], -1)

    ada = AdaBoostClassifier(random_state=42)
    ada.fit(X_ada, y)

    svm = SVC(**SVC_parameters)
    svm.fit(X_svm, y)

    X_v, y_v = load_data(path=validation_path, downsampling=False, concat=True, seg_parameters={'min_frame': 0})
    X_v_ada = np.take(X_v, range(0, 66), axis=2).reshape(X_v.shape[0], -1)
    X_v_svm = np.take(X_v, range(66, 70), axis=2).reshape(X_v.shape[0], -1)

    ada_pred = ada.predict_proba(X_v_ada)
    print(classification_report(y_v, np.argmax(ada_pred, axis=1)))
    svm_pred = svm.predict_proba(X_v_svm)
    print(classification_report(y_v, np.argmax(svm_pred, axis=1)))

    final_pred = ensemble_predict(np.array([ada_pred, svm_pred]), np.array([ada_scores, svm_scores]), ensemble_method)

    print(confusion_matrix(y_v, final_pred))
    print(classification_report(y_v, final_pred))

The following function calls might take a long time to run.

In [16]:
ada_svm_late_fusion({'kernel': 'rbf', 'gamma': 'auto', 'probability': True, 'random_state': 42}, 5)

              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95      2706
         1.0       0.08      0.07      0.07       166

    accuracy                           0.90      2872
   macro avg       0.51      0.51      0.51      2872
weighted avg       0.89      0.90      0.90      2872

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.48      0.48      0.48       166

    accuracy                           0.94      2872
   macro avg       0.72      0.72      0.72      2872
weighted avg       0.94      0.94      0.94      2872

2872 data are predicted using combined results of 2 modalities
[[2626   80]
 [  88   78]]
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.49      0.47      0.48       166

    accuracy                           0.94      2872
   macro avg       0.73      0.72     

In [17]:
ada_svm_late_fusion({'kernel': 'rbf', 'gamma': 'auto', 'probability': True, 'random_state': 42}, 4, 3)

[0.82880308 0.69042592 0.72795438 0.65449831] [0.83472678 0.41736339 0.5        0.45375127]
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95      2706
         1.0       0.08      0.07      0.07       166

    accuracy                           0.90      2872
   macro avg       0.51      0.51      0.51      2872
weighted avg       0.89      0.90      0.90      2872

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.48      0.48      0.48       166

    accuracy                           0.94      2872
   macro avg       0.72      0.72      0.72      2872
weighted avg       0.94      0.94      0.94      2872

2872 data are predicted using combined results of 2 modalities
[[2629   77]
 [  88   78]]
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2706
         1.0       0.50      0.47      0.49       166

