This notebook investigates whether training random forest and AdaBoost classifiers on flattened sliding-window segments produces better results than training them directly on frame-by-frame data.
It also explores whether downsampling (here: skipping training files whose labels all belong to a single class) improves model performance.

In [1]:
import os
import scipy.io
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import mode
from tqdm import tqdm
from typing import Any

In [2]:
# Build the data directories with os.path.join so the notebook also runs on
# non-Windows systems (a literal backslash is not a path separator there).
train_path = os.path.join("CoordinateData", "train")
validation_path = os.path.join("CoordinateData", "validation")

In [3]:
def segment_data(data: np.ndarray, window_size: int = 180, overlap_ratio: float = 0.75, by_type: bool = True, min_frame: int = 12, type_column: int = 70) -> np.ndarray:
    """
    Segment the input data into fixed-length, overlapping windows.

    When ``by_type`` is True the rows are first split into runs of consecutive
    frames that share the same value in ``type_column``. Runs that are not the
    final one and contain ``min_frame`` rows or fewer are discarded; a kept run
    whose type equals the type of the previously kept run is appended to it
    (so two runs of the same type separated only by a discarded short run end
    up in one instance). The final run is always kept, whatever its length.

    Each instance is then zero-padded at the end so that it is covered by a
    whole number of windows, and cut into windows of ``window_size`` rows that
    start every ``step_size`` rows. Padded rows are all zeros in every column.

    Args:
        data (np.ndarray): Input data with shape (num_samples, num_features).
        window_size (int, optional): The length of each window. Defaults to 180.
        overlap_ratio (float, optional): The ratio of overlap between consecutive windows. Defaults to 0.75.
        by_type (bool, optional): Whether to split the data into runs of equal type before windowing. Defaults to True.
        min_frame (int, optional): Non-final runs with this many rows or fewer are dropped. Defaults to 12.
        type_column (int, optional): Index of the column that holds the type. Defaults to 70 (the 71st feature).

    Returns:
        np.ndarray: The segmented data with shape (num_windows, window_size, num_features).

    Raises:
        AssertionError: If one of the input constraints checked below is violated.
    """

    # Check the input constraints
    assert data.ndim == 2 and data.shape[0] > 0, "data must be a non-empty 2-D array"
    assert window_size > 0, "window_size must be positive"
    assert 0 <= overlap_ratio < 1, "overlap_ratio must be in [0, 1)"
    assert 0 <= min_frame < window_size, "min_frame must be in [0, window_size)"

    num_rows, dim = data.shape

    if not by_type:
        instances = [data]
    else:
        assert 0 <= type_column < dim, "type_column is out of range for data"

        # Positions where the type differs from the previous row mark the start of a new run.
        types = data[:, type_column]
        change_points = (np.flatnonzero(types[1:] != types[:-1]) + 1).tolist()
        starts = [0] + change_points
        stops = change_points + [num_rows]
        tail_index = len(starts) - 1

        instances = []
        for index, (start, stop) in enumerate(zip(starts, stops)):
            # Short runs are dropped, except for the trailing run which is always kept.
            if index != tail_index and stop - start <= min_frame:
                continue

            run = data[start:stop]
            # Comparing against the last kept instance (instead of a sentinel value)
            # also works when nothing has been kept yet, e.g. single-type recordings.
            if instances and instances[-1][0, type_column] == run[0, type_column]:
                instances[-1] = np.vstack([instances[-1], run])
            else:
                instances.append(run)

    # The epsilon absorbs float artefacts such as 100 * (1 - 0.9) == 9.999...;
    # the lower bound of 1 prevents a zero step (division by zero / empty range).
    step_size = max(1, int(window_size * (1 - overlap_ratio) + 1e-9))
    windows = []

    # Create windows for each instance
    for instance in instances:
        length = instance.shape[0]
        if length < window_size:
            # Too short for a single window: pad up to exactly one window.
            pad_size = window_size - length
        else:
            # Pad so that the last window ends exactly on the last row.
            pad_size = -(length - window_size) % step_size

        if pad_size:
            instance = np.vstack([instance, np.zeros((pad_size, dim))])

        for start in range(0, instance.shape[0] - window_size + 1, step_size):
            windows.append(instance[start:start + window_size])

    return np.array(windows)

In [9]:
def fit(estimator: Any, parameters: dict, path: str = train_path, base_parameters: dict = None, seg: bool = False, downsampling: bool = False) -> list:
    """
    Train one model per file found in ``path``.

    Args:
        estimator (Any): An estimator instance; only its class is used, a fresh instance is created per file.
        parameters (dict): Keyword arguments for the estimator constructor. If it contains the key
            'base_estimator', a new instance of that object's class is created for every model.
            The dictionary itself is not modified.
        path (str, optional): Directory containing the training files, each loaded with scipy.io.loadmat
            and expected to expose a 'data' array. Defaults to train_path.
        base_parameters (dict, optional): Keyword arguments for the base estimator constructor. Defaults to None.
        seg (bool, optional): If True, train on flattened windows produced by segment_data instead of single frames.
            Defaults to False.
        downsampling (bool, optional): If True, skip files whose label column holds a single value. Defaults to False.

    Returns:
        list: Tuples of (file name, fitted model), ordered by file name.
    """
    models = []

    # Only the class is kept so that every file gets its own, freshly constructed model
    estimator_class = type(estimator)

    # Sorting makes the order of the returned models reproducible (os.listdir order is arbitrary);
    # passing a list (rather than an enumerate object) lets tqdm show the total.
    for file in tqdm(sorted(os.listdir(path))):
        mat = scipy.io.loadmat(os.path.join(path, file))
        data = mat['data']

        # Column 72 is the label column used for training
        if downsampling and np.unique(data[:, 72]).size == 1:
            print("downsampling!!!")
            continue

        # Work on a shallow copy so the caller's dictionary is left untouched
        model_parameters = dict(parameters)
        if 'base_estimator' in model_parameters:
            base_class = type(model_parameters['base_estimator'])
            model_parameters['base_estimator'] = base_class(**base_parameters) if base_parameters else base_class()

        if seg:
            windows = segment_data(data)
            # Flatten the first 70 columns of every window into one feature vector
            X_train = windows[:, :, 0:70].reshape(windows.shape[0], -1)
            # The label of a window is the most frequent label among its rows
            # (rows added as zero padding by segment_data take part in that vote)
            y_train = np.asarray(mode(windows[:, :, 72], axis=1)[0]).ravel()
        else:
            # Drop columns 70-77 so that only the first 70 columns remain as features
            # NOTE(review): assumes 'data' has exactly 78 columns - confirm
            X_train = np.delete(data, range(70, 78), axis=1)
            y_train = data[:, 72]

        model = estimator_class(**model_parameters)
        model.fit(X_train, y_train)

        models.append((file, model))

    print("Downsampling:", downsampling)
    return models

In [6]:
def validate(models: list, path: str = validation_path, seg: bool = False) -> None:
    """
    Evaluate an ensemble of models on every file in ``path`` and print the metrics.

    The prediction for each sample is the most frequent prediction among all
    models (majority vote). Label 1 is counted as "protective" and label 0 as
    "non-protective"; the protective class is treated as the positive class.

    Args:
        models (list): Tuples of (file name, fitted model), as returned by fit.
        path (str, optional): Directory containing the validation files, each loaded with
            scipy.io.loadmat and expected to expose a 'data' array. Defaults to validation_path.
        seg (bool, optional): If True, validate on flattened windows produced by segment_data
            instead of single frames. Defaults to False.

    Raises:
        ValueError: If ``models`` is empty, since there is nothing to vote with.
    """
    if not models:
        raise ValueError("validate() needs at least one trained model")

    def _ratio(numerator, denominator, empty=0.0):
        # Avoid dividing by zero when a class (or the whole set) has no samples
        return numerator / denominator if denominator else empty

    total_true_protective = 0
    total_protective = 0
    total_true_non_protective = 0
    total_non_protective = 0

    for file in os.listdir(path):
        mat = scipy.io.loadmat(os.path.join(path, file))
        data = mat['data']

        if seg:
            windows = segment_data(data)
            # Flatten the first 70 columns of every window into one feature vector
            X_validation = windows[:, :, 0:70].reshape(windows.shape[0], -1)
            # The label of a window is the most frequent label (column 72) among its rows
            y_validation = np.asarray(mode(windows[:, :, 72], axis=1)[0]).ravel()
            assert X_validation.shape[0] == y_validation.shape[0]
        else:
            # Drop columns 70-77 so that only the first 70 columns remain as features
            # NOTE(review): assumes 'data' has exactly 78 columns - confirm
            X_validation = np.delete(data, range(70, 78), axis=1)
            y_validation = data[:, 72]

        # One row of predictions per model; the vote is taken per sample (column)
        predictions = np.asarray([model[1].predict(X_validation) for model in models])
        y_pred = np.asarray(mode(predictions, axis=0)[0]).ravel()

        protective = y_validation == 1
        non_protective = y_validation == 0

        total_true_protective += int(np.count_nonzero(y_validation[protective] == y_pred[protective]))
        total_protective += int(np.count_nonzero(protective))

        total_true_non_protective += int(np.count_nonzero(y_validation[non_protective] == y_pred[non_protective]))
        total_non_protective += int(np.count_nonzero(non_protective))

    print("Files are segmented:", seg)
    print("Accuracy(protective):", _ratio(total_true_protective, total_protective, float('nan')), total_true_protective, total_protective)
    print("Accuracy(non-protective):", _ratio(total_true_non_protective, total_non_protective, float('nan')), total_true_non_protective, total_non_protective)

    # Confusion-matrix cells with "protective" as the positive class:
    # a protective sample that was missed is a false negative, and a
    # non-protective sample that was flagged is a false positive.
    TP, FN = total_true_protective, total_protective - total_true_protective
    TN, FP = total_true_non_protective, total_non_protective - total_true_non_protective

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    # F1 is symmetric in precision and recall; it is defined as 0 when both are 0
    f1_score = _ratio(2 * (precision * recall), precision + recall)
    print("F1-score:", f1_score)
    print("Overall accuracy:", _ratio(TP + TN, total_protective + total_non_protective, float('nan')))

In [7]:
# Frame-by-frame random forest: first trained on every file, then again while
# skipping the files whose labels hold a single value (downsampling).
for use_downsampling in (False, True):
    rfcs = fit(RandomForestClassifier(), {'random_state': 42}, downsampling=use_downsampling)
    validate(rfcs)

23it [03:27,  9.02s/it]


Downsampling: False


2it [00:00, 15.25it/s]

Files are segmented: False
Accuracy(protective): 0.0 0 10721
Accuracy(non-protective): 0.9999932302984761 147716 147717
F1-score: 0.0
Overall accuracy: 0.9323268407831454


23it [03:26,  8.99s/it]


Downsampling: True
Files are segmented: False
Accuracy(protective): 0.34381121164070516 3686 10721
Accuracy(non-protective): 0.5322271641043347 78619 147717
F1-score: 0.0882821387940842
Overall accuracy: 0.5194776505636274


In [8]:
# Frame-by-frame AdaBoost with its default base learner: first trained on every
# file, then again while skipping the files whose labels hold a single value.
for use_downsampling in (False, True):
    abcs = fit(AdaBoostClassifier(), {'random_state': 42}, downsampling=use_downsampling)
    validate(abcs)

23it [01:19,  3.46s/it]


Downsampling: False


4it [00:00, 34.45it/s]

Files are segmented: False
Accuracy(protective): 0.0 0 10721
Accuracy(non-protective): 1.0 147717 147717
F1-score: 0.0
Overall accuracy: 0.932333152400308


23it [01:18,  3.42s/it]


Downsampling: True
Files are segmented: False
Accuracy(protective): 0.06435966794142338 690 10721
Accuracy(non-protective): 0.8747672915101173 129218 147717
F1-score: 0.04613841524573721
Overall accuracy: 0.819929562352466


In [9]:
# Frame-by-frame AdaBoost on top of a DecisionTreeClassifier base estimator:
# first trained on every file, then again with single-label files skipped.
for use_downsampling in (False, True):
    abcs_dtc = fit(
        AdaBoostClassifier(),
        {'base_estimator': DecisionTreeClassifier(), 'random_state': 42},
        downsampling=use_downsampling,
    )
    validate(abcs_dtc)

23it [00:14,  1.63it/s]


Downsampling: False


4it [00:00, 35.37it/s]

Files are segmented: False
Accuracy(protective): 0.0427198955321332 458 10721
Accuracy(non-protective): 0.9379286067277294 138548 147717
F1-score: 0.04501670925889523
Overall accuracy: 0.8773526552973403


23it [00:13,  1.66it/s]


Downsampling: True
Files are segmented: False
Accuracy(protective): 0.6675683238503871 7157 10721
Accuracy(non-protective): 0.18035838799867313 26642 147717
F1-score: 0.10301324908422273
Overall accuracy: 0.21332634847700677


In [10]:
# Random forest on flattened sliding-window segments: first trained on every
# file, then again while skipping the files whose labels hold a single value.
for use_downsampling in (False, True):
    rfcs_seg = fit(RandomForestClassifier(), {'random_state': 42}, seg=True, downsampling=use_downsampling)
    validate(rfcs_seg, seg=True)

23it [00:13,  1.67it/s]


Downsampling: False


4it [00:00, 35.37it/s]

Files are segmented: True
Accuracy(protective): 0.0 0 171
Accuracy(non-protective): 0.9985174203113417 2694 2698
F1-score: 0.0
Overall accuracy: 0.9390031369815267


23it [00:12,  1.81it/s]


Downsampling: True
Files are segmented: True
Accuracy(protective): 0.39766081871345027 68 171
Accuracy(non-protective): 0.6942179392142328 1873 2698
F1-score: 0.12781954887218044
Overall accuracy: 0.67654234925061


In [11]:
# AdaBoost (default base learner) on flattened sliding-window segments: first
# trained on every file, then again with single-label files skipped.
for use_downsampling in (False, True):
    abcs_seg = fit(AdaBoostClassifier(), {'random_state': 42}, seg=True, downsampling=use_downsampling)
    validate(abcs_seg, seg=True)

23it [03:13,  8.43s/it]


Downsampling: False


4it [00:00, 35.37it/s]

Files are segmented: True
Accuracy(protective): 0.0 0 171
Accuracy(non-protective): 1.0 2698 2698
F1-score: 0.0
Overall accuracy: 0.9403973509933775


23it [03:14,  8.44s/it]


Downsampling: True
Files are segmented: True
Accuracy(protective): 0.2982456140350877 51 171
Accuracy(non-protective): 0.9521868050407709 2569 2698
F1-score: 0.2905982905982906
Overall accuracy: 0.9132101777622865


In [12]:
# AdaBoost with a DecisionTreeClassifier base estimator on flattened
# sliding-window segments: first trained on every file, then again with
# single-label files skipped.
for use_downsampling in (False, True):
    abcs_dtc_seg = fit(
        AdaBoostClassifier(),
        {'base_estimator': DecisionTreeClassifier(), 'random_state': 42},
        seg=True,
        downsampling=use_downsampling,
    )
    validate(abcs_dtc_seg, seg=True)

23it [00:20,  1.13it/s]


Downsampling: False


4it [00:00, 35.68it/s]

Files are segmented: True
Accuracy(protective): 0.0 0 171
Accuracy(non-protective): 1.0 2698 2698
F1-score: 0.0
Overall accuracy: 0.9403973509933775


23it [00:19,  1.16it/s]


Downsampling: True
Files are segmented: True
Accuracy(protective): 0.09941520467836257 17 171
Accuracy(non-protective): 0.9844329132690882 2656 2698
F1-score: 0.14782608695652175
Overall accuracy: 0.9316835134193099
