In [2]:
# # no need for this since it has nothing outside classes and functions
# print(__name__)
# if __name__ == "__main__" and hasattr(__builtins__,'__IPYTHON__') and ('google.colab' in str(get_ipython())):
#     from google.colab import drive
#     drive.mount('/content/drive')
#     %cd /content/drive/MyDrive/PressureReliefWorkArea/SummerWork/
#     !ls

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
import numpy as np

%run -n HelperFunctions.ipynb
# import ipynb
# from ipynb.fs.full.HelperFunctions import *

This code loads your CSV file, splits the data into a training set and a test set, and creates a DataLoader for each. The DataLoader can be used to iterate through the data in batches, which is useful for training a neural network.

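You can replace 'yourfile.csv' with the path to your actual file, i.e. the `file_path` you pass to `JFSKLoader`. Also, note that the file is read with `pd.read_csv(file_path)` using pandas' defaults, which treat the first row of the CSV as a header. If your file does not have a header row, read it with `header=None` instead; otherwise the first row of data is consumed as column names.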

In [28]:
class JFSKAccelDataset(Dataset):
    """Dataset pairing sensor samples (or windows of samples) with labels.

    The object can hold either raw per-sample rows (before ``group()``) or
    already-windowed sequences (after ``group()``, or when constructed
    directly from windowed tensors).

    Args:
        data: Indexable, sliceable collection of samples (or windows).
        labels: Indexable, sliceable collection of labels, one per entry of
            ``data``.
        sequence_length: Number of consecutive samples per window; only used
            by ``group()``.
    """

    def __init__(self, data, labels, sequence_length=10):
        self.data = data
        self.labels = labels
        self.sequence_length = sequence_length

    def group(self):
        """Replace per-sample data with overlapping windows (stride 1).

        After the call ``self.data`` is a list of ``len(data) -
        sequence_length + 1`` slices, each ``sequence_length`` long, and
        ``self.labels`` holds exactly one label per window: the label of the
        sample at offset ``sequence_length // 2 - 1`` inside the window
        (offset 0 for a window of length 1).

        NOTE: this is not idempotent -- calling it twice windows the windows.

        Raises:
            ValueError: if ``sequence_length`` is smaller than 1.
        """
        window = self.sequence_length
        if window < 1:
            raise ValueError(f"sequence_length must be at least 1, got {window}")

        window_count = max(len(self.data) - window + 1, 0)
        # The original start index was `window // 2 - 1`, which is -1 for a
        # window of length 1 and silently produced an empty label slice.
        # Clamping to 0 and slicing by window count keeps data and labels
        # aligned for every valid window length.
        first_label = max(window // 2 - 1, 0)

        self.data = [self.data[start:start + window] for start in range(window_count)]
        self.labels = self.labels[first_label:first_label + window_count]
        # change to get the majority

    def __len__(self):
        """Number of entries (samples before ``group()``, windows after)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(window, label)`` with the window's first two axes swapped.

        ``swapaxes(0, 1)`` is used instead of ``transpose(0, 1)`` because the
        latter only swaps axes on torch tensors; on a 2-D NumPy array
        ``transpose(0, 1)`` is the identity permutation, so NumPy-backed
        windows (e.g. straight after ``group()``) were returned untransposed.
        """
        return self.data[idx].swapaxes(0, 1), self.labels[idx]  # Swapping the sequence and channel dimensions

In [6]:
class SKInputConverter:
    """Chainable pre-processing steps applied to a dataframe of inputs + labels.

    The first ``inputnum`` columns of ``dataframe`` are treated as the input
    channels and every remaining column is carried along untouched. Each step
    returns ``self`` so calls can be chained; ``result()`` ends the chain.

    Args:
        dataframe: ``pandas.DataFrame`` whose leading columns are the inputs.
        classtype: Classification type; validated with
            ``SKDescriptors.validate_class_type`` and used to look up the
            number of input columns in ``SKDescriptors.NUM_OF_INPUTS_PER_TYPE``.
        bufferpref: Stored on the instance; not used by any step yet.
    """

    def __init__(self, dataframe, classtype, bufferpref = "Inner"):
        self.dataframe = dataframe
        SKDescriptors.validate_class_type(classtype)
        self.classtype = classtype
        self.bufferpref = bufferpref
        self.inputnum = SKDescriptors.NUM_OF_INPUTS_PER_TYPE[classtype]

    def result(self):
        """Return ``(dataframe, inputnum)`` as they stand after the applied steps."""
        return self.dataframe, self.inputnum

    def diff(self):
        """Placeholder step: currently a no-op that returns ``self``."""
        return self

    def remove_outliers(self, rem_type = None, rem_func = None, *args, **kwargs):
        """Placeholder step: currently a no-op that returns ``self``."""
        return self

    def normalize(self, norm_type = None, norm_func = None, *args, **kwargs):
        """Placeholder step: currently a no-op that returns ``self``."""
        return self

    def combine(self, comb_type = None, comb_func = None, *args, **kwargs):
        """Collapse the input columns into one column holding their row-wise Euclidean norm.

        The resulting dataframe has the magnitude as its first column,
        followed by the former non-input columns; the columns are relabelled
        ``0..k-1`` and ``inputnum`` becomes 1.

        Raises:
            NotImplementedError: if any argument is supplied (none are
                implemented yet).
        """
        if comb_type is not None or comb_func is not None or args or kwargs:
            raise NotImplementedError(f"SKInputConverter.combine() has no implemented parameters")

        inputs = self.dataframe.iloc[:, :self.inputnum]
        remainder = self.dataframe.iloc[:, self.inputnum:]

        # The previous implementation used DataFrame.apply(np.linalg.norm),
        # which works per *column* and yields a Series (no .join method), and
        # then sized the new column labels from the already-reduced frame.
        # Compute the norm per row and size the labels from the actual result.
        magnitude = pd.Series(
            np.linalg.norm(inputs.to_numpy(dtype=float), axis=1),
            index=self.dataframe.index,
        )
        combined = pd.concat([magnitude, remainder], axis=1)
        combined.columns = pd.Index(np.arange(combined.shape[1]))

        self.dataframe = combined
        self.inputnum = 1
        return self

In [5]:
class JFSKLoader:
    def __init__(self, file_path, sequence_length = 10, *args, **kwargs):
        self.sequence_length = sequence_length


        # 1. open file

        # Gather file info
        # self.file_directory, self.beginning_descriptors, self.file_name, self.ending_descriptors, self.file_extension, self.specifier_values = SKFileNameHandler.read_data_file_name(file_path)
        self.file_directory, _, _, _, file_extension, self.specifier_values = SKFileNameHandler.read_data_file_name(file_path)
        classification_type = self.specifier_values[SKDescriptors.CLASSIFICATION_TYPE_FS]
        input_num = SKDescriptors.NUM_OF_INPUTS_PER_TYPE[classification_type]

        match file_extension:
            case ".csv":
                dataframe = pd.read_csv(file_path)
                # code test file: Data/Week 1/Left then Right/Processed/Type3-Freq10-Labeled_Motion-sessions_2023-08-26_17-25-54.csv
                # classifier training file: Data/COMBINED_Type3-Freq10-Labeled_Motion-sessions_23-24_Fall.csv
            case _:
                raise NotImplementedError(f"JFSKLoader is not equipped to open {file_extension} files.")
            
        # # This combines the input names with the class names
        #     # example: ['x', 'y', 'z', 'Forward Lean', 'Left Lean', 'Right Lean', 'Pushup', 'Other']
        #     # I use the * operator in conjunction with Python 3.5+'s "Additional Unpacking Generalizations"
        #         # to unpack two list-likes and combine them into one list
        # dataframe.columns = pd.Index([*SKDescriptors.INPUT_NAMES[1], *(str(c) for c in SKDescriptors.CTS_PER_TYPE[classification_type])])


        # 2. split dataset into data and labels

        # Get data and labels from dataframe
        data = dataframe.iloc[:, :input_num].to_numpy()  # x, y, z data
        labels = dataframe.iloc[:, input_num:].to_numpy()  # labels


        # 3. make adjustments related to the data
        # THIS is where we would use SKInputConverter


        # 4. group
        self.g_dataset = JFSKAccelDataset(data, labels, sequence_length)
        self.g_dataset.group()


        # 5. make adjustments related to the labels
        # if repress_classes:
        if args and args[0]:
            if len(args) > 1:
                self.repress_classes(*(args[1:]), **kwargs)
            else:
                self.repress_classes(**kwargs)
        elif kwargs.get("repress_classes", False):
            self.repress_classes(*args, **kwargs)


        # 6. randomize
        data_train, data_test, labels_train, labels_test = train_test_split(self.g_dataset.data, self.g_dataset.labels, test_size=0.2, random_state=42)


        # 7. create dataloader


        # Convert data to tensors
        data_train = torch.tensor(np.array(data_train), dtype=torch.float32)  
        data_test = torch.tensor(np.array(data_test), dtype=torch.float32)

        # Convert labels to tensors and get max index (assuming one-hot encoding)
        labels_train = torch.argmax(torch.tensor(np.array(labels_train), dtype=torch.float32), dim=1)
        labels_test = torch.argmax(torch.tensor(np.array(labels_test), dtype=torch.float32), dim=1)

        # Create data loaders
        train_dataset = JFSKAccelDataset(data_train, labels_train, self.sequence_length)
        test_dataset = JFSKAccelDataset(data_test, labels_test, self.sequence_length)

        self.train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        self.test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)




    # target_classes = ["Other", "Stationary"], rep_format = 'tag', rep_func = np.mean, skip_repressed = True, *args, **kwargs
    def repress_classes(self, target_classes = None, rep_format = 'tag', rep_func = np.mean, skip_repressed = True, *args, **kwargs):
        classification_type = self.specifier_values[SKDescriptors.CLASSIFICATION_TYPE_FS]
        
        # NOTE: the logic here only works for one-hot vectors
        if not SKDescriptors.NUM_OF_OUTPUTS_PER_TYPE[classification_type] == 1:
            raise NotImplementedError(f"JFSKLoader.repress_classes() with repress_stationary=True may not be equipped to fairly sample stationary data for classification types with outputs that are not one-hot vectors.")

        if not callable(rep_func):
            raise NotImplementedError(f"JFSKLoader.repress_classes()'s rep_func must currently be a function\
                                      \n\tIf you want more complex logic where you'd test for a string or something, feel free to alter the code")

        if target_classes is None:
            target_classes = [SKDescriptors.OTHER_TAG, SKDescriptors.STATIONARY_TAG]
            rep_format = 'tag'

        

        temp_list = []
        match rep_format:
            case 'str' | 'cts':
                for i, cts in enumerate(SKDescriptors.CTS_PER_TYPE[classification_type]):
                    if cts in target_classes:
                        temp_list.append(i)

                if len(temp_list) != len(target_classes):
                    print(f"\nWARNING: The following target_classes were not kept as they do not correspond to classification_type '{classification_type}':\n")
                    for cts in target_classes:
                        if not any(cts == SKDescriptors.CTS_PER_TYPE[classification_type][i] for i in temp_list):
                            print(f"\t{cts}\n")

            case 'tag':
                for i, cts in enumerate(SKDescriptors.CTS_PER_TYPE[classification_type]):
                    for tag in target_classes:
                        if tag in cts:
                            temp_list.append(i)

            case 'num' | 'int':
                forwarn_active = False
                for i in target_classes:
                    if i in np.arange(SKDescriptors.NUM_OF_CLASSES_PER_TYPE[classification_type]):
                        temp_list.append(i)
                        continue

                    if not forwarn_active:
                        print(f"\nWARNING: The following target_classes were not kept as they do not correspond to classification_type '{classification_type}':\n")
                        forwarn_active = True
                    print(f"\t{i}\n")
                
            case _ :
                raise AssertionError("You can only pass 'str', 'tag', 'cts', 'num', or 'int' into the rep_format parameter of JFSKLoader.repress_classes()\
                                     \n\t\t'str' is used when you want to pass the class names (as strings) of classes it should repress\
                                     \n\t\t'tag' is used when you want to pass the class tags (as strings) it should repress\
                                     \n\t\t'cts' is used when you want to pass the classes (as ClassTagSets) it should repress\
                                     \n\t\t'int' and 'num' are used when you want to pass the indices of the classes it should repress")
            
        target_classes = temp_list


        # # my population sample randomizer from STAT 2113
        # for i in range(SAMPLE_SIZE):
        #     choice_index = random.randint(1, unchosen_len) - 1
        #     choice = unchosen[choice_index]
        #     unchosen.pop(choice_index)
        #     unchosen_len -= 1
        #     choices_dict[choice] = choices_dict.setdefault(choice, 0)
        #     choices_dict[choice] += 1
        # choices_list = sorted(choices_dict)

        # we should not iterate over a dataframe
        # if class_index != -1:
        # check the counts of every class
        # this first removes the input columns;
            # then it removes the Stationary column;
            # then it counts each remaining columns' counts of '1' (everything inside .apply());
                # inside .apply(), we count how many of each number we have;
                # then we change it to a zip, then a dict;
                # then we take the count from key 1,
                # and if it doesn't have a 1 key, we return 0;
            # then it converts it to a numpy array;
            # then it takes the mean of the counts;
            # then it turns this into an int

        # use this if we want the mean, but take into account the fact that some values may overpower the others
        if skip_repressed:
            accounted = np.delete(self.g_dataset.labels, target_classes, axis=1)
        else:
            accounted = self.g_dataset.labels
        adjustment_height = int(rep_func(np.apply_along_axis(lambda x: dict(zip(*np.unique(x, return_counts = True))).get(1, 0), 1, accounted), *args, **kwargs))
        # avg_count = int(dataframe[dataframe.columns[3 : len(dataframe.columns) - 2]].apply(class_counter).to_numpy().mean())
        
        NOT REIMPLEMENTED FURTHER YET -- 6/15/2024

        # class_counts = {}
        # for i in dataframe.columns[input_num : ]:
        #     if i == SKDescriptors.STATIONARY_CLASS:
        #         stationary_rows = dataframe[dataframe[i] == 1]
        #         continue
        #     class_counts[i] = len(dataframe[dataframe[i] == 1])
        stationary_rows = dataframe[dataframe[len(dataframe.columns) - 2] == 1]
        other_rows = dataframe[dataframe[len(dataframe.columns) - 1] == 1]
        print()
        print(avg_count)
        print("statlen: " + str(len(stationary_rows)))
        print("otherlen: " + str(len(other_rows)))
        # this is the line doing the actual randomization
        sample_rows = stationary_rows.sample(avg_count, random_state=42)
        print(len(sample_rows))
        dataframe = dataframe.drop(stationary_rows.drop(sample_rows.index).index)
        print(len(dataframe))
        # this is the line doing the actual randomization
        sample_rows = other_rows.sample(avg_count, random_state=42)
        print(len(sample_rows))
        dataframe = dataframe.drop(other_rows.drop(sample_rows.index).index)
        print(len(dataframe))


        # want min instead?
        avg_count = int(dataframe[dataframe.columns[input_num : ]].apply(JFSKLoader.count_instances).to_numpy().min())

        # median
        avg_count = int(np.median(dataframe[dataframe.columns[3 : ]].apply(lambda x: skcounter(x)).to_numpy()))

        for i in dataframe.columns[input_num : ]:
            class_i_rows = dataframe[dataframe[i] == 1]
            # this is the line doing the actual randomization
            sample_rows = class_i_rows.sample(min(len(class_i_rows), avg_count), random_state=42)
            # print(len(sample_rows))
            dataframe = dataframe.drop(class_i_rows.drop(sample_rows.index).index)
            # print(len(dataframe))



        avg_count = int(dataframe[dataframe.columns[input_num : ]].drop(SKDescriptors.STATIONARY_CLASS, axis=1).apply(lambda x: dict(zip(np.unique(x, return_counts = True))).get(1, 0)).to_numpy().mean())
        # class_counts = {}
        # for i in dataframe.columns[input_num : ]:
        #     if i == SKDescriptors.STATIONARY_CLASS:
        #         stationary_rows = dataframe[dataframe[i] == 1]
        #         continue
        #     class_counts[i] = len(dataframe[dataframe[i] == 1])
        stationary_rows = dataframe[dataframe[SKDescriptors.STATIONARY_CLASS] == 1]
        # this is the line doing the actual randomization
        sample_rows = stationary_rows.sample(avg_count, random_state=42)
        dataframe = dataframe.drop(stationary_rows.drop(sample_rows.index).index)





    def count_instances(series):
        return dict(zip(*np.unique(series, return_counts = True))).get(1, 0)

        # ci_arr = np.unique(series, return_counts = True)
        # return dict(zip(ci_arr[0], ci_arr[1])).get(1, 0)
    
        # print(skarr)
        # find_index = np.where(skarr[0], True, False)
        # for i in range(len(find_index)):
        #     if find_index[i]: ret_valsk = skarr[1][i]
        # print(ret_dict)
        # ret_valsk = ret_dict.get(1, 0)
        # print(ret_valsk)
        # # print(dataframe)
        # return ret_valsk
