In [57]:
import os
import re
from tqdm import tqdm

import scipy.io
import numpy as np
import pandas as pd

from label_dict import label_dict


In [58]:
# How many signal columns each statistic is computed over; every statistic
# therefore contributes NUM_COLUMNS feature columns, numbered from 1.
NUM_COLUMNS = 10

channel_offsets = range(NUM_COLUMNS)

# One name list per statistic: "<statistic>_1" ... "<statistic>_10".
sum_columns = [f'sum_{i+1}' for i in channel_offsets]
mean_columns = [f'mean_{i+1}' for i in channel_offsets]
waveform_columns = [f'waveform_{i+1}' for i in channel_offsets]
av_energy_columns = [f'av_energy_{i+1}' for i in channel_offsets]
cls = ["class"]

# Final layout of the output table: all feature groups first, the label last.
column_names = [
    *sum_columns,
    *mean_columns,
    *waveform_columns,
    *av_energy_columns,
    *cls,
]

# Empty frame that only fixes the column layout; rows are added later.
final_database = pd.DataFrame(columns=column_names)
final_database

Unnamed: 0,sum_1,sum_2,sum_3,sum_4,sum_5,sum_6,sum_7,sum_8,sum_9,sum_10,...,av_energy_2,av_energy_3,av_energy_4,av_energy_5,av_energy_6,av_energy_7,av_energy_8,av_energy_9,av_energy_10,class


In [59]:
def extract_cols(file, ex):
    """Pull the repetition, EMG and label arrays out of one loaded recording.

    Parameters
    ----------
    file : mapping
        Loaded recording providing the "rerepetition", "emg" and "restimulus"
        entries.
    ex : hashable
        Key selecting which sub-mapping of ``label_dict`` is used to rename
        the labels.

    Returns
    -------
    tuple
        ``(rep, emg, new_lab)``: copies of the "rerepetition" and "emg"
        entries, plus a new one-column array holding the renamed
        "restimulus" labels.

    Raises
    ------
    KeyError
        If an entry is missing from ``file`` or a label has no translation in
        ``label_dict[ex]``.
    """
    repetitions = file["rerepetition"].copy()
    signals = file["emg"].copy()
    raw_labels = file["restimulus"].copy()

    # Each row carries its label in the first position; translate it through
    # label_dict[ex] and keep the one-value-per-row column layout.
    renamed = [[label_dict[ex][row[0]]] for row in raw_labels]
    return repetitions, signals, np.array(renamed)


def wavelength_form(x):
    """Return the summed absolute change between consecutive values of ``x``.

    ``x`` must offer the pandas ``diff``/``abs``/``sum`` methods. This is the
    statistic stored in the ``waveform_*`` feature columns.
    """
    # The first difference has no predecessor and is NaN; pandas' sum skips it.
    step_changes = x.diff()
    return step_changes.abs().sum()


def av_signal_energy(x):
    """Return the mean of the squared values of ``x`` (sum of squares / length)."""
    squared_total = np.sum(x ** 2)
    sample_count = len(x)
    return squared_total / sample_count


def extract_features(dataframe):
    """Aggregate a sample-level frame into one feature row per group.

    Rows are grouped by columns ``11`` and ``0``. For every remaining column
    four statistics are computed per group: sum, mean, ``wavelength_form``
    and ``av_signal_energy``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with integer column labels; column ``11`` ends up as ``class``
        in the result and column ``0`` is only used for grouping.

    Returns
    -------
    pandas.DataFrame
        One row per (column 11, column 0) group: the ``class`` column followed
        by the sum, mean, waveform and average-energy feature columns.
    """
    grouped = dataframe.groupby([11, 0], as_index=False)
    key_names = ['class', 'Group_0']

    # Each aggregated table is paired with the names of its feature columns.
    feature_tables = (
        (grouped.sum(), sum_columns),
        (grouped.mean(), mean_columns),
        (grouped.agg(wavelength_form), waveform_columns),
        (grouped.agg(av_signal_energy), av_energy_columns),
    )

    pieces = []
    for position, (table, feature_names) in enumerate(feature_tables):
        table.columns = key_names + feature_names
        # Only the first table keeps 'class'; the other copies are redundant,
        # and 'Group_0' is never part of the output.
        redundant = ['Group_0'] if position == 0 else key_names
        pieces.append(table.drop(columns=redundant))

    # All tables share the same group order, so they line up side by side.
    return pd.concat(pieces, axis=1)


In [60]:
# Folder with the NinaPro DB1 .mat recordings. Built with os.path.join so the
# notebook also runs outside Windows (on Windows the resulting path is the same
# as the original backslash literal).
directory = os.path.join('..', 'data', 'ninapro_DB1')
# The exercise number is the trailing "E<n>" of the file name,
# e.g. "S1_A1_E2.mat" -> 2.
pattern = r'E(\d+)\.mat$'
files_limit = 0  # running count of processed .mat files

# Per-file feature tables are collected here and concatenated once after the
# loop; concatenating onto the growing frame inside the loop re-copies all
# earlier rows on every iteration.
feature_frames = []

# sorted(): os.listdir returns names in arbitrary order, which would make the
# row order of the output differ between machines/runs.
for filename in tqdm(sorted(os.listdir(directory))):
    if not filename.endswith('.mat'):
        continue

    match = re.search(pattern, filename)
    if match is None:
        # No exercise number means the labels cannot be remapped; report and
        # skip the file instead of failing on `None.group(1)`.
        tqdm.write(f'Skipping {filename}: no exercise number in file name')
        continue

    files_limit += 1

    # Uncomment to process only the first few files while debugging.
    # if files_limit > 6:
    #     break

    file_path = os.path.join(directory, filename)
    matlab_f = scipy.io.loadmat(file_path)

    exercise = int(match.group(1))
    rep, emg, lab = extract_cols(matlab_f, exercise)

    # Column 0 = repetition, columns 1-10 = EMG, column 11 = remapped label.
    df = pd.DataFrame(np.concatenate((rep, emg, lab), axis=1))
    # Discard every sample whose remapped label is 0.
    df = df[df[11] != 0.0]

    sub_df = extract_features(df)
    feature_frames.append(sub_df)

if feature_frames:
    # Rebuilt from scratch, so re-running this cell does not append duplicate
    # rows. Selecting `column_names` restores the layout with "class" last.
    final_database = pd.concat(feature_frames, ignore_index=True)[column_names]
        

  0%|          | 0/82 [00:00<?, ?it/s]

100%|██████████| 82/82 [03:23<00:00,  2.48s/it]


In [61]:
# Persist the aggregated feature table; the row index is left out of the file.
output_csv = 'processed_NinaDB1.csv'
final_database.to_csv(output_csv, index=False)

# Show the first five rows as the cell output.
final_database.head(n=5)

Unnamed: 0,sum_1,sum_2,sum_3,sum_4,sum_5,sum_6,sum_7,sum_8,sum_9,sum_10,...,av_energy_2,av_energy_3,av_energy_4,av_energy_5,av_energy_6,av_energy_7,av_energy_8,av_energy_9,av_energy_10,class
0,2.0756,140.2917,66.3296,20.434,12.742,32.2717,210.0611,114.9544,34.3061,141.6654,...,0.164524,0.057497,0.005643,0.001827,0.008486,0.249535,0.064521,0.012317,0.113875,1.0
1,2.7194,274.0862,179.5969,54.6926,28.9846,39.6708,211.5952,204.0859,102.7817,140.8581,...,0.487476,0.239901,0.023825,0.005832,0.010447,0.19385,0.162839,0.046481,0.083402,1.0
2,4.3991,302.8373,206.8631,65.8918,35.7191,45.4727,235.5715,348.2053,203.2624,161.9789,...,0.544896,0.274729,0.029842,0.0082,0.012597,0.240106,0.493717,0.187266,0.114946,1.0
3,2.5975,304.5564,224.5466,77.1302,37.3749,41.788,165.9408,206.5482,112.3977,117.1096,...,1.048227,0.577249,0.067096,0.016156,0.019927,0.268142,0.370768,0.122115,0.138651,1.0
4,3.2473,237.2615,165.5956,53.378,27.0767,32.2748,137.1389,189.4433,118.3059,93.1836,...,0.769883,0.396475,0.042627,0.01106,0.014737,0.21717,0.376357,0.173942,0.10506,1.0


# STARE (old) — superseded scratch cells, kept commented out for reference only

In [62]:
# mat = scipy.io.loadmat(r'..\data\ninapro_DB1\S1_A1_E2.mat')

# rep = mat["rerepetition"].copy()
# emg = mat["emg"].copy()
# lab = mat["restimulus"].copy() # label

# new_lab = np.array([[label_dict[2][lab[i][0]]] for i in range(lab.shape[0])])

In [63]:
# df = pd.DataFrame(np.concatenate((rep, emg, new_lab), axis=1))
# df.drop(df[df[11] == 0.0].index, inplace=True)

In [64]:
# def wavelength_form(x):
#     return x.diff().abs().sum()

# # Average signal's energy
# def av_signal_energy(x):
#     return np.sum(x**2) / len(x)


In [65]:
# summed_data = df.groupby([11, 0]).sum()
# summed_data.columns = sum_columns

# mean_data = df.groupby([11, 0]).mean()
# mean_data.columns = mean_columns

# waveform_data = df.groupby([11, 0]).agg(wavelength_form)
# waveform_data.columns = waveform_columns

# av_energy_data = df.groupby([11, 0]).agg(av_signal_energy)
# av_energy_data.columns = av_energy_columns

# notfinal = pd.concat([summed_data, mean_data, waveform_data, av_energy_data], axis=1)


In [66]:

# lol = pd.concat([final_database, notfinal], ignore_index=True)
# lol

In [67]:
# lol.to_csv('processed_NinaDB1.csv', index=False) 
