# Notebook for extracting features from the raw COP time-series data

In [1]:
# Standard library imports
import os  # Provides functions to interact with the operating system (e.g., file handling)
import math  # Provides mathematical functions such as trigonometry, logarithms, and constants
import random  # Generates random numbers, used for simulations and randomized processes
import re  # Provides regular expression matching operations for string manipulation
import sys  # Provides access to some variables used or maintained by the interpreter

# Third-party library imports
import numpy as np  # Library for numerical computations, especially with arrays and matrices
import pandas as pd  # Library for data manipulation and analysis, including DataFrame objects
import matplotlib.pyplot as plt  # Plotting library used for creating static, interactive, and animated visualizations
import seaborn as sns  # Data visualization library, based on Matplotlib, for statistical graphics
from tqdm import tqdm  # Progress bar for loops and processes, useful for long-running tasks
from scipy.signal import find_peaks  # Detects peaks in 1D data arrays, often used in signal processing
from scipy.stats import skew  # Computes the skewness (asymmetry) of data distribution
from scipy.linalg import eigh  # Eigenvalue and eigenvector solver for symmetric (or Hermitian) matrices
from sklearn import preprocessing  # Provides data preprocessing tools such as scaling and normalization
import networkx as nx  # Library for the creation, manipulation, and study of complex networks

# Custom imports
sys.path.insert(0, "..")  # Add the parent directory to the system path to import custom modules
from src.utils.data_extraction import get_features  # Custom function for feature extraction


File finder function

In [2]:
def find_files(path):
    """Recursively collect the paths of all files under ``path``.

    Directories and file names are visited in sorted order so the returned
    list is the same on every run; ``os.walk`` on its own yields entries in
    whatever order the operating system lists them.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Returns
    -------
    list of str
        ``os.path.join(directory, file_name)`` for every file found, parent
        directories before their subdirectories. Empty when nothing is found.
    """
    files = []
    for dir_name, subdirs, file_names in os.walk(path):
        # Sorting in place steers the (top-down) walk into subdirectories
        # in a deterministic order.
        subdirs.sort()
        files.extend(os.path.join(dir_name, name) for name in sorted(file_names))
    return files


Path Configuration and Initializing Variables

In [3]:
# Input directories to scan; the position of each directory in this list is
# the index used to look up its label in class_map.
path_files = [
    '../data/raw/COP_Tests/HG',
    '../data/raw/COP_Tests/DG',
    '../data/raw/COP_Tests/DNG',
]

# Accumulator for the per-file records built while reading the raw data
rows = list()

# Class label for each index of path_files
class_map = dict(enumerate(('Healthy', 'Diabetic', 'Neuropathic')))


Reading Files and Building Data Rows

In [4]:
# Reading files and constructing data rows.
# Start from an empty list so that re-running this cell does not append a
# second copy of every file to `rows`.
rows = []
for i, path in enumerate(path_files):
    files = find_files(path)
    for file in tqdm(files, desc=f'Processing {path}'):
        X_series = []  # List to store X time series
        Y_series = []  # List to store Y time series
        exam_name = os.path.basename(file).split('.')[0]  # File name up to the first '.'
        with open(file, 'r') as f:
            next(f, None)  # Skip the first line; the default avoids StopIteration on an empty file
            for line in f:  # Stream the file instead of loading every line at once
                line = line.strip()
                if not line:
                    continue  # Blank/trailing lines have no fields to parse
                values = line.split(';')
                # Fields are ';'-separated and use ',' as the decimal mark
                X_series.append(float(values[1].replace(',', '.')))  # Convert X values to float
                Y_series.append(float(values[2].replace(',', '.')))  # Convert Y values to float
        row = {'class': class_map[i], 'file_path': file, 'exam': exam_name, 'cop_x': X_series, 'cop_y': Y_series}
        rows.append(row)


Processing ../data/raw/COP_Tests/HG: 100%|██████████| 87/87 [00:00<00:00, 887.38it/s]
Processing ../data/raw/COP_Tests/DG: 100%|██████████| 145/145 [00:00<00:00, 941.31it/s]
Processing ../data/raw/COP_Tests/DNG: 100%|██████████| 201/201 [00:00<00:00, 909.27it/s]


Creating the DataFrame

In [5]:
# Create DataFrame from the data rows: one row per file read above, with the
# columns 'class', 'file_path', 'exam', 'cop_x' and 'cop_y' taken from the row dicts
timeSeries = pd.DataFrame(rows)

Cleaning Exam Name Function

In [6]:
def clean_exam_name(exam_name):
    """Return ``exam_name`` reduced to its alphabetic exam code.

    The literal substring 'SWARII' is removed first (case-sensitive), then
    every character that is neither an ASCII letter nor whitespace is dropped,
    and finally leading/trailing whitespace is trimmed. Whitespace inside the
    name is kept.
    """
    without_tag = exam_name.replace('SWARII', '')
    return re.sub(r'[^a-zA-Z\s]', '', without_tag).strip()

# Replace each raw exam name in the 'exam' column with its cleaned form
timeSeries['exam'] = timeSeries['exam'].map(clean_exam_name)

Adding Flag and Saving as CSV

In [7]:
# Assign 'real' flag to the data
timeSeries = timeSeries.assign(flag='real')

# Make sure the output folder exists before writing: to_csv does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
interim_csv_path = '../data/interim/interim_time_series.csv'
os.makedirs(os.path.dirname(interim_csv_path), exist_ok=True)

# Save the resulting DataFrame to a CSV file (without the index column).
# Note: 'cop_x' and 'cop_y' hold Python lists, which CSV stores as text.
timeSeries.to_csv(interim_csv_path, index=False)

print("Data stored as 'interim_time_series.csv'")


Data stored as 'interim_time_series.csv'


Applying get_features Function

In [8]:
# Compute the features for every row of timeSeries; the value returned by
# get_features is only displayed as this cell's output, not assigned.
# NOTE(review): the later save cell writes `timeSeries`, so the feature columns
# reach the JSON file only if get_features adds them to timeSeries in place —
# confirm against src/utils/data_extraction.py.
get_features(timeSeries)

Calculating accelerations: 100%|██████████| 433/433 [00:00<00:00, 7096.88it/s]
Calculating RMS: 100%|██████████| 433/433 [00:00<00:00, 13968.86it/s]
Calculating path length: 100%|██████████| 433/433 [00:00<00:00, 6764.86it/s]
Calculating Sample Entropy for cop_x: 100%|██████████| 433/433 [00:12<00:00, 34.77it/s]
Calculating Sample Entropy for cop_y: 100%|██████████| 433/433 [00:12<00:00, 35.29it/s]
Calculating F80 for cop_x: 100%|██████████| 433/433 [00:00<00:00, 13118.75it/s]
Calculating F80 for cop_y: 100%|██████████| 433/433 [00:00<00:00, 13962.74it/s]
Calculating mean frequencies: 100%|██████████| 3/3 [00:00<00:00, 15.38it/s]
Calculating RMS for Cop: 100%|██████████| 433/433 [00:00<00:00, 8486.88it/s]


Unnamed: 0,class,file_path,exam,cop_x,cop_y,flag,acc_x,acc_y,rms_acc_x,rms_acc_y,...,f80_x,f80_y,mf_lf_x,mf_lf_y,mf_mf_x,mf_mf_y,mf_hf_x,mf_hf_y,rms_x,rms_y
0,Healthy,../data/raw/COP_Tests/HG\participant_21\OASETD...,OASETD,"[-0.5907142773263209, -0.5830138724554864, -0....","[0.27127575567479667, 0.23229672622691488, 0.2...",real,"[0.0, 0.19251012177086135, 0.3659675176743149,...","[0.0, -0.9744757361970446, -0.8038894263146634...",0.275944,1.017366,...,193.615346,453.199431,11.402748,26.146214,3.098147,9.071309,0.876182,0.822447,0.174228,0.443474
1,Healthy,../data/raw/COP_Tests/HG\participant_21\OASE_S...,OASE,"[-0.12162960815483115, -0.12554430626852875, -...","[0.23829658131254128, 0.19021049634102294, 0.1...",real,"[0.0, -0.09786745284244003, 0.0866168582420334...","[0.0, -1.2021521242879585, -1.1661812174404762...",0.348282,1.094997,...,154.466090,424.615996,10.249035,20.396859,2.652427,9.220760,0.275486,1.254465,0.146995,0.347908
2,Healthy,../data/raw/COP_Tests/HG\participant_21\OASITD...,OASITD,"[-0.16731413160366831, -0.16177967029908652, -...","[0.8381318584951023, 0.9215735610382687, 1.008...",real,"[0.0, 0.13836153261454487, 0.2499263745907837,...","[0.0, 2.08604256357916, 2.1683932987347943, 2....",0.584157,1.367482,...,221.181984,792.630826,12.012784,50.680674,4.606157,12.561370,0.454124,2.678517,0.157215,0.594472
3,Healthy,../data/raw/COP_Tests/HG\participant_21\OCSETD...,OCSETD,"[-0.024951842513941502, -0.018349447211223502,...","[0.5626136740197065, 0.5075073804473442, 0.464...",real,"[0.0, 0.16505988256795, 0.057363732182467986, ...","[0.0, -1.3776573393090574, -1.087025150906895,...",0.330530,1.242923,...,119.803139,487.840236,6.199874,26.488442,2.494176,9.817008,0.323247,1.260141,0.080185,0.369688
4,Healthy,../data/raw/COP_Tests/HG\participant_21\OCSE_S...,OCSE,"[0.030813172239558373, 0.03789838001201207, 0....","[0.1325425969532965, 0.10703910260127625, 0.09...",real,"[0.0, 0.17713019431134236, 0.2069230272512046,...","[0.0, -0.6375873588005065, -0.3209753905534684...",0.408705,1.535297,...,169.824356,673.075707,11.367458,38.918436,2.797528,13.425002,0.367001,1.235255,0.140936,0.563218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,Neuropathic,../data/raw/COP_Tests/DNG\participant_20\EBOCT...,EBOCTD,"[0.268740962314687, 0.2971290968116864, 0.3297...","[-0.09398484159438025, -0.006395170271205508, ...",real,"[0.0, 0.7097033624249849, 0.8162385477682388, ...","[0.0, 2.1897417830793686, 1.6750820971971891, ...",0.870562,1.678451,...,446.323302,702.654730,41.990345,36.403881,4.186282,15.464272,0.310305,1.258786,0.535775,0.574260
429,Neuropathic,../data/raw/COP_Tests/DNG\participant_20\EBOCT...,EBOCTD,"[0.22591373093800104, 0.19081937834680884, 0.1...","[-0.009166553970872293, -0.053337505342418545,...",real,"[0.0, -0.877358814779805, -0.694187386040257, ...","[0.0, -1.1042737842886563, -0.819128950549719,...",0.816791,1.736424,...,500.814891,705.528652,45.340831,39.616737,5.003792,14.675227,0.562414,1.137166,0.554092,0.531645
430,Neuropathic,../data/raw/COP_Tests/DNG\participant_20\EBOCT...,EBOCTDSI,"[-0.8156239056766825, -0.7569614031375326, -0....","[-0.36745781660768184, -0.4496811004870598, -0...",real,"[0.0, 1.4665625634787487, 0.497431098301776, -...","[0.0, -2.0555820969844483, -1.8632227024482306...",2.072932,2.248739,...,1055.730600,995.563479,67.426475,65.435659,18.742228,17.299386,2.078291,1.777904,0.862278,0.840202
431,Neuropathic,../data/raw/COP_Tests/DNG\participant_20\EBOCT...,EBOCTDSI,"[1.0270392662965229, 0.7484775808059427, 0.446...","[0.14759914717267453, 0.21324516210345212, 0.2...",real,"[0.0, -6.964042137264503, -7.556758723981416, ...","[0.0, 1.64115037326944, -0.04568575647532214, ...",1.803191,2.329126,...,765.280869,868.766104,40.297005,45.197907,14.358763,19.209475,3.071704,1.442357,0.556908,0.729415


Saving the Processed DataFrame as JSON

In [9]:
# Make sure the output folder exists before writing; a missing parent
# directory would otherwise make the save fail on a fresh checkout.
processed_json_path = '../data/processed/processed_time_series.json'
os.makedirs(os.path.dirname(processed_json_path), exist_ok=True)

# Save the processed DataFrame to a JSON file: one JSON object per row,
# one row per line (orient='records' with lines=True)
timeSeries.to_json(processed_json_path, orient='records', lines=True)

print("Processed data stored as 'processed_time_series.json'")

Processed data stored as 'processed_time_series.json'
