In [2]:
import os
import pandas as pd
import csv

In [3]:
import os

# Build the "sequence" file: a one-column, tab-separated table (header
# 'Entry') listing every .txt recording found in the filtered dataset folder.
# Later cells read this file back so that all derived datasets share the
# same row order.
directory_path = '../dataset2_filtered/'
output_file = '../dataset/processed_data/Sequence_dataset2.tab'
# Column layout of the recordings. Not needed to list file names; the name is
# kept because it is a notebook-level variable that other cells may read.
cols= ['Frame #', 'Time [s]', 'ACC ML [g]', 'ACC AP [g]', 'ACC SI [g]', 'GYR ML [deg/s]', 'GYR AP [deg/s]', 'GYR SI [deg/s]', 'Freezing event [flag]']

with open(output_file, 'w') as outfile:
    # Header row of the sequence table
    outfile.write('Entry\n')

    # os.listdir() returns entries in arbitrary order, so sort to make the
    # sequence (and every dataset derived from it) reproducible across runs
    # and machines.
    for filename in sorted(os.listdir(directory_path)):
        # Only the .txt recordings belong to the dataset. The file contents
        # are not needed here, so the files are not parsed.
        if filename.endswith('.txt'):
            outfile.write(f'{filename}\n')

In [4]:
import pandas as pd

# Reload the recording order saved in the sequence table so that every
# dataset built below iterates over the files in the same order.
file_path = "../dataset/processed_data/Sequence_dataset2.tab"

# Keep only the 'Entry' column (one file name per row).
df = pd.read_csv(file_path, sep="\t")['Entry']

# Plain Python list of file names, in file order.
filename_sequence = df.tolist()

In [5]:
import os
import pandas as pd
import warnings

# Build the coefficient-of-variation (CV) dataset: one row per recording with
# std/mean of each sensor channel plus a 'Pt'/'Co' label, written as a
# tab-separated file.

# Suppress all warnings for the remainder of the kernel session.
warnings.filterwarnings('ignore')

# Folder containing the filtered recordings
dir_path = '../dataset2_filtered/'

# Sensor channels for which the coefficient of variation is computed
col_names = ['ACC ML [g]', 'ACC AP [g]', 'ACC SI [g]', 'GYR ML [deg/s]', 'GYR AP [deg/s]', 'GYR SI [deg/s]']

# Collect one record per recording and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
records = []

# Iterate in the saved sequence order so rows line up with the other datasets
for file in filename_sequence:
    # The first line of each file is used as the header here
    df = pd.read_csv(os.path.join(dir_path, file), delimiter='\t')

    # Coefficient of variation per channel: standard deviation / mean
    cv = df[col_names].std() / df[col_names].mean()

    # A recording containing at least one flagged frame is labelled 'Pt',
    # otherwise 'Co'
    label = 'Pt' if 1 in df['Freezing event [flag]'].values else 'Co'

    record = cv.to_dict()
    record['label'] = label
    records.append(record)

# Fix the column order explicitly (six channels, then the label); this also
# yields a correctly-shaped empty frame when there are no recordings.
result_df = pd.DataFrame(records, columns=col_names + ['label'])

# save results to file
result_df.to_csv('../dataset/processed_data/Parkinson_CV_dataset2_ensemble.tab', sep='\t', index=False)


In [6]:
import numpy as np
import os
import pandas as pd
import csv

In [7]:
import numpy as np

def transform_into_frequency_domain(time_series_data, n_bins=4000):
    """Return the flattened FFT magnitude spectrum of a multi-channel series.

    The discrete Fourier transform is taken along axis 0 (time). Only the
    first ``N // 2`` bins are kept (``N`` = number of samples), and of those
    at most the first ``n_bins``.

    Parameters
    ----------
    time_series_data : array-like
        Samples along axis 0 and channels along axis 1 (for example a
        DataFrame or 2-D array). A 1-D sequence is treated as a single
        channel.
    n_bins : int, optional
        Maximum number of low-frequency bins to keep per channel
        (k = 0 .. n_bins - 1). Defaults to 4000.

    Returns
    -------
    list of float
        Magnitudes flattened in row-major order, i.e. grouped by frequency
        bin with the channels interleaved:
        ``[bin0_ch0, bin0_ch1, ..., bin1_ch0, bin1_ch1, ...]``.
        The length is ``min(N // 2, n_bins) * n_channels``, so inputs with
        fewer than ``2 * n_bins`` samples give a shorter list.

    Raises
    ------
    ValueError
        If ``n_bins`` is negative.
    """
    if n_bins < 0:
        raise ValueError("n_bins must be non-negative")

    samples = np.asarray(time_series_data)
    # Promote a single 1-D series to one column so the 2-D slicing below works.
    if samples.ndim == 1:
        samples = samples[:, np.newaxis]

    n_samples = samples.shape[0]

    # Discrete Fourier transform of every channel along the time axis
    spectrum = np.fft.fft(samples, axis=0)

    # Keep the modulus of the first half of the spectrum, capped at n_bins
    kept = min(n_samples // 2, n_bins)
    magnitudes = np.abs(spectrum[:kept, :])

    # Flatten to a single feature vector (row-major: bin by bin)
    return magnitudes.reshape(-1).tolist()

In [9]:
import csv
import os
import pandas as pd

# Build the frequency-domain (FD) dataset: one headerless, tab-separated row
# per recording holding its flattened FFT magnitudes followed by a 'Pt'/'Co'
# label.
result_directory ='../dataset/processed_data/'
directory_path = '../dataset2_filtered/'
cols = ['Frame #', 'Time [s]', 'ACC ML [g]', 'ACC AP [g]', 'ACC SI [g]', 'GYR ML [deg/s]', 'GYR AP [deg/s]', 'GYR SI [deg/s]', 'Freezing event [flag]']
flag_col = 'Freezing event [flag]'

# Sensor channels that are transformed into the frequency domain
filtered_cols = ['ACC ML [g]', 'ACC AP [g]', 'ACC SI [g]', 'GYR ML [deg/s]', 'GYR AP [deg/s]', 'GYR SI [deg/s]']


# Not read anywhere in this cell; kept in case another cell references it.
control=0
with open(result_directory+'Parkinson_FD_dataset2_ensemble.tab', 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    # Iterate in the saved sequence order so rows line up with the other datasets
    for filename in filename_sequence:
        file_path = os.path.join(directory_path, filename)
        # Every line is read as data (header=None) under the fixed column
        # names; any non-numeric line, such as a header row, is removed by
        # the coerce + dropna steps below.
        df = pd.read_csv(file_path, delimiter='\t', header=None, names=cols)

        # Coerce the sensor channels to numbers and drop rows that fail to parse
        filtered_df = df[filtered_cols].apply(pd.to_numeric, errors='coerce')
        filtered_df = filtered_df.dropna()

        res = transform_into_frequency_domain(filtered_df) # Pass the actual time series data

        # Compare the flag numerically so the label does not depend on
        # whether pandas parsed the column as text ('1') or as numbers (1).
        # A string-only test silently labels every recording 'Co' when the
        # column is numeric.
        freezing_events = pd.to_numeric(df[flag_col], errors='coerce')
        label = 'Pt' if (freezing_events == 1).any() else 'Co'

        # The label is the last field of the row
        res.append(label)
        writer.writerow(res)



In [10]:
from sklearn.decomposition import PCA

# Reduce the frequency-domain dataset to 6 principal components and save the
# result (components plus label) as a tab-separated file with a header row.
result_file = '../dataset/processed_data/Parkinson_FD_dataset2_ensemble.tab'

# Read in the original DataFrame (the file has no header row)
df = pd.read_csv(result_file, delimiter='\t', header=None)

# Separate the label column (last column) from the feature columns
label_col = df.iloc[:, -1]
df_features = df.iloc[:, :-1]

# Perform PCA on the data to reduce the number of components.
# With the default svd_solver='auto', scikit-learn may choose the randomized
# solver for a wide matrix like this one; fixing random_state makes the
# projected values identical from run to run.
pca = PCA(n_components=6, random_state=0)
df_pca = pd.DataFrame(pca.fit_transform(df_features))

# Add the label column back to the DataFrame
df_pca['label'] = label_col.values

# Print the new shape of the DataFrame: (n_recordings, 6 components + label)
print(df_pca.shape)
df_pca.to_csv('../dataset/processed_data/Parkinson_FD_dataset2_ensemble_pca.tab', sep='\t', index=False)

(71, 7)
