## Data Processing

### Workflow
- Iterates through all JSON files in the movement/ folder.
- Extracts metadata like participant ID, session name, and device location.
- Finds and loads corresponding .txt files from movement/timeseries/.
- Merges all sensor data into a single structured DataFrame for analysis.
- Saves the processed data as a CSV for machine learning and visualization.

### Import libraries

In [9]:
# import libraries
import os
import json
import pandas as pd
from glob import glob

### Load and parse JSON files, then read time-series data from .txt files

In [10]:
# define function to load and parse JSON file for movement data and patient information data
def load_json(json_file):
    """Load and parse a JSON file.

    Args:
        json_file: Path to the JSON file to read.

    Returns:
        The decoded JSON content (whatever ``json.load`` produces for the
        file, e.g. a dict for a JSON object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform locale, which can fail or mis-decode non-ASCII text on some systems.
    with open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)

In [11]:
# define function to read the corresponding time-series data from a .txt file
def load_txt_data(txt_file):
    """Read a headerless, comma-separated sensor file into a labelled DataFrame.

    Args:
        txt_file: Path to the .txt file to read.

    Returns:
        A DataFrame whose seven columns are named Time, Accel_X, Accel_Y,
        Accel_Z, Gyro_X, Gyro_Y and Gyro_Z, or ``None`` when the file cannot
        be read or does not have exactly seven columns (the error is printed).
    """
    sensor_columns = ["Time", "Accel_X", "Accel_Y", "Accel_Z", "Gyro_X", "Gyro_Y", "Gyro_Z"]
    try:
        readings = pd.read_csv(txt_file, header=None)
        # Renaming stays inside the try block so a column-count mismatch is
        # reported like any other read failure instead of propagating.
        readings.columns = sensor_columns
    except Exception as err:
        print(f"Error reading {txt_file}: {err}")
        return None
    return readings

In [12]:

# function to process all observation JSON and TXT files while integrating patient metadata
def _load_patient_metadata(patient_folder):
    """Return ``{patient id: metadata fields}`` built from every ``*.json`` file in ``patient_folder``."""
    patient_metadata = {}
    # glob() yields files in arbitrary order; sorting makes the result reproducible
    # (if two files ever shared an id, the same one would always win).
    for patient_file in sorted(glob(os.path.join(patient_folder, "*.json"))):
        metadata = load_json(patient_file)
        patient_metadata[metadata["id"]] = {
            "Age": metadata["age"],
            "Height": metadata["height"],
            "Weight": metadata["weight"],
            "Gender": metadata["gender"],
            "Condition": metadata["condition"],
            "Age_At_Diagnosis": metadata.get("age_at_diagnosis"),  # optional key -> None when absent
            "Handedness": metadata["handedness"],
        }
    return patient_metadata


def process_all_observations(observation_folder, txt_folder, patient_folder):
    """Parse observation JSON metadata, load associated time-series files, and merge patient metadata.

    Every ``*.json`` file directly inside ``observation_folder`` is read. For
    each record of each session, the record's ``file_name`` (last path
    component only) is looked up in ``txt_folder`` and loaded with
    ``load_txt_data``. The resulting rows are tagged with ``Patient_id``,
    ``Task`` (the session's ``record_name``), ``Device_Location`` and the
    patient's metadata columns.

    Files are processed in sorted path order so the row order of the result
    is reproducible across runs and machines.

    Args:
        observation_folder: Folder containing the observation JSON files.
        txt_folder: Folder containing the time-series .txt files.
        patient_folder: Folder containing the patient metadata JSON files.

    Returns:
        A single DataFrame with all loaded recordings concatenated, or
        ``None`` (after printing a message) when nothing could be loaded.

    Raises:
        KeyError: If a JSON file lacks one of the required keys.
    """
    patient_metadata = _load_patient_metadata(patient_folder)
    combined_data = []

    # sorted(): glob order is OS-dependent, which would otherwise make the
    # row order of the merged frame differ between environments.
    for observation_file in sorted(glob(os.path.join(observation_folder, "*.json"))):
        metadata = load_json(observation_file)
        patient_id = metadata["subject_id"]

        # ensure patient metadata corresponds to the current observation's participant
        if patient_id not in patient_metadata:
            print(f"Warning: No patient metadata found for patient_id {patient_id}. Skipping.")
            continue

        patient_info = patient_metadata[patient_id]

        for session in metadata["session"]:
            task = session["record_name"]
            for record in session["records"]:
                device_location = record["device_location"]
                # keep only the file name; the folder comes from txt_folder
                file_name = record["file_name"].split("/")[-1]
                txt_file_path = os.path.join(txt_folder, file_name)

                if not os.path.exists(txt_file_path):
                    print(f"File {txt_file_path} not found.")
                    continue

                df = load_txt_data(txt_file_path)
                if df is None:
                    # load_txt_data already reported the problem
                    continue

                df["Patient_id"] = patient_id
                df["Task"] = task
                df["Device_Location"] = device_location

                # broadcast the patient's metadata onto every row of this recording
                for key, value in patient_info.items():
                    df[key] = value

                combined_data.append(df)

    if not combined_data:
        print("No valid data files found.")
        return None

    # Combine all observations into a single DataFrame
    return pd.concat(combined_data, ignore_index=True)

# Folder paths, relative to the current working directory
# (update these to the actual dataset locations)
observation_folder = "../pads-dataset/movement/"  # observation JSON files (*.json)
txt_folder = "../pads-dataset/movement/timeseries/"  # time-series .txt files named by the observations
patient_folder = "../pads-dataset/patients/"  # patient metadata JSON files (*.json)

# Build the merged DataFrame; pd_df is None when no time-series file could be loaded
pd_df = process_all_observations(observation_folder, txt_folder, patient_folder)



In [13]:
# save the merged dataset
if pd_df is not None:
    # to_csv does not create missing parent folders, so make sure the
    # output directory exists before writing.
    os.makedirs("../processed-data", exist_ok=True)
    pd_df.to_csv("../processed-data/parkinsons-data.csv", index=False)
    print("Data processing complete. Saved as parkinsons-data.csv")
else:
    # process_all_observations returned None: no time-series file was loaded
    print("Error: Could not merge datasets due to missing files.")

Data processing complete. Saved as parkinsons-data.csv


In [14]:
# Display the first 10 rows of the processed data
# (requires pd_df to be a DataFrame; it is None when no data was loaded)
pd_df.head(10)

Unnamed: 0,Time,Accel_X,Accel_Y,Accel_Z,Gyro_X,Gyro_Y,Gyro_Z,Patient_id,Task,Device_Location,Age,Height,Weight,Gender,Condition,Age_At_Diagnosis,Handedness
0,0.0,0.00667,-0.002256,0.002058,-0.008716,-0.009256,-0.006805,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
1,0.009899,0.005264,-0.001595,0.001769,-0.008172,-0.009983,-0.009679,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
2,0.019915,0.004956,-0.002175,0.000285,-0.005393,-0.011929,-0.010095,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
3,0.03002,0.002007,-9.6e-05,0.001867,-0.007201,-0.009405,-0.012129,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
4,0.040001,3e-06,-0.00157,0.002561,-0.008943,-0.00875,-0.013045,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
5,0.050035,-0.001642,-0.001632,0.000432,-0.014358,-0.002504,-0.005161,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
6,0.061798,-0.001287,-0.001137,0.001092,-0.015129,-0.005307,-0.005009,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
7,0.070397,-0.001198,-0.002115,0.001643,-0.019401,-0.003047,-0.001175,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
8,0.080331,-0.002781,-0.000454,0.001215,-0.016535,-0.001563,0.000376,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right
9,0.090531,-0.000313,0.002764,-9.9e-05,-0.014588,-0.002419,0.001318,356,Relaxed,LeftWrist,76,176,106,male,Parkinson's,73,right


#### Extract Sample Data

In [15]:
# Build a small sample of the merged data.
# Rows are ordered by patient, task and device location, then by Time.
# groupby(...).head(20) then keeps the first 20 rows of every
# (Patient_id, Task, Device_Location, Gender) group in that order.
df = (
    pd_df
    .sort_values(by=["Patient_id", "Task", "Device_Location", "Time"])
    .groupby(["Patient_id", "Task", "Device_Location", "Gender"], group_keys=False)
    .head(20)
)

# write the sample to disk
df.to_csv("../processed-data/extracted-data.csv", index=False)

##### Next steps
- Clean the data.