Importing the libraries

In [1]:
import os
import pandas as pd
import json
import os
import struct

In [15]:
# Flatten the per-patient JSON files into a single CSV ("details.csv").
# Every JSON file is assumed to contain one dictionary, which becomes one row.

# Folder containing the JSON files, resolved against the current working directory
folder_name = "patients"
folder_path = os.path.join(os.getcwd(), folder_name)

# One entry per JSON file
all_records = []

# os.listdir() returns names in arbitrary order; sorting them makes the row
# order of the exported CSV reproducible from run to run.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.json'):
        continue  # skip anything that is not a JSON file

    file_path = os.path.join(folder_path, file_name)

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which differs between operating systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_records.append(data)

# Convert the list of dictionaries to a DataFrame (keys become columns)
df = pd.DataFrame(all_records)

# Export the DataFrame to CSV without the positional index
output_csv_path = "details.csv"
df.to_csv(output_csv_path, index=False)

print(f"CSV file has been saved at {output_csv_path}")


CSV file has been saved at details.csv


In [None]:
# Extract the JSON files in the questionnaire folder, along with each question and its answer

In [7]:
# Flatten the questionnaire JSON files into one CSV with a row per
# question/answer item; the file-level metadata is repeated on every row.

# Folder containing the JSON files, resolved against the current working directory
folder_name = "questionnaire"
folder_path = os.path.join(os.getcwd(), folder_name)

# One entry per questionnaire item
all_records = []

# os.listdir() returns names in arbitrary order; sorting them makes the row
# order of the exported CSV reproducible from run to run.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.json'):
        continue  # skip anything that is not a JSON file

    file_path = os.path.join(folder_path, file_name)

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which differs between operating systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Metadata shared by every item of this questionnaire
    # (missing keys fall back to an empty string).
    resource_type = data.get("resource_type", "")
    subject_id = data.get("subject_id", "")
    study_id = data.get("study_id", "")
    questionnaire_id = data.get("id", "")
    questionnaire_name = data.get("questionnaire_name", "")

    # `or []` also covers an "item" key that is present but null, which
    # would otherwise raise a TypeError when iterated.
    for item in data.get("item") or []:
        all_records.append({
            "Resource Type": resource_type,
            "Subject ID": subject_id,
            "Study ID": study_id,
            "Questionnaire ID": questionnaire_id,
            "Questionnaire Name": questionnaire_name,
            "Link ID": item.get("link_id", ""),
            "Question": item.get("text", ""),
            "Answer": item.get("answer", ""),
        })

# Convert the list of records to a DataFrame
df = pd.DataFrame(all_records)

# Export the DataFrame to CSV without the positional index
output_csv_path = "questionnaire_responses.csv"
df.to_csv(output_csv_path, index=False)

print(f"CSV file has been saved at {output_csv_path}")


CSV file has been saved at questionnaire_responses.csv


In [25]:
# Check the first rows of the patient records.
# `df` is rebound by the questionnaire cell above, so on a top-to-bottom run it
# no longer holds the patient data. Reload the patient table from the CSV that
# the patients cell wrote instead of relying on execution order.
# Only empty fields are treated as missing, so literal strings in the data
# are not silently turned into NaN.
pd.read_csv("details.csv", keep_default_na=False, na_values=[""]).head()

Unnamed: 0,resource_type,id,study_id,condition,disease_comment,age_at_diagnosis,age,height,weight,gender,handedness,appearance_in_kinship,appearance_in_first_grade_kinship,effect_of_alcohol_on_tremor
0,patient,1,PADS,Healthy,-,56,56,173,78,male,right,True,True,Unknown
1,patient,2,PADS,Other Movement Disorders,Left-Sided resting tremor and hypokinesia with...,69,81,193,104,male,right,False,,No effect
2,patient,3,PADS,Healthy,-,45,45,170,78,female,right,False,,Unknown
3,patient,4,PADS,Parkinson's,IPS akinetic-rigid type,63,67,161,90,female,right,False,,No effect
4,patient,5,PADS,Parkinson's,IPS tremordominant type,65,75,172,86,male,left,False,,Unknown


In [21]:
# Count the missing values per column of the patient details.
# `df` is rebound by the questionnaire cell above, so on a top-to-bottom run it
# no longer holds the patient data. Reload the patient table from the CSV that
# the patients cell wrote instead of relying on execution order.
# Only empty fields are treated as missing, so literal strings in the data
# do not inflate the null counts.
pd.read_csv("details.csv", keep_default_na=False, na_values=[""]).isna().sum()

resource_type                          0
id                                     0
study_id                               0
condition                              0
disease_comment                        0
age_at_diagnosis                       0
age                                    0
height                                 0
weight                                 0
gender                                 0
handedness                             0
appearance_in_kinship                  0
appearance_in_first_grade_kinship    288
effect_of_alcohol_on_tremor            0
dtype: int64

In [23]:
# List the columns of the patient details.
# `df` is rebound by the questionnaire cell above, so on a top-to-bottom run it
# no longer holds the patient data. Read the header of the CSV that the
# patients cell wrote instead (nrows=0 parses the column names only).
pd.read_csv("details.csv", nrows=0).columns

Index(['resource_type', 'id', 'study_id', 'condition', 'disease_comment',
       'age_at_diagnosis', 'age', 'height', 'weight', 'gender', 'handedness',
       'appearance_in_kinship', 'appearance_in_first_grade_kinship',
       'effect_of_alcohol_on_tremor'],
      dtype='object')

In [None]:
# Extract the JSON files in the movement folder and the corresponding timeseries text files for the channels

In [3]:
# Flatten the movement recordings into one CSV: every sample line of every
# referenced timeseries file becomes one row, prefixed with the metadata of
# the JSON record, session and device record it belongs to.

# Base directories
base_folder = "movement"
# NOTE(review): not used below -- timeseries paths are built from base_folder
# plus the "file_name" stored in the JSON, which presumably already contains
# the sub-folder. Kept so later cells can still reference it; confirm.
timeseries_folder = os.path.join(base_folder, "timeseries")
output_csv = "movement_data_flattened.csv"

# Initialize a list for storing processed data (one dict per sample line)
data = []

# os.listdir() returns names in arbitrary order; sorting them makes the row
# order of the exported CSV reproducible from run to run.
for file_name in sorted(os.listdir(base_folder)):
    if not file_name.endswith(".json"):
        continue

    json_path = os.path.join(base_folder, file_name)

    # Decode explicitly as UTF-8 rather than with the locale default, and close
    # the JSON file before the (much longer) timeseries processing starts.
    with open(json_path, "r", encoding="utf-8") as json_file:
        record = json.load(json_file)

    # Metadata shared by every row that originates from this JSON file
    subject_id = record.get("subject_id")
    study_id = record.get("study_id")
    device_id = record.get("device_id")
    record_id = record.get("id")

    # `or []` also covers keys that are present but null
    for session in record.get("session") or []:
        record_name = session.get("record_name")
        rows = session.get("rows")

        for rec in session.get("records") or []:
            device_location = rec.get("device_location")
            channels = rec.get("channels") or []
            units = rec.get("units")  # read but not part of the flattened output
            # Separate name so the outer loop variable `file_name` is not shadowed
            ts_file_name = rec.get("file_name")

            if not ts_file_name:
                # Without a file name no path can be built; report and move on,
                # consistent with how missing files are handled below.
                print(f"No timeseries file name in record of {json_path}")
                continue

            # Construct the path to the timeseries file
            timeseries_path = os.path.join(base_folder, ts_file_name)

            if not os.path.exists(timeseries_path):
                print(f"File not found: {timeseries_path}")
                continue

            # Metadata columns, copied into every sample row of this file
            metadata = {
                "subject_id": subject_id,
                "study_id": study_id,
                "device_id": device_id,
                "record_id": record_id,
                "record_name": record_name,
                "rows": rows,
                "device_location": device_location,
            }

            with open(timeseries_path, "r", encoding="utf-8") as ts_file:
                for line in ts_file:
                    line = line.strip()
                    if not line:
                        # A blank (e.g. trailing) line would make float("") raise
                        continue

                    row_data = dict(metadata)

                    # Name each value after its channel; values beyond the
                    # declared channels get a positional fallback name.
                    for i, value in enumerate(line.split(",")):
                        channel_name = channels[i] if i < len(channels) else f"channel_{i}"
                        row_data[channel_name] = float(value)

                    data.append(row_data)

# Create a DataFrame and save it to CSV
df = pd.DataFrame(data)
df.to_csv(output_csv, index=False)

print(f"Processed data saved to {output_csv}.")


Processed data saved to movement_data_flattened.csv.


In [5]:
# Preview the first five rows of the flattened movement data
df.iloc[:5]

Unnamed: 0,subject_id,study_id,device_id,record_id,record_name,rows,device_location,Time,Accelerometer_X,Accelerometer_Y,Accelerometer_Z,Gyroscope_X,Gyroscope_Y,Gyroscope_Z
0,1,PADS,Apple Watch Series 4,Neurological Assessment,Relaxed,2048,LeftWrist,0.0,-0.003958,0.00236,0.00174612,-0.015492,0.005616,0.001034
1,1,PADS,Apple Watch Series 4,Neurological Assessment,Relaxed,2048,LeftWrist,0.009903,-0.004005,0.002259,0.0008355394,-0.012276,0.005605,-0.003218
2,1,PADS,Apple Watch Series 4,Neurological Assessment,Relaxed,2048,LeftWrist,0.019901,0.000833,0.002225,0.0009437218,-0.009051,0.002432,-0.002137
3,1,PADS,Apple Watch Series 4,Neurological Assessment,Relaxed,2048,LeftWrist,0.029907,-0.000191,0.004109,4.783e-07,-0.008911,0.002154,-0.000931
4,1,PADS,Apple Watch Series 4,Neurological Assessment,Relaxed,2048,LeftWrist,0.039984,0.001769,0.004064,-0.0019024,-0.00783,3.4e-05,0.000145


In [7]:
# (number of rows, number of columns) of the flattened movement data
(len(df.index), len(df.columns))

(13447168, 14)

In [9]:
# Preview the last five rows of the flattened movement data
df.iloc[-5:]

Unnamed: 0,subject_id,study_id,device_id,record_id,record_name,rows,device_location,Time,Accelerometer_X,Accelerometer_Y,Accelerometer_Z,Gyroscope_X,Gyroscope_Y,Gyroscope_Z
13447163,469,PADS,Apple Watch Series 4,Neurological Assessment,Entrainment,2048,RightWrist,20.567547,0.001274,0.016558,0.040977,-0.140257,-0.086629,-0.08038
13447164,469,PADS,Apple Watch Series 4,Neurological Assessment,Entrainment,2048,RightWrist,20.577599,0.003401,0.020048,0.028026,-0.128313,-0.106679,-0.07933
13447165,469,PADS,Apple Watch Series 4,Neurological Assessment,Entrainment,2048,RightWrist,20.587748,0.007735,0.018754,0.014137,-0.121611,-0.136325,-0.078285
13447166,469,PADS,Apple Watch Series 4,Neurological Assessment,Entrainment,2048,RightWrist,20.597736,0.006451,0.024316,0.002172,-0.119266,-0.157518,-0.077235
13447167,469,PADS,Apple Watch Series 4,Neurological Assessment,Entrainment,2048,RightWrist,20.607882,0.003335,0.022134,-0.003934,-0.117109,-0.159601,-0.074051


In [11]:
# Number of sample rows recorded per device location
df.loc[:, "device_location"].value_counts()

device_location
LeftWrist     6723584
RightWrist    6723584
Name: count, dtype: int64