In [67]:
import os
from pyedflib import EdfReader
from datetime import datetime
import csv
import re


def _completed_years(birth_date, reference_date):
    """
    Returns the number of whole calendar years between two dates.

    Dividing a day count by 365 ignores leap days and reports a birthday a few
    days early, so the (month, day) pairs are compared instead.

    Args:
        birth_date (datetime): Date of birth.
        reference_date (datetime): Date at which the age is evaluated.

    Returns:
        int: Completed years at reference_date.
    """
    years = reference_date.year - birth_date.year
    if (reference_date.month, reference_date.day) < (birth_date.month, birth_date.day):
        years -= 1  # Birthday not reached yet in the reference year
    return years


def _read_age_gender(filepath):
    """
    Reads the start date, birthdate and sex from a single EDF file.

    Args:
        filepath (str): Path to the EDF file.

    Returns:
        tuple: (age, gender_str) where age is an int, or None when the file
        has no birthdate, and gender_str is 'M', 'F' or 'Unknown'.

    Raises:
        Exception: Whatever EdfReader or the date parsing raises; the caller
        decides how to report it.
    """
    with EdfReader(filepath) as f:
        experiment_date = f.getStartdatetime()
        birthday = f.getBirthdate()
        gender = f.getSex()  # Get gender from the EDF file

    # Convert gender code to M/F/Unknown representation
    # ***ADJUST THESE CONDITIONS TO MATCH YOUR DATA***
    if gender == "Male":
        gender_str = "M"
    elif gender == "Female":
        gender_str = "F"
    else:
        gender_str = "Unknown"

    if not birthday:
        return None, gender_str

    # Assumes the birthdate arrives as a 'dd Mon YYYY' string - adjust format if necessary
    birth_date = datetime.strptime(birthday, '%d %b %Y')
    return _completed_years(birth_date, experiment_date), gender_str


def extract_age_gender_from_edf(directory, output_csv="./input/age_gender_data.csv"):
    """
    Extracts age and gender from EDF files in the given directory and saves it to a CSV file,
    ensuring samples are processed in order (S01, S02, etc.).

    One row is written per sample. The files of a sample are tried in
    alphabetical order and the first one that can be read supplies the row;
    the row is marked 'Error' only when every file of that sample fails.
    'Age' is left empty when the file has no birthdate.

    Args:
        directory (str): Path to the directory containing EDF files.
        output_csv (str): Path of the CSV file to write. Missing parent
            directories are created. Defaults to the location used previously.
    """

    # Group filenames by numeric sample ID so samples can be visited in order
    files_by_sample = {}
    for filename in os.listdir(directory):
        if not filename.lower().endswith(".edf"):
            continue

        # Extract sample ID from filename using regex (e.g., 'S01')
        match = re.search(r'S(\d+)', filename)
        if not match:
            print(
                f"Warning: Could not extract sample ID from filename: {filename}")
            continue  # Skip to the next file

        files_by_sample.setdefault(int(match.group(1)), []).append(filename)

    age_gender_data = []
    for sample_num in sorted(files_by_sample):
        # Convert back to '01', '02', etc. format
        sample_id = f"{sample_num:02}"

        # Used only if none of the sample's files can be read
        row = {'Sample': f'HS{sample_id}', 'Age': 'Error', 'Gender': 'Error'}

        # Sorted so the file chosen does not depend on os.listdir() ordering
        for filename in sorted(files_by_sample[sample_num]):
            filepath = os.path.join(directory, filename)

            try:
                age, gender_str = _read_age_gender(filepath)
            except Exception as e:
                print(f"Error processing file {filename}: {e}")
                continue  # Another recording of this sample may still be readable

            if age is None:
                print(
                    f"File: {filename}, Birthday not found, Gender: {gender_str}")
            else:
                print(
                    f"File: {filename}, Age: {age}, Gender: {gender_str}")

            # Prepend 'HS'; Age stays None (empty CSV cell) if birthday not found
            row = {'Sample': f'HS{sample_id}', 'Age': age, 'Gender': gender_str}
            break  # The first readable file of the sample is enough

        age_gender_data.append(row)

    # Make sure the target directory exists so the collected rows are not lost
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the age data to a CSV file
    with open(output_csv, 'w', newline='') as csvfile:
        fieldnames = ['Sample', 'Age', 'Gender']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(age_gender_data)

    print(f"Age and gender data saved to {output_csv}")


# Directory that extract_age_gender_from_edf() scans for '.edf' files
edf_directory = "edf_dataset_2"  # Replace with the actual directory path

# Extract age and gender for every sample and save them to ./input/age_gender_data.csv
extract_age_gender_from_edf(edf_directory)

File: H_S01_EC.edf, Age: 24, Gender: M
File: H_S02_EC.edf, Age: 19, Gender: M
File: H_S03_EC.edf, Age: 21, Gender: M
File: H_S04_EC.edf, Age: 35, Gender: M
File: H_S05_EC.edf, Age: 26, Gender: F
File: H_S06_EC.edf, Age: 76, Gender: M
File: H_S07_EC.edf, Age: 21, Gender: M
File: H_S08_EC.edf, Age: 29, Gender: F
File: H_S09_EC.edf, Age: 36, Gender: M
Error processing file H_S10_EC.edf: edf_dataset_2\H_S10_EC.edf: the file is not EDF(+) or BDF(+) compliant (it contains format errors)
File: H_S11_EC.edf, Age: 20, Gender: M
File: H_S12_EO.edf, Age: 24, Gender: M
File: H_S13_EC.edf, Age: 57, Gender: F
File: H_S14_EC.edf, Age: 31, Gender: F
File: H_S15_EC.edf, Age: 51, Gender: F
File: H_S16_EC.edf, Age: 12, Gender: M
Error processing file H_S17_EC.edf: edf_dataset_2\H_S17_EC.edf: the file is not EDF(+) or BDF(+) compliant (it contains format errors)
File: H_S18_EO.edf, Age: 37, Gender: F
File: H_S19_EC.edf, Age: 59, Gender: M
Error processing file H_S20_EC.edf: edf_dataset_2\H_S20_EC.edf: the

In [68]:
import csv
import re


def combine_metadata(metadata_csv, output_csv):
    """
    Converts a metadata CSV into a demographics CSV keyed by sample label.

    Every input row must provide 'File', 'Gender' and 'Age'. The numeric 'File'
    value becomes a zero-padded 'MDDS<nn>' label stored under 'Sample'; 'Gender'
    and 'Age' are copied through unchanged. Problems while reading or writing
    are printed, not raised, and a read problem means nothing is written.

    Args:
        metadata_csv (str): Path to the original metadata CSV file.
        output_csv (str): Path to the output CSV file.
    """

    def _to_output_row(entry):
        # 'File' holds the bare sequential number, e.g. '1' -> 'MDDS01'
        label = f"MDDS{int(entry['File']):02d}"
        gender_value = entry['Gender']
        age_value = entry['Age']
        return {'Sample': label, 'Age': age_value, 'Gender': gender_value}

    try:
        # utf-8-sig strips a leading BOM so the first header is read correctly
        with open(metadata_csv, 'r', newline='', encoding='utf-8-sig') as source:
            converted = [_to_output_row(entry)
                         for entry in csv.DictReader(source)]
    except FileNotFoundError:
        print(f"Error: {metadata_csv} not found.")
        return
    except Exception as e:
        print(f"Error reading {metadata_csv}: {e}")
        return

    # Persist the converted rows
    try:
        with open(output_csv, 'w', newline='') as target:
            out = csv.DictWriter(target, fieldnames=['Sample', 'Gender', 'Age'])
            out.writeheader()
            out.writerows(converted)

        print(f"Combined data saved to {output_csv}")

    except Exception as e:
        print(f"Error writing to {output_csv}: {e}")


# Input and output locations for the MDD metadata conversion
metadata_csv_file = "./input/trueages.csv"  # Source CSV; combine_metadata reads its 'File', 'Gender' and 'Age' columns
output_csv_file = "./input/mdd_demographics.csv"  # Destination CSV with 'Sample', 'Gender', 'Age' columns

# Convert the metadata; read/write failures are printed by combine_metadata rather than raised
combine_metadata(metadata_csv_file, output_csv_file)

Combined data saved to ./input/mdd_demographics.csv


In [69]:
import csv


def combine_and_sort_data(demographics_csv, age_gender_csv, output_csv):
    """
    Merges the MDD and Healthy demographics CSVs into one sorted CSV.

    Each row gains a 'Health' column ('MDD' for rows of demographics_csv,
    'Healthy' for rows of age_gender_csv). The output lists Healthy rows
    first, then MDD rows, each group ordered by its 'Sample' string. Read and
    write problems are printed, not raised; if either input cannot be read
    nothing is written.

    Args:
        demographics_csv (str): Path to the demographics CSV file (MDD data).
        age_gender_csv (str): Path to the age_gender_data CSV file (Healthy data).
        output_csv (str): Path to the output CSV file.
    """

    def _load_labelled(path, label):
        # Returns the labelled rows, or None after reporting a read failure
        try:
            with open(path, 'r', newline='') as handle:
                labelled = []
                for record in csv.DictReader(handle):
                    record['Health'] = label  # Add Health column
                    labelled.append(record)
                return labelled
        except FileNotFoundError:
            print(f"Error: {path} not found.")
        except Exception as e:
            print(f"Error reading {path}: {e}")
        return None

    # MDD file is read first; a failure there stops before the healthy file is opened
    mdd_rows = _load_labelled(demographics_csv, 'MDD')
    if mdd_rows is None:
        return

    healthy_rows = _load_labelled(age_gender_csv, 'Healthy')
    if healthy_rows is None:
        return

    # False sorts before True, so Healthy rows precede MDD rows;
    # ties within a group are resolved by the 'Sample' string
    ordered = sorted(
        healthy_rows + mdd_rows,
        key=lambda record: (record['Health'] == 'MDD', record['Sample']),
    )

    # Write the merged rows with the extra Health column
    try:
        with open(output_csv, 'w', newline='') as target:
            out = csv.DictWriter(
                target, fieldnames=['Sample', 'Gender', 'Age', 'Health'])
            out.writeheader()
            out.writerows(ordered)

        print(f"Combined and sorted data saved to {output_csv}")

    except Exception as e:
        print(f"Error writing to {output_csv}: {e}")


# Inputs written by the two earlier cells, plus the location of the merged output
demographics_csv_file = "./input/mdd_demographics.csv"  # The MDD samples file
age_gender_csv_file = "./input/age_gender_data.csv"  # The Healthy samples file

# NOTE: rebinds output_csv_file, which the previous cell set to ./input/mdd_demographics.csv
output_csv_file = "./input/demographics.csv"  # Name for the new combined CSV

# Merge both cohorts, tag each row with a 'Health' label, and write the sorted result
combine_and_sort_data(demographics_csv_file,
                      age_gender_csv_file, output_csv_file)

Combined and sorted data saved to ./input/demographics.csv
