# 2. Metadata Manifest
In this notebook I create a manifest file based on the information from the `.mat` file and the explanatory information given in the repository from Chennu (2016). The purpose of the manifest file is to structure the access to the individual `.set` files in later analysis. I want to combine the following information in the final file:

| Column Name       | Description                                                                 | Type      | Example Value                        |
|--------------------|------------------------------------------------------------------------------|------------|--------------------------------------|
| **Subject**        | Participant identifier derived from the filename.                           | int        | `2`   
| **BaseName**       | Original dataset filename (without extension). Each entry corresponds to one `.set` EEG recording. | string     | `02-2010-anest 20100210 135.003`    |                               |
| **SedationCode**   | Numeric code indicating sedation level: <br>1 = Baseline, 2 = Mild, 3 = Moderate, 4 = Recovery. | int        | `3`                                  |
| **SedationLabel**  | Human-readable sedation label derived from `SedationCode`.                  | string     | `Moderate`                           |
| **Propofol_ugL**   | Propofol plasma concentration (µg/L) measured at that sedation level.       | float      | `689`                                |
| **RT_ms**          | Mean reaction time in milliseconds from a two-choice task.                  | float      | `945`                                |
| **Correct**        | Number of correct responses (max = 40).                                     | int        | `37`                                 |
| **SetPath**        | Absolute or relative file path to the `.set` EEG file.                      | string     | `data/eeg/02-2010-anest 20100210 135.003.set` |


In [1]:
import mne
import scipy
from scipy.io import loadmat
from pathlib import Path 
import re
import math
import pandas as pd

In [None]:
DATA_RAW = Path("data/data_raw")
MAT_PATH = Path("data/data_derivatives/datainfo.mat")

metadata = loadmat(MAT_PATH)
rows = metadata["datainfo"] 

# Helper to turn the MATLAB table into tidy rows
def parse_row(row):
    """
    row[0]: dataset name string (no extension)
    row[1]: sedation code (uint8)
    row[2]: propofol (µg/L)
    row[3]: reaction time (ms)
    row[4]: correct responses (0-40)
    """

    base_name = str(row[0].item())
    sed_code_raw = row[1].squeeze()
    propofol_raw = row[2].squeeze()
    rt_raw = row[3].squeeze()
    correct_raw = row[4].squeeze()

    # handle NaNs
    sedation_code = int(sed_code_raw) if not math.isnan(sed_code_raw) else None
    propofol_ugL = int(propofol_raw) if not math.isnan(propofol_raw) else None
    rt_ms = int(rt_raw) if not math.isnan(rt_raw) else None
    correct = int(correct_raw) if not math.isnan(correct_raw) else None


    # Derive human-readable label
    code2label = {1: "Baseline", 2: "Mild", 3: "Moderate", 4: "Recovery"}
    sedation_label = code2label.get(sedation_code, f"Unknown_{sedation_code}")

    # Subject = first token before the first '-' (e.g., "27" from "27-2010-anest ...") -> CHATGPT
    subj_match = re.match(r"^(\d+)-", base_name)
    subject = subj_match.group(1) if subj_match else "NA"


    set_path = DATA_RAW / f"{base_name}.set"


    return {
        "Subject": subject,
        "BaseName": base_name,
        "SedationCode": sedation_code,
        "SedationLabel": sedation_label,
        "Propofol_ugL": propofol_ugL,
        "RT_ms": rt_ms,
        "Correct": correct,
        "SetPath": str(set_path) if set_path else None,
    }

records = []

for i in range(rows.shape[0]):
    row_data = rows[i, :]       
    parsed = parse_row(row_data) 
    records.append(parsed)       

manifest = pd.DataFrame.from_records(records)


# Save for pipeline
OUT = Path("data/data_derivatives/manifests")
OUT.mkdir(parents=True, exist_ok=True)
manifest_path = OUT / "manifest.csv"
manifest.to_csv(manifest_path, index=False)

manifest.head(10)

Unnamed: 0,Subject,BaseName,SedationCode,SedationLabel,Propofol_ugL,RT_ms,Correct,SetPath
0,2,02-2010-anest 20100210 135.003,1,Baseline,0,903.0,40,data/data_raw/02-2010-anest 20100210 135.003.set
1,2,02-2010-anest 20100210 135.006,2,Mild,204,675.0,39,data/data_raw/02-2010-anest 20100210 135.006.set
2,2,02-2010-anest 20100210 135.014,3,Moderate,506,846.0,39,data/data_raw/02-2010-anest 20100210 135.014.set
3,2,02-2010-anest- 20100210 16.003,4,Recovery,299,739.0,38,data/data_raw/02-2010-anest- 20100210 16.003.set
4,3,03-2010-anest 20100211 142.003,1,Baseline,0,630.0,37,data/data_raw/03-2010-anest 20100211 142.003.set
5,3,03-2010-anest 20100211 142.008,2,Mild,246,637.0,37,data/data_raw/03-2010-anest 20100211 142.008.set
6,3,03-2010-anest 20100211 142.021,3,Moderate,689,945.0,3,data/data_raw/03-2010-anest 20100211 142.021.set
7,3,03-2010-anest 20100211 142.026,4,Recovery,224,669.0,38,data/data_raw/03-2010-anest 20100211 142.026.set
8,5,05-2010-anest 20100223 095.004,1,Baseline,0,855.0,37,data/data_raw/05-2010-anest 20100223 095.004.set
9,5,05-2010-anest 20100223 095.009,2,Mild,525,871.0,37,data/data_raw/05-2010-anest 20100223 095.009.set
