In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this notebook
# live under /content/drive/MyDrive. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install mne

Collecting mne
  Downloading mne-1.9.0-py3-none-any.whl.metadata (20 kB)
Downloading mne-1.9.0-py3-none-any.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mne
Successfully installed mne-1.9.0


In [None]:
import os
import mne
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [None]:
# Drive folders that hold the High_AHI and Low_AHI recordings.
high_ahi_path = "/content/drive/MyDrive/SleepApnea/Data/High_AHI"
low_ahi_path = "/content/drive/MyDrive/SleepApnea/Data/Low_AHI"

# Event list (TXT) and recording (EDF) of one example patient from the High_AHI folder.
one_patient_txt, one_patient_edf = (
    os.path.join(high_ahi_path, file_name)
    for file_name in ('OSAA10002 epoch and event list.txt', 'OSAA10002_New.edf')
)

In [None]:
def read_txt_file(txt_path, start_line=13):
    """Read an "epoch and event list" TXT export into a table.

    The file is decoded as latin-1. The recording start is taken from the second
    tab-separated field of the fifth line (index 4) and parsed with the format
    "%Y-%m-%d %I:%M:%S %p". The table is built from the lines at index
    `start_line` and after: blank lines are skipped, every remaining line is
    split on tabs, the first of them supplies the column names and the rest
    become rows. All cell values are kept as strings.

    Args:
        txt_path: Path of the TXT file.
        start_line: Zero-based index of the line that holds the column header.

    Returns:
        (df, txt_start): the event table as a DataFrame and the start time as a
        `datetime`.
    """
    with open(txt_path, 'r', encoding='latin-1') as handle:
        raw_lines = handle.readlines()

    # Header section: the start timestamp is the second field of line index 4.
    start_field = raw_lines[4].strip().split('\t')[1]
    txt_start = datetime.strptime(start_field, "%Y-%m-%d %I:%M:%S %p")

    # Table section: keep only non-blank lines, one list of fields per line.
    rows = []
    for raw_line in raw_lines[start_line:]:
        stripped = raw_line.strip()
        if stripped:
            rows.append(stripped.split('\t'))

    table = pd.DataFrame(rows)
    table.columns = table.iloc[0]          # promote the first row to the header
    table = table[1:].reset_index(drop=True)

    return table, txt_start

# Smoke test: parse the example patient's event list, then print the start time
# and the first rows of the resulting table.
df, txt_start = read_txt_file(one_patient_txt)
print("TXT start time:", txt_start)
print(df.head())


TXT start time: 2016-08-15 22:14:53
0 Epoch                              Event   Start Time Duration
0     1   Bad Data (Channel SpO2  (Amp 1))  10:14:53 PM    28.94
1     1  Bad Data (Channel Pulse  (Amp 1))  10:14:53 PM    28.94
2     9                        Eyes Closed  10:19:05 PM     None
3    10                          Eyes Open  10:19:29 PM     None
4    11                    Eyes Left/Right  10:20:03 PM     None


In [None]:
from datetime import datetime, timedelta

def add_seconds_since_start(df, start_time):
    """Turn clock-time event starts into absolute timestamps and second offsets.

    `df['Start Time']` holds times of day such as "10:14:53 PM" with no date.
    Each one is placed on the calendar date of `start_time`; a time that would
    fall before `start_time` is moved to the following day (recording ran past
    midnight). The offset from `start_time` is stored in seconds.

    The frame is modified in place and also returned. It gains the columns
    'time_only' (datetime.time) and 'seconds_since_start' (float), and
    'Start Time' is replaced by full timestamps. The first rows of the result
    are printed.

    Args:
        df: Event table with at least the columns 'Event' and 'Start Time'.
        start_time: Recording start (datetime) that offsets are measured from.

    Returns:
        The same DataFrame object, with the columns described above.

    Raises:
        ValueError: If any 'Start Time' value does not match "%I:%M:%S %p".
    """
    # Use the start passed by the caller. Reading a notebook-level global here
    # would apply one patient's start time to every other patient.
    start = pd.Timestamp(start_time)

    clock = pd.to_datetime(df['Start Time'], format="%I:%M:%S %p", errors='coerce')
    unparsed = clock.isna()
    if unparsed.any():
        # Fail with a readable message instead of letting NaT reach later steps.
        raise ValueError(
            f"{int(unparsed.sum())} row(s) have a 'Start Time' that does not match "
            f"'%I:%M:%S %p': {df.loc[unparsed, 'Start Time'].head().tolist()}"
        )

    df['time_only'] = clock.dt.time

    # Time elapsed since midnight, re-attached to the date of the recording start.
    absolute = (clock - clock.dt.normalize()) + start.normalize()
    # Anything earlier than the start must belong to the next calendar day.
    absolute = absolute.where(absolute >= start, absolute + pd.Timedelta(days=1))

    df['Start Time'] = absolute
    df['seconds_since_start'] = (absolute - start).dt.total_seconds()
    print(df[['Event', 'Start Time', 'seconds_since_start']].head())

    return df


For each EDF under `High_AHI/`:

1. **Extract `patient_id` from the filename** (e.g. `"OSAA10002_New.edf"` → `"OSAA10002"`).  
2. **Load raw data via MNE**:  
   - `sfreq` is the sampling frequency (e.g. 256 Hz).  
   - `signal` is a NumPy array of shape `(n_channels, n_samples)`.  
3. **Read that patient’s TXT and convert to `seconds_since_start`**.  
4. **Build an `event_list`** = a Python list of tuples `(start_sec, end_sec, event_label)`.  
   - For each `row` in `ev_df`, `start_sec = row["seconds_since_start"]`.  
   - `end_sec = start_sec + Duration` (or `start_sec + window_size` if Duration missing).  
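5. **Slide a 10-second window with an 8-second stride** (i.e. consecutive windows overlap by 2 s; in the code `step_size = 2` is this overlap, so the stride is `window_size - step_size`).  
   - `start_times = np.arange(0, total_sec - window_size + 1, window_size - step_size)`.  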
   - For each `start_sec` in that array, set `end_sec = start_sec + window_size` and compute the sample indices `start_sample = start_sec * sfreq` and `end_sample = end_sec * sfreq`.  
   - Extract `segment = signal[:, start_sample:end_sample]`: at 256 Hz this is a `(n_channels × 2560)` array.  
   - Save it as `patientID_0001.npy`, `patientID_0002.npy`, … under `ProcessedNpy/`.  
6. **Assign each window an overlapping event label**:  
   - Skip any event whose `(end_sec - start_sec) ≥ 29 s` (these are sleep-stage annotations, which overlap with the other events and are in most cases too long to be useful as a window label).  
   - Compute overlap with each remaining event:  
     ```
     overlap_start = max(window_start, event_start)
     overlap_end   = min(window_end,   event_end)
     overlap = max(0, overlap_end - overlap_start)
     if overlap > 0: collect (event_label, overlap)
     ```  
   - Choose the event with the **maximum overlap**; if none overlap, label = `"None"`.  
   - Append that string into `event_array`.  

In [None]:
import numpy as np
import pandas as pd
import mne
import glob
import os

# === Parameters ===
window_size = 10     # window length in seconds
step_size = 2        # seconds shared by consecutive windows (the overlap, NOT the shift)
window_stride = window_size - step_size  # seconds between consecutive window starts
max_event_duration = 29  # events lasting at least this many seconds are never used as labels
npy_path = "/content/drive/MyDrive/SleepApnea/ProcessedNpy"  # set your target folder
os.makedirs(npy_path, exist_ok=True)

# === Main Processing ===
# For every EDF in the High_AHI folder: cut the recording into fixed-length windows,
# save each window as <patient_id>_<index>.npy, and write one event label per window
# (in window order) to <patient_id>_event_array.txt.
high_ahi_path = "/content/drive/MyDrive/SleepApnea/Data/High_AHI"
edf_files = glob.glob(os.path.join(high_ahi_path, "*.edf"))

for edf_path in edf_files:
    basename = os.path.basename(edf_path)
    patient_id = basename.split("_")[0]  # text before the first underscore
    print(f"Processing {patient_id}...")

    # === Load EDF ===
    raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
    # Round rather than truncate so a rate stored as e.g. 255.999... becomes 256.
    # NOTE(review): a genuinely fractional sampling rate is still approximated here.
    sfreq = int(round(raw.info['sfreq']))
    signal = raw.get_data()
    channels = raw.ch_names
    n_samples = signal.shape[1]
    total_sec = n_samples / sfreq

    # === Load Event TXT ===
    txt_path = os.path.join(high_ahi_path, f"{patient_id} epoch and event list.txt")
    ev_df, ev_start_time = read_txt_file(txt_path, start_line=13)
    ev_df = add_seconds_since_start(ev_df, ev_start_time)

    # === Build event list
    # Durations arrive as text; anything missing or non-numeric becomes NaN and the
    # event is then given the window length as its duration.
    durations = pd.to_numeric(ev_df['Duration'], errors='coerce')
    event_list = []
    for start, duration, label in zip(ev_df['seconds_since_start'], durations, ev_df['Event']):
        if pd.isna(start):
            # Without a start time the event cannot be placed on the timeline.
            continue
        end = start + (duration if pd.notna(duration) else window_size)
        event_list.append((start, end, label))

    # The duration filter does not depend on the window, so apply it once up front.
    label_candidates = [ev for ev in event_list if ev[1] - ev[0] < max_event_duration]

    # === Sliding window
    start_times = np.arange(0, total_sec - window_size + 1, window_stride)
    event_array = []

    for idx, start_sec in enumerate(start_times):
        end_sec = start_sec + window_size
        start_sample = int(start_sec * sfreq)
        end_sample = int(end_sec * sfreq)

        # The "+ 1" above can admit one last window that runs past the end of the
        # recording; stop there so every saved segment has the same length.
        if end_sample > n_samples:
            break

        # === Extract signal ===
        segment = signal[:, start_sample:end_sample]
        filename = os.path.join(npy_path, f"{patient_id}_{idx+1:04d}.npy")
        # Saved as a pickled dict: read back with np.load(..., allow_pickle=True).item().
        np.save(filename, {'signal': segment, 'channels': channels}, allow_pickle=True)

        # === Assign overlapping event
        # Keep the candidate with the largest overlap; on ties the earliest listed
        # event wins; "None" when nothing overlaps the window.
        matched, best_overlap = "None", 0
        for ev_start, ev_end, ev_label in label_candidates:
            overlap = min(end_sec, ev_end) - max(start_sec, ev_start)
            if overlap > best_overlap:
                matched, best_overlap = ev_label, overlap

        event_array.append(matched)

    # === Save events as TXT file
    event_txt_path = os.path.join(npy_path, f"{patient_id}_event_array.txt")
    with open(event_txt_path, "w") as f:
        for label in event_array:
            f.write(f"{label}\n")

    print(f"Saved {len(event_array)} samples and events for {patient_id}.")


Processing OSAA10006...
0                         Event          Start Time  seconds_since_start
0                   Eyes Closed 2016-08-15 23:10:47               3354.0
1                     Eyes Open 2016-08-15 23:11:32               3399.0
2               Eyes Left/Right 2016-08-15 23:12:38               3465.0
3                    Eye Blinks 2016-08-15 23:12:49               3476.0
4  Teeth Grind, Yawn or Swallow 2016-08-15 23:13:08               3495.0
Saved 3129 samples and events for OSAA10006.
Processing OSAA10002...
0                              Event          Start Time  seconds_since_start
0   Bad Data (Channel SpO2  (Amp 1)) 2016-08-15 22:14:53                  0.0
1  Bad Data (Channel Pulse  (Amp 1)) 2016-08-15 22:14:53                  0.0
2                        Eyes Closed 2016-08-15 22:19:05                252.0
3                          Eyes Open 2016-08-15 22:19:29                276.0
4                    Eyes Left/Right 2016-08-15 22:20:03                310.0
S

In [None]:
import os
import glob

def save_labels(apath, step_size=8, forecast_seconds=20):
    """Write numeric per-window labels next to every *_event_array.txt in `apath`.

    Each `<patient_id>_event_array.txt` holds one event name per sliding-window
    index. For each such file a `<patient_id>_labels.txt` is written with one
    integer per window, in the same order:

      1  = window whose event name contains "apnea" or "hypopnea" (case-insensitive)
      0  = window `forecast_steps` windows before such a window, unless it is
           itself a (hypo)apnea window
     -1  = everything else

    Args:
        apath: Folder containing the *_event_array.txt files; the label files
            are written to the same folder.
        step_size: Seconds by which consecutive windows are shifted.
        forecast_seconds: How far ahead to forecast, in seconds. It is converted
            to a number of windows with int(forecast_seconds / step_size).

    Returns:
        None. One line is printed per file written.
    """
    forecast_steps = int(forecast_seconds / step_size)

    # Sorted so the files are processed in a reproducible order.
    event_txt_files = sorted(glob.glob(os.path.join(apath, "*_event_array.txt")))
    for txt_file in event_txt_files:
        patient_id = os.path.basename(txt_file).split("_event_array.txt")[0]

        # Load event names (one per window, exactly in index order)
        with open(txt_file, "r") as f:
            event_array = [line.strip() for line in f if line.strip()]

        # Pass 1: find every (hypo)apnea window.
        is_event = [
            "apnea" in ev.lower() or "hypopnea" in ev.lower()
            for ev in event_array
        ]
        label_array = [1 if hit else -1 for hit in is_event]

        # Pass 2: mark the forecast window of each event. An event window is never
        # downgraded to 0; otherwise runs of consecutive event windows would erase
        # their own positive labels.
        for i, hit in enumerate(is_event):
            target_idx = i - forecast_steps
            if hit and target_idx >= 0 and not is_event[target_idx]:
                label_array[target_idx] = 0

        # Save the numeric labels to patient_id_labels.txt
        out_path = os.path.join(apath, f"{patient_id}_labels.txt")
        with open(out_path, "w") as out:
            for lbl in label_array:
                out.write(f"{lbl}\n")

        print(f"Saved labels for {patient_id} → {out_path}")



In [None]:
# Pass the real window shift (window_size - step_size, i.e. 8 s with the settings
# above) so the forecast horizon stays correct if the window parameters change.
save_labels(npy_path, step_size=window_size - step_size)