In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import glob
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, classification_report

In [2]:
MAX_LEN = 2001 # Number of time points to use from each CSV
SIGNAL_COL = 'F3_1000Hz'
SENSOR_DIR = "C:\\DumbStuff\\epf study\\Meta-Elasto\\els\\meta\\Elastography_rawdata\\oldcode\\"

In [3]:
df_pre = pd.read_csv("label-pre.csv")
df_post = pd.read_csv("label-post.csv")
df_fup = pd.read_csv("label-fup.csv")

df_pre['Time'] = 'PRE'
df_post['Time'] = 'POST'
df_fup['Time'] = 'FOLLOWUP'

df_clinical = pd.concat([df_pre, df_post, df_fup]).reset_index(drop=True)

df_mapping = pd.read_excel("meta/measured_with_elastograph_patients.xlsx")
df_mapping.columns = df_mapping.columns.str.strip()

df_master = pd.merge(df_mapping, df_clinical, on=['ID', 'Time'], how='inner')
df_master = df_master.dropna(subset=['Target_Metabolic_Disease'])

In [4]:
class MultiModalDataset(Dataset):
    def __init__(self, dataframe, sensor_dir, transform_meta=None):
        self.df = dataframe
        self.sensor_dir = sensor_dir
        self.transform_meta = transform_meta

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        p_num = str(int(row['N_PACIENT'])).zfill(4)
        
        # Load Signal (Pick the first file found for this patient number)
        # In a more advanced version, you could load ALL reps/angles as separate samples
        file_path = glob.glob(os.path.join(self.sensor_dir, f"in_test_{p_num}_*.csv"))[0]
        sig_df = pd.read_csv(file_path)
        signal = sig_df[SIGNAL_COL].fillna(0).values[:MAX_LEN]
        
        # Pad signal if it's too short
        if len(signal) < MAX_LEN:
            signal = np.pad(signal, (0, MAX_LEN - len(signal)), 'constant')
        
        # Metadata Features
        meta_feats = np.array([row['Sex'], row['Age'], row['Waist_Circum_mean']], dtype=np.float32)
        
        label = int(row['Target_Metabolic_Disease'])
        
        return (torch.tensor(signal, dtype=torch.float32).unsqueeze(0), # [1, 2000]
                torch.tensor(meta_feats, dtype=torch.float32), 
                torch.tensor(label, dtype=torch.long))

# Multi Modal CNN Architecture

In [5]:
class MultiModalCNN(nn.Module):
    def __init__(self, num_meta_features):
        super(MultiModalCNN, self).__init__()
        
        # --- Branch 1: The Signal Brain (1D CNN) ---
        self.signal_branch = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=7, padding=3),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(4), # Reduces 2000 -> 500
            
            nn.Conv1d(32, 64, kernel_size=5, padding=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(4), # Reduces 500 -> 125
            
            nn.AdaptiveAvgPool1d(1) # Flatten to [64]
        )
        
        # --- Branch 2: Metadata branch ---
        self.meta_branch = nn.Sequential(
            nn.Linear(num_meta_features, 16),
            nn.ReLU()
        )
        
        # --- Branch 3: Fusion ---
        # CNN output (64) + Meta output (16) = 80
        self.classifier = nn.Sequential(
            nn.Linear(64 + 16, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 2) # Binary: Healthy vs Disease
        )

    def forward(self, signal, meta):
        sig_out = self.signal_branch(signal).squeeze(-1)
        meta_out = self.meta_branch(meta)
        
        # Concatenate!
        combined = torch.cat((sig_out, meta_out), dim=1)
        return self.classifier(combined)

# Training

In [6]:
# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gkf = GroupKFold(n_splits=5)
groups = df_master['ID'].values
X_indices = np.arange(len(df_master))
Y_labels = df_master['Target_Metabolic_Disease'].values

fold_results = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X_indices, Y_labels, groups)):
    print(f"\n--- Training Fold {fold+1} ---")
    
    train_ds = MultiModalDataset(df_master.iloc[train_idx], SENSOR_DIR)
    val_ds = MultiModalDataset(df_master.iloc[val_idx], SENSOR_DIR)
    
    train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=16)
    
    model = MultiModalCNN(num_meta_features=3).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Simple training loop
    for epoch in range(20):
        model.train()
        for sig, meta, lbl in train_loader:
            sig, meta, lbl = sig.to(device), meta.to(device), lbl.to(device)
            optimizer.zero_grad()
            outputs = model(sig, meta)
            loss = criterion(outputs, lbl)
            loss.backward()
            optimizer.step()
            
    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sig, meta, lbl in val_loader:
            sig, meta, lbl = sig.to(device), meta.to(device), lbl.to(device)
            outputs = model(sig, meta)
            _, predicted = torch.max(outputs.data, 1)
            total += lbl.size(0)
            correct += (predicted == lbl).sum().item()
            
    acc = 100 * correct / total
    fold_results.append(acc)
    print(f"Fold {fold+1} Accuracy: {acc:.2f}%")

print(f"\nAverage Multi-Modal Accuracy: {np.mean(fold_results):.2f}%")


--- Training Fold 1 ---
Fold 1 Accuracy: 44.44%

--- Training Fold 2 ---
Fold 2 Accuracy: 40.74%

--- Training Fold 3 ---
Fold 3 Accuracy: 44.44%

--- Training Fold 4 ---
Fold 4 Accuracy: 37.04%

--- Training Fold 5 ---
Fold 5 Accuracy: 57.69%

Average Multi-Modal Accuracy: 44.87%


# New I guess

In [10]:
SENSOR_DIR = "C:\\DumbStuff\\epf study\\Meta-Elasto\\els\\meta\\Elastography_rawdata\\oldcode\\"
MAX_LEN = 2000  # Number of data points per signal

# Load Clinical Labels
df_pre = pd.read_csv("label-pre.csv")
df_pre['Time'] = 'PRE'

df_post = pd.read_csv("label-post.csv")
df_post['Time'] = 'POST'

df_fup = pd.read_csv("label-fup.csv")
df_fup['Time'] = 'FOLLOWUP'

df_clinical = pd.concat([df_pre, df_post, df_fup], axis=0, ignore_index=True)

# Load Mapping Table (Rosetta Stone)
df_mapping = pd.read_excel("meta/measured_with_elastograph_patients.xlsx")
df_mapping.columns = df_mapping.columns.str.strip()

# Merge info
df_master = pd.merge(df_mapping, df_clinical, on=['ID', 'Time'], how='inner')
df_master = df_master.dropna(subset=['Target_Metabolic_Disease', 'Sex', 'Age', 'Waist_Circum_mean'])

# Global Scaling of Metadata (Prevents Waist > Age > Sex bias)
meta_cols = ['Sex', 'Age', 'Waist_Circum_mean']
scaler = StandardScaler()
df_master[meta_cols] = scaler.fit_transform(df_master[meta_cols])

print(f"Data merged. Unique Patients: {df_master['ID'].nunique()}. Total visit records: {len(df_master)}")

# ==========================================
# 2. DATASET CLASS (Augmented: Uses ALL files)
# ==========================================
class MetabolicDataset(Dataset):
    def __init__(self, dataframe, sensor_dir, max_len=2000):
        self.max_len = max_len
        self.all_samples = []
        
        print("Indexing all sensor files for augmentation...")
        for _, row in dataframe.iterrows():
            p_num_str = str(int(row['N_PACIENT'])).zfill(4)
            # Find ALL repetitions/angles for this specific patient visit
            patient_files = glob.glob(os.path.join(sensor_dir, f"in_test_{p_num_str}_*.csv"))
            
            for f in patient_files:
                self.all_samples.append({
                    'filepath': f,
                    'meta': torch.tensor([row['Sex'], row['Age'], row['Waist_Circum_mean']], dtype=torch.float32),
                    'label': int(row['Target_Metabolic_Disease']),
                    'id': row['ID'] # Needed for GroupKFold
                })
        print(f"Total augmented samples (files): {len(self.all_samples)}")

    def __len__(self):
        return len(self.all_samples)

    def __getitem__(self, idx):
        sample = self.all_samples[idx]
        
        # Load Signal
        df_sig = pd.read_csv(sample['filepath'])
        # Take first column (usually F1_2000Hz), clean nans
        signal = df_sig.iloc[:, 2].fillna(0).values[:self.max_len]
        
        # Pad if short
        if len(signal) < self.max_len:
            signal = np.pad(signal, (0, self.max_len - len(signal)), 'constant')
        
        # SIGNAL NORMALIZATION (Per sample)
        mean = np.mean(signal)
        std = np.std(signal) + 1e-8
        signal = (signal - mean) / std
        
        return (torch.tensor(signal, dtype=torch.float32).unsqueeze(0), 
                sample['meta'], 
                torch.tensor(sample['label'], dtype=torch.long))

# ==========================================
# 3. MULTI-MODAL CNN ARCHITECTURE
# ==========================================
class MultiModalCNN(nn.Module):
    def __init__(self):
        super(MultiModalCNN, self).__init__()
        
        # Branch 1: Signal Processing
        self.signal_branch = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=15, stride=2, padding=7),
            nn.BatchNorm1d(16),
            nn.LeakyReLU(0.1),
            nn.MaxPool1d(4),
            
            nn.Conv1d(16, 32, kernel_size=7, stride=1, padding=3),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.1),
            nn.AdaptiveAvgPool1d(1)
        )
        
        # Branch 2: Clinical Fusion
        # 32 (from CNN) + 3 (Sex, Age, Waist) = 35
        self.classifier = nn.Sequential(
            nn.Linear(32 + 3, 32),
            nn.ReLU(),
            nn.Dropout(0.4), # Critical to stop overfitting
            nn.Linear(32, 2)
        )

    def forward(self, signal, meta):
        sig_out = self.signal_branch(signal).squeeze(-1)
        combined = torch.cat((sig_out, meta), dim=1)
        return self.classifier(combined)

# ==========================================
# 4. TRAINING WITH GROUP-K-FOLD
# ==========================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Determine Class Weights and explicitly cast to FLOAT32
counts = df_master['Target_Metabolic_Disease'].value_counts().sort_index()
# We use .values to ensure we get the numbers correctly
weights = 1.0 / counts.values
class_weights = torch.tensor(weights, dtype=torch.float32).to(device)

gkf = GroupKFold(n_splits=5)
ids = df_master['ID'].values
fold_accs = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(df_master, groups=ids)):
    print(f"\n=== FOLD {fold+1} ===")
    
    train_ds = MetabolicDataset(df_master.iloc[train_idx], SENSOR_DIR)
    val_ds = MetabolicDataset(df_master.iloc[val_idx], SENSOR_DIR)
    
    # Using smaller batch size can sometimes help stability on small datasets
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32)
    
    model = MultiModalCNN().to(device)
    
    # Ensure model parameters are float32
    model = model.float() 
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
    
    # The weight here must be the same type as the model output (Float32)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    # Training Loop
    for epoch in range(30):
        model.train()
        for sig, meta, lbl in train_loader:
            # Explicitly move to device and ensure float type
            sig = sig.to(device).float()
            meta = meta.to(device).float()
            lbl = lbl.to(device).long() # Labels must be Long for CrossEntropy
            
            optimizer.zero_grad()
            outputs = model(sig, meta)
            
            loss = criterion(outputs, lbl)
            loss.backward()
            optimizer.step()
            
    # Validation
    model.eval()
    all_preds = []
    all_true = []
    with torch.no_grad():
        for sig, meta, lbl in val_loader:
            sig = sig.to(device).float()
            meta = meta.to(device).float()
            lbl = lbl.to(device).long()
            
            outputs = model(sig, meta)
            preds = outputs.argmax(dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_true.extend(lbl.cpu().numpy())
            
    acc = accuracy_score(all_true, all_preds)
    fold_accs.append(acc)
    print(f"Fold Accuracy: {acc*100:.2f}%")

print(f"\nFinal Average Multi-Modal Accuracy: {np.mean(fold_accs)*100:.2f}%")

Data merged. Unique Patients: 104. Total visit records: 134

=== FOLD 1 ===
Indexing all sensor files for augmentation...
Total augmented samples (files): 1237
Indexing all sensor files for augmentation...
Total augmented samples (files): 314
Fold Accuracy: 63.69%

=== FOLD 2 ===
Indexing all sensor files for augmentation...
Total augmented samples (files): 1231
Indexing all sensor files for augmentation...
Total augmented samples (files): 320
Fold Accuracy: 46.56%

=== FOLD 3 ===
Indexing all sensor files for augmentation...
Total augmented samples (files): 1236
Indexing all sensor files for augmentation...
Total augmented samples (files): 315
Fold Accuracy: 60.95%

=== FOLD 4 ===
Indexing all sensor files for augmentation...
Total augmented samples (files): 1249
Indexing all sensor files for augmentation...
Total augmented samples (files): 302
Fold Accuracy: 59.93%

=== FOLD 5 ===
Indexing all sensor files for augmentation...
Total augmented samples (files): 1251
Indexing all sensor 

# More

In [12]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import glob
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from scipy import signal as scipy_signal

# ==========================================
# 1. SETUP & DATA MERGING (Same as before)
# ==========================================
SENSOR_DIR = "C:\\DumbStuff\\epf study\\Meta-Elasto\\els\\meta\\Elastography_rawdata\\oldcode\\"
FS = 2000  # Sampling frequency

# Load and Merge Clinical (ensure 'Time' is added as discussed previously)
df_pre = pd.read_csv("label-pre.csv"); df_pre['Time'] = 'PRE'
df_post = pd.read_csv("label-post.csv"); df_post['Time'] = 'POST'
df_fup = pd.read_csv("label-fup.csv"); df_fup['Time'] = 'FOLLOWUP'
df_clinical = pd.concat([df_pre, df_post, df_fup], axis=0, ignore_index=True)

df_mapping = pd.read_excel("meta/measured_with_elastograph_patients.xlsx")
df_mapping.columns = df_mapping.columns.str.strip()

df_master = pd.merge(df_mapping, df_clinical, on=['ID', 'Time'], how='inner')
df_master = df_master.dropna(subset=['Target_Metabolic_Disease', 'Sex', 'Age', 'Waist_Circum_mean'])

# Scaling
meta_cols = ['Sex', 'Age', 'Waist_Circum_mean']
scaler = StandardScaler()
df_master[meta_cols] = scaler.fit_transform(df_master[meta_cols])

# ==========================================
# 2. ADVANCED DATASET (Frequency Domain)
# ==========================================
class MetabolicPSDDataset(Dataset):
    def __init__(self, dataframe, sensor_dir):
        self.sensor_dir = sensor_dir
        self.all_samples = []
        
        for _, row in dataframe.iterrows():
            p_num_str = str(int(row['N_PACIENT'])).zfill(4)
            patient_files = glob.glob(os.path.join(sensor_dir, f"in_test_{p_num_str}_*.csv"))
            for f in patient_files:
                self.all_samples.append({
                    'filepath': f,
                    'meta': torch.tensor([row['Sex'], row['Age'], row['Waist_Circum_mean']], dtype=torch.float32),
                    'label': int(row['Target_Metabolic_Disease']),
                    'id': row['ID']
                })

    def __len__(self):
        return len(self.all_samples)

    def __getitem__(self, idx):
        sample = self.all_samples[idx]
        df_sig = pd.read_csv(sample['filepath'])
        raw_signal = df_sig.iloc[:, 2].fillna(0).values
        
        # A. BANDPASS FILTER (Remove noise below 10Hz and above 400Hz)
        b, a = scipy_signal.butter(4, [10, 400], btype='bandpass', fs=FS)
        filt_signal = scipy_signal.filtfilt(b, a, raw_signal)
        
        # B. WELCH PSD (Convert to Frequency Domain)
        # This gives us a 129-point "signature" of the signal's frequencies
        freqs, psd = scipy_signal.welch(filt_signal, fs=FS, nperseg=256)
        
        # C. LOG TRANSFORM (Standard for frequency data)
        psd_log = np.log10(psd + 1e-10)
        
        # D. NORMALIZATION
        psd_norm = (psd_log - np.mean(psd_log)) / (np.std(psd_log) + 1e-8)
        
        return (torch.tensor(psd_norm, dtype=torch.float32).unsqueeze(0), 
                sample['meta'], 
                torch.tensor(sample['label'], dtype=torch.long))

# ==========================================
# 3. FREQUENCY-CONV NET
# ==========================================
class FrequencyCNN(nn.Module):
    def __init__(self):
        super(FrequencyCNN, self).__init__()
        # Input size is [1, 129]
        self.conv_block = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=5, padding=2),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)
        )
        self.classifier = nn.Sequential(
            nn.Linear(32 + 3, 16),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(16, 2)
        )

    def forward(self, x, meta):
        x = self.conv_block(x).squeeze(-1)
        combined = torch.cat((x, meta), dim=1)
        return self.classifier(combined)

# ==========================================
# 4. TRAINING LOOP
# ==========================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
counts = df_master['Target_Metabolic_Disease'].value_counts().sort_index()
class_weights = torch.tensor([1.0/counts[0], 1.0/counts[1]], dtype=torch.float32).to(device)

gkf = GroupKFold(n_splits=5)
ids = df_master['ID'].values
fold_accs = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(df_master, groups=ids)):
    print(f"\n=== FOLD {fold+1} ===")
    train_ds = MetabolicPSDDataset(df_master.iloc[train_idx], SENSOR_DIR)
    val_ds = MetabolicPSDDataset(df_master.iloc[val_idx], SENSOR_DIR)
    
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32)
    
    model = FrequencyCNN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-3)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    for epoch in range(50): # Frequency data converges slower but more stably
        model.train()
        for sig, meta, lbl in train_loader:
            sig, meta, lbl = sig.to(device), meta.to(device), lbl.to(device)
            optimizer.zero_grad()
            loss = criterion(model(sig, meta), lbl)
            loss.backward()
            optimizer.step()
            
    model.eval()
    all_preds, all_true = [], []
    with torch.no_grad():
        for sig, meta, lbl in val_loader:
            outputs = model(sig.to(device), meta.to(device))
            all_preds.extend(outputs.argmax(dim=1).cpu().numpy())
            all_true.extend(lbl.cpu().numpy())
            
    acc = accuracy_score(all_true, all_preds)
    fold_accs.append(acc)
    print(f"Fold Accuracy: {acc*100:.2f}%")

print(f"\nFinal Average PSD-CNN Accuracy: {np.mean(fold_accs)*100:.2f}%")


=== FOLD 1 ===
Fold Accuracy: 55.41%

=== FOLD 2 ===
Fold Accuracy: 60.94%

=== FOLD 3 ===
Fold Accuracy: 68.25%

=== FOLD 4 ===
Fold Accuracy: 51.66%

=== FOLD 5 ===
Fold Accuracy: 67.00%

Final Average PSD-CNN Accuracy: 60.65%


In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import glob
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, classification_report

# ==========================================
# 1. SETUP & PATHS
# ==========================================
# Adjust these paths to your computer
SENSOR_DIR = "C:\\DumbStuff\\epf study\\Meta-Elasto\\els\\meta\\Elastography_rawdata\\oldcode\\"
MAPPING_PATH = "meta/measured_with_elastograph_patients.xlsx"
MAX_LEN = 2000  # We will take 2000 points from every file
NUM_CHANNELS = 18 # We will use all columns F1 through F18

# ==========================================
# 2. DATA MERGING & CLEANING
# ==========================================
# Load clinical files and add the 'Time' tag to match the mapping
df_pre = pd.read_csv("label-pre.csv")
df_pre['Time'] = 'PRE'

df_post = pd.read_csv("label-post.csv")
df_post['Time'] = 'POST'

df_fup = pd.read_csv("label-fup.csv")
df_fup['Time'] = 'FOLLOWUP'

df_clinical = pd.concat([df_pre, df_post, df_fup], axis=0, ignore_index=True)

# Load the mapping table (Rosetta Stone)
df_mapping = pd.read_excel(MAPPING_PATH)
df_mapping.columns = df_mapping.columns.str.strip()

# Merge clinical data with mapping table
df_master = pd.merge(df_mapping, df_clinical, on=['ID', 'Time'], how='inner')

# Drop rows that are missing the label or the clinical features
df_master = df_master.dropna(subset=['Target_Metabolic_Disease', 'Sex', 'Age', 'Waist_Circum_mean'])

# Standardize clinical features (Age, Sex, Waist) so they are on the same scale
meta_cols = ['Sex', 'Age', 'Waist_Circum_mean']
scaler = StandardScaler()
df_master[meta_cols] = scaler.fit_transform(df_master[meta_cols])

print(f"Dataset Ready. Total visit records: {len(df_master)}")

# ==========================================
# 3. MULTI-CHANNEL DATASET CLASS
# ==========================================
class MetabolicWholeSignalDataset(Dataset):
    def __init__(self, dataframe, sensor_dir, max_len=2000):
        self.max_len = max_len
        self.sensor_dir = sensor_dir
        self.samples = []
        
        # We index every single file for every patient visit
        for _, row in dataframe.iterrows():
            p_num_str = str(int(row['N_PACIENT'])).zfill(4)
            # Find all repetitions/angles for this patient session
            files = glob.glob(os.path.join(self.sensor_dir, f"in_test_{p_num_str}_*.csv"))
            
            for f in files:
                self.samples.append({
                    'file': f,
                    'meta': torch.tensor([row['Sex'], row['Age'], row['Waist_Circum_mean']], dtype=torch.float32),
                    'label': int(row['Target_Metabolic_Disease']),
                    'id': row['ID'] # To keep patient reps together in Fold
                })
        print(f"Total files indexed for training/test: {len(self.samples)}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        
        # Load the CSV
        df_sig = pd.read_csv(sample['file'])
        
        # SELECT ALL 18 CHANNELS (F1 to F18)
        # We use regex to grab all columns starting with F
        signal = df_sig.filter(regex='^F\d+').fillna(0).values.T # Transpose to [Channels, Time]
        
        # Fix Length
        if signal.shape[1] > self.max_len:
            signal = signal[:, :self.max_len]
        else:
            pad_width = self.max_len - signal.shape[1]
            signal = np.pad(signal, ((0,0), (0, pad_width)), mode='constant')
            
        # NORMALIZE SIGNAL (Per channel)
        for i in range(signal.shape[0]):
            std = np.std(signal[i, :]) + 1e-8
            signal[i, :] = (signal[i, :] - np.mean(signal[i, :])) / std
            
        return (torch.tensor(signal, dtype=torch.float32), 
                sample['meta'], 
                torch.tensor(sample['label'], dtype=torch.long))

# ==========================================
# 4. MULTI-CHANNEL CNN MODEL
# ==========================================
class MultiChannelCNN(nn.Module):
    def __init__(self, in_channels=18):
        super(MultiChannelCNN, self).__init__()
        
        # Branch 1: The Signal Brain (looks at 18 frequencies)
        self.signal_branch = nn.Sequential(
            nn.Conv1d(in_channels, 32, kernel_size=11, stride=2, padding=5),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(4),
            
            nn.Conv1d(32, 64, kernel_size=5, padding=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1) # Compress time to 1 feature per filter
        )
        
        # Branch 2: The Fusion (Signal + Metadata)
        # 64 features from signal + 3 features from clinical (Age, Sex, Waist)
        self.classifier = nn.Sequential(
            nn.Linear(64 + 3, 32),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(32, 2)
        )

    def forward(self, signal, meta):
        sig_features = self.signal_branch(signal).squeeze(-1)
        combined = torch.cat((sig_features, meta), dim=1)
        return self.classifier(combined)

# ==========================================
# 5. TRAINING ENGINE (GroupKFold)
# ==========================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Calculate weights to handle class imbalance
counts = df_master['Target_Metabolic_Disease'].value_counts().sort_index()
weights = 1.0 / counts.values
class_weights = torch.tensor(weights, dtype=torch.float32).to(device)

gkf = GroupKFold(n_splits=5)
patient_ids = df_master['ID'].values
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(df_master, groups=patient_ids)):
    print(f"\n=== STARTING FOLD {fold+1} ===")
    
    train_ds = MetabolicWholeSignalDataset(df_master.iloc[train_idx], SENSOR_DIR)
    val_ds = MetabolicWholeSignalDataset(df_master.iloc[val_idx], SENSOR_DIR)
    
    train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=16)
    
    model = MultiChannelCNN(in_channels=NUM_CHANNELS).to(device).float()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    # Training Loop
    for epoch in range(25):
        model.train()
        for sig, meta, lbl in train_loader:
            sig, meta, lbl = sig.to(device).float(), meta.to(device).float(), lbl.to(device).long()
            
            optimizer.zero_grad()
            outputs = model(sig, meta)
            loss = criterion(outputs, lbl)
            loss.backward()
            optimizer.step()
            
    # Evaluation
    model.eval()
    all_preds, all_true = [], []
    with torch.no_grad():
        for sig, meta, lbl in val_loader:
            outputs = model(sig.to(device).float(), meta.to(device).float())
            all_preds.extend(outputs.argmax(dim=1).cpu().numpy())
            all_true.extend(lbl.cpu().numpy())
            
    acc = accuracy_score(all_true, all_preds)
    fold_scores.append(acc)
    print(f"Fold {fold+1} Accuracy: {acc*100:.2f}%")

print(f"\nFinal Average Multi-Channel Accuracy: {np.mean(fold_scores)*100:.2f}%")

  signal = df_sig.filter(regex='^F\d+').fillna(0).values.T # Transpose to [Channels, Time]


Dataset Ready. Total visit records: 134

=== STARTING FOLD 1 ===
Total files indexed for training/test: 1237
Total files indexed for training/test: 314
Fold 1 Accuracy: 58.92%

=== STARTING FOLD 2 ===
Total files indexed for training/test: 1231
Total files indexed for training/test: 320
Fold 2 Accuracy: 53.44%

=== STARTING FOLD 3 ===
Total files indexed for training/test: 1236
Total files indexed for training/test: 315
Fold 3 Accuracy: 64.76%

=== STARTING FOLD 4 ===
Total files indexed for training/test: 1249
Total files indexed for training/test: 302
Fold 4 Accuracy: 58.28%

=== STARTING FOLD 5 ===
Total files indexed for training/test: 1251
Total files indexed for training/test: 300
Fold 5 Accuracy: 66.00%

Final Average Multi-Channel Accuracy: 60.28%


# TCN Finally

In [3]:
import pandas as pd
import numpy as np
import os
import glob
import tensorflow as tf
from tensorflow.keras import layers, models, Input
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from scipy.signal import butter, filtfilt

# ==========================================
# 1. PREPROCESSING CONFIGURATION
# ==========================================
SENSOR_DIR = "C:\\DumbStuff\\epf study\\Meta-Elasto\\els\\meta\\Elastography_rawdata\\oldcode\\"
FS = 2000          # Sampling Frequency
LOWCUT = 10        # Remove motion artifacts
HIGHCUT = 450      # Remove high-freq electronic noise
MAX_LEN = 2000     # Time steps
CHANNELS = 18      # F1 to F18

def bandpass_filter(data):
    """Clean the signal to highlight biological vibrations."""
    nyq = 0.5 * FS
    low = LOWCUT / nyq
    high = HIGHCUT / nyq
    b, a = butter(4, [low, high], btype='band')
    # Apply filter across the time axis (axis 0)
    return filtfilt(b, a, data, axis=0)

# ==========================================
# 2. DATA MERGING & CLEANING
# ==========================================
df_pre = pd.read_csv("label-pre.csv"); df_pre['Time'] = 'PRE'
df_post = pd.read_csv("label-post.csv"); df_post['Time'] = 'POST'
df_fup = pd.read_csv("label-fup.csv"); df_fup['Time'] = 'FOLLOWUP'
df_clinical = pd.concat([df_pre, df_post, df_fup], axis=0, ignore_index=True)

df_mapping = pd.read_excel("meta/measured_with_elastograph_patients.xlsx")
df_mapping.columns = df_mapping.columns.str.strip()

df_master = pd.merge(df_mapping, df_clinical, on=['ID', 'Time'], how='inner')
df_master = df_master.dropna(subset=['Target_Metabolic_Disease', 'Sex', 'Age', 'Waist_Circum_mean'])

# Standardize Metadata
meta_cols = ['Sex', 'Age', 'Waist_Circum_mean']
scaler = StandardScaler()
df_master[meta_cols] = scaler.fit_transform(df_master[meta_cols])

# ==========================================
# 3. DATA LOADING FUNCTION
# ==========================================
def load_patient_data(dataframe):
    X_sig, X_meta, Y, groups = [], [], [], []
    
    for _, row in dataframe.iterrows():
        p_num_str = str(int(row['N_PACIENT'])).zfill(4)
        files = glob.glob(os.path.join(SENSOR_DIR, f"in_test_{p_num_str}_*.csv"))
        
        for f in files:
            try:
                df_sig = pd.read_csv(f)
                # Select F1-F18 columns
                signal = df_sig.filter(regex='^F\d+').fillna(0).values
                
                # Apply Filter
                signal = bandpass_filter(signal)
                
                # Fix Length
                if len(signal) > MAX_LEN:
                    signal = signal[:MAX_LEN, :]
                else:
                    signal = np.pad(signal, ((0, MAX_LEN - len(signal)), (0, 0)), 'constant')
                
                # Normalize Signal (Z-score)
                signal = (signal - np.mean(signal)) / (np.std(signal) + 1e-8)
                
                X_sig.append(signal)
                X_meta.append(row[meta_cols].values)
                Y.append(row['Target_Metabolic_Disease'])
                groups.append(row['ID'])
            except:
                continue
                
    return np.array(X_sig, dtype='float32'), np.array(X_meta, dtype='float32'), \
           np.array(Y, dtype='int32'), np.array(groups)

# ==========================================
# 4. TCN MODEL ARCHITECTURE (Keras Functional API)
# ==========================================
from tensorflow.keras import layers

def residual_block(x, dilation_rate, nb_filters, kernel_size):
    prev_x = x

    # Main path: length-preserving conv
    conv = layers.Conv1D(
        filters=nb_filters,
        kernel_size=kernel_size,
        dilation_rate=dilation_rate,
        padding='same',      # keep time length
        activation='relu'
    )(x)

    # Skip path: only match channels, keep same length
    if prev_x.shape[-1] != nb_filters:
        prev_x = layers.Conv1D(
            filters=nb_filters,
            kernel_size=1,
            padding='same'    # also length-preserving
        )(prev_x)

    out = layers.Add()([prev_x, conv])
    out = layers.Activation('relu')(out)
    return out

def build_tcn_model():
    # Input 1: Signal [Time steps, Channels]
    sig_input = Input(shape=(MAX_LEN, CHANNELS))
    
    # TCN Layers (Dilated Convolutions)
    x = sig_input
    for d in [1, 2, 4, 8, 16]: # Increasing Dilation allows "Wide View"
        x = residual_block(x, dilation_rate=d, nb_filters=32, kernel_size=3)
    
    x = layers.GlobalAveragePooling1D()(x)
    
    # Input 2: Metadata [Sex, Age, Waist]
    meta_input = Input(shape=(3,))
    m = layers.Dense(16, activation='relu')(meta_input)
    
    # Fusion
    merged = layers.Concatenate()([x, m])
    
    dense = layers.Dense(32, activation='relu')(merged)
    dense = layers.Dropout(0.4)(dense)
    output = layers.Dense(1, activation='sigmoid')(dense)
    
    model = models.Model(inputs=[sig_input, meta_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# ==========================================
# 5. CROSS-VALIDATION LOOP
# ==========================================
gkf = GroupKFold(n_splits=5)
patient_ids = df_master['ID'].values
fold_accuracies = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(df_master, groups=patient_ids)):
    print(f"\n--- Training Fold {fold+1} ---")
    
    # Load data for this fold
    X_sig_train, X_meta_train, y_train, _ = load_patient_data(df_master.iloc[train_idx])
    X_sig_val, X_meta_val, y_val, _ = load_patient_data(df_master.iloc[val_idx])
    
    model = build_tcn_model()
    
    # Early stopping prevents the model from memorizing noise
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    model.fit(
        [X_sig_train, X_meta_train], y_train,
        validation_data=([X_sig_val, X_meta_val], y_val),
        epochs=50,
        batch_size=16,
        callbacks=[early_stop],
        verbose=1
    )
    
    # Evaluate
    probs = model.predict([X_sig_val, X_meta_val])
    preds = (probs > 0.5).astype(int)
    acc = accuracy_score(y_val, preds)
    fold_accuracies.append(acc)
    print(f"Fold {fold+1} Accuracy: {acc*100:.2f}%")

print(f"\nFinal Average TCN Accuracy: {np.mean(fold_accuracies)*100:.2f}%")

  signal = df_sig.filter(regex='^F\d+').fillna(0).values



--- Training Fold 1 ---
Epoch 1/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.5133 - loss: 0.7598 - val_accuracy: 0.5987 - val_loss: 0.7090
Epoch 2/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.6079 - loss: 0.6646 - val_accuracy: 0.4745 - val_loss: 0.7008
Epoch 3/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.6297 - loss: 0.6441 - val_accuracy: 0.6306 - val_loss: 0.7443
Epoch 4/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.6758 - loss: 0.6277 - val_accuracy: 0.6465 - val_loss: 0.7553
Epoch 5/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.6896 - loss: 0.6078 - val_accuracy: 0.5924 - val_loss: 0.7546
Epoch 6/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.6960 - loss: 0.6010 - val_accuracy: 0.5955 - val_loss: 0.7691
Epoch 7/5