In [2]:
import pandas as pd
import numpy as np
import glob
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report

In [7]:
def extract_signal_features(file_path, sheet_name=2):
    """Return the root-mean-square (RMS) of the 'F1_2000Hz' column of one recording.

    Parameters
    ----------
    file_path : str or path-like
        Recording to read. Files ending in .xls/.xlsx/.xlsm are read with
        ``pd.read_excel``; everything else is read with ``pd.read_csv``.
    sheet_name : int or str, default 2
        Sheet to read. Only used for Excel files; CSV files have no sheets.

    Returns
    -------
    float or int
        RMS of the non-NaN samples, or 0 when the file cannot be read, the
        column is absent or non-numeric, or no samples remain after dropping
        NaNs. A warning is emitted when 0 is returned because of a read error.
    """
    import warnings  # local import so the block does not depend on the import cell

    excel_suffixes = ('.xls', '.xlsx', '.xlsm')
    try:
        if str(file_path).lower().endswith(excel_suffixes):
            df_signal = pd.read_excel(file_path, sheet_name=sheet_name)
        else:
            # read_csv has no `sheet_name` parameter: passing it raised a
            # TypeError that the previous bare `except` turned into 0 for
            # every single file.
            df_signal = pd.read_csv(file_path)

        # Drop NaNs before the math and force float so squaring cannot
        # overflow an integer dtype.
        signal = df_signal['F1_2000Hz'].dropna().to_numpy(dtype=float)
    except (OSError, KeyError, ValueError, TypeError) as exc:
        # Best-effort contract is kept (0 on failure), but no longer silently.
        warnings.warn(f"Could not extract F1_2000Hz from {file_path!r}: {exc!r}")
        return 0

    if signal.size == 0:
        return 0

    # Root Mean Square (RMS) calculation
    return np.sqrt(np.mean(signal ** 2))

In [8]:
def load_and_merge_data(main_data_path, sensor_folder):
    """Load the master table and attach one averaged sensor feature per row.

    Parameters
    ----------
    main_data_path : str or path-like
        Master table with an 'ID' column. Paths ending in '.csv' are read
        with ``pd.read_csv``; anything else is read with ``pd.read_excel``.
    sensor_folder : str or path-like
        Folder searched for files named ``in_test_<ID zero-padded to 4>_*.csv``.

    Returns
    -------
    pandas.DataFrame
        The master table with an added 'Sensor_RMS_Intensity' column holding
        the mean of ``extract_signal_features`` over the row's files, or NaN
        when the row has no ID or no matching files.
    """
    # Load the main clinical file (one row per ID is assumed, not checked).
    if str(main_data_path).lower().endswith('.csv'):
        df_clinical = pd.read_csv(main_data_path)
    else:
        df_clinical = pd.read_excel(main_data_path)

    print(f"Processing {len(df_clinical)} patients...")

    sensor_features = []
    # Iterating the ID column directly avoids building a Series per row.
    for raw_id in df_clinical['ID']:
        if pd.isna(raw_id):
            # int(NaN) would raise; a row without an ID cannot have sensor files.
            sensor_features.append(np.nan)
            continue

        p_id = str(int(raw_id)).zfill(4)  # Formats ID to 0001, 0002 etc

        # Find all test files for this specific patient. Sorting makes the
        # averaging order (and therefore the float result) reproducible.
        search_pattern = os.path.join(sensor_folder, f"in_test_{p_id}_*.csv")
        patient_files = sorted(glob.glob(search_pattern))

        if patient_files:
            # One RMS per file (rep/angle) found for this patient, then the mean.
            rms_values = [extract_signal_features(f) for f in patient_files]
            sensor_features.append(np.mean(rms_values))
        else:
            sensor_features.append(np.nan)  # No sensor data found

    df_clinical['Sensor_RMS_Intensity'] = sensor_features
    return df_clinical

In [9]:
def create_metabolic_label(df):
    """
    Add a binary 'Target_Metabolic_Disease' column to ``df`` and return ``df``.

    A row is labelled 1 when at least two of the four markers below exceed
    their cut-off, otherwise 0. A column that is absent from ``df`` is read
    as 0, and a NaN value never exceeds a cut-off. ``df`` is modified in place.
    """
    # (column, cut-off) pairs. BC_BMI is used as a proxy for waist circumference.
    cutoffs = (
        ('Glucose', 100),        # fasting glucose
        ('Triglycerides', 150),
        ('BC_BMI', 30),
        ('BP_Systolic', 130),
    )

    def logic(row):
        exceeded = 0
        for column, limit in cutoffs:
            if row.get(column, 0) > limit:
                exceeded += 1
        # 2+ markers indicates risk
        return 0 if exceeded < 2 else 1

    df['Target_Metabolic_Disease'] = df.apply(logic, axis=1)
    return df

In [None]:
# Build the modelling table: load the master workbook, attach the sensor
# feature, derive the label, then select and scale the feature matrix.
# NOTE(review): `sex` and `age` hold workbook paths, not demographic values,
# and `age` is not used anywhere in this cell.
sex = "meta/DB_QUE_Overall_Health_Tempus_2025_09_17_Toni.xlsx"
# NOTE(review): absolute, machine-specific Windows path.
sensor_dir = "C:\\DumbStuff\\epf study\\Meta-Elasto\\els\\meta\\Elastography_rawdata\\oldcode\\"
age = "meta/DB_RP_TEMPUS_2025_10_09_Toni.xlsx"
data = load_and_merge_data(sex, sensor_dir)
# NOTE(review): `create_metabolic_label` is defined twice in this notebook;
# whichever definition was executed last in the kernel is the one used here.
data = create_metabolic_label(data)

# Drop rows where we are missing critical predictors or the label
# (only these two columns are checked; other features may still contain NaN).
data = data.dropna(subset=['Target_Metabolic_Disease', 'Sensor_RMS_Intensity'])

# Select potential features (Add your column names here)
# NOTE(review): 'Some_Reproductive_Marker' looks like a placeholder; any name
# missing from `data` makes `data[features]` raise KeyError.
features = [
    'Age', 'OH_DEMO_sex', 'BC_BMI', 'dur_day_total_MVPA_bts_5_min_wei', 
    'Sensor_RMS_Intensity', 'HDL_Cholesterol', 'Some_Reproductive_Marker'
]

X = data[features]
y = data['Target_Metabolic_Disease']

# Standardize data (Critical for small datasets)
# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# cross-validation split performed in a later cell.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_biochem_demographics.xlsx'

In [None]:
# Fit a forest on every row purely to rank the candidate features.
selector_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_scaled, y)

# One importance value per entry of `features`, printed from largest to smallest.
importances = pd.Series(data=selector_model.feature_importances_, index=features)
print("\n--- Feature Importance ---", importances.sort_values(ascending=False), sep="\n")

In [None]:
# Leave-One-Out cross-validation: one fold per sample, so each entry of
# `scores` is the accuracy on a single held-out row.
model = RandomForestClassifier(n_estimators=100, random_state=42)
loo = LeaveOneOut()
scores = cross_val_score(model, X_scaled, y, cv=loo)

print("\n--- Model Results ---")
print(f"Mean Accuracy: {np.mean(scores) * 100:.2f}%")
print(f"Standard Deviation: {np.std(scores):.2f}")

# Refit on the whole set; the report below is computed on the same rows the
# model was trained on.
y_pred = model.fit(X_scaled, y).predict(X_scaled)
print("\nFinal Classification Report (On Full Training Set):")
print(classification_report(y, y_pred))

# Damn

In [None]:
# Input locations. `path_health`, `path_rp` and `path_pa` are read by
# load_master_data(); `sensor_dir` is the folder of raw sensor files.
path_health = "meta/DB_QUE_Overall_Health_Tempus_2025_09_17_Toni.xlsx"
path_rp     = "meta/DB_RP_TEMPUS_2025_10_09_Toni.xlsx"
# NOTE(review): unlike the two workbooks, this path has no "meta/" prefix — confirm.
path_pa     = "DB_PA_TEMPUS_Short_Version_Toni.csv"
# NOTE(review): absolute, machine-specific Windows path.
sensor_dir  = "C:\\DumbStuff\\epf study\\Meta-Elasto\\els\\meta\\Elastography_rawdata\\oldcode\\"

In [10]:
def load_master_data():
    """Read the health, reproductive-profile and physical-activity tables and
    inner-join them on 'ID'.

    Reads the module-level paths ``path_health`` and ``path_rp`` (Excel) and
    ``path_pa`` (CSV). Only physical-activity rows whose 'measurement' equals
    't1' are kept before joining. Returns the merged DataFrame.
    """
    health = pd.read_excel(path_health)
    reproductive = pd.read_excel(path_rp)
    activity = pd.read_csv(path_pa)

    # IMPORTANT: only measurement 't1' (baseline) is used for diagnosis.
    baseline_activity = activity[activity['measurement'] == 't1']

    # Health + reproductive profile first, then physical activity; both joins
    # use 'ID' as the common column and keep only IDs present on both sides.
    # A biochemistry table, if kept in a separate file, would be joined the
    # same way as a third step.
    return (
        health
        .merge(reproductive, on='ID', how='inner')
        .merge(baseline_activity, on='ID', how='inner')
    )

In [8]:
import pandas as pd
import glob
import os

# Scan every .xlsx workbook in `folder` for the waist-circumference columns
# and stack whatever is found into one DataFrame (`combined`).
folder = "meta"
pattern = os.path.join(folder, "*.xlsx")
files = glob.glob(pattern)  # all Excel files in folder

target_cols = ["BC_WC_mean_PRE", "BC_WC_mean_POST", "BC_WC_mean_FUP"]  # columns you want

# Alternative kept for reference (disabled): store the extracted columns per
# file in a dict instead of concatenating them.
# # Option 1: store per-file extracted data
# extracted_per_file = {}

# for f in files:
#     df = pd.read_excel(f)  # read whole file
#     present = [c for c in target_cols if c in df.columns]
#     if present:
#         extracted_per_file[os.path.basename(f)] = df[present].copy()

# Option 2: combine all found columns into one DataFrame
combined_list = []
for f in files:
    print(f"Processing file: {f}")
    # NOTE(review): no sheet_name is passed, so only the first sheet of each
    # workbook is inspected. The recorded output of this cell shows no
    # "Extracted from" line, i.e. no target column was found this way.
    df = pd.read_excel(f)
    present = [c for c in target_cols if c in df.columns]
    if present:
        tmp = df[present].copy()
        # Remember which workbook each row came from.
        tmp["source_file"] = os.path.basename(f)
        combined_list.append(tmp)
        print(f"Extracted from {f}: columns {present}")

# Empty DataFrame when nothing was found (pd.concat raises on an empty list).
combined = pd.concat(combined_list, ignore_index=True) if combined_list else pd.DataFrame()

Processing file: meta\DB_BC_Tempus_2025_09_17_Toni.xlsx
Processing file: meta\DB_BLOB_Tempus_2025_09_17_Toni.xlsx
Processing file: meta\DB_MEDication_Tempus_2025_09_15_Toni.xlsx
Processing file: meta\DB_NUT_R24H_Tempus_2025_01_10_Toni.xlsx
Processing file: meta\DB_QUE_Overall_Health_Tempus_2025_09_17_Toni.xlsx
Processing file: meta\DB_RP_TEMPUS_2025_10_09_Toni.xlsx
Processing file: meta\measured_with_elastograph_patients.xlsx
Processing file: meta\patients_newcode_physics.xlsx
Processing file: meta\patients_oldcode_physics.xlsx


In [None]:
# Baseline ('PRE') sheet of the body-composition workbook -> data.csv
df = pd.read_excel('meta/DB_BC_Tempus_2025_09_17_Toni.xlsx', sheet_name='PRE')

# Keep the identifier, the demographics and the baseline waist circumference,
# in this column order.
out_df = df[['ID', 'Sex', 'Age', 'BC_WC_mean_PRE']].reset_index(drop=True)

out_df.to_csv('data.csv', index=False)

In [17]:
# Add the follow-up waist circumference to data.csv.
df = pd.read_excel('meta/DB_BC_Tempus_2025_09_17_Toni.xlsx', sheet_name='FOLLOW-UP')

# One follow-up value per participant, keyed by ID.
fup_waist = df[['ID', 'BC_WC_mean_FUP']].drop_duplicates(subset='ID')

out_df = pd.read_csv('data.csv')
# Drop a stale copy of the column so that re-running the cell does not
# produce BC_WC_mean_FUP_x / BC_WC_mean_FUP_y duplicates.
out_df = out_df.drop(columns=['BC_WC_mean_FUP'], errors='ignore')

# Align on ID instead of row position: data.csv was built from a different
# sheet, so identical row order and row count are not guaranteed. Rows with
# no follow-up record get NaN.
out_df = out_df.merge(fup_waist, on='ID', how='left')

out_df.to_csv('data.csv', index=False)

In [23]:
# Append the glucose column of every time point to data.csv.
types = ['PRE', 'POST', 'FOLLOW-UP']
# Column names use the sheet name as suffix, except 'FOLLOW-UP' -> 'FUP'.
suffix_by_sheet = {'FOLLOW-UP': 'FUP'}

# data.csv is read once and written once instead of being round-tripped
# through disk for every sheet.
out_df = pd.read_csv('data.csv')
for label in types:
    df = pd.read_excel('meta/DB_BLOB_Tempus_2025_09_17_Toni.xlsx', sheet_name=label)
    column = f'BLOB_Gluc_{suffix_by_sheet.get(label, label)}'
    # NOTE(review): values are attached by row position, which assumes this
    # sheet lists the same participants in the same order as data.csv.
    out_df[column] = df[column].tolist()
out_df.to_csv('data.csv', index=False)

In [39]:
def create_metabolic_label(df):
    """
    Add a binary 'Target_Metabolic_Disease' column to ``df`` and return ``df``.

    A row is labelled 1 when at least two of the five markers below are
    abnormal, otherwise 0. ``df`` is modified in place.

    Columns read (exact names used by the code):
      - 'Sex'               0 = man; any other value is scored with the
                            woman cut-offs. Treated as 0 when the column is absent.
      - 'Glucose'           abnormal when > 100 (fasting, mg/dL)
      - 'Triglycerides'     abnormal when > 150 (mg/dL)
      - 'Waist_Circum_mean' abnormal when > 94 (man) / > 80 (woman), in cm
      - 'Blood_Pressure'    abnormal when > 130 (a single column; no separate
                            diastolic check is made)
      - 'HDL_Cholestrol'    abnormal when < 40 (man) / < 50 (woman), mg/dL

    An absent column, or a NaN value, never counts as abnormal.
    """

    def logic(row):
        # Sex selects the waist and HDL cut-offs.
        is_man = row.get('Sex', 0) == 0

        markers = [
            row.get('Glucose', 0) > 100,
            row.get('Triglycerides', 0) > 150,
            row.get('Waist_Circum_mean', 0) > (94 if is_man else 80),
            row.get('Blood_Pressure', 0) > 130,
            # Default 999 keeps a missing HDL column from counting as "low".
            row.get('HDL_Cholestrol', 999) < (40 if is_man else 50),
        ]

        abnormal = sum(bool(flag) for flag in markers)
        return 1 if abnormal >= 2 else 0

    df['Target_Metabolic_Disease'] = df.apply(logic, axis=1)
    return df

In [43]:
import numpy as np

def create_metabolic_label_clean(df):
    """
    Add a 'Target_Metabolic_Disease' column to ``df`` and return ``df``.

    Same scoring as the five-marker rule (1 when at least two markers are
    abnormal, else 0), except that a row with a missing value in ANY required
    column is labelled NaN instead of being scored. ``df`` is modified in place.

    Required columns (exact names used by the code; a KeyError is raised when
    one of them is absent from ``df``):
      - 'Sex'               0 = man; any other value uses the woman cut-offs
      - 'Waist_Circum_mean' abnormal when > 94 (man) / > 80 (woman), in cm
      - 'Triglycerides'     abnormal when > 150 (mg/dL)
      - 'HDL_Cholestrol'    abnormal when < 40 (man) / < 50 (woman), mg/dL
      - 'Blood_Pressure'    abnormal when > 130 (a single column; no separate
                            diastolic check is made)
      - 'Glucose'           abnormal when > 100 (fasting, mg/dL)
    """

    required_cols = [
        'Sex',
        'Waist_Circum_mean',
        'Triglycerides',
        'HDL_Cholestrol',
        'Blood_Pressure',
        'Glucose',
    ]

    def logic(row):
        # Incomplete rows are not scored at all.
        if row[required_cols].isna().any():
            return np.nan

        # Past this point every required value is present, so the columns
        # can be indexed directly.
        if row['Sex'] == 0:  # man
            waist_cutoff, hdl_cutoff = 94, 40
        else:  # woman
            waist_cutoff, hdl_cutoff = 80, 50

        abnormal = 0
        abnormal += int(row['Glucose'] > 100)
        abnormal += int(row['Triglycerides'] > 150)
        abnormal += int(row['Waist_Circum_mean'] > waist_cutoff)
        abnormal += int(row['Blood_Pressure'] > 130)
        abnormal += int(row['HDL_Cholestrol'] < hdl_cutoff)

        return int(abnormal >= 2)

    df['Target_Metabolic_Disease'] = df.apply(logic, axis=1)
    return df


In [46]:
# Label the follow-up table and save it; rows with a missing required value
# receive NaN from create_metabolic_label_clean.
df = pd.read_csv('fup.csv')
# `label` is the same object as `df`: the function adds the column in place
# and returns its argument.
label = create_metabolic_label_clean(df)
label.to_csv('label-fup.csv', index=False)

In [40]:
# Label the baseline table and save it.
# NOTE(review): no visible cell writes 'pre.csv' — confirm where it comes from.
df = pd.read_csv('pre.csv')
# NOTE(review): `create_metabolic_label` is defined twice in this notebook;
# the definition executed last in the kernel is the one called here.
label = create_metabolic_label(df)
label.to_csv('label-pre.csv', index=False)

In [30]:
# 'FOLLOW-UP' sheet of the body-composition workbook -> fup.csv
df = pd.read_excel('meta/DB_BC_Tempus_2025_09_17_Toni.xlsx', sheet_name='FOLLOW-UP')

# Keep the identifier, the demographics and the follow-up waist circumference;
# the latter is stored under the generic name 'Waist_Circum_mean'.
out_df = (
    df[['ID', 'Sex', 'Age', 'BC_WC_mean_FUP']]
    .rename(columns={'BC_WC_mean_FUP': 'Waist_Circum_mean'})
    .reset_index(drop=True)
)

out_df.to_csv('fup.csv', index=False)

In [31]:
# Append the follow-up lab / blood-pressure columns to fup.csv, stored
# without their '_FUP' suffix.
names = ['BLOB_TG', 'BLOB_HDL', 'BLOB_Gluc', 'BP_SBP_ave']

# The sheet and the CSV are the same for every column, so each is read once
# (and the CSV written once) instead of once per column.
df = pd.read_excel('meta/DB_BLOB_Tempus_2025_09_17_Toni.xlsx', sheet_name='FOLLOW-UP')
out_df = pd.read_csv('fup.csv')

for name in names:
    # NOTE(review): values are attached by row position, which assumes this
    # sheet lists the same participants in the same order as fup.csv.
    out_df[name] = df[f'{name}_FUP'].tolist()

# NOTE(review): the labelling functions in this notebook read 'Triglycerides',
# 'HDL_Cholestrol', 'Blood_Pressure' and 'Glucose'; the columns written here
# use different names, so a rename step is needed before labelling.
out_df.to_csv('fup.csv', index=False)