In [3]:
import os
import pandas as pd

# FIXED: Absolute paths – adjust if needed
PROJECT_ROOT = r"D:\Work\Stress_Level_Prediction\data"

# Print folder structure
train_dir = os.path.join(PROJECT_ROOT, 'Train Data')
print("Train Data folder exists:", os.path.exists(train_dir))
if os.path.exists(train_dir):
    print("Files in Train Data:", os.listdir(train_dir))

zip_dir = os.path.join(train_dir, 'Train Data Zip')
print("\nTrain Data Zip folder exists:", os.path.exists(zip_dir))
if os.path.exists(zip_dir):
    print("Files in Train Data Zip:", os.listdir(zip_dir))
    # Try to load each file
    for file in os.listdir(zip_dir):
        if file.endswith('.xlsx'):
            file_path = os.path.join(zip_dir, file)
            print(f"\nLoading {file}:")
            try:
                df_temp = pd.read_excel(file_path)
                print(f"Shape: {df_temp.shape}")
                print(f"Columns: {df_temp.columns.tolist()}")
                print(df_temp.head(2))
            except Exception as e:
                print(f"Error loading {file}: {e}")
else:
    print("Train Data Zip not found – extract the zip file first!")

# Same for Test
test_dir = os.path.join(PROJECT_ROOT, 'Test Data')
print("\nTest Data folder exists:", os.path.exists(test_dir))
if os.path.exists(test_dir):
    print("Files in Test Data:", os.listdir(test_dir))

test_zip_dir = os.path.join(test_dir, 'Test Zip')
print("Test Zip folder exists:", os.path.exists(test_zip_dir))
if os.path.exists(test_zip_dir):
    print("Files in Test Zip:", os.listdir(test_zip_dir))

Train Data folder exists: True
Files in Train Data: ['Train Data Zip']

Train Data Zip folder exists: True
Files in Train Data Zip: ['frequency_domain_features_train.csv', 'heart_rate_non_linear_features_train.csv', 'time_domain_features_train.csv']

Test Data folder exists: True
Files in Test Data: ['Test Zip']
Test Zip folder exists: True
Files in Test Zip: ['frequency_domain_features_test.csv', 'heart_rate_non_linear_features_test.csv', 'time_domain_features_test.csv']


In [7]:
import pandas as pd
import os
import numpy as np

# FIXED: Absolute paths for CSV files
PROJECT_ROOT = r"D:\Work\Stress_Level_Prediction\data"
TRAIN_TIME = os.path.join(PROJECT_ROOT, 'Train Data', 'Train Data Zip', 'time_domain_features_train.csv')
TRAIN_FREQ = os.path.join(PROJECT_ROOT, 'Train Data', 'Train Data Zip', 'frequency_domain_features_train.csv')
TRAIN_NONLIN = os.path.join(PROJECT_ROOT, 'Train Data', 'Train Data Zip', 'heart_rate_non_linear_features_train.csv')

# Load the 3 train files (CSV)
df_time = pd.read_csv(TRAIN_TIME)
df_freq = pd.read_csv(TRAIN_FREQ)
df_nonlin = pd.read_csv(TRAIN_NONLIN)

print("Time domain shape:", df_time.shape)
print("Time columns:", df_time.columns.tolist())
print(df_time.head(2))

print("\nFrequency domain shape:", df_freq.shape)
print("Frequency columns:", df_freq.columns.tolist())
print(df_freq.head(2))

print("\nNon-linear shape:", df_nonlin.shape)
print("Non-linear columns:", df_nonlin.columns.tolist())
print(df_nonlin.head(2))

# UPDATED: Merge time and frequency on 'uuid' (common column from head)
common_col = 'uuid'
df = pd.merge(df_time, df_freq, on=common_col, how='inner')

print("\nMerged time + freq shape:", df.shape)
print("Merged columns:", df.columns.tolist())
print("\nSummary stats:\n", df.describe())
print("\nMissing values:\n", df.isnull().sum())

# Non-linear is separate – print condition distribution (target for classification)
print("\nNon-linear target 'condition' distribution:")
if 'condition' in df_nonlin.columns:
    print(df_nonlin['condition'].value_counts())
else:
    print("No 'condition' column.")

# UPDATED: Correlations with target 'HR' from time (drop non-numeric 'uuid')
df_numeric = df.drop(columns=['uuid'], errors='ignore')  # Drop string columns
if 'HR' in df_numeric.columns:
    corr_all = df_numeric.corr()['HR'].sort_values(ascending=False)
    print("\nCorrelations with HR (sorted):")
    print(corr_all)
else:
    print("\nTarget 'HR' not found – check columns.")

# Save merged df for later
df.to_csv('../data/heart_stress_merged.csv', index=False)
print("\nMerged data saved to ../data/heart_stress_merged.csv")

Time domain shape: (369289, 20)
Time columns: ['MEAN_RR', 'MEDIAN_RR', 'SDRR', 'RMSSD', 'SDSD', 'SDRR_RMSSD', 'HR', 'pNN25', 'pNN50', 'KURT', 'SKEW', 'MEAN_REL_RR', 'MEDIAN_REL_RR', 'SDRR_REL_RR', 'RMSSD_REL_RR', 'SDSD_REL_RR', 'SDRR_RMSSD_REL_RR', 'KURT_REL_RR', 'SKEW_REL_RR', 'uuid']
      MEAN_RR   MEDIAN_RR        SDRR      RMSSD       SDSD  SDRR_RMSSD  \
0  885.157845  853.763730  140.972741  15.554505  15.553371    9.063146   
1  939.425371  948.357865   81.317742  12.964439  12.964195    6.272369   

          HR      pNN25     pNN50      KURT      SKEW  MEAN_REL_RR  \
0  69.499952  11.133333  0.533333 -0.856554  0.335218    -0.000203   
1  64.363150   5.600000  0.000000 -0.408190 -0.155286    -0.000059   

   MEDIAN_REL_RR  SDRR_REL_RR  RMSSD_REL_RR  SDSD_REL_RR  SDRR_RMSSD_REL_RR  \
0      -0.000179     0.017080      0.007969     0.007969           2.143342   
1       0.000611     0.013978      0.004769     0.004769           2.930855   

   KURT_REL_RR  SKEW_REL_RR           