In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Feature Statistical Analysis — training_set.parquet
Compute descriptive statistics and IQR-based outlier limits for the 72 feature columns (8 channels × 9 time-domain (TD) features per channel).

In [2]:
# Load the dataset
DATA_PATH = Path("..", "training_set.parquet")
df = pd.read_parquet(DATA_PATH)

# Summarise what was loaded in a single print: overall dimensions, the
# positional split between the leading 72 feature columns and the trailing
# metadata columns, and the in-memory footprint in GB.
print(
    "\n".join(
        [
            f"Dataset shape: {df.shape}",
            f"Total columns: {df.shape[1]}",
            f"Feature columns (first 72): {df.columns[:72].tolist()}",
            f"Metadata columns (last 4): {df.columns[72:].tolist()}",
            f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB",
        ]
    )
)

Dataset shape: (8788319, 76)
Total columns: 76
Feature columns (first 72): ['ch1_LS', 'ch1_MFL', 'ch1_MSR', 'ch1_WAMP', 'ch1_ZC', 'ch1_RMS', 'ch1_IAV', 'ch1_DASDV', 'ch1_VAR', 'ch2_LS', 'ch2_MFL', 'ch2_MSR', 'ch2_WAMP', 'ch2_ZC', 'ch2_RMS', 'ch2_IAV', 'ch2_DASDV', 'ch2_VAR', 'ch3_LS', 'ch3_MFL', 'ch3_MSR', 'ch3_WAMP', 'ch3_ZC', 'ch3_RMS', 'ch3_IAV', 'ch3_DASDV', 'ch3_VAR', 'ch4_LS', 'ch4_MFL', 'ch4_MSR', 'ch4_WAMP', 'ch4_ZC', 'ch4_RMS', 'ch4_IAV', 'ch4_DASDV', 'ch4_VAR', 'ch5_LS', 'ch5_MFL', 'ch5_MSR', 'ch5_WAMP', 'ch5_ZC', 'ch5_RMS', 'ch5_IAV', 'ch5_DASDV', 'ch5_VAR', 'ch6_LS', 'ch6_MFL', 'ch6_MSR', 'ch6_WAMP', 'ch6_ZC', 'ch6_RMS', 'ch6_IAV', 'ch6_DASDV', 'ch6_VAR', 'ch7_LS', 'ch7_MFL', 'ch7_MSR', 'ch7_WAMP', 'ch7_ZC', 'ch7_RMS', 'ch7_IAV', 'ch7_DASDV', 'ch7_VAR', 'ch8_LS', 'ch8_MFL', 'ch8_MSR', 'ch8_WAMP', 'ch8_ZC', 'ch8_RMS', 'ch8_IAV', 'ch8_DASDV', 'ch8_VAR']
Metadata columns (last 4): ['label', 'user', 'sample_id', 'window_start']

Memory usage: 4.23 GB


## Descriptive Statistics & IQR Outlier Limits
For each of the 72 feature columns compute: **count, mean, std, min, Q1 (25%), Q2 (50%), Q3 (75%), max, IQR, lower limit, upper limit**, where the limits are `Q1 − 1.5 × IQR` and `Q3 + 1.5 × IQR`.

In [4]:
# Select only the 72 feature columns (exclude metadata).
# NOTE: the selection is positional — it relies on the feature columns being
# the first 72 columns of `df`, with the metadata columns after them.
feature_cols = df.columns[:72].tolist()
df_features = df[feature_cols]

# Multiplier k for the IQR-based outlier limits: Q1 - k*IQR and Q3 + k*IQR.
IQR_MULTIPLIER = 1.5

# Per-feature descriptive statistics (each result is a Series indexed by feature name).
count = df_features.count()
mean_val = df_features.mean()
std_val = df_features.std()
min_val = df_features.min()
max_val = df_features.max()

# Request all three quartiles in one quantile call rather than three separate
# calls over the full frame. The result has one row per requested quantile,
# in the order given, so the rows are picked by position.
quartiles = df_features.quantile([0.25, 0.50, 0.75])
Q1 = quartiles.iloc[0]
Q2 = quartiles.iloc[1]
Q3 = quartiles.iloc[2]

# Interquartile range and the outlier limits derived from it.
IQR = Q3 - Q1
lower_limit = Q1 - IQR_MULTIPLIER * IQR
upper_limit = Q3 + IQR_MULTIPLIER * IQR

# Build the summary table: rows = features, columns = statistics
stats_df = pd.DataFrame({
    "count": count,
    "mean": mean_val,
    "std": std_val,
    "min": min_val,
    "Q1 (25%)": Q1,
    "Q2 (50%)": Q2,
    "Q3 (75%)": Q3,
    "max": max_val,
    "IQR": IQR,
    "lower_limit": lower_limit,
    "upper_limit": upper_limit
})

stats_df.index.name = "feature"

# Report the dimensions actually produced instead of hard-coding them, so the
# message cannot drift from the table if columns or statistics change.
n_features, n_stats = stats_df.shape
print(f"Statistics table shape: {stats_df.shape}  ({n_features} features × {n_stats} statistics)\n")
stats_df

Statistics table shape: (72, 11)  (72 features × 11 statistics)



Unnamed: 0_level_0,count,mean,std,min,Q1 (25%),Q2 (50%),Q3 (75%),max,IQR,lower_limit,upper_limit
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ch1_LS,8788319,0.287362,0.495708,0.015492,0.065382,0.107577,0.255482,7.196148,0.190099,-0.219766,0.540630
ch1_MFL,8788319,0.997902,1.070295,-1.351157,0.181720,0.702566,1.619280,4.951151,1.437560,-1.974621,3.775620
ch1_MSR,8788319,0.470582,0.317484,0.118221,0.277603,0.353458,0.530107,3.128066,0.252504,-0.101153,0.908863
ch1_WAMP,8788319,37.650314,1.567740,22.000000,37.000000,38.000000,39.000000,39.000000,2.000000,34.000000,42.000000
ch1_ZC,8788319,25.735340,3.022020,1.000000,24.000000,26.000000,28.000000,39.000000,4.000000,18.000000,34.000000
...,...,...,...,...,...,...,...,...,...,...,...
ch8_ZC,8788319,25.529375,3.142559,5.000000,23.000000,26.000000,28.000000,39.000000,5.000000,15.500000,35.500000
ch8_RMS,8788319,0.483959,0.894502,0.024297,0.093721,0.147764,0.385471,12.927100,0.291750,-0.343904,0.823096
ch8_IAV,8788319,14.608518,27.529869,0.675411,3.002044,4.683551,11.377490,441.238708,8.375446,-9.561126,23.940660
ch8_DASDV,8788319,0.846410,1.591138,0.034635,0.156670,0.249319,0.671512,23.564337,0.514842,-0.615593,1.443775
