### 100 ms Window and utility functions

In [1]:
import pandas as pd
import os

# Replace 'your_file.parquet' with the actual file name
filename = 'DB4_w200_env0_f0.6[20250406-194002].parquet'
file_path = 'preprocessed_data'
database = 'DB4'

file_path = os.path.join(file_path, database, filename)

# Load the Parquet file into a pandas DataFrame
df = pd.read_parquet(file_path)

# Display the DataFrame
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'preprocessed_data\\DB4\\DB4_w200_env0_f0.6[20250406-194002].parquet'

In [None]:
import datetime

class Logger:
    def __init__(self, verbose=True, log_to_file=False, log_file="run_log.txt"):
        self.verbose = verbose
        self.log_to_file = log_to_file
        self.log_file = log_file
        if self.log_to_file:
            with open(self.log_file, "w") as f:
                f.write(f"=== PyCaret Run Log - {datetime.datetime.now()} ===\n")

    def log(self, msg, level="INFO"):
        formatted = f"[{level}] {msg}"
        if self.verbose:
            print(formatted)
        if self.log_to_file:
            with open(self.log_file, "a") as f:
                f.write(formatted + "\n")


In [2]:
# 1. First, let's identify columns with constant values
def identify_constant_columns(df):
    """
    Identify columns that contain only a single value.
    Returns a list of column names with constant values.
    """
    constant_columns = []
    for col in df.columns:
        # Check if the column has only one unique value
        if df[col].nunique() == 1:
            constant_value = df[col].iloc[0]
            constant_columns.append((col, constant_value))
    return constant_columns

# 3. Apply the functions to identify constant columns and rows
constant_columns = identify_constant_columns(df)
print(f"Found {len(constant_columns)} columns with constant values:")
for col, value in constant_columns:
    print(f"  - {col}: {value}")

# 4. Create a cleaned dataframe with constant columns removed
constant_column_names = [col for col, _ in constant_columns]
df_cleaned = df.drop(columns=constant_column_names)
print(f"\nShape after removing constant columns: {df_cleaned.shape}")


Found 1 columns with constant values:
  - HFD: 0

Shape after removing constant columns: (3280764, 48)


In [3]:
# Check for NaN values in the DataFrame
nan_counts = df.isna().sum()
columns_with_nans = nan_counts[nan_counts > 0]

print(f"Found {len(columns_with_nans)} columns with NaN values:")
if len(columns_with_nans) > 0:
    print(columns_with_nans)
else:
    print("  No NaN values found in the dataset!")

# Calculate percentage of NaN values in each column
nan_percentage = (df.isna().sum() / len(df)) * 100
high_nan_cols = nan_percentage[nan_percentage > 1].sort_values(ascending=False)

print(f"\nColumns with >1% NaN values:")
if len(high_nan_cols) > 0:
    for col, pct in high_nan_cols.items():
        print(f"  - {col}: {pct:.2f}%")
else:
    print("  No columns with significant NaN values")


Found 0 columns with NaN values:
  No NaN values found in the dataset!

Columns with >1% NaN values:
  No columns with significant NaN values


In [4]:
# Check for outliers in each feature column
# Exclude metadata columns
metadata_cols = ['subject', 'filename', 'grasp', 'relabeled', 'channel', 'window_id']
feature_cols = [col for col in df.columns if col not in metadata_cols]

# Calculate z-scores for each feature
from scipy import stats
import numpy as np

print("\nChecking for extreme outliers (z-score > 5):")
outlier_cols = {}

for col in feature_cols:
    # Skip columns that have constant values
    if df[col].nunique() <= 1:
        continue
    
    # Calculate z-scores, ignoring NaN values
    z_scores = np.abs(stats.zscore(df[col], nan_policy='omit'))
    
    # Count values with z-score > 5 (extreme outliers)
    outlier_count = np.sum(z_scores > 5)
    outlier_percentage = (outlier_count / len(df)) * 100
    
    if outlier_percentage > 1:  # Only report columns with >1% outliers
        outlier_cols[col] = outlier_percentage

if outlier_cols:
    sorted_outliers = dict(sorted(outlier_cols.items(), key=lambda x: x[1], reverse=True))
    for col, pct in sorted_outliers.items():
        print(f"  - {col}: {pct:.2f}% extreme outliers")
else:
    print("  No columns with significant outliers found")



Checking for extreme outliers (z-score > 5):
  No columns with significant outliers found


In [5]:
# Check feature distributions - focus on skewness
print("\nFeatures with highly skewed distributions (skewness > 3):")
skewed_features = {}

for col in feature_cols:
    if df[col].nunique() <= 1:
        continue
    
    # Calculate skewness, ignoring NaN values
    skewness = df[col].skew()
    
    # Only report features with high skewness (absolute value > 3)
    if abs(skewness) > 3:
        skewed_features[col] = skewness

if skewed_features:
    sorted_skewed = dict(sorted(skewed_features.items(), key=lambda x: abs(x[1]), reverse=True))
    for col, skew in sorted_skewed.items():
        direction = "right" if skew > 0 else "left"
        print(f"  - {col}: {skew:.2f} (skewed {direction})")
else:
    print("  No highly skewed features found")



Features with highly skewed distributions (skewness > 3):
  - CoV: -649.38 (skewed left)
  - MNP_STD: 522.34 (skewed right)
  - MNP: 175.73 (skewed right)
  - TTP: 175.73 (skewed right)
  - VAR: 175.69 (skewed right)
  - WAMP: -9.03 (skewed left)
  - WL_STD: 7.65 (skewed right)
  - TD_STD: 7.65 (skewed right)
  - MAVS_STD: 7.65 (skewed right)
  - mDWT_STD: 7.30 (skewed right)
  - SSC_STD: -6.33 (skewed left)
  - LE: -5.55 (skewed left)
  - KURT: 5.49 (skewed right)
  - MAV_STD: 4.90 (skewed right)
  - IAV_STD: 4.90 (skewed right)
  - DASDV: 4.88 (skewed right)
  - PKF: 4.46 (skewed right)
  - RMS: 4.18 (skewed right)
  - RMS_STD: 4.13 (skewed right)
  - VAR_STD: 4.13 (skewed right)
  - LOG: 3.88 (skewed right)
  - MAV: 3.87 (skewed right)
  - IAV: 3.87 (skewed right)
  - mDWT: 3.81 (skewed right)
  - WL: 3.52 (skewed right)
  - TD: 3.52 (skewed right)
  - MAVS: 3.52 (skewed right)
  - CC: 3.52 (skewed right)


In [None]:
# Check for correlation between features - ITERATIVE APPROACH
import matplotlib.pyplot as plt
import seaborn as sns

# Start with all feature columns (excluding metadata)
remaining_features = [col for col in df_cleaned.columns if col not in metadata_cols]
removed_features = []
correlation_history = []
corr_threshold = 0.95

print("Starting iterative correlation removal process...")
print(f"Initial feature count: {len(remaining_features)}")

iteration = 1
while True:
    # Calculate correlation matrix for remaining features
    corr_matrix = df_cleaned[remaining_features].corr()
    
    # Find the highest correlation pair
    highest_corr = 0
    highest_pair = None
    
    for i in range(len(remaining_features)):
        for j in range(i+1, len(remaining_features)):
            correlation = abs(corr_matrix.iloc[i, j])
            if correlation > highest_corr:
                highest_corr = correlation
                highest_pair = (remaining_features[i], remaining_features[j], correlation)
    
    # If highest correlation is below threshold, we're done
    if highest_corr < corr_threshold:
        print(f"\nNo more pairs with correlation >= {corr_threshold}")
        print(f"Highest remaining correlation: {highest_corr:.4f}")
        break
    
    # Record this pair's correlation
    correlation_history.append(highest_pair)
    
    # Decide which feature to remove based on average correlation with other features
    feature1, feature2, corr_value = highest_pair
    
    # Calculate average correlation with all other features
    avg_corr1 = corr_matrix[feature1].abs().mean()
    avg_corr2 = corr_matrix[feature2].abs().mean()
    
    # Also consider NaN percentage
    nan_pct1 = nan_percentage.get(feature1, 0)
    nan_pct2 = nan_percentage.get(feature2, 0)
    
    # Remove the feature with higher average correlation or more NaNs
    if avg_corr1 > avg_corr2 or nan_pct1 > nan_pct2:
        to_remove = feature1
        to_keep = feature2
    else:
        to_remove = feature2
        to_keep = feature1
    
    # Remove the selected feature
    remaining_features.remove(to_remove)
    removed_features.append((to_remove, highest_corr))
    
    print(f"Iteration {iteration}: Removed '{to_remove} [{to_keep}]' (corr={highest_corr:.4f})")
    iteration += 1

# Print summary of removed features
print(f"\nRemoved {len(removed_features)} features due to high correlation:")
for feature, corr in removed_features:
    print(f"  - {feature} (max corr: {corr:.4f})")

print(f"\nRetained {len(remaining_features)} features after correlation analysis")

# Create a heatmap of the final correlation matrix
plt.figure(figsize=(12, 10))
final_corr = df_cleaned[remaining_features].corr()
sns.heatmap(final_corr, annot=False, cmap='coolwarm', center=0, linewidths=0.5)
plt.title('Correlation Matrix After Removing Highly Correlated Features')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Show top 10 highest remaining correlations
print("\nTop 10 highest remaining correlations:")
high_corrs = []
for i in range(len(remaining_features)):
    for j in range(i+1, len(remaining_features)):
        high_corrs.append((
            remaining_features[i],
            remaining_features[j],
            abs(final_corr.iloc[i, j])
        ))

high_corrs.sort(key=lambda x: x[2], reverse=True)
for f1, f2, c in high_corrs[:10]:
    print(f"  - {f1} and {f2}: {c:.4f}")

In [9]:
import pandas as pd
import numpy as np

# Define your feature groups
feature_groups = {
    'amplitude': [
        'MAV', 'IAV', 'RMS', 'TTP', 'LOG', 'MYOP', 'WAMP', 'DASDV', 'CoV'
    ],
    'change': [
        'WL', 'ZC', 'SSC', 'TD', 'MAVS'
    ],
    'moments': [
        'VAR', 'SKEW', 'KURT'
    ],
    'std': [
        'MAV_STD', 'IAV_STD', 'RMS_STD', 'WL_STD', 'ZC_STD',
        'SSC_STD', 'VAR_STD', 'TD_STD', 'MAVS_STD'
    ],
    'freq': [
        'MDF', 'PKF', 'MNF', 'TTP', 'SM1', 'SM2', 'SM3', 'MNPF'
    ],
    'wavelet': ['mDWT', 'mDWT_STD'],
    'complexity': ['SampEn', 'CC', 'LE', 'HFD']
}

# Build a reverse index: feature -> group
feature_to_group = {}
for group, features in feature_groups.items():
    for f in features:
        feature_to_group[f] = group

# Exclude metadata
metadata_cols = ['subject', 'filename', 'grasp', 'relabeled', 'channel', 'window_id']
feature_cols = [col for col in df_cleaned.columns if col not in metadata_cols]

# Create correlation matrix
corr_threshold = 0.98
corr_matrix = df_cleaned[feature_cols].corr().abs()

# Track what gets removed
to_remove = set()
removed_pairs = []

# Loop through upper triangle of correlation matrix
for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        f1 = corr_matrix.columns[i]
        f2 = corr_matrix.columns[j]
        corr = corr_matrix.iloc[i, j]

        if corr > corr_threshold:
            group1 = feature_to_group.get(f1, 'unknown')
            group2 = feature_to_group.get(f2, 'unknown')

            # Prefer to keep features from different groups
            if group1 != group2:
                continue

            # If from same group, remove the one with higher mean correlation to others
            mean_corr_f1 = corr_matrix[f1].mean()
            mean_corr_f2 = corr_matrix[f2].mean()

            if mean_corr_f1 >= mean_corr_f2:
                if f1 not in to_remove:
                    to_remove.add(f1)
                    removed_pairs.append((f1, f2, corr))
            else:
                if f2 not in to_remove:
                    to_remove.add(f2)
                    removed_pairs.append((f2, f1, corr))

# Filter features
final_features = [f for f in feature_cols if f not in to_remove]

# --- Reporting ---
print(f"Initial feature count: {len(feature_cols)}")
print(f"Removed {len(to_remove)} highly correlated features (>|{corr_threshold}|):\n")
for f_rm, f_keep, corr in removed_pairs:
    print(f"  - Removed '{f_rm}' instead of '{f_keep}' (corr={corr:.4f}) [Group: {feature_to_group.get(f_rm)}]")

print(f"\nFinal retained features: {len(final_features)}")

# Optionally: update df
df_selected = df_cleaned[metadata_cols + final_features]


Initial feature count: 42
Removed 11 highly correlated features (>|0.98|):

  - Removed 'IAV' instead of 'MAV' (corr=1.0000) [Group: amplitude]
  - Removed 'RMS' instead of 'MAV' (corr=0.9935) [Group: amplitude]
  - Removed 'MAV' instead of 'LOG' (corr=0.9898) [Group: amplitude]
  - Removed 'MAV_STD' instead of 'IAV_STD' (corr=1.0000) [Group: std]
  - Removed 'RMS_STD' instead of 'MAV_STD' (corr=0.9916) [Group: std]
  - Removed 'VAR_STD' instead of 'MAV_STD' (corr=0.9916) [Group: std]
  - Removed 'WL' instead of 'TD' (corr=1.0000) [Group: change]
  - Removed 'MAVS' instead of 'WL' (corr=1.0000) [Group: change]
  - Removed 'WL_STD' instead of 'TD_STD' (corr=1.0000) [Group: std]
  - Removed 'TD_STD' instead of 'MAVS_STD' (corr=1.0000) [Group: std]
  - Removed 'SM1' instead of 'MNPF' (corr=1.0000) [Group: freq]

Final retained features: 31


In [14]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import warnings
import time
from sklearn.model_selection import train_test_split
from contextlib import redirect_stdout
import io

warnings.filterwarnings("ignore")
logger = Logger(verbose=True, log_to_file=False)

# Measure total time
start_time_total = time.time()

# === Step 1: Filter data for selected classes ===
filtered_labels = [55, 2, 4, 14, 10, 16, 17, 19, 32]
df_reduced = df_selected[df_selected['relabeled'].isin(filtered_labels)].sample(frac=1, random_state=42)

models_to_test = [
    'dummy',    # Dummy classifier (baseline)
    'lr',       # Logistic Regression
    'dt',       # Decision Tree
    'lightgbm', # LightGBM
    'nb',       # Naive Bayes
    'ridge',    # Ridge Classifier
    'knn',      # K Nearest Neighbors
    'ada',      # AdaBoost
    'qda',      # Quadratic Discriminant Analysis
    'et',       # Extra Trees Classifier
    'rf',       # Random Forest
    'gbc',      # Gradient Boosting Classifier
    'svm'       # Support Vector Machine (Linear)
]

# === Step 2: Train/Test Split ===
train_df, test_df = train_test_split(df_reduced, test_size=0.2, random_state=42)
logger.log(f"Train/Test split: {len(train_df)} train, {len(test_df)} test")

# === Step 3: PyCaret Setup (silent) ===
f = io.StringIO()
with redirect_stdout(f):
    s = setup(
        data=train_df,
        target='relabeled',
        numeric_features=[col for col in train_df.columns 
                          if col not in metadata_cols and col != 'relabeled'],
        ignore_features=['subject', 'filename', 'grasp', 'channel', 'window_id'],
        normalize=True,
        transformation=False,
        feature_selection=False,
        polynomial_features=False,
        fold=2,
        session_id=42,
        verbose=False,
        html=False,
        n_jobs=-1,
        use_gpu=True,
        log_experiment=False,
        profile=False
    )
logger.log("PyCaret setup completed")

# === Step 4: Compare Models ===
with redirect_stdout(f):
    best_models = compare_models(
        include=models_to_test,
        n_select=1,
        fold=2,
        sort='Accuracy',
        verbose=False,
        turbo=True
    )
logger.log("Model comparison completed")

# Pull comparison results
comparison_df = pull()
print("\n=== MODEL COMPARISON ===")
# Check if 'Precision' exists in the DataFrame
columns_to_display = ['Model', 'Accuracy', 'AUC', 'Recall', 'F1']
if 'Precision' in comparison_df.columns:
    columns_to_display.insert(4, 'Precision')  # Add 'Precision' if it exists
print(comparison_df[columns_to_display])

# === Step 5: Finalize Best Model ===
selected_model = best_models[0] if isinstance(best_models, list) else best_models
finalized_model = finalize_model(selected_model)

# === Step 6: Evaluate on Test Set ===
test_predictions = predict_model(finalized_model, data=test_df, verbose=False)
logger.log("Model evaluation on test set completed")

# Manual Accuracy (PyCaret's column is often NaN)
accuracy = (test_predictions['prediction_label'] == test_predictions['relabeled']).mean()
print(f"\nTest Accuracy: {accuracy:.4f}")

# === Step 7: Feature Importance ===
def get_model_feature_importance(model, X, y, top_n=20):
    try:
        if hasattr(model, 'feature_importances_'):
            imp = model.feature_importances_
        elif hasattr(model, 'coef_'):
            imp = np.abs(model.coef_[0]) if model.coef_.ndim > 1 else np.abs(model.coef_)
        else:
            from sklearn.inspection import permutation_importance
            result = permutation_importance(model, X, y, n_repeats=3, random_state=42, n_jobs=-1)
            imp = result.importances_mean
        return pd.DataFrame({'Feature': X.columns, 'Importance': imp}).sort_values(by='Importance', ascending=False).head(top_n)
    except Exception as e:
        return pd.DataFrame({'Feature': ['Error'], 'Importance': [0]})
logger.log("Feature importance extraction completed")

X_test = test_df.drop(columns=['relabeled', 'subject', 'filename', 'grasp', 'channel', 'window_id'])
y_test = test_df['relabeled']
importance_df = get_model_feature_importance(finalized_model, X_test, y_test)

print("\n=== TOP FEATURES ===")
print(importance_df)

# === Runtime Summary ===
print(f"\nTotal Runtime: {(time.time() - start_time_total):.2f} seconds")


[INFO] Train/Test split: 219936 train, 54984 test
[INFO] PyCaret setup completed
[INFO] Model comparison completed

=== MODEL COMPARISON ===
                                    Model  Accuracy     AUC  Recall      F1
lightgbm  Light Gradient Boosting Machine    0.2456  0.6814  0.2456  0.2332
rf               Random Forest Classifier    0.2433  0.6728  0.2433  0.2368
et                 Extra Trees Classifier    0.2389  0.6694  0.2389  0.2328
gbc          Gradient Boosting Classifier    0.2365  0.0000  0.2365  0.2195
lr                    Logistic Regression    0.2137  0.0000  0.2137  0.1896
ridge                    Ridge Classifier    0.2120  0.0000  0.2120  0.1820
ada                  Ada Boost Classifier    0.2093  0.0000  0.2093  0.1870
knn                K Neighbors Classifier    0.1827  0.5845  0.1827  0.1833
qda       Quadratic Discriminant Analysis    0.1772  0.0000  0.1772  0.1393
dt               Decision Tree Classifier    0.1734  0.5349  0.1734  0.1734
nb                     

In [None]:
# Create a clean DataFrame with optimal features
# 1. Remove constant columns (already done)
# 2. Remove columns with high NaN percentage (>20%)
high_nan_columns = list(nan_percentage[nan_percentage > 20].index)

# 3. Use the iteratively determined features to remove
correlation_features_to_remove = [feature for feature, _ in removed_features]

# Combine all columns to remove
columns_to_remove = list(set(constant_column_names + high_nan_columns + correlation_features_to_remove))

# Create optimized dataframe
df_optimized = df.drop(columns=columns_to_remove)

print(f"\nOptimized DataFrame Summary:")
print(f"  - Original feature count: {len(feature_cols)}")
print(f"  - Removed constant columns: {len(constant_column_names)}")
print(f"  - Removed high-NaN columns: {len(high_nan_columns)}")
print(f"  - Removed correlated features: {len(correlation_features_to_remove)}")
print(f"  - Final feature count: {len([c for c in df_optimized.columns if c not in metadata_cols])}")
print(f"  - Final shape: {df_optimized.shape}")

# Save the cleaned and optimized dataframe
cleaned_file_path = os.path.join('preprocessed_data', database, f"{os.path.splitext(filename)[0]}_cleaned.parquet")
df_optimized.to_parquet(cleaned_file_path, index=False)
print(f"\nSaved cleaned dataframe to: {cleaned_file_path}")

In [None]:
# Import PyCaret's classification module
from pycaret.classification import *
import pandas as pd
import numpy as np
import warnings
import time
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")  # Suppress all warnings

# Start timing for the entire process
start_time_total = time.time()

# Filter specific movements
print("Step 1: Filtering data for selected grasp types...")
start_time = time.time()
filtered_labels = [55, 2, 4, 14, 10, 16, 17, 19, 32]

# Filter to include only the specified labels
df_reduced = df_optimized[df_optimized['relabeled'].isin(filtered_labels)]
print(f"✓ Done in {time.time() - start_time:.2f}s - Filtered data shape: {df_reduced.shape}")

# SAMPLING - Uncommenting this will make your analysis much faster
start_time = time.time()
print("\nStep 2: Sampling data for faster execution...")
sample_frac = 1  # Using 30% of data for faster results
df_reduced = df_reduced.sample(frac=sample_frac, random_state=42)
print(f"✓ Done in {time.time() - start_time:.2f}s - Using {sample_frac*100:.0f}% sample with shape: {df_reduced.shape}")

# Split data for training
### SUBJECT SPLITTING --- TESTES GENERALIZATION
print("\nStep 3: Splitting data by subjects...")
start_time = time.time()
#subjects = df_reduced['subject'].unique()
#test_subjects = np.random.choice(subjects, size=int(len(subjects)*0.2), replace=False)
#train_df = df_reduced[~df_reduced['subject'].isin(test_subjects)]
#test_df = df_reduced[df_reduced['subject'].isin(test_subjects)]

### Traditioonal split (80/20) can be used as well
train_df, test_df = train_test_split(df_reduced, test_size=0.2, random_state=42)
print(f"✓ Done in {time.time() - start_time:.2f}s - Train: {train_df.shape}, Test: {test_df.shape}")



# Initialize PyCaret setup with maximum speed optimization
print("\nStep 4: Setting up PyCaret (this may take a few minutes)...")
start_time = time.time()
s = setup(
    data=train_df,
    target='relabeled',
    numeric_features=[col for col in train_df.columns 
                     if col not in metadata_cols and col != 'relabeled'],
    ignore_features=['subject', 'filename', 'grasp', 'channel', 'window_id'],
    normalize=True,
    transformation=False,
    feature_selection=False,
    # feature_interaction=False,  # Removed as it is not a valid argument
    polynomial_features=False,
    fold=2,  # Minimum folds for speed
    session_id=42,
    verbose=False,
    html=False,
    n_jobs=-1,
    use_gpu=True,
    log_experiment=False,  # Disable logging for speed
    experiment_name=None,  # Disable experiment name to avoid prints
    # Removed 'silent' argument as it is not valid
    profile=False  # Disable data profiling
)
print(f"✓ Done in {time.time() - start_time:.2f}s")

# Compare all fast models
print("\nStep 5: Training and comparing models...")
start_time = time.time()
models_to_test = [
    'lr',       # Logistic Regression
    'dt',       # Decision Tree
    'lightgbm', # LightGBM
    'nb',       # Naive Bayes
    'ridge',    # Ridge Classifier
    'knn',      # K Nearest Neighbors
    'ada'       # AdaBoost

]

best_models = compare_models(
    include=models_to_test,
    n_select=1,  # Only select the best model to save time
    fold=2,
    sort='Accuracy',
    verbose=False,
    turbo=True
)
print(f"✓ Done in {time.time() - start_time:.2f}s")

# Get the results table with all models compared
print("\n=== MODEL COMPARISON TABLE ===")
comparison_df = pull()
print(comparison_df)

# Select the model with best performance
print("\nStep 6: Selecting best model...")
start_time = time.time()
if isinstance(best_models, list):
    selected_model = best_models[0]
else:
    selected_model = best_models
print(f"✓ Done in {time.time() - start_time:.2f}s - Selected model: {selected_model.__class__.__name__}")

# Fix feature importance with a robust approach
print("\nStep 7: Calculating feature importance (with failsafe)...")
start_time = time.time()

# Define a reliable function to get feature importance for any model
def get_model_feature_importance(model, X, y, top_n=20):
    """Safe function to extract feature importance using multiple methods"""
    feature_names = X.columns.tolist()
    importance_df = None
    
    # Method 1: Try model's built-in feature_importances_
    try:
        if hasattr(model, 'feature_importances_'):
            importance = model.feature_importances_
            importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': importance
            })
            return importance_df.sort_values('Importance', ascending=False).head(top_n)
    except:
        pass
        
    # Method 2: Try model's coef_ attribute (for linear models)
    try:
        if hasattr(model, 'coef_'):
            importance = np.abs(model.coef_[0]) if model.coef_.ndim > 1 else np.abs(model.coef_)
            importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': importance
            })
            return importance_df.sort_values('Importance', ascending=False).head(top_n)
    except:
        pass
    
    # Method 3: Fallback to permutation importance (works with any model)
    try:
        from sklearn.inspection import permutation_importance
        result = permutation_importance(
            model, X, y, 
            n_repeats=3,  # Lower for speed
            random_state=42,
            n_jobs=-1
        )
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': result.importances_mean
        })
        return importance_df.sort_values('Importance', ascending=False).head(top_n)
    except:
        return pd.DataFrame({'Feature': ["Error calculating importance"], 'Importance': [0]})
        
# Extract X and y from test data
X_test = test_df.drop(['relabeled', 'subject', 'filename', 'grasp', 'channel', 'window_id'], axis=1) 
y_test = test_df['relabeled']

# Get the best model and finalize it
try:
    finalized_model = finalize_model(selected_model)
    model_sklearn = finalized_model
except:
    model_sklearn = selected_model  # Use the original model if finalize fails

# Get feature importance table with our reliable function
importance_df = get_model_feature_importance(model_sklearn, X_test, y_test)

print(f"✓ Done in {time.time() - start_time:.2f}s")
print("\n=== FEATURE IMPORTANCE (TOP 20) ===")
print(importance_df.head(20))

# Final evaluation on test set
print("\nStep 8: Evaluating model on test set...")
start_time = time.time()
test_predictions = predict_model(selected_model, data=test_df, verbose=False)
print(f"✓ Done in {time.time() - start_time:.2f}s")

print("\n=== TEST SET PERFORMANCE ===")
if 'Accuracy' in test_predictions.columns:
    print(f"Accuracy: {test_predictions['Accuracy'].mean():.4f}")
else:
    # Calculate accuracy manually
    correct = sum(test_predictions['prediction_label'] == test_predictions['relabeled'])
    accuracy = correct / len(test_predictions)
    print(f"Accuracy: {accuracy:.4f}")

# Report total runtime
total_time = time.time() - start_time_total
print(f"\nTotal execution time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

### 200 ms Window with original signal

In [15]:
import pandas as pd
import os

# Replace 'your_file.parquet' with the actual file name
filename = 'DB4_w400_env0_f0.6[20250406-201248].parquet'
file_path = 'preprocessed_data'
database = 'DB4'

file_path = os.path.join(file_path, database, filename)

# Load the Parquet file into a pandas DataFrame
df = pd.read_parquet(file_path)

# Display the DataFrame
df.head()

Unnamed: 0,subject,filename,grasp,relabeled,channel,window_id,MAV,MAV_STD,IAV,IAV_STD,...,CC,LE,HFD,DASDV,MYOP,WAMP,CARD,LOG,SKEW,KURT
0,s1,S1_E1_A1.mat,0,0,Channel 1,0,0.072685,0.05612,29.073971,0.05612,...,0.045544,-1.627829,0,0.059622,0.8725,329,0.86,0.04966,0.277695,3.64145
1,s1,S1_E1_A1.mat,0,0,Channel 1,1,0.068561,0.059857,27.424438,0.059857,...,0.048902,-1.448815,0,0.061638,0.8125,351,0.86,0.042523,0.398774,3.593001
2,s1,S1_E1_A1.mat,0,0,Channel 1,2,0.076618,0.065604,30.647045,0.065604,...,0.054342,-1.227032,0,0.070405,0.87,347,0.88,0.049029,0.288872,4.063548
3,s1,S1_E1_A1.mat,0,0,Channel 1,3,0.067623,0.061,27.049158,0.061,...,0.047828,-1.39274,0,0.0623,0.8125,338,0.84,0.041331,0.343092,4.368192
4,s1,S1_E1_A1.mat,0,0,Channel 1,4,0.063394,0.057528,25.357519,0.057528,...,0.045724,-1.277231,0,0.061475,0.8125,330,0.9,0.038421,0.728083,4.574465


In [16]:
# 1. First, let's identify columns with constant values
def identify_constant_columns(df):
    """
    Identify columns that contain only a single value.
    Returns a list of column names with constant values.
    """
    constant_columns = []
    for col in df.columns:
        # Check if the column has only one unique value
        if df[col].nunique() == 1:
            constant_value = df[col].iloc[0]
            constant_columns.append((col, constant_value))
    return constant_columns

# 3. Apply the functions to identify constant columns and rows
constant_columns = identify_constant_columns(df)
print(f"Found {len(constant_columns)} columns with constant values:")
for col, value in constant_columns:
    print(f"  - {col}: {value}")

# 4. Create a cleaned dataframe with constant columns removed
constant_column_names = [col for col, _ in constant_columns]
df_cleaned = df.drop(columns=constant_column_names)
print(f"\nShape after removing constant columns: {df_cleaned.shape}")


Found 1 columns with constant values:
  - HFD: 0

Shape after removing constant columns: (1638756, 48)


In [17]:
# Check feature distributions - focus on skewness
print("\nFeatures with highly skewed distributions (skewness > 3):")
skewed_features = {}

for col in feature_cols:
    if df[col].nunique() <= 1:
        continue
    
    # Calculate skewness, ignoring NaN values
    skewness = df[col].skew()
    
    # Only report features with high skewness (absolute value > 3)
    if abs(skewness) > 3:
        skewed_features[col] = skewness

if skewed_features:
    sorted_skewed = dict(sorted(skewed_features.items(), key=lambda x: abs(x[1]), reverse=True))
    for col, skew in sorted_skewed.items():
        direction = "right" if skew > 0 else "left"
        print(f"  - {col}: {skew:.2f} (skewed {direction})")
else:
    print("  No highly skewed features found")



Features with highly skewed distributions (skewness > 3):
  - CoV: 1158.67 (skewed right)
  - MNP_STD: 368.17 (skewed right)
  - TTP: 84.32 (skewed right)
  - MNP: 84.32 (skewed right)
  - VAR: 84.31 (skewed right)
  - WAMP: -9.63 (skewed left)
  - KURT: 8.68 (skewed right)
  - SSC_STD: -6.79 (skewed left)
  - mDWT_STD: 6.48 (skewed right)
  - WL_STD: 6.33 (skewed right)
  - TD_STD: 6.33 (skewed right)
  - MAVS_STD: 6.33 (skewed right)
  - LE: -5.43 (skewed left)
  - MAV_STD: 4.46 (skewed right)
  - IAV_STD: 4.46 (skewed right)
  - DASDV: 4.36 (skewed right)
  - RMS: 3.85 (skewed right)
  - RMS_STD: 3.83 (skewed right)
  - VAR_STD: 3.83 (skewed right)
  - LOG: 3.66 (skewed right)
  - MAV: 3.60 (skewed right)
  - IAV: 3.60 (skewed right)
  - mDWT: 3.55 (skewed right)
  - PKF: 3.50 (skewed right)
  - WL: 3.39 (skewed right)
  - TD: 3.39 (skewed right)
  - MAVS: 3.39 (skewed right)
  - CC: 3.39 (skewed right)


In [18]:
import pandas as pd
import numpy as np

# Define your feature groups
feature_groups = {
    'amplitude': [
        'MAV', 'IAV', 'RMS', 'TTP', 'LOG', 'MYOP', 'WAMP', 'DASDV', 'CoV'
    ],
    'change': [
        'WL', 'ZC', 'SSC', 'TD', 'MAVS'
    ],
    'moments': [
        'VAR', 'SKEW', 'KURT'
    ],
    'std': [
        'MAV_STD', 'IAV_STD', 'RMS_STD', 'WL_STD', 'ZC_STD',
        'SSC_STD', 'VAR_STD', 'TD_STD', 'MAVS_STD'
    ],
    'freq': [
        'MDF', 'PKF', 'MNF', 'TTP', 'SM1', 'SM2', 'SM3', 'MNPF'
    ],
    'wavelet': ['mDWT', 'mDWT_STD'],
    'complexity': ['SampEn', 'CC', 'LE', 'HFD']
}

# Build a reverse index: feature -> group
feature_to_group = {}
for group, features in feature_groups.items():
    for f in features:
        feature_to_group[f] = group

# Exclude metadata
metadata_cols = ['subject', 'filename', 'grasp', 'relabeled', 'channel', 'window_id']
feature_cols = [col for col in df_cleaned.columns if col not in metadata_cols]

# Create correlation matrix
corr_threshold = 0.98
corr_matrix = df_cleaned[feature_cols].corr().abs()

# Track what gets removed
to_remove = set()
removed_pairs = []

# Loop through upper triangle of correlation matrix
for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        f1 = corr_matrix.columns[i]
        f2 = corr_matrix.columns[j]
        corr = corr_matrix.iloc[i, j]

        if corr > corr_threshold:
            group1 = feature_to_group.get(f1, 'unknown')
            group2 = feature_to_group.get(f2, 'unknown')

            # Prefer to keep features from different groups
            if group1 != group2:
                continue

            # If from same group, remove the one with higher mean correlation to others
            mean_corr_f1 = corr_matrix[f1].mean()
            mean_corr_f2 = corr_matrix[f2].mean()

            if mean_corr_f1 >= mean_corr_f2:
                if f1 not in to_remove:
                    to_remove.add(f1)
                    removed_pairs.append((f1, f2, corr))
            else:
                if f2 not in to_remove:
                    to_remove.add(f2)
                    removed_pairs.append((f2, f1, corr))

# Filter features
final_features = [f for f in feature_cols if f not in to_remove]

# --- Reporting ---
print(f"Initial feature count: {len(feature_cols)}")
print(f"Removed {len(to_remove)} highly correlated features (>|{corr_threshold}|):\n")
for f_rm, f_keep, corr in removed_pairs:
    print(f"  - Removed '{f_rm}' instead of '{f_keep}' (corr={corr:.4f}) [Group: {feature_to_group.get(f_rm)}]")

print(f"\nFinal retained features: {len(final_features)}")

# Optionally: update df
df_selected = df_cleaned[metadata_cols + final_features]


Initial feature count: 42
Removed 11 highly correlated features (>|0.98|):

  - Removed 'MAV' instead of 'IAV' (corr=1.0000) [Group: amplitude]
  - Removed 'RMS' instead of 'MAV' (corr=0.9928) [Group: amplitude]
  - Removed 'MAV_STD' instead of 'IAV_STD' (corr=1.0000) [Group: std]
  - Removed 'RMS_STD' instead of 'MAV_STD' (corr=0.9924) [Group: std]
  - Removed 'VAR_STD' instead of 'MAV_STD' (corr=0.9924) [Group: std]
  - Removed 'IAV' instead of 'LOG' (corr=0.9914) [Group: amplitude]
  - Removed 'WL' instead of 'TD' (corr=1.0000) [Group: change]
  - Removed 'WL_STD' instead of 'TD_STD' (corr=1.0000) [Group: std]
  - Removed 'TD' instead of 'MAVS' (corr=1.0000) [Group: change]
  - Removed 'TD_STD' instead of 'MAVS_STD' (corr=1.0000) [Group: std]
  - Removed 'SM1' instead of 'MNPF' (corr=1.0000) [Group: freq]

Final retained features: 31


In [19]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import warnings
import time
from sklearn.model_selection import train_test_split
from contextlib import redirect_stdout
import io

warnings.filterwarnings("ignore")
logger = Logger(verbose=True, log_to_file=False)

# Measure total time
start_time_total = time.time()

# === Step 1: Filter data for selected classes ===
filtered_labels = [55, 2, 4, 14, 10, 16, 17, 19, 32]
df_reduced = df_selected[df_selected['relabeled'].isin(filtered_labels)].sample(frac=1, random_state=42)

models_to_test = [
    'dummy',    # Dummy classifier (baseline)
    'lr',       # Logistic Regression
    'dt',       # Decision Tree
    'lightgbm', # LightGBM
    'nb',       # Naive Bayes
    'ridge',    # Ridge Classifier
    'knn',      # K Nearest Neighbors
    'ada',      # AdaBoost
    'qda',      # Quadratic Discriminant Analysis
    'et',       # Extra Trees Classifier
    'rf',       # Random Forest
    'gbc',      # Gradient Boosting Classifier
    'svm'       # Support Vector Machine (Linear)
]

# === Step 2: Train/Test Split ===
train_df, test_df = train_test_split(df_reduced, test_size=0.2, random_state=42)
logger.log(f"Train/Test split: {len(train_df)} train, {len(test_df)} test")

# === Step 3: PyCaret Setup (silent) ===
f = io.StringIO()
with redirect_stdout(f):
    s = setup(
        data=train_df,
        target='relabeled',
        numeric_features=[col for col in train_df.columns 
                          if col not in metadata_cols and col != 'relabeled'],
        ignore_features=['subject', 'filename', 'grasp', 'channel', 'window_id'],
        normalize=True,
        transformation=False,
        feature_selection=False,
        polynomial_features=False,
        fold=2,
        session_id=42,
        verbose=False,
        html=False,
        n_jobs=-1,
        use_gpu=True,
        log_experiment=False,
        profile=False
    )
logger.log("PyCaret setup completed")

# === Step 4: Compare Models ===
with redirect_stdout(f):
    best_models = compare_models(
        include=models_to_test,
        n_select=1,
        fold=2,
        sort='Accuracy',
        verbose=False,
        turbo=True
    )
logger.log("Model comparison completed")

# Pull comparison results
comparison_df = pull()
print("\n=== MODEL COMPARISON ===")
# Check if 'Precision' exists in the DataFrame
columns_to_display = ['Model', 'Accuracy', 'AUC', 'Recall', 'F1']
if 'Precision' in comparison_df.columns:
    columns_to_display.insert(4, 'Precision')  # Add 'Precision' if it exists
print(comparison_df[columns_to_display])

# === Step 5: Finalize Best Model ===
selected_model = best_models[0] if isinstance(best_models, list) else best_models
finalized_model = finalize_model(selected_model)

# === Step 6: Evaluate on Test Set ===
test_predictions = predict_model(finalized_model, data=test_df, verbose=False)
logger.log("Model evaluation on test set completed")

# Manual Accuracy (PyCaret's column is often NaN)
accuracy = (test_predictions['prediction_label'] == test_predictions['relabeled']).mean()
print(f"\nTest Accuracy: {accuracy:.4f}")

# === Step 7: Feature Importance ===
def get_model_feature_importance(model, X, y, top_n=20):
    try:
        if hasattr(model, 'feature_importances_'):
            imp = model.feature_importances_
        elif hasattr(model, 'coef_'):
            imp = np.abs(model.coef_[0]) if model.coef_.ndim > 1 else np.abs(model.coef_)
        else:
            from sklearn.inspection import permutation_importance
            result = permutation_importance(model, X, y, n_repeats=3, random_state=42, n_jobs=-1)
            imp = result.importances_mean
        return pd.DataFrame({'Feature': X.columns, 'Importance': imp}).sort_values(by='Importance', ascending=False).head(top_n)
    except Exception as e:
        return pd.DataFrame({'Feature': ['Error'], 'Importance': [0]})
logger.log("Feature importance extraction completed")

X_test = test_df.drop(columns=['relabeled', 'subject', 'filename', 'grasp', 'channel', 'window_id'])
y_test = test_df['relabeled']
importance_df = get_model_feature_importance(finalized_model, X_test, y_test)

print("\n=== TOP FEATURES ===")
print(importance_df)

# === Runtime Summary ===
print(f"\nTotal Runtime: {(time.time() - start_time_total):.2f} seconds")


[INFO] Train/Test split: 109756 train, 27440 test
[INFO] PyCaret setup completed
[INFO] Model comparison completed

=== MODEL COMPARISON ===
                                    Model  Accuracy     AUC  Recall      F1
rf               Random Forest Classifier    0.2711  0.6989  0.2711  0.2660
et                 Extra Trees Classifier    0.2668  0.6961  0.2668  0.2618
lightgbm  Light Gradient Boosting Machine    0.2649  0.6968  0.2649  0.2557
gbc          Gradient Boosting Classifier    0.2472  0.0000  0.2472  0.2336
lr                    Logistic Regression    0.2186  0.0000  0.2186  0.1969
ridge                    Ridge Classifier    0.2154  0.0000  0.2154  0.1894
ada                  Ada Boost Classifier    0.2089  0.0000  0.2089  0.1902
knn                K Neighbors Classifier    0.2077  0.6084  0.2077  0.2081
dt               Decision Tree Classifier    0.1897  0.5441  0.1897  0.1898
qda       Quadratic Discriminant Analysis    0.1807  0.0000  0.1807  0.1568
nb                     

In [1]:
import pandas as pd
import os

# Replace 'your_file.parquet' with the actual file name
filename = 'signals_50OL_03Windowing.pkl'
file_path = 'preprocessed_data'

file_path = os.path.join(file_path, filename)

# Load the Pickle file into a pandas DataFrame
df = pd.read_pickle(file_path)

# Display the DataFrame
df.head()

Unnamed: 0,label,RMS_E1,MAV_E1,VARIANCE_E1,SAMPLE_VARIANCE_E1,RMS_E2,MAV_E2,VARIANCE_E2,SAMPLE_VARIANCE_E2,RMS_E3,...,VARIANCE_E6,SAMPLE_VARIANCE_E6,RMS_E7,MAV_E7,VARIANCE_E7,SAMPLE_VARIANCE_E7,RMS_E8,MAV_E8,VARIANCE_E8,SAMPLE_VARIANCE_E8
0,Base,0.200014,0.19997,1.8e-05,1.8e-05,0.202461,0.197485,0.00199,0.002024,0.459723,...,2.6e-05,2.7e-05,0.373438,0.366913,0.00483,0.004912,0.322071,0.318258,0.002441,0.002483
1,Base,0.193816,0.19372,3.7e-05,3.8e-05,0.154873,0.154214,0.000204,0.000207,0.342896,...,7e-06,7e-06,0.294526,0.293246,0.000753,0.000765,0.266149,0.265341,0.000429,0.000437
2,Base,0.181736,0.181561,6.3e-05,6.4e-05,0.134383,0.134057,8.7e-05,8.9e-05,0.294403,...,4e-06,4e-06,0.253785,0.253042,0.000377,0.000383,0.233627,0.233021,0.000283,0.000287
3,Base,0.166725,0.166455,9e-05,9.2e-05,0.120167,0.11996,5e-05,5e-05,0.261695,...,3e-06,3e-06,0.224414,0.223954,0.000206,0.00021,0.206151,0.205611,0.000222,0.000226
4,Base,0.149564,0.149202,0.000108,0.00011,0.109139,0.108991,3.2e-05,3.3e-05,0.236985,...,2e-06,2e-06,0.202398,0.2021,0.000121,0.000123,0.18134,0.180819,0.000189,0.000192
