In [None]:
import os


In [None]:

# --- CONFIGURATION ---
# Tensor files that check_nans reported as corrupted
BAD_FILES = [
    "S030.pt", "S012.pt", "S016.pt", "S034.pt",
    "S031.pt", "S033.pt", "S004.pt",
]

# Locate 'processed_tensors': the current folder is tried first, then the
# parent folder. The for/else raises only when neither candidate exists.
for candidate_dir in ("processed_tensors", "../processed_tensors"):
    if os.path.exists(candidate_dir):
        DATA_DIR = candidate_dir
        break
else:
    raise FileNotFoundError("Could not find 'processed_tensors' folder. Check your notebook location.")

separator = "-" * 40
print(f"Targeting Folder: {os.path.abspath(DATA_DIR)}")
print(separator)

# --- CLEANUP LOOP ---
# Every listed file is removed permanently (os.remove); files that are
# already gone are reported and skipped.
deleted_count = 0

for bad_name in BAD_FILES:
    target = os.path.join(DATA_DIR, bad_name)

    if not os.path.exists(target):
        print(f"⚠️  Not Found (Already deleted?): {bad_name}")
        continue

    try:
        os.remove(target)
        print(f"✅ Deleted: {bad_name}")
        deleted_count += 1
    except OSError as err:
        # A failed delete is reported but does not stop the remaining files.
        print(f"❌ Error deleting {bad_name}: {err}")

print(separator)
print(f"Cleanup Complete. Removed {deleted_count} files.")
print("You are now ready to run 'train_gait_model.py'.")

## Checking whether the raw files themselves have problems

### CODE

In [4]:
import pandas as pd
import numpy as np
import glob
import os


In [1]:

def diagnose_csv(filepath, min_rows=200, max_zeros=100):
    """Print a short health report for one raw CSV file.

    The checks run in this order and the report stops at the first failure:

    1. Missing values: any NaN in the table is a failure; the per-column
       NaN counts are printed.
    2. Zeros in the pelvis z-acceleration column (only when that column is
       present): more than ``max_zeros`` exact zeros prints a warning but
       does not stop the report.
    3. Length: fewer than ``min_rows`` rows is a failure.

    Args:
        filepath: Path of the CSV file to inspect.
        min_rows: Minimum number of rows a usable file must have.
        max_zeros: Number of exact-zero samples tolerated in the pelvis
            z-acceleration column before a warning is printed.

    Returns:
        None. Everything is reported through ``print``. Any exception raised
        while reading or checking the file is caught and printed as a read
        error instead of propagating.
    """
    print(f"\n--- Diagnosing: {os.path.basename(filepath)} ---")
    try:
        df = pd.read_csv(filepath)

        # Check 1: Missing Values (NaNs in raw CSV)
        # The null mask is built once and reused for both the test and the report.
        null_counts = df.isnull().sum()
        null_counts = null_counts[null_counts > 0]
        if not null_counts.empty:
            print("❌ FAILURE: Raw CSV contains NaN values.")
            print(null_counts)
            return

        # Check 2: Zeros (Potential sensor dropout)
        # Check a critical column like 'Pelvis-Acceleration-z'
        col = "Noraxon MyoMotion-Segments-Pelvis-Acceleration-z (mG)"
        if col in df.columns:
            zeros = (df[col] == 0).sum()
            if zeros > max_zeros:
                print(f"⚠️ WARNING: Found {zeros} zeros in Pelvis Accel. Sensor might have dropped out.")

        # Check 3: Short File
        # With the default of 200 rows this is "less than 1 second" of data
        # (presumably a 200 Hz recording; confirm the sampling rate).
        if len(df) < min_rows:
            print(f"❌ FAILURE: File too short ({len(df)} rows). Minimum {min_rows} required.")
            return

        print("✅ Raw CSV looks healthy. The bug might be in normalization (Divide by Zero).")

    except Exception as e:
        print(f"❌ Error reading file: {e}")


In [6]:

# Adjust path to where your RAW CSVs are
# The path is relative, so it resolves against the notebook's working directory,
# and it points at the folder of a single subject (S030).
# NOTE(review): no cell visible in this notebook reads RAW_DATA_PATH; the
# diagnosis loop builds its own "../DATA/fep/<subject>" paths. Confirm whether
# this constant is still needed.
RAW_DATA_PATH = "../DATA/fep/S030" 


In [9]:
# Subjects whose raw CSV files are re-checked with diagnose_csv.
SUBJECT_IDS = ["S030", "S012", "S016", "S034", "S031", "S033", "S004"]
l = SUBJECT_IDS  # old name kept so any cell that still reads `l` keeps working

# The loop variable is a real name rather than `_`, which IPython uses for the
# previous cell output.
for subject_id in SUBJECT_IDS:
    subject_dir = os.path.join("../DATA/fep", subject_id)
    # glob returns matches in arbitrary order; sorting keeps the report stable between runs.
    files = sorted(glob.glob(os.path.join(subject_dir, "*target_features.csv")))
    if not files:
        # Name the subject that is actually missing, not always S030.
        print(f"Could not find {subject_id} files. Check path: {subject_dir}")
        continue
    for f in files:
        diagnose_csv(f)


--- Diagnosing: S030_G01_D02_B01_T01-target_features.csv ---
✅ Raw CSV looks healthy. The bug might be in normalization (Divide by Zero).

--- Diagnosing: S030_G01_D01_B01_T02-target_features.csv ---
✅ Raw CSV looks healthy. The bug might be in normalization (Divide by Zero).

--- Diagnosing: S030_G01_D02_B03_T01-target_features.csv ---
✅ Raw CSV looks healthy. The bug might be in normalization (Divide by Zero).

--- Diagnosing: S030_G01_D02_B01_T02-target_features.csv ---
✅ Raw CSV looks healthy. The bug might be in normalization (Divide by Zero).

--- Diagnosing: S030_G01_D01_B01_T01-target_features.csv ---
✅ Raw CSV looks healthy. The bug might be in normalization (Divide by Zero).

--- Diagnosing: S030_G01_D01_B02_T03-target_features.csv ---
✅ Raw CSV looks healthy. The bug might be in normalization (Divide by Zero).

--- Diagnosing: S030_G01_D01_B01_T03-target_features.csv ---
✅ Raw CSV looks healthy. The bug might be in normalization (Divide by Zero).

--- Diagnosing: S030_G01_D

## Code edit: quarantine corrupted raw CSVs instead of deleting them

In [10]:
import os
import glob
import pandas as pd
import shutil
from tqdm import tqdm
from utils.loggers import get_logger


ModuleNotFoundError: No module named 'utils'

In [None]:

# --- CONFIGURATION ---
# Both locations are relative paths, so they resolve against the current
# working directory; the values are meant for running from the 'verf/' root.
QUARANTINE_ROOT = "quarantined_files"
DATABASE_ROOT = os.path.join("database", "fep")

# Logger used by clean_database for all of its reporting.
# NOTE(review): the two arguments look like a logger name and a log file
# name; confirm against utils.loggers.get_logger.
logger = get_logger("Cleaner", "data_cleaning.log")

def _corruption_reason(fpath, min_rows=200):
    """Load one CSV and return why it should be quarantined, or None if it passes.

    The returned label ("NaNs" or "Too Short") is the text that appears inside
    the "CORRUPT (...)" log line. Errors raised while reading the file are not
    caught here; the caller reports them as a separate "READ ERROR" case.
    """
    df = pd.read_csv(fpath, low_memory=False)

    # 1. Any missing value anywhere in the table
    if df.isnull().values.any():
        return "NaNs"

    # 2. Empty or truncated file
    if len(df) < min_rows:
        return "Too Short"

    return None


def _free_destination(directory, name):
    """Return a path for `name` inside `directory` that does not exist yet.

    shutil.move may silently overwrite an existing destination file, so a
    numeric suffix is inserted before the extension until the path is unused.
    """
    stem, ext = os.path.splitext(name)
    candidate = os.path.join(directory, name)
    attempt = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem}_{attempt}{ext}")
        attempt += 1
    return candidate


def clean_database():
    """Move corrupted '*target_features.csv' files out of the database.

    Every file below DATABASE_ROOT whose name ends in 'target_features.csv'
    (searched recursively, at any depth) is loaded with pandas. A file is
    treated as corrupted when it contains a NaN, has fewer than 200 rows, or
    cannot be read at all. Corrupted files are moved, not deleted, into
    QUARANTINE_ROOT under the name '<parent folder>_<file name>'; a file
    already quarantined under that name is never overwritten.

    Returns:
        None. Progress and results are reported through the module-level
        `logger`. The function returns early when DATABASE_ROOT is missing.
    """
    logger.info(f"Scanning database at: {os.path.abspath(DATABASE_ROOT)}")

    # Check if database exists
    if not os.path.exists(DATABASE_ROOT):
        logger.error(f"Folder not found: {DATABASE_ROOT}")
        return

    # Create Quarantine (exist_ok guards against the folder appearing between
    # the check and the call)
    if not os.path.exists(QUARANTINE_ROOT):
        os.makedirs(QUARANTINE_ROOT, exist_ok=True)
        logger.info(f"Created quarantine: {os.path.abspath(QUARANTINE_ROOT)}")

    # Recursive search for all target_features.csv files
    # matches verf/database/fep/S###/*.csv
    search_pattern = os.path.join(DATABASE_ROOT, "**", "*target_features.csv")
    files = glob.glob(search_pattern, recursive=True)

    logger.info(f"Found {len(files)} CSV files. Checking for corruption...")

    bad_files = []

    for fpath in tqdm(files, desc="Validating CSVs"):
        try:
            reason = _corruption_reason(fpath)
        except Exception as e:
            # An unreadable file is quarantined as well.
            logger.error(f"READ ERROR: {os.path.basename(fpath)} | {e}")
            bad_files.append(fpath)
            continue

        if reason is not None:
            logger.warning(f"CORRUPT ({reason}): {os.path.basename(fpath)}")
            bad_files.append(fpath)

    logger.info("-" * 30)

    if not bad_files:
        logger.info("✅ No corrupted files found. Your database is clean!")
        return

    logger.info(f"Found {len(bad_files)} corrupted files.")

    # Move files
    moved_count = 0
    for f in bad_files:
        filename = os.path.basename(f)
        # Add parent folder to filename to avoid overwriting (e.g. S030_Trial1.csv)
        parent = os.path.basename(os.path.dirname(f))
        # Two files can still map to the same name (deeper nesting, or a
        # previous run), so pick a destination that is not taken yet.
        destination = _free_destination(QUARANTINE_ROOT, f"{parent}_{filename}")

        try:
            shutil.move(f, destination)
            moved_count += 1
        except Exception as e:
            logger.error(f"Error moving {filename}: {e}")

    logger.info(f"Cleanup Complete. Moved {moved_count} files to '{QUARANTINE_ROOT}'.")

    # Only announce a clean database when every corrupted file really left it.
    failed_count = len(bad_files) - moved_count
    if failed_count:
        logger.warning(f"{failed_count} corrupted files could not be moved and are still in the database.")
    else:
        logger.info("You can now safely re-run 'prepare_data.py'.")

# Run the cleaner when this code executes as the top-level module. A notebook
# kernel also sets __name__ to "__main__", so executing this cell starts the
# scan (and any file moves) immediately.
if __name__ == "__main__":
    clean_database()