In [15]:
import os
import pandas as pd
import numpy as np
import shutil  # Import the shutil module for copying files

def find_outliers_zscore_both_directions(data, threshold=3):
    """
    Locate values lying more than `threshold` standard deviations from the mean,
    reporting the upper and lower tails separately.

    Args:
        data: A pandas Series of numerical data.
        threshold: Z-score cut-off. A value is flagged when its Z-score is strictly
                   greater than `threshold` or strictly less than `-threshold`.
                   Default is 3.

    Returns:
        A dictionary with two keys, 'higher' and 'lower', each mapping to the list
        of index labels of the Series flagged on that side.
    """
    center, spread = data.mean(), data.std()

    # Zero spread means every value equals the mean: nothing can be an outlier,
    # and dividing by it would not give usable Z-scores.
    if spread == 0:
        return {'higher': [], 'lower': []}

    z = (data - center) / spread
    above = z > threshold
    below = z < -threshold

    return {
        'higher': data[above].index.tolist(),
        'lower': data[below].index.tolist(),
    }

def process_csv_file(filepath, output_base_directory_outlier2, output_base_directory_radius3, input_base_directory):
    """
    Remove Z-score outliers from one CSV file and write the cleaned result twice.

    Processing steps:
      1. 'higher_order_amplitude_sum' is filtered iteratively: up to three passes
         at a Z-score threshold of 2.5, each pass recomputing mean/std on the rows
         that survived the previous pass. The loop stops early when a pass finds
         nothing or no rows are left.
      2. The remaining columns ('component_1_amplitude', 'component_2_amplitude',
         'dc_component') are each checked once at a threshold of 3, all on the
         same post-step-1 data; the union of flagged rows is then dropped.
      3. The output filename gets a suffix derived from the input file's parent
         directory name (empty if the directory is not a known one).
      4. The cleaned frame is written to the outlier2 tree (mirroring the input
         folder structure) and to the radius3 directory (flattened, no subfolders).

    Columns missing from the file are reported with a warning and skipped.
    Errors are reported on stdout and never propagated, so that one bad file
    does not abort a directory-wide run.

    Args:
        filepath: The path to the CSV file.
        output_base_directory_outlier2: Directory for outlier-removed files with folder structure.
        output_base_directory_radius3: Directory for outlier-removed files without folder structure (flattened).
        input_base_directory: Base directory of input files; used to compute the
            relative sub-path recreated under the outlier2 directory.
    """
    columns_to_check = ['higher_order_amplitude_sum', 'component_1_amplitude', 'component_2_amplitude', 'dc_component']
    strict_threshold_column = 'higher_order_amplitude_sum'
    default_threshold = 3
    strict_threshold = 2.5
    max_strict_passes = 3

    # Parent directory name -> suffix appended to the output filename. The suffix
    # keeps files from different groups distinguishable in the flattened output.
    parent_dir_suffixes = {
        "Pachymetry_Value_casia1-2": "_casia1-2",
        "Pachymetry_Value_casia_less_than_1": "_casia_less_than_1",
        "Pachymetry_Value_casia2-4": "_casia2-4",
        "Pachymetry_Value_casia_more_than_4": "_casia_more_than_4",
    }

    try:
        df = pd.read_csv(filepath)
        print(f"Processing file: {filepath}")

        # --- Step 1: iterative strict filtering of the strict column ---------
        # DataFrame.drop returns a new frame, so `df` itself is never mutated.
        df_filtered = df
        if strict_threshold_column in df_filtered.columns:
            for pass_num in range(1, max_strict_passes + 1):
                if df_filtered.empty:
                    print(f"  Column: '{strict_threshold_column}' - Pass {pass_num}: DataFrame is empty, skipping pass.")
                    break

                found = find_outliers_zscore_both_directions(
                    df_filtered[strict_threshold_column], threshold=strict_threshold
                )
                pass_indices = found['higher'] + found['lower']
                if not pass_indices:
                    print(f"  Column: '{strict_threshold_column}' - Pass {pass_num}: No outliers found.")
                    break

                print(f"  Column: '{strict_threshold_column}' - Pass {pass_num}: Found {len(pass_indices)} outliers (Z-score > {strict_threshold} or < -{strict_threshold})")
                # Drop right away so the next pass recomputes mean/std without them.
                df_filtered = df_filtered.drop(pass_indices)
        else:
            # Report the missing column the same way the other columns are reported.
            print(f"  Warning: Column '{strict_threshold_column}' not found in this file.")

        # --- Step 2: single pass over the remaining columns ------------------
        # Every column is evaluated against the same post-step-1 data; flagged
        # rows are collected in a set because one row may be flagged repeatedly.
        other_outlier_indices = set()
        for column in columns_to_check:
            if column == strict_threshold_column:
                continue
            if column not in df_filtered.columns:
                print(f"  Warning: Column '{column}' not found in this file.")
                continue

            found = find_outliers_zscore_both_directions(df_filtered[column], threshold=default_threshold)
            column_indices = found['higher'] + found['lower']
            if column_indices:
                print(f"  Column: '{column}' - Found {len(column_indices)} outliers (Z-score > {default_threshold} or < -{default_threshold})")
                other_outlier_indices.update(column_indices)

        df_final = df_filtered
        if other_outlier_indices:
            print(f"  Removing {len(other_outlier_indices)} outlier rows from other columns")
            df_final = df_filtered.drop(sorted(other_outlier_indices))

        # --- Step 3: build the output filename -------------------------------
        relative_path = os.path.relpath(filepath, input_base_directory)
        output_dir_relative = os.path.dirname(relative_path)
        input_parent_dir_name = os.path.basename(os.path.dirname(filepath))
        filename_no_ext, ext = os.path.splitext(os.path.basename(filepath))
        suffix = parent_dir_suffixes.get(input_parent_dir_name, "")
        output_filename = filename_no_ext + suffix + ext

        # --- Step 4: write both copies of the cleaned data -------------------
        # outlier2: same relative folder layout as the input tree.
        output_filepath_outlier2 = os.path.join(output_base_directory_outlier2, output_dir_relative, output_filename)
        os.makedirs(os.path.dirname(output_filepath_outlier2), exist_ok=True)
        df_final.to_csv(output_filepath_outlier2, index=False)

        # radius3: a second copy of the same cleaned data, directly in the base directory.
        output_filepath_radius3 = os.path.join(output_base_directory_radius3, output_filename)
        os.makedirs(output_base_directory_radius3, exist_ok=True)
        df_final.to_csv(output_filepath_radius3, index=False)

        print(f"  Outliers removed and file saved at {output_filepath_outlier2} (with folders) and {output_filepath_radius3} (flattened)")
        print("-" * 30)  # Separator for different files

    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
    except pd.errors.EmptyDataError:
        print(f"Warning: Empty CSV file: {filepath}")
    except Exception as e:
        # Deliberate best-effort boundary: report and continue with the next file.
        print(f"Error processing {filepath}: {e}")

def process_directory(input_directory, output_directory_outlier2, output_directory_radius3):
    """
    Walk `input_directory` recursively and run outlier removal on every CSV file.

    A file counts as CSV when its name ends with ".csv", compared case-insensitively.

    Args:
        input_directory: Root of the input tree; also passed on as the base for
            computing each file's relative path.
        output_directory_outlier2: Output directory for outlier-removed files with folder structure.
        output_directory_radius3: Output directory for outlier-removed files without folder structure.
    """
    for current_dir, _subdirs, names in os.walk(input_directory):
        csv_names = (name for name in names if name.lower().endswith(".csv"))
        for name in csv_names:
            process_csv_file(
                os.path.join(current_dir, name),
                output_directory_outlier2,
                output_directory_radius3,
                input_directory,
            )

if __name__ == "__main__":
    # Input tree, plus the two destinations: outlier2 mirrors the input folder
    # layout, radius3 receives the same files flattened into one directory.
    input_base_directory = "/home/aricept094/mydata/ANOVA/outlier"
    output_base_directory_outlier2 = "/home/aricept094/mydata/ANOVA/outlier2"
    output_base_directory_radius3 = "/home/aricept094/mydata/ANOVA/radius3"

    process_directory(
        input_directory=input_base_directory,
        output_directory_outlier2=output_base_directory_outlier2,
        output_directory_radius3=output_base_directory_radius3,
    )
    print(
        "Processing complete. Files saved in:",
        output_base_directory_outlier2,
        "(with folders) and",
        output_base_directory_radius3,
        "(flattened)",
    )

Processing file: /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_results_radial_24.csv
  Column: 'higher_order_amplitude_sum' - Pass 1: Found 1 outliers (Z-score > 2.5 or < -2.5)
  Column: 'higher_order_amplitude_sum' - Pass 2: Found 7 outliers (Z-score > 2.5 or < -2.5)
  Column: 'higher_order_amplitude_sum' - Pass 3: Found 3 outliers (Z-score > 2.5 or < -2.5)
  Column: 'component_1_amplitude' - Found 3 outliers (Z-score > 3 or < -3)
  Column: 'component_2_amplitude' - Found 2 outliers (Z-score > 3 or < -3)
  Removing 5 outlier rows from other columns
  Outliers removed and file saved at /home/aricept094/mydata/ANOVA/outlier2/Pachymetry_Value_casia_less_than_1/analysis_results_radial_24_casia_less_than_1.csv (with folders) and /home/aricept094/mydata/ANOVA/radius3/analysis_results_radial_24_casia_less_than_1.csv (flattened)
------------------------------
Processing file: /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_

In [None]:
import pandas as pd
import os

def remove_duplicate_rows(csv_filepath):
    """
    Remove duplicate rows from a CSV file in place and print a short report.

    Two rows are duplicates when their values in 'dc_component',
    'component_1_amplitude', 'component_2_amplitude' and
    'higher_order_amplitude_sum' are all equal after rounding to 6 decimal
    places. The first row of each duplicate group is kept, with its original
    (unrounded) values. The file is rewritten only if at least one row was
    removed.

    If the CSV has a 'filename' column, the filenames of all rows belonging to a
    duplicate group (kept and removed alike) are listed in order of first
    appearance; without that column the listing is skipped.

    A missing file, an empty file, or a file lacking any of the four required
    columns is reported on stdout and left untouched.

    Args:
        csv_filepath (str): The full path to the CSV file.
    """
    try:
        df = pd.read_csv(csv_filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_filepath}")
        return
    except pd.errors.EmptyDataError:
        print(f"Warning: Empty CSV file: {csv_filepath}")
        return

    columns_to_check = ["dc_component", "component_1_amplitude", "component_2_amplitude", "higher_order_amplitude_sum"]

    if not all(col in df.columns for col in columns_to_check):
        print(f"Error: Not all required columns found in {csv_filepath}")
        return

    initial_rows = len(df)

    # Both the report and the removal must use the rounded values; comparing
    # the raw floats would miss rows that differ only past the 6th decimal.
    df_rounded = df[columns_to_check].round(6)
    in_duplicate_group = df_rounded.duplicated(keep=False)
    is_repeat = df_rounded.duplicated(keep='first')

    # dict.fromkeys de-duplicates while preserving first-seen order, so the
    # report is deterministic (a set would not be).
    if 'filename' in df.columns:
        unique_duplicated_filenames = list(dict.fromkeys(df.loc[in_duplicate_group, 'filename'].tolist()))
    else:
        unique_duplicated_filenames = []

    # Keep the first occurrence of each rounded-value group, with its original values.
    df = df[~is_repeat]
    final_rows = len(df)

    duplicates_removed = initial_rows - final_rows

    print(f"File: {os.path.basename(csv_filepath)}")
    print(f"  Initial rows: {initial_rows}")
    print(f"  Duplicates removed: {duplicates_removed}")
    print(f"  Final rows: {final_rows}")

    if duplicates_removed > 0:
        if unique_duplicated_filenames:
            print("  Duplicated filenames:")
            for filename in unique_duplicated_filenames:
                print(f"    - {filename}")
        df.to_csv(csv_filepath, index=False)

    print("-" * 40)


def process_csv_files_in_directory(directory_path):
    """
    Run duplicate-row removal on each CSV file directly inside a directory.

    Only the directory's immediate entries are examined (no recursion); an entry
    is treated as CSV when its name ends with ".csv", compared case-insensitively.

    Args:
        directory_path (str): The full path to the directory containing the CSV files.
    """
    for entry in os.listdir(directory_path):
        if not entry.lower().endswith(".csv"):
            continue
        remove_duplicate_rows(os.path.join(directory_path, entry))

if __name__ == "__main__":
    # Directory whose CSV files are de-duplicated in place.
    directory_to_scan = "/home/aricept094/mydata/ANOVA/radius2"
    process_csv_files_in_directory(directory_path=directory_to_scan)