In [12]:
import os
import pandas as pd
import numpy as np

def find_outliers_zscore(data, threshold=3):
    """
    Locate two-sided Z-score outliers in a numeric pandas Series.

    Every value is standardised with the Series' own mean and standard
    deviation (as returned by ``data.mean()`` / ``data.std()``); a value is
    reported when the absolute value of its Z-score is strictly greater
    than ``threshold``.

    Args:
        data: A pandas Series of numerical data.
        threshold: Cut-off applied to the absolute Z-score. Default is 3.

    Returns:
        A list with the index labels of the outlying entries (empty when
        nothing exceeds the threshold).
    """
    centered = data - data.mean()
    standardized = centered / data.std()
    # NaN scores (e.g. zero spread) compare False, so they are never reported.
    is_outlier = standardized.abs() > threshold
    flagged = data[is_outlier]
    return flagged.index.tolist()

def process_csv_file(filepath, column='higher_order_amplitude_sum', threshold=3):
    """
    Remove Z-score outliers of one column from a CSV file, in place.

    The file is read, rows whose value in ``column`` is flagged by
    ``find_outliers_zscore`` are dropped, and the result is written back
    to the same path. The file is only rewritten when at least one
    outlier is found.

    Note: the original file is overwritten. Running this again recomputes
    the Z-scores on the already-trimmed data and may remove further rows.

    Args:
        filepath: The path to the CSV file.
        column: Name of the column to screen for outliers.
                Default is 'higher_order_amplitude_sum'.
        threshold: Absolute Z-score above which a row is dropped.
                   Default is 3.

    Returns:
        None. Progress, warnings and errors are reported via ``print``;
        exceptions raised while processing are caught and printed.
    """
    try:
        df = pd.read_csv(filepath)
        if column not in df.columns:
            print(f"Warning: Column '{column}' not found in {filepath}")
            return

        outlier_indices = find_outliers_zscore(df[column], threshold=threshold)
        if not outlier_indices:
            # Nothing to remove: leave the file untouched.
            return

        # The message reports the threshold actually used, not a fixed literal.
        print(f"Removing {len(outlier_indices)} outliers (Z-score threshold={threshold}) from {filepath}")

        # Drop outlier rows by index label, then overwrite the source file.
        cleaned = df.drop(outlier_indices)
        cleaned.to_csv(filepath, index=False)
        print(f"Outliers removed and file saved at {filepath}")

    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
    except pd.errors.EmptyDataError:
        print(f"Warning: Empty CSV file: {filepath}")
    except Exception as e:
        # Best-effort batch processing: report and continue with the next file.
        print(f"Error processing {filepath}: {e}")

def process_directory(directory):
    """
    Recursively run ``process_csv_file`` on every CSV file under a directory.

    A file counts as CSV when its name ends in ".csv", compared
    case-insensitively.

    Args:
        directory: The path to the directory to walk.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        csv_names = [name for name in filenames if name.lower().endswith(".csv")]
        for name in csv_names:
            process_csv_file(os.path.join(current_dir, name))

# Entry point for this cell: clean every CSV found under the directory below.
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute path tied to one machine. The tree is
    # walked recursively, and any CSV with detected outliers is overwritten
    # in place, so re-running this cell modifies the data again.
    base_directory = "/home/aricept094/mydata/ANOVA/outlier"
    process_directory(base_directory)

Removing 3 outliers (Z-score threshold=3) from /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_results_radial_24.csv
Outliers removed and file saved at /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_results_radial_24.csv
Removing 4 outliers (Z-score threshold=3) from /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_results_radial_20.csv
Outliers removed and file saved at /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_results_radial_20.csv
Removing 2 outliers (Z-score threshold=3) from /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_results_radial_4.csv
Outliers removed and file saved at /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_results_radial_4.csv
Removing 1 outliers (Z-score threshold=3) from /home/aricept094/mydata/ANOVA/outlier/Pachymetry_Value_casia_less_than_1/analysis_res

In [13]:
import pandas as pd
import os

def remove_duplicate_rows(csv_filepath, columns_to_check=None):
    """
    Remove duplicate rows from a CSV file and print a short summary.

    Two rows are duplicates when they agree on every column in
    ``columns_to_check``; the first occurrence is kept. The file is
    rewritten in place only when at least one duplicate was removed.

    Args:
        csv_filepath (str): The full path to the CSV file.
        columns_to_check (list[str] | None): Columns that define a duplicate.
            Defaults to the four amplitude columns
            ("dc_component", "component_1_amplitude",
            "component_2_amplitude", "higher_order_amplitude_sum").

    Returns:
        None. A missing file, an empty file, or missing columns are reported
        via ``print`` and the file is left untouched.
    """
    if columns_to_check is None:
        columns_to_check = [
            "dc_component",
            "component_1_amplitude",
            "component_2_amplitude",
            "higher_order_amplitude_sum",
        ]
    else:
        # Accept any iterable of names without mutating the caller's object.
        columns_to_check = list(columns_to_check)

    try:
        df = pd.read_csv(csv_filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {csv_filepath}")
        return
    except pd.errors.EmptyDataError:
        # A zero-byte CSV would otherwise abort the whole directory scan.
        print(f"Warning: Empty CSV file: {csv_filepath}")
        return

    # Check that every key column exists in the dataframe.
    if not all(col in df.columns for col in columns_to_check):
        print(f"Error: Not all required columns found in {csv_filepath}")
        return

    initial_rows = len(df)

    # Keep the first row of each duplicate group; build a new frame rather
    # than mutating the one that was read.
    deduplicated = df.drop_duplicates(subset=columns_to_check, keep='first')
    final_rows = len(deduplicated)

    duplicates_removed = initial_rows - final_rows

    print(f"File: {os.path.basename(csv_filepath)}")
    print(f"  Initial rows: {initial_rows}")
    print(f"  Duplicates removed: {duplicates_removed}")
    print(f"  Final rows: {final_rows}")

    # Avoid rewriting files that did not change.
    if duplicates_removed > 0:
        deduplicated.to_csv(csv_filepath, index=False)

    print("-" * 40)


def process_csv_files_in_directory(directory_path):
    """
    Run ``remove_duplicate_rows`` on each CSV file directly inside a directory.

    Only the entries returned by ``os.listdir`` are considered, so
    subdirectories are not descended into. A name counts as CSV when it
    ends in ".csv", compared case-insensitively.

    Args:
        directory_path (str): The full path to the directory containing the CSV files.
    """
    csv_names = (
        entry for entry in os.listdir(directory_path)
        if entry.lower().endswith(".csv")
    )
    for entry in csv_names:
        remove_duplicate_rows(os.path.join(directory_path, entry))

# Entry point for this cell: de-duplicate the CSV files in the directory below.
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute path tied to one machine. Only the top
    # level of this directory is scanned, and files that contain duplicates
    # are overwritten in place.
    directory_to_scan = "/home/aricept094/mydata/ANOVA/radius"
    process_csv_files_in_directory(directory_to_scan)

File: analysis_results_radial_16_casia_less_than_1.csv
  Initial rows: 186
  Duplicates removed: 26
  Final rows: 160
----------------------------------------
File: analysis_results_radial_4_casia_less_than_1.csv
  Initial rows: 186
  Duplicates removed: 26
  Final rows: 160
----------------------------------------
File: analysis_results_radial_20_casia1-2.csv
  Initial rows: 203
  Duplicates removed: 21
  Final rows: 182
----------------------------------------
File: analysis_results_radial_16_casia_more_than_4.csv
  Initial rows: 38
  Duplicates removed: 0
  Final rows: 38
----------------------------------------
File: analysis_results_radial_16_casia1-2.csv
  Initial rows: 200
  Duplicates removed: 21
  Final rows: 179
----------------------------------------
File: analysis_results_radial_16_casia2-4.csv
  Initial rows: 106
  Duplicates removed: 3
  Final rows: 103
----------------------------------------
File: analysis_results_radial_24_casia_more_than_4.csv
  Initial rows: 40
  Du