In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import glob
import os

def perform_anova_analysis(data_dir, alpha=0.05):
    """Run a one-way ANOVA per radius and parameter, with Tukey HSD follow-up.

    For every radius, all files matching
    ``analysis_results_radial_<radius>_*.csv`` in ``data_dir`` are read and
    assigned to a group by looking for the group name in the file's base name.
    For each parameter the groups are compared with ``scipy.stats.f_oneway``;
    when the p-value is below ``alpha`` a Tukey HSD test is run on the same
    data.

    Results are written to ``<data_dir>/anova_results`` as
    ``anova_results.csv``, ``tukey_results.csv`` and
    ``significant_differences_summary.txt``.

    Args:
        data_dir: Directory containing the per-group CSV files.
        alpha: Significance threshold used for the ANOVA, the Tukey test and
            the ``significant`` flags.

    Returns:
        tuple: ``(anova_df, tukey_df)``. Both frames always carry their full
        set of columns, even when no result rows were produced.
    """
    # Parameters of interest
    parameters = ['dc_component', 'component_1_amplitude',
                  'component_2_amplitude', 'higher_order_amplitude_sum']

    # Radius values
    radii = [4, 8, 12, 16, 20, 24]

    # Groups
    groups = ['casia_less_than_1', 'casia1-2', 'casia2-4', 'more_than_4']

    # Fixed schemas so that empty result sets still yield usable DataFrames.
    anova_columns = ['radius', 'parameter', 'f_statistic', 'p_value', 'significant']
    tukey_columns = ['radius', 'parameter', 'group1', 'group2',
                     'mean_difference', 'p_value', 'significant']

    results = []
    tukey_results = []

    for radius in radii:
        radius_data = {param: {group: [] for group in groups} for param in parameters}

        # Escape the directory so glob metacharacters in it are taken literally.
        pattern = os.path.join(glob.escape(data_dir),
                               f"analysis_results_radial_{radius}_*.csv")

        # Sorted for a reproducible processing order.
        for file in sorted(glob.glob(pattern)):
            # Match on the base name only: a group name appearing in the
            # directory path must not decide the group of every file.
            file_name = os.path.basename(file)
            current_group = next((group for group in groups if group in file_name), None)
            if current_group is None:
                # Never fall back to the group of a previously processed file.
                print(f"Skipping file with no recognised group: {file}")
                continue

            try:
                df = pd.read_csv(file)
            except Exception as e:
                print(f"Error reading file {file}: {e}")
                continue

            for param in parameters:
                if param in df.columns:
                    # Missing values would turn the F statistic into NaN.
                    radius_data[param][current_group].extend(df[param].dropna().tolist())

        for param in parameters:
            populated = {group: radius_data[param][group]
                         for group in groups
                         if radius_data[param][group]}

            if not populated:
                continue
            if len(populated) < 2:
                print(f"Error performing analysis for radius {radius}, parameter {param}: "
                      f"at least two groups with data are required")
                continue

            try:
                f_stat, p_value = stats.f_oneway(
                    *(np.asarray(values, dtype=float) for values in populated.values())
                )
                is_significant = bool(p_value < alpha)

                results.append({
                    'radius': radius,
                    'parameter': param,
                    'f_statistic': f_stat,
                    'p_value': p_value,
                    'significant': is_significant
                })

                if is_significant:
                    observations = [value for values in populated.values() for value in values]
                    labels = [group for group, values in populated.items() for _ in values]
                    tukey = pairwise_tukeyhsd(observations, labels, alpha=alpha)

                    # Row layout: group1, group2, meandiff, p-adj, lower, upper, reject.
                    for comparison in tukey.summary().data[1:]:  # skip header row
                        tukey_results.append({
                            'radius': radius,
                            'parameter': param,
                            'group1': comparison[0],
                            'group2': comparison[1],
                            'mean_difference': comparison[2],
                            'p_value': comparison[3],
                            'significant': bool(comparison[3] < alpha)
                        })
            except Exception as e:
                print(f"Error performing analysis for radius {radius}, parameter {param}: {e}")

    anova_df = pd.DataFrame(results, columns=anova_columns)
    tukey_df = pd.DataFrame(tukey_results, columns=tukey_columns)
    # Keep the flag usable as a boolean mask even when the frame is empty.
    anova_df['significant'] = anova_df['significant'].astype(bool)
    tukey_df['significant'] = tukey_df['significant'].astype(bool)

    results_dir = os.path.join(data_dir, 'anova_results')
    os.makedirs(results_dir, exist_ok=True)

    anova_df.to_csv(os.path.join(results_dir, 'anova_results.csv'), index=False)
    tukey_df.to_csv(os.path.join(results_dir, 'tukey_results.csv'), index=False)

    _write_significance_summary(
        os.path.join(results_dir, 'significant_differences_summary.txt'),
        radii, anova_df, tukey_df,
    )

    return anova_df, tukey_df


def _write_significance_summary(summary_path, radii, anova_df, tukey_df):
    """Write a plain-text report of significant ANOVA and Tukey results."""
    lines = ["SIGNIFICANT DIFFERENCES SUMMARY\n",
             "===============================\n\n"]

    for radius in radii:
        radius_anova = anova_df[anova_df['radius'] == radius]
        if radius_anova.empty:
            continue

        lines.append(f"\nRadius {radius}:\n")
        lines.append("-" * 20 + "\n")

        for _, row in radius_anova[radius_anova['significant']].iterrows():
            lines.append(f"\nParameter: {row['parameter']}\n")
            lines.append(f"F-statistic: {row['f_statistic']:.3f}\n")
            lines.append(f"p-value: {row['p_value']:.4f}\n")

            # Pairwise comparisons belonging to this radius/parameter.
            param_tukey = tukey_df[
                (tukey_df['radius'] == radius) &
                (tukey_df['parameter'] == row['parameter']) &
                (tukey_df['significant'])
            ]
            if param_tukey.empty:
                continue

            lines.append("Significant group differences:\n")
            for _, tukey_row in param_tukey.iterrows():
                lines.append(
                    f"  {tukey_row['group1']} vs {tukey_row['group2']}: "
                    f"mean diff = {tukey_row['mean_difference']:.3f}, "
                    f"p = {tukey_row['p_value']:.4f}\n"
                )

    with open(summary_path, 'w') as f:
        f.writelines(lines)

# Run the analysis
data_directory = '/home/aricept094/mydata/ANOVA/radius'
anova_results, tukey_results = perform_anova_analysis(data_directory)

# Print significant findings.
# A result frame built from zero rows may have no 'significant' column at all,
# so filter only when the column is present instead of raising KeyError.
print("\nSignificant ANOVA Results:")
if 'significant' in anova_results.columns:
    significant_anova = anova_results[anova_results['significant'].astype(bool)]
else:
    significant_anova = anova_results
print(significant_anova)

print("\nSignificant Group Differences (Tukey test):")
if 'significant' in tukey_results.columns:
    significant_tukey = tukey_results[tukey_results['significant'].astype(bool)]
else:
    significant_tukey = tukey_results
print(significant_tukey)

In [4]:
import pandas as pd
import os
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.sandbox.stats.multicomp import multipletests

# --- Configuration ---

# Where the per-group CSV files are read from and where result tables are written.
input_dir = "/home/aricept094/mydata/ANOVA/radius"
output_dir = "/home/aricept094/mydata/ANOVA/results"

# Statistical settings shared by the ANOVA and the post-hoc test.
alpha_level = 0.01
correction_method = 'fdr_bh'  # 'bonferroni' or 'fdr_bh' (Benjamini-Hochberg)

# Radial numbers; each one is part of the input file names.
numbers = [4, 8, 12, 16, 20, 24]

# Group labels, also used as the file-name suffix.
# NOTE(review): the other cells use "more_than_4" for the last group — confirm
# which spelling matches the files on disk.
group_names = ["casia_less_than_1", "casia1-2", "casia2-4", "casia_more_than_4"]

# Columns analysed in every input file.
parameters_of_interest = [
    "dc_component", "component_1_amplitude",
    "component_2_amplitude", "higher_order_amplitude_sum",
]

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# --- Function to perform ANOVA ---

def perform_anova(data_dict):
    """
    Performs a one-way ANOVA across the groups contained in ``data_dict``.

    Only the groups actually present in ``data_dict`` take part, in the
    dictionary's own order; groups with no observations are ignored. This lets
    callers pass a dictionary from which missing groups were already removed.

    Args:
        data_dict (dict): Dictionary where keys are group names and values are lists of parameter data.

    Returns:
        dict: Dictionary containing ANOVA results ('F_value', 'p_value').

    Raises:
        TypeError: Raised by ``scipy.stats.f_oneway`` when fewer than two
            non-empty groups are supplied.
    """
    # Prepare data for ANOVA, skipping empty groups
    group_data_list = [values for values in data_dict.values() if len(values) > 0]

    # Perform ANOVA
    fvalue, pvalue = stats.f_oneway(*group_data_list)

    return {'F_value': float(fvalue), 'p_value': float(pvalue)}


# --- Main script ---

anova_results_list = []
anova_p_values_for_correction = []

for number in numbers:
    # Read every group's file once per radial number instead of once per parameter.
    group_frames = {}
    for group_name in group_names:
        # Construct filename
        filename = f"analysis_results_radial_{number}_{group_name}.csv"
        filepath = os.path.join(input_dir, filename)
        try:
            group_frames[group_name] = pd.read_csv(filepath)
        except FileNotFoundError:
            print(f"Warning: File not found: {filepath}")
        except pd.errors.EmptyDataError:
            print(f"Warning: File is empty: {filepath}")

    for parameter in parameters_of_interest:
        group_data = {}
        for group_name in group_names:
            frame = group_frames.get(group_name)
            if frame is None:
                group_data[group_name] = []
            elif parameter not in frame.columns:
                # A missing column is reported, not allowed to abort the whole run.
                print(f"Warning: Column {parameter} not found for group {group_name}, radial number {number}.")
                group_data[group_name] = []
            else:
                # NaN observations would make the F statistic NaN.
                group_data[group_name] = frame[parameter].dropna().tolist()

        # Data Validation
        valid_group_data = {group: data for group, data in group_data.items() if data}

        if not valid_group_data:
            print(f"Warning: No data available for ANOVA for radial number {number}, parameter {parameter}. Skipping ANOVA.")
            continue

        if len(valid_group_data) < len(group_names):
            missing_groups = [group for group in group_names if group not in valid_group_data]
            print(f"Warning: Missing data for groups {missing_groups} for radial number {number}, parameter {parameter}. ANOVA will be performed with available groups.")

        if len(valid_group_data) < 2:
            # f_oneway needs at least two samples to compare.
            print(f"Warning: Fewer than two groups with data for radial number {number}, parameter {parameter}. Skipping ANOVA.")
            continue

        if any(len(data) < 2 for data in valid_group_data.values()):
            print(f"Warning: Insufficient data points in at least one group for ANOVA for radial number {number}, parameter {parameter}. Skipping ANOVA.")
            continue

        anova_result = perform_anova(valid_group_data)
        anova_p_values_for_correction.append(anova_result['p_value'])  # Collect p-value for correction
        anova_results_list.append({
            'parameter': parameter,
            'radial_number': number,
            'anova_result': anova_result,
            'valid_group_data': valid_group_data
        })

# --- Correct ANOVA p-values and update ANOVA results list ---
if anova_p_values_for_correction:
    reject_anova, corrected_anova_p_values, _, _ = multipletests(
        anova_p_values_for_correction, alpha=alpha_level, method=correction_method,
        is_sorted=False, returnsorted=False,
    )
else:
    # Nothing to correct when every combination was skipped.
    reject_anova, corrected_anova_p_values = [], []

# Build the table in one go; the i-th corrected value belongs to the i-th test.
all_anova_results_df = pd.DataFrame([
    {
        **res['anova_result'],
        'parameter': res['parameter'],
        'radial_number': res['radial_number'],
        'p_value_corrected': corrected_anova_p_values[i],
        'significant_corrected': bool(reject_anova[i]),
    }
    for i, res in enumerate(anova_results_list)
])

# --- Perform Post-hoc TukeyHSD for significant tests after correction ---
tukey_frames = []
for res, rejected in zip(anova_results_list, reject_anova):
    if not rejected:  # Check corrected significance
        continue

    parameter = res['parameter']
    number = res['radial_number']
    valid_group_data = res['valid_group_data']

    # Flatten the groups into parallel observation/label lists for Tukey HSD.
    all_data = []
    labels = []
    for group_name in group_names:
        if group_name in valid_group_data:  # Ensure group has data
            all_data.extend(valid_group_data[group_name])
            labels.extend([group_name] * len(valid_group_data[group_name]))

    if len(set(labels)) > 1:
        tukey_result = pairwise_tukeyhsd(all_data, labels, alpha=alpha_level)
        tukey_table = tukey_result.summary().data  # first row is the header
        tukey_df = pd.DataFrame(data=tukey_table[1:], columns=tukey_table[0])
        tukey_df['radial_number'] = number
        tukey_df['parameter'] = parameter
        tukey_frames.append(tukey_df)
    else:
        print(f"Warning: Cannot perform Tukey HSD for radial number {number}, parameter {parameter}. Only one group with data after ANOVA significance.")

all_tukey_results_df = pd.concat(tukey_frames, ignore_index=True) if tukey_frames else pd.DataFrame()


# --- Save results to CSV ---

anova_output_file = os.path.join(output_dir, "anova_results_corrected.csv")
tukey_output_file = os.path.join(output_dir, "tukey_posthoc_results_corrected.csv")

all_anova_results_df.to_csv(anova_output_file, index=False)
print(f"ANOVA results saved to: {anova_output_file}")

if not all_tukey_results_df.empty:
    all_tukey_results_df.to_csv(tukey_output_file, index=False)
    print(f"Tukey Post-hoc results saved to: {tukey_output_file}")
else:
    print("No significant ANOVA results found after correction, Tukey Post-hoc results file not created.")

print("ANOVA analysis completed with multiple comparison correction.")

ANOVA results saved to: /home/aricept094/mydata/ANOVA/results/anova_results_corrected.csv
Tukey Post-hoc results saved to: /home/aricept094/mydata/ANOVA/results/tukey_posthoc_results_corrected.csv
ANOVA analysis completed with multiple comparison correction.


In [None]:
import pandas as pd
import os
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.sandbox.stats.multicomp import multipletests
import numpy as np

# --- Configuration ---

# Where the per-group CSV files are read from and where result tables are written.
input_dir = "/home/aricept094/mydata/ANOVA/radius"
output_dir = "/home/aricept094/mydata/ANOVA/results"

# Statistical settings shared by the ANOVA and the post-hoc test.
alpha_level = 0.05
correction_method = 'fdr_bh'  # 'bonferroni' or 'fdr_bh' (Benjamini-Hochberg)

# Radial numbers; each one is part of the input file names.
numbers = [4, 8, 12, 16, 20, 24]

# Group labels, also used as the file-name suffix.
group_names = ["casia_less_than_1", "casia1-2", "casia2-4", "more_than_4"]

# Columns analysed in every input file.
parameters_of_interest = [
    "dc_component", "component_1_amplitude",
    "component_2_amplitude", "higher_order_amplitude_sum",
]

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# --- Assumption Checking Functions ---

def check_normality(data_dict):
    """
    Runs the Shapiro-Wilk normality test on every group in ``data_dict``.

    Args:
        data_dict (dict): Maps each group name to its list of parameter values.

    Returns:
        dict: One entry per group, in input order. Each value is a dict with
              'statistic' and 'p_value'. Groups with fewer than three values
              are not tested: their 'statistic' is None and their 'p_value'
              holds an explanatory message instead of a number.
    """
    normality_results = {}
    for label in data_dict:
        samples = data_dict[label]
        if len(samples) < 3:
            # Too few observations for the test to be run.
            normality_results[label] = {
                'statistic': None,
                'p_value': 'Insufficient data (<3 samples)',
            }
            continue
        w_statistic, w_p_value = stats.shapiro(samples)
        normality_results[label] = {'statistic': w_statistic, 'p_value': w_p_value}
    return normality_results

def check_homoscedasticity(data_dict):
    """
    Runs Levene's test for equal variances across the groups in ``data_dict``.

    Only groups listed in the module-level ``group_names`` are considered, in
    that order; groups absent from ``data_dict`` are skipped.

    Args:
        data_dict (dict): Maps each group name to its list of parameter values.

    Returns:
        dict: 'statistic' and 'p_value' of Levene's test. When fewer than two
              groups are available, or any available group is empty,
              'statistic' is None and 'p_value' holds an explanatory message.
    """
    samples = []
    for label in group_names:
        if label in data_dict:  # tolerate groups that were dropped upstream
            samples.append(data_dict[label])

    # The test needs at least two groups, none of them empty.
    if len(samples) < 2 or any(len(values) == 0 for values in samples):
        return {'statistic': None, 'p_value': 'Insufficient groups or group data'}

    levene_statistic, levene_p_value = stats.levene(*samples)
    return {'statistic': levene_statistic, 'p_value': levene_p_value}


# --- Function to perform ANOVA and Post-hoc ---

def perform_anova_and_posthoc(data_dict, parameter, number, alpha=alpha_level, correction_method=correction_method):
    """
    Performs a one-way ANOVA for one parameter and, if it is significant at
    ``alpha`` (uncorrected), a Tukey HSD post-hoc test.

    Also reports eta-squared as effect size and the results of the assumption
    checks (Shapiro-Wilk per group, Levene across groups).

    No multiple-comparison correction is applied here: 'p_value_corrected' and
    'significant_corrected' are placeholders, and the Tukey columns
    'p-adj_corrected' / 'reject_corrected' start as copies of the uncorrected
    values. They are meant to be filled in by the caller.

    Args:
        data_dict (dict): Dictionary where keys are group names and values are lists of parameter data.
        parameter (str): The parameter being analyzed.
        number (int): The radial number being analyzed.
        alpha (float): Significance level (defaults to the module-level ``alpha_level``
            at definition time).
        correction_method (str): Accepted for interface compatibility; not used
            by this function.

    Returns:
        pandas.DataFrame: Single-row DataFrame with the ANOVA result, effect size
                          and assumption-check results.
        pandas.DataFrame or None: Tukey HSD table including confidence intervals, or
                                  None if the ANOVA is not significant or fewer than
                                  two groups are available.
    """
    # Only groups known to group_names take part, in that order.
    present_groups = [group for group in group_names if group in data_dict]
    group_data_list = [data_dict[group] for group in present_groups]

    # Perform ANOVA
    fvalue, pvalue = stats.f_oneway(*group_data_list)
    is_significant = bool(pvalue < alpha)  # Initial significance without correction

    # Eta-squared (SS_between / SS_total), computed over exactly the groups
    # that entered the F-test so the effect size matches the test.
    pooled = np.concatenate([np.asarray(values, dtype=float) for values in group_data_list])
    grand_mean = pooled.mean()
    sst = float(np.sum((pooled - grand_mean) ** 2))  # Total Sum of Squares
    ssb = float(sum(len(values) * (np.mean(values) - grand_mean) ** 2
                    for values in group_data_list))  # Between Sum of Squares
    eta_squared = ssb / sst if sst > 0 else 0.0

    # Perform Assumption Checks
    normality_results = check_normality(data_dict)
    homoscedasticity_result = check_homoscedasticity(data_dict)

    results_df = pd.DataFrame([{
        'F_value': fvalue,
        'p_value': pvalue,
        'parameter': parameter,
        'radial_number': number,
        'significant': is_significant,
        'p_value_corrected': None,       # Placeholder for corrected p-value
        'significant_corrected': False,  # Placeholder for corrected significance
        'eta_squared': eta_squared,
        # Per-group results are stored as the string form of a dict.
        'normality_shapiro_w_statistic': str({group: res['statistic'] for group, res in normality_results.items()}),
        'normality_shapiro_p_value': str({group: res['p_value'] for group, res in normality_results.items()}),
        'homoscedasticity_levene_statistic': homoscedasticity_result['statistic'],
        'homoscedasticity_levene_p_value': homoscedasticity_result['p_value'],
    }])

    tukey_df = None
    if is_significant:  # Proceed to Tukey only if initially significant
        # Flatten the groups into parallel observation/label lists.
        all_data = []
        labels = []
        for group_name in present_groups:
            all_data.extend(data_dict[group_name])
            labels.extend([group_name] * len(data_dict[group_name]))

        if len(set(labels)) > 1:
            tukey_result = pairwise_tukeyhsd(all_data, labels, alpha=alpha)
            tukey_table = tukey_result.summary().data  # first row is the header
            tukey_df = pd.DataFrame(data=tukey_table[1:], columns=tukey_table[0])
            tukey_df = tukey_df.rename(columns={'meandiff': 'mean_difference', 'p-adj': 'p_adj', 'lconf': 'lower_CI', 'uconf': 'upper_CI'})
            tukey_df['radial_number'] = number
            tukey_df['parameter'] = parameter
            tukey_df['reject_corrected'] = tukey_df['reject'].copy()
            tukey_df['p-adj_corrected'] = tukey_df['p_adj'].copy()
        else:
            print(f"Warning: Cannot perform Tukey HSD for radial number {number}, parameter {parameter}. Only one group with data after ANOVA significance.")

    return results_df, tukey_df


# --- Main script ---

all_anova_results_df = pd.DataFrame()
all_tukey_results_df = pd.DataFrame()
anova_p_values_for_correction = []  # List to collect ANOVA p-values for correction

# Per-test frames are gathered in lists and concatenated once after the loop.
anova_frames = []
tukey_frames = []

for number in numbers:
    # Read every group's file once per radial number instead of once per parameter.
    group_frames = {}
    for group_name in group_names:
        # Construct filename
        filename = f"analysis_results_radial_{number}_{group_name}.csv"
        filepath = os.path.join(input_dir, filename)
        try:
            group_frames[group_name] = pd.read_csv(filepath)
        except FileNotFoundError:
            print(f"Warning: File not found: {filepath}")
        except pd.errors.EmptyDataError:
            print(f"Warning: File is empty: {filepath}")

    for parameter in parameters_of_interest:
        group_data = {}
        for group_name in group_names:
            frame = group_frames.get(group_name)
            if frame is None:
                group_data[group_name] = []
            elif parameter not in frame.columns:
                # A missing column is reported, not allowed to abort the whole run.
                print(f"Warning: Column {parameter} not found for group {group_name}, radial number {number}.")
                group_data[group_name] = []
            else:
                # NaN observations would make the F statistic NaN.
                group_data[group_name] = frame[parameter].dropna().tolist()

        # Data Validation
        valid_group_data = {group: data for group, data in group_data.items() if data}

        if not valid_group_data:
            print(f"Warning: No data available for ANOVA for radial number {number}, parameter {parameter}. Skipping ANOVA.")
            continue

        if len(valid_group_data) < len(group_names):
            missing_groups = [group for group in group_names if group not in valid_group_data]
            print(f"Warning: Missing data for groups {missing_groups} for radial number {number}, parameter {parameter}. ANOVA will be performed with available groups: {list(valid_group_data.keys())}")

        if len(valid_group_data) < 2:
            # f_oneway needs at least two samples to compare.
            print(f"Warning: Fewer than two groups with data for radial number {number}, parameter {parameter}. Skipping ANOVA.")
            continue

        if any(len(data) < 2 for data in valid_group_data.values()):
            print(f"Warning: Insufficient data points in at least one group for ANOVA for radial number {number}, parameter {parameter}. Skipping ANOVA.")
            continue

        anova_df, tukey_df = perform_anova_and_posthoc(valid_group_data, parameter, number)

        anova_p_values_for_correction.append(anova_df['p_value'].iloc[0])  # Collect p-value for correction
        anova_frames.append(anova_df)
        if tukey_df is not None:
            tukey_frames.append(tukey_df)

if anova_frames:
    all_anova_results_df = pd.concat(anova_frames, ignore_index=True)
if tukey_frames:
    all_tukey_results_df = pd.concat(tukey_frames, ignore_index=True)

# --- Correct ANOVA p-values and update ANOVA results ---
if anova_p_values_for_correction:
    reject_anova, corrected_anova_p_values, _, _ = multipletests(
        anova_p_values_for_correction, alpha=alpha_level, method=correction_method,
        is_sorted=False, returnsorted=False,
    )
    all_anova_results_df['p_value_corrected'] = corrected_anova_p_values
    all_anova_results_df['significant_corrected'] = reject_anova
else:
    # Nothing to correct when every combination was skipped.
    reject_anova, corrected_anova_p_values = [], []

# --- Correct Tukey p-values and update Tukey results ---
# Each (radial_number, parameter) pair is one family of pairwise comparisons;
# correct every family exactly once. The correction is applied to the
# Tukey-adjusted 'p_adj' values.
if not all_tukey_results_df.empty:
    for _, family in all_tukey_results_df.groupby(['radial_number', 'parameter'], sort=False):
        reject_tukey, corrected_tukey_p_values, _, _ = multipletests(
            family['p_adj'].tolist(), alpha=alpha_level, method=correction_method,
            is_sorted=False, returnsorted=False,
        )
        all_tukey_results_df.loc[family.index, 'p-adj_corrected'] = corrected_tukey_p_values
        all_tukey_results_df.loc[family.index, 'reject_corrected'] = reject_tukey

# --- Save results to CSV ---

anova_output_file = os.path.join(output_dir, "anova_results_corrected_enhanced.csv")
tukey_output_file = os.path.join(output_dir, "tukey_posthoc_results_corrected_enhanced.csv")

all_anova_results_df.to_csv(anova_output_file, index=False)
print(f"ANOVA results saved to: {anova_output_file}")

# NOTE(review): Tukey rows come from ANOVAs significant before correction, so
# the message below may not describe the actual reason the table is empty.
if not all_tukey_results_df.empty:
    all_tukey_results_df.to_csv(tukey_output_file, index=False)
    print(f"Tukey Post-hoc results saved to: {tukey_output_file}")
else:
    print("No significant ANOVA results found after correction, Tukey Post-hoc results file not created.")

print("ANOVA analysis completed with multiple comparison correction and enhancements.")

ANOVA results saved to: /home/aricept094/mydata/ANOVA/results/anova_results_corrected_enhanced.csv
Tukey Post-hoc results saved to: /home/aricept094/mydata/ANOVA/results/tukey_posthoc_results_corrected_enhanced.csv
ANOVA analysis completed with multiple comparison correction and enhancements.
