In [9]:
import pandas as pd
from statsmodels.stats.multitest import multipletests
import os as os
# Cell states to process; each one is loaded from its own GLM results CSV below.
cellstates = ['EEP', 'CEP-1', 'CEP-2']

# Output directory for the corrected tables. Keep the trailing slash: the path
# is concatenated directly with the output file name when the CSV is saved.
folder_name = 'adj_pval_results/'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race; it also raises immediately if the path exists but is not a directory,
# instead of failing later at save time.
os.makedirs(folder_name, exist_ok=True)

# For each cell state, apply Benjamini-Hochberg FDR correction to every LRT
# p-value in that state's results table and save the table with the adjusted
# values appended as new '<column>_corrected' columns.
for cell_state in cellstates:
    # Load the GLM results for this cell state
    file_name = f'241002_GLM_Updated_results_{cell_state}.csv'
    results_df = pd.read_csv(file_name)

    # Every column whose name contains 'lrt_pval' holds the p-values of one comparison
    lrt_pval_columns = [col for col in results_df.columns if 'lrt_pval' in col]

    # 2-D (rows x comparisons) matrix of raw p-values
    pval_matrix = results_df[lrt_pval_columns].to_numpy(dtype=float)

    # Missing p-values must not enter the correction, but their positions have
    # to be remembered: dropping them and then reshaping the shorter vector
    # back to the original shape raises a ValueError as soon as one is NaN.
    valid_mask = ~pd.isna(pval_matrix)

    # Start from a copy so that missing entries stay NaN in the output
    corrected_pvals = pval_matrix.copy()

    if valid_mask.any():
        # All valid p-values of this cell state are pooled into one family,
        # i.e. the correction runs across all comparisons at once.
        _, pvals_corrected, _, _ = multipletests(pval_matrix[valid_mask], method='fdr_bh')
        # Boolean-mask read and write both walk the matrix in row-major order,
        # so each adjusted value lands in the cell its raw value came from.
        corrected_pvals[valid_mask] = pvals_corrected

    # Append one corrected column per original p-value column
    for i, col in enumerate(lrt_pval_columns):
        results_df[f'{col}_corrected'] = corrected_pvals[:, i]

    # Save the augmented table into the output folder
    corrected_file_name = f'{folder_name}241002_GLM_Updated_results_{cell_state}_corrected.csv'
    results_df.to_csv(corrected_file_name, index=False)
    print(f"Corrected results saved for {cell_state} to {corrected_file_name}")

Corrected results saved for EEP to adj_pval_results/241002_GLM_Updated_results_EEP_corrected.csv
Corrected results saved for CEP-1 to adj_pval_results/241002_GLM_Updated_results_CEP-1_corrected.csv
Corrected results saved for CEP-2 to adj_pval_results/241002_GLM_Updated_results_CEP-2_corrected.csv
