In [1]:
import os
import sys
import pandas as pd

In [2]:
# Configure how pandas renders DataFrames for the rest of the notebook.
display_options = {
    'display.max_columns': 50,       # show up to 50 columns before switching to a truncated view
    'display.width': 200,            # fixed display width of 200 characters (not auto-detected)
    'display.max_colwidth': None,    # None = never truncate the text inside a column
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Set the paths
# The project root defaults to the original local OneDrive location, but can be
# overridden by setting the AGEING_FLIES_DIR environment variable so the
# notebook also runs on machines where the project lives somewhere else.
working_dir = os.environ.get(
    'AGEING_FLIES_DIR',
    'D:/OneDrive/dhaynessimmons/OneDrive - UMC Utrecht/Documenten/projects/ageing_flies',
)
# Folder that holds the per-cell-type marker CSV files read further down.
input_directory = os.path.join(working_dir, 'Results/age_scdgea/ct_specific_MAST')

print("Contents of the directory:")
# os.listdir returns entries in arbitrary order; sort them so the listing is
# stable from run to run.
for filename in sorted(os.listdir(input_directory)):
    print(filename)

Contents of the directory:
markers_adult alary muscle.csv
markers_adult fat body_body.csv
markers_adult glial cell.csv
markers_adult hindgut.csv
markers_adult oenocyte.csv
markers_adult peripheral nervous system.csv
markers_adult tracheal cell.csv
markers_adult ventral nervous system.csv
markers_cell body glial cell.csv
markers_crop.csv
markers_ejaculatory bulb.csv
markers_enteroblast.csv
markers_eo support cell.csv
markers_epithelial cell_body.csv
markers_female reproductive system.csv
markers_follicle cell.csv
markers_germline cell.csv
markers_gustatory receptor neuron.csv
markers_hemocyte_body.csv
markers_indirect flight muscle.csv
markers_intestinal stem cell.csv
markers_male accessory gland main cell.csv
markers_mechanosensory neuron of haltere.csv
markers_muscle cell.csv
markers_oviduct.csv
markers_perineurial glial sheath.csv
markers_pheromone-sensing neuron.csv
markers_scolopidial neuron.csv
markers_seminal vesicle & testis epithelia.csv
markers_subperineurial glial cell_body.c

In [4]:
# Gene symbols that are matched against the 'gene' column of each marker table.
# One symbol per line so additions and removals show up as single-line diffs.
genes_of_interest = [
    "Su(var)205",
    "Su(var)3-9",
    "G9a",
    "HP1b",
    "HP1c",
    "HP4",
    "HP5",
    "HP6",
    "ADD1",
    "Su(var)2-HP2",
    "Su(var)3-7",
    "Lam",
    "LamC",
    "LBR",
    "Kdm4A",
    "Kdm4B",
    "His2Av",
    "His3.3A",
    "His3.3B",
]

In [7]:
# For every marker CSV in `input_directory`, keep the rows that belong to a
# gene of interest and pass both cut-offs below, log what was found, and write
# all retained rows to a single combined CSV.
PADJ_CUTOFF = 0.1       # keep rows whose p_val_adj is strictly below this value
MIN_ABS_LOG2FC = 1      # keep rows whose |avg_log2FC| is at least this value

significant_subsets = []  # one filtered frame per file that produced hits
marker_columns = None     # columns of the first table read; used as header if nothing passes

# os.listdir returns names in arbitrary order; sorting keeps the printed log
# and the row order of the combined CSV reproducible between runs.
for file in sorted(os.listdir(input_directory)):
    if not file.endswith('.csv'):
        continue

    # read the CSV file into a DataFrame
    df = pd.read_csv(os.path.join(input_directory, file))
    if marker_columns is None:
        marker_columns = df.columns
    print(f"\n\nProcessing file: {file}\n")

    subset = df[
        df['gene'].isin(genes_of_interest)
        & (df['p_val_adj'] < PADJ_CUTOFF)
        & (df['avg_log2FC'].abs() >= MIN_ABS_LOG2FC)
    ]

    if subset.empty:
        print(f"No significant genes found in {file} with the specified criteria.")
    else:
        print(f"Found {len(subset)} significant genes in {file} with the specified criteria.")
        print(subset)
        # Collect now and concatenate once after the loop; calling pd.concat on
        # a growing frame inside the loop re-copies all earlier rows each time.
        significant_subsets.append(subset)

if significant_subsets:
    inter_df = pd.concat(significant_subsets, ignore_index=True)
else:
    # Nothing passed the filters: keep the column layout so the saved CSV has
    # a header row and can still be read back with pd.read_csv.
    inter_df = pd.DataFrame(columns=marker_columns)

# Save the combined DataFrame to a CSV file
output_file = os.path.join(working_dir, 'Results/age_scdgea/MAST_combined_significant_genes.csv')
inter_df.to_csv(output_file, index=False)




Processing file: markers_adult alary muscle.csv

Found 1 significant genes in markers_adult alary muscle.csv with the specified criteria.
        p_val  avg_log2FC  pct.1  pct.2  p_val_adj           cell_type comparison    gene
5994  0.00489    1.139987  0.067  0.032   0.050876  adult alary muscle    5 vs 50  His2Av


Processing file: markers_adult fat body_body.csv

Found 7 significant genes in markers_adult fat body_body.csv with the specified criteria.
              p_val  avg_log2FC  pct.1  pct.2     p_val_adj            cell_type comparison    gene
2036   9.830154e-46    1.297497  0.143  0.038  3.627071e-45  adult fat body_body    5 vs 30  His2Av
4766   3.639603e-13    3.368499  0.034  0.002  5.738464e-13  adult fat body_body    5 vs 30     HP4
6063   2.205651e-07    1.066192  0.014  0.005  2.733785e-07  adult fat body_body    5 vs 30    HP1b
6121   3.411089e-07    1.301922  0.030  0.008  4.187805e-07  adult fat body_body    5 vs 30     LBR
6915   3.202200e-04    2.367668  0.017