In [21]:
import pandas as pd

# Load significant regions data
significant_regions_file = 'global_bycluster_all_regions.csv'
significant_regions_df = pd.read_csv(significant_regions_file)

# Load gene annotation data
gene_meta_file =  '../genebody/GeneMetadata_withGeneNameEnsmbl.csv'
gene_meta = pd.read_csv(gene_meta_file, index_col='gene_id')

# Define significance threshold
p_value_threshold = 0.05

# Filter for significant p-values
significant_df = significant_regions_df[significant_regions_df['p_value'] < p_value_threshold]

# List of contexts and clusters to create filtered files
contexts = ['mCH', 'mCG']
clusters = ['c0', 'c1', 'c2', 'c3']

# Generate CSV files based on combinations of context and cluster
for context in contexts:
    for cluster in clusters:
        # Filter data by context and cluster
        filtered_data = significant_df[(significant_df['context'] == context) & (significant_df['cluster'] == cluster)]
        
        # Define output file name
        output_filename = f'significant_{context}_{cluster}.csv'
        
        # Save to CSV if there are significant regions for this combination
        if not filtered_data.empty:
            filtered_data.to_csv(output_filename, index=False)
            print(f"Saved {output_filename}")
        else:
            print(f"No significant regions found for {context} in {cluster}")

# Function to find overlapping genes in significant regions
def find_genes_in_significant_regions(input_file):
    significant_data = pd.read_csv(input_file)
    
    overlapping_genes = []

    for _, region in significant_data.iterrows():
        chrom = region['chromosome']
        region_start = region['start']
        region_end = region_start + 100000  # Assuming 100kb region size for chrom100kb

        # Find overlapping genes
        overlapping = gene_meta[
            (gene_meta['chrom'] == chrom) &
            (gene_meta['start'] <= region_end) &
            (gene_meta['end'] >= region_start)
        ]
        
        if not overlapping.empty:
            for _, gene in overlapping.iterrows():
                overlapping_genes.append({
                    'chromosome': chrom,
                    'region_start': region_start,
                    'region_end': region_end,
                    'gene_id': gene.name,  # Gene ID as index
                    'gene_name': gene['gene_name'],
                    'gene_start': gene['start'],
                    'gene_end': gene['end'],
                    'methylation_diff': region['methylation_diff'],
                    'p_value': region['p_value'],
                    'q_value': region['q_value'],
                    'direction': region['direction']
                })

    overlapping_genes_df = pd.DataFrame(overlapping_genes)
    output_filename = f'genes_in_{input_file}'
    overlapping_genes_df.to_csv(output_filename, index=False)
    print(f"Saved overlapping genes to {output_filename}")

# Apply the function to each of the filtered files generated
for context in contexts:
    for cluster in clusters:
        input_file = f'significant_{context}_{cluster}.csv'
        try:
            find_genes_in_significant_regions(input_file)
        except FileNotFoundError:
            print(f"{input_file} not found, skipping.")


Saved significant_mCH_c0.csv
Saved significant_mCH_c1.csv
Saved significant_mCH_c2.csv
Saved significant_mCH_c3.csv
Saved significant_mCG_c0.csv
Saved significant_mCG_c1.csv
Saved significant_mCG_c2.csv
Saved significant_mCG_c3.csv
Saved overlapping genes to genes_in_significant_mCH_c0.csv
Saved overlapping genes to genes_in_significant_mCH_c1.csv
Saved overlapping genes to genes_in_significant_mCH_c2.csv
Saved overlapping genes to genes_in_significant_mCH_c3.csv
Saved overlapping genes to genes_in_significant_mCG_c0.csv
Saved overlapping genes to genes_in_significant_mCG_c1.csv
Saved overlapping genes to genes_in_significant_mCG_c2.csv
Saved overlapping genes to genes_in_significant_mCG_c3.csv


In [32]:
import pandas as pd

# Define file paths and clusters/contexts
clusters = ['c0', 'c1', 'c2', 'c3']
contexts = ['mCH', 'mCG']

# Dictionary to store the file paths for easier access
dmg_files = {
    'mCH': {f'c{i}': f"../bygroup/bygroup-cph/DMG_mCH_ByGroup_sig_c{i}.csv" for i in range(4)},
    'mCG': {f'c{i}': f"../bygroup/bygroup-bycpg/DMG_mCG_ByGroup_sig_c{i}.csv" for i in range(4)}
}


# Define significant genes files for mCH and mCG
significant_genes_files = {
    'mCH': {f'c{i}': f"genes_in_significant_mCH_c{i}.csv" for i in range(4)},
    'mCG': {f'c{i}': f"genes_in_significant_mCG_c{i}.csv" for i in range(4)}
}

# Loop through each context and cluster, find overlaps, merge data, and store the results
for context in contexts:
    for cluster in clusters:
        # Load the significant genes file for the specific context and cluster
        significant_genes_df = pd.read_csv(significant_genes_files[context][cluster])
        
        # Load the DMG file for the specific context and cluster
        dmg_df = pd.read_csv(dmg_files[context][cluster])
        
        # Merge on gene_id (assuming this is the column with gene IDs in both files)
        merged_df = pd.merge(significant_genes_df, dmg_df, left_on='gene_id', right_on='genebody', how='inner')
        
        # Save the merged overlap data to a new CSV file
        output_filename = f"overlap_genes_{context}_{cluster}.csv"
        merged_df.to_csv(output_filename, index=False)
        
        print(f"Overlap file created: {output_filename} with {merged_df.shape[0]} overlapping genes")

# Confirm process completion
print("Overlap identification and file generation complete.")


Overlap file created: overlap_genes_mCH_c0.csv with 47 overlapping genes
Overlap file created: overlap_genes_mCH_c1.csv with 353 overlapping genes
Overlap file created: overlap_genes_mCH_c2.csv with 78 overlapping genes
Overlap file created: overlap_genes_mCH_c3.csv with 78 overlapping genes
Overlap file created: overlap_genes_mCG_c0.csv with 247 overlapping genes
Overlap file created: overlap_genes_mCG_c1.csv with 141 overlapping genes
Overlap file created: overlap_genes_mCG_c2.csv with 137 overlapping genes
Overlap file created: overlap_genes_mCG_c3.csv with 61 overlapping genes
Overlap identification and file generation complete.
