In [14]:
import os
import pybedtools
from matplotlib_venn import venn2
import matplotlib.pyplot as plt

In [15]:
# Define paths
# Every location is derived from one base directory, so moving the project
# to another machine or account only requires editing `data_root`.
data_root = "/home/z5628505/Data/JoycData"  # Change this to your project data directory
bed_folder = os.path.join(data_root, "NarrowPeaks")  # Folder containing the sample .narrowPeak files
# Folder containing the reference .narrowPeak files (a folder, not a single file).
# NOTE(review): the comparison loop visible in this notebook lists its references
# from `bed_folder`; confirm whether this variable is still needed.
reference_peak_folder = os.path.join(data_root, "NarrowPeaks")
results_folder = os.path.join(data_root, "Results")  # Directory where the results (Venn diagrams) are saved

# Create results folder if it doesn't exist (no error if it is already there)
os.makedirs(results_folder, exist_ok=True)

In [None]:
# Pairwise peak-overlap Venn diagrams.
# Every .narrowPeak file in `bed_folder` is used once as the reference and is
# compared against every other .narrowPeak file in the same folder.  One SVG
# per (reference, sample) pair is written to `results_folder`.

# List the folder once; sorting makes the processing order reproducible
# (os.listdir returns entries in arbitrary order).
reference_peak_files = sorted(f for f in os.listdir(bed_folder) if f.endswith(".narrowPeak"))

for rfile in reference_peak_files:
    # Load reference peak file
    reference_peak = pybedtools.BedTool(os.path.join(bed_folder, rfile))
    # To compare against a single reference only, set `rfile` to that file
    # name here and comment out the outer for loop.
    reference_name = "_".join(rfile.split("_")[:2])  # First two "_"-separated prefixes of the reference file name

    # Sample peak files: everything except the current reference
    bed_files = [f for f in reference_peak_files if f != rfile]

    # Iterate over each sample file and compare it to the reference
    for file in bed_files:
        sample_file = os.path.join(bed_folder, file)  # Full path of the sample file
        peaks_B = pybedtools.BedTool(sample_file)  # Load sample peak file

        sample_name = "_".join(file.split("_")[:2])  # First two "_"-separated prefixes of the sample file name

        # Classify whole peaks by overlap.
        # `subtract` is deliberately NOT used here: it removes only the
        # overlapping bases and keeps the trimmed remainders, so a partially
        # overlapping peak would be counted both as "shared" and as "unique".
        # `intersect -u` / `intersect -v` report each original peak exactly once.
        overlap_AB = reference_peak.intersect(peaks_B, u=True)  # Reference peaks with >= 1 overlap in the sample
        only_A = reference_peak.intersect(peaks_B, v=True)      # Reference peaks with no overlap in the sample
        only_B = peaks_B.intersect(reference_peak, v=True)      # Sample peaks with no overlap in the reference

        # Get counts (feature counts; no need to build a DataFrame).
        # The shared region is counted in reference peaks, so
        # count_A + count_AB equals the total number of reference peaks.
        count_A = only_A.count()
        count_B = only_B.count()
        count_AB = overlap_AB.count()

        # Create Venn diagram; venn2 expects (only A, only B, A and B)
        fig = plt.figure(figsize=(5, 5))
        venn2(subsets=(count_A, count_B, count_AB), set_labels=(reference_name, sample_name))
        plt.title(f"ChIP-seq Peak Overlap: {reference_name} vs {sample_name}")

        # Save the plot under a name that encodes both sides of the comparison
        output_path = os.path.join(results_folder, f"Venn_{reference_name}_{sample_name}.svg")
        fig.savefig(output_path, format='svg')
        plt.close(fig)  # Close the figure to prevent memory build-up across iterations

        print(f"Saved: {output_path}")

Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_PU1_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_CTCF_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_GATA2_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_LYL1_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_ERG_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_STAG2_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_RUNX1_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_FLI1_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_LMO2_LMPP_TAL1_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_PU1_LMPP_LMO2_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_PU1_LMPP_CTCF_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_PU1_LMPP_GATA2_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_PU1_LMPP_LYL1_LMPP.svg
Saved: /home/z5628505/Data/JoycData/Results/Venn_PU1_