In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# ---- Helper ----
def find_results_folder(base_dataset_path, prefix):
    """Return the name of a dataset's results sub-directory, or None.

    A sub-directory matches when its name contains both ``prefix`` and the
    literal text ``'results'``.

    Parameters
    ----------
    base_dataset_path : str
        Directory whose immediate children are searched.
    prefix : str or None
        Text that must appear in the folder name.

    Returns
    -------
    str or None
        The matching folder *name* (not the full path). ``None`` when
        nothing matches, when ``base_dataset_path`` is not an existing
        directory, or when ``prefix`` is ``None``.
    """
    # A missing directory or prefix is reported as "not found" so the caller
    # can handle it the same way as a directory without a match.
    if prefix is None or not os.path.isdir(base_dataset_path):
        return None

    # os.listdir() yields entries in arbitrary order; sorting makes the
    # choice reproducible when more than one folder matches.
    for folder in sorted(os.listdir(base_dataset_path)):
        if prefix not in folder or 'results' not in folder:
            continue
        if os.path.isdir(os.path.join(base_dataset_path, folder)):
            return folder
    return None


In [None]:
# ---- Setup ----
# Root directory holding one sub-directory per dataset, and the directory
# the figures are written to (the notebook's working directory).
base_folder = '/mnt/s3fs/flomics-data/FL-cfRNA-meta'
output_folder = os.getcwd()

# Per-dataset layout description, in processing order.
#   sliced          True when results are split over numbered ".part" folders
#   n_parts         number of ".part" folders (sliced datasets only)
#   prefix          text used to build / find the results folder name
#   subfolder_path  explicit results location relative to base_folder
datasets = {
    'block': dict(sliced=True, n_parts=4, prefix='block'),
    'chalasani_merged': dict(sliced=True, n_parts=5, prefix='chalasani'),
    'chen': dict(sliced=True, n_parts=21, prefix='chen'),
    'flomics_1': dict(sliced=False, subfolder_path='Flomics_1/2025_05_16_11_26_41/n3abcb4a/'),
    'flomics_2': dict(sliced=False, subfolder_path='Flomics_2/2025_06_05_09_32_11/n61e05f8/'),
    'decru': dict(sliced=True, n_parts=6, prefix='decru'),
    'giraldez': dict(sliced=False, prefix='giraldez'),
    'ibarra': dict(sliced=True, n_parts=14, prefix='ibarra'),
    'moufarrej': dict(sliced=True, n_parts=17, prefix='moufarrej'),
    'ngo': dict(sliced=False, prefix='ngo'),
    'reggiardo': dict(sliced=False, prefix='reggiardo'),
    'roskams': dict(sliced=True, n_parts=6, prefix='roskams'),
    'rozowsky': dict(sliced=True, n_parts=9, prefix='rozowsky'),
    'sun': dict(sliced=True, n_parts=3, prefix='sun'),
    'tao': dict(sliced=True, n_parts=12, prefix='tao'),
    # 'taowei': dict(sliced=True, n_parts=4, prefix='taowei'),
    'toden': dict(sliced=True, n_parts=14, prefix='toden'),
    'wang': dict(sliced=True, n_parts=4, prefix='wang'),
    'zhu': dict(sliced=True, n_parts=5, prefix='zhu'),
}

# Line colour (hex) used for each dataset in every plot.
dataset_colors = {
    "block": "#b3b3b3",
    "decru": "#009E73",
    "zhu": "#ffd633",
    "chen": "#997a00",
    "flomics_1": "#9AB9D6",
    "flomics_2": "#144D6B",
    "ngo": "#fa8072",
    "roskams": "#944dff",
    "moufarrej": "#CC79A7",
    "sun": "#D55E00",
    "tao": "#0072B2",
    "toden": "#800099",
    "ibarra": "#800000",
    "chalasani_merged": "#800040",
    "rozowsky": "#006600",
    "taowei": "#B32400",
    "giraldez": "#B1CC71",
    "reggiardo": "#F1085C",
    "wang": "#FE8F42",
}

In [18]:



# ---- Process ----
# File name of the coverage table and its location inside a results folder.
COVERAGE_FILENAME = "mqc_qualimap_gene_coverage_profile_Normalised.txt"
MULTIQC_DATA_SUBPATH = os.path.join("multiqc", "star_salmon", "multiqc_data")


def _read_sample_profiles(file_path):
    """Load one coverage table and return its per-sample rows, or None.

    The file is read as tab-separated text; the ``Sample`` column is dropped
    so that only the coverage columns remain.  A missing, unreadable,
    non-numeric or empty file is reported on stdout and yields ``None`` so
    the caller can simply skip it.
    """
    if not os.path.isfile(file_path):
        print(f"Missing: {file_path}")
        return None
    try:
        table = pd.read_csv(file_path, sep="\t")
        # Casting here surfaces non-numeric content inside the try block,
        # so one bad file is reported instead of aborting the whole run.
        samples = table.drop(columns=["Sample"]).astype(float)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    if samples.empty:
        print(f"No samples in: {file_path}")
        return None
    return samples


all_profiles = {}

for dataset, meta in datasets.items():
    prefix = meta.get('prefix', None)  # absent for entries located via 'subfolder_path'
    print(f"\nProcessing dataset: {dataset}")

    # Work out which multiqc_data directories belong to this dataset.
    if meta['sliced']:
        data_dirs = [
            os.path.join(base_folder, dataset,
                         f"{prefix}_results_{i:02d}.part", MULTIQC_DATA_SUBPATH)
            for i in range(meta.get('n_parts', 1))
        ]
    elif "subfolder_path" in meta:
        data_dirs = [os.path.join(base_folder, meta["subfolder_path"], MULTIQC_DATA_SUBPATH)]
    else:
        base_dataset_path = os.path.join(base_folder, dataset)
        matched_folder = find_results_folder(base_dataset_path, prefix)
        if not matched_folder:
            print(f"Could not find results folder for: {dataset}")
            continue
        data_dirs = [os.path.join(base_dataset_path, matched_folder, MULTIQC_DATA_SUBPATH)]

    sample_frames = []
    for data_dir in data_dirs:
        samples = _read_sample_profiles(os.path.join(data_dir, COVERAGE_FILENAME))
        if samples is not None:
            sample_frames.append(samples)

    if not sample_frames:
        print(f"No data for {dataset}")
        continue

    # Pool the samples of every part before averaging.  Averaging the
    # per-part means instead would give samples in a small part more weight
    # than samples in a large part.
    dataset_mean_df = (
        pd.concat(sample_frames, ignore_index=True)
        .mean(axis=0)
        .rename_axis("position")
        .reset_index(name="coverage")
    )
    # Column headers are read as text; convert so the x axis sorts numerically.
    dataset_mean_df["position"] = dataset_mean_df["position"].astype(int)
    dataset_mean_df = dataset_mean_df.sort_values("position")
    all_profiles[dataset] = dataset_mean_df

    # Per-dataset plot
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(dataset_mean_df["position"], dataset_mean_df["coverage"],
            label=dataset, color=dataset_colors.get(dataset))
    ax.set_title(f"Normalized Transcript Coverage: {dataset}")
    ax.set_xlabel("Transcript position (%)")
    ax.set_ylabel("Normalized coverage")
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, f"{dataset}_multiqc_coverage.png"))
    plt.close(fig)  # release the figure; one is created per dataset
    print(f"Saved: {dataset}_multiqc_coverage.png")





Processing dataset: block
Saved: block_multiqc_coverage.png

Processing dataset: chalasani_merged
Saved: chalasani_merged_multiqc_coverage.png

Processing dataset: chen
Saved: chen_multiqc_coverage.png

Processing dataset: flomics_1
Saved: flomics_1_multiqc_coverage.png

Processing dataset: flomics_2
Saved: flomics_2_multiqc_coverage.png

Processing dataset: decru
Saved: decru_multiqc_coverage.png

Processing dataset: giraldez
Saved: giraldez_multiqc_coverage.png

Processing dataset: ibarra
Saved: ibarra_multiqc_coverage.png

Processing dataset: moufarrej
Saved: moufarrej_multiqc_coverage.png

Processing dataset: ngo
Saved: ngo_multiqc_coverage.png

Processing dataset: reggiardo
Saved: reggiardo_multiqc_coverage.png

Processing dataset: roskams
Saved: roskams_multiqc_coverage.png

Processing dataset: rozowsky
Saved: rozowsky_multiqc_coverage.png

Processing dataset: sun
Saved: sun_multiqc_coverage.png

Processing dataset: tao
Saved: tao_multiqc_coverage.png

Processing dataset: toden


In [None]:
# Display name shown in the legend for each dataset key.
dataset_labels = dict(
    block="Block",
    chalasani_merged="Chalasani",
    chen="Chen",
    flomics_1="Flomics 1",
    flomics_2="Flomics 2",
    decru="Decruyenaere",
    giraldez="Giráldez",
    ibarra="Ibarra",
    moufarrej="Moufarrej",
    ngo="Ngo",
    reggiardo="Reggiardo",
    roskams="Roskams-Hieter",
    rozowsky="Rozowsky",
    sun="Sun",
    tao="Tao",
    taowei="Wei",
    toden="Toden",
    wang="Wang",
    zhu="Zhu",
)

# ---- Combined Plot ----
# One line per dataset on a shared axis; the legend sits outside the axes
# on the right.
fig, ax = plt.subplots(figsize=(12, 6))
for name, profile in all_profiles.items():
    ax.plot(
        profile["position"],
        profile["coverage"],
        label=dataset_labels.get(name, name),  # fall back to the raw key
        color=dataset_colors.get(name),
    )
ax.set_xlabel("Transcript position (%)")
ax.set_ylabel("Normalized coverage")
ax.set_title("Average Gene Coverage per Dataset (MultiQC normalized)")
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()

combined_path = os.path.join(output_folder, "multiqc_style_normalized_coverage_combined.png")
fig.savefig(combined_path, dpi=600)
plt.close(fig)

print("All per-dataset and combined plots saved.")
