In [1]:
import numpy as np
import os
import pandas as pd
import tqdm.auto as tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset label and the CM-mapping methods whose outputs are processed below.
dataset = "test_data"
methods = ["vcmtools", "clomics", "phm"]

# Project root; both input locations are sub-folders of it.
core_path = "/data/pushkare/Chromatin_modules"
acm_scores_path, cm_peak_path = (
    os.path.join(core_path, *subdirs)
    for subdirs in (("1.mapped_CMs", "aCM_scores"), ("2.peaks_in_CMs", "peak_files"))
)

In [3]:
# For every method: load the aCM score matrix and the merged peak count
# matrix, orient the sign of each CM's score row by a majority vote over the
# CM's peaks, and write the sign-corrected matrix next to the input matrix.
for method in methods:
    method_dir = os.path.join(acm_scores_path, method, dataset)
    acm_matrix_dir = os.path.join(method_dir, "aCM_matrix")

    # NOTE(review): the .loc row lookups further down address peak_counts by
    # peak ID and acm_scores by CM ID, so both files must load with those IDs
    # as the row index (e.g. a header row with one fewer field than the data
    # rows) -- confirm against the input files.
    acm_scores = pd.read_csv(
        os.path.join(acm_matrix_dir, "aCM_matrix.bed"),
        sep="\t",
    )
    peak_counts = pd.read_csv(
        os.path.join(
            method_dir,
            "_".join([method, dataset, "merged_count_matrix.bed"]),
        ),
        sep="\t",
    )

    # Restrict both matrices to their shared columns. The order of
    # acm_scores.columns is kept so the written file has the same column
    # order on every run; a set intersection would order them arbitrarily.
    common_columns = [
        column for column in acm_scores.columns if column in peak_counts.columns
    ]
    acm_scores = acm_scores.loc[:, common_columns]
    peak_counts = peak_counts.loc[:, common_columns]

    # Peak-to-CM assignment: only the 4th and 5th columns of the header-less
    # file are read. Note the file name is <dataset>_<method>_..., the reverse
    # of the count-matrix naming above.
    peaks_in_cms = pd.read_csv(
        os.path.join(
            cm_peak_path,
            "_".join([dataset, method, "all_peaks.bed"]),
        ),
        sep="\t",
        header=None,
        usecols=[3, 4],
        names=["pid", "cm_id"],
    )

    # Map each CM ID to the list of its peak IDs.
    df_pids_per_cm = peaks_in_cms.groupby("cm_id").agg(list)
    peaks_in_cms_dict = df_pids_per_cm["pid"].to_dict()

    acm_df_lst = []
    for cm_id, cm_peaks in peaks_in_cms_dict.items():
        cm_peak_counts = peak_counts.loc[cm_peaks, :]
        cm_scores = acm_scores.loc[cm_id, :]

        # Columns holding the lowest and highest non-missing score of this CM.
        valid_scores = cm_scores.dropna()
        min_sample = valid_scores.idxmin()
        max_sample = valid_scores.idxmax()

        # Number of the CM's peaks whose count is higher in the max-score
        # column than in the min-score column.
        n_greater_peaks = (
            cm_peak_counts.loc[:, max_sample] > cm_peak_counts.loc[:, min_sample]
        ).sum()

        # Keep the sign only when strictly more than half of the peaks agree
        # with the score ordering; otherwise flip the whole score row.
        sign = -1 if n_greater_peaks < len(cm_peaks) / 2 + 0.5 else 1
        acm_df_lst.append((cm_scores * sign).to_frame().T)

    # pd.concat would fail on an empty list with an uninformative message;
    # raise the same exception type with the context needed to debug it.
    if not acm_df_lst:
        raise ValueError(
            f"No CMs found for method {method!r} and dataset {dataset!r}; "
            "nothing to sign-correct"
        )

    acm_df_sign_corrected = pd.concat(acm_df_lst, axis=0)
    acm_df_sign_corrected.to_csv(
        os.path.join(acm_matrix_dir, "sign_corrected_aCM_matrix.bed"),
        sep="\t",
        index=True,
        header=True,
    )