# 1. TF - Binding

# 1.1 Setup

# 1.2 Lambert

In [None]:
# Remote location of the human TF name list (TF_names_v_1.01.txt).
lambert_url = "http://humantfs.ccbr.utoronto.ca/download/v_1.01/TF_names_v_1.01.txt"
# Local file the list is downloaded to; later cells pass this same path as their TF-list input.
lambert_out = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/lambert.csv"

In [None]:
# Shell escape: IPython substitutes {lambert_url} and {lambert_out} before running wget.
!wget --no-check-certificate --no-verbose '{lambert_url}' -O {lambert_out}

# 1.3 ChipAtlas

In [None]:
import subprocess

# --- Configuration ---
params_mta = "https://chip-atlas.dbcls.jp/data/metadata/experimentList.tab"  # experiment list URL ({params.mta})
output_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/chipatlas_meta.tsv"  # download target ({output})
input_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/lambert.csv"  # TF list ({input})

# Step 1: fetch the experiment list. check=True aborts the cell if wget fails.
print("Downloading file...")
download_cmd = ["wget", "--no-verbose", params_mta, "-O", output_file]
subprocess.run(download_cmd, check=True)

# Step 2: hand the downloaded table and the TF list to the metadata script.
# Its exit status is not checked; its output is only captured for display.
print("Processing downloaded data...")
process_cmd = [
    "python",
    "/home/andrem/GRN-project/resources/greta/tfb/chipatlas_meta.py",
    output_file,
    input_file,
]
result = subprocess.run(process_cmd, capture_output=True, text=True)

# Step 3: echo everything the script wrote to stdout and stderr.
for stream_label, stream_text in (("STDOUT:", result.stdout), ("STDERR:", result.stderr)):
    print(stream_label)
    print(stream_text)


In [None]:
# Search terms (cell types, tissues, AML markers and AML cell lines) used to
# select ChIP-Atlas experiments. The filtering cell tests each term as a
# substring of the text in column 2 of the metadata table.
keywords = [
    "B-Cell",
    "B-cell (CD19+)",
    "Bone Marrow Cells",
    "Bone marrow mononuclear cells",
    "Bone marrow nuclear cells",
    "CD20+ B cells",
    "CD4+ T cells",
    "CD8+ T cells",
    "Dendritic Cells",
    "Macrophages",
    "Mast Cells",
    "Memory B cells",
    "Memory T cells",
    "Monocytes",
    "Monocytes-CD14+",
    "Naive B cells",
    "Natural Killer T-Cells",
    "Natural killer cells",
    "T cells",
    "Th1 Cells",
    "Th17 Cells",
    "Th2 Cells",
    "Treg",
    "bone",
    "blood",
    "hematopoiesis",
    "haematopoiesis",
    "hematopoietic",
    "haematopoietic",
    "bone-marrow",
    "leukemia",
    "acute myeloid leukemia",
    "AML",
    "myeloid",
    "monocytic",
    "promyelocytic",
    "granulocytic",
    "erythroid",
    "megakaryocytic",
    "hematopoietic stem cell",
    "HSC",
    "LSC",
    "progenitor cell",
    "CD34+",
    "CD33+",
    "CD13+",
    "FLT3",
    "NPM1",
    "KIT",
    "RUNX1",
    "CEBPA",
    "blast cell",
    "immature myeloid cell",
    "AML cell line",
    "MV4-11",
    "MOLM-13",
    "MOLM-14",
    "THP-1",
    "HL-60",
    "KG-1",
    "OCI-AML2",
    "OCI-AML3",
    "NB4",
    "Kasumi-1",
    "PL-21",
    "SKM-1",
    "U937"
]


In [None]:
import pandas as pd

chipatlas_meta = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/chipatlas_meta.tsv"
output = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/chipatlas_meta_filtered.tsv"

# The metadata table has no header row, so columns are addressed by position.
df_hg38 = pd.read_csv(chipatlas_meta, sep='\t', header=None)

# Filter by keywords (case-insensitive substring match on column 2).
# BOTH sides must be lower-cased: comparing the raw keywords against
# lower-cased text would silently drop every keyword that contains an
# upper-case letter (e.g. "AML", "THP-1", "B-Cell").
keywords_lower = [kw.lower() for kw in keywords]
column_text = df_hg38[2].astype(str).str.lower()
keyword_mask = column_text.apply(
    lambda text: any(kw in text for kw in keywords_lower)
).astype(bool)

df_filtered = df_hg38[keyword_mask]

print(f"Total records after filtering: {len(df_filtered)}")

# Written without header/index so the file keeps the layout of the input table.
df_filtered.to_csv(output, sep='\t', index=False, header=None)
print(f"Filtered metadata saved to: {output}")


In [None]:
import os
import subprocess

# Base directory of the TF-binding benchmark. It is defined here explicitly
# because no earlier cell sets it; the aggregation cell reads the per-TF files
# from <output_dir>/chipatlas_raw.
output_dir = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test"
meta_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/chipatlas_meta_filtered.tsv"
max_psize = 750

# Column 1 of the filtered metadata (df_filtered comes from the filtering
# cell) is used as the TF name in both the URL and the output file name.
unique_tfs = df_filtered[1].dropna().unique()

raw_dir = os.path.join(output_dir, 'chipatlas_raw')
os.makedirs(raw_dir, exist_ok=True)

# Loop over each transcription factor
for tf in unique_tfs:
    url = f"https://chip-atlas.dbcls.jp/data/hg38/assembled/Oth.ALL.50.{tf}.AllCell.bed"
    output = os.path.join(raw_dir, f"{tf}.bed")

    # Arguments are passed as a list (no shell), so a TF name containing
    # quotes, spaces or other shell metacharacters cannot alter the command.
    command = [
        "bash",
        "/home/andrem/GRN-project/workflow/bash_scripts/chipatlas.sh",
        url,
        output,
        meta_file,
        str(max_psize),
    ]

    print(f"Running for {output}...")
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        # Best effort: report the failing TF and continue with the next one.
        print(f"Error processing {output}: {e}")


In [None]:
import os
import subprocess

bed_dir = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/chipatlas_raw"
output = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/chipatlas.bed"

# Run the aggregation script on the directory of per-TF BED files.
# The arguments are passed as a list, so the script receives the same two
# arguments as before without any shell quoting being involved.
command = [
    "bash",
    "/home/andrem/GRN-project/workflow/bash_scripts/aggreagate_chipatlas.sh",
    bed_dir,
    output,
]

try:
    subprocess.run(command, check=True)
except subprocess.CalledProcessError as e:
    # A non-zero exit status is reported but does not stop the notebook.
    print(f"Error processing {output}: {e}")

# 2. ReMap2022

# 2.1 BTO

In [None]:
import subprocess

# The URL is stored bare; quoting happens once, in the command below.
# (It used to carry its own single quotes and was then quoted again, which
# only worked because the URL contains no shell metacharacters.)
params_url = "https://service.tib.eu/ts4tib/api/ontologies/bto/download"  # Replace with {params.url}
output_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/bto.tsv"                      # Replace with {output}

# Stream the download straight into bto.py, which is given the output path.
command = f"wget --no-verbose '{params_url}' -O - | python /home/andrem/GRN-project/resources/greta/tfb/bto.py '{output_file}'"
result = subprocess.run(command, shell=True, capture_output=True, text=True)

print("STDOUT:")
print(result.stdout)
print("STDERR:")
print(result.stderr)

In [None]:
import subprocess

mta_url = "https://remap.univ-amu.fr/storage/remap2022/biotypes/remap2022_hsap_biotypes.xlsx"  # biotype table URL ({params.mta})
input_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/bto.tsv"  # first script argument ({input})
output_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/remap2022meta.tsv"  # second script argument ({output})

# Build the pipeline in two halves: wget writes the download to stdout and
# remap2022_meta.py reads it from stdin.
download_part = f"wget --no-verbose '{mta_url}' -O -"
process_part = (
    "python /home/andrem/GRN-project/resources/greta/tfb/remap2022_meta.py "
    f"'{input_file}' '{output_file}'"
)
command = f"{download_part} | {process_part}"
result = subprocess.run(command, shell=True, capture_output=True, text=True)

# Echo the captured output of the pipeline.
for stream_label, stream_text in (("STDOUT:", result.stdout), ("STDERR:", result.stderr)):
    print(stream_label)
    print(stream_text)

In [None]:
import os
import subprocess

# Replace these with actual values:
params_url = "https://remap.univ-amu.fr/storage/remap2022/hg38/MACS2/remap2022_nr_macs2_hg38_v1_0.bed.gz"  # Replace with {params.url}
output_dir = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/remap2022_raw"                # Replace with {output}
output_tmp = f"{output_dir}.tmp"  # compressed download, stored next to output_dir
input_tfs = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/lambert.csv"               # Replace with {input.tfs}
input_mta = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/remap2022meta.tsv"               # Replace with {input.mta}
max_psize = "750"                             # Replace with {params.max_psize}

# 1. Create the output directory
os.makedirs(output_dir, exist_ok=True)

# The temporary download is removed in `finally`, so a failed or interrupted
# step does not leave a partial .tmp file behind.
try:
    # 2. Download the file (check=True aborts the cell if wget fails)
    print("Downloading file...")
    subprocess.run(
        ["wget", "--no-verbose", params_url, "-O", output_tmp],
        check=True
    )

    # 3. Unzip and pipe to the Python script
    print("Processing data...")
    command = (
        f"zcat {output_tmp} | "
        f"python /home/andrem/GRN-project/resources/greta/tfb/remap2022_raw.py "
        f"{input_tfs} {input_mta} {max_psize} {output_dir}"
    )
    result = subprocess.run(command, shell=True, capture_output=True, text=True)

    print("STDOUT:")
    print(result.stdout)
    print("STDERR:")
    print(result.stderr)
finally:
    # 4. Remove the temporary file (it may not exist if wget never started)
    print("Cleaning up...")
    if os.path.exists(output_tmp):
        os.remove(output_tmp)


In [None]:
import os
import subprocess
import glob

from concurrent.futures import ThreadPoolExecutor

# Define variables
threads = 32  # maximum number of bedtools processes run at the same time
raw_dir = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/remap2022_raw"
output_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/remap2022.bed"


def _merge_bed(bed_file):
    """Run `bedtools merge` on one file, writing the result to `<bed_file>.tmp`.

    Returns (bed_file, returncode, stderr) so the caller can report failures.
    """
    tmp_file = f"{bed_file}.tmp"
    cmd = [
        "bedtools", "merge",
        "-i", bed_file,
        "-c", "4,5",
        "-o", "distinct,distinct"
    ]
    with open(tmp_file, 'w') as out_f:
        result = subprocess.run(cmd, stdout=out_f, stderr=subprocess.PIPE, text=True)
    return bed_file, result.returncode, result.stderr


# Step 1: List all .bed files
bed_files = glob.glob(os.path.join(raw_dir, "*.bed"))

try:
    # Step 2: Merge each .bed file in parallel. Threads are enough here:
    # each worker only waits on its own bedtools subprocess.
    print("Merging .bed files...")
    with ThreadPoolExecutor(max_workers=threads) as pool:
        for bed_file, returncode, stderr in pool.map(_merge_bed, bed_files):
            if returncode != 0:
                print(f"Error merging {bed_file}: {stderr}")

    # Step 3: Concatenate all merged files
    print("Aggregating merged files...")
    # The shell redirect below fails if the target directory does not exist.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    cat_cmd = f"cat {raw_dir}/*.bed.tmp | python /home/andrem/GRN-project/resources/greta/tfb/aggregate.py > {output_file}"
    result = subprocess.run(cat_cmd, shell=True, capture_output=True, text=True)
    print("STDOUT:")
    print(result.stdout)
    print("STDERR:")
    print(result.stderr)
finally:
    # Step 4: Remove all .tmp files, even if a step above raised
    print("Cleaning up temporary files...")
    for tmp_file in glob.glob(f"{raw_dir}/*.bed.tmp"):
        os.remove(tmp_file)

print(f"Done! Aggregated file: {output_file}")


# 3. Unibind

In [None]:
import os
import subprocess

# === Define variables ===
input_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/lambert.csv"  # Replace with actual file path
output_dir = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/unibind_raw"      # Same as {output}
params_url = "https://unibind.uio.no/static/data/20220914/bulk_Robust/Homo_sapiens/hg38_compressed_TFBSs.bed.gz"  # Replace with actual URL
max_psize = "750"                            # Replace with actual value

# 1. Create output directory
os.makedirs(output_dir, exist_ok=True)

# The compressed download lives inside output_dir. It is removed in
# `finally`, so a failed or interrupted step does not leave it behind.
output_tmp = os.path.join(output_dir, "download.tmp")
try:
    # 2. Download the compressed file (check=True aborts the cell if wget fails)
    print(f"Downloading {params_url} ...")
    subprocess.run(
        ["wget", "--no-verbose", params_url, "-O", output_tmp],
        check=True
    )

    # 3. Uncompress and pipe to Python script
    print("Processing unibind data ...")
    command = (
        f"zcat {output_tmp} | "
        f"python /home/andrem/GRN-project/resources/greta/tfb/unibind_raw.py "
        f"{input_file} "
        f"{max_psize} "
        f"{output_dir}"
    )
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    print("STDOUT:")
    print(result.stdout)
    print("STDERR:")
    print(result.stderr)
finally:
    # 4. Delete the temporary file (it may not exist if wget never started)
    print("Cleaning up temporary file ...")
    if os.path.exists(output_tmp):
        os.remove(output_tmp)

print(f"Done! Unibind data processed in {output_dir}")


In [None]:
import os
import subprocess
import glob

from concurrent.futures import ThreadPoolExecutor

# === Define variables ===
threads = 32  # maximum number of bedtools processes run at the same time
raw_dir = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/unibind_raw"
output_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/unibind.bed"


def _merge_bed(bed_file):
    """Run `bedtools merge` on one file, writing the result to `<bed_file>.tmp`.

    Returns (bed_file, returncode, stderr) so the caller can report failures.
    """
    tmp_file = f"{bed_file}.tmp"
    cmd = [
        "bedtools", "merge",
        "-i", bed_file,
        "-c", "4,5",
        "-o", "distinct,distinct"
    ]
    with open(tmp_file, 'w') as out_f:
        result = subprocess.run(cmd, stdout=out_f, stderr=subprocess.PIPE, text=True)
    return bed_file, result.returncode, result.stderr


# 1. List all .bed files
bed_files = glob.glob(os.path.join(raw_dir, "*.bed"))

try:
    # 2. Merge each .bed file, up to `threads` at a time. Threads are enough
    # here: each worker only waits on its own bedtools subprocess.
    print("Merging .bed files...")
    with ThreadPoolExecutor(max_workers=threads) as pool:
        for bed_file, returncode, stderr in pool.map(_merge_bed, bed_files):
            if returncode != 0:
                print(f"Error merging {bed_file}: {stderr}")

    # 3. Concatenate all .tmp files and aggregate
    print("Aggregating merged files...")
    # The shell redirect below fails if the target directory does not exist.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    cat_cmd = f"cat {raw_dir}/*.bed.tmp | python /home/andrem/GRN-project/resources/greta/tfb/aggregate.py > {output_file}"
    result = subprocess.run(cat_cmd, shell=True, capture_output=True, text=True)

    print("STDOUT:")
    print(result.stdout)
    print("STDERR:")
    print(result.stderr)
finally:
    # 4. Remove all .tmp files, even if a step above raised
    print("Cleaning up temporary files...")
    for tmp_file in glob.glob(f"{raw_dir}/*.bed.tmp"):
        os.remove(tmp_file)

print(f"Done! Aggregated file: {output_file}")


# 4. Merge

In [None]:
# Terms used to keep ReMap2022 rows: the filtering cell joins them into one
# case-insensitive pattern that is searched for in the "tissue" column.
# NOTE(review): several entries are repeated (e.g. "myeloid leukemia cell line"),
# and a few do not look blood-related ("glioblastoma cell", "osteoblastoma cell",
# "blastoderm", "blastospore"). Generic terms such as "stem cell" or "blast cell"
# also occur inside unrelated names (e.g. "fibroblast cell") — confirm these
# are intended.
remap_filter_keywords = [
    "osteoblastoma cell",
    "astroblast",
    "blast cell",
    "blastoderm",
    "blastospore",
    "glioblastoma cell",
    "hematopoietic cell",
    "leukemia cell",
    "myeloid leukemia cell line",
    "chronic myeloid leukemia cell",
    "chronic myeloid leukemia cell line",
    "acute myeloid leukemia cell",
    "acute myeloid leukemia cell line",
    "monocyte",
    "monocytic leukemia cell",
    "monocytic leukemia cell line",
    "myeloid progenitor cell",
    "myeloblast",
    "promyelocyte",
    "erythroleukemia cell",
    "erythroleukemia cell line",
    "megakaryocyte",
    "granulocyte",
    "bone marrow",
    "bone marrow cell",
    "bone marrow-derived dendritic cell",
    "bone marrow-derived macrophage",
    "stem cell",
    "hematopoietic stem cell",
    "mesenchymal stem cell",
    "peripheral blood stem cell",
    "dendritic cell",
    "macrophage",
    "myelomonocytic leukemia cell line",
    "hairy cell leukemia cell",
    "hairy cell leukemia cell line",
    "myeloid cell",
    "promyelocytic leukemia cell",
    "promyelocytic leukemia cell line",
    "promonocytic leukemia cell",
    "promonocytic leukemia cell line",
    "megakaryoblastic leukemia cell",
    "megakaryoblastic leukemia cell line",
    "AML cell",
    "CD34+ cell",
    "MLL cell line",
    "MLL leukemia cell line",
    "neoblast",
    "lymphoblast",
    "lymphoblastic leukemia cell",
    "lymphoblastic leukemia cell line",
    "lymphocyte",
    "lymphocytic leukemia cell",
    "lymphocytic leukemia cell line",
    "lymphoid progenitor cell",
    "NK cell",
    "natural killer cell",
    "bone marrow stromal cell",
    "bone marrow stromal cell line",
    "immune stem cell",
    "hematopoietic cell line",
    "peripheral blood monocyte",
    "myeloblastoma cell",
    "myeloid cell line",
    "myeloid dendritic cell",
    "myeloblastoma cell line",
    "myeloblast cell line",
    "myelomonocytic cell line",
    "leukemic stem cell",
    "megakaryocyte cell line",
    "erythroid progenitor cell",
    "erythroblast",
    "erythroblast cell line",
    "megakaryocytic leukemia cell",
    "megakaryocytic leukemia cell line",
    "basophilic leukemia cell",
    "basophilic leukemia cell line",
    "eosinophilic leukemia cell",
    "eosinophilic leukemia cell line",
    "granulocytic leukemia cell",
    "granulocytic leukemia cell line",
    "regulatory T-lymphocyte",
    "T-lymphocyte",
    "helper T-lymphocyte",
    "CD4+ cell",
    "CD8+ cell",
    "lymphoid dendritic cell",
    "macrophage foam cell",
    "peritoneal macrophage",
    "alveolar macrophage",
    "alveolar macrophage cell line",
    "monocyte-derived dendritic cell",
    "monocyte-derived macrophage",
    "monocyte cell line",
    "peripheral blood lymphocyte",
    "leukocyte",
    "large granular lymphocyte",
    "acute promyelocytic leukemia cell",
    "acute promyelocytic leukemia cell line",
    "chronic lymphocytic leukemia cell",
    "chronic lymphocytic leukemia cell line",
    "acute lymphoblastic leukemia cell",
    "acute lymphoblastic leukemia cell line",
    "B-cell leukemia cell",
    "B-cell chronic lymphocytic leukemia cell",
    "B-cell acute lymphoblastic leukemia cell",
    "T-cell acute lymphoblastic leukemia cell",
    "T-cell chronic lymphocytic leukemia cell",
    "NK-cell leukemia cell",
    "lymphocytic leukemia cell",
    "lymphocytic leukemia cell line",
    "myeloid leukemia cell",
    "myeloid leukemia cell line",
    "erythroleukemia cell",
    "erythroleukemia cell line",
    "leukemic stem cell line",
    "leukemic progenitor cell",
    "hematopoietic progenitor cell",
    "myeloid progenitor cell line",
    "granulocyte progenitor cell",
    "macrophage progenitor cell",
    "monocyte progenitor cell",
    "megakaryocyte progenitor cell",
    "erythroid progenitor cell line",
    "pluripotent hematopoietic stem cell",
    "multipotent hematopoietic stem cell",
    "common myeloid progenitor cell",
    "common lymphoid progenitor cell",
    "pro-B-lymphocyte",
    "pre-B-lymphocyte",
    "pro-T-lymphocyte",
    "pre-T-lymphocyte",
    "monoblast",
    "megakaryoblast",
    "myeloblast cell line",
    "myeloblast cell",
    "myeloid blast cell",
    "myeloid blast cell line",
    "promonocyte",
    "promonocyte cell line",
    "AML blast cell",
    "AML blast cell line",
    "myeloid leukemia blast cell",
    "myeloid leukemia blast cell line",
    "myeloid leukemia cell line",
    "myeloid leukemia cell",
    "myeloid blast cell",
    "myeloid blast cell line",
    "granulocytic blast cell",
    "granulocytic blast cell line",
    "megakaryocytic blast cell",
    "megakaryocytic blast cell line",
    "erythrocytic blast cell",
    "erythrocytic blast cell line",
    "dendritic cell line",
    "monocyte cell",
    "monocyte cell line",
    "monoblast cell",
    "monoblast cell line",
    "myelomonocyte cell",
    "myelomonocyte cell line",
    "peripheral blood stem cell",
    "myeloid-derived suppressor cell",
    "HL-60 cell",
    "NB4 cell",
    "THP-1 cell",
    "KG-1 cell",
    "MV4-11 cell",
    "OCI-AML3 cell",
    "Kasumi-1 cell",
    "U937 cell",
    "K-562 cell",
    "B-cell", "CD4", "CD4-pos", "CD8", "T-cell", "Th1", "Th17", "macrophage", "monocyte", "peripheral-blood-mononuclear-cell", "primary-B-cell", "primary-monocyte"
]


In [None]:
import re

import pandas as pd

# Input and output file paths
input_bed_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/remap2022.bed"  # replace with your .bed file path
output_bed_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/remap2022_filtered.bed"

# Read the BED file (no header assumed)
bed_df = pd.read_csv(input_bed_file, sep="\t", header=None, comment="#")

# Rename columns for clarity
bed_df.columns = ["chrom", "start", "end", "tf", "tissue"]

# Filter rows where the 'tissue' column contains any of the remap_filter_keywords.
# Each keyword is escaped so it is matched literally: str.contains treats the
# pattern as a regular expression, and an unescaped "+" (as in "CD34+ cell")
# would act as a quantifier and never match the literal plus sign.
pattern = "|".join(re.escape(keyword) for keyword in remap_filter_keywords)
filtered_bed_df = bed_df[bed_df["tissue"].str.contains(pattern, case=False, na=False)]

# Write the filtered output
filtered_bed_df.to_csv(output_bed_file, sep="\t", header=False, index=False)

print(f"Filtered BED file saved to: {output_bed_file}")

In [None]:
# Terms used to keep UniBind rows: the filtering cell joins them into one
# case-insensitive pattern that is searched for in the "tissue" column.
# NOTE(review): several entries are repeated (e.g. "myeloid leukemia cell line"),
# and a few do not look blood-related ("glioblastoma cell", "osteoblastoma cell",
# "blastoderm", "blastospore"). Generic terms such as "stem cell" or "blast cell"
# also occur inside unrelated names (e.g. "fibroblast cell") — confirm these
# are intended.
unibind_filter_keywords = [
    "osteoblastoma cell",
    "astroblast",
    "blast cell",
    "blastoderm",
    "blastospore",
    "glioblastoma cell",
    "hematopoietic cell",
    "leukemia cell",
    "myeloid leukemia cell line",
    "chronic myeloid leukemia cell",
    "chronic myeloid leukemia cell line",
    "acute myeloid leukemia cell",
    "acute myeloid leukemia cell line",
    "monocyte",
    "monocytic leukemia cell",
    "monocytic leukemia cell line",
    "myeloid progenitor cell",
    "myeloblast",
    "promyelocyte",
    "erythroleukemia cell",
    "erythroleukemia cell line",
    "megakaryocyte",
    "granulocyte",
    "bone marrow",
    "bone marrow cell",
    "bone marrow-derived dendritic cell",
    "bone marrow-derived macrophage",
    "stem cell",
    "hematopoietic stem cell",
    "mesenchymal stem cell",
    "peripheral blood stem cell",
    "dendritic cell",
    "macrophage",
    "myelomonocytic leukemia cell line",
    "hairy cell leukemia cell",
    "hairy cell leukemia cell line",
    "myeloid cell",
    "promyelocytic leukemia cell",
    "promyelocytic leukemia cell line",
    "promonocytic leukemia cell",
    "promonocytic leukemia cell line",
    "megakaryoblastic leukemia cell",
    "megakaryoblastic leukemia cell line",
    "AML cell",
    "CD34+ cell",
    "MLL cell line",
    "MLL leukemia cell line",
    "neoblast",
    "lymphoblast",
    "lymphoblastic leukemia cell",
    "lymphoblastic leukemia cell line",
    "lymphocyte",
    "lymphocytic leukemia cell",
    "lymphocytic leukemia cell line",
    "lymphoid progenitor cell",
    "NK cell",
    "natural killer cell",
    "bone marrow stromal cell",
    "bone marrow stromal cell line",
    "immune stem cell",
    "hematopoietic cell line",
    "peripheral blood monocyte",
    "myeloblastoma cell",
    "myeloid cell line",
    "myeloid dendritic cell",
    "myeloblastoma cell line",
    "myeloblast cell line",
    "myelomonocytic cell line",
    "leukemic stem cell",
    "megakaryocyte cell line",
    "erythroid progenitor cell",
    "erythroblast",
    "erythroblast cell line",
    "megakaryocytic leukemia cell",
    "megakaryocytic leukemia cell line",
    "basophilic leukemia cell",
    "basophilic leukemia cell line",
    "eosinophilic leukemia cell",
    "eosinophilic leukemia cell line",
    "granulocytic leukemia cell",
    "granulocytic leukemia cell line",
    "regulatory T-lymphocyte",
    "T-lymphocyte",
    "helper T-lymphocyte",
    "CD4+ cell",
    "CD8+ cell",
    "lymphoid dendritic cell",
    "macrophage foam cell",
    "peritoneal macrophage",
    "alveolar macrophage",
    "alveolar macrophage cell line",
    "monocyte-derived dendritic cell",
    "monocyte-derived macrophage",
    "monocyte cell line",
    "peripheral blood lymphocyte",
    "leukocyte",
    "large granular lymphocyte",
    "acute promyelocytic leukemia cell",
    "acute promyelocytic leukemia cell line",
    "chronic lymphocytic leukemia cell",
    "chronic lymphocytic leukemia cell line",
    "acute lymphoblastic leukemia cell",
    "acute lymphoblastic leukemia cell line",
    "B-cell leukemia cell",
    "B-cell chronic lymphocytic leukemia cell",
    "B-cell acute lymphoblastic leukemia cell",
    "T-cell acute lymphoblastic leukemia cell",
    "T-cell chronic lymphocytic leukemia cell",
    "NK-cell leukemia cell",
    "lymphocytic leukemia cell",
    "lymphocytic leukemia cell line",
    "myeloid leukemia cell",
    "myeloid leukemia cell line",
    "erythroleukemia cell",
    "erythroleukemia cell line",
    "leukemic stem cell line",
    "leukemic progenitor cell",
    "hematopoietic progenitor cell",
    "myeloid progenitor cell line",
    "granulocyte progenitor cell",
    "macrophage progenitor cell",
    "monocyte progenitor cell",
    "megakaryocyte progenitor cell",
    "erythroid progenitor cell line",
    "pluripotent hematopoietic stem cell",
    "multipotent hematopoietic stem cell",
    "common myeloid progenitor cell",
    "common lymphoid progenitor cell",
    "pro-B-lymphocyte",
    "pre-B-lymphocyte",
    "pro-T-lymphocyte",
    "pre-T-lymphocyte",
    "monoblast",
    "megakaryoblast",
    "myeloblast cell line",
    "myeloblast cell",
    "myeloid blast cell",
    "myeloid blast cell line",
    "promonocyte",
    "promonocyte cell line",
    "AML blast cell",
    "AML blast cell line",
    "myeloid leukemia blast cell",
    "myeloid leukemia blast cell line",
    "myeloid leukemia cell line",
    "myeloid leukemia cell",
    "myeloid blast cell",
    "myeloid blast cell line",
    "granulocytic blast cell",
    "granulocytic blast cell line",
    "megakaryocytic blast cell",
    "megakaryocytic blast cell line",
    "erythrocytic blast cell",
    "erythrocytic blast cell line",
    "dendritic cell line",
    "monocyte cell",
    "monocyte cell line",
    "monoblast cell",
    "monoblast cell line",
    "myelomonocyte cell",
    "myelomonocyte cell line",
    "peripheral blood stem cell",
    "myeloid-derived suppressor cell",
    "HL-60 cell",
    "NB4 cell",
    "THP-1 cell",
    "KG-1 cell",
    "MV4-11 cell",
    "OCI-AML3 cell",
    "Kasumi-1 cell",
    "U937 cell",
    "K-562 cell",
    "B-cell", "CD4", "CD4-pos", "CD8", "T-cell", "Th1", "Th17", "macrophage", "monocyte", "peripheral-blood-mononuclear-cell", "primary-B-cell", "primary-monocyte",
    "leukemia", "myeloid", "monocytic", "promyelocytic", "granulocytic", "erythroid", "megakaryocytic", "hematopoietic stem cell", "HSC", "LSC", "progenitor cell",
]


In [None]:
import re

import pandas as pd

# Input and output file paths
input_bed_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/unibind.bed"  # replace with your .bed file path
output_bed_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/unibind_filtered.bed"

# Read the BED file (no header assumed)
bed_df = pd.read_csv(input_bed_file, sep="\t", header=None, comment="#")

# Rename columns for clarity
bed_df.columns = ["chrom", "start", "end", "tf", "tissue"]

# Filter rows where the 'tissue' column contains any of the unibind_filter_keywords.
# Each keyword is escaped so it is matched literally: str.contains treats the
# pattern as a regular expression, and an unescaped "+" (as in "CD34+ cell")
# would act as a quantifier and never match the literal plus sign.
pattern = "|".join(re.escape(keyword) for keyword in unibind_filter_keywords)
filtered_bed_df = bed_df[bed_df["tissue"].str.contains(pattern, case=False, na=False)]

# Write the filtered output
filtered_bed_df.to_csv(output_bed_file, sep="\t", header=False, index=False)

print(f"Filtered BED file saved to: {output_bed_file}")

In [None]:
import subprocess
import os

# Define your .bed files
bed_files = [
    "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/chipatlas.bed",
    "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/remap2022_filtered.bed",
    "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/unibind_filtered.bed"
]

# Output paths
concatenated_bed = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/concatenated.bed"
no_celltype_bed = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/concatenated_no_celltype.bed"
sorted_bed = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/sorted_no_celltype.bed"
merged_bed = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/dbs/merged.bed"


def _merge_overlaps(rows):
    """Yield (chrom, tf, start, end) with overlapping intervals merged.

    `rows` must yield (chrom, start, end, tf) tuples ordered so that all rows
    of one (chrom, tf) pair are adjacent and sorted by start. Intervals are
    merged when the next start is <= the current end, i.e. overlapping or
    directly adjacent.
    """
    current = None  # [chrom, tf, start, end] of the interval being extended
    for chrom, start, end, tf in rows:
        if (
            current is not None
            and current[0] == chrom
            and current[1] == tf
            and start <= current[3]
        ):
            current[3] = max(current[3], end)
        else:
            if current is not None:
                yield tuple(current)
            current = [chrom, tf, start, end]
    if current is not None:
        yield tuple(current)


# Concatenate the files
with open(concatenated_bed, "w") as outfile:
    for bed_file in bed_files:
        with open(bed_file, "r") as infile:
            for line in infile:
                # Guard against a missing final newline gluing two records together.
                outfile.write(line if line.endswith("\n") else line + "\n")

# Remove the fifth column (cell type)
with open(concatenated_bed, "r") as infile, open(no_celltype_bed, "w") as outfile:
    for line in infile:
        fields = line.strip().split("\t")
        if len(fields) < 4:
            # Blank or truncated line: nothing usable to keep.
            continue
        outfile.write("\t".join(fields[:4]) + "\n")

# Sort by chrom, TF, start, end so that all intervals of one (chrom, TF) pair
# are adjacent and ordered by start. Sorting by chrom/start first would
# interleave different TFs, which makes a per-TF merge impossible in one pass.
# LC_ALL=C gives plain byte ordering, so distinct names are never interleaved.
subprocess.run(
    [
        "sort", "-t", "\t",
        "-k1,1", "-k4,4", "-k2,2n", "-k3,3n",
        "-o", sorted_bed,
        no_celltype_bed,
    ],
    check=True,
    env={**os.environ, "LC_ALL": "C"},
)

# Merge overlapping regions per (chrom, TF).
# `bedtools groupby -g 1,4 -c 2,3 -o min,max` did not do this: it reduced each
# consecutive run of identical (chrom, TF) rows to one min(start)-max(end)
# span, whether or not the regions overlapped.
# NOTE(review): the column order chrom, TF, start, end is kept because that is
# what the groupby call emitted; confirm downstream readers expect it.
with open(sorted_bed, "r") as infile, open(merged_bed, "w") as outfile:
    split_rows = (line.rstrip("\n").split("\t") for line in infile)
    typed_rows = (
        (chrom, int(start), int(end), tf)
        for chrom, start, end, tf in split_rows
    )
    for chrom, tf, start, end in _merge_overlaps(typed_rows):
        outfile.write(f"{chrom}\t{tf}\t{start}\t{end}\n")

print(f"Final merged BED file saved to: {merged_bed}")


-------

-------

# 2. TF Activity

In [14]:
# Inputs and output directory for the TF-activity benchmark cells.
# NOTE(review): the directory is spelled "tf-actvity" — confirm this matches the name on disk.
meta_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-actvity/knockTF_meta.csv"
data_file = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-actvity/knockTF_expr.csv"
out_dir = "/data/benchmarks/andrem/data/pAML/benchmarks/tf-actvity"

In [23]:
import pandas as pd

# Load the metadata table.
meta_df = pd.read_csv(meta_file)

# The first column may come back without a usable name (empty or
# "Unnamed: ..."); in that case call it 'Dataset ID'.
first_meta_col = meta_df.columns[0]
if first_meta_col == '' or first_meta_col.startswith('Unnamed'):
    meta_df = meta_df.rename(columns={first_meta_col: 'Dataset ID'})

# Tissue labels to keep. The filter at the end of this cell compares them
# case-insensitively against the whole "Tissue.Type" value (exact match via
# isin, not substring search).
# NOTE(review): several entries are repeated, and a few do not look
# blood-related ("glioblastoma cell", "osteoblastoma cell", "blastoderm",
# "blastospore") — confirm they are intended.
allowed_tissues = [
    "Blood", 
    "Haematopoietic_and_lymphoid_tissue", 
    "Haematopoietic_and_lymphoid_tissue_Blood", 
    "Bone_marrow",
    "Leukemia",
    "osteoblastoma cell",
    "astroblast",
    "blast cell",
    "blastoderm",
    "blastospore",
    "glioblastoma cell",
    "hematopoietic cell",
    "leukemia cell",
    "myeloid leukemia cell line",
    "chronic myeloid leukemia cell",
    "chronic myeloid leukemia cell line",
    "acute myeloid leukemia cell",
    "acute myeloid leukemia cell line",
    "monocyte",
    "monocytic leukemia cell",
    "monocytic leukemia cell line",
    "myeloid progenitor cell",
    "myeloblast",
    "promyelocyte",
    "erythroleukemia cell",
    "erythroleukemia cell line",
    "megakaryocyte",
    "granulocyte",
    "bone marrow",
    "bone marrow cell",
    "bone marrow-derived dendritic cell",
    "bone marrow-derived macrophage",
    "stem cell",
    "hematopoietic stem cell",
    "mesenchymal stem cell",
    "peripheral blood stem cell",
    "dendritic cell",
    "macrophage",
    "myelomonocytic leukemia cell line",
    "hairy cell leukemia cell",
    "hairy cell leukemia cell line",
    "myeloid cell",
    "promyelocytic leukemia cell",
    "promyelocytic leukemia cell line",
    "promonocytic leukemia cell",
    "promonocytic leukemia cell line",
    "megakaryoblastic leukemia cell",
    "megakaryoblastic leukemia cell line",
    "AML cell",
    "CD34+ cell",
    "MLL cell line",
    "MLL leukemia cell line",
    "neoblast",
    "lymphoblast",
    "lymphoblastic leukemia cell",
    "lymphoblastic leukemia cell line",
    "lymphocyte",
    "lymphocytic leukemia cell",
    "lymphocytic leukemia cell line",
    "lymphoid progenitor cell",
    "NK cell",
    "natural killer cell",
    "bone marrow stromal cell",
    "bone marrow stromal cell line",
    "immune stem cell",
    "hematopoietic cell line",
    "peripheral blood monocyte",
    "myeloblastoma cell",
    "myeloid cell line",
    "myeloid dendritic cell",
    "myeloblastoma cell line",
    "myeloblast cell line",
    "myelomonocytic cell line",
    "leukemic stem cell",
    "megakaryocyte cell line",
    "erythroid progenitor cell",
    "erythroblast",
    "erythroblast cell line",
    "megakaryocytic leukemia cell",
    "megakaryocytic leukemia cell line",
    "basophilic leukemia cell",
    "basophilic leukemia cell line",
    "eosinophilic leukemia cell",
    "eosinophilic leukemia cell line",
    "granulocytic leukemia cell",
    "granulocytic leukemia cell line",
    "regulatory T-lymphocyte",
    "T-lymphocyte",
    "helper T-lymphocyte",
    "CD4+ cell",
    "CD8+ cell",
    "lymphoid dendritic cell",
    "macrophage foam cell",
    "peritoneal macrophage",
    "alveolar macrophage",
    "alveolar macrophage cell line",
    "monocyte-derived dendritic cell",
    "monocyte-derived macrophage",
    "monocyte cell line",
    "peripheral blood lymphocyte",
    "leukocyte",
    "large granular lymphocyte",
    "acute promyelocytic leukemia cell",
    "acute promyelocytic leukemia cell line",
    "chronic lymphocytic leukemia cell",
    "chronic lymphocytic leukemia cell line",
    "acute lymphoblastic leukemia cell",
    "acute lymphoblastic leukemia cell line",
    "B-cell leukemia cell",
    "B-cell chronic lymphocytic leukemia cell",
    "B-cell acute lymphoblastic leukemia cell",
    "T-cell acute lymphoblastic leukemia cell",
    "T-cell chronic lymphocytic leukemia cell",
    "NK-cell leukemia cell",
    "lymphocytic leukemia cell",
    "lymphocytic leukemia cell line",
    "myeloid leukemia cell",
    "myeloid leukemia cell line",
    "erythroleukemia cell",
    "erythroleukemia cell line",
    "leukemic stem cell line",
    "leukemic progenitor cell",
    "hematopoietic progenitor cell",
    "myeloid progenitor cell line",
    "granulocyte progenitor cell",
    "macrophage progenitor cell",
    "monocyte progenitor cell",
    "megakaryocyte progenitor cell",
    "erythroid progenitor cell line",
    "pluripotent hematopoietic stem cell",
    "multipotent hematopoietic stem cell",
    "common myeloid progenitor cell",
    "common lymphoid progenitor cell",
    "pro-B-lymphocyte",
    "pre-B-lymphocyte",
    "pro-T-lymphocyte",
    "pre-T-lymphocyte",
    "monoblast",
    "megakaryoblast",
    "myeloblast cell line",
    "myeloblast cell",
    "myeloid blast cell",
    "myeloid blast cell line",
    "promonocyte",
    "promonocyte cell line",
    "AML blast cell",
    "AML blast cell line",
    "myeloid leukemia blast cell",
    "myeloid leukemia blast cell line",
    "myeloid leukemia cell line",
    "myeloid leukemia cell",
    "myeloid blast cell",
    "myeloid blast cell line",
    "granulocytic blast cell",
    "granulocytic blast cell line",
    "megakaryocytic blast cell",
    "megakaryocytic blast cell line",
    "erythrocytic blast cell",
    "erythrocytic blast cell line",
    "dendritic cell line",
    "monocyte cell",
    "monocyte cell line",
    "monoblast cell",
    "monoblast cell line",
    "myelomonocyte cell",
    "myelomonocyte cell line",
    "peripheral blood stem cell",
    "myeloid-derived suppressor cell",
    "HL-60 cell",
    "NB4 cell",
    "THP-1 cell",
    "KG-1 cell",
    "MV4-11 cell",
    "OCI-AML3 cell",
    "Kasumi-1 cell",
    "U937 cell",
    "K-562 cell",
    "B-cell", "CD4", "CD4-pos", "CD8", "T-cell", "Th1", "Th17", "macrophage", "monocyte", "peripheral-blood-mononuclear-cell", "primary-B-cell", "primary-monocyte",
    "leukemia", "myeloid", "monocytic", "promyelocytic", "granulocytic", "erythroid", "megakaryocytic", "hematopoietic stem cell", "HSC", "LSC", "progenitor cell",
]

# Keep only rows whose lower-cased "Tissue.Type" equals one of the allowed
# labels (labels are lower-cased too, so the comparison ignores case).
allowed_lower = [label.lower() for label in allowed_tissues]
tissue_lower = meta_df["Tissue.Type"].str.lower()
filtered_df = meta_df[tissue_lower.isin(allowed_lower)]



In [24]:
import os
import pandas as pd

# Load the main data table from `data_file` (defined in an earlier cell).
df = pd.read_csv(data_file)

# A header-less leading column is read as '' or 'Unnamed: N'; it is used
# below as the dataset identifier, so give it that name.
first_col = df.columns[0]
if first_col == '' or first_col.startswith('Unnamed'):
    df = df.rename(columns={first_col: 'Dataset ID'})

# Keep only the rows whose Dataset ID survived the metadata filter.
filtered_ids = filtered_df["Dataset ID"].unique()
df_filtered_by_ids = df[df["Dataset ID"].isin(filtered_ids)]

# Attach the TF of each dataset under the name 'TF_name'.
# The metadata column is renamed *before* merging: if the data table already
# has a column called 'TF', merging on an identically named column would make
# pandas rename both to 'TF_x' / 'TF_y', silently changing the name of the
# existing data column in the saved output. With the rename there is no
# collision and the data column keeps its name.
# NOTE(review): presumably the existing 'TF' column is a gene column
# (transferrin) - confirm against the input file.
tf_info = (
    filtered_df[["Dataset ID", "TF"]]
    .drop_duplicates()
    .rename(columns={"TF": "TF_name"})
)
df_merged = pd.merge(df_filtered_by_ids, tf_info, on="Dataset ID", how="left")



In [25]:
# Give the TF column brought in by the merge its final name.
df_merged = df_merged.rename(columns={'TF_y': 'TF_name'})

# Move 'TF_name' so that it sits directly after the first column.
cols = df_merged.columns.tolist()
cols.pop(cols.index('TF_name'))
cols.insert(1, 'TF_name')
df_merged = df_merged[cols]

# Write the reordered table to the benchmark CSV (no index column).
benchmark_path = os.path.join(out_dir, "knock_tf_benchmark_data.csv")
df_merged.to_csv(benchmark_path, index=False)


In [26]:
# Display the merged table as the cell output.
df_merged

Unnamed: 0,Dataset ID,TF_name,A1BG,A1BG-AS1,A1CF,A2LD1,A2M,A2ML1,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22
0,DataSet_01_004,CREB1,-0.255680,0.111050,-0.28527,0.0,-0.035860,-0.469700,0.18559,-0.25601,...,0.000000,-0.39888,0.000000,0.104440,-0.164340,0.191460,0.415610,0.393840,0.127900,0.0
1,DataSet_01_005,POU5F1,0.478500,-0.375710,-0.84718,0.0,3.354450,0.171040,-0.34852,-0.95517,...,0.000000,0.24849,0.000000,-0.286920,-0.018150,0.119410,0.077850,0.234740,0.228690,0.0
2,DataSet_01_012,FLI1,0.000000,0.000000,0.40251,0.0,1.934290,0.000000,0.53462,0.00000,...,0.000000,0.00000,0.000000,-0.517350,0.000000,0.000000,3.028970,0.337220,-0.239700,0.0
3,DataSet_01_014,FLI1,0.000000,0.000000,0.19366,0.0,1.960610,0.000000,-0.51895,0.00000,...,0.000000,0.00000,0.000000,0.016510,0.000000,0.000000,2.609880,0.306030,-0.200010,0.0
4,DataSet_01_015,FLI1,0.000000,0.000000,0.02946,0.0,-3.734240,0.000000,-0.05051,0.00000,...,0.000000,0.00000,0.000000,-1.047700,0.000000,0.000000,1.246170,0.098580,0.730540,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,DataSet_04_021,SMAD5,-0.311274,-0.316222,0.00000,0.0,0.000000,-0.304855,0.00000,0.00000,...,0.024783,0.00000,0.166650,-0.114845,0.083749,0.071033,-0.250247,0.002988,0.072450,0.0
249,DataSet_04_022,STAT1,-0.252936,-0.313706,0.00000,0.0,0.000000,0.000000,0.00000,0.00000,...,-0.001144,0.00000,0.420218,0.252221,0.073945,0.424413,-0.077209,0.476842,0.576382,0.0
250,DataSet_04_023,XRCC5,-0.352672,-0.067114,0.00000,0.0,3.014950,-1.536053,0.00000,0.00000,...,0.447777,0.00000,0.221415,-0.307615,-0.024420,0.008363,0.131661,-0.244030,0.237252,0.0
251,DataSet_04_024,YBX1,-0.033285,0.244966,0.00000,0.0,0.000000,-0.700440,0.00000,0.00000,...,0.251406,0.00000,0.288996,0.119108,-0.236440,-0.004361,0.178174,-0.118009,-0.027151,0.0


# 3. Gene Sets

# 3.1 Setup

In [None]:
import os
import wget
import pandas as pd
import decoupler as dc

# 3.2 Downloads

In [None]:
import pandas as pd
import decoupler as dc
import os
import subprocess

# GMT files to download; urls[i] is saved under the name db_names[i].
urls = [
    'https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/h.all.v2024.1.Hs.symbols.gmt',
    'https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c2.cp.kegg_legacy.v2024.1.Hs.symbols.gmt',
    'https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c2.cp.reactome.v2024.1.Hs.symbols.gmt'
]

db_names = [
    'hallmark',
    'kegg',
    'reactome'
]

# Output directory
output_dir = '/data/benchmarks/andrem/data/pAML/benchmarks/gsets'

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Download each GMT file, convert it to a table and save it as <db_name>.csv
for url, db_name in zip(urls, db_names):
    output_file = os.path.join(output_dir, f'{db_name}.csv')
    temp_file = f'{output_file}.tmp'

    try:
        # Argument list (no shell) avoids quoting problems; check=True stops
        # here on a failed download instead of parsing a broken/empty file.
        subprocess.run(
            ['wget', '--no-verbose', url, '-O', temp_file],
            check=True,
        )

        # Read the gene set file (GMT format)
        gst = dc.read_gmt(temp_file)
    finally:
        # wget creates the -O target even when the download fails, so always
        # clean the temporary file up.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    # Drop everything up to and including the first underscore of each set
    # name (e.g. a leading collection tag such as 'HALLMARK_').
    gst['source'] = ['_'.join(s.split('_')[1:]) for s in gst['source']]

    # Save the processed gene set to CSV
    gst.to_csv(output_file, index=False)


In [None]:
import os
import subprocess
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# Source RDA file and the locations of the temporary download / final CSV.
url = 'https://github.com/saezlab/progeny/raw/master/data/model_human_full.rda'
output_file = '/data/benchmarks/andrem/data/pAML/benchmarks/gsets/prog.csv'
output_rda = '/data/benchmarks/andrem/data/pAML/benchmarks/gsets/prog.rda'

# The download target directory must exist for wget -O to succeed.
os.makedirs(os.path.dirname(output_rda), exist_ok=True)

try:
    # Argument list (no shell) plus check=True: a failed download raises
    # here instead of letting R try to load a broken/empty file.
    subprocess.run(
        ['wget', '--no-verbose', url, '-O', output_rda],
        check=True,
    )

    # Load the RDA file into the embedded R session
    ro.r(f"load('{output_rda}')")

    # Extract model_human_full from R
    model_human_full = ro.r['model_human_full']

    # Convert the R data frame to a pandas DataFrame inside a local converter
    with localconverter(ro.default_converter + pandas2ri.converter):
        prg = ro.conversion.rpy2py(model_human_full)
finally:
    # The RDA file is only a temporary download; remove it even on failure.
    if os.path.exists(output_rda):
        os.remove(output_rda)

# Rename columns to the source/target naming used by the other gene sets
prg = prg.rename(columns={
    'gene': 'target',
    'pathway': 'source',
    'p.value': 'pval'
})

# Select relevant columns
prg = prg[['source', 'target', 'weight', 'pval']]

# Filter by pval < 1e-5
prg = prg[prg['pval'] < 1e-5]

# Keep only sources (pathways) that still have more than 5 targets
n = prg.groupby('source').size()
prg = prg[prg['source'].isin(n[n > 5].index)]

# Sort the results
prg = prg.sort_values(['source', 'target', 'weight'])

# Save the final CSV
prg.to_csv(output_file, index=False)


In [None]:
import pandas as pd

# Gene set tables to combine into one network
files = [
    '/data/benchmarks/andrem/data/pAML/benchmarks/gsets/hallmark.csv',
    '/data/benchmarks/andrem/data/pAML/benchmarks/gsets/kegg.csv',
    '/data/benchmarks/andrem/data/pAML/benchmarks/gsets/prog.csv',
    '/data/benchmarks/andrem/data/pAML/benchmarks/gsets/reactome.csv'
]

# Destination of the combined edge list
output_file = '/data/benchmarks/andrem/data/pAML/benchmarks/gsets/merged_network.csv'


def _load_edges(path):
    """Read one CSV and return only its 'source' and 'target' columns."""
    table = pd.read_csv(path)
    if not ('source' in table.columns and 'target' in table.columns):
        raise ValueError(f"File {path} is missing 'source' and/or 'target' columns.")
    return table[['source', 'target']]


# One (source, target) table per input file
dfs = [_load_edges(path) for path in files]

# Stack all tables, drop repeated edges, and write the result
merged_df = pd.concat(dfs, ignore_index=True).drop_duplicates()
merged_df.to_csv(output_file, index=False)

print(f"Merged network saved to: {output_file}")


# 4. TF Markers

In [None]:
import os
import subprocess

# ----------------------
# Define variables here:
# ----------------------

# URL to download the zip file from HPA
hpa_url = 'https://www.proteinatlas.org/download/proteinatlas.tsv.zip'

# Path to the Lambert TFs file
tfs_file = '/data/benchmarks/andrem/data/pAML/benchmarks/tf-binding-test/lambert.csv'

# Output file; the downloaded archive is stored next to it
output_file = '/data/benchmarks/andrem/data/pAML/benchmarks/tfm/hpa.tsv'
output_zip = output_file + '.zip'

# Path to the processing script
processing_script = '/home/andrem/GRN-project/resources/greta/tfm/hpa.py'

# ----------------------
# Execute steps
# ----------------------

# wget -O fails if the target directory does not exist, so create it first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Step 1: Download the HPA zip file.
# Argument list (no shell) plus check=True: a failed download raises here
# instead of handing a broken archive to the processing script.
subprocess.run(
    ['wget', '--no-verbose', hpa_url, '-O', output_zip],
    check=True,
)

# Step 2: Run the processing script on the archive and the TF list
subprocess.run([
    'python', processing_script,
    '-i', output_zip,
    '-t', tfs_file,
    '-o', output_file
], check=True)


In [None]:
import os
import subprocess
import pandas as pd

# --------------------------------------
# Define variables explicitly here:
# --------------------------------------

# URL to download the TFMDB file
tfmdb_url = 'https://bio.liclab.net/TF-Marker/public/download/download_TFMarker.csv'

# Output file path (first holds the raw download, then is overwritten with
# the processed table)
output_file = '/data/benchmarks/andrem/data/pAML/benchmarks/tfm/tfmdb.tsv'

# --------------------------------------
# Execute steps
# --------------------------------------

# wget -O fails if the target directory does not exist, so create it first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Step 1: Download the TFMDB file.
# Argument list (no shell) plus check=True: a failed download raises here
# instead of letting pandas parse a broken/empty file.
subprocess.run(
    ['wget', '--no-verbose', tfmdb_url, '-O', output_file],
    check=True,
)

# Step 2: Process the TFMDB file (the download is read as comma-separated)
df = pd.read_csv(output_file, sep=',')

# Keep only relevant columns; copy so the new column below is added to an
# independent frame rather than to a slice of the original.
df = df[['Gene Name', 'Cell Name', 'Tissue Type']].copy()

# Create 'ctype' by concatenating Cell Name and Tissue Type
df['ctype'] = df['Cell Name'] + ',' + df['Tissue Type']

# Group by Gene Name and combine ctype values into one comma-separated string
df = df.groupby('Gene Name', as_index=False)['ctype'].agg(','.join)

# Ensure unique, sorted ctype values within each gene
df['ctype'] = [','.join(sorted(set(s.split(',')))) for s in df['ctype']]

# Drop any duplicates that might remain
df = df.drop_duplicates(['Gene Name', 'ctype'])

# Rename columns for consistency
df = df.rename(columns={'Gene Name': 'gene'})

# Sort by gene and ctype
df = df.sort_values(['gene', 'ctype'])

# Save the processed file as a header-less TSV (gene, ctype)
df.to_csv(output_file, sep='\t', index=False, header=False)


In [None]:
import re
import pandas as pd

# -----------------------------------------
# Define input and output variables here:
# -----------------------------------------

# Paths to the two TSV files
file1 = '/data/benchmarks/andrem/data/pAML/benchmarks/tfm/hpa.tsv'
file2 = '/data/benchmarks/andrem/data/pAML/benchmarks/tfm/tfmdb.tsv'

# Predefined keywords to filter on
keywords = ["B cell", "Dendritic cell", "Lymphoid cell", "Macrophage cell", 
"Macrophages cell", "Macrophagocyte cell", "Mononuclear cell", 
"Natural killer cell", "Peripheral blood cell", "T cell",
"B", "Dendritic cells", "Erythroid cells", "Lymphoid tissue", 
"Macrophages", "Monocytes", "NK", "T", "Leukemia", "Myeloid leukemia",
"Myeloid", "Blood", "Bone Marrow"] 

# Output file
output_file = '/data/benchmarks/andrem/data/pAML/benchmarks/tfm/filtered_tf_markers.txt'

# -----------------------------------------
# Execute steps
# -----------------------------------------

# Read both files (header-less, two columns: gene and cell/tissue annotation)
df1 = pd.read_csv(file1, sep='\t', header=None, names=['gene', 'ctype'])
df2 = pd.read_csv(file2, sep='\t', header=None, names=['gene', 'ctype'])

# Concatenate
df = pd.concat([df1, df2], ignore_index=True)

# Match keywords as whole words (case-insensitive), not as raw substrings.
# A plain substring test makes the one- and two-letter keywords ("B", "T",
# "NK") match almost every annotation - "t" occurs in "tissue", "heart", ... -
# so the filter would keep nearly all genes. Word boundaries restrict "T" to
# things like "T", "T cell" or "T-cells"; the optional trailing "s" keeps
# plural forms such as "Natural killer cells" matching.
keyword_pattern = (
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')s?\b'
)
mask = df['ctype'].astype(str).str.contains(
    keyword_pattern, case=False, regex=True
)

# Apply the filter
filtered_df = df[mask]

# Keep only the 'gene' column and drop duplicates
filtered_genes = filtered_df['gene'].drop_duplicates()

# Save to output file (one gene per line, no header)
filtered_genes.to_csv(output_file, sep='\t', index=False, header=False)

print(f"Filtered TF markers saved to: {output_file}")
