In [1]:
import os
from itertools import product
import polars as pl

In [10]:
# Tissue and assay labels; each (tissue, assay) pair names one candidate
# data file of the form data/{tissue}_{assay}_data.csv.
tissues = [
    "BAT",
    "BLOOD",
    "LIVER",
    "HEART",
    "WAT-SC",
    "SKM-GN",
    "HYPOTH",
]
assays = [
    "ATAC",
    "TRNSCRPT",
    "METHYL",
]

# Every tissue/assay combination, tissue-major (assay varies fastest).
products = [*product(tissues, assays)]

In [11]:
# Per-assay feature annotation tables, keyed by assay name. Each table is
# trimmed to `feature_ID` plus one assay-specific column
# (`ensembl_gene`, `gene_id`, or `EntrezID`).
annotations = dict(
    # `chrom` is forced to String at read time even though it is not kept;
    # presumably it holds non-numeric values that break inference -- TODO confirm.
    ATAC=pl.read_csv(
        "data/ATAC_feature_annotation.csv",
        schema_overrides={"chrom": pl.String},
    ).select("feature_ID", "ensembl_gene"),
    TRNSCRPT=pl.read_csv(
        "data/TRNSCRPT_feature_annotation.csv",
    ).select("feature_ID", "gene_id"),
    # `EntrezID` is read as String rather than an inferred dtype.
    METHYL=pl.read_csv(
        "data/METHYL_feature_annotation.csv",
        schema_overrides={"EntrezID": pl.String},
    ).select("feature_ID", "EntrezID"),
)

In [12]:
# Build one annotated frame per (tissue, assay) pair whose data file exists.
# Pairs without a file are reported and skipped, not treated as errors.
all_data = []
for tissue, assay in products:
    print(f"Processing {tissue} {assay}")
    path = f"data/{tissue}_{assay}_data.csv"
    if os.path.exists(path):
        data = pl.read_csv(path)
        # Join on the shared key with polars' default join strategy (inner),
        # so rows whose feature_ID has no annotation entry are dropped.
        annotated_data = data.join(annotations[assay], on="feature_ID")
        all_data.append(annotated_data)
    else:
        print(f"File {path} does not exist.")

Processing BAT ATAC
Processing BAT TRNSCRPT
Processing BAT METHYL
Processing BLOOD ATAC
File data/BLOOD_ATAC_data.csv does not exist.
Processing BLOOD TRNSCRPT
Processing BLOOD METHYL
File data/BLOOD_METHYL_data.csv does not exist.
Processing LIVER ATAC
Processing LIVER TRNSCRPT
Processing LIVER METHYL
Processing HEART ATAC
Processing HEART TRNSCRPT
Processing HEART METHYL
Processing WAT-SC ATAC
Processing WAT-SC TRNSCRPT
Processing WAT-SC METHYL
Processing SKM-GN ATAC
Processing SKM-GN TRNSCRPT
Processing SKM-GN METHYL
Processing HYPOTH ATAC
File data/HYPOTH_ATAC_data.csv does not exist.
Processing HYPOTH TRNSCRPT
Processing HYPOTH METHYL
File data/HYPOTH_METHYL_data.csv does not exist.


In [13]:
# Stack all per-file frames into one table. "diagonal_relaxed" takes the union
# of columns (filling absent ones with null) and casts mismatched dtypes to a
# common supertype, which is needed because each assay contributes a different
# annotation column. The `feature` column is then discarded.
merged_data = (
    pl.concat(
        all_data,
        how="diagonal_relaxed",
    )
    .drop("feature")
)

In [16]:
# Show the merged frame as the cell's rich output.
# NOTE(review): the output saved with this cell is `True`, which is not a
# DataFrame repr -- the cell was probably edited after it last ran; re-run it.
merged_data

True

In [None]:
# Persist the merged frame as a Parquet file alongside the input CSVs.
merged_data.write_parquet("data/merged_data.parquet")

In [None]:
# Deep memory footprint of a pandas copy of the merged frame, in MB
# (total bytes / 1e6).
# NOTE(review): this materialises a full pandas copy just to measure it; if
# the polars footprint is what matters, `merged_data.estimated_size()` avoids
# the conversion (it reports a different quantity, so it is not swapped in here).
(
    merged_data
    .to_pandas()
    .memory_usage(deep=True)
    .sum()
    / 1e6
)