In [70]:
from itertools import product
import polars as pl
import pandas as pd

In [None]:
# Tissue codes and assay types that this notebook works with.
tissues = [
    "BAT",
    "BLOOD",
    "LIVER",
    "HEART",
    "WAT-SC",
    "SKM-GN",
    "HYPOTH",
]
assays = [
    "ATAC",
    "TRNSCRPT",
    "METHYL",
]

# Every (tissue, assay) pair, tissue-major: all assays for the first tissue,
# then all assays for the second, and so on.
products = [(tissue, assay) for tissue in tissues for assay in assays]

In [None]:
# Normalized training measurements. Literal "NA" cells are parsed as nulls,
# only rows whose `assay` is listed in `assays` are kept, and the `dataset`
# column is discarded.
assay_is_selected = pl.col("assay").is_in(assays)
genomic_data = (
    pl.read_csv("data/TRAINING_REGULATED_NORM_DATA.csv", null_values=["NA"])
    .filter(assay_is_selected)
    .drop("dataset")
)

# Feature-level annotation table, read with the same "NA" -> null handling.
feature_to_gene = pl.read_csv("data/FEATURE_TO_GENE.csv", null_values=["NA"])

In [75]:
# Annotate each measurement row with its gene. A left join keeps every row of
# `genomic_data`; rows whose feature_ID has no match get a null ensembl_gene.
gene_lookup = feature_to_gene.select("feature_ID", "ensembl_gene")
annotated = genomic_data.join(gene_lookup, on="feature_ID", how="left")

# Discard the column at position 0 — presumably a leftover row-index column
# from the CSV export; TODO confirm against the input file.
data: pl.DataFrame = annotated.drop(pl.selectors.by_index(0))

In [96]:
# Remove the identifier columns that are not part of the grouping key, so the
# aggregation below only sees `feature`, `ensembl_gene` and the value columns.
values_with_keys = data.drop("tissue", "assay", "feature_ID")

# One row per (feature, ensembl_gene): every remaining column is averaged
# within its group.
per_feature_mean = values_with_keys.group_by("feature", "ensembl_gene").mean()

# Order rows by gene, then drop the gene column now that it has served as the
# sort key; `feature` stays as the row label.
grouped_data = per_feature_mean.sort("ensembl_gene").drop("ensembl_gene")

# Persist the aggregate so it can be reloaded without recomputing.
grouped_data.write_parquet("data/grouped_data.parquet")

In [None]:
# Reload the aggregated table from its parquet file; this rebinds
# `grouped_data`, so the cells below work from the on-disk copy.
grouped_data = pl.read_parquet("data/grouped_data.parquet")

In [98]:
# Reading phenotype data (tab-separated).
#
# The recorded run of this cell emitted a warning pointing at this read_csv
# call — presumably pandas' mixed-type DtypeWarning; confirm on re-run.
# `low_memory=False` makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, which is the documented way to avoid
# inconsistent per-chunk dtypes.
pheno = pd.read_csv(
    "data/motrpac_pass1b-06_pheno_viallabel_data_merged_v4.0.txt",
    sep="\t",
    low_memory=False,
)

  pheno = pd.read_csv("data/motrpac_pass1b-06_pheno_viallabel_data_merged_v4.0.txt", sep="\t")


In [None]:
# Sub-setting phenotype data to include necessary columns.
# `.copy()` makes the subset an independent frame, so the column assignments
# below write to `phenoSubset` itself rather than to a slice of `pheno`
# (which would raise pandas' SettingWithCopyWarning).
phenoSubset = pheno[
    [
        "pid",
        "viallabel",
        "sex",
        "vo2.max.test.vo2_max_2",
        "registration.weight",
        "terminal.weight.bw",
        "group",
    ]
].copy()

# Registration weight minus terminal weight; negative values mean the
# terminal weight was higher than the registration weight.
phenoSubset["mass_lost"] = (
    phenoSubset["registration.weight"] - phenoSubset["terminal.weight.bw"]
)

# Mass lost expressed as a fraction of the registration weight.
phenoSubset["norm_mass_lost"] = (
    phenoSubset["mass_lost"] / phenoSubset["registration.weight"]
)

# Persist the subset: a later cell reads "data/phenoSubset.csv" and discards
# its first column by position, which matches the unnamed index column that
# `to_csv` writes by default. Without this write the notebook depends on a
# file left over from an earlier session.
phenoSubset.to_csv("data/phenoSubset.csv")

In [129]:
# Load the saved phenotype subset; `pid` is forced to a string column.
pheno_from_csv = pl.read_csv(
    "data/phenoSubset.csv",
    schema_overrides={"pid": pl.String},
)

# Drop the column at position 0 and the per-vial label, then give the VO2max
# column a shorter name.
pheno_without_vials = pheno_from_csv.drop(
    pl.selectors.by_index(0), "viallabel"
).rename({"vo2.max.test.vo2_max_2": "vo2max"})

# Collapse rows that are identical across all remaining columns.
pheno_subset = pheno_without_vials.unique()

In [130]:
# Pivot the aggregate so that each non-`feature` column of `grouped_data`
# becomes a row. The original column names are stored in a new "pid" column,
# and the values of `feature` become the new column names.
feature_names = grouped_data.get_column("feature")
value_columns = grouped_data.select(pl.exclude("feature"))

feature_engineered_data = value_columns.transpose(
    include_header=True,
    header_name="pid",
    column_names=feature_names,
)

In [132]:
# Display the de-duplicated phenotype table as a visual sanity check.
pheno_subset

pid,sex,vo2max,registration.weight,terminal.weight.bw,group,mass_lost,norm_mass_lost
str,str,f64,f64,f64,str,f64,f64
"""10026940""","""male""",73.37,342.6,326.5,"""8w""",16.1,0.046994
"""10502300""","""male""",,351.1,399.4,"""2w""",-48.3,-0.137568
"""10046461""","""female""",84.06,192.3,200.3,"""8w""",-8.0,-0.041602
"""10175431""","""male""",69.26,313.8,308.5,"""4w""",5.3,0.01689
"""10026355""","""male""",52.64,353.7,381.1,"""control""",-27.4,-0.077467
…,…,…,…,…,…,…,…
"""10106383""","""male""",67.3,292.7,301.4,"""4w""",-8.7,-0.029723
"""10027327""","""male""",58.53,361.3,351.4,"""control""",9.9,0.027401
"""10136967""","""male""",67.39,337.1,353.9,"""4w""",-16.8,-0.049837
"""10886028""","""female""",,163.9,183.3,"""1w""",-19.4,-0.118365


In [138]:
# Modelling table for VO2max: the feature matrix plus the `vo2max` column,
# matched on `pid` (default inner join, so only pids present on both sides
# are kept).
# NOTE(review): the join does not filter on the target, so rows whose vo2max
# is null are written out as well.
vo2max_targets = pheno_subset.select("pid", "vo2max")
vo2max_dataset = feature_engineered_data.join(vo2max_targets, on="pid")

vo2max_dataset.write_csv("data/vo2max_dataset.csv")

In [139]:
# Modelling table for weight change: the feature matrix plus the
# `norm_mass_lost` column, matched on `pid` (default inner join, so only pids
# present on both sides are kept).
weight_targets = pheno_subset.select("pid", "norm_mass_lost")
weight_dataset = feature_engineered_data.join(weight_targets, on="pid")

weight_dataset.write_csv("data/weight_dataset.csv")