In [None]:
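# Optional dependency install, kept commented out; when enabled it runs pip
# with CURL_CA_BUNDLE set to the CA certificate file /ca-certificates.crt.
# ! CURL_CA_BUNDLE=/ca-certificates.crt pip install polars scikit-learn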

In [None]:
from itertools import product
import polars as pl
import pandas as pd

In [None]:
# Tissue codes and assay codes used by this notebook.
tissues = [
    "BAT",
    "BLOOD",
    "LIVER",
    "HEART",
    "WAT-SC",
    "SKM-GN",
    "HYPOTH",
]
assays = [
    "ATAC",
    "TRNSCRPT",
    "METHYL",
]

# Every (tissue, assay) pair, tissue-major: all assays for the first tissue,
# then all assays for the second, and so on.
products = [*product(tissues, assays)]

In [None]:
# Measurement table: literal "NA" cells are parsed as nulls, only rows whose
# `assay` is one of the assays of interest are kept, and the `dataset` column
# is discarded.
genomic_data = pl.read_csv(
    "data/TRAINING_REGULATED_NORM_DATA.csv",
    null_values=["NA"],
)
genomic_data = genomic_data.filter(pl.col("assay").is_in(assays)).drop("dataset")

# Feature annotation table, parsed with the same "NA" -> null convention.
feature_to_gene = pl.read_csv(
    "data/FEATURE_TO_GENE.csv",
    null_values=["NA"],
)

In [None]:
# Attach the Ensembl gene of each feature to the measurement rows. Because the
# join is a left join, rows whose feature_ID is absent from the mapping are
# kept and get a null `ensembl_gene`.
data: pl.DataFrame = (
    genomic_data
    .join(
        feature_to_gene.select("feature_ID", "ensembl_gene"),
        how="left",
        on="feature_ID",
    )
    # Remove the first (position 0) column of the joined frame.
    # NOTE(review): presumably an unnamed row-index column from the CSV — confirm.
    .drop(pl.selectors.by_index(0))
)

In [None]:
# Average every remaining column per (feature, ensembl_gene) pair, after
# removing the tissue / assay / feature_ID columns.
#
# `group_by` does not guarantee the order of its output groups, and sorting on
# `ensembl_gene` alone leaves rows with the same (or a null) gene in an
# arbitrary relative order. `feature` is therefore used as a tie-breaker:
# each (feature, ensembl_gene) pair is unique after the aggregation, so the
# resulting row order is fully determined and identical from run to run.
# This matters because the rows of this table are later transposed into the
# feature columns of the exported datasets.
grouped_data = (
    data.drop("tissue", "assay", "feature_ID")
    .group_by("feature", "ensembl_gene")
    .mean()
    .sort("ensembl_gene", "feature")
    .drop("ensembl_gene")
)

# Persist the aggregated table so it can be reloaded without recomputing.
grouped_data.write_parquet("data/grouped_data.parquet")

In [None]:
# Load the aggregated feature table from its parquet file.
grouped_data = pl.read_parquet(source="data/grouped_data.parquet")

In [None]:
# Load the merged phenotype / vial-label table (a tab-separated text file).
pheno = pd.read_csv(
    filepath_or_buffer="data/motrpac_pass1b-06_pheno_viallabel_data_merged_v4.0.txt",
    sep="\t",
)

In [None]:
# Keep only the phenotype columns needed downstream. `.copy()` makes the
# subset an independent frame, so the column assignments below write to it
# directly and do not trigger pandas' SettingWithCopyWarning.
phenoSubset = pheno[
    [
        "pid",
        "viallabel",
        "sex",
        "vo2.max.test.vo2_max_2",
        "registration.weight",
        "terminal.weight.bw",
        "group",
    ]
].copy()

# Mass lost: registration weight minus terminal body weight (positive when
# the terminal weight is lower than the registration weight).
phenoSubset["mass_lost"] = (
    phenoSubset["registration.weight"] - phenoSubset["terminal.weight.bw"]
)

# Mass lost expressed as a fraction of the registration weight.
phenoSubset["norm_mass_lost"] = (
    phenoSubset["mass_lost"] / phenoSubset["registration.weight"]
)

# NOTE(review): a later cell reads "data/phenoSubset.csv", but no cell visible
# in this notebook writes `phenoSubset` to that path — confirm where the file
# is produced so the notebook survives Restart & Run All.

In [None]:
# Phenotype subset as a polars frame; `pid` is read as a string column.
pheno_subset = pl.read_csv(
    "data/phenoSubset.csv",
    schema_overrides={"pid": pl.String},
)

# Remove the first (position 0) column and `viallabel`, give the VO2max column
# a short name, then collapse rows that are exact duplicates of each other.
pheno_subset = (
    pheno_subset
    .drop(pl.selectors.by_index(0), "viallabel")
    .rename({"vo2.max.test.vo2_max_2": "vo2max"})
    .unique()
)

In [None]:
# Transpose the aggregated table: each value of `feature` becomes a column
# name, and each remaining original column becomes one row whose former
# column name is stored in a new `pid` column.
# NOTE(review): presumably those column names are participant IDs, since the
# result is later joined to the phenotype table on `pid` — confirm.
feature_engineered_data = (
    grouped_data
    .select(pl.exclude("feature"))
    .transpose(
        column_names=grouped_data["feature"],
        include_header=True,
        header_name="pid",
    )
)

In [None]:
# Show the phenotype subset as this cell's output.
pheno_subset

In [147]:
# VO2max table: the transposed feature matrix combined with the `vo2max`,
# `group` and `sex` phenotype columns. The inner join keeps only `pid` values
# present on both sides.
vo2max_dataset = feature_engineered_data.join(
    pheno_subset.select("pid", "vo2max", "group", "sex"),
    on="pid",
    how="inner",
)

# Export the combined table as CSV.
vo2max_dataset.write_csv("data/vo2max_dataset.csv")

In [148]:
# Weight-change table: the transposed feature matrix combined with the
# `norm_mass_lost`, `group` and `sex` phenotype columns. The inner join keeps
# only `pid` values present on both sides.
weight_dataset = feature_engineered_data.join(
    pheno_subset.select("pid", "norm_mass_lost", "group", "sex"),
    on="pid",
    how="inner",
)

# Export the combined table as CSV.
weight_dataset.write_csv("data/weight_dataset.csv")