In [1]:
from featureio import FeatureIO
from dataclasses import dataclass
from typing import List
import numpy as np
import pandas as pd

In [2]:
pwd

'/home/adamdrake/notebooks'

In [3]:
with FeatureIO() as ftio:
    pass

In [4]:
@dataclass
class CancerTypeGroups:
    healthy: List[str]
    benign: List[str]
    precancer: List[str]
    cancer: List[str]

In [5]:
OLINK_TRAINING_CLASSES = CancerTypeGroups(
    healthy=[
        "colorectal_healthy_olink_discovery2alpha_v6",
        "colorectal_healthy_olink_discovery2alpha_v6_panelofneg",
        "colorectal_healthy_olink_discovery2beta_v3",
        "colorectal_healthy_olink_discovery2beta_v3_panelofneg",
        "colorectal_healthy_olink_discovery2gamma_v2",
        "colorectal_healthy_olink_discovery2gamma_v2_panelofneg",
        "colorectal_healthy_olink_discovery2delta_v3",
        "colorectal_healthy_olink_discovery2delta_v3_panelofneg",
    ],
    benign=[
        "colorectal_benign_olink_discovery2alpha_v6",
        "colorectal_benign_olink_discovery2beta_v3",
        "colorectal_benign_olink_discovery2gamma_v2",
        "colorectal_benign_olink_discovery2delta_v3",
    ],
    precancer=[
        "colorectal_precancer_olink_discovery2alpha_v6",
        "colorectal_precancer_olink_discovery2beta_v3",
        "colorectal_precancer_olink_discovery2gamma_v2",
        "colorectal_precancer_olink_discovery2delta_v3",
    ],
    cancer=[
        "colorectal_cancer_olink_discovery2alpha_v6",
        "colorectal_cancer_olink_discovery2beta_v3",
        "colorectal_cancer_olink_discovery2gamma_v2",
        "colorectal_cancer_olink_discovery2delta_v3",
    ],
)

BIOGNOSYS_TRAINING_CLASSES = CancerTypeGroups(
    healthy=[
        "colorectal_healthy_bgs_discovery2alpha_v6_panelofneg",
        "colorectal_healthy_bgs_discovery2alpha_v6",
        "colorectal_healthy_bgs_discovery2beta_v3",
        "colorectal_healthy_bgs_discovery2beta_v3_panelofneg",
        "colorectal_healthy_bgs_discovery2gamma_v2",
        "colorectal_healthy_bgs_discovery2gamma_v2_panelofneg",
        "colorectal_healthy_bgs_discovery2delta_v3_panelofneg",
        "colorectal_healthy_bgs_discovery2delta_v3",
    ],
    benign=[
        "colorectal_benign_bgs_discovery2alpha_v6",
        "colorectal_benign_bgs_discovery2beta_v3",
        "colorectal_benign_bgs_discovery2gamma_v2",
        "colorectal_benign_bgs_discovery2delta_v3",
    ],
    precancer=[
        "colorectal_precancer_bgs_discovery2alpha_v6",
        "colorectal_precancer_bgs_discovery2beta_v3",
        "colorectal_precancer_bgs_discovery2gamma_v2",
        "colorectal_precancer_bgs_discovery2delta_v3",
    ],
    cancer=[
        "colorectal_cancer_bgs_discovery2alpha_v6",
        "colorectal_cancer_bgs_discovery2beta_v3",
        "colorectal_cancer_bgs_discovery2gamma_v2",
        "colorectal_cancer_bgs_discovery2delta_v3",
    ],
)

LMNX_TRAINING_CLASSES = CancerTypeGroups(
    healthy=[
        "colorectal_healthy_lmnx_discovery2delta_v3",
        "colorectal_healthy_lmnx_discovery2delta_v3_panelofneg",
        "colorectal_healthy_lmnx_discovery2beta_v3",
        "colorectal_healthy_lmnx_discovery2beta_v3_panelofneg",
        "colorectal_healthy_lmnx_discovery2alpha_v6",
        "colorectal_healthy_lmnx_discovery2alpha_v6_panelofneg",
        "colorectal_healthy_lmnx_discovery2gamma_v2",
        "colorectal_healthy_lmnx_discovery2gamma_v2_panelofneg",
    ],
    benign=[
        "colorectal_benign_lmnx_discovery2delta_v3",
        "colorectal_benign_lmnx_discovery2beta_v3",
        "colorectal_benign_lmnx_discovery2alpha_v6",
        "colorectal_benign_lmnx_discovery2gamma_v2",
    ],
    precancer=[
        "colorectal_precancer_lmnx_discovery2delta_v3",
        "colorectal_precancer_lmnx_discovery2beta_v3",
        "colorectal_precancer_lmnx_discovery2alpha_v6",
        "colorectal_precancer_lmnx_discovery2gamma_v2",
    ],
    cancer=[
        "colorectal_cancer_lmnx_discovery2delta_v3",
        "colorectal_cancer_lmnx_discovery2beta_v3",
        "colorectal_cancer_lmnx_discovery2alpha_v6",
        "colorectal_cancer_lmnx_discovery2gamma_v2",
    ],
)

In [9]:
assays = ["olink", "biognosys", "luminex"]

features =["olink_all_panels_disc2/v1",
           "biognosys_dc2_protein_intensities/v1",
           "luminex-protein-test-feature-median/v4"]

training_classes = [
    OLINK_TRAINING_CLASSES,
    BIOGNOSYS_TRAINING_CLASSES,
    LMNX_TRAINING_CLASSES
]

assay_cm_key = {
    "olink": ("Assay", "Uniprot ID"),
    "biognosys": ("Gene", "uniprot"),
    "luminex": ("panel", None)
}

for assay, feature, training_class in zip(assays, features, training_classes):
    training_class_list = np.hstack(training_class.__dict__.values())
    dataset_ids = np.hstack([ftio.get_datasets_for_training_class(tc) for tc in training_class_list])

    f, v = feature.split("/v")
    data = ftio.get_feature_vectors(f, int(v), dataset_ids)
    column_metadata = ftio.get_column_metadata(f, int(v))[0]
    sample_metadata = ftio.get_sample_metadata(dataset_ids)
    types = [sm.raw.sample_metadata.cancer_type for sm in sample_metadata]
    ids = [sm.raw.sample_metadata.id for sm in sample_metadata]
    tube_ids = [sm.raw.raw_tube_metadata["id"] for sm in sample_metadata]
    index = [(x, y, t) for x, y, t in zip(ids, tube_ids, types)]
    columns = [[cm.get(key) for key in assay_cm_key[assay]] for cm in column_metadata]
    fn = f"/home/adamdrake/notebooks/dc2_protein_data/{assay}_dc2.csv"
    
    df = pd.DataFrame(data, columns=pd.MultiIndex.from_tuples(columns), index=pd.MultiIndex.from_tuples(index))
    df.to_csv(fn)
    break

  training_class_list = np.hstack(training_class.__dict__.values())


In [74]:
for d in data:
    print(d.shape)
    break

(81,)


In [17]:
sids = list(list(zip(*df.index))[0])

sids

[20857,
 22319,
 24119,
 23738,
 23432,
 23021,
 20236,
 23560,
 20640,
 22395,
 22686,
 22262,
 23485,
 22996,
 20880,
 23839,
 22281,
 20445,
 23119,
 23514,
 22748,
 22882,
 23531,
 23092,
 20413,
 23100,
 23791,
 23801,
 22434,
 22317,
 22745,
 23757,
 21238,
 22252,
 20343,
 20703,
 20851,
 21251,
 23259,
 22307,
 21182,
 20987,
 20626,
 20397,
 23257,
 21423,
 23081,
 22937,
 20675,
 22381,
 22290,
 22825,
 22426,
 20553,
 23928,
 20632,
 20918,
 22891,
 22366,
 22315,
 20371,
 22999,
 20942,
 23060,
 22875,
 22276,
 23527,
 20890,
 24141,
 20903,
 29296,
 21490,
 24081,
 20332,
 23719,
 23859,
 24072,
 20637,
 23766,
 20925,
 22701,
 21160,
 20443,
 21659,
 22864,
 21241,
 22287,
 21255,
 20616,
 20669,
 20725,
 21103,
 23397,
 20720,
 23093,
 22924,
 26860,
 22261,
 21417,
 24153,
 23933,
 23144,
 22878,
 22876,
 20610,
 29839,
 23826,
 22155,
 23553,
 22877,
 20219,
 22212,
 22207,
 22329,
 23143,
 24124,
 24084,
 22724,
 23128,
 21178,
 22201,
 22352,
 26382,
 20951,
 23816,


In [71]:
for assay, feature, training_class in zip(assays, features, training_classes):
    training_class_list = np.hstack(training_class.__dict__.values())
    dataset_ids = np.hstack([ftio.get_datasets_for_training_class(tc) for tc in training_class_list])
    print(assay)
    print(training_class_list)

  training_class_list = np.hstack(training_class.__dict__.values())


olink
['colorectal_healthy_olink_discovery2alpha_v6'
 'colorectal_healthy_olink_discovery2alpha_v6_panelofneg'
 'colorectal_healthy_olink_discovery2beta_v3'
 'colorectal_healthy_olink_discovery2beta_v3_panelofneg'
 'colorectal_healthy_olink_discovery2gamma_v2'
 'colorectal_healthy_olink_discovery2gamma_v2_panelofneg'
 'colorectal_healthy_olink_discovery2delta_v3'
 'colorectal_healthy_olink_discovery2delta_v3_panelofneg'
 'colorectal_benign_olink_discovery2alpha_v6'
 'colorectal_benign_olink_discovery2beta_v3'
 'colorectal_benign_olink_discovery2gamma_v2'
 'colorectal_benign_olink_discovery2delta_v3'
 'colorectal_precancer_olink_discovery2alpha_v6'
 'colorectal_precancer_olink_discovery2beta_v3'
 'colorectal_precancer_olink_discovery2gamma_v2'
 'colorectal_precancer_olink_discovery2delta_v3'
 'colorectal_cancer_olink_discovery2alpha_v6'
 'colorectal_cancer_olink_discovery2beta_v3'
 'colorectal_cancer_olink_discovery2gamma_v2'
 'colorectal_cancer_olink_discovery2delta_v3']
biognosys
['co