In [7]:
%load_ext autoreload
%autoreload 2

from biolearn.data_library import DataLibrary
import pickle
from tqdm import tqdm
import textwrap
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, wilcoxon
from statsmodels.stats.multitest import multipletests
from os.path import basename, splitext, exists
from glob import glob

from computage.utils.data_utils import test_dataset
from computage.configs.datasets_bench_config import *
from computage.benchmarking.benchmarking import EpiClocksBenchmarking

import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Test loading a single dataset

In [166]:
# Smoke test: look up one GEO series in the biolearn data library.
gse = 'GSE52588'
data_source = DataLibrary().get(gse)

# Wrap the series summary at 70 columns so the printed text stays readable.
print(textwrap.fill(text=data_source.summary, width=70))
data = data_source.load()

Down syndrome is characterized by a wide spectrum of clinical signs,
which include cognitive and endocrine disorders and haematological
abnormalities. Although it is well established that the causative
defect of Down syndrome is the trisomy of chromosome 21, the molecular
bases of Down syndrome phenotype are still largely unknown. We used
the Infinium HumanMethylation450 BeadChip to investigate DNA
methylation patterns in whole blood from 29 subjects affected by Down
syndrome (DS), using their healthy relatives as controls (mothers  and
unaffected siblings). This family-based model allowed us to monitor
possible confounding effects on DNA methylation patterns deriving from
genetic and environmental (lifestyle) factors. The identified
epigenetic signature of Down syndrome includes differentially
methylated regions that, although enriched on chromosome 21, interest
most of the other chromosomes and can be functionally linked to the
developmental and haematological defects characteristic 

In [167]:
# Bundle the methylation matrix and the sample metadata into one dict so they
# can be serialized together.
data_obj = {
    'data': data.dnam,
    'meta': data.metadata,
}

In [168]:
# Cache the dataset bundle on disk under data/<gse>.pkl.
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving an open handle for the garbage collector.
with open(f'data/{gse}.pkl', 'wb') as pkl_file:
    pickle.dump(data_obj, pkl_file)

In [169]:
# Now load it back from the local cache.
# NOTE: unpickling can execute arbitrary code; only load files you wrote yourself.
# The context manager releases the file handle as soon as loading finishes.
with open(f'data/{gse}.pkl', 'rb') as pkl_file:
    data_obj = pickle.load(pkl_file)

In [171]:
# Show the per-sample disease label column of the metadata table.
data_obj['meta']['disease_state']  # append .value_counts() to see group sizes

id
GSM1272122    Down syndrome
GSM1272123    Down syndrome
GSM1272124    Down syndrome
GSM1272125    Down syndrome
GSM1272126    Down syndrome
                  ...      
GSM1272204          healthy
GSM1272205          healthy
GSM1272206          healthy
GSM1272207          healthy
GSM1272208          healthy
Name: disease_state, Length: 87, dtype: object

# test dataset downloading

In [1]:
import requests
from urllib.parse import urlencode
import os

# Yandex Disk public API endpoint that turns a public share link into a
# direct download URL (returned under the 'href' key of the JSON response).
base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'
public_key = 'https://disk.yandex.ru/i/6EKzmWgHWfQhhA'  # link to file

# Get the download link
final_url = base_url + urlencode(dict(public_key=public_key))
response = requests.get(final_url)
# Surface HTTP errors here rather than as an opaque KeyError on 'href' below.
response.raise_for_status()
download_url = response.json()['href']
download_url

'https://downloader.disk.yandex.ru/disk/c3497b4318bdb37d31e77c872507055652550449e3a53e42ad020f4553360079/6602bc7e/o-xAw257UInz2E5KCNNYxzrwZIFIEChaHjZ5c8XuVTkQQP5Vu3LX_PfcrbWl_WXMgzctlrPIR5lis-RHW_1Y4w%3D%3D?uid=0&filename=jess.jpg&disposition=attachment&hash=PIh7RkISs8vpRICogGJFgYXo7GoHNggOK3PpICkoZx%2BkzyvR9xvEdnuySuRZLXysq/J6bpmRyOJonT3VoXnDag%3D%3D%3A&limit=0&content_type=image%2Fjpeg&owner_uid=614251127&fsize=3052875&hid=53e1d3b50896254829e1f189ba2f4197&media_type=image&tknv=v2'

In [None]:
# Download the file and save it
download_response = requests.get(download_url)
# Stop on an HTTP error so an error body is never written to disk as the image.
download_response.raise_for_status()
with open('jess.jpg', 'wb') as f:   # Put the desired output file path here
    f.write(download_response.content)

In [None]:
# NOTE(review): unlike the cell that asks the API for an 'href', this requests
# the public share URL directly — confirm it returns the file and not a web page.
download_url = 'https://disk.yandex.ru/d/UKAe_mXsv2TfVg'
out_file_name = 'jess.txt'

# Open the connection first so a failed request does not leave an empty file,
# and use context managers so both the response and the file are always closed.
with requests.get(download_url, stream=True) as req:
    req.raise_for_status()
    with open(out_file_name, 'wb') as out_stream:
        for chunk in req.iter_content(1024):  # 1 KB chunks
            out_stream.write(chunk)

# Benchmarking of published clocks

In [8]:
# Models to benchmark, grouped by origin.
#   "in_library": clocks referenced by name only; each maps to its own empty
#                 options dict.
#   "new_models": user-supplied models. Each entry must carry a `path` to a
#                 model stored in pickle (.pkl) format, e.g.
#                 'my_new_model_name': {'path': '/path/to/model.pkl'}
models_config = {
    "in_library": {
        clock_name: {}
        for clock_name in (
            'Horvathv1',
            'Hannum',
            'Lin',
            'PhenoAge',
            'YingCausAge',
            'YingDamAge',
            'YingAdaptAge',
            'Horvathv2',
            'PEDBE',
            'HRSInCHPhenoAge',
            # 'GrimAgeV1',
            # 'GrimAgeV2',
        )
    },
    "new_models": {},
}

# datasets_config_short = dict(list(datasets_config_old.items())[:3])


In [29]:
# Register every checkpoint file matching the pattern as a new model, keyed by
# its file name without the extension.
pattern = '/tank/projects/computage/checkpoints/*fs3*'
models_config['new_models'].update(
    {
        splitext(basename(ckpt_path))[0]: {'path': ckpt_path}
        for ckpt_path in glob(pattern)
    }
)

In [9]:
# data, meta = pd.read_pickle('/tank/projects/computage/benchmarking/GSE103929.pkl.gz', compression='gzip').values()

In [3]:
# Four-entry slice of the main dataset config (positions -19 up to, but not
# including, -15), handy for quick trial runs.
datasets_config_short = {
    gse_id: datasets_config_main[gse_id]
    for gse_id in list(datasets_config_main)[-19:-15]
}
datasets_config_short

{'GSE182991': {'path': '/tank/projects/computage/benchmarking/GSE182991.pkl.gz',
  'conditions': ['HGPS', 'ncLMNA'],
  'test': 'AA2'},
 'GSE214297': {'path': '/tank/projects/computage/benchmarking/GSE214297.pkl.gz',
  'conditions': ['CGL'],
  'test': 'AA2'},
 'GSE43976': {'path': '/tank/projects/computage/benchmarking/GSE43976.pkl.gz',
  'conditions': ['MS'],
  'test': 'AA1'},
 'GSE59685': {'path': '/tank/projects/computage/benchmarking/GSE59685.pkl.gz',
  'conditions': ['AD'],
  'test': 'AA2'}}

In [11]:
# Build the benchmark over the full dataset config and run it.
# NOTE(review): the exact meaning of each option is defined by
# EpiClocksBenchmarking, which is not visible in this notebook — confirm
# against its documentation before changing values.
bench = EpiClocksBenchmarking(
    models_config=models_config,
    # Swap in the short config below for a quick trial run.
    # datasets_config=datasets_config_short,
    datasets_config=datasets_config_main,
    tissue_types='BSB',
    age_limits = [18, 90],
    age_limits_class_exclusions= ['PGS'],
    # presumably prepended to the names of the saved result files — TODO confirm
    experiment_prefix='prefinal1',
    delta_assumption = 'normal',
    correction_threshold=0.05,
    save_results=True,
    output_folder='./tmp_bench_results',
    verbose=1
)
bench.run()

10 models will be tested on 60 datasets.


Datasets:   2%|▏         | 1/60 [00:35<34:42, 35.29s/it]

GSE56046:AS - AA2 testing 776 disease versus 339 healthy samples


Datasets:   3%|▎         | 2/60 [00:40<17:13, 17.83s/it]

GSE56581:AS - AA2 testing 127 disease versus 66 healthy samples


Datasets:   5%|▌         | 3/60 [00:41<09:17,  9.79s/it]

GSE62867:IHD - AA1 testing 6 disease samples


Datasets:   7%|▋         | 4/60 [00:45<07:11,  7.70s/it]

GSE69138:CVA - AA1 testing 172 disease samples


Datasets:   8%|▊         | 5/60 [00:46<04:56,  5.39s/it]

GSE107143:AS - AA2 testing 8 disease versus 8 healthy samples


Datasets:  10%|█         | 6/60 [00:49<04:06,  4.56s/it]

GSE203399:CVA - AA1 testing 118 disease samples


Datasets:  12%|█▏        | 7/60 [00:52<03:27,  3.92s/it]

GSE32148:IBD - AA2 testing 5 disease versus 5 healthy samples


Datasets:  13%|█▎        | 8/60 [00:58<04:06,  4.74s/it]

GSE87640:IBD - AA2 testing 156 disease versus 84 healthy samples


Datasets:  15%|█▌        | 9/60 [01:09<05:28,  6.44s/it]

GSE87648:IBD - AA2 testing 203 disease versus 176 healthy samples


Datasets:  17%|█▋        | 10/60 [01:13<04:50,  5.82s/it]

GSE53840:HIV - AA1 testing 111 disease samples


Datasets:  18%|█▊        | 11/60 [01:15<03:40,  4.49s/it]

GSE53841:HIV - AA1 testing 24 disease samples


Datasets:  20%|██        | 12/60 [01:22<04:25,  5.53s/it]

GSE67705:HIV - AA2 testing 189 disease versus 91 healthy samples


Datasets:  22%|██▏       | 13/60 [01:26<03:53,  4.97s/it]

GSE67751:HIV - AA2 testing 23 disease versus 69 healthy samples


Datasets:  23%|██▎       | 14/60 [01:35<04:37,  6.04s/it]

GSE77696:HIV - AA2 testing 261 disease versus 117 healthy samples


Datasets:  25%|██▌       | 15/60 [01:43<05:07,  6.84s/it]

GSE100264:HIV - AA1 testing 386 disease samples


Datasets:  27%|██▋       | 16/60 [01:58<06:40,  9.11s/it]

GSE107080:HIV - AA1 testing 405 disease samples


Datasets:  28%|██▊       | 17/60 [02:11<07:30, 10.48s/it]

GSE117859:HIV - AA1 testing 608 disease samples


Datasets:  30%|███       | 18/60 [02:29<08:51, 12.65s/it]

GSE117860:HIV - AA1 testing 529 disease samples


Datasets:  32%|███▏      | 19/60 [02:32<06:42,  9.81s/it]

GSE140800:HIV - AA1 testing 70 disease samples


Datasets:  33%|███▎      | 20/60 [02:35<05:06,  7.67s/it]

GSE143942:HIV - AA2 testing 61 disease versus 12 healthy samples


Datasets:  35%|███▌      | 21/60 [02:38<04:04,  6.26s/it]

GSE145714:HIV_TB - AA2 testing 6 disease versus 9 healthy samples
GSE145714:TB - AA2 testing 12 disease versus 9 healthy samples


Datasets:  37%|███▋      | 22/60 [02:40<03:10,  5.01s/it]

GSE185389:HIV - AA1 testing 56 disease samples


Datasets:  38%|███▊      | 23/60 [02:43<02:41,  4.36s/it]

GSE185390:HIV - AA1 testing 30 disease samples


Datasets:  40%|████      | 24/60 [03:02<05:14,  8.73s/it]

GSE217633:HIV - AA2 testing 368 disease versus 43 healthy samples


Datasets:  42%|████▏     | 25/60 [03:19<06:34, 11.26s/it]

GSE42861:RA - AA2 testing 354 disease versus 335 healthy samples


Datasets:  43%|████▎     | 26/60 [03:21<04:51,  8.58s/it]

GSE71841:RA - AA2 testing 12 disease versus 12 healthy samples


Datasets:  45%|████▌     | 27/60 [03:23<03:38,  6.63s/it]

GSE99624:OP - AA2 testing 32 disease versus 16 healthy samples


Datasets:  47%|████▋     | 28/60 [03:32<03:48,  7.13s/it]

GSE131989:RA - AA2 testing 230 disease versus 123 healthy samples


Datasets:  48%|████▊     | 29/60 [03:36<03:12,  6.19s/it]

GSE134429:RA - AA2 testing 47 disease versus 17 healthy samples


Datasets:  50%|█████     | 30/60 [03:40<02:49,  5.66s/it]

GSE137593:OA - AA1 testing 5 disease samples
GSE137593:RA - AA1 testing 43 disease samples


Datasets:  52%|█████▏    | 31/60 [03:45<02:37,  5.44s/it]

GSE137594:OA - AA1 testing 6 disease samples
GSE137594:RA - AA1 testing 46 disease samples


Datasets:  53%|█████▎    | 32/60 [03:49<02:20,  5.01s/it]

GSE138653:RA - AA1 testing 80 disease samples


Datasets:  55%|█████▌    | 33/60 [03:54<02:16,  5.06s/it]

GSE175364:RA - AA2 testing 8 disease versus 13 healthy samples


Datasets:  57%|█████▋    | 34/60 [04:00<02:18,  5.33s/it]

GSE176168:RA - AA1 testing 113 disease samples


Datasets:  58%|█████▊    | 35/60 [04:03<01:54,  4.57s/it]

GSE228104:RA - AA1 testing 40 disease samples


Datasets:  60%|██████    | 36/60 [04:03<01:20,  3.34s/it]

GSE49909:OBS - AA2 testing 31 disease versus 40 healthy samples


Datasets:  62%|██████▏   | 37/60 [04:04<00:55,  2.43s/it]

GSE56606:T1D - AA2 testing 25 disease versus 61 healthy samples


Datasets:  63%|██████▎   | 38/60 [04:07<00:58,  2.67s/it]

GSE62003:T2D - AA1 testing 58 disease samples


Datasets:  65%|██████▌   | 39/60 [04:14<01:26,  4.10s/it]

GSE73103:OBS - AA2 testing 11 disease versus 214 healthy samples


Datasets:  67%|██████▋   | 40/60 [04:16<01:08,  3.40s/it]

GSE118468:COPD - AA2 testing 15 disease versus 6 healthy samples


Datasets:  68%|██████▊   | 41/60 [04:20<01:04,  3.40s/it]

GSE131752:WS - AA2 testing 18 disease versus 24 healthy samples
GSE131752:aWS - AA2 testing 3 disease versus 24 healthy samples
GSE131752:MDPS - AA2 testing 3 disease versus 24 healthy samples


Datasets:  70%|███████   | 42/60 [04:23<00:58,  3.26s/it]

GSE182991:HGPS - AA2 testing 8 disease versus 12 healthy samples
GSE182991:ncLMNA - AA2 testing 7 disease versus 12 healthy samples


Datasets:  72%|███████▏  | 43/60 [04:24<00:46,  2.71s/it]

GSE214297:CGL - AA2 testing 7 disease versus 9 healthy samples


Datasets:  73%|███████▎  | 44/60 [04:28<00:47,  2.96s/it]

GSE43976:MS - AA1 testing 52 disease samples


Datasets:  75%|███████▌  | 45/60 [04:39<01:21,  5.41s/it]

GSE59685:AD - AA2 testing 40 disease versus 9 healthy samples


Datasets:  77%|███████▋  | 46/60 [04:51<01:45,  7.56s/it]

GSE72774:PD - AA2 testing 287 disease versus 218 healthy samples


Datasets:  78%|███████▊  | 47/60 [04:54<01:19,  6.10s/it]

GSE72776:PD - AA2 testing 46 disease versus 38 healthy samples


Datasets:  80%|████████  | 48/60 [04:57<01:01,  5.09s/it]

GSE103929:MS - AA1 testing 49 disease samples


Datasets:  82%|████████▏ | 49/60 [05:04<01:04,  5.83s/it]

GSE106648:MS - AA2 testing 139 disease versus 139 healthy samples


Datasets:  83%|████████▎ | 50/60 [05:12<01:03,  6.34s/it]

GSE111223:PD - AA2 testing 128 disease versus 131 healthy samples


Datasets:  85%|████████▌ | 51/60 [05:26<01:19,  8.83s/it]

GSE111629:PD - AA2 testing 335 disease versus 236 healthy samples


Datasets:  87%|████████▋ | 52/60 [05:32<01:03,  7.93s/it]

GSE112596:MS - AA1 testing 112 disease samples


Datasets:  88%|████████▊ | 53/60 [05:34<00:42,  6.04s/it]

GSE130029:MS - AA2 testing 20 disease versus 11 healthy samples


Datasets:  90%|█████████ | 54/60 [05:36<00:29,  4.92s/it]

GSE130030:MS - AA2 testing 14 disease versus 14 healthy samples


Datasets:  92%|█████████▏| 55/60 [05:40<00:22,  4.51s/it]

GSE130491:MS - AA1 testing 82 disease samples


Datasets:  93%|█████████▎| 56/60 [05:47<00:20,  5.21s/it]

GSE144858:AD - AA2 testing 93 disease versus 96 healthy samples
GSE144858:MCI - AA2 testing 111 disease versus 96 healthy samples


Datasets:  95%|█████████▌| 57/60 [05:48<00:12,  4.18s/it]

GSE151355:PD - AA1 testing 19 disease samples


Datasets:  97%|█████████▋| 58/60 [05:54<00:09,  4.58s/it]

GSE156994:CJD - AA2 testing 114 disease versus 105 healthy samples


Datasets:  98%|█████████▊| 59/60 [05:58<00:04,  4.48s/it]

GSE219293:MS - AA2 testing 29 disease versus 18 healthy samples


Datasets: 100%|██████████| 60/60 [06:03<00:00,  6.05s/it]


GSE122244:PD - AA2 testing 35 disease versus 34 healthy samples
Compute MedAE metric based on 3095 healthy control samples.


In [11]:
# List the files in the benchmark output folder (shell escape; needs `ls`).
!ls -hl tmp_bench_results

total 88K
-rw-rw-r-- 1 dkriukov dkriukov  11K Apr  3 09:59 bench2_published_bench_adj_pvals.csv
-rw-rw-r-- 1 dkriukov dkriukov 4.2K Apr  3 09:59 bench2_published_bench_bools.csv
-rw-rw-r-- 1 dkriukov dkriukov  12K Apr  3 09:59 bench2_published_bench_pvals.csv
-rw-rw-r-- 1 dkriukov dkriukov 8.7K Apr 17 14:31 prefinal0_bench_adj_pvals.csv
-rw-rw-r-- 1 dkriukov dkriukov 5.1K Apr 17 14:31 prefinal0_bench_bools.csv
-rw-rw-r-- 1 dkriukov dkriukov  530 Apr 17 14:31 prefinal0_bench_CA_pred_MAE.csv
-rw-rw-r-- 1 dkriukov dkriukov  13K Apr 17 14:31 prefinal0_bench_pvals.csv
-rw-rw-r-- 1 dkriukov dkriukov  767 Apr 16 14:52 test2_bench_adj_pvals.csv
-rw-rw-r-- 1 dkriukov dkriukov  315 Apr 16 14:52 test2_bench_bools.csv
-rw-rw-r-- 1 dkriukov dkriukov  504 Apr 16 14:52 test2_bench_CA_pred_MAE.csv
-rw-rw-r-- 1 dkriukov dkriukov  769 Apr 16 14:52 test2_bench_pvals.csv


In [20]:
# Dataset identifiers for which the benchmark stored predictions.
keys = list(bench.datasets_predictions)

In [45]:
# Sanity check: print each dataset id followed by the grand mean of its
# predictions (mean of the per-column means).
# The former `k = keys[20]` line was dropped: the loop overwrote it at once,
# and it raised IndexError whenever fewer than 21 datasets were present.
for k in keys:
    print(k)
    print(bench.datasets_predictions[k].mean().mean())

GSE56046
65.4142860696623
GSE56581
55.13255337182477
GSE62867
31.215732548258
GSE69138
62.47126678961085
GSE107143
57.19305682109346
GSE203399
61.94609151920956
GSE32148
20.731623206433387
GSE87640
35.286675937340405
GSE87648
41.85642104183096
GSE53840
50.79653539049245
GSE53841
46.83137803468343
GSE67705
53.85686111243916
GSE67751
44.02221506354244
GSE77696
49.96474548804203
GSE100264
50.45620166710822
GSE107080
43.28720844178879
GSE117859
50.488792859419036
GSE117860
43.59981965326171
GSE140800
36.642266967190544
GSE143942
28.071588029362857
GSE145714
25.653688703690584
GSE185389
44.78235273123339
GSE185390
37.76696786113916
GSE217633
31.007763191675203
GSE42861
48.61753166953791
GSE71841
37.78540696887688
GSE99624
57.21610846278078
GSE131989
32.58802709740392
GSE134429
52.1182119804742
GSE137593
45.42194524318468
GSE137594
52.63327198459547
GSE138653
35.39955305383512
GSE175364
40.57425293269749
GSE176168
41.51377045469469
GSE228104
47.5018742986105
GSE49909
31.133025221238466
GSE56

In [46]:
# Load one gzip-compressed benchmark dataset straight from disk for inspection.
# NOTE(review): the unpacking relies on the pickled mapping yielding exactly two
# values in (data, meta) order — confirm against how these files are written.
# This also rebinds `data`, which an earlier cell used for a different object.
data, meta = pd.read_pickle('/tank/projects/computage/benchmarking/GSE56606.pkl.gz', compression='gzip').values()

In [56]:
# Minimum value per row. In the recorded output some rows bottom out near
# -3.4e38, which looks like a float32 placeholder rather than a real
# measurement — TODO confirm and clean before benchmarking.
data.min(axis=1)

GSM1365585    0.000000e+00
GSM1365586    0.000000e+00
GSM1365587    0.000000e+00
GSM1365588    0.000000e+00
GSM1365589    3.660000e-03
                  ...     
GSM1365680   -3.402823e+38
GSM1365681   -3.402823e+38
GSM1365682   -3.402823e+38
GSM1365683   -3.402823e+38
GSM1365684   -3.402823e+38
Length: 100, dtype: float32

In [12]:
# listmodels = glob('/tank/projects/computage/checkpoints/pls*pheno*ultra*')
# listmodels, len(listmodels)

# legacy

In [2]:
# from biolearn.data_library import GeoData
# from biolearn.model_gallery import ModelGallery

# published = False

# gallery = ModelGallery()

# bench_results_AAP = pd.DataFrame()
# bench_results_AA0 = pd.DataFrame()
# datasets_predictions = {}
# for gse, conf in datasets_config.items():
#     #import data
#     path, cond, test = conf.values()
#     dnam, meta = pd.read_pickle(path, compression='gzip').values()
#     data = GeoData(meta, dnam.T)

#     ###Predict datasets and gather predictions
#     #Note that by default clocks will impute missing data.
#     #To change this behavior set the imputation= parameter when getting the clock #???

#     predictions = {}
#     if published:
#         #published clocks prediction
#         keys = list(gallery.model_definitions.keys())
#         for k in tqdm(keys):
#             try:
#                 if gallery.model_definitions[k]['output']=='Age (Years)':
#                     results = gallery.get(k, imputation_method='none').predict(data)
#                     predictions[k] = results['Predicted']
#             except:
#                 print('Oops!')
#                 continue
#     else:
#         #de novo clocks prediction
#         for path in tqdm(listmodels):
#             k = splitext(basename(path))[0]
#             model = pickle.load(open(path, 'rb'))
#             try:
#                 dnam_ = dnam.reindex(columns = list(model.pls.feature_names_in_)).copy()
#             except:
#                 dnam_ = dnam.reindex(columns = list(model.feature_names_in_)).copy()
#                 dnam_ = dnam_.fillna(0.)
            
#             preds_ = model.predict(dnam_)
#             if type(preds_) == np.ndarray:
#                 predictions[k] = pd.Series(preds_, index=dnam.index)
#             else:
#                 predictions[k] = pd.Series(preds_.values, index=dnam.index)
        
#     pred = pd.DataFrame(predictions)
#     datasets_predictions[gse] = pred.copy()
#     #meta filtering
#     no_age_na_indices = meta[~meta['Age'].isna()].index
#     meta = meta.loc[no_age_na_indices]
#     if test == 'AAP':
#         #calculating mann-whitney test for difference in age acceleration between disease and healthy cohorts
#         disease_idx = meta.index[meta['Condition'] == cond]
#         healthy_idx = meta.index[meta['Condition'] == 'HC']
#         print(f'{cond}:{gse} - AAP testing {len(disease_idx)} disease versus {len(healthy_idx)} healthy samples')
#         pvals = {}
#         for col in pred.columns:
#             disease_true = meta.loc[disease_idx, 'Age'].values
#             healthy_true = meta.loc[healthy_idx, 'Age'].values
#             disease_pred = pred.loc[disease_idx, col].values
#             healthy_pred = pred.loc[healthy_idx, col].values
#             disease_delta = disease_pred - disease_true
#             healthy_delta = healthy_pred - healthy_true
#             stat, pval = mannwhitneyu(disease_delta, healthy_delta, alternative='greater')
#             pvals[col] = pval
#         bench_results_AAP[f'{cond}:{gse}:AAP'] = pd.Series(pvals)
#     elif test == 'AA0':
#         #calculating wilcoxon test for positive age (>0) acceleration in disease cohort
#         disease_idx = meta.index[meta['Condition'] == cond]
#         print(f'{cond}:{gse} - AA0 testing {len(disease_idx)} disease samples')
#         pvals = {}
#         for col in pred.columns:
#             disease_true = meta.loc[disease_idx, 'Age'].values
#             disease_pred = pred.loc[disease_idx, col].values
#             disease_delta = disease_pred - disease_true
#             stat, pval = wilcoxon(disease_delta, alternative='greater')
#             pvals[col] = pval
#         bench_results_AA0[f'{cond}:{gse}:AA0'] = pd.Series(pvals)
#     else:
#         NotImplementedError("Only two tests are currently available: ['AAP', 'AA0'].")
    

In [167]:
#dnam, meta = pd.read_pickle(datasets_config['GSE53840']['path'], compression='gzip').values()

In [3]:
# def correction(x):
#     return multipletests(x, method='fdr_bh')[1]

# bench_results = pd.concat([bench_results_AAP, bench_results_AA0], axis=1).dropna(axis=0)
# corrected_results_AAP = bench_results_AAP.T.apply(correction, axis=0).T < 0.05
# corrected_results_AA0 = bench_results_AA0.T.apply(correction, axis=0).T < 0.05
# corrected_results = pd.concat([corrected_results_AAP, corrected_results_AA0], axis=1)

# print(corrected_results.shape)
# corrected_results.sum(axis=1).sort_values(ascending=False)

# bench_results.to_csv('/tank/projects/computage/results/bench_results_pls_pheno_ultra.csv')
# corrected_results.to_csv('/tank/projects/computage/results/bench_results_pls_pheno_ultra.csv')

In [15]:
# Display a previously saved table of corrected benchmark results
# (first CSV column used as the row index).
pd.read_csv('/tank/projects/computage/results/bench_results_pre_noimputation_corrected.csv', index_col=0)  # append .sum(axis=1).sort_values(ascending=False) to rank rows by number of True values

Unnamed: 0,DS:GSE52588,Rheumatoid arthritis:GSE42861,AD:GSE59685,AD:GSE80970,Overweight:GSE73103,IBD:GSE87640,IBD:GSE87648,HT:GSE157131
Horvathv1,True,False,False,False,False,False,False,False
Hannum,True,True,False,False,False,True,True,False
Lin,True,False,False,False,False,True,False,False
PhenoAge,True,True,False,False,False,True,True,True
YingCausAge,True,False,False,False,False,False,False,False
YingDamAge,False,False,False,False,False,False,True,False
YingAdaptAge,True,False,False,False,False,False,False,False
Horvathv2,True,False,False,False,False,False,False,False
PEDBE,True,False,False,False,False,False,False,False
HRSInCHPhenoAge,True,True,False,False,True,True,True,True


In [13]:
# Display the saved corrected results table for the "ultrav1_train_pheno" run
# (first CSV column used as the row index).
pd.read_csv('/tank/projects/computage/results/bench_results_ultrav1_train_pheno_corrected.csv', index_col=0)  # append .sum(axis=1).sort_values(ascending=False) to rank rows by number of True values

Unnamed: 0,DS:GSE52588,Rheumatoid arthritis:GSE42861,AD:GSE59685,AD:GSE80970,Overweight:GSE73103,IBD:GSE87640,IBD:GSE87648,HT:GSE157131
pls1_11_pheno_ultrav1,True,False,False,False,False,False,False,False
pls1_7_kdm_pheno_ultrav1,True,False,False,False,False,False,True,False
pls1_6_pheno_ultrav1,True,False,False,False,False,False,False,False
pls1_1_kdm_pheno_ultrav1,True,True,False,False,False,False,False,False
pls1_12_kdm_pheno_ultrav1,True,False,False,False,False,False,True,False
pls1_6_kdm_pheno_ultrav1,True,False,False,False,False,False,True,False
pls1_7_pheno_ultrav1,True,False,False,False,False,False,False,False
pls1_13_kdm_pheno_ultrav1,True,False,False,False,False,False,True,False
pls1_10_pheno_ultrav1,True,False,False,False,False,False,False,False
pls1_1_pheno_ultrav1,True,True,False,False,False,False,False,False


In [23]:
# Display the saved corrected results table for the "kdm" run
# (first CSV column used as the row index).
pd.read_csv('/tank/projects/computage/results/bench_results_kdm_corrected.csv', index_col=0)

Unnamed: 0,DS:GSE52588,Rheumatoid arthritis:GSE42861,AD:GSE59685,AD:GSE80970,Overweight:GSE73103,IBD:GSE87640,IBD:GSE87648,HT:GSE157131
kdm_rse_all_20k,True,True,False,False,False,False,True,False
kdm_rse_forward_20k,False,False,False,False,True,True,False,False


In [84]:
# Boolean mask of results below the 0.05 threshold.
# NOTE(review): in this notebook `bench_results` is only assigned inside the
# commented-out legacy code, so this cell raises NameError on a fresh kernel
# unless the variable is defined elsewhere — confirm or remove.
bench_results_logical = bench_results < 0.05
bench_results_logical

In [49]:
# Show all model definitions as a table, one row per model.
# NOTE(review): `gallery` is only assigned inside the commented-out legacy code
# in this notebook, so this cell raises NameError on a fresh kernel unless it
# is defined elsewhere — confirm or remove.
pd.DataFrame(gallery.model_definitions).T

Unnamed: 0,year,species,tissue,source,output,model
Horvathv1,2013,Human,Multi-tissue,https://genomebiology.biomedcentral.com/articl...,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'Ho..."
Hannum,2013,Human,Blood,https://www.sciencedirect.com/science/article/...,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'Ha..."
Lin,2016,Human,Blood,https://www.aging-us.com/article/100908/text,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'Li..."
PhenoAge,2018,Human,Blood,https://www.aging-us.com/article/101414/text,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'Ph..."
YingCausAge,2022,Human,Blood,https://www.biorxiv.org/content/10.1101/2022.1...,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'Yi..."
YingDamAge,2022,Human,Blood,https://www.biorxiv.org/content/10.1101/2022.1...,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'Yi..."
YingAdaptAge,2022,Human,Blood,https://www.biorxiv.org/content/10.1101/2022.1...,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'Yi..."
Horvathv2,2018,Human,Skin + blood,https://www.aging-us.com/article/101508/text,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'Ho..."
PEDBE,2019,Human,Buccal,https://www.pnas.org/doi/10.1073/pnas.1820843116,Age (Years),"{'type': 'LinearMethylationModel', 'file': 'PE..."
Zhang_10,2019,Human,Blood,https://www.nature.com/articles/ncomms14617,Mortality Risk,"{'type': 'LinearMethylationModel', 'file': 'Zh..."
