In [None]:
import pandas as pd
import numpy as np
import pathlib as pl

from tqdm.notebook import tqdm


In [None]:
def horvath_transformed_age(age: float, adult_age: float = 20) -> float:
    """Map a chronological age onto the piecewise log/linear transformed scale.

    Ages up to ``adult_age`` are transformed logarithmically, older ages
    linearly. Both pieces evaluate to 0 at ``age == adult_age``.

    Parameters
    ----------
    age : float
        Age to transform (scalar; the branch is chosen with a plain ``if``).
    adult_age : float, default 20
        Age at which the transform switches from logarithmic to linear.
        The default reproduces the original hard-coded constants 20 and 21.

    Returns
    -------
    float
        ``log(age + 1) - log(adult_age + 1)`` when ``age <= adult_age``,
        otherwise ``(age - adult_age) / (adult_age + 1)``.
    """
    if age <= adult_age:
        return np.log(age + 1) - np.log(adult_age + 1)
    return (age - adult_age) / (adult_age + 1)

def inverse_horvath_age(tr_age: float, adult_age: float = 20) -> float:
    """Convert a transformed age back to the chronological scale.

    Non-positive transformed values are mapped back through the exponential
    branch, positive values through the linear branch.

    Parameters
    ----------
    tr_age : float
        Transformed age (scalar; the branch is chosen with a plain ``if``).
    adult_age : float, default 20
        Age at which the transform switches from logarithmic to linear.
        The default reproduces the original hard-coded constants 20 and 21.

    Returns
    -------
    float
        ``(adult_age + 1) * exp(tr_age) - 1`` when ``tr_age <= 0``,
        otherwise ``(adult_age + 1) * tr_age + adult_age``.
    """
    if tr_age <= 0:
        return (adult_age + 1) * np.exp(tr_age) - 1
    return (adult_age + 1) * tr_age + adult_age

def predict_age(model_df: pd.DataFrame, methylation: pd.DataFrame) -> pd.Series:
    """Predict an age for every sample from a linear clock model.

    Parameters
    ----------
    model_df : pd.DataFrame
        Model table indexed by probe ID plus an ``"intercept"`` row. Must
        provide a ``"coefs"`` column (weights) and a ``"median"`` column
        (fallback value for probes absent from ``methylation``).
    methylation : pd.DataFrame
        One row per sample, one column per probe.

    Returns
    -------
    pd.Series
        Predicted age per sample, indexed like ``methylation``.

    Notes
    -----
    NaN entries in ``methylation`` are skipped by the row sum (pandas'
    default ``skipna``), i.e. they contribute 0 to the linear predictor.
    """
    lost_probes = np.setdiff1d(model_df.index.to_numpy(), methylation.columns.to_numpy())
    lost_probes = np.setdiff1d(lost_probes, ["intercept"])
    if len(lost_probes) > 0:
        print(f"There are {len(lost_probes)} missing probes; will replace by median value in the original set")

    # Keep only the probes the model knows. A column literally named
    # "intercept" is not a probe and would otherwise be weighted by the
    # intercept value and counted on top of the constant added below.
    shared_probes = methylation.columns.intersection(model_df.index).drop("intercept", errors="ignore")
    red_meth = methylation[shared_probes]

    # Fill the absent probes with the model's median, broadcast to every
    # sample in one step. This also works when there are zero samples, where
    # concatenating an empty list of frames would raise.
    fill_values = model_df.loc[lost_probes, "median"]
    missing_probes = pd.DataFrame(
        np.tile(fill_values.to_numpy(), (red_meth.shape[0], 1)),
        index=red_meth.index,
        columns=fill_values.index,
    )
    red_meth = pd.concat([red_meth, missing_probes], axis=1)

    # Linear predictor on the transformed scale: weighted sum + intercept.
    coefs = model_df.loc[red_meth.columns, "coefs"]
    scaled_age = (red_meth * coefs).sum(axis=1) + model_df.loc["intercept", "coefs"]

    # Back-transform each sample's value to the age scale.
    return scaled_age.apply(inverse_horvath_age)

In [None]:
data_dir = pl.Path("/add/path/here/")

# Tab-separated, header-less table; the first column becomes the index.
horvath_probes = pd.read_csv(
    data_dir / "horvath_cpg.csv",
    sep="\t",
    header=None,
    index_col=0,
)
horvath_probes.columns = ["coefs", "shrinked_coef", "median"]

# Index labels of the rows with a non-zero coefficient, minus the first one.
# NOTE(review): the first row is presumably the intercept — confirm against the file.
nonzero_rows = horvath_probes.loc[horvath_probes["coefs"].ne(0)]
cpgs = nonzero_rows.index[1:].to_numpy()

In [None]:
import os
from glob import glob
def find_csv_files(root: str, pattern: str = "*.csv", exclude: str = "sample_sheet") -> list:
    """Recursively collect files below ``root`` whose name matches ``pattern``.

    Parameters
    ----------
    root : str
        Directory to walk. A non-existent directory yields an empty list.
    pattern : str, default "*.csv"
        Glob pattern applied inside every visited directory.
    exclude : str, default "sample_sheet"
        Paths containing this substring anywhere (directory part included)
        are skipped.

    Returns
    -------
    list
        Matching file paths, in the order produced by ``os.walk``/``glob``.
    """
    return [
        file
        for path, _subdirs, _files in os.walk(root)
        for file in glob(os.path.join(path, pattern))
        if exclude not in file
    ]


PATH1 = "/add/path/here/"
EXT = "*.csv"
all_csv_files_4 = find_csv_files(PATH1, EXT)
PATH2 = "/add/path/here/"
all_csv_files_rest = find_csv_files(PATH2, EXT)

# Files found under PATH2 come first, followed by those found under PATH1.
all_csv_files = np.append(all_csv_files_rest, all_csv_files_4)

def load_sample_sheet(sheet_path) -> pd.DataFrame:
    """Read a sample-sheet CSV and index it by ``"<Sentrix_ID>_<Sentrix_Position>"``.

    Parameters
    ----------
    sheet_path : str or path-like
        Location of the CSV; its first column is read as the initial index
        and then replaced by the combined Sentrix key.

    Returns
    -------
    pd.DataFrame
        The sheet, re-indexed by the combined key (both parts cast to ``str``).
    """
    sheet = pd.read_csv(sheet_path, index_col=0)
    sheet.index = sheet["Sentrix_ID"].astype(str) + "_" + sheet["Sentrix_Position"].astype(str)
    return sheet


sheet_dir = pl.Path("/add/path/here/")

sample_sheet_rest = load_sample_sheet(sheet_dir / "SWEPIC_full_sample_sheet.csv")
mapping_rest = sample_sheet_rest["Sample_Name"].astype(str).to_dict()

# NOTE(review): the right-hand operand is an absolute path, so pathlib discards
# `sheet_dir` here and the file is read from "/add/path/here/" directly.
# Use a relative name if the sheet is meant to live under `sheet_dir`.
sample_sheet_4 = load_sample_sheet(sheet_dir / "/add/path/here/sample_sheet_EPIC4.csv")
mapping_4 = sample_sheet_4["Sample_Name"].astype(str).to_dict()

# Combined key -> Sample_Name lookup; on a key clash the entry from
# `mapping_4` wins.
mapping = {**mapping_rest, **mapping_4}

# Collect one Series of beta values per input file, named after the first two
# underscore-separated parts of the file name so it lines up with the
# "<Sentrix_ID>_<Sentrix_Position>" keys of `mapping`.
full_meth = []
for file in tqdm(all_csv_files):
    probe_table = pd.read_csv(file, index_col=0)

    # os.path.basename honours the platform's path separator, unlike a
    # hard-coded split on "/" (os.path.join builds these paths with os.sep).
    name_parts = os.path.basename(file).split("_")
    sample_key = name_parts[0] + "_" + name_parts[1]

    # Keep only rows whose poobah_pval is below 0.05.
    probe_table = probe_table[probe_table.poobah_pval < 0.05]

    # Restrict to the probes listed in `cpgs`; probes filtered out above are
    # simply absent and turn into NaN when the Series are concatenated.
    beta = probe_table.loc[probe_table.index.intersection(cpgs), "beta_value"]
    beta.name = sample_key
    full_meth.append(beta)

In [None]:
# Line the per-file Series up side by side (one column per file), translate
# the column labels through `mapping`, then transpose so that each row is a
# sample and each column a probe.
per_file_columns = pd.concat(full_meth, axis=1)
meth_for_clocks = per_file_columns.rename(columns=mapping).T

In [None]:
# Drop every probe (column) that is missing in more than 20% of the samples.
qt_20 = int(0.2 * meth_for_clocks.shape[0])
nan_per_probe = meth_for_clocks.isna().sum(axis=0)
meth_for_clocks = meth_for_clocks.loc[:, nan_per_probe <= qt_20]

In [None]:
# Impute the remaining gaps with each probe's (column's) median across samples.
probe_medians = meth_for_clocks.median()
meth_for_clocks = meth_for_clocks.fillna(probe_medians)

In [None]:
# Read the full model table again (all rows, including the intercept) and
# give the intercept row the label that `predict_age` looks up.
clock_table = pd.read_csv(
    data_dir / "horvath_cpg.csv",
    sep="\t",
    header=None,
    index_col=0,
)
clock_table.columns = ["coefs", "shrinked_coef", "median"]
horvath_probes = clock_table.rename(index={"(Intercept)": "intercept"})

In [None]:
# Hand over only the columns that also appear in the model table's index.
model_columns = meth_for_clocks.columns.intersection(horvath_probes.index)
clock_input = meth_for_clocks[model_columns]
y_pred_horvath = predict_age(model_df=horvath_probes, methylation=clock_input)

In [None]:
# Write the predictions to <data_dir>/auxiliary/horvath_age.csv.
# `to_csv` does not create missing directories, so make sure the target
# folder exists first (only the last level; `data_dir` itself must exist).
output_dir = data_dir / "auxiliary"
output_dir.mkdir(exist_ok=True)
y_pred_horvath.to_csv(output_dir / "horvath_age.csv")