In [1]:
import pandas as pd

import statsmodels.api as sm
import pandas as pd
import datetime
from pathlib import Path

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# !pip install statsmodels


# %config InlineBackend.figure_format = "svg"
# %config InlineBackend.print_figure_kwargs = {"dpi" : 300}
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


# from cadFace.vis import percentiles_plot
import sci_palettes

# Best-effort registration of the sci_palettes colormaps.
# NOTE(review): presumably this fails when the cell is re-run and the colormaps
# are already registered -- confirm. Only ordinary exceptions are ignored, so
# KeyboardInterrupt / SystemExit still propagate (a bare `except:` hid them).
try:
    sci_palettes.register_cmap()
except Exception:
    pass
import scienceplots
from pathlib import Path
import pandas as pd
from ppp_aging.ppp_model import *
from ppp_aging.model import generate_states_cols
import json

# Global figure styling: matplotlib style sheets first, then seaborn context
# (font scaling) and the default colour palette.
plt.style.use(["nature", "no-latex"])
sns.set_context(context="paper", font_scale=1.5)
sns.set_palette(palette="nejm")

In [2]:
from ppp_prediction.model import fit_best_model

In [3]:
# Covariate columns appended to each protein panel for the "+ RF" feature sets.
# NOTE(review): the saved output of the feature-set cell further down still
# lists tg_a / apob / dbp_a, so that output looks stale relative to this list --
# re-run the notebook to refresh it.
risk_factors = ["age", "sex", "ldl_a", "hdl_a", "tc_a", "sbp_a"]

# Outcome (event indicator) column and follow-up time column.
E = "incident_cad"
T = "future_cad_time_months"

In [4]:
# Pickled inputs produced outside this notebook (only load pickles you trust).
train_imputed = pd.read_pickle("1_train_imputed.pkl")
organ_combination = pd.read_pickle("olink_organ_genes.pkl")

# The test set keeps only rows with a PRS value and every risk factor present.
test_imputed = pd.read_pickle("1_test_imputed.pkl")
test_imputed = test_imputed.dropna(subset=["PRS", *risk_factors])

In [5]:
# Sanity check: number of organ panels, frame shapes, and outcome counts.
print(
    f"organ_genes have {len(organ_combination)} keys",
    f"train_imputed shape is {train_imputed.shape}, test_imputed shape is {test_imputed.shape}",
    f"train have {E}: {train_imputed[E].value_counts()}; test have {E}: {test_imputed[E].value_counts()}",
    sep="\n",
)

organ_genes have 13 keys
train_imputed shape is (40806, 2966), test_imputed shape is (8298, 2966)
train have incident_cad: 0.0    38544
1.0     2262
Name: incident_cad, dtype: int64; test have incident_cad: 0.0    7859
1.0     439
Name: incident_cad, dtype: int64


In [6]:
from collections import OrderedDict

# Re-order the organ panels from the smallest gene list to the largest.
# `sorted` is stable, so panels of equal size keep their original relative order.
organ_combination = OrderedDict(
    (organ, organ_combination[organ])
    for organ in sorted(
        organ_combination, key=lambda organ: len(organ_combination[organ])
    )
)
organ_combination

OrderedDict([('Adipose', ['FABP4', 'ADIPOQ', 'LEP']),
             ('Heart', ['BMP10', 'TNNI3', 'NPPB', 'PXDNL']),
             ('Kidney', ['PTH1R', 'ITGB6', 'PDZK1', 'UMOD', 'REN']),
             ('Lung', ['ALPP', 'AGER', 'CCL18', 'SFTPD', 'CSF2']),
             ('Muscle',
              ['TTN',
               'CA3',
               'ENO3',
               'IDI2',
               'ATP1B4',
               'SOD2',
               'MYBPC1',
               'EGLN1',
               'FGF6']),
             ('Artery',
              ['TNFRSF11B',
               'MFGE8',
               'VCAN',
               'THBS2',
               'ACAN',
               'BGN',
               'CYTL1',
               'SOST',
               'FRZB',
               'CRLF1']),
             ('Intestine',
              ['FGF19',
               'REG4',
               'PYY',
               'FABP2',
               'SPINK4',
               'CA7',
               'APOA4',
               'CDHR5',
               'CCL25',
          

In [7]:
# For every organ panel build three feature sets, inserted in this order:
#   "<organ>"             -> proteins only
#   "<organ> + RF"        -> proteins + risk factors
#   "<organ> + RF + PRS"  -> proteins + risk factors + polygenic risk score
organ_combination_all = OrderedDict()
for k, v in organ_combination.items():
    organ_combination_all.update(
        [
            (f"{k}", v),
            (f"{k} + RF", v + risk_factors),
            (f"{k} + RF + PRS", v + risk_factors + ["PRS"]),
        ]
    )
organ_combination_all

OrderedDict([('Adipose', ['FABP4', 'ADIPOQ', 'LEP']),
             ('Adipose + RF',
              ['FABP4',
               'ADIPOQ',
               'LEP',
               'age',
               'sex',
               'ldl_a',
               'hdl_a',
               'tc_a',
               'tg_a',
               'apob',
               'sbp_a',
               'dbp_a']),
             ('Adipose + RF + PRS',
              ['FABP4',
               'ADIPOQ',
               'LEP',
               'age',
               'sex',
               'ldl_a',
               'hdl_a',
               'tc_a',
               'tg_a',
               'apob',
               'sbp_a',
               'dbp_a',
               'PRS']),
             ('Heart', ['BMP10', 'TNNI3', 'NPPB', 'PXDNL']),
             ('Heart + RF',
              ['BMP10',
               'TNNI3',
               'NPPB',
               'PXDNL',
               'age',
               'sex',
               'ldl_a',
               'hdl_a',
               'tc

In [9]:
import pickle

# Persist the feature-set definitions. The `with` block guarantees the file
# handle is flushed and closed (the previous bare `open(...)` was never closed).
with open("organ_combination_all.pkl", "wb") as fh:
    pickle.dump(organ_combination_all, fh)

In [10]:
from collections import defaultdict
import pickle
from pathlib import Path

# In-memory store of fitted results, keyed by feature-set name.
Regression_model_result_dict = defaultdict(dict)

# Directory that receives one pickle per feature set.
output = "./1_regression_models/organ/"
# parents=True: the path is nested, so the intermediate "1_regression_models"
# directory must be created as well, otherwise mkdir fails on a fresh checkout.
Path(output).mkdir(parents=True, exist_ok=True)

In [11]:
from cuml import LogisticRegression, Lasso, Ridge, ElasticNet

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer
import numpy as np


def fit_best_model(train_df, test_df, X_var, y_var, method_list=None, cv=10, verbose=1):
    """Grid-search several linear models and keep the one with the best validation AUC.

    NOTE: this definition shadows the ``fit_best_model`` imported from
    ``ppp_prediction.model`` earlier in the notebook.

    Steps:
      1. Keep only ``y_var`` + ``X_var`` columns, drop rows with any missing
         value, and cast the label to ``int``.
      2. Hold out 10% of the training rows (``random_state=42``) as a
         validation set.
      3. For each candidate model run ``GridSearchCV`` on the remaining 90%,
         scored by ROC AUC.
      4. Rank the tuned candidates by their AUC on the validation set.
      5. Use the top-ranked estimator (as refit by ``GridSearchCV`` on the 90%
         split; it is not refit on train + validation) to score the
         recombined train + validation rows and the test rows.

    Lasso and ElasticNet are regressors: their continuous ``predict`` output is
    used directly as the ranking score for the AUC. Logistic uses the
    positive-class column of ``predict_proba``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must contain ``y_var`` and every column in ``X_var``.
    X_var : sequence of str (or a single str)
        Feature column names.
    y_var : str
        Binary outcome column name.
    method_list : iterable of str, optional
        Subset of ``{"Logistic", "Lasso", "ElasticNet"}`` to try. ``None``
        tries all of them.
    cv : int
        Number of cross-validation folds passed to ``GridSearchCV``.
    verbose : int
        Verbosity passed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_model, train_metrics, test_metrics, train_df, test_df, ranked_models)``
        where ``train_metrics`` / ``test_metrics`` are dicts holding
        ``"train_auc"`` / ``"test_auc"``, the returned frames carry an extra
        ``f"{y_var}_pred"`` column, and ``ranked_models`` is a list of
        ``(name, best_estimator, grid_search, validation_auc)`` tuples sorted by
        validation AUC, best first.

    Raises
    ------
    ValueError
        If ``method_list`` matches none of the available models.
    """
    models_params = {
        "Logistic": {
            "model": LogisticRegression(
                solver="qn", random_state=42, class_weight="balanced"
            ),
            "param_grid": {
                "C": np.logspace(-4, 4, 10),  # range of C, log-spaced
                "penalty": ["l1", "l2"],  # regularisation type
            },
        },
        "Lasso": {
            "model": Lasso(),
            "param_grid": {
                "alpha": np.logspace(-4, 4, 10),
            },
        },
        "ElasticNet": {
            "model": ElasticNet(),
            "param_grid": {
                "alpha": np.logspace(-4, 4, 10),
                "l1_ratio": np.linspace(0, 1, 10),
            },
        },
    }
    if method_list is not None:
        available = list(models_params)
        models_params = {k: v for k, v in models_params.items() if k in method_list}
        if not models_params:
            # Fail early with a clear message instead of an IndexError after the loop.
            raise ValueError(
                f"method_list={method_list!r} matches none of the available models {available}"
            )

    # Normalise the feature names to a list so tuples / a single name also work.
    X_var = [X_var] if isinstance(X_var, str) else list(X_var)

    def _predict_score(name, model, X):
        # Continuous score used for AUC: positive-class probability for the
        # classifier, raw regression output for Lasso / ElasticNet.
        if name == "Logistic":
            return model.predict_proba(X.values)[:, 1]
        return model.predict(X.values)

    train_df = train_df[[y_var] + X_var].copy().dropna()
    test_df = test_df[[y_var] + X_var].copy().dropna()
    train_df[y_var] = train_df[y_var].astype(int)
    test_df[y_var] = test_df[y_var].astype(int)

    # Hold out 10% of the training rows for model selection.
    train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

    X_train = train_df[X_var]
    y_train = train_df[y_var]
    X_val = val_df[X_var]
    y_val = val_df[y_var]

    X_test = test_df[X_var]
    y_test = test_df[y_var]
    print(
        f"train shape: {X_train.shape}, val shape is {X_val.shape}, test shape is {X_test.shape}"
    )
    best_models = []

    for model_name, mp in models_params.items():
        # NOTE(review): `needs_proba` is deprecated in newer scikit-learn
        # releases in favour of `response_method` -- confirm against the
        # installed version before upgrading.
        if model_name == "Logistic":
            scorer = make_scorer(roc_auc_score, needs_proba=True)
        else:
            scorer = make_scorer(roc_auc_score)

        grid_search = GridSearchCV(
            mp["model"], mp["param_grid"], scoring=scorer, cv=cv, verbose=verbose
        )
        grid_search.fit(X_train.values, y_train.values)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        auc = roc_auc_score(y_val, _predict_score(model_name, best_model, X_val))
        print(f"model: {model_name}\tBest parameters: {best_params}, with auc: {auc}")
        best_models.append((model_name, best_model, grid_search, auc))

    # Restore the original training frame (train + validation rows).
    train_df = pd.concat([train_df, val_df], axis=0)
    X_train = train_df[X_var]
    y_train = train_df[y_var]

    # Rank candidates by validation AUC, best first.
    ranked_models = list(sorted(best_models, key=lambda x: x[-1], reverse=True))
    best_model_name, best_model, *_ = ranked_models[0]

    train_pred = _predict_score(best_model_name, best_model, X_train)
    test_pred = _predict_score(best_model_name, best_model, X_test)

    train_df[f"{y_var}_pred"] = train_pred
    test_df[f"{y_var}_pred"] = test_pred

    train_metrics = {
        "train_auc": roc_auc_score(y_train, train_pred),
    }
    test_metrics = {
        "test_auc": roc_auc_score(y_test, test_pred),
    }
    return best_model, train_metrics, test_metrics, train_df, test_df, ranked_models

In [12]:
# %%time
import numpy as np

# Fit every feature set that does not already have a pickle on disk.
methods = ["Lasso", "ElasticNet", "Logistic"]
cv = 10
for k, combination in organ_combination_all.items():
    result_path = Path(f"{output}/{k}.pkl")
    # The pickle on disk acts as the cache: skip anything already fitted.
    # Note that skipped results are NOT loaded into Regression_model_result_dict.
    if result_path.exists():
        print(f"{k} already exists")
        continue
    if k in Regression_model_result_dict:
        print(f"{k} already in result dict")
        continue
    (
        model,
        train_metrics,
        test_metrics,
        train_imputed_data,
        test_imputed_data,
        best_models,
    ) = fit_best_model(
        train_df=train_imputed,
        test_df=test_imputed,
        X_var=combination,
        y_var=E,
        method_list=methods,
        cv=cv,
    )
    print(
        f"{k} train auc: {train_metrics['train_auc']}, test auc: {test_metrics['test_auc']}"
    )
    all_obj = {
        "model": model,
        "train_metrics": train_metrics,
        "test_metrics": test_metrics,
        "train_data": train_imputed_data,
        "test_data": test_imputed_data,
        "best_models": best_models,
    }
    # Write to a temporary file and rename it afterwards, so that interrupting
    # the cell mid-dump cannot leave a truncated "<k>.pkl" that the exists-check
    # above would later treat as a finished result. The `with` block also
    # closes the file handle, which the previous bare `open(...)` never did.
    tmp_path = result_path.with_name(result_path.name + ".tmp")
    with open(tmp_path, "wb") as fh:
        pickle.dump(all_obj, fh)
    tmp_path.replace(result_path)
    Regression_model_result_dict[k] = all_obj

Adipose already exists
Adipose + RF already exists
Adipose + RF + PRS already exists
Heart already exists
Heart + RF already exists
Heart + RF + PRS already exists
Kidney already exists
Kidney + RF already exists
Kidney + RF + PRS already exists
Lung already exists
Lung + RF already exists
Lung + RF + PRS already exists
Muscle already exists
Muscle + RF already exists
Muscle + RF + PRS already exists
Artery already exists
Artery + RF already exists
Artery + RF + PRS already exists
Intestine already exists
Intestine + RF already exists
Intestine + RF + PRS already exists
Pancreas already exists
Pancreas + RF already exists
Pancreas + RF + PRS already exists
Brain already exists
Brain + RF already exists
Brain + RF + PRS already exists
Liver already exists
Liver + RF already exists
Liver + RF + PRS already exists
organismal already exists
organismal + RF already exists
organismal + RF + PRS already exists
Immune already exists
Immune + RF already exists
Immune + RF + PRS already exists
[



[W] [14:46:26.064761] QWL-QN stopped, because the line search failed to advance (step delta = 0.000000)
[W] [14:46:42.553078] QWL-QN stopped, because the line search failed to advance (step delta = 0.000000)
[W] [14:46:48.687407] QWL-QN stopped, because the line search failed to advance (step delta = 0.000000)
[W] [14:50:10.939153] QWL-QN: max iterations reached
[W] [14:50:10.939616] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [14:50:26.329998] QWL-QN: max iterations reached
[W] [14:50:26.330648] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [14:50:39.401548] QWL-QN: max iterations reached
[W] [14:50:39.402239] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number

KeyboardInterrupt: 

In [15]:
# Display the name of every feature set held in organ_combination_all.
organ_combination_all.keys()

odict_keys(['Adipose', 'Adipose + RF', 'Adipose + RF + PRS', 'Heart', 'Heart + RF', 'Heart + RF + PRS', 'Kidney', 'Kidney + RF', 'Kidney + RF + PRS', 'Lung', 'Lung + RF', 'Lung + RF + PRS', 'Muscle', 'Muscle + RF', 'Muscle + RF + PRS', 'Artery', 'Artery + RF', 'Artery + RF + PRS', 'Intestine', 'Intestine + RF', 'Intestine + RF + PRS', 'Pancreas', 'Pancreas + RF', 'Pancreas + RF + PRS', 'Brain', 'Brain + RF', 'Brain + RF + PRS', 'Liver', 'Liver + RF', 'Liver + RF + PRS', 'organismal', 'organismal + RF', 'organismal + RF + PRS', 'Immune', 'Immune + RF', 'Immune + RF + PRS', 'conventional', 'conventional + RF', 'conventional + RF + PRS'])