In [1]:
import pandas as pd

import statsmodels.api as sm
import pandas as pd
import datetime
from pathlib import Path

# Show every column when a DataFrame is displayed (None disables the column limit).
pd.set_option("display.max_columns", None)
# !pip install statsmodels


# %config InlineBackend.figure_format = "svg"
# %config InlineBackend.print_figure_kwargs = {"dpi" : 300}
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


# from cadFace.vis import percentiles_plot
import sci_palettes

# Best-effort registration of the sci_palettes colormaps. A failure here is
# deliberately ignored so the notebook keeps running (presumably this happens
# when the cell is re-run and the colormaps already exist - TODO confirm).
try:
    sci_palettes.register_cmap()
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate.
    pass
import scienceplots
from pathlib import Path
import pandas as pd
from ppp_aging.ppp_model import *
from ppp_aging.model import generate_states_cols

# Global figure styling for the whole notebook.
# NOTE(review): the "nature" / "no-latex" styles are presumably provided by the
# `scienceplots` import above, and the "nejm" palette by `sci_palettes` - confirm.
plt.style.use(["nature", "no-latex"])
sns.set_context("paper", font_scale=1.5)
sns.set_palette("nejm")

In [2]:
import json

part2_dir = "/home/xutingfeng/ukb/project/ppp_prediction/result/part2"
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this project.
data = pd.read_pickle("result/part1/data.pkl")

# Covariates adjusted for in the association models: assessment centre, age,
# sex and PC1..PC10.
cofounders = ["assessment_center", "age", "sex", *[f"PC{i}" for i in range(1, 11)]]
combination_dict_dir = f"{part2_dir}/dataset/combination_dict.json"
# Read the JSON inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(combination_dict_dir, "r") as combination_file:
    combination_dict = json.load(combination_file)
features = combination_dict["proteomics"]["features"]

In [3]:
## Add k-year follow-up columns (event indicator and time-to-event)


def add_k_year_follow(df, k=5, unit="month"):
    """Add k-year CAD event and follow-up-time columns to ``df`` in place.

    Two columns are written:

    * ``"{k}_years_cad"``: 1 when ``year_of_cad_after_recuit <= k``, else 0.
    * ``"{k}_years_cad_{unit}"``: follow-up time. For events it is the span
      from ``recuit_date`` to ``cad_date``. For non-events it is the span
      from ``recuit_date`` to ``death_date`` when that is at most
      ``k * 365`` days, otherwise ``k * 365`` days.

    Days are converted with fixed factors (30 per month, 365 per year).

    Args:
        df: Frame holding ``year_of_cad_after_recuit``, ``cad_date``,
            ``death_date`` and ``recuit_date``.
        k: Follow-up horizon in years.
        unit: Unit of the time column: ``"month"``, ``"year"`` or ``"day"``.

    Returns:
        ``(df, event_column_name, time_column_name)``; ``df`` is the same
        object that was passed in.
    """
    assert unit in ["month", "year", "day"]

    E = f"{k}_years_cad"
    T = f"{k}_years_cad_{unit}"
    horizon_days = k * 365
    # Days per output unit; "day" has no entry and is returned unscaled.
    days_per_unit = {"month": 30, "year": 365}

    onset_years = df["year_of_cad_after_recuit"]
    df[E] = (onset_years <= k).astype(int)

    def survive_time(row):
        onset = row["cad_date"]
        died = row["death_date"]
        enrolled = row["recuit_date"]

        if row[E] == 1:
            # CAD within k years: time runs from recruitment to diagnosis.
            days = (onset - enrolled).days
        elif row[E] == 0:
            if pd.isnull(died):
                # No CAD within k years and no recorded death: full horizon.
                days = horizon_days
            else:
                # No CAD within k years but a death date exists: stop at
                # death if it falls inside the horizon, else at the horizon.
                days_to_death = (died - enrolled).days
                days = days_to_death if days_to_death <= horizon_days else horizon_days

        if unit in days_per_unit:
            return days / days_per_unit[unit]
        return days

    df[T] = df.apply(survive_time, axis=1)
    return df, E, T


# Add event/time columns for the 3-, 5- and 10-year horizons. `unit` is left at
# its default ("month"), so each T_* column holds follow-up time as days / 30.
# `add_k_year_follow` writes the columns into `data` itself and returns it, so
# the three calls accumulate columns on the same frame.
data, E_3, T_3 = add_k_year_follow(data, k=3)
data, E_5, T_5 = add_k_year_follow(data, k=5)
data, E_10, T_10 = add_k_year_follow(data, k=10)

## age bin 划分

In [4]:
# Stratify participants into three age bands. The intervals are right-closed,
# i.e. (0, 60], (60, 65] and (65, 70], so the "<60" band also contains age 60
# exactly and ages above 70 get a missing `age_bin`.
data["age_bin"] = pd.cut(
    x=data["age"],
    bins=[0, 60, 65, 70],
    labels=["<60", "60-65", "65-70"],
    right=True,
)

# Incident-CAD counts within each age band.
data.groupby(["age_bin"]).value_counts(subset=["incident_cad"])

age_bin  incident_cad
<60      0.0             30132
         1.0              1074
60-65    0.0             11183
         1.0               893
65-70    0.0              7292
         1.0               865
dtype: int64

In [5]:
# 3-year CAD event counts within each age band.
data.groupby(["age_bin"]).value_counts(subset=[E_3])

age_bin  3_years_cad
<60      0              31034
         1                172
60-65    0              11894
         1                182
65-70    0               8006
         1                151
dtype: int64

In [6]:
# 5-year CAD event counts within each age band.
data.groupby(["age_bin"]).value_counts(subset=[E_5])

age_bin  5_years_cad
<60      0              30860
         1                346
60-65    0              11744
         1                332
65-70    0               7858
         1                299
dtype: int64

In [7]:
# 10-year CAD event counts within each age band.
data.groupby(["age_bin"]).value_counts(subset=[E_10])

age_bin  10_years_cad
<60      0               30394
         1                 812
60-65    0               11362
         1                 714
65-70    0                7474
         1                 683
dtype: int64

## 关联分析

In [8]:
import numpy as np

from ppp_prediction.corr import cal_corr, generate_multipletests_result


save_dir = "/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc"

cofounders = ["assessment_center", "age", "sex", *[f"PC{i}" for i in range(1, 11)]]
proteomics = combination_dict["proteomics"]["features"]


def _rank_by_significance(assoc_df):
    """Return ``assoc_df`` sorted by ``LOG10P`` (descending) with a 0-based ``rank`` column.

    Any existing ``rank`` column is replaced, so the helper can be applied both
    to freshly computed results and to results re-loaded from a cached CSV.
    """
    ranked = assoc_df.drop(columns="rank", errors="ignore")
    ranked = ranked.sort_values("LOG10P", ascending=False).reset_index(drop=True)
    ranked.insert(0, "rank", range(len(ranked)))
    return ranked


# Per-protein association results for each outcome label within each age band.
# Each (label, age band) result is cached as its own CSV so an interrupted run
# can resume without recomputing finished combinations.
res = []
for label in ["incident_cad", E_3, E_5, E_10]:

    for age_bin, age_bin_df in data.groupby("age_bin"):
        c_save_dir = f"{save_dir}/{label}/{age_bin}.csv"
        Path(c_save_dir).parent.mkdir(parents=True, exist_ok=True)
        if Path(c_save_dir).exists():
            print(f"{c_save_dir} exists")
            # Re-load the cached result so it still ends up in `all.csv`;
            # previously cached combinations were silently left out, and
            # `pd.concat` failed on an empty list when everything was cached.
            res.append(_rank_by_significance(pd.read_csv(c_save_dir)))
            continue
        print(f"age_bin: {age_bin}, label: {label}")
        single_association_proteins_result_df = cal_corr(
            age_bin_df, proteomics, y=label, cofounders=cofounders, model_type="logit"
        )  # uses the original (non-imputed) data

        single_association_proteins_result_df["LOG10P"] = -np.log10(
            single_association_proteins_result_df["pvalue"].astype(float)
        )

        single_association_proteins_result_df = generate_multipletests_result(
            single_association_proteins_result_df
        )
        single_association_proteins_result_df["age_bin"] = age_bin
        single_association_proteins_result_df["label"] = label
        # Keep the sorted/ranked frame: the original computed it and then
        # discarded the result, so the saved CSV was neither sorted nor ranked.
        single_association_proteins_result_df = _rank_by_significance(
            single_association_proteins_result_df
        )
        single_association_proteins_result_df.to_csv(c_save_dir, index=False)

        res.append(single_association_proteins_result_df)

res_df = pd.concat(res, ignore_index=True)
res_df.to_csv(f"{save_dir}/all.csv", index=False)

/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/incident_cad/<60.csv exists
/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/incident_cad/60-65.csv exists
/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/incident_cad/65-70.csv exists
/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/3_years_cad/<60.csv exists
/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/3_years_cad/60-65.csv exists
/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/3_years_cad/65-70.csv exists
/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/5_years_cad/<60.csv exists
/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/5_years_cad/60-65.csv exists
/home/xutingfeng/ukb/project/ppp_prediction/result/part4/age_specific_assoc/5_years_cad/65-70.csv exists


Output()

age_bin: <60, label: 10_years_cad


  for x_ in tqdm(


In [11]:
# Preview the combined association results. `res_df[0]` is a column lookup and
# raises KeyError on this frame, which has no column named 0.
res_df.head()

NameError: name 'res_df' is not defined