In [26]:
from datetime import datetime
import os
import statsmodels.api as sm
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt

In [None]:
# Workspace settings come from environment variables; each value is None
# when the corresponding variable is not set.
DATASET = os.environ.get('WORKSPACE_CDR')
bucket = os.environ.get('WORKSPACE_BUCKET')

In [3]:
# Seed NumPy's global (legacy) random state.
np.random.seed(seed=123)

## Array

### Continuous Phenotype R2

In [4]:
# Tab-separated inputs read from the workspace bucket.
# The sample table is indexed by "person_id"; each per-trait array score
# table is indexed by its "s" column.
Sample_all_quant = pd.read_csv(
    f"{bucket}/Pheno/quant_all.tsv", sep="\t", index_col="person_id"
)
Height_df = pd.read_csv(
    f"{bucket}/Scores/Array/Height_PRScs.gz", sep="\t", index_col="s"
)
DBP_df = pd.read_csv(
    f"{bucket}/Scores/Array/DBP_PRScs.gz", sep="\t", index_col="s"
)
HDL_df = pd.read_csv(
    f"{bucket}/Scores/Array/HDL_PRScs.gz", sep="\t", index_col="s"
)
TC_df = pd.read_csv(
    f"{bucket}/Scores/Array/TC_PRScs.gz", sep="\t", index_col="s"
)
RBC_df = pd.read_csv(
    f"{bucket}/Scores/Array/RBC_PRScs.gz", sep="\t", index_col="s"
)
leukocyte_df = pd.read_csv(
    f"{bucket}/Scores/Array/leukocyte_PRScs.gz", sep="\t", index_col="s"
)

In [5]:
def Compute_R2_quant(phenodf, pheno, anc):
    """Incremental R2 of the polygenic score for a quantitative phenotype.

    Within one predicted-ancestry group, two OLS models are fitted: a null
    model on the covariates only and a full model on the covariates plus the
    ``pgs`` column. The result is ``rsquared(full) - rsquared(null)``.

    Both models are fitted on the same complete-case rows (no missing value
    in any covariate or in ``pgs``). Letting each model drop its own missing
    rows would compare R2 values computed on different samples whenever
    ``pgs`` is missing for rows whose covariates are complete.

    Parameters
    ----------
    phenodf : pandas.DataFrame
        Table holding the column named by ``pheno``, every covariate column
        listed below and a ``pgs`` column.
    pheno : str
        Name of the phenotype column used as the response.
    anc : str
        Ancestry suffix. Rows are kept where the module-level table
        ``Sample_all_quant["is_anc_pred_" + anc]`` equals 1.

    Returns
    -------
    float
        R2 of the full model minus R2 of the covariates-only model.

    Raises
    ------
    ValueError
        If no row is left after the ancestry, phenotype and complete-case
        filters.
    """
    covariates = ['Age', 'is_sex_Male', 'is_sex_Female', 'PC1', 'PC2', 'PC3', 'PC4',
                  'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13',
                  'PC14', 'PC15', 'PC16']
    score_col = 'pgs'  # column expected to hold the polygenic score

    # Keep rows of the requested ancestry whose phenotype is observed.
    # The ancestry flag is read from the global sample table, not phenodf.
    in_ancestry = Sample_all_quant["is_anc_pred_" + anc] == 1
    has_pheno = ~phenodf[pheno].isna()
    subset = phenodf.loc[in_ancestry & has_pheno]

    predictors = subset[covariates + [score_col]].astype("float")
    response = subset[pheno].astype("float")

    # Restrict to rows usable by BOTH models so the two R2 values are
    # computed on exactly the same observations.
    complete = predictors.notna().all(axis=1)
    predictors = predictors.loc[complete]
    response = response.loc[complete]
    if predictors.empty:
        raise ValueError(
            f"No complete observations for phenotype {pheno!r} in ancestry {anc!r}"
        )

    design_null = sm.add_constant(predictors[covariates])
    design_full = sm.add_constant(predictors)

    # Null model: covariates only. Full model: covariates + PGS.
    model0 = sm.OLS(response, design_null, missing='drop').fit()
    model1 = sm.OLS(response, design_full, missing='drop').fit()

    return model1.rsquared - model0.rsquared

In [6]:
def Make_df2plot(phenodf, pheno):
    """Build a per-ancestry table of PGS R2 values for one phenotype.

    The phenotype name selects the estimator: quantitative traits go through
    ``Compute_R2_quant`` and binary traits through ``Compute_R2_binary``.
    The estimator is called once for each of the ancestries "eur", "amr"
    and "afr".

    Parameters
    ----------
    phenodf : pandas.DataFrame
        Table passed through unchanged to the R2 estimator.
    pheno : str
        Phenotype name; must be one of the names listed below.

    Returns
    -------
    pandas.DataFrame
        One row per ancestry with columns ``pheno``, ``anc`` and ``R2``,
        indexed 0..2.

    Raises
    ------
    ValueError
        If ``pheno`` is not a recognised quantitative or binary phenotype.
    """
    quant_phenos = ("Height", "DBP", "HDL", "TC", "RBC", "leukocyte")
    binary_phenos = ("T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer")

    # Resolve the estimator up front so an unknown phenotype fails with a
    # clear message instead of an unbound local variable inside the loop.
    if pheno in quant_phenos:
        compute_r2 = Compute_R2_quant
    elif pheno in binary_phenos:
        compute_r2 = Compute_R2_binary
    else:
        raise ValueError(
            f"Unknown phenotype {pheno!r}; expected one of "
            f"{quant_phenos + binary_phenos}"
        )

    # Collect plain records and build the frame once rather than growing it
    # with repeated concatenation onto an empty frame.
    records = [
        {"pheno": pheno, "anc": anc, "R2": compute_r2(phenodf, pheno, anc)}
        for anc in ("eur", "amr", "afr")
    ]
    return pd.DataFrame(records, columns=["pheno", "anc", "R2"])

In [7]:
# Per-ancestry R2 rows for every quantitative trait, stacked in trait order.
df2plot_quant = pd.concat(
    [
        Make_df2plot(trait_scores, trait_name)
        for trait_name, trait_scores in (
            ("Height", Height_df),
            ("DBP", DBP_df),
            ("HDL", HDL_df),
            ("TC", TC_df),
            ("RBC", RBC_df),
            ("leukocyte", leukocyte_df),
        )
    ],
    axis=0,
)

In [10]:
# Relabel the ancestry column: 'anc' becomes 'Ancestry' (in place) ...
df2plot_quant.rename(columns={'anc': 'Ancestry'}, inplace=True)

# ... and the three lower-case ancestry codes become upper-case labels.
df2plot_quant['Ancestry'] = df2plot_quant['Ancestry'].replace(
    {code: code.upper() for code in ('eur', 'amr', 'afr')}
)

### Binary Phenotype R2

In [27]:
# Tab-separated inputs read from the workspace bucket.
# The sample table is indexed by "person_id"; each per-trait array score
# table is indexed by its "s" column.
Sample_all_binary = pd.read_csv(
    f"{bucket}/Pheno/binary_all.tsv", sep="\t", index_col="person_id"
)
Asthma_df = pd.read_csv(
    f"{bucket}/Scores/Array/Asthma_PRScs.gz", sep="\t", index_col="s"
)
T2D_df = pd.read_csv(
    f"{bucket}/Scores/Array/T2D_PRScs.gz", sep="\t", index_col="s"
)
Colorectal_Cancer_df = pd.read_csv(
    f"{bucket}/Scores/Array/Colorectal_Cancer_PRScs.gz", sep="\t", index_col="s"
)
Breast_Cancer_df = pd.read_csv(
    f"{bucket}/Scores/Array/Breast_Cancer_PRScs.gz", sep="\t", index_col="s"
)

In [28]:
def Compute_R2_binary(phenodf, pheno, anc):
    """Pseudo-R2 of the polygenic score for a binary phenotype.

    Within one predicted-ancestry group, two binomial GLMs are fitted: a
    null model on the covariates only and a full model on the covariates
    plus the ``pgs`` column. With ``n`` observations and log-likelihoods
    ``llf0`` (null) and ``llf1`` (full) the function returns::

        (1 - exp((2 / n) * (llf0 - llf1))) / (1 - exp((2 / n) * llf0))

    i.e. a Cox-Snell-type R2 of the full model relative to the
    covariates-only model, rescaled Nagelkerke-style by its maximum.

    Both models are fitted on the same complete-case rows (no missing value
    in any covariate or in ``pgs``). The formula uses a single ``n`` for
    both log-likelihoods, so the two fits must share their observations;
    letting each model drop its own missing rows would break that whenever
    ``pgs`` is missing for rows whose covariates are complete.

    Parameters
    ----------
    phenodf : pandas.DataFrame
        Table holding the column named by ``pheno``, every covariate column
        listed below and a ``pgs`` column.
    pheno : str
        Name of the phenotype column used as the response.
    anc : str
        Ancestry suffix. Rows are kept where the module-level table
        ``Sample_all_quant["is_anc_pred_" + anc]`` equals 1.

    Returns
    -------
    float
        The rescaled pseudo-R2 described above.

    Raises
    ------
    ValueError
        If no row is left after the ancestry, phenotype and complete-case
        filters.
    """
    covariates = ['Age', 'is_sex_Male', 'is_sex_Female', 'PC1', 'PC2', 'PC3', 'PC4',
                  'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13',
                  'PC14', 'PC15', 'PC16']
    score_col = 'pgs'  # column expected to hold the polygenic score

    # Keep rows of the requested ancestry whose phenotype is observed.
    # NOTE(review): the ancestry flag is read from the global
    # Sample_all_quant, although this notebook also loads Sample_all_binary
    # for the binary phenotypes -- confirm the quantitative table is the
    # intended source here.
    in_ancestry = Sample_all_quant["is_anc_pred_" + anc] == 1
    has_pheno = ~phenodf[pheno].isna()
    subset = phenodf.loc[in_ancestry & has_pheno]

    predictors = subset[covariates + [score_col]].astype("float")
    response = subset[pheno].astype("float")

    # Restrict to rows usable by BOTH models so the log-likelihoods and the
    # shared observation count refer to exactly the same sample.
    complete = predictors.notna().all(axis=1)
    predictors = predictors.loc[complete]
    response = response.loc[complete]
    if predictors.empty:
        raise ValueError(
            f"No complete observations for phenotype {pheno!r} in ancestry {anc!r}"
        )

    design_null = sm.add_constant(predictors[covariates])
    design_full = sm.add_constant(predictors)

    # Null model: covariates only. Full model: covariates + PGS.
    model0 = sm.GLM(response, design_null, missing='drop',
                    family=sm.families.Binomial()).fit(disp=0)
    model1 = sm.GLM(response, design_full, missing='drop',
                    family=sm.families.Binomial()).fit(disp=0)

    n_obs = model0.nobs
    cox_snell = 1 - math.exp((2 / n_obs) * (model0.llf - model1.llf))
    cox_snell_max = 1 - math.exp((2 / n_obs) * model0.llf)

    return cox_snell / cox_snell_max

In [29]:
# Per-ancestry R2 rows for every binary trait, stacked in trait order.
df2plot_binary = pd.concat(
    [
        Make_df2plot(trait_scores, trait_name)
        for trait_name, trait_scores in (
            ("Asthma", Asthma_df),
            ("T2D", T2D_df),
            ("Colorectal_Cancer", Colorectal_Cancer_df),
            ("Breast_Cancer", Breast_Cancer_df),
        )
    ],
    axis=0,
)

In [31]:
# Relabel the ancestry column: 'anc' becomes 'Ancestry' (in place) ...
df2plot_binary.rename(columns={'anc': 'Ancestry'}, inplace=True)

# ... and the three lower-case ancestry codes become upper-case labels.
df2plot_binary['Ancestry'] = df2plot_binary['Ancestry'].replace(
    {code: code.upper() for code in ('eur', 'amr', 'afr')}
)

In [33]:
# Write the array-based R2 tables to the bucket as tab-separated files,
# without the row index.
df2plot_quant.to_csv(
    f'{bucket}/R2/PRScs_array_quant.tsv', index=False, sep='\t'
)
df2plot_binary.to_csv(
    f'{bucket}/R2/PRScs_array_binary.tsv', index=False, sep='\t'
)

## WGS

### Continuous Phenotype R2

In [4]:
# Tab-separated inputs read from the workspace bucket.
# The sample table is indexed by "person_id"; each per-trait WGS score
# table is indexed by its "s" column.
Sample_all_quant = pd.read_csv(
    f"{bucket}/Pheno/quant_all.tsv", sep="\t", index_col="person_id"
)
Height_df = pd.read_csv(
    f"{bucket}/Scores/WGS/Height_PRScs.gz", sep="\t", index_col="s"
)
DBP_df = pd.read_csv(
    f"{bucket}/Scores/WGS/DBP_PRScs.gz", sep="\t", index_col="s"
)
HDL_df = pd.read_csv(
    f"{bucket}/Scores/WGS/HDL_PRScs.gz", sep="\t", index_col="s"
)
TC_df = pd.read_csv(
    f"{bucket}/Scores/WGS/TC_PRScs.gz", sep="\t", index_col="s"
)
RBC_df = pd.read_csv(
    f"{bucket}/Scores/WGS/RBC_PRScs.gz", sep="\t", index_col="s"
)
leukocyte_df = pd.read_csv(
    f"{bucket}/Scores/WGS/leukocyte_PRScs.gz", sep="\t", index_col="s"
)

In [5]:
def Compute_R2_quant(phenodf, pheno, anc):
    """Incremental R2 of the polygenic score for a quantitative phenotype.

    Within one predicted-ancestry group, two OLS models are fitted: a null
    model on the covariates only and a full model on the covariates plus the
    ``pgs`` column. The result is ``rsquared(full) - rsquared(null)``.

    Both models are fitted on the same complete-case rows (no missing value
    in any covariate or in ``pgs``). Letting each model drop its own missing
    rows would compare R2 values computed on different samples whenever
    ``pgs`` is missing for rows whose covariates are complete.

    A function with the same name is also defined earlier in this notebook;
    this later definition is the one in effect for the cells below it.

    Parameters
    ----------
    phenodf : pandas.DataFrame
        Table holding the column named by ``pheno``, every covariate column
        listed below and a ``pgs`` column.
    pheno : str
        Name of the phenotype column used as the response.
    anc : str
        Ancestry suffix. Rows are kept where the module-level table
        ``Sample_all_quant["is_anc_pred_" + anc]`` equals 1.

    Returns
    -------
    float
        R2 of the full model minus R2 of the covariates-only model.

    Raises
    ------
    ValueError
        If no row is left after the ancestry, phenotype and complete-case
        filters.
    """
    covariates = ['Age', 'is_sex_Male', 'is_sex_Female', 'PC1', 'PC2', 'PC3', 'PC4',
                  'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13',
                  'PC14', 'PC15', 'PC16']
    score_col = 'pgs'  # column expected to hold the polygenic score

    # Keep rows of the requested ancestry whose phenotype is observed.
    # The ancestry flag is read from the global sample table, not phenodf.
    in_ancestry = Sample_all_quant["is_anc_pred_" + anc] == 1
    has_pheno = ~phenodf[pheno].isna()
    subset = phenodf.loc[in_ancestry & has_pheno]

    predictors = subset[covariates + [score_col]].astype("float")
    response = subset[pheno].astype("float")

    # Restrict to rows usable by BOTH models so the two R2 values are
    # computed on exactly the same observations.
    complete = predictors.notna().all(axis=1)
    predictors = predictors.loc[complete]
    response = response.loc[complete]
    if predictors.empty:
        raise ValueError(
            f"No complete observations for phenotype {pheno!r} in ancestry {anc!r}"
        )

    design_null = sm.add_constant(predictors[covariates])
    design_full = sm.add_constant(predictors)

    # Null model: covariates only. Full model: covariates + PGS.
    model0 = sm.OLS(response, design_null, missing='drop').fit()
    model1 = sm.OLS(response, design_full, missing='drop').fit()

    return model1.rsquared - model0.rsquared

In [6]:
def Make_df2plot(phenodf, pheno):
    """Build a per-ancestry table of PGS R2 values for one phenotype.

    The phenotype name selects the estimator: quantitative traits go through
    ``Compute_R2_quant`` and binary traits through ``Compute_R2_binary``.
    The estimator is called once for each of the ancestries "eur", "amr"
    and "afr".

    A function with the same name is also defined earlier in this notebook;
    this later definition is the one in effect for the cells below it.

    Parameters
    ----------
    phenodf : pandas.DataFrame
        Table passed through unchanged to the R2 estimator.
    pheno : str
        Phenotype name; must be one of the names listed below.

    Returns
    -------
    pandas.DataFrame
        One row per ancestry with columns ``pheno``, ``anc`` and ``R2``,
        indexed 0..2.

    Raises
    ------
    ValueError
        If ``pheno`` is not a recognised quantitative or binary phenotype.
    """
    quant_phenos = ("Height", "DBP", "HDL", "TC", "RBC", "leukocyte")
    binary_phenos = ("T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer")

    # Resolve the estimator up front so an unknown phenotype fails with a
    # clear message instead of an unbound local variable inside the loop.
    if pheno in quant_phenos:
        compute_r2 = Compute_R2_quant
    elif pheno in binary_phenos:
        compute_r2 = Compute_R2_binary
    else:
        raise ValueError(
            f"Unknown phenotype {pheno!r}; expected one of "
            f"{quant_phenos + binary_phenos}"
        )

    # Collect plain records and build the frame once rather than growing it
    # with repeated concatenation onto an empty frame.
    records = [
        {"pheno": pheno, "anc": anc, "R2": compute_r2(phenodf, pheno, anc)}
        for anc in ("eur", "amr", "afr")
    ]
    return pd.DataFrame(records, columns=["pheno", "anc", "R2"])

In [7]:
# Per-ancestry R2 rows for every quantitative trait, stacked in trait order.
df2plot_quant = pd.concat(
    [
        Make_df2plot(trait_scores, trait_name)
        for trait_name, trait_scores in (
            ("Height", Height_df),
            ("DBP", DBP_df),
            ("HDL", HDL_df),
            ("TC", TC_df),
            ("RBC", RBC_df),
            ("leukocyte", leukocyte_df),
        )
    ],
    axis=0,
)

In [9]:
# Relabel the ancestry column: 'anc' becomes 'Ancestry' (in place) ...
df2plot_quant.rename(columns={'anc': 'Ancestry'}, inplace=True)

# ... and the three lower-case ancestry codes become upper-case labels.
df2plot_quant['Ancestry'] = df2plot_quant['Ancestry'].replace(
    {code: code.upper() for code in ('eur', 'amr', 'afr')}
)

### Binary Phenotype R2

In [11]:
# Tab-separated inputs read from the workspace bucket.
# The sample table is indexed by "person_id"; each per-trait WGS score
# table is indexed by its "s" column.
Sample_all_binary = pd.read_csv(
    f"{bucket}/Pheno/binary_all.tsv", sep="\t", index_col="person_id"
)
Asthma_df = pd.read_csv(
    f"{bucket}/Scores/WGS/Asthma_PRScs.gz", sep="\t", index_col="s"
)
T2D_df = pd.read_csv(
    f"{bucket}/Scores/WGS/T2D_PRScs.gz", sep="\t", index_col="s"
)
Colorectal_Cancer_df = pd.read_csv(
    f"{bucket}/Scores/WGS/Colorectal_Cancer_PRScs.gz", sep="\t", index_col="s"
)
Breast_Cancer_df = pd.read_csv(
    f"{bucket}/Scores/WGS/Breast_Cancer_PRScs.gz", sep="\t", index_col="s"
)

In [12]:
def Compute_R2_binary(phenodf, pheno, anc):
    """Pseudo-R2 of the polygenic score for a binary phenotype.

    Within one predicted-ancestry group, two binomial GLMs are fitted: a
    null model on the covariates only and a full model on the covariates
    plus the ``pgs`` column. With ``n`` observations and log-likelihoods
    ``llf0`` (null) and ``llf1`` (full) the function returns::

        (1 - exp((2 / n) * (llf0 - llf1))) / (1 - exp((2 / n) * llf0))

    i.e. a Cox-Snell-type R2 of the full model relative to the
    covariates-only model, rescaled Nagelkerke-style by its maximum.

    Both models are fitted on the same complete-case rows (no missing value
    in any covariate or in ``pgs``). The formula uses a single ``n`` for
    both log-likelihoods, so the two fits must share their observations;
    letting each model drop its own missing rows would break that whenever
    ``pgs`` is missing for rows whose covariates are complete.

    A function with the same name is also defined earlier in this notebook;
    this later definition is the one in effect for the cells below it.

    Parameters
    ----------
    phenodf : pandas.DataFrame
        Table holding the column named by ``pheno``, every covariate column
        listed below and a ``pgs`` column.
    pheno : str
        Name of the phenotype column used as the response.
    anc : str
        Ancestry suffix. Rows are kept where the module-level table
        ``Sample_all_quant["is_anc_pred_" + anc]`` equals 1.

    Returns
    -------
    float
        The rescaled pseudo-R2 described above.

    Raises
    ------
    ValueError
        If no row is left after the ancestry, phenotype and complete-case
        filters.
    """
    covariates = ['Age', 'is_sex_Male', 'is_sex_Female', 'PC1', 'PC2', 'PC3', 'PC4',
                  'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13',
                  'PC14', 'PC15', 'PC16']
    score_col = 'pgs'  # column expected to hold the polygenic score

    # Keep rows of the requested ancestry whose phenotype is observed.
    # NOTE(review): the ancestry flag is read from the global
    # Sample_all_quant, although this notebook also loads Sample_all_binary
    # for the binary phenotypes -- confirm the quantitative table is the
    # intended source here.
    in_ancestry = Sample_all_quant["is_anc_pred_" + anc] == 1
    has_pheno = ~phenodf[pheno].isna()
    subset = phenodf.loc[in_ancestry & has_pheno]

    predictors = subset[covariates + [score_col]].astype("float")
    response = subset[pheno].astype("float")

    # Restrict to rows usable by BOTH models so the log-likelihoods and the
    # shared observation count refer to exactly the same sample.
    complete = predictors.notna().all(axis=1)
    predictors = predictors.loc[complete]
    response = response.loc[complete]
    if predictors.empty:
        raise ValueError(
            f"No complete observations for phenotype {pheno!r} in ancestry {anc!r}"
        )

    design_null = sm.add_constant(predictors[covariates])
    design_full = sm.add_constant(predictors)

    # Null model: covariates only. Full model: covariates + PGS.
    model0 = sm.GLM(response, design_null, missing='drop',
                    family=sm.families.Binomial()).fit(disp=0)
    model1 = sm.GLM(response, design_full, missing='drop',
                    family=sm.families.Binomial()).fit(disp=0)

    n_obs = model0.nobs
    cox_snell = 1 - math.exp((2 / n_obs) * (model0.llf - model1.llf))
    cox_snell_max = 1 - math.exp((2 / n_obs) * model0.llf)

    return cox_snell / cox_snell_max

In [13]:
# Per-ancestry R2 rows for every binary trait, stacked in trait order.
df2plot_binary = pd.concat(
    [
        Make_df2plot(trait_scores, trait_name)
        for trait_name, trait_scores in (
            ("Asthma", Asthma_df),
            ("T2D", T2D_df),
            ("Colorectal_Cancer", Colorectal_Cancer_df),
            ("Breast_Cancer", Breast_Cancer_df),
        )
    ],
    axis=0,
)

In [15]:
# Relabel the ancestry column: 'anc' becomes 'Ancestry' (in place) ...
df2plot_binary.rename(columns={'anc': 'Ancestry'}, inplace=True)

# ... and the three lower-case ancestry codes become upper-case labels.
df2plot_binary['Ancestry'] = df2plot_binary['Ancestry'].replace(
    {code: code.upper() for code in ('eur', 'amr', 'afr')}
)

In [17]:
# Write the WGS-based R2 tables to the bucket as tab-separated files,
# without the row index.
df2plot_quant.to_csv(
    f'{bucket}/R2/PRScs_wgs_quant.tsv', index=False, sep='\t'
)
df2plot_binary.to_csv(
    f'{bucket}/R2/PRScs_wgs_binary.tsv', index=False, sep='\t'
)