In [1]:
from datetime import datetime
import os
import statsmodels.api as sm
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib

In [None]:
DATASET = os.getenv('WORKSPACE_CDR')
bucket = os.getenv('WORKSPACE_BUCKET')

In [3]:
np.random.seed(123)

### Continuous Phenotype R2

In [4]:
Sample_all_quant = pd.read_csv(f"{bucket}/Pheno/quant_all.tsv",sep = "\t", index_col="person_id")
Height_df = pd.read_csv(f"{bucket}/Scores/Array/Height_clump.gz",sep = "\t", index_col="s")
DBP_df = pd.read_csv(f"{bucket}/Scores/Array/DBP_clump.gz",sep = "\t", index_col="s")
HDL_df = pd.read_csv(f"{bucket}/Scores/Array/HDL_clump.gz",sep = "\t", index_col="s")
TC_df = pd.read_csv(f"{bucket}/Scores/Array/TC_clump.gz",sep = "\t", index_col="s")
RBC_df = pd.read_csv(f"{bucket}/Scores/Array/RBC_clump.gz",sep = "\t", index_col="s")
leukocyte_df = pd.read_csv(f"{bucket}/Scores/Array/leukocyte_clump.gz",sep = "\t", index_col="s")

In [5]:
def Select_threshold_quant(phenodf_anc_val, phenodf_anc_val_COVs, phenodf_anc_val_PHE):
    PGSs = ['pgs1', 'pgs2', 'pgs3', 'pgs4','pgs5', 'pgs6', 'pgs7', 'pgs8', 'pgs9', 'pgs10']
    R2_val = np.empty([10])
    R2_val[:] = np.nan
    for j in range(len(PGSs)):
        phenodf_anc_val_COVs_PGS = pd.concat([phenodf_anc_val_COVs, phenodf_anc_val[[PGSs[j]]]], axis = 1)
        model0_val = sm.OLS(phenodf_anc_val_PHE, phenodf_anc_val_COVs, missing = 'drop').fit()
        model1_val = sm.OLS(phenodf_anc_val_PHE, phenodf_anc_val_COVs_PGS, missing = 'drop').fit()
        R2_val[j] = model1_val.rsquared - model0_val.rsquared
    best_thresh =  "pgs" + str(np.argmax(R2_val) + 1)
    return best_thresh    

In [6]:
def Compute_R2_CV_quant(phenodf, pheno, anc):
    PGSs = ['pgs1', 'pgs2', 'pgs3', 'pgs4','pgs5', 'pgs6', 'pgs7', 'pgs8', 'pgs9', 'pgs10']
    PHE = pheno
    COVs = ['Age', 'is_sex_Male', 'is_sex_Female', 'PC1', 'PC2', 'PC3', 'PC4',
           'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13',
           'PC14', 'PC15', 'PC16']
    phenodf_anc = phenodf.loc[ (Sample_all_quant["is_anc_pred_" + anc] == 1) & (~phenodf[pheno].isna())] 
    phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(phenodf_anc))
    
    R2_test = np.empty([10])
    thresholds = [None] * 10
    R2_test[:] = np.nan

    for i in range(0, 10):
        phenodf_anc_val = phenodf_anc.loc[phenodf_anc.bin == i,]
        phenodf_anc_val_COVs = sm.add_constant(phenodf_anc_val.loc[:, COVs].astype("float"))
        phenodf_anc_val_PHE = phenodf_anc_val[PHE].astype("float")
        
        best_thresh = Select_threshold_quant(phenodf_anc_val, phenodf_anc_val_COVs, phenodf_anc_val_PHE)
        thresholds[i] = best_thresh
        phenodf_anc_test = phenodf_anc.loc[phenodf_anc.bin != i,]
        phenodf_anc_test_COVs = sm.add_constant(phenodf_anc_test.loc[:, COVs].astype("float"))
        phenodf_anc_test_COVs_bestPGS = sm.add_constant(phenodf_anc_test.loc[:, COVs + [best_thresh]].astype("float"))
        phenodf_anc_test_PHE = phenodf_anc_test[PHE].astype("float")
        
        model0_test = sm.OLS(phenodf_anc_test_PHE, phenodf_anc_test_COVs, missing = 'drop').fit()
        model1_test = sm.OLS(phenodf_anc_test_PHE, phenodf_anc_test_COVs_bestPGS, missing = 'drop').fit()
        R2_test[i] = model1_test.rsquared - model0_test.rsquared

    return(R2_test, thresholds)

In [7]:
def Make_df2plot(phenodf, pheno):
    ancs = ["eur", "amr", "afr"]
    df2plot = pd.DataFrame(index=range(0),columns=range(4))
    df2plot.columns = ["pheno", "anc", "R2", "threshold"]
    for anc in ancs:
        if pheno in ["Height", "DBP", "HDL", "TC", "RBC", "leukocyte"]:
            R2_thresh = Compute_R2_CV_quant(phenodf, pheno, anc)
        elif pheno in ["T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer"]:
            R2_thresh = Compute_R2_CV_binary(phenodf, pheno, anc)
        
        df2plot_anc = pd.concat([pd.Series([pheno]*10, name = "pheno"), 
                                 pd.Series([anc]*10, name = "anc"), 
                                 pd.Series(R2_thresh[0], name = "R2"), 
                                 pd.Series(R2_thresh[1], name = "threshold")], axis = 1)
        df2plot = pd.concat([df2plot, df2plot_anc], axis = 0)

    return df2plot 

In [8]:
df2plot_quant = pd.concat([Make_df2plot(Height_df, "Height"),
                     Make_df2plot(DBP_df, "DBP"),
                     Make_df2plot(HDL_df, "HDL"),
                     Make_df2plot(TC_df, "TC"),
                     Make_df2plot(RBC_df, "RBC"),
                     Make_df2plot(leukocyte_df, "leukocyte")],
                   axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(phenodf_anc))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(phenodf_anc))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(

In [11]:
# Rename the 'anc' column to 'Ancestry'
df2plot_quant.rename(columns={'anc': 'Ancestry'}, inplace=True)

# Change the values in the 'Ancestry' column
df2plot_quant['Ancestry'] = df2plot_quant['Ancestry'].replace({'eur': 'EUR', 'amr': 'AMR', 'afr': 'AFR'})

### Binary phenotype R2

In [13]:
Sample_all_binary = pd.read_csv(f"{bucket}/Pheno/binary_all.tsv",sep = "\t", index_col="person_id")
Asthma_df = pd.read_csv(f"{bucket}/Scores/Array/Asthma_clump.gz",sep = "\t", index_col="s")
T2D_df = pd.read_csv(f"{bucket}/Scores/Array/T2D_clump.gz",sep = "\t", index_col="s")
Colorectal_Cancer_df = pd.read_csv(f"{bucket}/Scores/Array/Colorectal_Cancer_clump.gz",sep = "\t", index_col="s")
Breast_Cancer_df = pd.read_csv(f"{bucket}/Scores/Array/Breast_Cancer_clump.gz",sep = "\t", index_col="s")

In [14]:
def Select_threshold_binary(phenodf_anc_val, phenodf_anc_val_COVs, phenodf_anc_val_PHE):
    PGSs = ['pgs1', 'pgs2', 'pgs3', 'pgs4','pgs5', 'pgs6', 'pgs7', 'pgs8', 'pgs9', 'pgs10']
    R2_val = np.empty([10])
    R2_val[:] = np.nan
    for j in range(len(PGSs)):
        phenodf_anc_val_COVs_PGS = pd.concat([phenodf_anc_val_COVs, phenodf_anc_val[[PGSs[j]]]], axis = 1)
        model0_val = sm.GLM(phenodf_anc_val_PHE, phenodf_anc_val_COVs, missing = 'drop', family=sm.families.Binomial()).fit(disp = 0)
        model1_val = sm.GLM(phenodf_anc_val_PHE, phenodf_anc_val_COVs_PGS, missing = 'drop', family=sm.families.Binomial()).fit(disp = 0)
        CSR2_val = 1 - math.exp( (2/model0_val.nobs) * (model0_val.llf - model1_val.llf)  )
        R2_val[j] = CSR2_val/(1 - math.exp( (2/model0_val.nobs) * model0_val.llf) )
    best_thresh =  "pgs" + str(np.argmax(R2_val) + 1)
    return best_thresh     

In [15]:
def Compute_R2_CV_binary(phenodf, pheno, anc):
    PGSs = ['pgs1', 'pgs2', 'pgs3', 'pgs4','pgs5', 'pgs6', 'pgs7', 'pgs8', 'pgs9', 'pgs10']
    PHE = pheno
    # breast cancer restrict to women only
    if pheno == "Breast_Cancer":
        COVs = ['Age', 'PC1', 'PC2', 'PC3', 'PC4',
           'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13',
           'PC14', 'PC15', 'PC16']
        phenodf_anc = phenodf.loc[ (Sample_all_binary["is_anc_pred_" + anc] == 1) & (~phenodf[pheno].isna()) & (Sample_all_binary["is_sex_Female"] == 1) ] 
    else:
        COVs = ['Age', 'is_sex_Male', 'is_sex_Female', 'PC1', 'PC2', 'PC3', 'PC4',
           'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13',
           'PC14', 'PC15', 'PC16']
        phenodf_anc = phenodf.loc[ (Sample_all_binary["is_anc_pred_" + anc] == 1) & (~phenodf[pheno].isna())] 

    phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(phenodf_anc))
    
    R2_test = np.empty([10])
    thresholds = [None] * 10
    R2_test[:] = np.nan

    for i in range(0, 10):
        phenodf_anc_val = phenodf_anc.loc[phenodf_anc.bin == i,]
        phenodf_anc_val_COVs = sm.add_constant(phenodf_anc_val.loc[:, COVs].astype("float"))
        phenodf_anc_val_PHE = phenodf_anc_val[PHE].astype("int8")
        
        best_thresh = Select_threshold_binary(phenodf_anc_val, phenodf_anc_val_COVs, phenodf_anc_val_PHE)
        thresholds[i] = best_thresh
        phenodf_anc_test = phenodf_anc.loc[phenodf_anc.bin != i,]
        phenodf_anc_test_COVs = sm.add_constant(phenodf_anc_test.loc[:, COVs].astype("float"))
        phenodf_anc_test_COVs_bestPGS = sm.add_constant(phenodf_anc_test.loc[:, COVs + [best_thresh]].astype("float"))
        phenodf_anc_test_PHE = phenodf_anc_test[PHE].astype("int8")
        
        model0_test = sm.GLM(phenodf_anc_test_PHE, phenodf_anc_test_COVs, missing = 'drop', family=sm.families.Binomial()).fit(disp = 0)
        model1_test = sm.GLM(phenodf_anc_test_PHE, phenodf_anc_test_COVs_bestPGS, missing = 'drop', family=sm.families.Binomial()).fit(disp = 0)
        CSR2_test = 1 - math.exp( (2/model0_test.nobs) * (model0_test.llf - model1_test.llf)  )
        R2_test[i] = CSR2_test/(1 - math.exp( (2/model0_test.nobs) * model0_test.llf) )

    return(R2_test, thresholds)

In [22]:
df2plot_binary = pd.concat([Make_df2plot(Asthma_df, "Asthma"),
                     Make_df2plot(T2D_df, "T2D"),
                     Make_df2plot(Colorectal_Cancer_df, "Colorectal_Cancer"),
                     Make_df2plot(Breast_Cancer_df, "Breast_Cancer")],
                   axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(phenodf_anc))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(phenodf_anc))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(

  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  t = np.exp(-z)
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  t = np.exp(-z)
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  t = np.exp(-z)
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  t = np.exp(-z)


  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  t = np.exp(-z)
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  t = np.exp(-z)
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenodf_anc.loc[:,"bin"] = np.random.randint(0,10,size=len(phenodf_anc))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [28]:
# Rename the 'anc' column to 'Ancestry'
df2plot_binary.rename(columns={'anc': 'Ancestry'}, inplace=True)

# Change the values in the 'Ancestry' column
df2plot_binary['Ancestry'] = df2plot_binary['Ancestry'].replace({'eur': 'EUR', 'amr': 'AMR', 'afr': 'AFR'})