# Estimate Multi-ancestry PRS versus PD risk with admixture
- **Project:** Multi-ancestry PRS
- **Version:** Python/3.9
- **Status:** COMPLETE
- **Last Updated:** 11-MARCH-2024

## Notebook Overview
- Logistic regression models adjusted for covariates (age, sex, admixture)

In [1]:
## Load the Python and R environment modules in a single call
module load python R

[+] Loading python 3.10  ... 
[+] Loading gcc  11.3.0  ... 
[+] Loading HDF5  1.12.2 
[+] Loading netcdf  4.9.0 
[-] Unloading gcc  11.3.0  ... 
[+] Loading gcc  11.3.0  ... 
[+] Loading openmpi/4.1.3/gcc-11.3.0  ... 
[+] Loading pandoc  2.18  on cn0978 
[+] Loading pcre2  10.40 
[+] Loading R 4.3.2 


In [8]:
import pandas as pd
import statsmodels.api as sm

## RUN PRS versus RISK across ancestries (AFRICAN summary stats)
# For each ancestry cohort: merge its PRS profile with the admixture and
# covariate tables, standardize the score against controls, and fit a
# logistic regression of case status on the PRS plus covariates.

# List of ancestries
ancestries = ["AAC", "AFR", "AJ", "AMR", "EAS", "EUR", "CAS"]

# The admixture and covariate tables do not depend on the ancestry being
# analyzed, so they are read and merged once instead of on every iteration.

# Read admx data (sep=r"\s+" replaces the deprecated delim_whitespace=True)
temp_covs = pd.read_csv(
    f"{WORK_DIR}/projects/ref_panel_prep/gp2_admixture/GP2_round6/release6_ancestry_merge_train.10.Q.labeled",
    sep=r"\s+",
)

# Read additional covariates and align the sample-ID column name for merging
temp_covs_2 = pd.read_csv(f"{WORK_DIR}/covariates.txt", sep="\t")
temp_covs_2 = temp_covs_2.rename(columns={"GP2sampleID": "IID"})

# Merge covariates (inner join on IID)
covs = pd.merge(temp_covs, temp_covs_2, on="IID")

# Logistic regression model using statsmodels formula.
# NOTE(review): the ancestry terms are presumably the admixture-proportion
# columns of the .Q.labeled file — confirm against its header.
formula = "CASE ~ zSCORE + sex_for_qc + age + AJ + EUR + EAS + AMR + AFR + CAS + AAC"

for ancestry in ancestries:
    print("Ancestry:", ancestry)
    # Construct file paths
    prs_file = f"{WORK_DIR}/imputed_data/{ancestry}/PRS_score_release_AFRICANS.profile"

    # Read PRS data
    temp_data = pd.read_csv(prs_file, sep=r"\s+")

    # Merge PRS data and covariates
    dat = pd.merge(temp_data, covs, on="IID")

    # Remove missing or unknown cases: keep only PHENO 1 or 2 so that
    # CASE below is strictly 0/1. Any other code (e.g. -9) is dropped here;
    # left in, it would give CASE values outside {0, 1}, which Logit rejects.
    # .copy() makes the column assignments below act on an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    # NOTE(review): assumes PLINK-style coding (1 = control, 2 = case) — confirm.
    dat = dat[dat["PHENO"].isin([1, 2])].copy()

    # Logistic regression model phenotype: PHENO 1 -> CASE 0, PHENO 2 -> CASE 1
    dat["CASE"] = dat["PHENO"] - 1

    # Standardize PRS using the mean and SD of the CASE == 0 rows only
    mean_controls = dat.loc[dat["CASE"] == 0, "SCORE"].mean()
    sd_controls = dat.loc[dat["CASE"] == 0, "SCORE"].std()
    dat["zSCORE"] = (dat["SCORE"] - mean_controls) / sd_controls

    # Fit the model
    model = sm.Logit.from_formula(formula, data=dat)
    result = model.fit()

    # Print the summary
    print(result.summary())

    # Print
    print("Done analyzing " + ancestry + " now on to the next thing.")

Ancestry: AAC
Optimization terminated successfully.
         Current function value: 0.397559
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   CASE   No. Observations:                  910
Model:                          Logit   Df Residuals:                      899
Method:                           MLE   Df Model:                           10
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.2240
Time:                        09:17:08   Log-Likelihood:                -361.78
converged:                       True   LL-Null:                       -466.24
Covariance Type:            nonrobust   LLR p-value:                 2.218e-39
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.2440      1.473      0.844      0.398      -1.643       4.131
zSCORE         

In [9]:
import pandas as pd
import statsmodels.api as sm

## RUN PRS versus RISK across ancestries (EUROPEAN summary stats)
# For each ancestry cohort: merge its PRS profile with the admixture and
# covariate tables, standardize the score against controls, and fit a
# logistic regression of case status on the PRS plus covariates.

# List of ancestries
ancestries = ["AAC", "AFR", "AJ", "AMR", "EAS", "EUR", "CAS"]

# The admixture and covariate tables do not depend on the ancestry being
# analyzed, so they are read and merged once instead of on every iteration.

# Read admx data (sep=r"\s+" replaces the deprecated delim_whitespace=True)
temp_covs = pd.read_csv(
    f"{WORK_DIR}/projects/ref_panel_prep/gp2_admixture/GP2_round6/release6_ancestry_merge_train.10.Q.labeled",
    sep=r"\s+",
)

# Read additional covariates and align the sample-ID column name for merging.
# NOTE(review): this cell reads covariates from a CLEAN/ subdirectory, while
# the other PRS cells in this notebook read {WORK_DIR}/covariates.txt —
# confirm which location is intended.
temp_covs_2 = pd.read_csv(f"{WORK_DIR}/CLEAN/covariates.txt", sep="\t")
temp_covs_2 = temp_covs_2.rename(columns={"GP2sampleID": "IID"})

# Merge covariates (inner join on IID)
covs = pd.merge(temp_covs, temp_covs_2, on="IID")

# Logistic regression model using statsmodels formula.
# NOTE(review): the ancestry terms are presumably the admixture-proportion
# columns of the .Q.labeled file — confirm against its header.
formula = "CASE ~ zSCORE + sex_for_qc + age + AJ + EUR + EAS + AMR + AFR + CAS + AAC"

for ancestry in ancestries:
    print("Ancestry:", ancestry)
    # Construct file paths
    prs_file = f"{WORK_DIR}/imputed_data/{ancestry}/PRS_score_release_EUROPEAN.profile"

    # Read PRS data
    temp_data = pd.read_csv(prs_file, sep=r"\s+")

    # Merge PRS data and covariates
    dat = pd.merge(temp_data, covs, on="IID")

    # Remove missing or unknown cases: keep only PHENO 1 or 2 so that
    # CASE below is strictly 0/1. Any other code (e.g. -9) is dropped here;
    # left in, it would give CASE values outside {0, 1}, which Logit rejects.
    # .copy() makes the column assignments below act on an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    # NOTE(review): assumes PLINK-style coding (1 = control, 2 = case) — confirm.
    dat = dat[dat["PHENO"].isin([1, 2])].copy()

    # Logistic regression model phenotype: PHENO 1 -> CASE 0, PHENO 2 -> CASE 1
    dat["CASE"] = dat["PHENO"] - 1

    # Standardize PRS using the mean and SD of the CASE == 0 rows only
    mean_controls = dat.loc[dat["CASE"] == 0, "SCORE"].mean()
    sd_controls = dat.loc[dat["CASE"] == 0, "SCORE"].std()
    dat["zSCORE"] = (dat["SCORE"] - mean_controls) / sd_controls

    # Fit the model
    model = sm.Logit.from_formula(formula, data=dat)
    result = model.fit()

    # Print the summary
    print(result.summary())

    # Print
    print("Done analyzing " + ancestry + " now on to the next thing.")

Ancestry: AAC
Optimization terminated successfully.
         Current function value: 0.388589
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                   CASE   No. Observations:                  910
Model:                          Logit   Df Residuals:                      899
Method:                           MLE   Df Model:                           10
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.2416
Time:                        09:17:20   Log-Likelihood:                -353.62
converged:                       True   LL-Null:                       -466.24
Covariance Type:            nonrobust   LLR p-value:                 8.514e-43
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9985      1.509      0.662      0.508      -1.959       3.956
zSCORE         

In [10]:
import pandas as pd
import statsmodels.api as sm

## RUN PRS versus RISK across ancestries (LATINO summary stats)
# For each ancestry cohort: merge its PRS profile with the admixture and
# covariate tables, standardize the score against controls, and fit a
# logistic regression of case status on the PRS plus covariates.

# List of ancestries
ancestries = ["AAC", "AFR", "AJ", "AMR", "EAS", "EUR", "CAS"]

# The admixture and covariate tables do not depend on the ancestry being
# analyzed, so they are read and merged once instead of on every iteration.

# Read admx data (sep=r"\s+" replaces the deprecated delim_whitespace=True)
temp_covs = pd.read_csv(
    f"{WORK_DIR}/projects/ref_panel_prep/gp2_admixture/GP2_round6/release6_ancestry_merge_train.10.Q.labeled",
    sep=r"\s+",
)

# Read additional covariates and align the sample-ID column name for merging
temp_covs_2 = pd.read_csv(f"{WORK_DIR}/covariates.txt", sep="\t")
temp_covs_2 = temp_covs_2.rename(columns={"GP2sampleID": "IID"})

# Merge covariates (inner join on IID)
covs = pd.merge(temp_covs, temp_covs_2, on="IID")

# Logistic regression model using statsmodels formula.
# NOTE(review): the ancestry terms are presumably the admixture-proportion
# columns of the .Q.labeled file — confirm against its header.
formula = "CASE ~ zSCORE + sex_for_qc + age + AJ + EUR + EAS + AMR + AFR + CAS + AAC"

for ancestry in ancestries:
    print("Ancestry:", ancestry)
    # Construct file paths
    prs_file = f"{WORK_DIR}/imputed_data/{ancestry}/PRS_score_release_LATINO.profile"

    # Read PRS data
    temp_data = pd.read_csv(prs_file, sep=r"\s+")

    # Merge PRS data and covariates
    dat = pd.merge(temp_data, covs, on="IID")

    # Remove missing or unknown cases: keep only PHENO 1 or 2 so that
    # CASE below is strictly 0/1. Any other code (e.g. -9) is dropped here;
    # left in, it would give CASE values outside {0, 1}, which Logit rejects.
    # .copy() makes the column assignments below act on an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    # NOTE(review): assumes PLINK-style coding (1 = control, 2 = case) — confirm.
    dat = dat[dat["PHENO"].isin([1, 2])].copy()

    # Logistic regression model phenotype: PHENO 1 -> CASE 0, PHENO 2 -> CASE 1
    dat["CASE"] = dat["PHENO"] - 1

    # Standardize PRS using the mean and SD of the CASE == 0 rows only
    mean_controls = dat.loc[dat["CASE"] == 0, "SCORE"].mean()
    sd_controls = dat.loc[dat["CASE"] == 0, "SCORE"].std()
    dat["zSCORE"] = (dat["SCORE"] - mean_controls) / sd_controls

    # Fit the model
    model = sm.Logit.from_formula(formula, data=dat)
    result = model.fit()

    # Print the summary
    print(result.summary())

    # Print
    print("Done analyzing " + ancestry + " now on to the next thing.")

Ancestry: AAC
Optimization terminated successfully.
         Current function value: 0.400387
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                   CASE   No. Observations:                  910
Model:                          Logit   Df Residuals:                      899
Method:                           MLE   Df Model:                           10
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.2185
Time:                        09:17:25   Log-Likelihood:                -364.35
converged:                       True   LL-Null:                       -466.24
Covariance Type:            nonrobust   LLR p-value:                 2.634e-38
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.8085      1.463      1.236      0.216      -1.059       4.676
zSCORE         

In [11]:
import pandas as pd
import statsmodels.api as sm

## RUN PRS versus RISK across ancestries (EAST ASIANS summary stats)
# For each ancestry cohort: merge its PRS profile with the admixture and
# covariate tables, standardize the score against controls, and fit a
# logistic regression of case status on the PRS plus covariates.

# List of ancestries
ancestries = ["AAC", "AFR", "AJ", "AMR", "EAS", "EUR", "CAS"]

# The admixture and covariate tables do not depend on the ancestry being
# analyzed, so they are read and merged once instead of on every iteration.

# Read admx data (sep=r"\s+" replaces the deprecated delim_whitespace=True)
temp_covs = pd.read_csv(
    f"{WORK_DIR}/projects/ref_panel_prep/gp2_admixture/GP2_round6/release6_ancestry_merge_train.10.Q.labeled",
    sep=r"\s+",
)

# Read additional covariates and align the sample-ID column name for merging
temp_covs_2 = pd.read_csv(f"{WORK_DIR}/covariates.txt", sep="\t")
temp_covs_2 = temp_covs_2.rename(columns={"GP2sampleID": "IID"})

# Merge covariates (inner join on IID)
covs = pd.merge(temp_covs, temp_covs_2, on="IID")

# Logistic regression model using statsmodels formula.
# NOTE(review): the ancestry terms are presumably the admixture-proportion
# columns of the .Q.labeled file — confirm against its header.
formula = "CASE ~ zSCORE + sex_for_qc + age + AJ + EUR + EAS + AMR + AFR + CAS + AAC"

for ancestry in ancestries:
    print("Ancestry:", ancestry)
    # Construct file paths
    prs_file = f"{WORK_DIR}/imputed_data/{ancestry}/PRS_score_release_EASTASIANS.profile"

    # Read PRS data
    temp_data = pd.read_csv(prs_file, sep=r"\s+")

    # Merge PRS data and covariates
    dat = pd.merge(temp_data, covs, on="IID")

    # Remove missing or unknown cases: keep only PHENO 1 or 2 so that
    # CASE below is strictly 0/1. Any other code (e.g. -9) is dropped here;
    # left in, it would give CASE values outside {0, 1}, which Logit rejects.
    # .copy() makes the column assignments below act on an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    # NOTE(review): assumes PLINK-style coding (1 = control, 2 = case) — confirm.
    dat = dat[dat["PHENO"].isin([1, 2])].copy()

    # Logistic regression model phenotype: PHENO 1 -> CASE 0, PHENO 2 -> CASE 1
    dat["CASE"] = dat["PHENO"] - 1

    # Standardize PRS using the mean and SD of the CASE == 0 rows only
    mean_controls = dat.loc[dat["CASE"] == 0, "SCORE"].mean()
    sd_controls = dat.loc[dat["CASE"] == 0, "SCORE"].std()
    dat["zSCORE"] = (dat["SCORE"] - mean_controls) / sd_controls

    # Fit the model
    model = sm.Logit.from_formula(formula, data=dat)
    result = model.fit()

    # Print the summary
    print(result.summary())

    # Print
    print("Done analyzing " + ancestry + " now on to the next thing.")

Ancestry: AAC
Optimization terminated successfully.
         Current function value: 0.395995
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                   CASE   No. Observations:                  910
Model:                          Logit   Df Residuals:                      899
Method:                           MLE   Df Model:                           10
Date:                Mon, 11 Mar 2024   Pseudo R-squ.:                  0.2271
Time:                        09:17:29   Log-Likelihood:                -360.36
converged:                       True   LL-Null:                       -466.24
Covariance Type:            nonrobust   LLR p-value:                 5.636e-40
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3560      1.482      0.915      0.360      -1.550       4.262
zSCORE         