# Estimate Multi-ancestry PRS profiles
- **Project:** Multi-ancestry PRS
- **Version:** Python/3.9
- **Status:** COMPLETE
- **Last Updated:** 25-FEB-2024

## Notebook Overview
- Extract 90 SNPs from individual level data across ancestries in unrelated individual level data
- Merge files
- Estimate profiles

## Extract 90 SNPs from individual level data across ancestries in unrelated individual level data

In [1]:
## Input data
cd ${WORK_DIR}/imputed_data/

In [2]:
## Load packages
module load plink/2.0-alpha

[+] Loading plink  2.0-alpha 


In [None]:
## Extract 90 risk loci per chr

# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    for chnum in {1..22}; do
        plink2 \
        --pfile "$ANCESTRY"/chr"$chnum"_"$ANCESTRY"_release6 \
        --extract range /data/GP2/projects/PRS_multi_ancestry_Nov2023/CHR_POS_HG38.txt  \
        --make-bed \
        --out "$ANCESTRY"/PRS_chr"$chnum"_release6
    done
done

In [2]:
## Change kernel to python
import pandas as pd

# Read the file into a DataFrame
file_path = f"{WORK_DIR}/GP2_master_key_release6.txt"
df = pd.read_csv(file_path, delimiter='\t')  # Assuming columns are tab-separated, adjust if needed

# Add a new column with "0"
df.insert(0, 'New_Column', 0)

# Extract columns GP2sampleID and sex_for_qc
result_df = df[['New_Column', 'GP2sampleID', 'sex_for_qc']]

# Save the result to a CSV file without headers
result_df.to_csv(f'{WORK_DIR}/sex.txt', index=False, header=False, sep = ' ')

In [1]:
module load plink/1.9

[+] Loading plink  1.9.0-beta4.4  on cn2458 


In [None]:
# Change kernel back to bash
# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

cd ${WORK_DIR}/imputed_data/

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    for chnum in {1..22};
        do
            plink \
            --bfile "$ANCESTRY"/PRS_chr"$chnum"_release6 \
            --update-sex ${WORK_DIR}/sex.txt \
            --remove ${WORK_DIR}/raw_genotypes/"$ANCESTRY"/related_"$ANCESTRY" \
            --make-bed \
            --out "$ANCESTRY"/PRS_chr"$chnum"_release6_gender
    done
done

In [3]:
# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    # Change directory to the specified path
    cd ${WORK_DIR}/imputed_data/"$ANCESTRY"

    # List files matching the pattern and save to temp file
    ls *PRS*release6_gender.bim > temp

    # Print files for the current ancestry
    echo "Files for Ancestry $ANCESTRY:"
    cat temp

    # Go back to the original directory
    cd ${WORK_DIR}/imputed_data/
done

Files for Ancestry AAC:
PRS_ALL_release6_gender.bim
PRS_chr10_release6_gender.bim
PRS_chr11_release6_gender.bim
PRS_chr12_release6_gender.bim
PRS_chr13_release6_gender.bim
PRS_chr14_release6_gender.bim
PRS_chr15_release6_gender.bim
PRS_chr16_release6_gender.bim
PRS_chr17_release6_gender.bim
PRS_chr18_release6_gender.bim
PRS_chr19_release6_gender.bim
PRS_chr1_release6_gender.bim
PRS_chr20_release6_gender.bim
PRS_chr21_release6_gender.bim
PRS_chr2_release6_gender.bim
PRS_chr3_release6_gender.bim
PRS_chr4_release6_gender.bim
PRS_chr5_release6_gender.bim
PRS_chr6_release6_gender.bim
PRS_chr7_release6_gender.bim
PRS_chr8_release6_gender.bim
PRS_chr9_release6_gender.bim
Files for Ancestry AFR:
PRS_ALL_release6_gender.bim
PRS_chr10_release6_gender.bim
PRS_chr11_release6_gender.bim
PRS_chr12_release6_gender.bim
PRS_chr13_release6_gender.bim
PRS_chr14_release6_gender.bim
PRS_chr15_release6_gender.bim
PRS_chr16_release6_gender.bim
PRS_chr17_release6_gender.bim
PRS_chr18_release6_gender.bim
PRS_c

In [4]:
# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do

    # Remove the ".bim" extension from the file names and save to temp2
    cd ${WORK_DIR}/imputed_data/"$ANCESTRY"
    sed 's/.bim//g' temp > temp2 || { echo "Error: Sed command failed for $ANCESTRY"; exit 1; }
    
    # Exclude specific file(s) from temp2 and save to files.txt
    grep -v "PRS_chr1_release6_gender" temp2 > files.txt || { echo "Error: Grep command failed for $ANCESTRY"; exit 1; }
    echo "Final list of files:"
    cat files.txt

    # Return to the original directory
    cd ..
done

Final list of files:
PRS_ALL_release6_gender
PRS_chr10_release6_gender
PRS_chr11_release6_gender
PRS_chr12_release6_gender
PRS_chr13_release6_gender
PRS_chr14_release6_gender
PRS_chr15_release6_gender
PRS_chr16_release6_gender
PRS_chr17_release6_gender
PRS_chr18_release6_gender
PRS_chr19_release6_gender
PRS_chr20_release6_gender
PRS_chr21_release6_gender
PRS_chr2_release6_gender
PRS_chr3_release6_gender
PRS_chr4_release6_gender
PRS_chr5_release6_gender
PRS_chr6_release6_gender
PRS_chr7_release6_gender
PRS_chr8_release6_gender
PRS_chr9_release6_gender
Final list of files:
PRS_ALL_release6_gender
PRS_chr10_release6_gender
PRS_chr11_release6_gender
PRS_chr12_release6_gender
PRS_chr13_release6_gender
PRS_chr14_release6_gender
PRS_chr15_release6_gender
PRS_chr16_release6_gender
PRS_chr17_release6_gender
PRS_chr18_release6_gender
PRS_chr19_release6_gender
PRS_chr20_release6_gender
PRS_chr21_release6_gender
PRS_chr2_release6_gender
PRS_chr3_release6_gender
PRS_chr4_release6_gender
PRS_chr5_re

## Merge files

In [5]:
module load plink/1.9

[-] Unloading plink  1.9.0-beta4.4  on cn2458 
[+] Loading plink  1.9.0-beta4.4  on cn2458 


In [None]:
## Merge list using chr 1 as your core input

# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    # Execute plink command for each ancestry
    cd ${WORK_DIR}/imputed_data/"${ANCESTRY}"/
    plink \
    --bfile PRS_chr1_release6_gender \
    --merge-list "files.txt" \
    --make-bed \
    --out "PRS_ALL_release6_gender"
done


In [7]:
cd ${WORK_DIR}/imputed_data/

# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    # Change directory to the specified ancestry
    cd "$ANCESTRY/"

    # Reformat the bim file and save to temp3
    awk '{print $1, $1":"$4, $3, $4, $5, $6}' PRS_ALL_release6_gender.bim > temp3

    # Display the first few lines of temp3
    echo "Reformatted file for Ancestry: $ANCESTRY"
    head temp3
    
    # Rename file
    mv temp3 PRS_ALL_release6_gender.bim

    # Return to the original directory
    cd -
done

Reformatted file for Ancestry: AAC
1 1:154925709 0 154925709 C G
1 1:155162560 0 155162560 A G
1 1:155235843 0 155235843 C T
1 1:161499264 0 161499264 G C
1 1:171750629 0 171750629 T C
1 1:205754444 0 205754444 T C
1 1:205768611 0 205768611 A G
1 1:226728377 0 226728377 C T
1 1:232528865 0 232528865 T C
2 2:17966582 0 17966582 T A
/data/CARD_training/imputed_data
Reformatted file for Ancestry: AFR
1 1:155162560 0 155162560 A G
1 1:155235843 0 155235843 C T
1 1:161499264 0 161499264 G C
1 1:171750629 0 171750629 T C
1 1:205754444 0 205754444 T C
1 1:205768611 0 205768611 A G
1 1:226728377 0 226728377 C T
1 1:232528865 0 232528865 T C
2 2:17966582 0 17966582 T A
2 2:95335195 0 95335195 A T
/data/CARD_training/imputed_data
Reformatted file for Ancestry: AJ
1 1:154925709 0 154925709 C G
1 1:155162560 0 155162560 A G
1 1:155235843 0 155235843 C T
1 1:161499264 0 161499264 C G
1 1:171750629 0 171750629 T C
1 1:205754444 0 205754444 C T
1 1:205754444 0 205754444 C T
1 1:205768611 0 205768611 

In [8]:
module load plink/2.0-alpha

[-] Unloading plink  1.9.0-beta4.4  on cn2458 
[+] Loading plink  2.0-alpha 

The following have been reloaded with a version change:
  1) plink/1.9.0-beta4.4 => plink/2.0-alpha



In [9]:
cd ${WORK_DIR}/imputed_data/

# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    plink2 \
    --bfile "$ANCESTRY"/PRS_ALL_release6_gender \
    --rm-dup force-first \
    --make-bed \
    --out "$ANCESTRY"/PRS_ALL_release6_gender_no_duplicates 
done

PLINK v2.00a3.1LM AVX2 Intel (19 May 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AAC/PRS_ALL_release6_gender_no_duplicates.log.
Options in effect:
  --bfile AAC/PRS_ALL_release6_gender
  --make-bed
  --out AAC/PRS_ALL_release6_gender_no_duplicates
  --rm-dup force-first

Start time: Mon Mar  4 16:12:51 2024
386344 MiB RAM detected; reserving 193172 MiB for main workspace.
Using up to 72 threads (change this with --threads).
1073 samples (629 females, 444 males; 1073 founders) loaded from
AAC/PRS_ALL_release6_gender.fam.
104 variants loaded from AAC/PRS_ALL_release6_gender.bim.
1 binary phenotype loaded (257 cases, 808 controls).
--rm-dup: 14 duplicated IDs, 14 variants removed.
Writing AAC/PRS_ALL_release6_gender_no_duplicates.fam ... done.
Writing AAC/PRS_ALL_release6_gender_no_duplicates.bim ... done.
Writing AAC/PRS_ALL_release6_gender_no_duplicates.bed ... done.
End time: Mon Mar  4 16:12:51 202

## Estimate profiles

In [10]:
module load plink/1.9

[-] Unloading plink  2.0-alpha 
[+] Loading plink  1.9.0-beta4.4  on cn2458 

The following have been reloaded with a version change:
  1) plink/2.0-alpha => plink/1.9.0-beta4.4



In [11]:
# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    # Change directory to the specified ancestry
 
    # Calculate scores using plink
    plink \
    --bfile "$ANCESTRY/"PRS_ALL_release6_gender_no_duplicates \
    --score ${WORK_DIR}/90LOCI_AFRICAN_CHR_POS.txt \
    --out "$ANCESTRY/"PRS_score_release_AFRICANS

    # Return to the original directory
    cd /data/CARD_training/imputed_data/
done

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AAC/PRS_score_release_AFRICANS.log.
Options in effect:
  --bfile AAC/PRS_ALL_release6_gender_no_duplicates
  --out AAC/PRS_score_release_AFRICANS
  --score /data/GP2/projects/PRS_multi_ancestry_Nov2023/90LOCI_AFRICAN_CHR_POS.txt

386344 MB RAM detected; reserving 193172 MB for main workspace.
90 variants loaded from .bim file.
1073 people (444 males, 629 females) loaded from .fam.
1065 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 1073 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.9895.
90 variants and 1073 people 

In [None]:
# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    plink \
    --bfile "$ANCESTRY/"PRS_ALL_release6_gender_no_duplicates \
    --score ${WORK_DIR}/META_FOO_23ANDME_90LOCI_EASTASIAN_CHR_POS.txt \
    --out "$ANCESTRY/"PRS_score_release_EASTASIANS

    # Return to the original directory
    cd -
done

In [None]:
# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    plink \
    --bfile "$ANCESTRY/"PRS_ALL_release6_gender_no_duplicates \
    --score ${WORK_DIR}/META_LOESCH_23ANDME_90LOCI_LATINO_CHR_POS.txt \
    --out "$ANCESTRY/"PRS_score_release_LATINO

    # Return to the original directory
    cd /data/CARD_training/imputed_data
done

In [None]:
# List of ancestries
ANCESTRIES=("AAC" "AFR" "AJ" "AMR" "EAS" "EUR" "CAS")

# Iterate over ancestries
for ANCESTRY in "${ANCESTRIES[@]}"; do
    plink \
    --bfile "$ANCESTRY/"PRS_ALL_release6_gender_no_duplicates \
    --score ${WORK_DIR}/90LOCI_EUROPEANS_CHR_POS.txt \
    --out "$ANCESTRY/"PRS_score_release_EUROPEAN

    # Return to the original directory
    cd /data/CARD_training/imputed_data
done