# Generating Multi-ancestry Base Data
- **Project:** Multi-ancestry PRS
- **Version:** Python/3.9
- **Status:** COMPLETE
- **Last Updated:** 16-NOV-2023

## Notebook Overview
- Generate metaGWAS - summary statistics for EAS (Foo + 23andMe)
- Generate metaGWAS - summary statistics for AMR (Loesch + 23andMe)

## Generate metaGWAS - summary statistics for EAS (Foo + 23andMe)

In [None]:
## switch kernel to bash
## Move into the project working directory. The variable is quoted so a path
## containing spaces or glob characters is passed to cd as a single argument.
cd "${WORK_DIR}"

In [None]:
## switch kernel to R
## Extract 90 SNPs from Foo et al.
## Reads the Foo et al. summary statistics and the risk-locus list, keeps the
## rows whose "BP" value occurs in both tables, and writes the columns used by
## the METAL step to Foo_90riskloci.txt (tab-separated, with a header row).
library(data.table)
sumstats <- fread(
  "{WORK_DIR}/summary_stats/asian_GWAS/6724PDcases-24851controls-5843213snps-summary-stats-metaP-SE.txt",
  header = TRUE
)
risk_loci <- fread(
  "{WORK_DIR}/summary_stats/asian_GWAS/90riskloci.txt",
  header = TRUE
)
## NOTE(review): the join key is the base-pair position only. If both tables
## also carry a chromosome column, confirm that no position matches a variant
## on a different chromosome.
merged <- merge(sumstats, risk_loci, by = "BP")
head(merged)
## merge() suffixes clashing column names with .x/.y, so "SNP.y" is the SNP
## column that came from the risk-locus table.
foo_subset <- merged[, c("SNP.y", "BETA", "SE", "P", "A1", "A2")]
write.table(
  foo_subset,
  file = "{WORK_DIR}/Foo_90riskloci.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)
## 69 risk SNPs

In [None]:
## Extract 90 SNPs from 23andMe EAS
## Reads the 23andMe East Asian summary statistics and the risk-locus list,
## joins them on position, and writes the columns used by the METAL step to
## 23andMe_90riskloci.txt (tab-separated, with a header row).
## NOTE: the LATINO extraction cell later writes to this same file name, so
## this output is overwritten once that cell runs.
library(data.table)
sumstats <- fread(
  "{WORK_DIR}/summary_stats/23andMe/RISK/filtered_sumstats_23andme_EASTASIAN_PD.hg19.txt",
  header = TRUE
)
risk_loci <- fread(
  "{WORK_DIR}/summary_stats/asian_GWAS/90riskloci.txt",
  header = TRUE
)
## Rename the third column of the risk-locus table so it shares the join key
## name used by the 23andMe file.
names(risk_loci)[3] <- "position"
## NOTE(review): the join key is the position only -- confirm that no position
## matches a variant on a different chromosome.
merged <- merge(sumstats, risk_loci, by = "position")
head(merged)
eas_subset <- merged[, c("SNP", "effect", "stderr", "pvalue", "effect_allele", "alt_allele")]
write.table(
  eas_subset,
  file = "{WORK_DIR}/23andMe_90riskloci.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)

In [None]:
## Load METAL for meta-analysis
## Makes the `metal` executable used by the meta-analysis cells available.
## NOTE(review): `module` is presumably provided by the cluster's
## environment-modules setup and must be run in the bash kernel -- confirm.
module load metal

In [None]:
## switch kernel to bash
## Now meta-analyze
## Starts METAL and gives it the commands below: a meta-analysis of
## Foo_90riskloci.txt and 23andMe_90riskloci.txt for the East Asian data.
metal
# SCHEME STDERR: combine studies from their effect sizes and standard errors
# (inverse-variance weighting, per the METAL documentation).
SCHEME STDERR
# Genomic-control correction is switched ON for the input files.
# NOTE(review): both inputs are restricted to ~90 known risk loci, so the
# inflation factor would be estimated from an enriched SNP set -- confirm
# that applying genomic control here is intended.
GENOMICCONTROL ON

# === DESCRIBE AND PROCESS THE FIRST INPUT FILE ===
# Column names match the header written by the Foo et al. extraction cell.
MARKER SNP.y
ALLELE A1 A2
EFFECT BETA
STDERR SE
PVALUE P
PROCESS Foo_90riskloci.txt

# === DESCRIBE AND PROCESS THE SECOND INPUT FILE ===
# Column names match the header written by the 23andMe EAS extraction cell.
MARKER SNP
ALLELE effect_allele alt_allele
EFFECT effect
STDERR stderr
PVALUE pvalue
PROCESS 23andMe_90riskloci.txt

# Output file prefix and suffix; the recorded run log shows the results were
# written to ASIAN1.tbl (plus ASIAN1.tbl.info).
OUTFILE ASIAN .tbl
# Run the meta-analysis with a second pass that evaluates heterogeneity.
ANALYZE HETEROGENEITY
QUIT

In [None]:
###########################################################################
## Running second pass analysis to evaluate heterogeneity...
## Processing file '23andMe_90riskloci.txt'
## Processing file 'Foo_90riskloci.txt'

###########################################################################
## Executing meta-analysis ...
## Complete results will be stored in file 'ASIAN1.tbl'
## Column descriptions will be stored in file 'ASIAN1.tbl.info'
## Completed meta-analysis for 84 markers!
## Smallest p-value is 0.0001245 at marker 'rs356182'

In [None]:
## Remove SNPs only present in one of the datasets
## Drops every line of ASIAN1.tbl that contains a literal "?" and writes the
## rest to the scratch file `temp`. In METAL output a "?" in the Direction
## column marks a study with no data for that marker (per the METAL docs).
## -F makes the literal (non-regex) match explicit; the variable is quoted so
## a path with spaces is passed to cd as a single argument.
cd "${WORK_DIR}"
grep -vF "?" ASIAN1.tbl > temp

In [None]:
## switch kernel to R
## Now convert all to hg38
## Joins the filtered meta-analysis results (`temp`) to the hg38 risk-locus
## list by marker name and writes a three-column, header-less, tab-separated
## score file: markerID, upper-cased Allele1, Effect.
library(data.table)
meta_results <- fread("{WORK_DIR}/temp", header = TRUE)
loci_hg38 <- fread(
  "{WORK_DIR}/summary_stats/asian_GWAS/90riskloci_38.txt",
  header = TRUE
)
## Rename the first column of the hg38 list so it matches METAL's MarkerName
## column and can be used as the join key.
names(loci_hg38)[1] <- "MarkerName"
merged <- merge(meta_results, loci_hg38, by = "MarkerName")
## Upper-case Allele1 for the score file.
merged$A1cap <- toupper(merged$Allele1)
score <- merged[, c("markerID", "A1cap", "Effect")]
write.table(
  score,
  file = "{WORK_DIR}/META_FOO_23ANDME_90LOCI_EASTASIAN_CHR_POS.txt",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE,
  sep = "\t"
)

## Generate metaGWAS - summary statistics for AMR (Loesch + 23andMe)

In [None]:
## Extract 90 SNPs from Loesch et al.
## Reads the Loesch et al. (hg38) summary statistics and the hg38 risk-locus
## list, joins them on base-pair position, derives a standard-error column,
## and writes the columns used by the METAL step to Loesch_90riskloci.txt
## (tab-separated, with a header row).
library(data.table)
sumstats <- fread(
  "{WORK_DIR}/Loesch_et_al_2021_Latam_no23andme_hg38.txt",
  header = TRUE
)
## The path placeholder is written as "{WORK_DIR}", the form used by every
## other R cell in this notebook (it was "${WORK_DIR}" here).
risk_loci <- fread(
  "{WORK_DIR}/summary_stats/asian_GWAS/90riskloci_38.txt",
  header = TRUE
)
## Rename column 4 of the summary statistics so it can serve as the join key.
names(sumstats)[4] <- "BP"
## NOTE(review): the join key is the position only -- confirm that no position
## matches a variant on a different chromosome.
merged <- merge(sumstats, risk_loci, by = "BP")
## NOTE(review): beta / Score equals the standard error only if `Score` is the
## Z statistic (beta / SE). If `Score` is a raw score statistic (as in GENESIS
## output, where Est / Score = Est.SE^2) this yields the squared SE instead --
## confirm against the column definitions of the Loesch file. Rows with
## Score == 0 would also produce Inf/NaN here.
merged$SE <- merged$beta / merged$Score
head(merged)
loesch_subset <- merged[, c("markerID", "beta", "SE", "Score.pval", "ref", "alt")]
write.table(
  loesch_subset,
  file = "{WORK_DIR}/Loesch_90riskloci.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)

In [None]:
## Extract 90 SNPs from 23andMe LATINO
## Reads the 23andMe Latino summary statistics and the hg38 risk-locus list,
## joins them on position, and writes the columns used by the METAL step to
## 23andMe_90riskloci.txt (tab-separated, with a header row).
## NOTE: this is the same output path used by the 23andMe EAS extraction cell,
## so running this cell overwrites the East Asian file of the same name.
library(data.table)
sumstats <- fread(
  "{WORK_DIR}/summary_stats/23andMe/RISK/filtered_sumstats_23andme_LATINO_PD.txt",
  header = TRUE
)
risk_loci <- fread(
  "{WORK_DIR}/summary_stats/asian_GWAS/90riskloci_38.txt",
  header = TRUE
)
## Rename the third column of the risk-locus table so it shares the join key
## name used by the 23andMe file.
names(risk_loci)[3] <- "position"
## NOTE(review): the join key is the position only -- confirm that no position
## matches a variant on a different chromosome.
merged <- merge(sumstats, risk_loci, by = "position")
head(merged)
latino_subset <- merged[, c("markerID", "effect", "stderr", "pvalue", "effect_allele", "alt_allele")]
write.table(
  latino_subset,
  file = "{WORK_DIR}/23andMe_90riskloci.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)

In [None]:
## switch kernel to bash
## Now meta-analyze
## Starts METAL and gives it the commands below: a meta-analysis of
## Loesch_90riskloci.txt and 23andMe_90riskloci.txt for the Latino data.
metal
# SCHEME STDERR: combine studies from their effect sizes and standard errors
# (inverse-variance weighting, per the METAL documentation).
SCHEME STDERR
# Genomic-control correction is switched ON for the input files.
# NOTE(review): both inputs are restricted to ~90 known risk loci, so the
# inflation factor would be estimated from an enriched SNP set -- confirm
# that applying genomic control here is intended.
GENOMICCONTROL ON

# === DESCRIBE AND PROCESS THE FIRST INPUT FILE ===
# Column names match the header written by the Loesch et al. extraction cell.
MARKER markerID
ALLELE ref alt
EFFECT beta
STDERR SE
PVALUE Score.pval
PROCESS Loesch_90riskloci.txt

# === DESCRIBE AND PROCESS THE SECOND INPUT FILE ===
# Column names match the header written by the 23andMe LATINO extraction cell.
MARKER markerID
ALLELE effect_allele alt_allele
EFFECT effect
STDERR stderr
PVALUE pvalue
PROCESS 23andMe_90riskloci.txt

# Output file prefix and suffix; the recorded run log shows the results were
# written to LATINO1.tbl (plus LATINO1.tbl.info).
OUTFILE LATINO .tbl
# Run the meta-analysis with a second pass that evaluates heterogeneity.
ANALYZE HETEROGENEITY

QUIT

In [None]:
###########################################################################
## Running second pass analysis to evaluate heterogeneity...
## Processing file '23andMe_90riskloci.txt'
## Processing file 'Loesch_90riskloci.txt'

###########################################################################
## Executing meta-analysis ...
## Complete results will be stored in file 'LATINO1.tbl'
## Column descriptions will be stored in file 'LATINO1.tbl.info'
## Completed meta-analysis for 87 markers!
## Smallest p-value is 0.02353 at marker '12:122842051'

In [None]:
## Remove SNPs only present in one of the datasets
## Drops every line of LATINO1.tbl that contains a literal "?" and writes the
## rest to the scratch file `temp2`. In METAL output a "?" in the Direction
## column marks a study with no data for that marker (per the METAL docs).
## -F makes the literal (non-regex) match explicit; the variable is quoted so
## a path with spaces is passed to cd as a single argument.
cd "${WORK_DIR}"
grep -vF "?" LATINO1.tbl > temp2

In [None]:
## switch kernel to R
## Now extract columns for score file
## Reads the filtered Latino meta-analysis results (`temp2`) and writes a
## three-column, header-less, tab-separated score file: MarkerName,
## upper-cased Allele1, Effect.
library(data.table)
meta_results <- fread("{WORK_DIR}/temp2", header = TRUE)
## Upper-case Allele1 for the score file.
meta_results$A1cap <- toupper(meta_results$Allele1)
score <- meta_results[, c("MarkerName", "A1cap", "Effect")]
write.table(
  score,
  file = "{WORK_DIR}/META_LOESCH_23ANDME_90LOCI_LATINO_CHR_POS.txt",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE,
  sep = "\t"
)

In [None]:
## Make output directories and organize files
## -p keeps the cell re-runnable: it does not fail when a directory already
## exists. Paths are quoted so a WORK_DIR containing spaces stays one argument.
mkdir -p "${WORK_DIR}/relatedness/"
mkdir -p "${WORK_DIR}/meta/"
mkdir -p "${WORK_DIR}/scores/"

In [None]:
## Sort the generated files into the output directories and remove the
## scratch files written by the grep filter cells.
## The cd target is quoted so a path with spaces is passed as one argument.
cd "${WORK_DIR}"
## Relatedness-related inputs
mv *_related relatedness/
mv toextract* relatedness/
mv release5_* relatedness/
## Meta-analysis outputs (METAL tables and final score files)
mv META_* meta/
mv ASIAN* meta/
mv LATINO* meta/
## Per-study risk-locus extracts that were fed to METAL
mv *riskloci.txt scores/
## Scratch files
rm temp
rm temp2