# Investigating Genome-wide Significant Hits from Meta-GWASes
- **Project:** GP2 AFR-AAC meta-GWAS 
- **Version:** Python/3.9
- **Status:** COMPLETE
- **Started:** 22-FEB-2023
- **Last Updated:** 07-APR-2023
    - **Update Description:** Re-annotating to get conservation scores

## Notebook Overview
- Investigating the top hits from the joint meta-GWAS

### CHANGELOG
- 22-FEB-2023: Notebook started 
- 08-MAR-2023: Annotating full summary statistics 
- 07-APR-2023: Re-annotating to get conservation scores

---
# Data Overview 

| ANCESTRY |     DATASET     | CASES | CONTROLS |  TOTAL  |           ARRAY           |                NOTES                |
|:--------:|:---------------:|:-----:|:--------:|:-------------------------:|:---------------------------------------------------------------------------------------------------------------:|:-----------------------------------:|
|    AFR   | IPDGC – Nigeria |  304  |    285   |   589   |         NeuroChip         | . | 
|    AFR   |  GP2  |  711  |   1,011  |  1,722  |        NeuroBooster       | . |
|    AAC   |  GP2 |  185  |   1,149  |  1,334  |        NeuroBooster       | . | 
|    AAC   |     23andMe     |  288  |  193,985 | 194,273 | Omni Express & GSA & 550k |        Just summary statistics       |

# Getting Started

## Importing packages

In [3]:
## Import the necessary packages 
import os
import numpy as np
import pandas as pd
import math
import numbers
import sys
import subprocess
import statsmodels.api as sm
import scipy
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

## Print out package versions
## Getting packages loaded into this notebook and their versions to allow for reproducibility
    # Repurposed code from stackoverflow here: https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook

## Import packages 
import pkg_resources
import types
from datetime import date
today = date.today()
date = today.strftime("%d-%b-%Y").upper()

## Define function 
def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        # Some packages are weird and have different imported names vs. system/pip names
        # Unfortunately, there is no systematic way to get pip names from a package's imported name. You'll have to add exceptions to this list manually!
        poorly_named_packages = {
            "PIL": "Pillow",
            "sklearn": "scikit-learn"
        }
        if name in poorly_named_packages.keys():
            name = poorly_named_packages[name]

        yield name

## Get a list of packages imported 
imports = list(set(get_imports()))

# The only way I found to get the version of the root package from only the name of the package is to cross-check the names of installed packages vs. imported packages
requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))

## Print out packages and versions 
print(f"PACKAGE VERSIONS ({date})")
for r in requirements:
    print("\t{}=={}".format(*r))

PACKAGE VERSIONS (07-APR-2023)
	matplotlib==3.5.3
	numpy==1.23.5
	pandas==1.5.3
	scipy==1.8.1
	seaborn==0.12.2
	statsmodels==0.13.5


# African and African Admixed Meta-GWAS

## Figure out left- and right-most basepairs 

In [4]:
top_genomewide_hits = pd.read_csv(f"{WORK_DIR}/data/AFR-AAC-META/genomewide-hits-extractPVAR.txt", sep="\t", header=None)
top_genomewide_hits.columns = ['CHR', 'BP', 'markerID', 'REF', 'ALT', 'QUALITY', 'INFO']
top_genomewide_hits[['IMPUTE_AF','IMPUTE_MAF','R2', 'ER2','TYPE']] = top_genomewide_hits['INFO'].str.split(';',expand=True)
top_genomewide_hits.drop(columns=['INFO'], inplace=True)

top_genomewide_hits['IMPUTE_AF'] = top_genomewide_hits['IMPUTE_AF'].str.split('=').str[1]
top_genomewide_hits['IMPUTE_MAF'] = top_genomewide_hits['IMPUTE_MAF'].str.split('=').str[1]
top_genomewide_hits['R2'] = top_genomewide_hits['R2'].str.split('=').str[1]
top_genomewide_hits['TYPE'] = np.where(top_genomewide_hits['ER2'] == "IMPUTED", "IMPUTED", "TYPED")
top_genomewide_hits['ER2'] = top_genomewide_hits['ER2'].str.split('=').str[1]

top_genomewide_hits['MarkerName'] = top_genomewide_hits['CHR'].astype(str) + ':' + top_genomewide_hits['BP'].astype(str)

top_genomewide_hits

Unnamed: 0,CHR,BP,markerID,REF,ALT,QUALITY,IMPUTE_AF,IMPUTE_MAF,R2,ER2,TYPE,MarkerName
0,1,155059314,chr1:155059314:G:A,G,A,PASS,0.18293,0.18293,0.98515,,IMPUTED,1:155059314
1,1,155060276,chr1:155060276:A:G,A,G,PASS,0.18934,0.18934,0.99966,0.98345,TYPED,1:155060276
2,1,155064184,chr1:155064184:G:C,G,C,PASS,0.18896,0.18896,0.99712,,IMPUTED,1:155064184
3,1,155142298,chr1:155142298:A:G,A,G,PASS,0.69876,0.30124,0.97002,,IMPUTED,1:155142298
4,1,155151361,chr1:155151361:G:C,G,C,PASS,0.35655,0.35655,0.98412,,IMPUTED,1:155151361
5,1,155155273,chr1:155155273:T:C,T,C,PASS,0.63345,0.36655,0.98113,,IMPUTED,1:155155273
6,1,155157915,chr1:155157915:G:A,G,A,PASS,0.63552,0.36448,0.99931,0.98899,TYPED,1:155157915
7,1,155160272,chr1:155160272:C:T,C,T,PASS,0.25913,0.25913,0.98373,,IMPUTED,1:155160272
8,1,155162859,chr1:155162859:T:C,T,C,PASS,0.62197,0.37803,1.0,0.99991,TYPED,1:155162859
9,1,155162930,chr1:155162930:G:A,G,A,PASS,0.62676,0.37324,1.0,1.0,TYPED,1:155162930


In [38]:
top_genomewide_hits_subset = top_genomewide_hits[['markerID', 'MarkerName', 'R2', 'TYPE']].copy()
top_genomewide_hits_subset.columns = ['MarkerName', 'markerID', 'R2', 'TYPE']
top_genomewide_hits_subset

Unnamed: 0,MarkerName,markerID,R2,TYPE
0,chr1:155059314:G:A,1:155059314,0.98515,IMPUTED
1,chr1:155060276:A:G,1:155060276,0.99966,TYPED
2,chr1:155064184:G:C,1:155064184,0.99712,IMPUTED
3,chr1:155142298:A:G,1:155142298,0.97002,IMPUTED
4,chr1:155151361:G:C,1:155151361,0.98412,IMPUTED
5,chr1:155155273:T:C,1:155155273,0.98113,IMPUTED
6,chr1:155157915:G:A,1:155157915,0.99931,TYPED
7,chr1:155160272:C:T,1:155160272,0.98373,IMPUTED
8,chr1:155162859:T:C,1:155162859,1.0,TYPED
9,chr1:155162930:G:A,1:155162930,1.0,TYPED


## Make subset of METAL for top hits

In [27]:
%%bash

head -1 ${WORK_DIR}/data/AFR-AAC-META/AFR-AAC-META-UpdatedforMETAL1.tbl > ${WORK_DIR}/data/AFR-AAC-META/genomewide-hits_allINFO.txt
cat ${WORK_DIR}/data/AFR-AAC-META/AFR-AAC-META-UpdatedforMETAL1.tbl | awk '$6 <= 0.00000005' >> ${WORK_DIR}/data/AFR-AAC-META/genomewide-hits_allINFO.txt

### Generate ANNOVAR Input

In [28]:
## Make ANNOVAR input 
# Format 
    # Chr Start End Ref Alt 
    # Start and End for SNPs must be the same!!
    
annovar_df = pd.DataFrame()
annovar_df['Chr'] = top_genomewide_hits['CHR']
annovar_df['Start'] = top_genomewide_hits['BP']
annovar_df['End'] = top_genomewide_hits['BP']
annovar_df['Ref'] = top_genomewide_hits['REF']
annovar_df['Alt'] = top_genomewide_hits['ALT']
annovar_df.to_csv(f"{WORK_DIR}/data/AFR-AAC-META/genomewide-hits_forANNOVAR.txt", sep=" ", index=False, header=None)
annovar_df

Unnamed: 0,Chr,Start,End,Ref,Alt
0,1,155059314,155059314,G,A
1,1,155060276,155060276,A,G
2,1,155064184,155064184,G,C
3,1,155142298,155142298,A,G
4,1,155151361,155151361,G,C
5,1,155155273,155155273,T,C
6,1,155157915,155157915,G,A
7,1,155160272,155160272,C,T
8,1,155162859,155162859,T,C
9,1,155162930,155162930,G,A


### Run ANNOVAR

In [None]:
%%bash

module load annovar

table_annovar.pl ${WORK_DIR}/data/AFR-AAC-META/genomewide-hits_forANNOVAR.txt $ANNOVAR_DATA/hg38 \
-buildver hg38 \
-out ${WORK_DIR}/data/other/ANNOVAR/genomewide-hits_ANNOVAR \
-arg '-splicing 15',,, \
-remove \
-protocol refGene,avsnp150,ljb26_all,gnomad312_genome \
-operation g,f,f,f \
-nastring . \
-polish

### Process ANNOVAR Output 

In [39]:
annovar_output = pd.read_csv(f"{WORK_DIR}/data/other/ANNOVAR/genomewide-hits_ANNOVAR.hg38_multianno.txt", sep="\t")
annovar_output['markerID'] = annovar_output['Chr'].astype(str) + ':' + annovar_output['Start'].astype(str)
annovar_output

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,gnomad312_AF_ami,gnomad312_AF_amr,gnomad312_AF_asj,gnomad312_AF_eas,gnomad312_AF_fin,gnomad312_AF_mid,gnomad312_AF_nfe,gnomad312_AF_oth,gnomad312_AF_sas,markerID
0,1,155059314,155059314,G,A,ncRNA_intronic,DCST1-AS1,.,.,.,...,0.0,0.0106,0.0,0.0,0.0,0.0,0.0003,0.023,0.0004,1:155059314
1,1,155060276,155060276,A,G,exonic,ADAM15,.,nonsynonymous SNV,"ADAM15:NM_001261464:exon18:c.A2170G:p.S724G,AD...",...,0.0,0.0103,0.0,0.0,0.0,0.0,0.0002,0.024,0.0002,1:155060276
2,1,155064184,155064184,G,C,intronic,EFNA4,.,.,.,...,0.0,0.0103,0.0,0.0,0.0,0.0,0.0002,0.0239,0.0002,1:155064184
3,1,155142298,155142298,A,G,intergenic,DPM3;KRTCAP2,dist=1767;dist=27110,.,.,...,1.0,0.9789,1.0,1.0,1.0,0.9842,0.9997,0.9608,0.9992,1:155142298
4,1,155151361,155151361,G,C,intergenic,DPM3;KRTCAP2,dist=10830;dist=18047,.,.,...,0.3147,0.2839,0.4288,0.1664,0.3748,0.4304,0.3955,0.3497,0.3325,1:155151361
5,1,155155273,155155273,T,C,intergenic,DPM3;KRTCAP2,dist=14742;dist=14135,.,.,...,0.4901,0.5977,0.4085,0.8205,0.4752,0.4367,0.4671,0.525,0.5274,1:155155273
6,1,155157915,155157915,G,A,intergenic,DPM3;KRTCAP2,dist=17384;dist=11493,.,.,...,0.4923,0.5991,0.4294,0.8198,0.4752,0.465,0.4675,0.5268,0.5282,1:155157915
7,1,155160272,155160272,C,T,intergenic,DPM3;KRTCAP2,dist=19741;dist=9136,.,.,...,0.0,0.0157,0.0,0.0,0.0,0.0127,0.0005,0.0307,0.0006,1:155160272
8,1,155162859,155162859,T,C,intergenic,DPM3;KRTCAP2,dist=22328;dist=6549,.,.,...,0.4912,0.5971,0.4069,0.8038,0.5264,0.4395,0.4785,0.5298,0.5303,1:155162859
9,1,155162930,155162930,G,A,intergenic,DPM3;KRTCAP2,dist=22399;dist=6478,.,.,...,0.4923,0.5991,0.4075,0.8039,0.5263,0.443,0.4784,0.5311,0.5294,1:155162930


In [40]:
annovar_output.columns

Index(['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene',
       'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene',
       'avsnp150', 'SIFT_score', 'SIFT_pred', 'Polyphen2_HDIV_score',
       'Polyphen2_HDIV_pred', 'Polyphen2_HVAR_score', 'Polyphen2_HVAR_pred',
       'LRT_score', 'LRT_pred', 'MutationTaster_score', 'MutationTaster_pred',
       'MutationAssessor_score', 'MutationAssessor_pred', 'FATHMM_score',
       'FATHMM_pred', 'RadialSVM_score', 'RadialSVM_pred', 'LR_score',
       'LR_pred', 'VEST3_score', 'CADD_raw', 'CADD_phred', 'GERP++_RS',
       'phyloP46way_placental', 'phyloP100way_vertebrate',
       'SiPhy_29way_logOdds', 'gnomad312_AF', 'gnomad312_AF_raw',
       'gnomad312_AF_XX', 'gnomad312_AF_XY', 'gnomad312_AF_popmax',
       'gnomad312_AF_faf95_popmax', 'gnomad312_AF_afr', 'gnomad312_AF_ami',
       'gnomad312_AF_amr', 'gnomad312_AF_asj', 'gnomad312_AF_eas',
       'gnomad312_AF_fin', 'gnomad312_AF_mid', 'gnomad312_AF_nfe',
       

In [41]:
rearrange_annovar_output = annovar_output[['markerID', 'avsnp150', 
                                           'Chr', 'Start', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene','GeneDetail.refGene', 'CADD_phred', 
                                           'gnomad312_AF', 'gnomad312_AF_raw','gnomad312_AF_XX', 'gnomad312_AF_XY', 'gnomad312_AF_popmax',
                                           'gnomad312_AF_faf95_popmax', 'gnomad312_AF_afr', 'gnomad312_AF_ami','gnomad312_AF_amr', 'gnomad312_AF_asj',
                                           'gnomad312_AF_eas','gnomad312_AF_fin', 'gnomad312_AF_mid', 'gnomad312_AF_nfe','gnomad312_AF_oth', 'gnomad312_AF_sas']].copy()

## Process METAL top hits

In [42]:
metal_tophits = pd.read_csv(f"{WORK_DIR}/data/AFR-AAC-META/genomewide-hits_allINFO.txt", sep="\t")
metal_tophits.sort_values('P-value').reset_index(inplace=True)
metal_tophits

Unnamed: 0,MarkerName,Allele1,Allele2,Effect,StdErr,P-value,Direction,HetISq,HetChiSq,HetDf,HetPVal
0,chr1:155235878:G:T,t,g,-0.4494,0.0589,2.397e-14,----,0.0,2.878,3,0.4108
1,chr1:155167551:T:C,t,c,0.2868,0.0509,1.749e-08,++++,54.0,6.52,3,0.08888
2,chr1:155165746:T:C,t,c,0.2902,0.0501,7.086e-09,++++,55.3,6.706,3,0.08189
3,chr1:155317797:C:T,t,c,-0.3894,0.0625,4.685e-10,----,29.2,4.236,3,0.2371
4,chr1:155394894:G:A,a,g,-0.3913,0.0627,4.317e-10,----,17.7,3.646,3,0.3023
5,chr1:155490050:A:G,a,g,0.424,0.0631,1.871e-11,++++,0.0,1.806,3,0.6137
6,chr1:155060276:A:G,a,g,-0.3735,0.0641,5.688e-09,----,0.0,2.298,3,0.5129
7,chr1:155168172:C:T,t,c,-0.2854,0.05,1.125e-08,----,49.7,5.959,3,0.1136
8,chr1:155160272:C:T,t,c,0.4367,0.0583,6.724e-14,++++,34.2,4.558,3,0.2072
9,chr1:155619507:T:C,t,c,0.3757,0.0641,4.663e-09,++++,0.0,1.521,3,0.6775


# Merge and Save

In [43]:
merge_hits_imputationscores = pd.merge(metal_tophits, top_genomewide_hits_subset, how="left", on="MarkerName")

In [44]:
rearrange_annovar_output

Unnamed: 0,markerID,avsnp150,Chr,Start,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,CADD_phred,...,gnomad312_AF_afr,gnomad312_AF_ami,gnomad312_AF_amr,gnomad312_AF_asj,gnomad312_AF_eas,gnomad312_AF_fin,gnomad312_AF_mid,gnomad312_AF_nfe,gnomad312_AF_oth,gnomad312_AF_sas
0,1:155059314,rs143764007,1,155059314,G,A,ncRNA_intronic,DCST1-AS1,.,.,...,0.1112,0.0,0.0106,0.0,0.0,0.0,0.0,0.0003,0.023,0.0004
1,1:155060276,rs61751624,1,155060276,A,G,exonic,ADAM15,.,22.2,...,0.1138,0.0,0.0103,0.0,0.0,0.0,0.0,0.0002,0.024,0.0002
2,1:155064184,rs115212960,1,155064184,G,C,intronic,EFNA4,.,.,...,0.1135,0.0,0.0103,0.0,0.0,0.0,0.0,0.0002,0.0239,0.0002
3,1:155142298,rs6413882,1,155142298,A,G,intergenic,DPM3;KRTCAP2,dist=1767;dist=27110,.,...,0.8004,1.0,0.9789,1.0,1.0,1.0,0.9842,0.9997,0.9608,0.9992
4,1:155151361,rs6676150,1,155151361,G,C,intergenic,DPM3;KRTCAP2,dist=10830;dist=18047,.,...,0.2997,0.3147,0.2839,0.4288,0.1664,0.3748,0.4304,0.3955,0.3497,0.3325
5,1:155155273,rs7535292,1,155155273,T,C,intergenic,DPM3;KRTCAP2,dist=14742;dist=14135,.,...,0.6743,0.4901,0.5977,0.4085,0.8205,0.4752,0.4367,0.4671,0.525,0.5274
6,1:155157915,rs4971079,1,155157915,G,A,intergenic,DPM3;KRTCAP2,dist=17384;dist=11493,.,...,0.6749,0.4923,0.5991,0.4294,0.8198,0.4752,0.465,0.4675,0.5268,0.5282
7,1:155160272,rs59025885,1,155160272,C,T,intergenic,DPM3;KRTCAP2,dist=19741;dist=9136,.,...,0.1585,0.0,0.0157,0.0,0.0,0.0,0.0127,0.0005,0.0307,0.0006
8,1:155162859,rs4460629,1,155162859,T,C,intergenic,DPM3;KRTCAP2,dist=22328;dist=6549,.,...,0.6696,0.4912,0.5971,0.4069,0.8038,0.5264,0.4395,0.4785,0.5298,0.5303
9,1:155162930,rs12752585,1,155162930,G,A,intergenic,DPM3;KRTCAP2,dist=22399;dist=6478,.,...,0.6725,0.4923,0.5991,0.4075,0.8039,0.5263,0.443,0.4784,0.5311,0.5294


In [45]:
merge_hits_imputationscores_anno = pd.merge(merge_hits_imputationscores, rearrange_annovar_output, how="left", on="markerID")
merge_hits_imputationscores_anno.sort_values('P-value', inplace=True)
merge_hits_imputationscores_anno.reset_index(inplace=True, drop=True)
merge_hits_imputationscores_anno

Unnamed: 0,MarkerName,Allele1,Allele2,Effect,StdErr,P-value,Direction,HetISq,HetChiSq,HetDf,...,gnomad312_AF_afr,gnomad312_AF_ami,gnomad312_AF_amr,gnomad312_AF_asj,gnomad312_AF_eas,gnomad312_AF_fin,gnomad312_AF_mid,gnomad312_AF_nfe,gnomad312_AF_oth,gnomad312_AF_sas
0,chr1:155235878:G:T,t,g,-0.4494,0.0589,2.397e-14,----,0.0,2.878,3,...,0.8407,0.9978,0.9828,0.9988,0.9996,0.9985,0.9767,0.998,0.97,0.9985
1,chr1:155160272:C:T,t,c,0.4367,0.0583,6.724e-14,++++,34.2,4.558,3,...,0.1585,0.0,0.0157,0.0,0.0,0.0,0.0127,0.0005,0.0307,0.0006
2,chr1:155490050:A:G,a,g,0.424,0.0631,1.871e-11,++++,0.0,1.806,3,...,0.8584,1.0,0.9847,1.0,1.0,1.0,0.9873,0.9997,0.9759,0.9994
3,chr1:155517618:A:G,a,g,0.4229,0.0633,2.379e-11,++++,0.0,1.838,3,...,0.8583,1.0,0.9848,1.0,1.0,1.0,0.9873,0.9998,0.9759,0.9994
4,chr1:155281459:T:C,t,c,0.4123,0.0624,3.95e-11,++++,36.1,4.698,3,...,0.86,1.0,0.985,1.0,0.9998,0.9997,0.9873,0.9997,0.9751,0.9992
5,chr1:155447428:T:C,t,c,0.4151,0.0631,4.74e-11,++++,0.0,2.682,3,...,0.859,1.0,0.9848,1.0,1.0,0.9999,0.9873,0.9997,0.9756,0.9936
6,chr1:155438844:T:C,t,c,0.4141,0.063,5.002e-11,++++,0.0,2.68,3,...,0.859,1.0,0.9847,1.0,0.9998,1.0,0.9873,0.9997,0.9756,0.9994
7,chr1:155394894:G:A,a,g,-0.3913,0.0627,4.317e-10,----,17.7,3.646,3,...,0.8585,1.0,0.9846,1.0,1.0,0.9999,0.9873,0.9997,0.9756,0.9994
8,chr1:155317797:C:T,t,c,-0.3894,0.0625,4.685e-10,----,29.2,4.236,3,...,0.8603,1.0,0.9851,1.0,1.0,1.0,0.9873,0.9998,0.9756,0.9994
9,chr1:155324483:A:G,a,g,0.3891,0.0625,4.823e-10,++++,29.3,4.241,3,...,0.8587,1.0,0.9846,1.0,1.0,1.0,0.9873,0.9998,0.9756,0.9994


In [46]:
merge_hits_imputationscores_anno.columns

Index(['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value',
       'Direction', 'HetISq', 'HetChiSq', 'HetDf', 'HetPVal', 'markerID', 'R2',
       'TYPE', 'avsnp150', 'Chr', 'Start', 'Ref', 'Alt', 'Func.refGene',
       'Gene.refGene', 'GeneDetail.refGene', 'CADD_phred', 'gnomad312_AF',
       'gnomad312_AF_raw', 'gnomad312_AF_XX', 'gnomad312_AF_XY',
       'gnomad312_AF_popmax', 'gnomad312_AF_faf95_popmax', 'gnomad312_AF_afr',
       'gnomad312_AF_ami', 'gnomad312_AF_amr', 'gnomad312_AF_asj',
       'gnomad312_AF_eas', 'gnomad312_AF_fin', 'gnomad312_AF_mid',
       'gnomad312_AF_nfe', 'gnomad312_AF_oth', 'gnomad312_AF_sas'],
      dtype='object')

In [47]:
rearrange_merge = merge_hits_imputationscores_anno[['MarkerName', 'avsnp150', 
                                                    'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value', 'Direction', 'HetISq', 'HetChiSq', 'HetDf', 'HetPVal', 'R2', 'TYPE', 'Func.refGene',
                                                    'Gene.refGene', 'GeneDetail.refGene', 'CADD_phred', 
                                                    'gnomad312_AF', 'gnomad312_AF_raw', 'gnomad312_AF_XX', 'gnomad312_AF_XY', 'gnomad312_AF_popmax','gnomad312_AF_faf95_popmax', 
                                                    'gnomad312_AF_afr', 'gnomad312_AF_ami','gnomad312_AF_amr', 'gnomad312_AF_asj', 'gnomad312_AF_eas','gnomad312_AF_fin', 'gnomad312_AF_mid', 'gnomad312_AF_nfe',
                                                    'gnomad312_AF_oth', 'gnomad312_AF_sas']].copy()

rearrange_merge.rename(columns={'avsnp150':'rsID'}, inplace=True)
rearrange_merge['Allele1'] = rearrange_merge['Allele1'].str.upper()
rearrange_merge['Allele2'] = rearrange_merge['Allele2'].str.upper()

display(rearrange_merge)
rearrange_merge.to_csv(f"{WORK_DIR}/data/AFR-AAC-META/genomewide_hits_AFR_AAC_META_METAL_ANNOVAR_IMPUTATION_FEB2023.txt", sep="\t", index=False)

Unnamed: 0,MarkerName,rsID,Allele1,Allele2,Effect,StdErr,P-value,Direction,HetISq,HetChiSq,...,gnomad312_AF_afr,gnomad312_AF_ami,gnomad312_AF_amr,gnomad312_AF_asj,gnomad312_AF_eas,gnomad312_AF_fin,gnomad312_AF_mid,gnomad312_AF_nfe,gnomad312_AF_oth,gnomad312_AF_sas
0,chr1:155235878:G:T,rs3115534,T,G,-0.4494,0.0589,2.397e-14,----,0.0,2.878,...,0.8407,0.9978,0.9828,0.9988,0.9996,0.9985,0.9767,0.998,0.97,0.9985
1,chr1:155160272:C:T,rs59025885,T,C,0.4367,0.0583,6.724e-14,++++,34.2,4.558,...,0.1585,0.0,0.0157,0.0,0.0,0.0,0.0127,0.0005,0.0307,0.0006
2,chr1:155490050:A:G,rs4971081,A,G,0.424,0.0631,1.871e-11,++++,0.0,1.806,...,0.8584,1.0,0.9847,1.0,1.0,1.0,0.9873,0.9997,0.9759,0.9994
3,chr1:155517618:A:G,rs11264378,A,G,0.4229,0.0633,2.379e-11,++++,0.0,1.838,...,0.8583,1.0,0.9848,1.0,1.0,1.0,0.9873,0.9998,0.9759,0.9994
4,chr1:155281459:T:C,rs7367998,T,C,0.4123,0.0624,3.95e-11,++++,36.1,4.698,...,0.86,1.0,0.985,1.0,0.9998,0.9997,0.9873,0.9997,0.9751,0.9992
5,chr1:155447428:T:C,rs6668947,T,C,0.4151,0.0631,4.74e-11,++++,0.0,2.682,...,0.859,1.0,0.9848,1.0,1.0,0.9999,0.9873,0.9997,0.9756,0.9936
6,chr1:155438844:T:C,rs4971053,T,C,0.4141,0.063,5.002e-11,++++,0.0,2.68,...,0.859,1.0,0.9847,1.0,0.9998,1.0,0.9873,0.9997,0.9756,0.9994
7,chr1:155394894:G:A,rs7539746,A,G,-0.3913,0.0627,4.317e-10,----,17.7,3.646,...,0.8585,1.0,0.9846,1.0,1.0,0.9999,0.9873,0.9997,0.9756,0.9994
8,chr1:155317797:C:T,rs1409140,T,C,-0.3894,0.0625,4.685e-10,----,29.2,4.236,...,0.8603,1.0,0.9851,1.0,1.0,1.0,0.9873,0.9998,0.9756,0.9994
9,chr1:155324483:A:G,rs914616,A,G,0.3891,0.0625,4.823e-10,++++,29.3,4.241,...,0.8587,1.0,0.9846,1.0,1.0,1.0,0.9873,0.9998,0.9756,0.9994


# AFR ONLY Meta-GWAS Hits

## Process information from imputation server 

In [61]:
## Read and clean the information from the .pvar for the top hit
hits_df = pd.read_csv(f"{WORK_DIR}/data/AFR-META/top-AFR-ONLY-META-hits_extractPVAR.txt", sep="\t", header=None)
hits_df.columns = ['CHR', 'BP', 'markerID', 'REF', 'ALT', 'QUALITY', 'INFO']
hits_df[['IMPUTE_AF','IMPUTE_MAF','R2','TYPE']] = hits_df['INFO'].str.split(';',expand=True)
hits_df.drop(columns=['INFO'], inplace=True)

hits_df['IMPUTE_AF'] = hits_df['IMPUTE_AF'].str.split('=').str[1]
hits_df['IMPUTE_MAF'] = hits_df['IMPUTE_MAF'].str.split('=').str[1]
hits_df['R2'] = hits_df['R2'].str.split('=').str[1]
hits_df['TYPE'] = "IMPUTED"
#hits_df['ER2'] = hits_df['ER2'].str.split('=').str[1]

hits_df['MarkerName'] = hits_df['CHR'].astype(str) + ':' + hits_df['BP'].astype(str)
hits_df_subset = hits_df[['MarkerName', 'markerID', 'R2', 'TYPE']].copy()
hits_df_subset.columns = ['markerID', 'MarkerName', 'R2', 'TYPE']

hits_df_subset

Unnamed: 0,markerID,MarkerName,R2,TYPE
0,1:155160272,chr1:155160272:C:T,0.98373,IMPUTED
1,1:155235878,chr1:155235878:G:T,0.9706,IMPUTED


## Generate ANNOVAR input

In [62]:
## Make ANNOVAR input 
# Format 
    # Chr Start End Ref Alt 
    # Start and End for SNPs must be the same!!
    
annovar_df = pd.DataFrame()
annovar_df['Chr'] = hits_df['CHR']
annovar_df['Start'] = hits_df['BP']
annovar_df['End'] = hits_df['BP']
annovar_df['Ref'] = hits_df['REF']
annovar_df['Alt'] = hits_df['ALT']
annovar_df.to_csv(f"{WORK_DIR}/data/AFR-META/AFR-ONLY-META-hits_forANNOVAR.txt", sep=" ", index=False, header=None)
annovar_df

Unnamed: 0,Chr,Start,End,Ref,Alt
0,1,155160272,155160272,C,T
1,1,155235878,155235878,G,T


## Run ANNOVAR

In [None]:
%%bash

module load annovar

table_annovar.pl ${WORK_DIR}/data/AFR-META/AFR-ONLY-META-hits_forANNOVAR.txt $ANNOVAR_DATA/hg38 \
-buildver hg38 \
-out ${WORK_DIR}/data/other/ANNOVAR/AFR-ONLY-META-hits_ANNOVAR \
-arg '-splicing 15',,, \
-remove \
-protocol refGene,avsnp150,ljb26_all,gnomad312_genome \
-operation g,f,f,f \
-nastring . \
-polish

## Process ANNOVAR output

In [79]:
annovar_output = pd.read_csv(f"{WORK_DIR}/data/other/ANNOVAR/AFR-ONLY-META-hits_ANNOVAR.hg38_multianno.txt", sep="\t")
annovar_output['markerID'] = annovar_output['Chr'].astype(str) + ':' + annovar_output['Start'].astype(str)

rearrange_annovar_output = annovar_output[['markerID', 'avsnp150', 
                                           'Chr', 'Start', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene','GeneDetail.refGene', 'CADD_phred', 
                                           'gnomad312_AF', 'gnomad312_AF_raw','gnomad312_AF_XX', 'gnomad312_AF_XY', 'gnomad312_AF_popmax',
                                           'gnomad312_AF_faf95_popmax', 'gnomad312_AF_afr', 'gnomad312_AF_ami','gnomad312_AF_amr', 'gnomad312_AF_asj',
                                           'gnomad312_AF_eas','gnomad312_AF_fin', 'gnomad312_AF_mid', 'gnomad312_AF_nfe','gnomad312_AF_oth', 'gnomad312_AF_sas']].copy()

## Process METAL top hits

In [64]:
metal_tophits = pd.read_csv(f"{WORK_DIR}/data/AFR-META/top-AFR-ONLY-META-hits_allINFO.txt", sep="\t")
metal_tophits.sort_values('P-value').reset_index(inplace=True)

## Merge and Save

In [74]:
merge_hits_imputationscores = pd.merge(metal_tophits, hits_df_subset, how="left", on="MarkerName")
merge_hits_imputationscores

Unnamed: 0,MarkerName,Allele1,Allele2,Effect,StdErr,P-value,Direction,HetISq,HetChiSq,HetDf,HetPVal,markerID,R2,TYPE
0,chr1:155235878:G:T,t,g,-0.4579,0.0775,3.436e-09,--,20.9,1.264,1,0.2609,1:155235878,0.9706,IMPUTED
1,chr1:155160272:C:T,t,c,0.4178,0.0761,3.97e-08,++,32.8,1.488,1,0.2225,1:155160272,0.98373,IMPUTED


In [84]:
merge_hits_imputationscores_anno = pd.merge(merge_hits_imputationscores, rearrange_annovar_output, how="left", on="markerID")

merge_hits_imputationscores_anno.sort_values('P-value', inplace=True)
merge_hits_imputationscores_anno.reset_index(inplace=True, drop=True)
merge_hits_imputationscores_anno

rearrange_merge = merge_hits_imputationscores_anno[['MarkerName','avsnp150', 
                                                    'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value', 'Direction', 'HetISq', 'HetChiSq', 'HetDf', 'HetPVal', 'R2', 'TYPE', 'Func.refGene',
                                                    'Gene.refGene', 'GeneDetail.refGene', 'CADD_phred', 
                                                    'gnomad312_AF', 'gnomad312_AF_raw', 'gnomad312_AF_XX', 'gnomad312_AF_XY', 'gnomad312_AF_popmax','gnomad312_AF_faf95_popmax', 
                                                    'gnomad312_AF_afr', 'gnomad312_AF_ami','gnomad312_AF_amr', 'gnomad312_AF_asj', 'gnomad312_AF_eas','gnomad312_AF_fin', 'gnomad312_AF_mid', 'gnomad312_AF_nfe',
                                                    'gnomad312_AF_oth', 'gnomad312_AF_sas']].copy()

rearrange_merge.rename(columns={'avsnp150':'rsID'}, inplace=True)
rearrange_merge['Allele1'] = rearrange_merge['Allele1'].str.upper()
rearrange_merge['Allele2'] = rearrange_merge['Allele2'].str.upper()

rearrange_merge.to_csv(f"{WORK_DIR}/data/AFR-META/AFR-ONLY-META_METAL_ANNOVAR_IMPUTATION_FEB2023.txt", sep="\t", index=False)

In [83]:
rearrange_merge

Unnamed: 0,MarkerName,rsID,Allele1,Allele2,Effect,StdErr,P-value,Direction,HetISq,HetChiSq,...,gnomad312_AF_afr,gnomad312_AF_ami,gnomad312_AF_amr,gnomad312_AF_asj,gnomad312_AF_eas,gnomad312_AF_fin,gnomad312_AF_mid,gnomad312_AF_nfe,gnomad312_AF_oth,gnomad312_AF_sas
0,chr1:155235878:G:T,rs3115534,T,G,-0.4579,0.0775,3.436e-09,--,20.9,1.264,...,0.8407,0.9978,0.9828,0.9988,0.9996,0.9985,0.9767,0.998,0.97,0.9985
1,chr1:155160272:C:T,rs59025885,T,C,0.4178,0.0761,3.97e-08,++,32.8,1.488,...,0.1585,0.0,0.0157,0.0,0.0,0.0,0.0127,0.0005,0.0307,0.0006
