# Variants in DNAJC13 and their Association with Parkinson's Disease Across Different Ancestral Backgrounds

* **Project**: DNAJC13 Gene Assesment
* **Last updated**: September 2024
* **Version**: Python 3.9
* **Data**: GP2 Release 7

# Summary
This notebook is focused on analysing association between DNAJC13 variants in Admixed American/Latin American (AMR) ancestry and Parkinson's disease.

# Imports

In [1]:
# Use the os package to interact with the environment
import os
import sys

# Bring in Pandas for Dataframe functionality
import pandas as pd
from functools import reduce

# Bring some visualization functionality 
import seaborn as sns

# numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

  from IPython.core.display import display, HTML


In [2]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    print(f'Executing: {query}', file=sys.stderr)
    return pd.read_gbq(query, project_id=BILLING_PROJECT_ID, dialect='standard')

# Utility routine for display a message and a link
def display_html_link(description, link_text, url):
    html = f'''
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
    </p>
    '''

    display(HTML(html))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS"""
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Return a DataFrame from the contents of a delimited file in GCS"""
    return pd.read_csv(StringIO(gcs_read_file(path)), sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    url = '{}?{}'.format(
        os.path.join('https://console.cloud.google.com/storage/browser',
                     gcs_path.replace("gs://","")),
        urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID}))

    display_html_link(description, link_text, url)

In [None]:
# Set up billing project and data path variables
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

## AMP-PD v2.5
## Explicitly define release v2.5 path 
AMP_CLINICAL_RELEASE_PATH = f'{AMP_RELEASE_PATH}/clinical'

AMP_WGS_RELEASE_PLINK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'plink')
AMP_WGS_RELEASE_GATK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'gatk')

## Print the information to check we are in the proper release and billing 
## This will be different for you, the user, depending on the billing project your workspace is on
print('Billing and Workspace')
print(f'Workspace Name @ `WORKSPACE_NAME`: {WORKSPACE_NAME}')
print(f'Billing Project @ `BILLING_PROJECT_ID`: {BILLING_PROJECT_ID}')
print(f'Workspace Bucket, where you can upload and download data @ `WORKSPACE_BUCKET`: {WORKSPACE_BUCKET}')
print('')

print('AMP-PD v2.5')
print(f'Path to AMP-PD v2.5 Clinical Data @ `AMP_CLINICAL_RELEASE_PATH`: {AMP_CLINICAL_RELEASE_PATH}')
print(f'Path to AMP-PD v2.5 WGS Data @ `AMP_WGS_RELEASE_PLINK_PATH`: {AMP_WGS_RELEASE_PLINK_PATH}')
print('')

## GP2 v7.0
## Explicitly define release v7.0 path 
GP2_CLINICAL_RELEASE_PATH = f'{GP2_RELEASE_PATH}/clinical_data'
GP2_META_RELEASE_PATH = f'{GP2_RELEASE_PATH}/meta_data'
GP2_SUMSTAT_RELEASE_PATH = f'{GP2_RELEASE_PATH}/summary_statistics'

GP2_RAW_GENO_PATH = f'{GP2_RELEASE_PATH}/raw_genotypes'
GP2_IMPUTED_GENO_PATH = f'{GP2_RELEASE_PATH}/imputed_genotypes'
print('GP2 v7.0')
print(f'Path to GP2 v7.0 Clinical Data @ `GP2_CLINICAL_RELEASE_PATH`: {GP2_CLINICAL_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Metadata @ `GP2_META_RELEASE_PATH`: {GP2_META_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Raw Genotype Data @ `GP2_RAW_GENO_PATH`: {GP2_RAW_GENO_PATH}')
print(f'Path to GP2 v7.0 Imputed Genotype Data @ `GP2_IMPUTED_GENO_PATH`: {GP2_IMPUTED_GENO_PATH}')

# Make working directory

In [4]:
ancestry = 'AMR'
WORK_DIR = f'202407_BURDEN_{ancestry}_R7'
WORKSPACE_DIR = f'202407_BURDEN_{ancestry}_R7'

! mkdir {WORK_DIR}

## PLINK

In [5]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink; then
echo "Plink1.9 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink \n    -------"
wget -N http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20190304.zip 
unzip -o plink_linux_x86_64_20190304.zip
echo -e "\n plink downloaded and unzipped in /home/jupyter/tools \n "

fi

Plink1.9 is already installed in /home/jupyter/tools/


In [6]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink2; then
echo "Plink2 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink2 \n    -------"
wget -N https://s3.amazonaws.com/plink2-assets/plink2_linux_amd_avx2_20240704.zip
unzip -o plink2_linux_amd_avx2_20240704.zip
echo -e "\n plink2 downloaded and unzipped in /home/jupyter/tools \n "

fi

Plink2 is already installed in /home/jupyter/tools/


## ANNOVAR

In [7]:
%%bash

# Install ANNOVAR:
# https://www.openbioinformatics.org/annovar/annovar_download_form.php

if test -e /home/jupyter/tools/annovar; then

echo "annovar is already installed in /home/jupyter/tools/"
else
echo "annovar is not installed"
cd /home/jupyter/tools/

wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz

tar xvfz annovar.latest.tar.gz

fi

annovar is already installed in /home/jupyter/tools/


In [8]:
%%bash

# Install ANNOVAR: Download resources for annotation

cd /home/jupyter/tools/annovar/

perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20140902 humandb/
#perl annotate_variation.pl -buildver hg38 -downdb cytoBand humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ensGene humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar exac03 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp147 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp30a humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad211_genome humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ljb26_all humandb/

NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGene.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneMrna.fa.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneVersion.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory
NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20140902.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20140902.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished d

## RVTests

In [9]:
%%bash

#Install RVTESTS: Option 1 (~15min)
if test -e /home/jupyter/tools/rvtests; then

echo "rvtests is already installed"
else
echo "rvtests is not installed"

mkdir /home/jupyter/tools/rvtests
cd /home/jupyter/tools/rvtests

wget https://github.com/zhanxw/rvtests/releases/download/v2.1.0/rvtests_linux64.tar.gz 

tar -zxvf rvtests_linux64.tar.gz
fi

rvtests is already installed


In [10]:
# chmod to make sure you have permission to run the program
! chmod u+x /home/jupyter/tools/plink
! chmod u+x /home/jupyter/tools/plink2
! chmod 777 /home/jupyter/tools/rvtests/executable/rvtest

# Copy Over Files 

In [None]:
## chr 3
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_IMPUTED_GENO_PATH}/{ancestry}/chr3_{ancestry}_release7.* {WORK_DIR}')

In [None]:
## clinical data 

## master key
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_CLINICAL_RELEASE_PATH}/master_key_release7_final.csv {WORK_DIR}')

## related file 
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_META_RELEASE_PATH}/related_samples/{ancestry}_release7.related {WORK_DIR}')

In [None]:
## PCs
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_RAW_GENO_PATH}/{ancestry}/{ancestry}_release7.eigenvec {WORK_DIR}')

In [None]:
# refFlat
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {WORKSPACE_BUCKET}/notebooks/misc/refFlat_HG38_all_chr.txt {WORK_DIR}')

In [15]:
## Check files
! ls {WORK_DIR}

AMR_release7.eigenvec  chr3_AMR_release7.pgen  master_key_release7_final.csv
AMR_release7.related   chr3_AMR_release7.psam  refFlat_HG38_all_chr.txt
chr3_AMR_release7.log  chr3_AMR_release7.pvar


# Covariate File

In [None]:
# Let's load the master key
key = pd.read_csv(f'{WORK_DIR}/master_key_release7_final.csv')
print(key.shape)
key.head()

In [None]:
key = key[['GP2sampleID', 'baseline_GP2_phenotype', 'biological_sex_for_qc', 'age_at_sample_collection', 'age_of_onset', 'race_for_qc', 'label']]
key.rename(columns = {'GP2sampleID': 'IID',
                      'baseline_GP2_phenotype':'phenotype',
                                     'biological_sex_for_qc':'SEX', 
                                     'age_at_sample_collection':'AGE', 
                                     'race_for_qc':'RACE',
                                     'age_of_onset':'AAO'}, inplace = True)
key

In [None]:
## Subset to keep ancestry of interest 
ancestry_key = key[key['label']==ancestry].copy()
ancestry_key.reset_index(drop=True)

In [None]:
# Load information about related individuals 
related_df = pd.read_csv(f'{WORK_DIR}/{ancestry}_release7.related')
print(related_df.shape)
related_df

In [None]:
# Make a list of just one set of related people
related_list = list(related_df['IID1'])

# Check value counts of related and remove only one related individual
ancestry_key = ancestry_key[~ancestry_key["IID"].isin(related_list)]

# Check size
print(ancestry_key.shape)
ancestry_key

In [21]:
# Convert phenotype to binary (0/1)
    # PD = 1; control = 0
pheno_mapping = {"PD": 1, "Control": 0}
ancestry_key['PHENO'] = ancestry_key['phenotype'].map(pheno_mapping).astype('Int64')

In [22]:
# Check value counts of pheno
ancestry_key['PHENO'].value_counts(dropna=False)

PHENO
1       457
0       160
<NA>     30
Name: count, dtype: Int64

In [None]:
## Get the PCs
pcs = pd.read_csv(f'{WORK_DIR}/{ancestry}_release7.eigenvec', sep='\t')

# Split the single column into multiple columns (No longer needed with release 7)
#pcs_split = pcs['#FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10'].str.split(',', expand=True)

selected_columns = ['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']


#pcs = pd.DataFrame(data=pcs_split.iloc[:, 1:7].values, columns=selected_columns)
pcs = pd.DataFrame(data=pcs.iloc[:, 1:7].values, columns=selected_columns)

# Drop the first row (since it's now the column names)
pcs = pcs.drop(0)

# Reset the index to remove any potential issues
pcs = pcs.reset_index(drop=True)


# Display the resulting DataFrame
print(pcs)

In [24]:
# Convert sex to binary (0/1)
    # Female = 1; male = 0
sex_mapping = {"Female": 1, "Male": 0}
ancestry_key['SEX'] = ancestry_key['SEX'].map(sex_mapping).astype('Int64')

In [25]:
# Check value counts of SEX
ancestry_key['SEX'].value_counts(dropna=False)

SEX
0       360
1       285
<NA>      2
Name: count, dtype: Int64

In [26]:
## Make covariate file
df = pd.merge(pcs, ancestry_key, on='IID', how='left')
df.columns

Index(['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'phenotype', 'SEX', 'AGE',
       'AAO', 'RACE', 'label', 'PHENO'],
      dtype='object')

In [27]:
df['FID'] = 0

In [None]:
## Clean up and keep columns we need 
final_df = df[['FID','IID', 'SEX', 'PHENO', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']].copy()
final_df

In [29]:
final_df.groupby(['PHENO'])['SEX'].value_counts()

PHENO  SEX
0      0       77
       1       76
1      0      251
       1      177
Name: count, dtype: int64

In [30]:
## Make file of sample IDs to keep 
samples_toKeep = final_df[['FID', 'IID']].copy()
samples_toKeep.to_csv(f'{WORK_DIR}/{ancestry}.samplestoKeep', sep = '\t', index=False, header=None)

In [31]:
## Save your covariate file
final_df.to_csv(f'{WORK_DIR}/{ancestry}_covariate_file.txt', sep = '\t', index=False)

In [32]:
## check to make sure file was created and saved
! ls {WORK_DIR}

AMR_covariate_file.txt	chr3_AMR_release7.log	master_key_release7_final.csv
AMR_release7.eigenvec	chr3_AMR_release7.pgen	refFlat_HG38_all_chr.txt
AMR_release7.related	chr3_AMR_release7.psam
AMR.samplestoKeep	chr3_AMR_release7.pvar


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_covariate_file.txt {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}.samplestoKeep {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/{WORKSPACE_DIR}//')

# Annotation using ANNOVAR

* *DNAJC13* from NCBI gene
* hg38 (chr3:132417502-132539032)

In [35]:
## extract region using plink

! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry}_release7 \
--chr 3 \
--from-bp 132417502  \
--to-bp 132539032 \
--recode vcf id-paste=iid \
--mac 2 \
--out {WORK_DIR}/{ancestry}_DNAJC13

PLINK v2.00a6LM AVX2 AMD (4 Jul 2024)          www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_AMR_R7/AMR_DNAJC13.log.
Options in effect:
  --chr 3
  --export vcf id-paste=iid
  --from-bp 132417502
  --mac 2
  --out 202407_BURDEN_AMR_R7/AMR_DNAJC13
  --pfile 202407_BURDEN_AMR_R7/chr3_AMR_release7
  --to-bp 132539032

Start time: Tue Jul  9 23:34:13 2024
7450 MiB RAM detected, ~5065 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
614 samples (271 females, 343 males; 614 founders) loaded from
202407_BURDEN_AMR_R7/chr3_AMR_release7.psam.
2464814 variants loaded from 202407_BURDEN_AMR_R7/chr3_AMR_release7.pvar.
1 binary phenotype loaded (431 cases, 150 controls).
Calculating allele frequencies... done.
646 variants removed due to allele frequency threshold(s)
(--maf/--max-maf/--mac/--max-mac).
639 variants remaining after main filters.
--export vcf to 202407_BURDEN_AMR_R

In [36]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry}_DNAJC13.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry}_DNAJC13.vcf.gz

In [37]:
## annotate using ANNOVAR
! perl /home/jupyter/tools/annovar/table_annovar.pl {WORK_DIR}/{ancestry}_DNAJC13.vcf.gz /home/jupyter/tools/annovar/humandb/ -buildver hg38 \
-out {WORK_DIR}/{ancestry}_DNAJC13.annovar \
-remove -protocol refGene,clinvar_20140902 \
-operation g,f \
--nopolish \
-nastring . \
-vcfinput


NOTICE: Running with system command <convert2annovar.pl  -includeinfo -allsample -withfreq -format vcf4 202407_BURDEN_AMR_R7/AMR_DNAJC13.vcf.gz > 202407_BURDEN_AMR_R7/AMR_DNAJC13.annovar.avinput>
NOTICE: Finished reading 658 lines from VCF file
NOTICE: A total of 639 locus in VCF file passed QC threshold, representing 591 SNPs (425 transitions and 166 transversions) and 48 indels/substitutions
NOTICE: Finished writing allele frequencies based on 362874 SNP genotypes (260950 transitions and 101924 transversions) and 29472 indels/substitutions for 614 samples

NOTICE: Running with system command </home/jupyter/tools/annovar/table_annovar.pl 202407_BURDEN_AMR_R7/AMR_DNAJC13.annovar.avinput /home/jupyter/tools/annovar/humandb/ -buildver hg38 -outfile 202407_BURDEN_AMR_R7/AMR_DNAJC13.annovar -remove -protocol refGene,clinvar_20140902 -operation g,f --nopolish -nastring . -otherinfo>
-----------------------------------------------------------------
NOTICE: Processing operation=g protocol=re

In [38]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo617,Otherinfo618,Otherinfo619,Otherinfo620,Otherinfo621,Otherinfo622,Otherinfo623,Otherinfo624,Otherinfo625,Otherinfo626
0,3,132417596,132417596,C,T,UTR5,DNAJC13,NM_001329126:c.-16955C>T;NM_015268:c.-16955C>T,.,.,...,1|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
1,3,132417864,132417864,G,C,intronic,DNAJC13,.,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2,3,132418296,132418296,C,T,intronic,DNAJC13,.,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0
3,3,132418415,132418415,A,G,intronic,DNAJC13,.,.,.,...,0|0,0|0,1|0,1|0,0|0,0|1,0|0,1|1,1|0,0|0
4,3,132418488,132418488,G,T,intronic,DNAJC13,.,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,3,132538225,132538225,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon56:c.G6675A:p.M2225I,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
635,3,132538292,132538292,G,A,UTR3,DNAJC13,NM_001329126:c.*10G>A;NM_015268:c.*10G>A,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,.|.,0|0,0|0
636,3,132538541,132538541,A,G,UTR3,DNAJC13,NM_001329126:c.*259A>G;NM_015268:c.*259A>G,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
637,3,132538575,132538575,A,G,UTR3,DNAJC13,NM_001329126:c.*293A>G;NM_015268:c.*293A>G,.,.,...,0|0,0|0,0|0,1|0,0|0,0|0,0|0,0|1,1|0,0|0


In [39]:
## Filter intronic
intronic = gene[(gene['Func.refGene'] == 'intronic')]

In [40]:
## Filter UTR3
utr3 = gene[(gene['Func.refGene'] == 'UTR3')]

In [41]:
## Filter UTR5
utr5 = gene[(gene['Func.refGene'] == 'UTR5')]

In [42]:
## Filter exonic and synonymous variants
coding_synonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'synonymous SNV')]

In [43]:
# Filter exonic and non-synonymous variants
coding_nonsynonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'nonsynonymous SNV')]
coding_nonsynonymous

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo617,Otherinfo618,Otherinfo619,Otherinfo620,Otherinfo621,Otherinfo622,Otherinfo623,Otherinfo624,Otherinfo625,Otherinfo626
285,3,132475009,132475009,C,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon22:c.C2369A:p.S790Y,DNAJ...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
301,3,132478139,132478139,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon24:c.G2708A:p.R903K,DNAJ...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
383,3,132494190,132494190,A,G,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon34:c.A3872G:p.E1291G,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
407,3,132499231,132499231,C,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon37:c.C4262T:p.A1421V,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
410,3,132499777,132499777,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon38:c.G4385A:p.R1462H,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
411,3,132499779,132499779,G,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon38:c.G4387T:p.A1463S,DNA...",...,1|0,1|1,0|0,1|1,0|1,1|0,0|0,1|1,1|1,0|0
426,3,132502295,132502295,C,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon40:c.C4543T:p.P1515S,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0
427,3,132502298,132502298,C,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon40:c.C4546T:p.R1516C,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
428,3,132502299,132502299,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon40:c.G4547A:p.R1516H,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
448,3,132507256,132507256,A,G,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon43:c.A5018G:p.Y1673C,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [44]:
totalVariants = len(gene.axes[0])
totalIntronic = len(intronic.axes[0])
totalUTR3 = len(utr3.axes[0])
totalUTR5 = len(utr5.axes[0])
totalExonicSyn = len(coding_synonymous.axes[0])
totalExonicNonSyn = len(coding_nonsynonymous.axes[0])

In [45]:
print("Total Variants: ", totalVariants)
print("Intronic:", totalIntronic)
print("UTR3:", totalUTR3)
print("UTR5:", totalUTR5)
print("Exonic Syn:", totalExonicSyn)
print("Exonic NonSyn:", totalExonicNonSyn)

Total Variants:  639
Intronic: 613
UTR3: 4
UTR5: 1
Exonic Syn: 6
Exonic NonSyn: 15


In [46]:
# Save in PLINK format 
variants_toKeep = coding_nonsynonymous[['Chr', 'Start', 'End', 'Gene.refGene']].copy()
variants_toKeep.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep', sep="\t", index=False, header=False)
variants_toKeep

Unnamed: 0,Chr,Start,End,Gene.refGene
285,3,132475009,132475009,DNAJC13
301,3,132478139,132478139,DNAJC13
383,3,132494190,132494190,DNAJC13
407,3,132499231,132499231,DNAJC13
410,3,132499777,132499777,DNAJC13
411,3,132499779,132499779,DNAJC13
426,3,132502295,132502295,DNAJC13
427,3,132502298,132502298,DNAJC13
428,3,132502299,132502299,DNAJC13
448,3,132507256,132507256,DNAJC13


In [47]:
## check to make sure file was created and saved
! ls {WORK_DIR}

AMR_covariate_file.txt			      AMR_release7.related
AMR_DNAJC13.all_coding_nonsyn.variantstoKeep  AMR.samplestoKeep
AMR_DNAJC13.annovar.avinput		      chr3_AMR_release7.log
AMR_DNAJC13.annovar.hg38_multianno.txt	      chr3_AMR_release7.pgen
AMR_DNAJC13.annovar.hg38_multianno.vcf	      chr3_AMR_release7.psam
AMR_DNAJC13.log				      chr3_AMR_release7.pvar
AMR_DNAJC13.vcf.gz			      master_key_release7_final.csv
AMR_DNAJC13.vcf.gz.tbi			      refFlat_HG38_all_chr.txt
AMR_release7.eigenvec


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep {WORKSPACE_BUCKET}//{WORKSPACE_DIR}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

## Burden Analyses using RVTests

In [50]:
# Convert the files from Plink 2.0 to Plink 1.9 format

! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry}_release7 \
--make-bed \
--out {WORK_DIR}/chr3_{ancestry}_release7

PLINK v2.00a6LM AVX2 AMD (4 Jul 2024)          www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_AMR_R7/chr3_AMR_release7.log.
Options in effect:
  --make-bed
  --out 202407_BURDEN_AMR_R7/chr3_AMR_release7
  --pfile 202407_BURDEN_AMR_R7/chr3_AMR_release7

Start time: Tue Jul  9 23:34:33 2024
7450 MiB RAM detected, ~5069 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
614 samples (271 females, 343 males; 614 founders) loaded from
202407_BURDEN_AMR_R7/chr3_AMR_release7.psam.
2464814 variants loaded from 202407_BURDEN_AMR_R7/chr3_AMR_release7.pvar.
1 binary phenotype loaded (431 cases, 150 controls).
Writing 202407_BURDEN_AMR_R7/chr3_AMR_release7.fam ... done.
Writing 202407_BURDEN_AMR_R7/chr3_AMR_release7.bim ... done.
Writing 202407_BURDEN_AMR_R7/chr3_AMR_release7.bed ... 10131518212326293134373942454750535558616366697174777982858790939598done.
End time: Tue Jul  9 23:3

In [51]:
## extract variants
    # later do based on class and frequency

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--recode vcf-iid \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding.log.
Options in effect:
  --bfile 202407_BURDEN_AMR_R7/chr3_AMR_release7
  --extract range 202407_BURDEN_AMR_R7/AMR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 202407_BURDEN_AMR_R7/AMR.samplestoKeep
  --out 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding
  --recode vcf-iid

7450 MB RAM detected; reserving 3725 MB for main workspace.
2464814 variants loaded from .bim file.
614 people (343 males, 271 females) loaded from .fam.
581 phenotype values loaded from .fam.
--extract range: 2464799 variants excluded.
--extract range: 15 variants remaining.
--keep: 613 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 613 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181920212223242526272829303132333435

In [52]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf.gz

In [53]:
## RVtests with covariates 
! /home/jupyter/tools/rvtests/executable/rvtest --noweb --hide-covar \
--out {WORK_DIR}/{ancestry}_DNAJC13.burden.coding \
--kernel skat,skato \
--inVcf {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf.gz \
--pheno {WORK_DIR}/{ancestry}_covariate_file.txt \
--mpheno 4 \
--gene DNAJC13 \
--geneFile {WORK_DIR}/refFlat_HG38_all_chr.txt \
--covar {WORK_DIR}/{ancestry}_covariate_file.txt \
--covar-name SEX,PC1,PC2,PC3,PC4 #[WARN]Covariate [ PC5 ] has strong correlation [ r^2 = 1 ] with the response!


# --out : Name of output 
# --burden cmc --kernel skato: tests to run 
# --inVcf : VCF file 
# --gene: gene name (if only looking at one or a few)
# --geneFile refFlat.txt
# --pheno :  covar file
# --mpheno : # column that has phenotype information
# --pheno-name : column name with phenotype in file
# --covar : covar file
# --freqUpper : optional, MAF cut-off
# --covar-name : covariates, listed by column name, separated by commas (no spaces between commas)
## 0=controls; 1=cases

Thank you for using rvtests (version: 20190205, git: c86e589efef15382603300dc7f4c3394c82d69b8)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, plase send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

The following parameters are available.  Ones with "[]" are in effect:

Available Options
      Basic Input/Output:
                          --inVcf [202407_BURDEN_AMR_R7/AMR_DNAJC13.coding.vcf.gz]
                          --inBgen [], --inBgenSample [], --inKgg []
                          --out [202407_BURDEN_AMR_R7/AMR_DNAJC13.burden.coding]
                          --outputRaw
       Specify Covariate: --covar [202407_BURDEN_AMR_R7/AMR_covariate_file.txt]
                          --covar-name [SEX,PC1,PC2,PC3,PC4], --sex
       Specify Phenotype: --pheno [202407_BURDEN_AMR_R7/AMR_covariate_file.txt]
                          --inverseNormal

In [54]:
## look at results 
! cat {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.Skat.assoc

Gene	RANGE	N_INFORMATIVE	NumVar	NumPolyVar	Q	Pvalue	NumPerm	ActualPerm	Stat	NumGreater	NumEqual	PermPvalue
DNAJC13	3:132417501-132539032,3:132417501-132539032	581	15	15	95.2662	0.041396	10000	10000	95.2662	538	0	0.0538


In [55]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.SkatO.assoc

Gene	RANGE	N_INFORMATIVE	NumVar	NumPolyVar	Q	rho	Pvalue
DNAJC13	3:132417501-132539032,3:132417501-132539032	581	15	15	32072.1	0	0.0812026


In [56]:
## check to make sure file was created and saved
! ls {WORK_DIR}

AMR_covariate_file.txt			      AMR_DNAJC13.vcf.gz.tbi
AMR_DNAJC13.all_coding_nonsyn.variantstoKeep  AMR_release7.eigenvec
AMR_DNAJC13.annovar.avinput		      AMR_release7.related
AMR_DNAJC13.annovar.hg38_multianno.txt	      AMR.samplestoKeep
AMR_DNAJC13.annovar.hg38_multianno.vcf	      chr3_AMR_release7.bed
AMR_DNAJC13.burden.coding.log		      chr3_AMR_release7.bim
AMR_DNAJC13.burden.coding.Skat.assoc	      chr3_AMR_release7.fam
AMR_DNAJC13.burden.coding.SkatO.assoc	      chr3_AMR_release7.log
AMR_DNAJC13.coding.log			      chr3_AMR_release7.pgen
AMR_DNAJC13.coding.vcf.gz		      chr3_AMR_release7.psam
AMR_DNAJC13.coding.vcf.gz.tbi		      chr3_AMR_release7.pvar
AMR_DNAJC13.log				      master_key_release7_final.csv
AMR_DNAJC13.vcf.gz			      refFlat_HG38_all_chr.txt


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.*.assoc {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

# Case/Control Frequencies

In [59]:
## extract variants

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--assoc \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn --allow-no-sex --ci 0.95

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile 202407_BURDEN_AMR_R7/chr3_AMR_release7
  --ci 0.95
  --extract range 202407_BURDEN_AMR_R7/AMR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 202407_BURDEN_AMR_R7/AMR.samplestoKeep
  --out 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
2464814 variants loaded from .bim file.
614 people (343 males, 271 females) loaded from .fam.
581 phenotype values loaded from .fam.
--extract range: 2464799 variants excluded.
--extract range: 15 variants remaining.
--keep: 613 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 613 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718

In [60]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc

 CHR                  SNP         BP   A1      F_A      F_U   A2        CHISQ            P           OR           SE          L95          U95 
   3   chr3:132475009:C:A  132475009    A 0.002326 0.006667    C         1.22       0.2694       0.3473        1.002      0.04871        2.477 
   3   chr3:132478139:G:A  132478139    A 0.003488 0.003333    G     0.001556       0.9685        1.047        1.157       0.1085         10.1 
   3   chr3:132494190:A:G  132494190    G  0.00814        0    A        2.457        0.117           NA           NA           NA           NA 
   3   chr3:132499231:C:T  132499231    T  0.01279     0.01    C       0.1453       0.7031        1.283       0.6548       0.3554        4.629 
   3   chr3:132499777:G:A  132499777    A 0.004651 0.006667    G       0.1756       0.6752       0.6963       0.8686       0.1269        3.821 
   3   chr3:132499779:G:T  132499779    T    0.486   0.5167    G       0.8343        0.361       0.8847       0.1342       0.6801 

In [61]:
assoc = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc', delim_whitespace=True, usecols=['SNP', 'A1', 'A2', 'F_A', 'F_U', 'P', 'OR', 'L95', 'U95'])
assoc

Unnamed: 0,SNP,A1,F_A,F_U,A2,P,OR,L95,U95
0,chr3:132475009:C:A,A,0.002326,0.006667,C,0.2694,0.3473,0.04871,2.477
1,chr3:132478139:G:A,A,0.003488,0.003333,G,0.9685,1.047,0.1085,10.1
2,chr3:132494190:A:G,G,0.00814,0.0,A,0.117,,,
3,chr3:132499231:C:T,T,0.01279,0.01,C,0.7031,1.283,0.3554,4.629
4,chr3:132499777:G:A,A,0.004651,0.006667,G,0.6752,0.6963,0.1269,3.821
5,chr3:132499779:G:T,T,0.486,0.5167,G,0.361,0.8847,0.6801,1.151
6,chr3:132502295:C:T,T,0.03023,0.03333,C,0.7897,0.9041,0.4307,1.898
7,chr3:132502298:C:T,T,0.003488,0.003333,C,0.9685,1.047,0.1085,10.1
8,chr3:132502299:G:A,A,0.001163,0.003333,G,0.4352,0.3481,0.0217,5.582
9,chr3:132507256:A:G,G,0.03721,0.06333,A,0.05739,0.5716,0.3189,1.024


In [62]:
## repeat association study applying multiple testing corrections for the raw p-values (like Bonferroni post-hoc)
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--assoc \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn --allow-no-sex --adjust

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --adjust
  --allow-no-sex
  --assoc
  --bfile 202407_BURDEN_AMR_R7/chr3_AMR_release7
  --extract range 202407_BURDEN_AMR_R7/AMR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 202407_BURDEN_AMR_R7/AMR.samplestoKeep
  --out 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
2464814 variants loaded from .bim file.
614 people (343 males, 271 females) loaded from .fam.
581 phenotype values loaded from .fam.
--extract range: 2464799 variants excluded.
--extract range: 15 variants remaining.
--keep: 613 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 613 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181

In [63]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted

 CHR                  SNP      UNADJ         GC       BONF       HOLM   SIDAK_SS   SIDAK_SD     FDR_BH     FDR_BY
   3   chr3:132507256:A:G    0.05739     0.1248     0.8609     0.8609     0.5879     0.5879     0.7254          1 
   3   chr3:132494190:A:G      0.117     0.2055          1          1     0.8454     0.8249     0.7254          1 
   3   chr3:132475009:C:A     0.2694     0.3723          1          1      0.991     0.9831     0.7254          1 
   3   chr3:132522838:C:T     0.3057      0.408          1          1     0.9958     0.9874     0.7254          1 
   3   chr3:132538225:G:A     0.3445     0.4451          1          1     0.9982     0.9904     0.7254          1 
   3   chr3:132499779:G:T      0.361     0.4606          1          1     0.9988     0.9904     0.7254          1 
   3   chr3:132511169:G:C     0.4032     0.4995          1          1     0.9996     0.9904     0.7254          1 
   3   chr3:132525718:G:T     0.4032     0.4995          1          1    

In [64]:
assoc_adjusted = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted', delim_whitespace=True, usecols=['SNP', 'BONF'])
assoc_adjusted

Unnamed: 0,SNP,BONF
0,chr3:132507256:A:G,0.8609
1,chr3:132494190:A:G,1.0
2,chr3:132475009:C:A,1.0
3,chr3:132522838:C:T,1.0
4,chr3:132538225:G:A,1.0
5,chr3:132499779:G:T,1.0
6,chr3:132511169:G:C,1.0
7,chr3:132525718:G:T,1.0
8,chr3:132502299:G:A,1.0
9,chr3:132499777:G:A,1.0


In [65]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--freq \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 202407_BURDEN_AMR_R7/chr3_AMR_release7
  --extract range 202407_BURDEN_AMR_R7/AMR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --freq
  --keep 202407_BURDEN_AMR_R7/AMR.samplestoKeep
  --out 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
2464814 variants loaded from .bim file.
614 people (343 males, 271 females) loaded from .fam.
581 phenotype values loaded from .fam.
--extract range: 2464799 variants excluded.
--extract range: 15 variants remaining.
--keep: 613 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 613 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233

In [66]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq

 CHR                  SNP   A1   A2          MAF  NCHROBS
   3   chr3:132475009:C:A    A    C      0.00571     1226
   3   chr3:132478139:G:A    A    G     0.003263     1226
   3   chr3:132494190:A:G    G    A      0.00571     1226
   3   chr3:132499231:C:T    T    C      0.01223     1226
   3   chr3:132499777:G:A    A    G     0.004894     1226
   3   chr3:132499779:G:T    T    G       0.4927     1226
   3   chr3:132502295:C:T    T    C        0.031     1226
   3   chr3:132502298:C:T    T    C     0.003263     1226
   3   chr3:132502299:G:A    A    G     0.001631     1226
   3   chr3:132507256:A:G    G    A      0.04323     1226
   3   chr3:132511169:G:C    C    G     0.001631     1226
   3   chr3:132522838:C:T    T    C     0.002447     1226
   3   chr3:132525718:G:T    T    G     0.001631     1226
   3   chr3:132528316:T:G    G    T     0.002447     1226
   3   chr3:132538225:G:A    A    G      0.03344     1226


In [67]:
freq = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq', delim_whitespace=True, usecols=['SNP', 'MAF'])
freq.head()

Unnamed: 0,SNP,MAF
0,chr3:132475009:C:A,0.00571
1,chr3:132478139:G:A,0.003263
2,chr3:132494190:A:G,0.00571
3,chr3:132499231:C:T,0.01223
4,chr3:132499777:G:A,0.004894


In [68]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--freq case-control \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 202407_BURDEN_AMR_R7/chr3_AMR_release7
  --extract range 202407_BURDEN_AMR_R7/AMR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --freq case-control
  --keep 202407_BURDEN_AMR_R7/AMR.samplestoKeep
  --out 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
2464814 variants loaded from .bim file.
614 people (343 males, 271 females) loaded from .fam.
581 phenotype values loaded from .fam.
--extract range: 2464799 variants excluded.
--extract range: 15 variants remaining.
--keep: 613 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 613 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262

In [69]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc

 CHR                  SNP   A1   A2        MAF_A        MAF_U  NCHROBS_A  NCHROBS_U
   3   chr3:132475009:C:A    A    C     0.002326     0.006667        860        300
   3   chr3:132478139:G:A    A    G     0.003488     0.003333        860        300
   3   chr3:132494190:A:G    G    A      0.00814            0        860        300
   3   chr3:132499231:C:T    T    C      0.01279         0.01        860        300
   3   chr3:132499777:G:A    A    G     0.004651     0.006667        860        300
   3   chr3:132499779:G:T    T    G        0.486       0.5167        860        300
   3   chr3:132502295:C:T    T    C      0.03023      0.03333        860        300
   3   chr3:132502298:C:T    T    C     0.003488     0.003333        860        300
   3   chr3:132502299:G:A    A    G     0.001163     0.003333        860        300
   3   chr3:132507256:A:G    G    A      0.03721      0.06333        860        300
   3   chr3:132511169:G:C    C    G     0.002326            0    

In [70]:
freq_cc = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc', delim_whitespace=True, usecols=['SNP', 'MAF_A', 'MAF_U'])
freq_cc.head()

Unnamed: 0,SNP,MAF_A,MAF_U
0,chr3:132475009:C:A,0.002326,0.006667
1,chr3:132478139:G:A,0.003488,0.003333
2,chr3:132494190:A:G,0.00814,0.0
3,chr3:132499231:C:T,0.01279,0.01
4,chr3:132499777:G:A,0.004651,0.006667


In [71]:
all_freq = pd.merge(freq, freq_cc, on="SNP", how="left")
all_freq

Unnamed: 0,SNP,MAF,MAF_A,MAF_U
0,chr3:132475009:C:A,0.00571,0.002326,0.006667
1,chr3:132478139:G:A,0.003263,0.003488,0.003333
2,chr3:132494190:A:G,0.00571,0.00814,0.0
3,chr3:132499231:C:T,0.01223,0.01279,0.01
4,chr3:132499777:G:A,0.004894,0.004651,0.006667
5,chr3:132499779:G:T,0.4927,0.486,0.5167
6,chr3:132502295:C:T,0.031,0.03023,0.03333
7,chr3:132502298:C:T,0.003263,0.003488,0.003333
8,chr3:132502299:G:A,0.001631,0.001163,0.003333
9,chr3:132507256:A:G,0.04323,0.03721,0.06333


In [72]:
assoc_and_freq = pd.merge(all_freq, assoc, on="SNP", how="left")
assoc_and_freq

Unnamed: 0,SNP,MAF,MAF_A,MAF_U,A1,F_A,F_U,A2,P,OR,L95,U95
0,chr3:132475009:C:A,0.00571,0.002326,0.006667,A,0.002326,0.006667,C,0.2694,0.3473,0.04871,2.477
1,chr3:132478139:G:A,0.003263,0.003488,0.003333,A,0.003488,0.003333,G,0.9685,1.047,0.1085,10.1
2,chr3:132494190:A:G,0.00571,0.00814,0.0,G,0.00814,0.0,A,0.117,,,
3,chr3:132499231:C:T,0.01223,0.01279,0.01,T,0.01279,0.01,C,0.7031,1.283,0.3554,4.629
4,chr3:132499777:G:A,0.004894,0.004651,0.006667,A,0.004651,0.006667,G,0.6752,0.6963,0.1269,3.821
5,chr3:132499779:G:T,0.4927,0.486,0.5167,T,0.486,0.5167,G,0.361,0.8847,0.6801,1.151
6,chr3:132502295:C:T,0.031,0.03023,0.03333,T,0.03023,0.03333,C,0.7897,0.9041,0.4307,1.898
7,chr3:132502298:C:T,0.003263,0.003488,0.003333,T,0.003488,0.003333,C,0.9685,1.047,0.1085,10.1
8,chr3:132502299:G:A,0.001631,0.001163,0.003333,A,0.001163,0.003333,G,0.4352,0.3481,0.0217,5.582
9,chr3:132507256:A:G,0.04323,0.03721,0.06333,G,0.03721,0.06333,A,0.05739,0.5716,0.3189,1.024


In [73]:
# Merge the results
assoc_and_freq = pd.merge(assoc_and_freq, assoc_adjusted, on="SNP", how="left")
assoc_and_freq.head()

Unnamed: 0,SNP,MAF,MAF_A,MAF_U,A1,F_A,F_U,A2,P,OR,L95,U95,BONF
0,chr3:132475009:C:A,0.00571,0.002326,0.006667,A,0.002326,0.006667,C,0.2694,0.3473,0.04871,2.477,1.0
1,chr3:132478139:G:A,0.003263,0.003488,0.003333,A,0.003488,0.003333,G,0.9685,1.047,0.1085,10.1,1.0
2,chr3:132494190:A:G,0.00571,0.00814,0.0,G,0.00814,0.0,A,0.117,,,,1.0
3,chr3:132499231:C:T,0.01223,0.01279,0.01,T,0.01279,0.01,C,0.7031,1.283,0.3554,4.629,1.0
4,chr3:132499777:G:A,0.004894,0.004651,0.006667,A,0.004651,0.006667,G,0.6752,0.6963,0.1269,3.821,1.0


In [74]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--recode A \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 202407_BURDEN_AMR_R7/chr3_AMR_release7
  --extract range 202407_BURDEN_AMR_R7/AMR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 202407_BURDEN_AMR_R7/AMR.samplestoKeep
  --out 202407_BURDEN_AMR_R7/AMR_DNAJC13.coding_nonsyn
  --recode A

7450 MB RAM detected; reserving 3725 MB for main workspace.
2464814 variants loaded from .bim file.
614 people (343 males, 271 females) loaded from .fam.
581 phenotype values loaded from .fam.
--extract range: 2464799 variants excluded.
--extract range: 15 variants remaining.
--keep: 613 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 613 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031

In [None]:
recode = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.raw', delim_whitespace=True)
recode.head()

In [76]:
# Make a list from the column names
column_names = recode.columns.tolist()

# Drop the first 6 columns to keep the variants 
variants = column_names[6:]

print(f'Number of variants in {ancestry} for DNAJC13: {len(variants)}')
variants

Number of variants in AMR for DNAJC13: 15


['chr3:132475009:C:A_A',
 'chr3:132478139:G:A_A',
 'chr3:132494190:A:G_G',
 'chr3:132499231:C:T_T',
 'chr3:132499777:G:A_A',
 'chr3:132499779:G:T_T',
 'chr3:132502295:C:T_T',
 'chr3:132502298:C:T_T',
 'chr3:132502299:G:A_A',
 'chr3:132507256:A:G_G',
 'chr3:132511169:G:C_C',
 'chr3:132522838:C:T_T',
 'chr3:132525718:G:T_T',
 'chr3:132528316:T:G_G',
 'chr3:132538225:G:A_A']

In [77]:
# Pre-filter the dataset
cases_data = recode[recode['PHENOTYPE'] == 2]
controls_data = recode[recode['PHENOTYPE'] == 1]

results = []

for variant in variants:
    # For cases
    hom_cases = cases_data[cases_data[variant] == 2].shape[0]
    het_cases = cases_data[cases_data[variant] == 1].shape[0]
    total_cases = cases_data.shape[0]
#    freq_cases = (hom_cases + het_cases) / total_cases

    # For controls
    hom_controls = controls_data[controls_data[variant] == 2].shape[0]
    het_controls = controls_data[controls_data[variant] == 1].shape[0]
    total_controls = controls_data.shape[0]
#    freq_controls = (hom_controls + het_controls) / total_controls

    results.append({
        'Variant': variant,
        'Hom Cases': hom_cases,
        'Het Cases': het_cases,
        'Total Cases': total_cases,
#        'Freq in Cases': freq_cases,
        'Hom Controls': hom_controls,
        'Het Controls': het_controls,
        'Total Controls': total_controls,
#        'Freq in Controls': freq_controls
    })

# Return
df_results = pd.DataFrame(results)
df_results['SNP'] = df_results['Variant'].apply(lambda x: x.rsplit('_', 1)[0])

df_results

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls,SNP
0,chr3:132475009:C:A_A,0,2,430,0,2,150,chr3:132475009:C:A
1,chr3:132478139:G:A_A,0,3,430,0,1,150,chr3:132478139:G:A
2,chr3:132494190:A:G_G,0,7,430,0,0,150,chr3:132494190:A:G
3,chr3:132499231:C:T_T,0,11,430,0,3,150,chr3:132499231:C:T
4,chr3:132499777:G:A_A,0,4,430,0,2,150,chr3:132499777:G:A
5,chr3:132499779:G:T_T,100,218,430,43,69,150,chr3:132499779:G:T
6,chr3:132502295:C:T_T,0,26,430,0,10,150,chr3:132502295:C:T
7,chr3:132502298:C:T_T,0,3,430,0,1,150,chr3:132502298:C:T
8,chr3:132502299:G:A_A,0,1,430,0,1,150,chr3:132502299:G:A
9,chr3:132507256:A:G_G,0,32,430,0,19,150,chr3:132507256:A:G


In [78]:
## Merge with assoc results 
full_results = pd.merge(assoc_and_freq, df_results, on="SNP", how="left")

In [79]:
clean_full_results = full_results[['SNP', 'A1', 'A2', 'F_A', 'F_U', 'P', 'OR','L95', 'U95', 'BONF', 
                                   'MAF', 'MAF_A', 'MAF_U', 
                                   'Hom Cases', 'Het Cases', 'Total Cases', 
                                   'Hom Controls','Het Controls', 'Total Controls']].copy()

In [80]:
print(clean_full_results.shape)
print(f'No. of SNPs: {clean_full_results.shape[0]}')
clean_full_results

(15, 19)
No. of SNPs: 15


Unnamed: 0,SNP,A1,A2,F_A,F_U,P,OR,L95,U95,BONF,MAF,MAF_A,MAF_U,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls
0,chr3:132475009:C:A,A,C,0.002326,0.006667,0.2694,0.3473,0.04871,2.477,1.0,0.00571,0.002326,0.006667,0,2,430,0,2,150
1,chr3:132478139:G:A,A,G,0.003488,0.003333,0.9685,1.047,0.1085,10.1,1.0,0.003263,0.003488,0.003333,0,3,430,0,1,150
2,chr3:132494190:A:G,G,A,0.00814,0.0,0.117,,,,1.0,0.00571,0.00814,0.0,0,7,430,0,0,150
3,chr3:132499231:C:T,T,C,0.01279,0.01,0.7031,1.283,0.3554,4.629,1.0,0.01223,0.01279,0.01,0,11,430,0,3,150
4,chr3:132499777:G:A,A,G,0.004651,0.006667,0.6752,0.6963,0.1269,3.821,1.0,0.004894,0.004651,0.006667,0,4,430,0,2,150
5,chr3:132499779:G:T,T,G,0.486,0.5167,0.361,0.8847,0.6801,1.151,1.0,0.4927,0.486,0.5167,100,218,430,43,69,150
6,chr3:132502295:C:T,T,C,0.03023,0.03333,0.7897,0.9041,0.4307,1.898,1.0,0.031,0.03023,0.03333,0,26,430,0,10,150
7,chr3:132502298:C:T,T,C,0.003488,0.003333,0.9685,1.047,0.1085,10.1,1.0,0.003263,0.003488,0.003333,0,3,430,0,1,150
8,chr3:132502299:G:A,A,G,0.001163,0.003333,0.4352,0.3481,0.0217,5.582,1.0,0.001631,0.001163,0.003333,0,1,430,0,1,150
9,chr3:132507256:A:G,G,A,0.03721,0.06333,0.05739,0.5716,0.3189,1.024,0.8609,0.04323,0.03721,0.06333,0,32,430,0,19,150


In [81]:
# Look at significant SNPs, if any 
sig_freq = clean_full_results[clean_full_results['P']<0.05]
sig_snps = sig_freq['SNP'].tolist()
sig_snps

[]

In [82]:
# Look at significant SNPs, if any 
sig_df_results = clean_full_results[clean_full_results['SNP'].isin(sig_snps)]
sig_df_results

Unnamed: 0,SNP,A1,A2,F_A,F_U,P,OR,L95,U95,BONF,MAF,MAF_A,MAF_U,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls


In [83]:
for snp in sig_snps:
    ! grep {snp} {WORK_DIR}/chr3_{ancestry}_release7.pvar

In [84]:
# Save files to VM
clean_full_results.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.fullVariantInformation.txt', sep="\t", index=False)
sig_df_results.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.SignificantVariantInformation.txt' , sep="\t", index=False)

In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.fullVariantInformation.txt {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.SignificantVariantInformation.txt {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.raw {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')