# Variants in DNAJC13 and their Association with Parkinson's Disease Across Different Ancestral Backgrounds

* **Project**: DNAJC13 Gene Assesment
* **Last updated**: September 2024
* **Version**: Python 3.9
* **Data**: GP2 Release 7

# Summary
This notebook is focused on analysing association between DNAJC13 variants in South Asian (SAS) ancestry and Parkinson's disease.

# Imports

In [1]:
# Use the os package to interact with the environment
import os
import sys

# Bring in Pandas for Dataframe functionality
import pandas as pd
from functools import reduce

# Bring some visualization functionality 
import seaborn as sns

# numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

  from IPython.core.display import display, HTML


In [2]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    print(f'Executing: {query}', file=sys.stderr)
    return pd.read_gbq(query, project_id=BILLING_PROJECT_ID, dialect='standard')

# Utility routine for display a message and a link
def display_html_link(description, link_text, url):
    html = f'''
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
    </p>
    '''

    display(HTML(html))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS"""
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Return a DataFrame from the contents of a delimited file in GCS"""
    return pd.read_csv(StringIO(gcs_read_file(path)), sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    url = '{}?{}'.format(
        os.path.join('https://console.cloud.google.com/storage/browser',
                     gcs_path.replace("gs://","")),
        urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID}))

    display_html_link(description, link_text, url)

In [None]:
# Set up billing project and data path variables
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

## AMP-PD v2.5
## Explicitly define release v2.5 path 
AMP_CLINICAL_RELEASE_PATH = f'{AMP_RELEASE_PATH}/clinical'

AMP_WGS_RELEASE_PLINK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'plink')
AMP_WGS_RELEASE_GATK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'gatk')

## Print the information to check we are in the proper release and billing 
## This will be different for you, the user, depending on the billing project your workspace is on
print('Billing and Workspace')
print(f'Workspace Name @ `WORKSPACE_NAME`: {WORKSPACE_NAME}')
print(f'Billing Project @ `BILLING_PROJECT_ID`: {BILLING_PROJECT_ID}')
print(f'Workspace Bucket, where you can upload and download data @ `WORKSPACE_BUCKET`: {WORKSPACE_BUCKET}')
print('')

print('AMP-PD v2.5')
print(f'Path to AMP-PD v2.5 Clinical Data @ `AMP_CLINICAL_RELEASE_PATH`: {AMP_CLINICAL_RELEASE_PATH}')
print(f'Path to AMP-PD v2.5 WGS Data @ `AMP_WGS_RELEASE_PLINK_PATH`: {AMP_WGS_RELEASE_PLINK_PATH}')
print('')

## GP2 v7.0
## Explicitly define release v7.0 path 
GP2_CLINICAL_RELEASE_PATH = f'{GP2_RELEASE_PATH}/clinical_data'
GP2_META_RELEASE_PATH = f'{GP2_RELEASE_PATH}/meta_data'
GP2_SUMSTAT_RELEASE_PATH = f'{GP2_RELEASE_PATH}/summary_statistics'

GP2_RAW_GENO_PATH = f'{GP2_RELEASE_PATH}/raw_genotypes'
GP2_IMPUTED_GENO_PATH = f'{GP2_RELEASE_PATH}/imputed_genotypes'
print('GP2 v7.0')
print(f'Path to GP2 v7.0 Clinical Data @ `GP2_CLINICAL_RELEASE_PATH`: {GP2_CLINICAL_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Metadata @ `GP2_META_RELEASE_PATH`: {GP2_META_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Raw Genotype Data @ `GP2_RAW_GENO_PATH`: {GP2_RAW_GENO_PATH}')
print(f'Path to GP2 v7.0 Imputed Genotype Data @ `GP2_IMPUTED_GENO_PATH`: {GP2_IMPUTED_GENO_PATH}')

# Make working directory

In [4]:
ancestry = 'SAS'
WORK_DIR = f'202407_BURDEN_{ancestry}_R7'
WORKSPACE_DIR = f'202407_BURDEN_{ancestry}_R7'

! mkdir {WORK_DIR}

mkdir: cannot create directory ‘202407_BURDEN_SAS_R7’: File exists


# Install Packages

## PLINK

In [5]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink; then
echo "Plink1.9 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink \n    -------"
wget -N http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20190304.zip 
unzip -o plink_linux_x86_64_20190304.zip
echo -e "\n plink downloaded and unzipped in /home/jupyter/tools \n "

fi

Plink1.9 is already installed in /home/jupyter/tools/


In [6]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink2; then
echo "Plink2 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink2 \n    -------"
wget -N https://s3.amazonaws.com/plink2-assets/plink2_linux_amd_avx2_20240704.zip
unzip -o plink2_linux_amd_avx2_20240704.zip
echo -e "\n plink2 downloaded and unzipped in /home/jupyter/tools \n "

fi

Plink2 is already installed in /home/jupyter/tools/


## ANNOVAR

In [7]:
%%bash

# Install ANNOVAR:
# https://www.openbioinformatics.org/annovar/annovar_download_form.php

if test -e /home/jupyter/tools/annovar; then

echo "annovar is already installed in /home/jupyter/tools/"
else
echo "annovar is not installed"
cd /home/jupyter/tools/

wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz

tar xvfz annovar.latest.tar.gz

fi

annovar is already installed in /home/jupyter/tools/


In [8]:
%%bash

# Install ANNOVAR: Download resources for annotation

cd /home/jupyter/tools/annovar/

perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20140902 humandb/
#perl annotate_variation.pl -buildver hg38 -downdb cytoBand humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ensGene humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar exac03 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp147 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp30a humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad211_genome humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ljb26_all humandb/

NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGene.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneMrna.fa.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneVersion.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory
NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20140902.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20140902.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished d

## RVTests

In [9]:
%%bash

#Install RVTESTS: Option 1 (~15min)
if test -e /home/jupyter/tools/rvtests; then

echo "rvtests is already installed"
else
echo "rvtests is not installed"

mkdir /home/jupyter/tools/rvtests
cd /home/jupyter/tools/rvtests

wget https://github.com/zhanxw/rvtests/releases/download/v2.1.0/rvtests_linux64.tar.gz 

tar -zxvf rvtests_linux64.tar.gz
fi

rvtests is already installed


In [10]:
# chmod to make sure you have permission to run the program
! chmod u+x /home/jupyter/tools/plink
! chmod u+x /home/jupyter/tools/plink2
! chmod 777 /home/jupyter/tools/rvtests/executable/rvtest

# Copy Over Files 

In [None]:
## chr 3
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_IMPUTED_GENO_PATH}/{ancestry}/chr3_{ancestry}_release7.* {WORK_DIR}')

In [None]:
## clinical data 

## master key
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_CLINICAL_RELEASE_PATH}/master_key_release7_final.csv {WORK_DIR}')

## related file 
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_META_RELEASE_PATH}/related_samples/{ancestry}_release7.related {WORK_DIR}')

In [13]:
## PCs
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_RAW_GENO_PATH}/{ancestry}/{ancestry}_release7.eigenvec {WORK_DIR}')

Executing: gsutil -u terra-f03d16c4 -m cp gs://gp2tier2/release7_30042024/raw_genotypes/SAS/SAS_release7.eigenvec 202407_BURDEN_SAS_R7


Copying gs://gp2tier2/release7_30042024/raw_genotypes/SAS/SAS_release7.eigenvec...
/ [1/1 files][ 72.2 KiB/ 72.2 KiB] 100% Done                                    
Operation completed over 1 objects/72.2 KiB.                                     


In [None]:
# refFlat
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {WORKSPACE_BUCKET}/notebooks/misc/refFlat_HG38_all_chr.txt {WORK_DIR}')

In [None]:
## Check files
! ls {WORK_DIR}

# Covariate File

In [None]:
# Let's load the master key
key = pd.read_csv(f'{WORK_DIR}/master_key_release7_final.csv')
print(key.shape)
key.head()

In [None]:
key = key[['GP2sampleID', 'baseline_GP2_phenotype', 'biological_sex_for_qc', 'age_at_sample_collection', 'age_of_onset', 'race_for_qc', 'label']]
key.rename(columns = {'GP2sampleID': 'IID',
                      'baseline_GP2_phenotype':'phenotype',
                                     'biological_sex_for_qc':'SEX', 
                                     'age_at_sample_collection':'AGE', 
                                     'race_for_qc':'RACE',
                                     'age_of_onset':'AAO'}, inplace = True)
key

In [None]:
## Subset to keep ancestry of interest 
ancestry_key = key[key['label']==ancestry].copy()
ancestry_key.reset_index(drop=True)

In [None]:
# Load information about related individuals 
related_df = pd.read_csv(f'{WORK_DIR}/{ancestry}_release7.related')
print(related_df.shape)
related_df

In [None]:
# Make a list of just one set of related people
related_list = list(related_df['IID1'])

# Check value counts of related and remove only one related individual
ancestry_key = ancestry_key[~ancestry_key["IID"].isin(related_list)]

# Check size
print(ancestry_key.shape)
ancestry_key

In [21]:
# Convert phenotype to binary (0/1)
    # PD = 1; control = 0
pheno_mapping = {"PD": 1, "Control": 0}
ancestry_key['PHENO'] = ancestry_key['phenotype'].map(pheno_mapping).astype('Int64')

In [22]:
# Check value counts of pheno
ancestry_key['PHENO'].value_counts(dropna=False)

PHENO
1       335
0       206
<NA>     23
Name: count, dtype: Int64

In [None]:
## Get the PCs
pcs = pd.read_csv(f'{WORK_DIR}/{ancestry}_release7.eigenvec', sep='\t')

# Split the single column into multiple columns (No longer needed with release 7)
#pcs_split = pcs['#FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10'].str.split(',', expand=True)

selected_columns = ['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']


#pcs = pd.DataFrame(data=pcs_split.iloc[:, 1:7].values, columns=selected_columns)
pcs = pd.DataFrame(data=pcs.iloc[:, 1:7].values, columns=selected_columns)

# Drop the first row (since it's now the column names)
pcs = pcs.drop(0)

# Reset the index to remove any potential issues
pcs = pcs.reset_index(drop=True)


# Display the resulting DataFrame
print(pcs)

In [24]:
# Convert sex to binary (0/1)
    # Female = 1; male = 0
sex_mapping = {"Female": 1, "Male": 0}
ancestry_key['SEX'] = ancestry_key['SEX'].map(sex_mapping).astype('Int64')

In [25]:
# Check value counts of SEX
ancestry_key['SEX'].value_counts(dropna=False)

SEX
0       364
1       199
<NA>      1
Name: count, dtype: Int64

In [26]:
## Make covariate file
df = pd.merge(pcs, ancestry_key, on='IID', how='left')
df.columns

Index(['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'phenotype', 'SEX', 'AGE',
       'AAO', 'RACE', 'label', 'PHENO'],
      dtype='object')

In [27]:
df['FID'] = 0

In [None]:
## Clean up and keep columns we need 
final_df = df[['FID','IID', 'SEX', 'PHENO', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']].copy()
final_df

In [29]:
final_df.groupby(['PHENO'])['SEX'].value_counts()

PHENO  SEX
0      0      133
       1       65
1      0      202
       1      116
Name: count, dtype: int64

In [30]:
## Make file of sample IDs to keep 
samples_toKeep = final_df[['FID', 'IID']].copy()
samples_toKeep.to_csv(f'{WORK_DIR}/{ancestry}.samplestoKeep', sep = '\t', index=False, header=None)

In [31]:
## Save your covariate file
final_df.to_csv(f'{WORK_DIR}/{ancestry}_covariate_file.txt', sep = '\t', index=False)

In [32]:
## check to make sure file was created and saved
! ls {WORK_DIR}

chr3_SAS_release7.log	master_key_release7_final.csv  SAS_release7.related
chr3_SAS_release7.pgen	refFlat_HG38_all_chr.txt       SAS.samplestoKeep
chr3_SAS_release7.psam	SAS_covariate_file.txt
chr3_SAS_release7.pvar	SAS_release7.eigenvec


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_covariate_file.txt {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}.samplestoKeep {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/{WORKSPACE_DIR}//')

# Annotation using ANNOVAR

* *DNAJC13* from NCBI gene
* hg38 (chr3:132417502-132539032)

In [35]:
## extract region using plink

! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry}_release7 \
--chr 3 \
--from-bp 132417502  \
--to-bp 132539032 \
--recode vcf id-paste=iid \
--mac 2 \
--out {WORK_DIR}/{ancestry}_DNAJC13

PLINK v2.00a6LM AVX2 AMD (4 Jul 2024)          www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_SAS_R7/SAS_DNAJC13.log.
Options in effect:
  --chr 3
  --export vcf id-paste=iid
  --from-bp 132417502
  --mac 2
  --out 202407_BURDEN_SAS_R7/SAS_DNAJC13
  --pfile 202407_BURDEN_SAS_R7/chr3_SAS_release7
  --to-bp 132539032

Start time: Wed Jul 10 01:19:49 2024
7450 MiB RAM detected, ~4290 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
576 samples (200 females, 376 males; 576 founders) loaded from
202407_BURDEN_SAS_R7/chr3_SAS_release7.psam.
1959689 variants loaded from 202407_BURDEN_SAS_R7/chr3_SAS_release7.pvar.
1 binary phenotype loaded (338 cases, 216 controls).
Calculating allele frequencies... done.
462 variants removed due to allele frequency threshold(s)
(--maf/--max-maf/--mac/--max-mac).
540 variants remaining after main filters.
--export vcf to 202407_BURDEN_SAS_R

In [36]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry}_DNAJC13.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry}_DNAJC13.vcf.gz

In [37]:
## annotate using ANNOVAR
! perl /home/jupyter/tools/annovar/table_annovar.pl {WORK_DIR}/{ancestry}_DNAJC13.vcf.gz /home/jupyter/tools/annovar/humandb/ -buildver hg38 \
-out {WORK_DIR}/{ancestry}_DNAJC13.annovar \
-remove -protocol refGene,clinvar_20140902 \
-operation g,f \
--nopolish \
-nastring . \
-vcfinput


NOTICE: Running with system command <convert2annovar.pl  -includeinfo -allsample -withfreq -format vcf4 202407_BURDEN_SAS_R7/SAS_DNAJC13.vcf.gz > 202407_BURDEN_SAS_R7/SAS_DNAJC13.annovar.avinput>
NOTICE: Finished reading 559 lines from VCF file
NOTICE: A total of 540 locus in VCF file passed QC threshold, representing 495 SNPs (347 transitions and 148 transversions) and 45 indels/substitutions
NOTICE: Finished writing allele frequencies based on 285120 SNP genotypes (199872 transitions and 85248 transversions) and 25920 indels/substitutions for 576 samples

NOTICE: Running with system command </home/jupyter/tools/annovar/table_annovar.pl 202407_BURDEN_SAS_R7/SAS_DNAJC13.annovar.avinput /home/jupyter/tools/annovar/humandb/ -buildver hg38 -outfile 202407_BURDEN_SAS_R7/SAS_DNAJC13.annovar -remove -protocol refGene,clinvar_20140902 -operation g,f --nopolish -nastring . -otherinfo>
-----------------------------------------------------------------
NOTICE: Processing operation=g protocol=ref

In [38]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo579,Otherinfo580,Otherinfo581,Otherinfo582,Otherinfo583,Otherinfo584,Otherinfo585,Otherinfo586,Otherinfo587,Otherinfo588
0,3,132417623,132417623,C,-,UTR5,DNAJC13,NM_001329126:c.-16928del-;NM_015268:c.-16928del-,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
1,3,132418181,132418181,A,G,intronic,DNAJC13,.,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2,3,132418296,132418296,C,T,intronic,DNAJC13,.,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
3,3,132418405,132418405,G,A,intronic,DNAJC13,.,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
4,3,132418415,132418415,A,G,intronic,DNAJC13,.,.,.,...,0|0,0|1,0|0,0|0,1|0,0|0,0|0,0|0,0|1,0|1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,3,132538575,132538575,A,G,UTR3,DNAJC13,NM_001329126:c.*293A>G;NM_015268:c.*293A>G,.,.,...,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,0|1,0|1
536,3,132538679,132538679,A,T,UTR3,DNAJC13,NM_001329126:c.*397A>T;NM_015268:c.*397A>T,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
537,3,132538734,132538734,G,A,UTR3,DNAJC13,NM_001329126:c.*452G>A;NM_015268:c.*452G>A,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
538,3,132538890,132538890,G,A,UTR3,DNAJC13,NM_001329126:c.*608G>A;NM_015268:c.*608G>A,.,.,...,0|0,.|.,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [39]:
## Filter intronic
intronic = gene[(gene['Func.refGene'] == 'intronic')]

In [40]:
## Filter UTR3
utr3 = gene[(gene['Func.refGene'] == 'UTR3')]

In [41]:
## Filter UTR5
utr5 = gene[(gene['Func.refGene'] == 'UTR5')]

In [42]:
## Filter exonic and synonymous variants
coding_synonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'synonymous SNV')]

In [43]:
# Filter exonic and non-synonymous variants
coding_nonsynonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'nonsynonymous SNV')]
coding_nonsynonymous

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo579,Otherinfo580,Otherinfo581,Otherinfo582,Otherinfo583,Otherinfo584,Otherinfo585,Otherinfo586,Otherinfo587,Otherinfo588
235,3,132475009,132475009,C,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon22:c.C2369A:p.S790Y,DNAJ...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
326,3,132494190,132494190,A,G,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon34:c.A3872G:p.E1291G,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
349,3,132499231,132499231,C,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon37:c.C4262T:p.A1421V,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
353,3,132499777,132499777,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon38:c.G4385A:p.R1462H,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
354,3,132499779,132499779,G,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon38:c.G4387T:p.A1463S,DNA...",...,1|1,1|1,1|0,1|0,1|1,1|1,0|1,0|0,1|1,0|1
367,3,132502295,132502295,C,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon40:c.C4543T:p.P1515S,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
388,3,132507256,132507256,A,G,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon43:c.A5018G:p.Y1673C,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
437,3,132516789,132516789,G,C,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon48:c.G5646C:p.M1882I,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
467,3,132523636,132523636,G,C,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon51:c.G5983C:p.V1995L,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
532,3,132538225,132538225,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon56:c.G6675A:p.M2225I,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [44]:
totalVariants = len(gene.axes[0])
totalIntronic = len(intronic.axes[0])
totalUTR3 = len(utr3.axes[0])
totalUTR5 = len(utr5.axes[0])
totalExonicSyn = len(coding_synonymous.axes[0])
totalExonicNonSyn = len(coding_nonsynonymous.axes[0])

In [45]:
print("Total Variants: ", totalVariants)
print("Intronic:", totalIntronic)
print("UTR3:", totalUTR3)
print("UTR5:", totalUTR5)
print("Exonic Syn:", totalExonicSyn)
print("Exonic NonSyn:", totalExonicNonSyn)

Total Variants:  540
Intronic: 519
UTR3: 7
UTR5: 1
Exonic Syn: 3
Exonic NonSyn: 10


In [46]:
# Save in PLINK format 
variants_toKeep = coding_nonsynonymous[['Chr', 'Start', 'End', 'Gene.refGene']].copy()
variants_toKeep.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep', sep="\t", index=False, header=False)
variants_toKeep

Unnamed: 0,Chr,Start,End,Gene.refGene
235,3,132475009,132475009,DNAJC13
326,3,132494190,132494190,DNAJC13
349,3,132499231,132499231,DNAJC13
353,3,132499777,132499777,DNAJC13
354,3,132499779,132499779,DNAJC13
367,3,132502295,132502295,DNAJC13
388,3,132507256,132507256,DNAJC13
437,3,132516789,132516789,DNAJC13
467,3,132523636,132523636,DNAJC13
532,3,132538225,132538225,DNAJC13


In [47]:
## check to make sure file was created and saved
! ls {WORK_DIR}

chr3_SAS_release7.log
chr3_SAS_release7.pgen
chr3_SAS_release7.psam
chr3_SAS_release7.pvar
master_key_release7_final.csv
refFlat_HG38_all_chr.txt
SAS_covariate_file.txt
SAS_DNAJC13.all_coding_nonsyn.variantstoKeep
SAS_DNAJC13.annovar.avinput
SAS_DNAJC13.annovar.hg38_multianno.txt
SAS_DNAJC13.annovar.hg38_multianno.vcf
SAS_DNAJC13.log
SAS_DNAJC13.vcf.gz
SAS_DNAJC13.vcf.gz.tbi
SAS_release7.eigenvec
SAS_release7.related
SAS.samplestoKeep


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep {WORKSPACE_BUCKET}//{WORKSPACE_DIR}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

## Burden Analyses using RVTests

In [50]:
# Convert the files from Plink 2.0 to Plink 1.9 format

! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry}_release7 \
--make-bed \
--out {WORK_DIR}/chr3_{ancestry}_release7

PLINK v2.00a6LM AVX2 AMD (4 Jul 2024)          www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_SAS_R7/chr3_SAS_release7.log.
Options in effect:
  --make-bed
  --out 202407_BURDEN_SAS_R7/chr3_SAS_release7
  --pfile 202407_BURDEN_SAS_R7/chr3_SAS_release7

Start time: Wed Jul 10 01:20:15 2024
7450 MiB RAM detected, ~4293 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
576 samples (200 females, 376 males; 576 founders) loaded from
202407_BURDEN_SAS_R7/chr3_SAS_release7.psam.
1959689 variants loaded from 202407_BURDEN_SAS_R7/chr3_SAS_release7.pvar.
1 binary phenotype loaded (338 cases, 216 controls).
Writing 202407_BURDEN_SAS_R7/chr3_SAS_release7.fam ... done.
Writing 202407_BURDEN_SAS_R7/chr3_SAS_release7.bim ... done.
Writing 202407_BURDEN_SAS_R7/chr3_SAS_release7.bed ... 101316202326303336404346505356606366707376808386909396done.
End time: Wed Jul 10 01:20:37 2024


In [51]:
## extract variants
    # later do based on class and frequency

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--recode vcf-iid \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding.log.
Options in effect:
  --bfile 202407_BURDEN_SAS_R7/chr3_SAS_release7
  --extract range 202407_BURDEN_SAS_R7/SAS_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 202407_BURDEN_SAS_R7/SAS.samplestoKeep
  --out 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding
  --recode vcf-iid

7450 MB RAM detected; reserving 3725 MB for main workspace.
1959689 variants loaded from .bim file.
576 people (376 males, 200 females) loaded from .fam.
554 phenotype values loaded from .fam.
--extract range: 1959679 variants excluded.
--extract range: 10 variants remaining.
--keep: 575 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 575 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181920212223242526272829303132333435

In [52]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf.gz

In [53]:
## RVtests with covariates 
! /home/jupyter/tools/rvtests/executable/rvtest --noweb --hide-covar \
--out {WORK_DIR}/{ancestry}_DNAJC13.burden.coding \
--kernel skat,skato \
--inVcf {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf.gz \
--pheno {WORK_DIR}/{ancestry}_covariate_file.txt \
--mpheno 4 \
--gene DNAJC13 \
--geneFile {WORK_DIR}/refFlat_HG38_all_chr.txt \
--covar {WORK_DIR}/{ancestry}_covariate_file.txt \
--covar-name SEX,PC1,PC2,PC3,PC4 #[WARN]Covariate [ PC5 ] has strong correlation [ r^2 = 1 ] with the response!


# --out : Name of output 
# --burden cmc --kernel skato: tests to run 
# --inVcf : VCF file 
# --gene: gene name (if only looking at one or a few)
# --geneFile refFlat.txt
# --pheno :  covar file
# --mpheno : # column that has phenotype information
# --pheno-name : column name with phenotype in file
# --covar : covar file
# --freqUpper : optional, MAF cut-off
# --covar-name : covariates, listed by column name, separated by commas (no spaces between commas)
## 0=controls; 1=cases

Thank you for using rvtests (version: 20190205, git: c86e589efef15382603300dc7f4c3394c82d69b8)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, plase send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

The following parameters are available.  Ones with "[]" are in effect:

Available Options
      Basic Input/Output:
                          --inVcf [202407_BURDEN_SAS_R7/SAS_DNAJC13.coding.vcf.gz]
                          --inBgen [], --inBgenSample [], --inKgg []
                          --out [202407_BURDEN_SAS_R7/SAS_DNAJC13.burden.coding]
                          --outputRaw
       Specify Covariate: --covar [202407_BURDEN_SAS_R7/SAS_covariate_file.txt]
                          --covar-name [SEX,PC1,PC2,PC3,PC4], --sex
       Specify Phenotype: --pheno [202407_BURDEN_SAS_R7/SAS_covariate_file.txt]
                          --inverseNormal

In [54]:
## look at results 
! cat {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.Skat.assoc

Gene	RANGE	N_INFORMATIVE	NumVar	NumPolyVar	Q	Pvalue	NumPerm	ActualPerm	Stat	NumGreater	NumEqual	PermPvalue
DNAJC13	3:132417501-132539032,3:132417501-132539032	516	10	9	12.1983	0.650295	10000	2757	12.1983	1000	0	0.362713


In [55]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.SkatO.assoc

Gene	RANGE	N_INFORMATIVE	NumVar	NumPolyVar	Q	rho	Pvalue
DNAJC13	3:132417501-132539032,3:132417501-132539032	516	10	9	14422.1	1	0.354309


In [56]:
## check to make sure file was created and saved
! ls {WORK_DIR}

chr3_SAS_release7.bed
chr3_SAS_release7.bim
chr3_SAS_release7.fam
chr3_SAS_release7.log
chr3_SAS_release7.pgen
chr3_SAS_release7.psam
chr3_SAS_release7.pvar
master_key_release7_final.csv
refFlat_HG38_all_chr.txt
SAS_covariate_file.txt
SAS_DNAJC13.all_coding_nonsyn.variantstoKeep
SAS_DNAJC13.annovar.avinput
SAS_DNAJC13.annovar.hg38_multianno.txt
SAS_DNAJC13.annovar.hg38_multianno.vcf
SAS_DNAJC13.burden.coding.log
SAS_DNAJC13.burden.coding.Skat.assoc
SAS_DNAJC13.burden.coding.SkatO.assoc
SAS_DNAJC13.coding.log
SAS_DNAJC13.coding.vcf.gz
SAS_DNAJC13.coding.vcf.gz.tbi
SAS_DNAJC13.log
SAS_DNAJC13.vcf.gz
SAS_DNAJC13.vcf.gz.tbi
SAS_release7.eigenvec
SAS_release7.related
SAS.samplestoKeep


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.*.assoc {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

# Case/Control Frequencies

In [59]:
## extract variants

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--pheno {WORK_DIR}/{ancestry}_covariate_file_fullAGE.txt \
--pheno-name PHENO \
--assoc \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn --allow-no-sex --ci 0.95

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile 202407_BURDEN_SAS_R7/chr3_SAS_release7
  --ci 0.95
  --extract range 202407_BURDEN_SAS_R7/SAS_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 202407_BURDEN_SAS_R7/SAS.samplestoKeep
  --out 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
1959689 variants loaded from .bim file.
576 people (376 males, 200 females) loaded from .fam.
554 phenotype values loaded from .fam.
--extract range: 1959679 variants excluded.
--extract range: 10 variants remaining.
--keep: 575 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 575 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718

In [60]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc

 CHR                  SNP         BP   A1      F_A      F_U   A2        CHISQ            P           OR           SE          L95          U95 
   3   chr3:132475009:C:A  132475009    A        0 0.009259    C        6.263      0.01233            0          inf            0          nan 
   3   chr3:132494190:A:G  132494190    G 0.004451 0.006944    A       0.3033       0.5818       0.6393       0.8188       0.1285        3.182 
   3   chr3:132499231:C:T  132499231    T 0.004451        0    C        1.928        0.165           NA           NA           NA           NA 
   3   chr3:132499777:G:A  132499777    A 0.004451  0.03472    G        15.07    0.0001037       0.1243       0.6355      0.03577       0.4319 
   3   chr3:132499779:G:T  132499779    G   0.3591   0.3495    T        0.104       0.7471        1.042        0.129       0.8097        1.342 
   3   chr3:132502295:C:T  132502295    T 0.001484 0.002315    C       0.1007       0.7509       0.6404        1.416      0.03995 

In [61]:
assoc = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc', delim_whitespace=True, usecols=['SNP', 'A1', 'A2', 'F_A', 'F_U', 'P', 'OR', 'L95', 'U95'])
assoc

Unnamed: 0,SNP,A1,F_A,F_U,A2,P,OR,L95,U95
0,chr3:132475009:C:A,A,0.0,0.009259,C,0.01233,0.0,0.0,
1,chr3:132494190:A:G,G,0.004451,0.006944,A,0.5818,0.6393,0.1285,3.182
2,chr3:132499231:C:T,T,0.004451,0.0,C,0.165,,,
3,chr3:132499777:G:A,A,0.004451,0.03472,G,0.000104,0.1243,0.03577,0.4319
4,chr3:132499779:G:T,G,0.3591,0.3495,T,0.7471,1.042,0.8097,1.342
5,chr3:132502295:C:T,T,0.001484,0.002315,C,0.7509,0.6404,0.03995,10.27
6,chr3:132507256:A:G,G,0.02226,0.03704,A,0.1462,0.5918,0.2895,1.21
7,chr3:132516789:G:C,C,0.004478,0.004651,G,0.9667,0.9625,0.1602,5.784
8,chr3:132523636:G:C,C,0.001488,0.0,G,0.4225,,,
9,chr3:132538225:G:A,A,0.0,0.0,G,,,,


In [62]:
## repeat association study applying multiple testing corrections for the raw p-values (like Bonferroni post-hoc)
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--assoc \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn --allow-no-sex --adjust

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn.log.
Options in effect:
  --adjust
  --allow-no-sex
  --assoc
  --bfile 202407_BURDEN_SAS_R7/chr3_SAS_release7
  --extract range 202407_BURDEN_SAS_R7/SAS_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 202407_BURDEN_SAS_R7/SAS.samplestoKeep
  --out 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
1959689 variants loaded from .bim file.
576 people (376 males, 200 females) loaded from .fam.
554 phenotype values loaded from .fam.
--extract range: 1959679 variants excluded.
--extract range: 10 variants remaining.
--keep: 575 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 575 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181

In [63]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted

 CHR                  SNP      UNADJ         GC       BONF       HOLM   SIDAK_SS   SIDAK_SD     FDR_BH     FDR_BY
   3   chr3:132499777:G:A  0.0001037   0.001084  0.0009333  0.0009333   0.000933   0.000933  0.0009333    0.00264 
   3   chr3:132475009:C:A    0.01233    0.03513     0.1109    0.09861     0.1056    0.09446    0.05547     0.1569 
   3   chr3:132507256:A:G     0.1462     0.2212          1          1     0.7589     0.6693     0.3712          1 
   3   chr3:132499231:C:T      0.165     0.2424          1          1     0.8026     0.6693     0.3712          1 
   3   chr3:132523636:G:C     0.4225     0.4995          1          1     0.9929     0.9357     0.7604          1 
   3   chr3:132494190:A:G     0.5818     0.6429          1          1     0.9996     0.9694     0.8448          1 
   3   chr3:132499779:G:T     0.7471      0.786          1          1          1     0.9838     0.8448          1 
   3   chr3:132502295:C:T     0.7509     0.7893          1          1    

In [64]:
assoc_adjusted = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted', delim_whitespace=True, usecols=['SNP', 'BONF'])
assoc_adjusted

Unnamed: 0,SNP,BONF
0,chr3:132499777:G:A,0.000933
1,chr3:132475009:C:A,0.1109
2,chr3:132507256:A:G,1.0
3,chr3:132499231:C:T,1.0
4,chr3:132523636:G:C,1.0
5,chr3:132494190:A:G,1.0
6,chr3:132499779:G:T,1.0
7,chr3:132502295:C:T,1.0
8,chr3:132516789:G:C,1.0


In [65]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--freq \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 202407_BURDEN_SAS_R7/chr3_SAS_release7
  --extract range 202407_BURDEN_SAS_R7/SAS_DNAJC13.all_coding_nonsyn.variantstoKeep
  --freq
  --keep 202407_BURDEN_SAS_R7/SAS.samplestoKeep
  --out 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
1959689 variants loaded from .bim file.
576 people (376 males, 200 females) loaded from .fam.
554 phenotype values loaded from .fam.
--extract range: 1959679 variants excluded.
--extract range: 10 variants remaining.
--keep: 575 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 575 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233

In [66]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq

 CHR                  SNP   A1   A2          MAF  NCHROBS
   3   chr3:132475009:C:A    A    C     0.003478     1150
   3   chr3:132494190:A:G    G    A     0.005217     1150
   3   chr3:132499231:C:T    T    C     0.002609     1150
   3   chr3:132499777:G:A    A    G      0.01565     1150
   3   chr3:132499779:G:T    G    T       0.3583     1150
   3   chr3:132502295:C:T    T    C     0.001739     1150
   3   chr3:132507256:A:G    G    A      0.02696     1150
   3   chr3:132516789:G:C    C    G     0.004371     1144
   3   chr3:132523636:G:C    C    G    0.0008711     1148
   3   chr3:132538225:G:A    A    G            0     1102


In [67]:
freq = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq', delim_whitespace=True, usecols=['SNP', 'MAF'])
freq.head()

Unnamed: 0,SNP,MAF
0,chr3:132475009:C:A,0.003478
1,chr3:132494190:A:G,0.005217
2,chr3:132499231:C:T,0.002609
3,chr3:132499777:G:A,0.01565
4,chr3:132499779:G:T,0.3583


In [68]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--freq case-control \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 202407_BURDEN_SAS_R7/chr3_SAS_release7
  --extract range 202407_BURDEN_SAS_R7/SAS_DNAJC13.all_coding_nonsyn.variantstoKeep
  --freq case-control
  --keep 202407_BURDEN_SAS_R7/SAS.samplestoKeep
  --out 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
1959689 variants loaded from .bim file.
576 people (376 males, 200 females) loaded from .fam.
554 phenotype values loaded from .fam.
--extract range: 1959679 variants excluded.
--extract range: 10 variants remaining.
--keep: 575 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 575 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262

In [69]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc

 CHR                  SNP   A1   A2        MAF_A        MAF_U  NCHROBS_A  NCHROBS_U
   3   chr3:132475009:C:A    A    C            0     0.009259        674        432
   3   chr3:132494190:A:G    G    A     0.004451     0.006944        674        432
   3   chr3:132499231:C:T    T    C     0.004451            0        674        432
   3   chr3:132499777:G:A    A    G     0.004451      0.03472        674        432
   3   chr3:132499779:G:T    G    T       0.3591       0.3495        674        432
   3   chr3:132502295:C:T    T    C     0.001484     0.002315        674        432
   3   chr3:132507256:A:G    G    A      0.02226      0.03704        674        432
   3   chr3:132516789:G:C    C    G     0.004478     0.004651        670        430
   3   chr3:132523636:G:C    C    G     0.001488            0        672        432
   3   chr3:132538225:G:A    A    G            0            0        638        424


In [70]:
freq_cc = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc', delim_whitespace=True, usecols=['SNP', 'MAF_A', 'MAF_U'])
freq_cc.head()

Unnamed: 0,SNP,MAF_A,MAF_U
0,chr3:132475009:C:A,0.0,0.009259
1,chr3:132494190:A:G,0.004451,0.006944
2,chr3:132499231:C:T,0.004451,0.0
3,chr3:132499777:G:A,0.004451,0.03472
4,chr3:132499779:G:T,0.3591,0.3495


In [71]:
all_freq = pd.merge(freq, freq_cc, on="SNP", how="left")
all_freq

Unnamed: 0,SNP,MAF,MAF_A,MAF_U
0,chr3:132475009:C:A,0.003478,0.0,0.009259
1,chr3:132494190:A:G,0.005217,0.004451,0.006944
2,chr3:132499231:C:T,0.002609,0.004451,0.0
3,chr3:132499777:G:A,0.01565,0.004451,0.03472
4,chr3:132499779:G:T,0.3583,0.3591,0.3495
5,chr3:132502295:C:T,0.001739,0.001484,0.002315
6,chr3:132507256:A:G,0.02696,0.02226,0.03704
7,chr3:132516789:G:C,0.004371,0.004478,0.004651
8,chr3:132523636:G:C,0.000871,0.001488,0.0
9,chr3:132538225:G:A,0.0,0.0,0.0


In [72]:
assoc_and_freq = pd.merge(all_freq, assoc, on="SNP", how="left")
assoc_and_freq

Unnamed: 0,SNP,MAF,MAF_A,MAF_U,A1,F_A,F_U,A2,P,OR,L95,U95
0,chr3:132475009:C:A,0.003478,0.0,0.009259,A,0.0,0.009259,C,0.01233,0.0,0.0,
1,chr3:132494190:A:G,0.005217,0.004451,0.006944,G,0.004451,0.006944,A,0.5818,0.6393,0.1285,3.182
2,chr3:132499231:C:T,0.002609,0.004451,0.0,T,0.004451,0.0,C,0.165,,,
3,chr3:132499777:G:A,0.01565,0.004451,0.03472,A,0.004451,0.03472,G,0.000104,0.1243,0.03577,0.4319
4,chr3:132499779:G:T,0.3583,0.3591,0.3495,G,0.3591,0.3495,T,0.7471,1.042,0.8097,1.342
5,chr3:132502295:C:T,0.001739,0.001484,0.002315,T,0.001484,0.002315,C,0.7509,0.6404,0.03995,10.27
6,chr3:132507256:A:G,0.02696,0.02226,0.03704,G,0.02226,0.03704,A,0.1462,0.5918,0.2895,1.21
7,chr3:132516789:G:C,0.004371,0.004478,0.004651,C,0.004478,0.004651,G,0.9667,0.9625,0.1602,5.784
8,chr3:132523636:G:C,0.000871,0.001488,0.0,C,0.001488,0.0,G,0.4225,,,
9,chr3:132538225:G:A,0.0,0.0,0.0,A,0.0,0.0,G,,,,


In [73]:
# Merge the results
assoc_and_freq = pd.merge(assoc_and_freq, assoc_adjusted, on="SNP", how="left")
assoc_and_freq.head()

Unnamed: 0,SNP,MAF,MAF_A,MAF_U,A1,F_A,F_U,A2,P,OR,L95,U95,BONF
0,chr3:132475009:C:A,0.003478,0.0,0.009259,A,0.0,0.009259,C,0.01233,0.0,0.0,,0.1109
1,chr3:132494190:A:G,0.005217,0.004451,0.006944,G,0.004451,0.006944,A,0.5818,0.6393,0.1285,3.182,1.0
2,chr3:132499231:C:T,0.002609,0.004451,0.0,T,0.004451,0.0,C,0.165,,,,1.0
3,chr3:132499777:G:A,0.01565,0.004451,0.03472,A,0.004451,0.03472,G,0.000104,0.1243,0.03577,0.4319,0.000933
4,chr3:132499779:G:T,0.3583,0.3591,0.3495,G,0.3591,0.3495,T,0.7471,1.042,0.8097,1.342,1.0


In [74]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--recode A \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 202407_BURDEN_SAS_R7/chr3_SAS_release7
  --extract range 202407_BURDEN_SAS_R7/SAS_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 202407_BURDEN_SAS_R7/SAS.samplestoKeep
  --out 202407_BURDEN_SAS_R7/SAS_DNAJC13.coding_nonsyn
  --recode A

7450 MB RAM detected; reserving 3725 MB for main workspace.
1959689 variants loaded from .bim file.
576 people (376 males, 200 females) loaded from .fam.
554 phenotype values loaded from .fam.
--extract range: 1959679 variants excluded.
--extract range: 10 variants remaining.
--keep: 575 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 575 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031

In [None]:
recode = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.raw', delim_whitespace=True)
recode.head()

In [76]:
# Make a list from the column names
column_names = recode.columns.tolist()

# Drop the first 6 columns to keep the variants 
variants = column_names[6:]

print(f'Number of variants in {ancestry} for DNAJC13: {len(variants)}')
variants

Number of variants in SAS for DNAJC13: 10


['chr3:132475009:C:A_A',
 'chr3:132494190:A:G_G',
 'chr3:132499231:C:T_T',
 'chr3:132499777:G:A_A',
 'chr3:132499779:G:T_G',
 'chr3:132502295:C:T_T',
 'chr3:132507256:A:G_G',
 'chr3:132516789:G:C_C',
 'chr3:132523636:G:C_C',
 'chr3:132538225:G:A_A']

In [77]:
# Pre-filter the dataset
cases_data = recode[recode['PHENOTYPE'] == 2]
controls_data = recode[recode['PHENOTYPE'] == 1]

results = []

for variant in variants:
    # For cases
    hom_cases = cases_data[cases_data[variant] == 2].shape[0]
    het_cases = cases_data[cases_data[variant] == 1].shape[0]
    total_cases = cases_data.shape[0]
#    freq_cases = (hom_cases + het_cases) / total_cases

    # For controls
    hom_controls = controls_data[controls_data[variant] == 2].shape[0]
    het_controls = controls_data[controls_data[variant] == 1].shape[0]
    total_controls = controls_data.shape[0]
#    freq_controls = (hom_controls + het_controls) / total_controls

    results.append({
        'Variant': variant,
        'Hom Cases': hom_cases,
        'Het Cases': het_cases,
        'Total Cases': total_cases,
#        'Freq in Cases': freq_cases,
        'Hom Controls': hom_controls,
        'Het Controls': het_controls,
        'Total Controls': total_controls,
#        'Freq in Controls': freq_controls
    })

# Return
df_results = pd.DataFrame(results)
df_results['SNP'] = df_results['Variant'].apply(lambda x: x.rsplit('_', 1)[0])

df_results

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls,SNP
0,chr3:132475009:C:A_A,0,0,337,0,4,216,chr3:132475009:C:A
1,chr3:132494190:A:G_G,0,3,337,0,3,216,chr3:132494190:A:G
2,chr3:132499231:C:T_T,0,3,337,0,0,216,chr3:132499231:C:T
3,chr3:132499777:G:A_A,0,3,337,1,13,216,chr3:132499777:G:A
4,chr3:132499779:G:T_G,43,156,337,24,103,216,chr3:132499779:G:T
5,chr3:132502295:C:T_T,0,1,337,0,1,216,chr3:132502295:C:T
6,chr3:132507256:A:G_G,0,15,337,1,14,216,chr3:132507256:A:G
7,chr3:132516789:G:C_C,0,3,337,0,2,216,chr3:132516789:G:C
8,chr3:132523636:G:C_C,0,1,337,0,0,216,chr3:132523636:G:C
9,chr3:132538225:G:A_A,0,0,337,0,0,216,chr3:132538225:G:A


In [78]:
## Merge with assoc results 
full_results = pd.merge(assoc_and_freq, df_results, on="SNP", how="left")

In [79]:
clean_full_results = full_results[['SNP', 'A1', 'A2', 'F_A', 'F_U', 'P', 'OR','L95', 'U95', 'BONF', 
                                   'MAF', 'MAF_A', 'MAF_U', 
                                   'Hom Cases', 'Het Cases', 'Total Cases', 
                                   'Hom Controls','Het Controls', 'Total Controls']].copy()

In [80]:
print(clean_full_results.shape)
print(f'No. of SNPs: {clean_full_results.shape[0]}')
clean_full_results

(10, 19)
No. of SNPs: 10


Unnamed: 0,SNP,A1,A2,F_A,F_U,P,OR,L95,U95,BONF,MAF,MAF_A,MAF_U,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls
0,chr3:132475009:C:A,A,C,0.0,0.009259,0.01233,0.0,0.0,,0.1109,0.003478,0.0,0.009259,0,0,337,0,4,216
1,chr3:132494190:A:G,G,A,0.004451,0.006944,0.5818,0.6393,0.1285,3.182,1.0,0.005217,0.004451,0.006944,0,3,337,0,3,216
2,chr3:132499231:C:T,T,C,0.004451,0.0,0.165,,,,1.0,0.002609,0.004451,0.0,0,3,337,0,0,216
3,chr3:132499777:G:A,A,G,0.004451,0.03472,0.000104,0.1243,0.03577,0.4319,0.000933,0.01565,0.004451,0.03472,0,3,337,1,13,216
4,chr3:132499779:G:T,G,T,0.3591,0.3495,0.7471,1.042,0.8097,1.342,1.0,0.3583,0.3591,0.3495,43,156,337,24,103,216
5,chr3:132502295:C:T,T,C,0.001484,0.002315,0.7509,0.6404,0.03995,10.27,1.0,0.001739,0.001484,0.002315,0,1,337,0,1,216
6,chr3:132507256:A:G,G,A,0.02226,0.03704,0.1462,0.5918,0.2895,1.21,1.0,0.02696,0.02226,0.03704,0,15,337,1,14,216
7,chr3:132516789:G:C,C,G,0.004478,0.004651,0.9667,0.9625,0.1602,5.784,1.0,0.004371,0.004478,0.004651,0,3,337,0,2,216
8,chr3:132523636:G:C,C,G,0.001488,0.0,0.4225,,,,1.0,0.000871,0.001488,0.0,0,1,337,0,0,216
9,chr3:132538225:G:A,A,G,0.0,0.0,,,,,,0.0,0.0,0.0,0,0,337,0,0,216


In [81]:
# Look at significant SNPs, if any 
sig_freq = clean_full_results[clean_full_results['P']<0.05]
sig_snps = sig_freq['SNP'].tolist()
sig_snps

['chr3:132475009:C:A', 'chr3:132499777:G:A']

In [82]:
# Look at significant SNPs, if any 
sig_df_results = clean_full_results[clean_full_results['SNP'].isin(sig_snps)]
sig_df_results

Unnamed: 0,SNP,A1,A2,F_A,F_U,P,OR,L95,U95,BONF,MAF,MAF_A,MAF_U,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls
0,chr3:132475009:C:A,A,C,0.0,0.009259,0.01233,0.0,0.0,,0.1109,0.003478,0.0,0.009259,0,0,337,0,4,216
3,chr3:132499777:G:A,A,G,0.004451,0.03472,0.000104,0.1243,0.03577,0.4319,0.000933,0.01565,0.004451,0.03472,0,3,337,1,13,216


In [83]:
for snp in sig_snps:
    ! grep {snp} {WORK_DIR}/chr3_{ancestry}_release7.pvar

3	132475009	chr3:132475009:C:A	C	A	TYPED;IMPUTED;AF=0.00318583;MAF=0.00318583;AVG_CS=0.999964;R2=0.988774;ER2=0.777405
3	132499777	chr3:132499777:G:A	G	A	TYPED;IMPUTED;AF=0.0150827;MAF=0.0150827;AVG_CS=0.999878;R2=0.991941;ER2=0.809756


In [84]:
# Save files to VM
clean_full_results.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.fullVariantInformation.txt', sep="\t", index=False)
sig_df_results.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.SignificantVariantInformation.txt' , sep="\t", index=False)

In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.fullVariantInformation.txt {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.SignificantVariantInformation.txt {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.raw {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/{WORKSPACE_DIR}/')