# Variants in DNAJC13 and their Association with Parkinson's Disease Across Different Ancestral Backgrounds

* **Project**: DNAJC13 Gene Assesment
* **Last updated**: September 2024
* **Version**: Python 3.9
* **Data**: GP2 Release 7

# Summary
This notebook is focused on analysing association between DNAJC13 variants in European (EUR) ancestry and Parkinson's disease.

# Imports

In [1]:
# Use the os package to interact with the environment
import os
import sys

# Bring in Pandas for Dataframe functionality
import pandas as pd
from functools import reduce

# Bring some visualization functionality 
import seaborn as sns

# numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

  from IPython.core.display import display, HTML


In [2]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    print(f'Executing: {query}', file=sys.stderr)
    return pd.read_gbq(query, project_id=BILLING_PROJECT_ID, dialect='standard')

# Utility routine for display a message and a link
def display_html_link(description, link_text, url):
    html = f'''
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
    </p>
    '''

    display(HTML(html))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS"""
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Return a DataFrame from the contents of a delimited file in GCS"""
    return pd.read_csv(StringIO(gcs_read_file(path)), sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    url = '{}?{}'.format(
        os.path.join('https://console.cloud.google.com/storage/browser',
                     gcs_path.replace("gs://","")),
        urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID}))

    display_html_link(description, link_text, url)

In [None]:
# Set up billing project and data path variables
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

## AMP-PD v2.5
## Explicitly define release v2.5 path 
AMP_CLINICAL_RELEASE_PATH = f'{AMP_RELEASE_PATH}/clinical'

AMP_WGS_RELEASE_PLINK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'plink')
AMP_WGS_RELEASE_GATK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'gatk')

## Print the information to check we are in the proper release and billing 
## This will be different for you, the user, depending on the billing project your workspace is on
print('Billing and Workspace')
print(f'Workspace Name @ `WORKSPACE_NAME`: {WORKSPACE_NAME}')
print(f'Billing Project @ `BILLING_PROJECT_ID`: {BILLING_PROJECT_ID}')
print(f'Workspace Bucket, where you can upload and download data @ `WORKSPACE_BUCKET`: {WORKSPACE_BUCKET}')
print('')

print('AMP-PD v2.5')
print(f'Path to AMP-PD v2.5 Clinical Data @ `AMP_CLINICAL_RELEASE_PATH`: {AMP_CLINICAL_RELEASE_PATH}')
print(f'Path to AMP-PD v2.5 WGS Data @ `AMP_WGS_RELEASE_PLINK_PATH`: {AMP_WGS_RELEASE_PLINK_PATH}')
print('')

## GP2 v7.0
## Explicitly define release v7.0 path 
GP2_CLINICAL_RELEASE_PATH = f'{GP2_RELEASE_PATH}/clinical_data'
GP2_META_RELEASE_PATH = f'{GP2_RELEASE_PATH}/meta_data'
GP2_SUMSTAT_RELEASE_PATH = f'{GP2_RELEASE_PATH}/summary_statistics'

GP2_RAW_GENO_PATH = f'{GP2_RELEASE_PATH}/raw_genotypes'
GP2_IMPUTED_GENO_PATH = f'{GP2_RELEASE_PATH}/imputed_genotypes'
print('GP2 v7.0')
print(f'Path to GP2 v7.0 Clinical Data @ `GP2_CLINICAL_RELEASE_PATH`: {GP2_CLINICAL_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Metadata @ `GP2_META_RELEASE_PATH`: {GP2_META_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Raw Genotype Data @ `GP2_RAW_GENO_PATH`: {GP2_RAW_GENO_PATH}')
print(f'Path to GP2 v7.0 Imputed Genotype Data @ `GP2_IMPUTED_GENO_PATH`: {GP2_IMPUTED_GENO_PATH}')

# Make working directory

In [2]:
ancestry = 'EUR'
WORK_DIR = f'2024_BURDEN_{ancestry}_R7'

#! mkdir {WORK_DIR}

# Install Packages

## PLINK

In [None]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink; then
echo "Plink1.9 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink \n    -------"
wget -N http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20190304.zip 
unzip -o plink_linux_x86_64_20190304.zip
echo -e "\n plink downloaded and unzipped in /home/jupyter/tools \n "

fi

In [None]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink2; then
echo "Plink2 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink2 \n    -------"
wget -N https://s3.amazonaws.com/plink2-assets/alpha3/plink2_linux_avx2_20220603.zip
unzip -o plink2_linux_avx2_20220603.zip
echo -e "\n plink2 downloaded and unzipped in /home/jupyter/tools \n "

fi

## ANNOVAR

In [None]:
%%bash

# Install ANNOVAR:
# https://www.openbioinformatics.org/annovar/annovar_download_form.php

if test -e /home/jupyter/tools/annovar; then

echo "annovar is already installed in /home/jupyter/tools/"
else
echo "annovar is not installed"
cd /home/jupyter/tools/

wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz

tar xvfz annovar.latest.tar.gz

fi

In [None]:
%%bash

# Install ANNOVAR: Download resources for annotation

cd /home/jupyter/tools/annovar/

perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20140902 humandb/
#perl annotate_variation.pl -buildver hg38 -downdb cytoBand humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ensGene humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar exac03 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp147 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp30a humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad211_genome humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ljb26_all humandb/

## RVTests

In [None]:
%%bash

#Install RVTESTS: Option 1 (~15min)
if test -e /home/jupyter/tools/rvtests; then

echo "rvtests is already installed"
else
echo "rvtests is not installed"

mkdir /home/jupyter/tools/rvtests
cd /home/jupyter/tools/rvtests

wget https://github.com/zhanxw/rvtests/releases/download/v2.1.0/rvtests_linux64.tar.gz 

tar -zxvf rvtests_linux64.tar.gz
fi

In [None]:
# chmod to make sure you have permission to run the program
! chmod u+x /home/jupyter/tools/plink
! chmod u+x /home/jupyter/tools/plink2
! chmod 777 /home/jupyter/tools/rvtests/executable/rvtest

# Copy Over Files 

In [None]:
## chr 3
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_IMPUTED_GENO_PATH}/{ancestry}/chr3_{ancestry}_release7.* {WORK_DIR}')

In [None]:
## clinical data 

## master key
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_CLINICAL_RELEASE_PATH}/master_key_release7_final.csv {WORK_DIR}')

## related file 
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_META_RELEASE_PATH}/related_samples/{ancestry}_release7.related {WORK_DIR}')

In [None]:
## PCs
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_RAW_GENO_PATH}/{ancestry}/{ancestry}_release7.eigenvec {WORK_DIR}')

In [None]:
# refFlat
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {WORKSPACE_BUCKET}/notebooks/misc/refFlat_HG38_all_chr.txt {WORK_DIR}')

In [9]:
## Check files
! ls {WORK_DIR}

chr3_EUR_release7.log	chr3_EUR_release7.pvar	master_key_release7_final.csv
chr3_EUR_release7.pgen	EUR_release7.eigenvec	refFlat_HG38_all_chr.txt
chr3_EUR_release7.psam	EUR_release7.related


# Covariate File

In [None]:
# Let's load the master key
key = pd.read_csv(f'{WORK_DIR}/master_key_release7_final.csv')
print(key.shape)
key.head()

In [None]:
key = key[['GP2sampleID', 'baseline_GP2_phenotype', 'biological_sex_for_qc', 'age_at_sample_collection', 'age_of_onset', 'race_for_qc', 'label']]
key.rename(columns = {'GP2sampleID': 'IID',
                      'baseline_GP2_phenotype':'phenotype',
                                     'biological_sex_for_qc':'SEX', 
                                     'age_at_sample_collection':'AGE', 
                                     'race_for_qc':'RACE',
                                     'age_of_onset':'AAO'}, inplace = True)
key

In [None]:
## Subset to keep ancestry of interest 
ancestry_key = key[key['label']==ancestry].copy()
ancestry_key.reset_index(drop=True)

In [None]:
# Load information about related individuals 
related_df = pd.read_csv(f'{WORK_DIR}/{ancestry}_release7.related')
print(related_df.shape)
related_df

In [None]:
# Make a list of just one set of related people
related_list = list(related_df['IID1'])

# Check value counts of related and remove only one related individual
ancestry_key = ancestry_key[~ancestry_key["IID"].isin(related_list)]

# Check size
print(ancestry_key.shape)
ancestry_key

In [15]:
# Convert phenotype to binary (0/1)
    # PD = 1; control = 0
pheno_mapping = {"PD": 1, "Control": 0}
ancestry_key['PHENO'] = ancestry_key['phenotype'].map(pheno_mapping).astype('Int64')

In [16]:
# Check value counts of pheno
ancestry_key['PHENO'].value_counts(dropna=False)

PHENO
1       14003
<NA>     7231
0        5503
Name: count, dtype: Int64

In [None]:
## Get the PCs
pcs = pd.read_csv(f'{WORK_DIR}/{ancestry}_release7.eigenvec', sep='\t')

# Split the single column into multiple columns (No longer needed with release 7)
#pcs_split = pcs['#FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10'].str.split(',', expand=True)

selected_columns = ['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']


#pcs = pd.DataFrame(data=pcs_split.iloc[:, 1:7].values, columns=selected_columns)
pcs = pd.DataFrame(data=pcs.iloc[:, 1:7].values, columns=selected_columns)

# Drop the first row (since it's now the column names)
pcs = pcs.drop(0)

# Reset the index to remove any potential issues
pcs = pcs.reset_index(drop=True)


# Display the resulting DataFrame
print(pcs)

In [18]:
# Convert sex to binary (0/1)
    # Female = 1; male = 0
sex_mapping = {"Female": 1, "Male": 0}
ancestry_key['SEX'] = ancestry_key['SEX'].map(sex_mapping).astype('Int64')

In [19]:
# Check value counts of SEX
ancestry_key['SEX'].value_counts(dropna=False)

SEX
0       16130
1       10434
<NA>      173
Name: count, dtype: Int64

In [20]:
## Make covariate file
df = pd.merge(pcs, ancestry_key, on='IID', how='left')
df.columns

Index(['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'phenotype', 'SEX', 'AGE',
       'AAO', 'RACE', 'label', 'PHENO'],
      dtype='object')

In [21]:
df['FID'] = 0

In [None]:
## Clean up and keep columns we need 
final_df = df[['FID','IID', 'SEX', 'PHENO', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']].copy()
final_df

In [23]:
final_df.groupby(['PHENO'])['SEX'].value_counts()

PHENO  SEX
0      0      2987
       1      2309
1      0      8279
       1      4850
Name: count, dtype: int64

In [24]:
## Make file of sample IDs to keep 
samples_toKeep = final_df[['FID', 'IID']].copy()
samples_toKeep.to_csv(f'{WORK_DIR}/{ancestry}.samplestoKeep', sep = '\t', index=False, header=None)

In [25]:
## Save your covariate file
final_df.to_csv(f'{WORK_DIR}/{ancestry}_covariate_file.txt', sep = '\t', index=False)

In [26]:
## check to make sure file was created and saved
! ls {WORK_DIR}

chr3_EUR_release7.log	EUR_covariate_file.txt	master_key_release7_final.csv
chr3_EUR_release7.pgen	EUR_release7.eigenvec	refFlat_HG38_all_chr.txt
chr3_EUR_release7.psam	EUR_release7.related
chr3_EUR_release7.pvar	EUR.samplestoKeep


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_covariate_file.txt {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/{ancestry}_covariate_file.txt')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}.samplestoKeep {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/{ancestry}.samplestoKeep')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')

# Annotation using ANNOVAR

* *DNAJC13* from NCBI gene
* hg38 (chr3:132417502-132539032)

In [5]:
## extract region using plink

! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry}_release7 \
--chr 3 \
--from-bp 132417502  \
--to-bp 132539032 \
--recode vcf id-paste=iid \
--mac 2 \
--out {WORK_DIR}/{ancestry}_DNAJC13

PLINK v2.00a5.10LM AVX2 Intel (5 Jan 2024)     www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_EUR_R7/EUR_DNAJC13.log.
Options in effect:
  --chr 3
  --export vcf id-paste=iid
  --from-bp 132417502
  --mac 2
  --out 2024_BURDEN_EUR_R7/EUR_DNAJC13
  --pfile 2024_BURDEN_EUR_R7/chr3_EUR_release7
  --to-bp 132539032

Start time: Thu May  2 18:39:36 2024
7450 MiB RAM detected, ~5966 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
25801 samples (10132 females, 15669 males; 25801 founders) loaded from
2024_BURDEN_EUR_R7/chr3_EUR_release7.psam.
14469872 variants loaded from 2024_BURDEN_EUR_R7/chr3_EUR_release7.pvar.
1 binary phenotype loaded (13263 cases, 5297 controls).
Calculating allele frequencies... done.
5276 variants removed due to allele frequency threshold(s)
(--maf/--max-maf/--mac/--max-mac).
2991 variants remaining after main filters.
--export vcf to 2024_BURDEN_EUR

In [6]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry}_DNAJC13.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry}_DNAJC13.vcf.gz

In [7]:
## annotate using ANNOVAR
! perl /home/jupyter/tools/annovar/table_annovar.pl {WORK_DIR}/{ancestry}_DNAJC13.vcf.gz /home/jupyter/tools/annovar/humandb/ -buildver hg38 \
-out {WORK_DIR}/{ancestry}_DNAJC13.annovar \
-remove -protocol refGene,clinvar_20140902 \
-operation g,f \
--nopolish \
-nastring . \
-vcfinput


NOTICE: Running with system command <convert2annovar.pl  -includeinfo -allsample -withfreq -format vcf4 2024_BURDEN_EUR_R7/EUR_DNAJC13.vcf.gz > 2024_BURDEN_EUR_R7/EUR_DNAJC13.annovar.avinput>
NOTICE: Finished reading 3014 lines from VCF file
NOTICE: A total of 2991 locus in VCF file passed QC threshold, representing 2737 SNPs (1901 transitions and 836 transversions) and 254 indels/substitutions
NOTICE: Finished writing allele frequencies based on 70617337 SNP genotypes (49047701 transitions and 21569636 transversions) and 6553454 indels/substitutions for 25801 samples

NOTICE: Running with system command </home/jupyter/tools/annovar/table_annovar.pl 2024_BURDEN_EUR_R7/EUR_DNAJC13.annovar.avinput /home/jupyter/tools/annovar/humandb/ -buildver hg38 -outfile 2024_BURDEN_EUR_R7/EUR_DNAJC13.annovar -remove -protocol refGene,clinvar_20140902 -operation g,f --nopolish -nastring . -otherinfo>
-----------------------------------------------------------------
NOTICE: Processing operation=g prot

In [8]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo25804,Otherinfo25805,Otherinfo25806,Otherinfo25807,Otherinfo25808,Otherinfo25809,Otherinfo25810,Otherinfo25811,Otherinfo25812,Otherinfo25813
0,3,132417504,132417504,A,G,UTR5,DNAJC13,NM_001329126:c.-17047A>G;NM_015268:c.-17047A>G,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
1,3,132417507,132417507,G,A,UTR5,DNAJC13,NM_001329126:c.-17044G>A;NM_015268:c.-17044G>A,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2,3,132417543,132417543,C,T,UTR5,DNAJC13,NM_001329126:c.-17008C>T;NM_015268:c.-17008C>T,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
3,3,132417565,132417577,GAGGGAGGAGGCG,-,UTR5,DNAJC13,NM_001329126:c.-16986_-16974del-;NM_015268:c.-...,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
4,3,132417565,132417565,G,C,UTR5,DNAJC13,NM_001329126:c.-16986G>C;NM_015268:c.-16986G>C,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2986,3,132538903,132538903,G,A,UTR3,DNAJC13,NM_001329126:c.*621G>A;NM_015268:c.*621G>A,.,.,...,0|0,0|0,0|0,0|1,0|1,0|1,1|0,0|0,0|0,0|1
2987,3,132538916,132538916,G,A,UTR3,DNAJC13,NM_001329126:c.*634G>A;NM_015268:c.*634G>A,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2988,3,132538938,132538938,T,G,UTR3,DNAJC13,NM_001329126:c.*656T>G;NM_015268:c.*656T>G,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2989,3,132538944,132538944,T,-,UTR3,DNAJC13,NM_001329126:c.*662delT;NM_015268:c.*662delT,.,.,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [9]:
## Filter intronic
intronic = gene[(gene['Func.refGene'] == 'intronic')]

In [10]:
## Filter UTR3
utr3 = gene[(gene['Func.refGene'] == 'UTR3')]

In [11]:
## Filter UTR5
utr5 = gene[(gene['Func.refGene'] == 'UTR5')]

In [12]:
## Filter exonic and synonymous variants
coding_synonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'synonymous SNV')]

In [13]:
# Filter exonic and non-synonymous variants
coding_nonsynonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'nonsynonymous SNV')]
coding_nonsynonymous

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo25804,Otherinfo25805,Otherinfo25806,Otherinfo25807,Otherinfo25808,Otherinfo25809,Otherinfo25810,Otherinfo25811,Otherinfo25812,Otherinfo25813
472,3,132434590,132434590,T,C,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon2:c.T40C:p.Y14H,DNAJC...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
787,3,132447373,132447373,C,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon4:c.C197T:p.T66M,DNAJ...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
790,3,132447465,132447465,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon4:c.G289A:p.A97T,DNAJ...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
868,3,132450732,132450732,A,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon6:c.A422T:p.N141I,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
869,3,132450777,132450777,T,C,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon6:c.T467C:p.I156T,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2660,3,132525718,132525718,G,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon52:c.G6169T:p.A2057S,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2711,3,132528264,132528264,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon54:c.G6457A:p.A2153T,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2712,3,132528316,132528316,T,G,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon54:c.T6509G:p.L2170W,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2781,3,132531088,132531088,T,C,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon55:c.T6616C:p.Y2206H,DNA...",...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [14]:
totalVariants = len(gene.axes[0])
totalIntronic = len(intronic.axes[0])
totalUTR3 = len(utr3.axes[0])
totalUTR5 = len(utr5.axes[0])
totalExonicSyn = len(coding_synonymous.axes[0])
totalExonicNonSyn = len(coding_nonsynonymous.axes[0])

In [15]:
print("Total Variants: ", totalVariants)
print("Intronic:", totalIntronic)
print("UTR3:", totalUTR3)
print("UTR5:", totalUTR5)
print("Exonic Syn:", totalExonicSyn)
print("Exonic NonSyn:", totalExonicNonSyn)

Total Variants:  2991
Intronic: 2855
UTR3: 25
UTR5: 11
Exonic Syn: 36
Exonic NonSyn: 64


In [16]:
# Save in PLINK format 
variants_toKeep = coding_nonsynonymous[['Chr', 'Start', 'End', 'Gene.refGene']].copy()
variants_toKeep.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep', sep="\t", index=False, header=False)
variants_toKeep

Unnamed: 0,Chr,Start,End,Gene.refGene
472,3,132434590,132434590,DNAJC13
787,3,132447373,132447373,DNAJC13
790,3,132447465,132447465,DNAJC13
868,3,132450732,132450732,DNAJC13
869,3,132450777,132450777,DNAJC13
...,...,...,...,...
2660,3,132525718,132525718,DNAJC13
2711,3,132528264,132528264,DNAJC13
2712,3,132528316,132528316,DNAJC13
2781,3,132531088,132531088,DNAJC13


In [17]:
## check to make sure file was created and saved
! ls {WORK_DIR}

chr3_EUR_release7.log			      EUR_DNAJC13.log
chr3_EUR_release7.pgen			      EUR_DNAJC13.vcf.gz
chr3_EUR_release7.psam			      EUR_DNAJC13.vcf.gz.tbi
chr3_EUR_release7.pvar			      EUR_release7.eigenvec
EUR_covariate_file.txt			      EUR_release7.related
EUR_DNAJC13.all_coding_nonsyn.variantstoKeep  EUR.samplestoKeep
EUR_DNAJC13.annovar.avinput		      master_key_release7_final.csv
EUR_DNAJC13.annovar.hg38_multianno.txt	      refFlat_HG38_all_chr.txt
EUR_DNAJC13.annovar.hg38_multianno.vcf


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep ')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')

## Burden Analyses using RVTests

In [6]:
# Convert the files from Plink 2.0 to Plink 1.9 format

! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry}_release7 \
--make-bed \
--out {WORK_DIR}/chr3_{ancestry}_release7

PLINK v2.00a5.10LM AVX2 Intel (5 Jan 2024)     www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_EUR_R7/chr3_EUR_release7.log.
Options in effect:
  --make-bed
  --out 2024_BURDEN_EUR_R7/chr3_EUR_release7
  --pfile 2024_BURDEN_EUR_R7/chr3_EUR_release7

Start time: Thu May  2 20:44:00 2024
7450 MiB RAM detected, ~6221 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
25801 samples (10132 females, 15669 males; 25801 founders) loaded from
2024_BURDEN_EUR_R7/chr3_EUR_release7.psam.
14469872 variants loaded from 2024_BURDEN_EUR_R7/chr3_EUR_release7.pvar.
1 binary phenotype loaded (13263 cases, 5297 controls).
Writing 2024_BURDEN_EUR_R7/chr3_EUR_release7.fam ... done.
Writing 2024_BURDEN_EUR_R7/chr3_EUR_release7.bim ... done.
Writing 2024_BURDEN_EUR_R7/chr3_EUR_release7.bed ... 1011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606

In [7]:
## extract variants
    # later do based on class and frequency

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--recode vcf-iid \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding.log.
Options in effect:
  --bfile 2024_BURDEN_EUR_R7/chr3_EUR_release7
  --extract range 2024_BURDEN_EUR_R7/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 2024_BURDEN_EUR_R7/EUR.samplestoKeep
  --out 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding
  --recode vcf-iid

7450 MB RAM detected; reserving 3725 MB for main workspace.
14469872 variants loaded from .bim file.
25801 people (15669 males, 10132 females) loaded from .fam.
18560 phenotype values loaded from .fam.
--extract range: 14469805 variants excluded.
--extract range: 67 variants remaining.
--keep: 25800 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 25800 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233

In [8]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf.gz

In [9]:
## RVtests with covariates 
! /home/jupyter/tools/rvtests/executable/rvtest --noweb --hide-covar \
--out {WORK_DIR}/{ancestry}_DNAJC13.burden.coding \
--kernel skat,skato \
--inVcf {WORK_DIR}/{ancestry}_DNAJC13.coding.vcf.gz \
--pheno {WORK_DIR}/{ancestry}_covariate_file.txt \
--mpheno 4 \
--gene DNAJC13 \
--geneFile {WORK_DIR}/refFlat_HG38_all_chr.txt \
--covar {WORK_DIR}/{ancestry}_covariate_file.txt \
--covar-name SEX,PC1,PC2,PC3,PC4 #[WARN]Covariate [ PC5 ] has strong correlation [ r^2 = 1 ] with the response!


# --out : Name of output 
# --burden cmc --kernel skato: tests to run 
# --inVcf : VCF file 
# --gene: gene name (if only looking at one or a few)
# --geneFile refFlat.txt
# --pheno :  covar file
# --mpheno : # column that has phenotype information
# --pheno-name : column name with phenotype in file
# --covar : covar file
# --freqUpper : optional, MAF cut-off
# --covar-name : covariates, listed by column name, separated by commas (no spaces between commas)
## 0=controls; 1=cases

Thank you for using rvtests (version: 20190205, git: c86e589efef15382603300dc7f4c3394c82d69b8)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, plase send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

The following parameters are available.  Ones with "[]" are in effect:

Available Options
      Basic Input/Output: --inVcf [2024_BURDEN_EUR_R7/EUR_DNAJC13.coding.vcf.gz]
                          --inBgen [], --inBgenSample [], --inKgg []
                          --out [2024_BURDEN_EUR_R7/EUR_DNAJC13.burden.coding]
                          --outputRaw
       Specify Covariate: --covar [2024_BURDEN_EUR_R7/EUR_covariate_file.txt]
                          --covar-name [SEX,PC1,PC2,PC3,PC4], --sex
       Specify Phenotype: --pheno [2024_BURDEN_EUR_R7/EUR_covariate_file.txt]
                          --inverseNormal, --useResidualAsPhenotype
                       

In [10]:
## look at results 
! cat {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.Skat.assoc

Gene	RANGE	N_INFORMATIVE	NumVar	NumPolyVar	Q	Pvalue	NumPerm	ActualPerm	Stat	NumGreater	NumEqual	PermPvalue
DNAJC13	3:132417501-132539032,3:132417501-132539032	18425	67	58	1174.43	0.0312121	10000	10000	1174.43	319	0	0.0319


In [11]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.SkatO.assoc

Gene	RANGE	N_INFORMATIVE	NumVar	NumPolyVar	Q	rho	Pvalue
DNAJC13	3:132417501-132539032,3:132417501-132539032	18425	67	58	3.92594e+07	1	0.00621017


In [12]:
## check to make sure file was created and saved
! ls {WORK_DIR}

chr3_EUR_release7.bed
chr3_EUR_release7.bim
chr3_EUR_release7.fam
chr3_EUR_release7.log
chr3_EUR_release7.pgen
chr3_EUR_release7.psam
chr3_EUR_release7.pvar
EUR_covariate_file.txt
EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
EUR_DNAJC13.annovar.avinput
EUR_DNAJC13.annovar.hg38_multianno.txt
EUR_DNAJC13.annovar.hg38_multianno.vcf
EUR_DNAJC13.burden.coding.log
EUR_DNAJC13.burden.coding.Skat.assoc
EUR_DNAJC13.burden.coding.SkatO.assoc
EUR_DNAJC13.coding.log
EUR_DNAJC13.coding.vcf.gz
EUR_DNAJC13.coding.vcf.gz.tbi
EUR_DNAJC13.log
EUR_DNAJC13.vcf.gz
EUR_DNAJC13.vcf.gz.tbi
EUR_release7.eigenvec
EUR_release7.related
EUR.samplestoKeep
master_key_release7_final.csv
refFlat_HG38_all_chr.txt


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.burden.coding.*.assoc {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')

# Case/Control Frequencies

In [5]:
## extract variants

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--assoc \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn --allow-no-sex --ci 0.95

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile 2024_BURDEN_EUR_R7/chr3_EUR_release7
  --ci 0.95
  --extract range 2024_BURDEN_EUR_R7/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 2024_BURDEN_EUR_R7/EUR.samplestoKeep
  --out 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
14469872 variants loaded from .bim file.
25801 people (15669 males, 10132 females) loaded from .fam.
18560 phenotype values loaded from .fam.
--extract range: 14469805 variants excluded.
--extract range: 67 variants remaining.
--keep: 25800 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 25800 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516

In [7]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc

 CHR                  SNP         BP   A1      F_A      F_U   A2        CHISQ            P           OR           SE          L95          U95 
   3   chr3:132434590:T:C  132434590    C 3.77e-05 9.439e-05    T       0.4516       0.5016       0.3994        1.414      0.02498        6.386 
   3   chr3:132447373:C:T  132447373    T 3.774e-05        0    C       0.3993       0.5274           NA           NA           NA           NA 
   3   chr3:132447465:G:A  132447465    A 0.0001131        0    G        1.198       0.2737           NA           NA           NA           NA 
   3   chr3:132450732:A:T  132450732    T 0.0001887 0.0002833    A       0.3143        0.575       0.6658       0.7304       0.1591        2.787 
   3   chr3:132450777:T:C  132450777    C        0 9.439e-05    T        2.503       0.1136            0          inf            0          nan 
   3   chr3:132450821:A:G  132450821    G 3.77e-05 0.0001888    A        2.138       0.1437       0.1997        1.225       

In [9]:
assoc = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc', delim_whitespace=True, usecols=['SNP', 'A1', 'A2', 'F_A', 'F_U', 'P', 'OR', 'L95', 'U95'])
assoc

Unnamed: 0,SNP,A1,F_A,F_U,A2,P,OR,L95,U95
0,chr3:132434590:T:C,C,0.000038,0.000094,T,0.5016,0.3994,0.02498,6.386
1,chr3:132447373:C:T,T,0.000038,0.000000,C,0.5274,,,
2,chr3:132447465:G:A,A,0.000113,0.000000,G,0.2737,,,
3,chr3:132450732:A:T,T,0.000189,0.000283,A,0.5750,0.6658,0.15910,2.787
4,chr3:132450777:T:C,C,0.000000,0.000094,T,0.1136,0.0000,0.00000,
...,...,...,...,...,...,...,...,...,...
62,chr3:132525718:G:T,T,0.000604,0.000945,G,0.2623,0.6387,0.28980,1.408
63,chr3:132528264:G:A,A,0.000113,0.000094,G,0.8753,1.1980,0.12460,11.520
64,chr3:132528316:T:G,G,0.004901,0.005664,T,0.3526,0.8647,0.63630,1.175
65,chr3:132531088:T:C,C,0.000038,0.000000,T,0.5274,,,


In [10]:
## repeat association study applying multiple testing corrections for the raw p-values (like Bonferroni post-hoc)
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--assoc \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn --allow-no-sex --adjust

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --adjust
  --allow-no-sex
  --assoc
  --bfile 2024_BURDEN_EUR_R7/chr3_EUR_release7
  --extract range 2024_BURDEN_EUR_R7/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 2024_BURDEN_EUR_R7/EUR.samplestoKeep
  --out 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
14469872 variants loaded from .bim file.
25801 people (15669 males, 10132 females) loaded from .fam.
18560 phenotype values loaded from .fam.
--extract range: 14469805 variants excluded.
--extract range: 67 variants remaining.
--keep: 25800 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 25800 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161

In [11]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted

 CHR                  SNP      UNADJ         GC       BONF       HOLM   SIDAK_SS   SIDAK_SD     FDR_BH     FDR_BY
   3   chr3:132494190:A:G     0.0228    0.04072          1          1     0.7376     0.7376     0.6806          1 
   3   chr3:132500821:A:C    0.02524    0.04429          1          1      0.773     0.7671     0.6806          1 
   3   chr3:132499779:G:T    0.04628    0.07326          1          1      0.936     0.9296     0.6806          1 
   3   chr3:132492449:C:T     0.1135     0.1549          1          1     0.9991     0.9987     0.6806          1 
   3   chr3:132450777:T:C     0.1136      0.155          1          1     0.9991     0.9987     0.6806          1 
   3   chr3:132499168:G:A     0.1216     0.1641          1          1     0.9995      0.999     0.6806          1 
   3   chr3:132538225:G:A     0.1344     0.1785          1          1     0.9998     0.9994     0.6806          1 
   3   chr3:132450821:A:G     0.1437     0.1887          1          1    

In [12]:
assoc_adjusted = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted', delim_whitespace=True, usecols=['SNP', 'BONF'])
assoc_adjusted

Unnamed: 0,SNP,BONF
0,chr3:132494190:A:G,1
1,chr3:132500821:A:C,1
2,chr3:132499779:G:T,1
3,chr3:132492449:C:T,1
4,chr3:132450777:T:C,1
5,chr3:132499168:G:A,1
6,chr3:132538225:G:A,1
7,chr3:132450821:A:G,1
8,chr3:132516436:C:G,1
9,chr3:132467277:G:A,1


In [13]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--freq \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 2024_BURDEN_EUR_R7/chr3_EUR_release7
  --extract range 2024_BURDEN_EUR_R7/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --freq
  --keep 2024_BURDEN_EUR_R7/EUR.samplestoKeep
  --out 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
14469872 variants loaded from .bim file.
25801 people (15669 males, 10132 females) loaded from .fam.
18560 phenotype values loaded from .fam.
--extract range: 14469805 variants excluded.
--extract range: 67 variants remaining.
--keep: 25800 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 25800 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031

In [14]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq

 CHR                  SNP   A1   A2          MAF  NCHROBS
   3   chr3:132434590:T:C    C    T    3.876e-05    51600
   3   chr3:132447373:C:T    T    C    1.939e-05    51560
   3   chr3:132447465:G:A    A    G    7.752e-05    51598
   3   chr3:132450732:A:T    T    A    0.0001552    51560
   3   chr3:132450777:T:C    C    T    5.815e-05    51594
   3   chr3:132450821:A:G    G    A    7.752e-05    51600
   3   chr3:132453642:T:C    C    T    5.814e-05    51598
   3   chr3:132454135:C:T    T    C            0    51594
   3   chr3:132456308:C:G    G    C    1.938e-05    51592
   3   chr3:132456751:C:T    T    C    0.0001357    51592
   3   chr3:132457316:G:T    T    G    5.814e-05    51600
   3   chr3:132466351:A:C    C    A    0.0003295    51600
   3   chr3:132466392:A:G    G    A    7.753e-05    51594
   3   chr3:132467228:C:T    T    C    3.876e-05    51596
   3   chr3:132467269:G:T    T    G    5.816e-05    51586
   3   chr3:132467277:G:A    A    G    5.816e-05    5158

In [15]:
freq = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq', delim_whitespace=True, usecols=['SNP', 'MAF'])
freq.head()

Unnamed: 0,SNP,MAF
0,chr3:132434590:T:C,3.9e-05
1,chr3:132447373:C:T,1.9e-05
2,chr3:132447465:G:A,7.8e-05
3,chr3:132450732:A:T,0.000155
4,chr3:132450777:T:C,5.8e-05


In [16]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--freq case-control \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 2024_BURDEN_EUR_R7/chr3_EUR_release7
  --extract range 2024_BURDEN_EUR_R7/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --freq case-control
  --keep 2024_BURDEN_EUR_R7/EUR.samplestoKeep
  --out 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
14469872 variants loaded from .bim file.
25801 people (15669 males, 10132 females) loaded from .fam.
18560 phenotype values loaded from .fam.
--extract range: 14469805 variants excluded.
--extract range: 67 variants remaining.
--keep: 25800 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 25800 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181920212223242

In [17]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc

 CHR                  SNP   A1   A2        MAF_A        MAF_U  NCHROBS_A  NCHROBS_U
   3   chr3:132434590:T:C    C    T     3.77e-05    9.439e-05      26524      10594
   3   chr3:132447373:C:T    T    C    3.774e-05            0      26500      10582
   3   chr3:132447465:G:A    A    G    0.0001131            0      26524      10592
   3   chr3:132450732:A:T    T    A    0.0001887    0.0002833      26500      10588
   3   chr3:132450777:T:C    C    T            0    9.439e-05      26520      10594
   3   chr3:132450821:A:G    G    A     3.77e-05    0.0001888      26524      10594
   3   chr3:132453642:T:C    C    T     7.54e-05    9.439e-05      26524      10594
   3   chr3:132454135:C:T    T    C            0            0      26518      10594
   3   chr3:132456308:C:G    G    C            0            0      26522      10590
   3   chr3:132456751:C:T    T    C    0.0001508    0.0002832      26518      10594
   3   chr3:132457316:G:T    T    G     3.77e-05            0    

In [18]:
freq_cc = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc', delim_whitespace=True, usecols=['SNP', 'MAF_A', 'MAF_U'])
freq_cc.head()

Unnamed: 0,SNP,MAF_A,MAF_U
0,chr3:132434590:T:C,3.8e-05,9.4e-05
1,chr3:132447373:C:T,3.8e-05,0.0
2,chr3:132447465:G:A,0.000113,0.0
3,chr3:132450732:A:T,0.000189,0.000283
4,chr3:132450777:T:C,0.0,9.4e-05


In [19]:
all_freq = pd.merge(freq, freq_cc, on="SNP", how="left")
all_freq

Unnamed: 0,SNP,MAF,MAF_A,MAF_U
0,chr3:132434590:T:C,0.000039,0.000038,0.000094
1,chr3:132447373:C:T,0.000019,0.000038,0.000000
2,chr3:132447465:G:A,0.000078,0.000113,0.000000
3,chr3:132450732:A:T,0.000155,0.000189,0.000283
4,chr3:132450777:T:C,0.000058,0.000000,0.000094
...,...,...,...,...
62,chr3:132525718:G:T,0.000931,0.000604,0.000945
63,chr3:132528264:G:A,0.000116,0.000113,0.000094
64,chr3:132528316:T:G,0.005039,0.004901,0.005664
65,chr3:132531088:T:C,0.000039,0.000038,0.000000


In [20]:
assoc_and_freq = pd.merge(all_freq, assoc, on="SNP", how="left")
assoc_and_freq

Unnamed: 0,SNP,MAF,MAF_A,MAF_U,A1,F_A,F_U,A2,P,OR,L95,U95
0,chr3:132434590:T:C,0.000039,0.000038,0.000094,C,0.000038,0.000094,T,0.5016,0.3994,0.02498,6.386
1,chr3:132447373:C:T,0.000019,0.000038,0.000000,T,0.000038,0.000000,C,0.5274,,,
2,chr3:132447465:G:A,0.000078,0.000113,0.000000,A,0.000113,0.000000,G,0.2737,,,
3,chr3:132450732:A:T,0.000155,0.000189,0.000283,T,0.000189,0.000283,A,0.5750,0.6658,0.15910,2.787
4,chr3:132450777:T:C,0.000058,0.000000,0.000094,C,0.000000,0.000094,T,0.1136,0.0000,0.00000,
...,...,...,...,...,...,...,...,...,...,...,...,...
62,chr3:132525718:G:T,0.000931,0.000604,0.000945,T,0.000604,0.000945,G,0.2623,0.6387,0.28980,1.408
63,chr3:132528264:G:A,0.000116,0.000113,0.000094,A,0.000113,0.000094,G,0.8753,1.1980,0.12460,11.520
64,chr3:132528316:T:G,0.005039,0.004901,0.005664,G,0.004901,0.005664,T,0.3526,0.8647,0.63630,1.175
65,chr3:132531088:T:C,0.000039,0.000038,0.000000,C,0.000038,0.000000,T,0.5274,,,


In [27]:
# Merge the results
assoc_and_freq = pd.merge(assoc_and_freq, assoc_adjusted, on="SNP", how="left")
assoc_and_freq.head()

Unnamed: 0,SNP,MAF,MAF_A,MAF_U,A1,F_A,F_U,A2,P,OR,L95,U95,BONF
0,chr3:132434590:T:C,3.9e-05,3.8e-05,9.4e-05,C,3.8e-05,9.4e-05,T,0.5016,0.3994,0.02498,6.386,1.0
1,chr3:132447373:C:T,1.9e-05,3.8e-05,0.0,T,3.8e-05,0.0,C,0.5274,,,,1.0
2,chr3:132447465:G:A,7.8e-05,0.000113,0.0,A,0.000113,0.0,G,0.2737,,,,1.0
3,chr3:132450732:A:T,0.000155,0.000189,0.000283,T,0.000189,0.000283,A,0.575,0.6658,0.1591,2.787,1.0
4,chr3:132450777:T:C,5.8e-05,0.0,9.4e-05,C,0.0,9.4e-05,T,0.1136,0.0,0.0,,1.0


In [21]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_release7 \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--recode A \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 2024_BURDEN_EUR_R7/chr3_EUR_release7
  --extract range 2024_BURDEN_EUR_R7/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 2024_BURDEN_EUR_R7/EUR.samplestoKeep
  --out 2024_BURDEN_EUR_R7/EUR_DNAJC13.coding_nonsyn
  --recode A

7450 MB RAM detected; reserving 3725 MB for main workspace.
14469872 variants loaded from .bim file.
25801 people (15669 males, 10132 females) loaded from .fam.
18560 phenotype values loaded from .fam.
--extract range: 14469805 variants excluded.
--extract range: 67 variants remaining.
--keep: 25800 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 25800 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181920212223242526272829

In [None]:
recode = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.raw', delim_whitespace=True)
recode.head()

In [23]:
# Make a list from the column names
column_names = recode.columns.tolist()

# Drop the first 6 columns to keep the variants 
variants = column_names[6:]

print(f'Number of variants in {ancestry} for DNAJC13: {len(variants)}')
variants

Number of variants in EUR for DNAJC13: 67


['chr3:132434590:T:C_C',
 'chr3:132447373:C:T_T',
 'chr3:132447465:G:A_A',
 'chr3:132450732:A:T_T',
 'chr3:132450777:T:C_C',
 'chr3:132450821:A:G_G',
 'chr3:132453642:T:C_C',
 'chr3:132454135:C:T_T',
 'chr3:132456308:C:G_G',
 'chr3:132456751:C:T_T',
 'chr3:132457316:G:T_T',
 'chr3:132466351:A:C_C',
 'chr3:132466392:A:G_G',
 'chr3:132467228:C:T_T',
 'chr3:132467269:G:T_T',
 'chr3:132467277:G:A_A',
 'chr3:132474978:A:G_G',
 'chr3:132475009:C:A_A',
 'chr3:132475050:G:A_A',
 'chr3:132477841:G:A_A',
 'chr3:132477871:A:G_G',
 'chr3:132477983:A:G_G',
 'chr3:132477995:A:G_G',
 'chr3:132478058:C:T_T',
 'chr3:132478117:T:C_C',
 'chr3:132478139:G:A_A',
 'chr3:132480451:G:A_A',
 'chr3:132483531:C:G_G',
 'chr3:132484650:C:T_T',
 'chr3:132488374:A:G_G',
 'chr3:132488428:G:A_A',
 'chr3:132492421:A:C_C',
 'chr3:132492449:C:T_T',
 'chr3:132492497:A:G_G',
 'chr3:132492604:A:G_G',
 'chr3:132494190:A:G_G',
 'chr3:132499168:G:A_A',
 'chr3:132499231:C:T_T',
 'chr3:132499252:A:G_G',
 'chr3:132499777:G:A_A',


In [24]:
# Pre-filter the dataset
cases_data = recode[recode['PHENOTYPE'] == 2]
controls_data = recode[recode['PHENOTYPE'] == 1]

results = []

for variant in variants:
    # For cases
    hom_cases = cases_data[cases_data[variant] == 2].shape[0]
    het_cases = cases_data[cases_data[variant] == 1].shape[0]
    total_cases = cases_data.shape[0]
#    freq_cases = (hom_cases + het_cases) / total_cases

    # For controls
    hom_controls = controls_data[controls_data[variant] == 2].shape[0]
    het_controls = controls_data[controls_data[variant] == 1].shape[0]
    total_controls = controls_data.shape[0]
#    freq_controls = (hom_controls + het_controls) / total_controls

    results.append({
        'Variant': variant,
        'Hom Cases': hom_cases,
        'Het Cases': het_cases,
        'Total Cases': total_cases,
#        'Freq in Cases': freq_cases,
        'Hom Controls': hom_controls,
        'Het Controls': het_controls,
        'Total Controls': total_controls,
#        'Freq in Controls': freq_controls
    })

# Return
df_results = pd.DataFrame(results)
df_results['SNP'] = df_results['Variant'].apply(lambda x: x.rsplit('_', 1)[0])

df_results

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls,SNP
0,chr3:132434590:T:C_C,0,1,13262,0,1,5297,chr3:132434590:T:C
1,chr3:132447373:C:T_T,0,1,13262,0,0,5297,chr3:132447373:C:T
2,chr3:132447465:G:A_A,0,3,13262,0,0,5297,chr3:132447465:G:A
3,chr3:132450732:A:T_T,0,5,13262,0,3,5297,chr3:132450732:A:T
4,chr3:132450777:T:C_C,0,0,13262,0,1,5297,chr3:132450777:T:C
...,...,...,...,...,...,...,...,...
62,chr3:132525718:G:T_T,0,16,13262,0,10,5297,chr3:132525718:G:T
63,chr3:132528264:G:A_A,0,3,13262,0,1,5297,chr3:132528264:G:A
64,chr3:132528316:T:G_G,1,128,13262,1,58,5297,chr3:132528316:T:G
65,chr3:132531088:T:C_C,0,1,13262,0,0,5297,chr3:132531088:T:C


In [28]:
## Merge with assoc results 
full_results = pd.merge(assoc_and_freq, df_results, on="SNP", how="left")

In [29]:
clean_full_results = full_results[['SNP', 'A1', 'A2', 'F_A', 'F_U', 'P', 'OR','L95', 'U95', 'BONF', 
                                   'MAF', 'MAF_A', 'MAF_U', 
                                   'Hom Cases', 'Het Cases', 'Total Cases', 
                                   'Hom Controls','Het Controls', 'Total Controls']].copy()

In [32]:
print(clean_full_results.shape)
print(f'No. of SNPs: {clean_full_results.shape[0]}')
clean_full_results

(67, 16)
No. of SNPs: 67


Unnamed: 0,SNP,A1,A2,F_A,F_U,P,OR,MAF,MAF_A,MAF_U,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls
0,chr3:132434590:T:C,C,T,0.000038,0.000094,0.5016,0.3994,0.000039,0.000038,0.000094,0,1,13262,0,1,5297
1,chr3:132447373:C:T,T,C,0.000038,0.000000,0.5274,,0.000019,0.000038,0.000000,0,1,13262,0,0,5297
2,chr3:132447465:G:A,A,G,0.000113,0.000000,0.2737,,0.000078,0.000113,0.000000,0,3,13262,0,0,5297
3,chr3:132450732:A:T,T,A,0.000189,0.000283,0.5750,0.6658,0.000155,0.000189,0.000283,0,5,13262,0,3,5297
4,chr3:132450777:T:C,C,T,0.000000,0.000094,0.1136,0.0000,0.000058,0.000000,0.000094,0,0,13262,0,1,5297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,chr3:132525718:G:T,T,G,0.000604,0.000945,0.2623,0.6387,0.000931,0.000604,0.000945,0,16,13262,0,10,5297
63,chr3:132528264:G:A,A,G,0.000113,0.000094,0.8753,1.1980,0.000116,0.000113,0.000094,0,3,13262,0,1,5297
64,chr3:132528316:T:G,G,T,0.004901,0.005664,0.3526,0.8647,0.005039,0.004901,0.005664,1,128,13262,1,58,5297
65,chr3:132531088:T:C,C,T,0.000038,0.000000,0.5274,,0.000039,0.000038,0.000000,0,1,13262,0,0,5297


In [30]:
# Look at significant SNPs, if any 
sig_freq = clean_full_results[clean_full_results['P']<0.05]
sig_snps = sig_freq['SNP'].tolist()
sig_snps

['chr3:132494190:A:G', 'chr3:132499779:G:T', 'chr3:132500821:A:C']

In [31]:
# Look at significant SNPs, if any 
sig_df_results = clean_full_results[clean_full_results['SNP'].isin(sig_snps)]
sig_df_results

Unnamed: 0,SNP,A1,A2,F_A,F_U,P,OR,L95,U95,BONF,MAF,MAF_A,MAF_U,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls
35,chr3:132494190:A:G,G,A,0.007994,0.005758,0.0228,1.391,1.046,1.852,1.0,0.007385,0.007994,0.005758,1,210,13262,1,59,5297
40,chr3:132499779:G:T,T,G,0.4663,0.4777,0.04628,0.9552,0.913,0.9992,1.0,0.4683,0.4663,0.4777,2836,6693,13262,1192,2676,5297
41,chr3:132500821:A:C,C,A,0.0,0.000189,0.02524,0.0,0.0,,1.0,3.9e-05,0.0,0.000189,0,0,13262,0,2,5297


In [11]:
for snp in sig_snps:
    ! grep {snp} {WORK_DIR}/chr3_{ancestry}_release7.pvar

3	132494190	chr3:132494190:A:G	A	G	TYPED;IMPUTED;MAF=0.00688418;AVG_CS=0.999989;R2=0.998482;ER2=0.968619;AF=0.00584599
3	132499779	chr3:132499779:G:T	G	T	TYPED;IMPUTED;MAF=0.467505;AVG_CS=0.999943;R2=0.999832;ER2=0.997669;AF=0.466179
3	132500821	chr3:132500821:A:C	A	C	IMPUTED;MAF=6.77805e-05;AVG_CS=0.999981;R2=0.811328;AF=2.90362e-05


In [32]:
# Save files to VM
clean_full_results.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.fullVariantInformation.txt', sep="\t", index=False)
sig_df_results.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.SignificantVariantInformation.txt' , sep="\t", index=False)

In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt {WORKSPACE_BUCKET}/2023_BURDEN_{ancestry}_R7/{ancestry}_DNAJC13.annovar.hg38_multianno.txt')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.fullVariantInformation.txt {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.SignificantVariantInformation.txt {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')

shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.raw {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/2024_BURDEN_{ancestry}_R7/')