# Variants in DNAJC13 and their Association with Parkinson's Disease Across Different Ancestral Backgrounds

* **Project**: DNAJC13 Gene Assesment
* **Last updated**: September 2024
* **Version**: Python 3.9
* **Data**: AMP-PD Release 3

# Summary
This notebook is focused on analysing association between DNAJC13 variants in European (EUR) ancestry and Parkinson's disease.

# Imports

In [3]:
# Use the os package to interact with the environment
import os
import sys

# Bring in Pandas for Dataframe functionality
import pandas as pd
from functools import reduce

# Bring some visualization functionality 
import seaborn as sns

# numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

  from IPython.core.display import display, HTML


In [4]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    print(f'Executing: {query}', file=sys.stderr)
    return pd.read_gbq(query, project_id=BILLING_PROJECT_ID, dialect='standard')

# Utility routine for display a message and a link
def display_html_link(description, link_text, url):
    html = f'''
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
    </p>
    '''

    display(HTML(html))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS"""
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Return a DataFrame from the contents of a delimited file in GCS"""
    return pd.read_csv(StringIO(gcs_read_file(path)), sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    url = '{}?{}'.format(
        os.path.join('https://console.cloud.google.com/storage/browser',
                     gcs_path.replace("gs://","")),
        urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID}))

    display_html_link(description, link_text, url)

In [None]:
# Set up billing project and data path variables
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

## AMP-PD v2.5
## Explicitly define release v2.5 path 
AMP_CLINICAL_RELEASE_PATH = f'{AMP_RELEASE_PATH}/clinical'

AMP_WGS_RELEASE_PLINK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'plink')
AMP_WGS_RELEASE_GATK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'gatk')

## Print the information to check we are in the proper release and billing 
## This will be different for you, the user, depending on the billing project your workspace is on
print('Billing and Workspace')
print(f'Workspace Name @ `WORKSPACE_NAME`: {WORKSPACE_NAME}')
print(f'Billing Project @ `BILLING_PROJECT_ID`: {BILLING_PROJECT_ID}')
print(f'Workspace Bucket, where you can upload and download data @ `WORKSPACE_BUCKET`: {WORKSPACE_BUCKET}')
print('')

print('AMP-PD v2.5')
print(f'Path to AMP-PD v2.5 Clinical Data @ `AMP_CLINICAL_RELEASE_PATH`: {AMP_CLINICAL_RELEASE_PATH}')
print(f'Path to AMP-PD v2.5 WGS Data @ `AMP_WGS_RELEASE_PLINK_PATH`: {AMP_WGS_RELEASE_PLINK_PATH}')
print('')

## GP2 v5.0
## Explicitly define release v5.0 path 
GP2_CLINICAL_RELEASE_PATH = f'{GP2_RELEASE_PATH}/clinical_data'
GP2_META_RELEASE_PATH = f'{GP2_RELEASE_PATH}/meta_data'
GP2_SUMSTAT_RELEASE_PATH = f'{GP2_RELEASE_PATH}/summary_statistics'

GP2_RAW_GENO_PATH = f'{GP2_RELEASE_PATH}/raw_genotypes'
GP2_IMPUTED_GENO_PATH = f'{GP2_RELEASE_PATH}/imputed_genotypes'
print('GP2 v6.0')
print(f'Path to GP2 v6.0 Clinical Data @ `GP2_CLINICAL_RELEASE_PATH`: {GP2_CLINICAL_RELEASE_PATH}')
print(f'Path to GP2 v6.0 Metadata @ `GP2_META_RELEASE_PATH`: {GP2_META_RELEASE_PATH}')
print(f'Path to GP2 v6.0 Raw Genotype Data @ `GP2_RAW_GENO_PATH`: {GP2_RAW_GENO_PATH}')
print(f'Path to GP2 v6.0 Imputed Genotype Data @ `GP2_IMPUTED_GENO_PATH`: {GP2_IMPUTED_GENO_PATH}')

# Make working directory

In [6]:
ancestry = "EUR"
WORK_DIR = f'2024_BURDEN_AMP_{ancestry}'
! mkdir {WORK_DIR}

AMP_DATA_DIR = f'AMP_DATA_DIR'
! mkdir {AMP_DATA_DIR}


# Install Packages

## PLINK

In [None]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink; then
echo "Plink1.9 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink \n    -------"
wget -N http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20190304.zip 
unzip -o plink_linux_x86_64_20190304.zip
echo -e "\n plink downloaded and unzipped in /home/jupyter/tools \n "

fi

In [None]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink2; then
echo "Plink2 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink2 \n    -------"
wget -N https://s3.amazonaws.com/plink2-assets/alpha3/plink2_linux_avx2_20220603.zip
unzip -o plink2_linux_avx2_20220603.zip
echo -e "\n plink2 downloaded and unzipped in /home/jupyter/tools \n "

fi

## ANNOVAR

In [None]:
%%bash

# Install ANNOVAR:
# https://www.openbioinformatics.org/annovar/annovar_download_form.php

if test -e /home/jupyter/tools/annovar; then

echo "annovar is already installed in /home/jupyter/tools/"
else
echo "annovar is not installed"
cd /home/jupyter/tools/

wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz

tar xvfz annovar.latest.tar.gz

fi

In [None]:
%%bash

# Install ANNOVAR: Download resources for annotation

cd /home/jupyter/tools/annovar/

perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20140902 humandb/
#perl annotate_variation.pl -buildver hg38 -downdb cytoBand humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ensGene humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar exac03 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp147 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp30a humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad211_genome humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ljb26_all humandb/

## RVTests

In [None]:
%%bash

#Install RVTESTS: Option 1 (~15min)
if test -e /home/jupyter/tools/rvtests; then

echo "rvtests is already installed"
else
echo "rvtests is not installed"

mkdir /home/jupyter/tools/rvtests
cd /home/jupyter/tools/rvtests

wget https://github.com/zhanxw/rvtests/releases/download/v2.1.0/rvtests_linux64.tar.gz 

tar -zxvf rvtests_linux64.tar.gz
fi

In [None]:
# chmod to make sure you have permission to run the program
! chmod u+x /home/jupyter/tools/plink
! chmod u+x /home/jupyter/tools/plink2
! chmod 777 /home/jupyter/tools/rvtests/executable/rvtest

# Copy Over Files 

In [None]:
## chr 3
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {AMP_WGS_RELEASE_PLINK_PATH}/pfiles/chr3.* {AMP_DATA_DIR}')
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {AMP_WGS_RELEASE_PLINK_PATH}/pfiles/chr3.pvar {AMP_DATA_DIR}')

In [None]:
## I have already downloaded it in a different folder. Just linking
#shell_do(f'ln -s /home/jupyter/Team 5 DNAJC13 Gene Assessment/edit/2024_BURDEN_AMP/chr3.* {WORK_DIR}')

In [None]:
## clinical data 

## demographics
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {AMP_CLINICAL_RELEASE_PATH}/Demographics.csv {WORK_DIR}')

## enrollment
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {AMP_CLINICAL_RELEASE_PATH}/Enrollment.csv {WORK_DIR}')

## duplicates
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {AMP_RELEASE_PATH}/amp_pd_participant_wgs_duplicates.csv {WORK_DIR}')

## related file 
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_META_RELEASE_PATH}/related_samples/release6_{ancestry}.related {WORK_DIR}')

In [None]:
# refFlat
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {WORKSPACE_BUCKET}/notebooks/misc/refFlat_HG38_all_chr.txt {WORK_DIR}')

In [None]:
# Predicted ancestries uploaded by Mary
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {WORKSPACE_BUCKET}/AMPPD_v25_GenoTools_Predictions.txt {WORK_DIR}')


In [None]:
# Additional files uploaded by Mary
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {WORKSPACE_BUCKET}/AMP_v3/AMPPD_v3_COV_wPHENOS_wGENOTOOLS.csv {WORK_DIR}')
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {WORKSPACE_BUCKET}/AMP_v3/PCA.FILTERED.AMP_PD_{ancestry}.PD.eigenvec {WORK_DIR}')
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {WORKSPACE_BUCKET}/AMP_v3/toRemove_1stand2ndDegree_Relateds_{ancestry}.txt {WORK_DIR}')

In [9]:
## Check files
! ls {WORK_DIR}

amp_pd_participant_wgs_duplicates.csv
AMPPD_v25_GenoTools_Predictions.txt
AMPPD_v3_COV_wPHENOS_wGENOTOOLS.csv
chr3.pgen
chr3.psam
chr3.pvar
Demographics.csv
Enrollment.csv
PCA.FILTERED.AMP_PD_EUR.PD.eigenvec
refFlat_HG38_all_chr.txt
toRemove_1stand2ndDegree_Relateds_EUR.txt


# Covariate File

In [None]:
# Let's load the disease status
enrollment = pd.read_csv(f'{WORK_DIR}/Enrollment.csv')
print(enrollment.shape)
enrollment.head()

In [None]:
# Let's load the demographic data
demographics = pd.read_csv(f'{WORK_DIR}/Demographics.csv')
print(demographics.shape)
demographics.head()

In [None]:
# Let's load the predicted ancestry
predictedAncestry= pd.read_csv(f'{WORK_DIR}/AMPPD_v25_GenoTools_Predictions.txt', sep='\s+' )
print(predictedAncestry.shape)
predictedAncestry.head()

In [None]:
#Get just the needed info from Enrollment
enrollment = enrollment[['participant_id', 'study_arm']]
enrollment.rename(columns = {'participant_id': 'IID',
                      'study_arm':'phenotype'}, inplace = True)
print(enrollment.shape)
enrollment.head()

In [None]:
#Get just the needed info from Demographics
demographics = demographics[['participant_id', 'sex', 'age_at_baseline']]
demographics.rename(columns = {'participant_id': 'IID',
                      'sex':'SEX',
                      'age_at_baseline':'AGE'}, inplace = True)
print(demographics.shape)
demographics.head()

In [None]:
# Merge Enrollment with Demographics
merged_key = pd.merge(enrollment, demographics, on='IID', how='left')
print(merged_key.shape)
merged_key.head()

In [None]:
# Merge Enrollment/Demographics with Ancestry
merged_key2 = pd.merge(merged_key, predictedAncestry, on='IID', how='left')
print(merged_key2.shape)
merged_key2.head()

In [17]:
merged_key2['label'].value_counts(dropna=False)

label
EUR    8370
AJ     1312
NaN     316
AAC     106
AMR      86
MDE      42
EAS      33
CAS      15
AFR      13
SAS      10
FIN       2
Name: count, dtype: int64

In [18]:
merged_key2['phenotype'].value_counts(dropna=False)

phenotype
Healthy Control                3377
PD                             2987
LBD                            2521
Genetic Cohort Unaffected       403
Genetic Cohort PD               272
Genetic Registry Unaffected     245
Genetic Registry PD             196
Disease Control                 159
SWEDD                            77
Prodromal                        64
NaN                               3
Unknown                           1
Name: count, dtype: int64

In [None]:
merged_key2[merged_key2['label']==ancestry]

In [None]:
## Subset to keep ancestry of interest 
ancestry_key = merged_key2[merged_key2['label']==ancestry].copy()
ancestry_key.reset_index(drop=True)

In [21]:
# Convert phenotype to binary (0/1)
    # PD = 1; control = 0
pheno_mapping = {"PD": 2, "Healthy Control": 1}
ancestry_key['PHENO'] = ancestry_key['phenotype'].map(pheno_mapping).astype('Int64')

In [22]:
# Check value counts of pheno
ancestry_key['PHENO'].value_counts(dropna=False)

PHENO
<NA>    2938
1       2920
2       2512
Name: count, dtype: Int64

In [23]:
# Convert sex to binary (0/1)
    # Female = 1; male = 0
sex_mapping = {'Female': 2, 'Male': 1}
ancestry_key['SEX'] = ancestry_key['SEX'].map(sex_mapping).astype('Int64')

In [24]:
# Check value counts of SEX
ancestry_key['SEX'].value_counts(dropna=False)

SEX
1       4812
2       3558
<NA>       0
Name: count, dtype: Int64

In [None]:
## Clean up and keep columns we need 
final_df = ancestry_key[['FID','IID', 'SEX', 'PHENO', 'AGE']].copy()
final_df

In [26]:
# remove duplicate values in IID column
final_df = final_df.drop_duplicates(subset=['IID'], keep='first')

In [27]:
final_df.groupby(['PHENO'])['SEX'].value_counts()

PHENO  SEX
1      2      1504
       1      1416
2      1      1445
       2       835
Name: count, dtype: int64

In [28]:
final_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


In [None]:
print(final_df.shape)
final_df.head()

In [None]:
# Load information about related individuals 
column_names = ['FID','IID']
related_df = pd.read_csv(f'{WORK_DIR}/toRemove_1stand2ndDegree_Relateds_{ancestry}.txt', sep='\t', header = None, names=column_names)
print(related_df.shape)
related_df

In [None]:
# Make a list of just one set of related people
related_list = list(related_df['IID'])

# Check value counts of related and remove only one related individual
final_df = final_df[~final_df["IID"].isin(related_list)]

# Check size
print(final_df.shape)
final_df

In [32]:
## Make file of sample IDs to keep 
samples_toKeep = final_df[['FID','IID']].copy()
samples_toKeep.to_csv(f'{WORK_DIR}/{ancestry}.samplestoKeep', sep = '\t', index=False, header=None)

In [None]:
! head {WORK_DIR}/{ancestry}.samplestoKeep

In [34]:
#final_df['FID'] = 0
final_df['FATID'] = 0
final_df['MATID'] = 0

In [35]:
final_df2=final_df[['FID','IID','MATID','FATID','SEX','PHENO', 'AGE']].copy()

In [None]:
final_df2

In [None]:
## Load PCs
pcs = pd.read_csv(f'{WORK_DIR}/PCA.FILTERED.AMP_PD_{ancestry}.PD.eigenvec', sep='\t')
print(pcs.shape)
pcs

In [None]:
# Merge Cov and PCs
final_df3 = pd.merge(final_df2, pcs, on='IID', how='left')
print(final_df3.shape)
final_df3.head()

In [39]:
final_df3.drop(['#FID', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10'], axis='columns', inplace=True)

In [None]:
print(final_df3.shape)
final_df3.head()

In [41]:
final_df3.groupby(['PHENO'])['SEX'].value_counts()

PHENO  SEX
1      2      1459
       1      1379
2      1      1442
       2       835
Name: count, dtype: int64

In [42]:
## Save your covariate file
final_df3.to_csv(f'{WORK_DIR}/{ancestry}_covariate_file.txt', sep = '\t', index=False)

In [None]:
!head {WORK_DIR}/{ancestry}_covariate_file.txt

In [44]:
## check to make sure file was created and saved
! ls -lh {WORK_DIR}

total 23M
-rw-rw-r-- 1 jupyter users 1.9K May  5 15:43 amp_pd_participant_wgs_duplicates.csv
-rw-rw-r-- 1 jupyter users 262K May  5 15:43 AMPPD_v25_GenoTools_Predictions.txt
-rw-rw-r-- 1 jupyter users 924K May  5 15:43 AMPPD_v3_COV_wPHENOS_wGENOTOOLS.csv
lrwxrwxrwx 1 jupyter users   25 May  5 15:43 chr3.pgen -> ../AMP_DATA_DIR/chr3.pgen
lrwxrwxrwx 1 jupyter users   25 May  5 15:43 chr3.psam -> ../AMP_DATA_DIR/chr3.psam
lrwxrwxrwx 1 jupyter users   25 May  5 15:43 chr3.pvar -> ../AMP_DATA_DIR/chr3.pvar
-rw-rw-r-- 1 jupyter users 636K May  5 15:43 Demographics.csv
-rw-rw-r-- 1 jupyter users 633K May  5 15:43 Enrollment.csv
-rw-rw-r-- 1 jupyter users 448K May  5 15:43 EUR_covariate_file.txt
-rw-rw-r-- 1 jupyter users 121K May  5 15:43 EUR.samplestoKeep
-rw-rw-r-- 1 jupyter users 667K May  5 15:43 PCA.FILTERED.AMP_PD_EUR.PD.eigenvec
-rw-rw-r-- 1 jupyter users  20M May  5 15:43 refFlat_HG38_all_chr.txt
-rw-rw-r-- 1 jupyter users  12K May  5 15:43 toRemove_1stand2ndDegree_Relate

In [45]:
## extract ancestry

! /home/jupyter/tools/plink2 \
--pfile {WORK_DIR}/chr3 \
--chr 3 \
--from-bp 132417502  \
--to-bp 132539032 \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--make-pgen \
--out {WORK_DIR}/chr3_{ancestry}

PLINK v2.00a5.10LM AVX2 Intel (5 Jan 2024)     www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/chr3_EUR.log.
Options in effect:
  --chr 3
  --from-bp 132417502
  --keep 2024_BURDEN_AMP_EUR/EUR.samplestoKeep
  --make-pgen
  --out 2024_BURDEN_AMP_EUR/chr3_EUR
  --pfile 2024_BURDEN_AMP_EUR/chr3
  --to-bp 132539032

Start time: Sun May  5 15:43:56 2024
7450 MiB RAM detected, ~5378 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
10418 samples (0 females, 0 males, 10418 ambiguous; 10418 founders) loaded from
2024_BURDEN_AMP_EUR/chr3.psam.
10972242 variants loaded from 2024_BURDEN_AMP_EUR/chr3.pvar.
Note: No phenotype data present.
--keep: 5115 samples remaining.
5115 samples (0 females, 0 males, 5115 ambiguous; 5115 founders) remaining
after main filters.
6005 variants remaining after main filters.
Writing 2024_BURDEN_AMP_EUR/chr3_EUR.psam ... done.
Writing 2024_BURD

In [None]:
# Save to workspace bucket (move from VM to workspace bucket) 
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_covariate_file.txt {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/{ancestry}_covariate_file.txt')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}.samplestoKeep {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/{ancestry}.samplestoKeep')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')

# Annotation using ANNOVAR

* *DNAJC13* from NCBI gene
* hg38 (chr3:132417502-132539032)

In [48]:
## extract region using plink

! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry} \
--chr 3 \
--from-bp 132417502  \
--to-bp 132539032 \
--recode vcf id-paste=iid \
--mac 2 \
--out {WORK_DIR}/{ancestry}_DNAJC13

PLINK v2.00a5.10LM AVX2 Intel (5 Jan 2024)     www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/EUR_DNAJC13.log.
Options in effect:
  --chr 3
  --export vcf id-paste=iid
  --from-bp 132417502
  --mac 2
  --out 2024_BURDEN_AMP_EUR/EUR_DNAJC13
  --pfile 2024_BURDEN_AMP_EUR/chr3_EUR
  --to-bp 132539032

Start time: Sun May  5 15:56:58 2024
7450 MiB RAM detected, ~5386 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
5115 samples (0 females, 0 males, 5115 ambiguous; 5115 founders) loaded from
2024_BURDEN_AMP_EUR/chr3_EUR.psam.
6005 variants loaded from 2024_BURDEN_AMP_EUR/chr3_EUR.pvar.
Note: No phenotype data present.
Calculating allele frequencies... done.
4360 variants removed due to allele frequency threshold(s)
(--maf/--max-maf/--mac/--max-mac).
1645 variants remaining after main filters.
--export vcf to 2024_BURDEN_AMP_EUR/EUR_DNAJC13.vcf ... 101011111212131314

In [49]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry}_DNAJC13.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry}_DNAJC13.vcf.gz

In [50]:
## annotate using ANNOVAR
! perl /home/jupyter/tools/annovar/table_annovar.pl {WORK_DIR}/{ancestry}_DNAJC13.vcf.gz /home/jupyter/tools/annovar/humandb/ -buildver hg38 \
-out {WORK_DIR}/{ancestry}_DNAJC13.annovar \
-remove -protocol refGene,clinvar_20140902 \
-operation g,f \
--nopolish \
-nastring . \
-vcfinput


NOTICE: Running with system command <convert2annovar.pl  -includeinfo -allsample -withfreq -format vcf4 2024_BURDEN_AMP_EUR/EUR_DNAJC13.vcf.gz > 2024_BURDEN_AMP_EUR/EUR_DNAJC13.annovar.avinput>
NOTICE: Finished reading 1706 lines from VCF file
NOTICE: A total of 1645 locus in VCF file passed QC threshold, representing 1430 SNPs (941 transitions and 489 transversions) and 500 indels/substitutions
NOTICE: Finished writing allele frequencies based on 7314450 SNP genotypes (4813215 transitions and 2501235 transversions) and 2557500 indels/substitutions for 5115 samples

NOTICE: Running with system command </home/jupyter/tools/annovar/table_annovar.pl 2024_BURDEN_AMP_EUR/EUR_DNAJC13.annovar.avinput /home/jupyter/tools/annovar/humandb/ -buildver hg38 -outfile 2024_BURDEN_AMP_EUR/EUR_DNAJC13.annovar -remove -protocol refGene,clinvar_20140902 -operation g,f --nopolish -nastring . -otherinfo>
-----------------------------------------------------------------
NOTICE: Processing operation=g proto

In [51]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo5118,Otherinfo5119,Otherinfo5120,Otherinfo5121,Otherinfo5122,Otherinfo5123,Otherinfo5124,Otherinfo5125,Otherinfo5126,Otherinfo5127
0,3,132417567,132417567,-,GGAGGAGGC,UTR5,DNAJC13,NM_001329126:c.-16984_-16983insGGAGGAGGC;NM_01...,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1,3,132417567,132417567,G,0,UTR5,DNAJC13,NM_001329126:c.-16984G>0;NM_015268:c.-16984G>0,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2,3,132417589,132417589,G,A,UTR5,DNAJC13,NM_001329126:c.-16962G>A;NM_015268:c.-16962G>A,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
3,3,132417590,132417590,A,G,UTR5,DNAJC13,NM_001329126:c.-16961A>G;NM_015268:c.-16961A>G,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
4,3,132417594,132417594,A,G,UTR5,DNAJC13,NM_001329126:c.-16957A>G;NM_015268:c.-16957A>G,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2097,3,132538679,132538679,A,T,UTR3,DNAJC13,NM_001329126:c.*397A>T;NM_015268:c.*397A>T,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2098,3,132538708,132538708,G,A,UTR3,DNAJC13,NM_001329126:c.*426G>A;NM_015268:c.*426G>A,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2099,3,132538723,132538723,C,G,UTR3,DNAJC13,NM_001329126:c.*441C>G;NM_015268:c.*441C>G,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2100,3,132538890,132538890,G,A,UTR3,DNAJC13,NM_001329126:c.*608G>A;NM_015268:c.*608G>A,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0


In [52]:
## Filter intronic
intronic = gene[(gene['Func.refGene'] == 'intronic')]

In [53]:
## Filter UTR3
utr3 = gene[(gene['Func.refGene'] == 'UTR3')]

In [54]:
## Filter UTR5
utr5 = gene[(gene['Func.refGene'] == 'UTR5')]

In [55]:
## Filter exonic and synonymous variants
coding_synonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'synonymous SNV')]

In [56]:
# Filter exonic and non-synonymous variants
coding_nonsynonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'nonsynonymous SNV')]
coding_nonsynonymous

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo5118,Otherinfo5119,Otherinfo5120,Otherinfo5121,Otherinfo5122,Otherinfo5123,Otherinfo5124,Otherinfo5125,Otherinfo5126,Otherinfo5127
525,3,132450732,132450732,A,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon6:c.A422T:p.N141I,DNA...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
602,3,132456751,132456751,C,T,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon12:c.C1268T:p.A423V,D...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
604,3,132456763,132456763,G,C,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon12:c.G1280C:p.S427T,D...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
666,3,132461159,132461159,T,C,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon15:c.T1667C:p.L556S,D...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
743,3,132466338,132466338,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_001329126:exon19:c.G2008A:p.V670I,D...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1077,3,132473173,132473173,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon21:c.G2237A:p.R746Q,DNAJ...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1138,3,132475009,132475009,C,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon22:c.C2369A:p.S790Y,DNAJ...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1178,3,132477841,132477841,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon23:c.G2498A:p.R833K,DNAJ...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1183,3,132478139,132478139,G,A,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon24:c.G2708A:p.R903K,DNAJ...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1224,3,132480415,132480415,T,C,exonic,DNAJC13,.,nonsynonymous SNV,"DNAJC13:NM_015268:exon26:c.T2819C:p.V940A,DNAJ...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0


In [57]:
totalVariants = len(gene.axes[0])
totalIntronic = len(intronic.axes[0])
totalUTR3 = len(utr3.axes[0])
totalUTR5 = len(utr5.axes[0])
totalExonicSyn = len(coding_synonymous.axes[0])
totalExonicNonSyn = len(coding_nonsynonymous.axes[0])

In [58]:
print("Total Variants: ", totalVariants)
print("Intronic:", totalIntronic)
print("UTR3:", totalUTR3)
print("UTR5:", totalUTR5)
print("Exonic Syn:", totalExonicSyn)
print("Exonic NonSyn:", totalExonicNonSyn)

Total Variants:  2102
Intronic: 2032
UTR3: 12
UTR5: 5
Exonic Syn: 18
Exonic NonSyn: 35


In [59]:
# Save in PLINK format 
variants_toKeep = coding_nonsynonymous[['Chr', 'Start', 'End', 'Gene.refGene']].copy()
variants_toKeep.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep', sep="\t", index=False, header=False)
variants_toKeep

Unnamed: 0,Chr,Start,End,Gene.refGene
525,3,132450732,132450732,DNAJC13
602,3,132456751,132456751,DNAJC13
604,3,132456763,132456763,DNAJC13
666,3,132461159,132461159,DNAJC13
743,3,132466338,132466338,DNAJC13
1077,3,132473173,132473173,DNAJC13
1138,3,132475009,132475009,DNAJC13
1178,3,132477841,132477841,DNAJC13
1183,3,132478139,132478139,DNAJC13
1224,3,132480415,132480415,DNAJC13


In [60]:
## check to make sure file was created and saved
! ls {WORK_DIR}

amp_pd_participant_wgs_duplicates.csv
AMPPD_v25_GenoTools_Predictions.txt
AMPPD_v3_COV_wPHENOS_wGENOTOOLS.csv
chr3_EUR.log
chr3_EUR.pgen
chr3_EUR.psam
chr3_EUR.pvar
chr3.pgen
chr3.psam
chr3.pvar
Demographics.csv
Enrollment.csv
EUR_covariate_file.txt
EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
EUR_DNAJC13.annovar.avinput
EUR_DNAJC13.annovar.hg38_multianno.txt
EUR_DNAJC13.annovar.hg38_multianno.vcf
EUR_DNAJC13.log
EUR_DNAJC13.vcf.gz
EUR_DNAJC13.vcf.gz.tbi
EUR.samplestoKeep
PCA.FILTERED.AMP_PD_EUR.PD.eigenvec
refFlat_HG38_all_chr.txt
toRemove_1stand2ndDegree_Relateds_EUR.txt


In [None]:
# Save to workspace bucket (move from VM to workspace bucket) (not copying right now)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep ')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')

# Burden Analyses using RVTests

In [63]:
# Convert the files from Plink 2.0 to Plink 1.9 format 

! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry} \
--make-bed \
--max-alleles 2 \
--out {WORK_DIR}/chr3_{ancestry}

PLINK v2.00a5.10LM AVX2 Intel (5 Jan 2024)     www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/chr3_EUR.log.
Options in effect:
  --make-bed
  --max-alleles 2
  --out 2024_BURDEN_AMP_EUR/chr3_EUR
  --pfile 2024_BURDEN_AMP_EUR/chr3_EUR

Start time: Sun May  5 15:58:45 2024
7450 MiB RAM detected, ~5083 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
5115 samples (0 females, 0 males, 5115 ambiguous; 5115 founders) loaded from
2024_BURDEN_AMP_EUR/chr3_EUR.psam.
5615 out of 6005 variants loaded from 2024_BURDEN_AMP_EUR/chr3_EUR.pvar.
Note: No phenotype data present.
5615 variants remaining after main filters.
Writing 2024_BURDEN_AMP_EUR/chr3_EUR.fam ... done.
Writing 2024_BURDEN_AMP_EUR/chr3_EUR.bim ... done.
Writing 2024_BURDEN_AMP_EUR/chr3_EUR.bed ... 0%done.
End time: Sun May  5 15:58:45 2024


In [64]:
## extract variants
    # later do based on class and frequency

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry} \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--recode vcf-iid \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 2024_BURDEN_AMP_EUR/chr3_EUR
  --extract range 2024_BURDEN_AMP_EUR/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 2024_BURDEN_AMP_EUR/EUR.samplestoKeep
  --out 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn
  --recode vcf-iid

7450 MB RAM detected; reserving 3725 MB for main workspace.
5615 variants loaded from .bim file.
5115 people (0 males, 0 females, 5115 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.nosex .
--extract range: 5582 variants excluded.
--extract range: 33 variants remaining.
--keep: 5115 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 5115 founders and 0 nonfounders present.
Calculating allele frequencies...

In [65]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.vcf.gz

In [66]:
## RVtests with covariates 
! /home/jupyter/tools/rvtests/executable/rvtest --noweb --hide-covar \
--out {WORK_DIR}/{ancestry}_DNAJC13.burden.coding_nonsyn \
--kernel skat,skato \
--inVcf {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.vcf.gz \
--pheno {WORK_DIR}/{ancestry}_covariate_file.txt \
--pheno-name PHENO \
--gene DNAJC13 \
--geneFile {WORK_DIR}/refFlat_HG38_all_chr.txt #\
--covar {WORK_DIR}/{ancestry}_covariate_file.txt \
--covar-name SEX AGE PC1 PC2 PC3 PC4
#[WARN]Covariate [ PC5 ] has strong correlation [ r^2 = 1 ] with the response!


# --out : Name of output 
# --burden cmc --kernel skato: tests to run 
# --inVcf : VCF file 
# --gene: gene name (if only looking at one or a few)
# --geneFile refFlat.txt
# --pheno :  covar file
# --mpheno : # column that has phenotype information
# --pheno-name : column name with phenotype in file
# --covar : covar file
# --freqUpper : optional, MAF cut-off
# --covar-name : covariates, listed by column name, separated by commas (no spaces between commas)
## 0=controls; 1=cases

Thank you for using rvtests (version: 20190205, git: c86e589efef15382603300dc7f4c3394c82d69b8)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, plase send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

The following parameters are available.  Ones with "[]" are in effect:

Available Options
      Basic Input/Output:
                          --inVcf [2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.vcf.gz]
                          --inBgen [], --inBgenSample [], --inKgg []
                          --out [2024_BURDEN_AMP_EUR/EUR_DNAJC13.burden.coding_nonsyn]
                          --outputRaw
       Specify Covariate: --covar [], --covar-name [], --sex
       Specify Phenotype: --pheno [2024_BURDEN_AMP_EUR/EUR_covariate_file.txt]
                          --inverseNormal, --useResidualAsPhenotype, --mpheno []
                          --pheno-name [PHENO], --qt

In [67]:
## look at results 
! cat {WORK_DIR}/{ancestry}_DNAJC13.burden.coding_nonsyn.Skat.assoc

Gene	RANGE	N_INFORMATIVE	NumVar	NumPolyVar	Q	Pvalue	NumPerm	ActualPerm	Stat	NumGreater	NumEqual	PermPvalue
DNAJC13	3:132417501-132539032,3:132417501-132539032	5115	33	33	110685	0.26713	10000	3511	110685	1000	0	0.284819


In [68]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.burden.coding_nonsyn.SkatO.assoc

Gene	RANGE	N_INFORMATIVE	NumVar	NumPolyVar	Q	rho	Pvalue
DNAJC13	3:132417501-132539032,3:132417501-132539032	5115	33	33	54487.7	0.4	0.406778


In [69]:
## check to make sure file was created and saved
! ls {WORK_DIR}

amp_pd_participant_wgs_duplicates.csv
AMPPD_v25_GenoTools_Predictions.txt
AMPPD_v3_COV_wPHENOS_wGENOTOOLS.csv
chr3_EUR.bed
chr3_EUR.bim
chr3_EUR.fam
chr3_EUR.log
chr3_EUR.pgen
chr3_EUR.psam
chr3_EUR.pvar
chr3.pgen
chr3.psam
chr3.pvar
Demographics.csv
Enrollment.csv
EUR_covariate_file.txt
EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
EUR_DNAJC13.annovar.avinput
EUR_DNAJC13.annovar.hg38_multianno.txt
EUR_DNAJC13.annovar.hg38_multianno.vcf
EUR_DNAJC13.burden.coding_nonsyn.log
EUR_DNAJC13.burden.coding_nonsyn.Skat.assoc
EUR_DNAJC13.burden.coding_nonsyn.SkatO.assoc
EUR_DNAJC13.coding_nonsyn.log
EUR_DNAJC13.coding_nonsyn.nosex
EUR_DNAJC13.coding_nonsyn.vcf.gz
EUR_DNAJC13.coding_nonsyn.vcf.gz.tbi
EUR_DNAJC13.log
EUR_DNAJC13.vcf.gz
EUR_DNAJC13.vcf.gz.tbi
EUR.samplestoKeep
PCA.FILTERED.AMP_PD_EUR.PD.eigenvec
refFlat_HG38_all_chr.txt
toRemove_1stand2ndDegree_Relateds_EUR.txt


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.burden.coding_nonsyn.*.assoc {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')

# Case/Control Frequencies

In [72]:
## extract variants

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry} \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--pheno {WORK_DIR}/{ancestry}_covariate_file.txt \
--pheno-name PHENO \
--covar {WORK_DIR}/{ancestry}_covariate_file.txt \
--covar-name SEX,AGE,PC1,PC2,PC3,PC4\
--assoc \
--allow-no-sex \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn --ci 0.95

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile 2024_BURDEN_AMP_EUR/chr3_EUR
  --ci 0.95
  --covar 2024_BURDEN_AMP_EUR/EUR_covariate_file.txt
  --covar-name SEX,AGE,PC1,PC2,PC3,PC4
  --extract range 2024_BURDEN_AMP_EUR/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 2024_BURDEN_AMP_EUR/EUR.samplestoKeep
  --out 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn
  --pheno 2024_BURDEN_AMP_EUR/EUR_covariate_file.txt
  --pheno-name PHENO

7450 MB RAM detected; reserving 3725 MB for main workspace.
5615 variants loaded from .bim file.
5115 people (0 males, 0 females, 5115 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.nosex .
5115 phenotype values present after --pheno.
--extract range: 5

In [73]:
! head {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc

 CHR           SNP         BP   A1      F_A      F_U   A2        CHISQ            P           OR           SE          L95          U95 
   3   rs149480465  132450732    T 0.0002196 0.0003524    A       0.1519       0.6967       0.6231        1.225      0.05648        6.874 
   3   rs147898644  132456751    T 0.0004392        0    C        2.493       0.1143           NA           NA           NA           NA 
   3             .  132456763    C        0 0.0003524    G        1.605       0.2052            0          inf            0          nan 
   3             .  132461159    C 0.0004392        0    T        2.493       0.1143           NA           NA           NA           NA 
   3   rs200491223  132466338    A        0 0.0003524    G        1.605       0.2052            0          inf            0          nan 
   3             .  132473173    A        0 0.0003524    G        1.605       0.2052            0          inf            0          nan 
   3   rs149121829  1324750

In [74]:
assoc = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc', delim_whitespace=True, usecols=['SNP', 'A1', 'A2', 'F_A', 'F_U', 'P', 'OR', 'L95', 'U95'])
assoc

Unnamed: 0,SNP,A1,F_A,F_U,A2,P,OR,L95,U95
0,rs149480465,T,0.00022,0.000352,A,0.6967,0.6231,0.05648,6.874
1,rs147898644,T,0.000439,0.0,C,0.1143,,,
2,.,C,0.0,0.000352,G,0.2052,0.0,0.0,
3,.,C,0.000439,0.0,T,0.1143,,,
4,rs200491223,A,0.0,0.000352,G,0.2052,0.0,0.0,
5,.,A,0.0,0.000352,G,0.2052,0.0,0.0,
6,rs149121829,A,0.009003,0.01004,C,0.5918,0.8956,0.5984,1.34
7,rs201629093,A,0.00022,0.000352,G,0.6967,0.6231,0.05648,6.874
8,rs141952333,A,0.003074,0.003171,G,0.9304,0.9693,0.4816,1.951
9,rs202127368,T,0.000659,0.001057,C,0.4995,0.6229,0.1557,2.492


In [75]:
## repeat association study applying multiple testing corrections for the raw p-values (like Bonferroni post-hoc)
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry} \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--pheno {WORK_DIR}/{ancestry}_covariate_file.txt \
--pheno-name PHENO \
--covar {WORK_DIR}/{ancestry}_covariate_file.txt \
--covar-name SEX,AGE,PC1,PC2,PC3,PC4\
--assoc \
--allow-no-sex \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn --adjust

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --adjust
  --allow-no-sex
  --assoc
  --bfile 2024_BURDEN_AMP_EUR/chr3_EUR
  --covar 2024_BURDEN_AMP_EUR/EUR_covariate_file.txt
  --covar-name SEX,AGE,PC1,PC2,PC3,PC4
  --extract range 2024_BURDEN_AMP_EUR/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 2024_BURDEN_AMP_EUR/EUR.samplestoKeep
  --out 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn
  --pheno 2024_BURDEN_AMP_EUR/EUR_covariate_file.txt
  --pheno-name PHENO

7450 MB RAM detected; reserving 3725 MB for main workspace.
5615 variants loaded from .bim file.
5115 people (0 males, 0 females, 5115 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.nosex .
5115 phenotype values present after --pheno.
--extract range: 55

In [76]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted

 CHR           SNP      UNADJ         GC       BONF       HOLM   SIDAK_SS   SIDAK_SD     FDR_BH     FDR_BY
   3     rs3762672   0.008968   0.008968     0.2959     0.2959     0.2572     0.2572     0.2959          1 
   3    rs61748102    0.03787    0.03787          1          1     0.7203     0.7093      0.539          1 
   3             .    0.07316    0.07316          1          1     0.9185     0.9051      0.539          1 
   3    rs79953286     0.1048     0.1048          1          1     0.9741     0.9639      0.539          1 
   3    rs55825559      0.112      0.112          1          1     0.9801     0.9681      0.539          1 
   3   rs147898644     0.1143     0.1143          1          1     0.9818     0.9681      0.539          1 
   3             .     0.1143     0.1143          1          1     0.9818     0.9681      0.539          1 
   3   rs145242123     0.1782     0.1782          1          1     0.9985     0.9939     0.6045          1 
   3             .  

In [77]:
assoc_adjusted = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted', delim_whitespace=True, usecols=['SNP', 'BONF'])
assoc_adjusted

Unnamed: 0,SNP,BONF
0,rs3762672,0.2959
1,rs61748102,1.0
2,.,1.0
3,rs79953286,1.0
4,rs55825559,1.0
5,rs147898644,1.0
6,.,1.0
7,rs145242123,1.0
8,.,1.0
9,rs200491223,1.0


In [78]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry} \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--freq \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 2024_BURDEN_AMP_EUR/chr3_EUR
  --extract range 2024_BURDEN_AMP_EUR/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --freq
  --keep 2024_BURDEN_AMP_EUR/EUR.samplestoKeep
  --out 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
5615 variants loaded from .bim file.
5115 people (0 males, 0 females, 5115 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.nosex .
--extract range: 5582 variants excluded.
--extract range: 33 variants remaining.
--keep: 5115 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 5115 founders and 0 nonfounders present.
Calculating allele freq

In [79]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq

 CHR           SNP   A1   A2          MAF  NCHROBS
   3   rs149480465    T    A    0.0002933    10230
   3   rs147898644    T    C    0.0001955    10230
   3             .    C    G    0.0001955    10230
   3             .    C    T    0.0001955    10230
   3   rs200491223    A    G    0.0001955    10230
   3             .    A    G    0.0001955    10230
   3   rs149121829    A    C      0.00958    10230
   3   rs201629093    A    G    0.0002933    10230
   3   rs141952333    A    G     0.003128    10230
   3   rs202127368    T    C    0.0008798    10230
   3             .    A    G    0.0001955    10230
   3   rs201263331    G    A     0.001173    10230
   3   rs372257861    G    A    0.0001955    10230
   3    rs61748101    G    A     0.005767    10230
   3   rs147104839    A    G     0.000391    10230
   3    rs61748102    T    C       0.0175    10230
   3    rs61748103    A    G      0.01222    10230
   3     rs3762672    T    G       0.4661    10230
   3        

In [80]:
freq = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq', delim_whitespace=True, usecols=['SNP', 'MAF'])
freq.head()

Unnamed: 0,SNP,MAF
0,rs149480465,0.000293
1,rs147898644,0.000196
2,.,0.000196
3,.,0.000196
4,rs200491223,0.000196


In [81]:
! /home/jupyter/tools/plink2 --pfile {WORK_DIR}/chr3_{ancestry} \
--make-bed \
--max-alleles 2 \
--pheno {WORK_DIR}/{ancestry}_covariate_file.txt \
--pheno-name PHENO \
--out {WORK_DIR}/chr3_{ancestry}_wPheno

PLINK v2.00a5.10LM AVX2 Intel (5 Jan 2024)     www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/chr3_EUR_wPheno.log.
Options in effect:
  --make-bed
  --max-alleles 2
  --out 2024_BURDEN_AMP_EUR/chr3_EUR_wPheno
  --pfile 2024_BURDEN_AMP_EUR/chr3_EUR
  --pheno 2024_BURDEN_AMP_EUR/EUR_covariate_file.txt
  --pheno-name PHENO

Start time: Sun May  5 16:00:58 2024
7450 MiB RAM detected, ~5062 available; reserving 3725 MiB for main workspace.
Using up to 2 compute threads.
5115 samples (0 females, 0 males, 5115 ambiguous; 5115 founders) loaded from
2024_BURDEN_AMP_EUR/chr3_EUR.psam.
5615 out of 6005 variants loaded from 2024_BURDEN_AMP_EUR/chr3_EUR.pvar.
1 binary phenotype loaded (2277 cases, 2838 controls).
5615 variants remaining after main filters.
Writing 2024_BURDEN_AMP_EUR/chr3_EUR_wPheno.fam ... done.
Writing 2024_BURDEN_AMP_EUR/chr3_EUR_wPheno.bim ... done.
Writing 2024_B

In [82]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_wPheno \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--freq case-control \
--allow-no-sex \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --allow-no-sex
  --bfile 2024_BURDEN_AMP_EUR/chr3_EUR_wPheno
  --extract range 2024_BURDEN_AMP_EUR/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --freq case-control
  --keep 2024_BURDEN_AMP_EUR/EUR.samplestoKeep
  --out 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn

7450 MB RAM detected; reserving 3725 MB for main workspace.
5615 variants loaded from .bim file.
5115 people (0 males, 0 females, 5115 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.nosex .
5115 phenotype values loaded from .fam.
--extract range: 5582 variants excluded.
--extract range: 33 variants remaining.
--keep: 5115 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main var

In [83]:
! cat {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc

 CHR           SNP   A1   A2        MAF_A        MAF_U  NCHROBS_A  NCHROBS_U
   3   rs149480465    T    A    0.0002196    0.0003524       4554       5676
   3   rs147898644    T    C    0.0004392            0       4554       5676
   3             .    C    G            0    0.0003524       4554       5676
   3             .    C    T    0.0004392            0       4554       5676
   3   rs200491223    A    G            0    0.0003524       4554       5676
   3             .    A    G            0    0.0003524       4554       5676
   3   rs149121829    A    C     0.009003      0.01004       4554       5676
   3   rs201629093    A    G    0.0002196    0.0003524       4554       5676
   3   rs141952333    A    G     0.003074     0.003171       4554       5676
   3   rs202127368    T    C    0.0006588     0.001057       4554       5676
   3             .    A    G    0.0002196    0.0001762       4554       5676
   3   rs201263331    G    A    0.0008783     0.001409       455

In [84]:
freq_cc = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc', delim_whitespace=True, usecols=['SNP', 'MAF_A', 'MAF_U'])
freq_cc.head()

Unnamed: 0,SNP,MAF_A,MAF_U
0,rs149480465,0.00022,0.000352
1,rs147898644,0.000439,0.0
2,.,0.0,0.000352
3,.,0.000439,0.0
4,rs200491223,0.0,0.000352


In [85]:
all_freq = pd.merge(freq, freq_cc, on="SNP", how="left")
all_freq

Unnamed: 0,SNP,MAF,MAF_A,MAF_U
0,rs149480465,0.000293,0.000220,0.000352
1,rs147898644,0.000196,0.000439,0.000000
2,.,0.000196,0.000000,0.000352
3,.,0.000196,0.000439,0.000000
4,.,0.000196,0.000000,0.000352
...,...,...,...,...
70,.,0.000391,0.000220,0.000176
71,.,0.000391,0.000220,0.000352
72,.,0.000391,0.000000,0.000705
73,rs140537885,0.004301,0.004611,0.004052


In [86]:
assoc_and_freq = pd.merge(all_freq, assoc, on="SNP", how="left")
assoc_and_freq

Unnamed: 0,SNP,MAF,MAF_A,MAF_U,A1,F_A,F_U,A2,P,OR,L95,U95
0,rs149480465,0.000293,0.000220,0.000352,T,0.000220,0.000352,A,0.69670,0.6231,0.05648,6.874
1,rs147898644,0.000196,0.000439,0.000000,T,0.000439,0.000000,C,0.11430,,,
2,.,0.000196,0.000000,0.000352,C,0.000000,0.000352,G,0.20520,0.0000,0.00000,
3,.,0.000196,0.000000,0.000352,C,0.000439,0.000000,T,0.11430,,,
4,.,0.000196,0.000000,0.000352,A,0.000000,0.000352,G,0.20520,0.0000,0.00000,
...,...,...,...,...,...,...,...,...,...,...,...,...
364,.,0.000391,0.000000,0.000705,G,0.000220,0.000176,A,0.87600,1.2460,0.07794,19.930
365,.,0.000391,0.000000,0.000705,G,0.000220,0.000352,T,0.69670,0.6231,0.05648,6.874
366,.,0.000391,0.000000,0.000705,A,0.000000,0.000705,G,0.07316,0.0000,0.00000,
367,rs140537885,0.004301,0.004611,0.004052,G,0.004611,0.004052,T,0.66750,1.1390,0.62940,2.060


In [87]:
# Merge the results
assoc_and_freq = pd.merge(assoc_and_freq, assoc_adjusted, on="SNP", how="left")
assoc_and_freq.head()

Unnamed: 0,SNP,MAF,MAF_A,MAF_U,A1,F_A,F_U,A2,P,OR,L95,U95,BONF
0,rs149480465,0.000293,0.00022,0.000352,T,0.00022,0.000352,A,0.6967,0.6231,0.05648,6.874,1.0
1,rs147898644,0.000196,0.000439,0.0,T,0.000439,0.0,C,0.1143,,,,1.0
2,.,0.000196,0.0,0.000352,C,0.0,0.000352,G,0.2052,0.0,0.0,,1.0
3,.,0.000196,0.0,0.000352,C,0.0,0.000352,G,0.2052,0.0,0.0,,1.0
4,.,0.000196,0.0,0.000352,C,0.0,0.000352,G,0.2052,0.0,0.0,,1.0


In [88]:
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/chr3_{ancestry}_wPheno \
--extract range {WORK_DIR}/{ancestry}_DNAJC13.all_coding_nonsyn.variantstoKeep \
--keep {WORK_DIR}/{ancestry}.samplestoKeep \
--recode A \
--out {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.log.
Options in effect:
  --bfile 2024_BURDEN_AMP_EUR/chr3_EUR_wPheno
  --extract range 2024_BURDEN_AMP_EUR/EUR_DNAJC13.all_coding_nonsyn.variantstoKeep
  --keep 2024_BURDEN_AMP_EUR/EUR.samplestoKeep
  --out 2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn
  --recode A

7450 MB RAM detected; reserving 3725 MB for main workspace.
5615 variants loaded from .bim file.
5115 people (0 males, 0 females, 5115 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
2024_BURDEN_AMP_EUR/EUR_DNAJC13.coding_nonsyn.nosex .
5115 phenotype values loaded from .fam.
--extract range: 5582 variants excluded.
--extract range: 33 variants remaining.
--keep: 5115 people remaining.
phenotypes to be ignored, use the --allow-no-sex flag.
Using 1 thread (no multithreaded calculatio

In [None]:
recode = pd.read_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.raw', delim_whitespace=True)
recode.head()

In [90]:
# Make a list from the column names
column_names = recode.columns.tolist()

# Drop the first 6 columns to keep the variants 
variants = column_names[6:]

print(f'Number of variants in {ancestry} for DNAJC13: {len(variants)}')
variants

Number of variants in EUR for DNAJC13: 33


['rs149480465_T',
 'rs147898644_T',
 '._C',
 '._C.1',
 'rs200491223_A',
 '._A',
 'rs149121829_A',
 'rs201629093_A',
 'rs141952333_A',
 'rs202127368_T',
 '._A.1',
 'rs201263331_G',
 'rs372257861_G',
 'rs61748101_G',
 'rs147104839_A',
 'rs61748102_T',
 'rs61748103_A',
 'rs3762672_T',
 '._G',
 'rs55825559_T',
 'rs139620588_A',
 'rs202044084_C',
 'rs141090992_A',
 'rs79953286_G',
 'rs142160751_C',
 '._G.1',
 'rs145242123_T',
 'rs147575919_G',
 'rs10935014_C',
 'rs138693725_T',
 '._A.2',
 'rs140537885_G',
 'rs138367039_A']

In [91]:
# Pre-filter the dataset
cases_data = recode[recode['PHENOTYPE'] == 2]
controls_data = recode[recode['PHENOTYPE'] == 1]

results = []

for variant in variants:
    # For cases
    hom_cases = cases_data[cases_data[variant] == 2].shape[0]
    het_cases = cases_data[cases_data[variant] == 1].shape[0]
    total_cases = cases_data.shape[0]
#    freq_cases = (hom_cases + het_cases) / total_cases

    # For controls
    hom_controls = controls_data[controls_data[variant] == 2].shape[0]
    het_controls = controls_data[controls_data[variant] == 1].shape[0]
    total_controls = controls_data.shape[0]
#    freq_controls = (hom_controls + het_controls) / total_controls

    results.append({
        'Variant': variant,
        'Hom Cases': hom_cases,
        'Het Cases': het_cases,
        'Total Cases': total_cases,
#        'Freq in Cases': freq_cases,
        'Hom Controls': hom_controls,
        'Het Controls': het_controls,
        'Total Controls': total_controls,
#        'Freq in Controls': freq_controls
    })

# Return
df_results = pd.DataFrame(results)
df_results['SNP'] = df_results['Variant'].apply(lambda x: x.rsplit('_', 1)[0])

df_results

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls,SNP
0,rs149480465_T,0,1,2277,0,2,2838,rs149480465
1,rs147898644_T,0,2,2277,0,0,2838,rs147898644
2,._C,0,0,2277,0,2,2838,.
3,._C.1,0,2,2277,0,0,2838,.
4,rs200491223_A,0,0,2277,0,2,2838,rs200491223
5,._A,0,0,2277,0,2,2838,.
6,rs149121829_A,0,41,2277,0,57,2838,rs149121829
7,rs201629093_A,0,1,2277,0,2,2838,rs201629093
8,rs141952333_A,0,14,2277,0,18,2838,rs141952333
9,rs202127368_T,0,3,2277,0,6,2838,rs202127368


In [92]:
## Merge with assoc results 
full_results = pd.merge(assoc_and_freq, df_results, on="SNP", how="left")

In [93]:
clean_full_results = full_results[['SNP', 'A1', 'A2', 'F_A', 'F_U', 'P', 'OR','L95', 'U95', 'BONF', 
                                   'MAF', 'MAF_A', 'MAF_U', 
                                   'Hom Cases', 'Het Cases', 'Total Cases', 
                                   'Hom Controls','Het Controls', 'Total Controls']].copy()

In [94]:
print(clean_full_results.shape)
print(f'No. of SNPs: {clean_full_results.shape[0]}')
clean_full_results

(16833, 19)
No. of SNPs: 16833


Unnamed: 0,SNP,A1,A2,F_A,F_U,P,OR,L95,U95,BONF,MAF,MAF_A,MAF_U,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls
0,rs149480465,T,A,0.000220,0.000352,0.69670,0.6231,0.05648,6.874,1.0,0.000293,0.000220,0.000352,0,1,2277,0,2,2838
1,rs147898644,T,C,0.000439,0.000000,0.11430,,,,1.0,0.000196,0.000439,0.000000,0,2,2277,0,0,2838
2,.,C,G,0.000000,0.000352,0.20520,0.0000,0.00000,,1.0,0.000196,0.000000,0.000352,0,0,2277,0,2,2838
3,.,C,G,0.000000,0.000352,0.20520,0.0000,0.00000,,1.0,0.000196,0.000000,0.000352,0,2,2277,0,0,2838
4,.,C,G,0.000000,0.000352,0.20520,0.0000,0.00000,,1.0,0.000196,0.000000,0.000352,0,0,2277,0,2,2838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16828,.,A,G,0.000000,0.000705,0.07316,0.0000,0.00000,,1.0,0.000391,0.000000,0.000705,0,1,2277,0,1,2838
16829,.,A,G,0.000000,0.000705,0.07316,0.0000,0.00000,,1.0,0.000391,0.000000,0.000705,0,1,2277,0,2,2838
16830,.,A,G,0.000000,0.000705,0.07316,0.0000,0.00000,,1.0,0.000391,0.000000,0.000705,0,0,2277,0,4,2838
16831,rs140537885,G,T,0.004611,0.004052,0.66750,1.1390,0.62940,2.060,1.0,0.004301,0.004611,0.004052,0,21,2277,0,23,2838


In [95]:
# Look at significant SNPs, if any 
sig_freq = clean_full_results[clean_full_results['P']<0.05]
sig_snps = sig_freq['SNP'].tolist()
sig_snps

['rs61748102', 'rs3762672']

In [96]:
# Look at significant SNPs, if any 
sig_df_results = clean_full_results[clean_full_results['SNP'].isin(sig_snps)]
sig_df_results

Unnamed: 0,SNP,A1,A2,F_A,F_U,P,OR,L95,U95,BONF,MAF,MAF_A,MAF_U,Hom Cases,Het Cases,Total Cases,Hom Controls,Het Controls,Total Controls
9615,rs61748102,T,C,0.01449,0.01991,0.03787,0.724,0.533,0.9833,1.0,0.0175,0.01449,0.01991,1,64,2277,2,109,2838
9617,rs3762672,T,G,0.4517,0.4776,0.008968,0.901,0.8332,0.9743,0.2959,0.4661,0.4517,0.4776,458,1141,2277,663,1385,2838


In [97]:
# Save files to VM
clean_full_results.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.fullVariantInformation.txt', sep="\t", index=False)
sig_df_results.to_csv(f'{WORK_DIR}/{ancestry}_DNAJC13.SignificantVariantInformation.txt' , sep="\t", index=False)

In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/{ancestry}_DNAJC13.annovar.hg38_multianno.txt')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.fullVariantInformation.txt {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.SignificantVariantInformation.txt {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')

shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.raw {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq.cc {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.frq {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry}_DNAJC13.coding_nonsyn.assoc.adjusted {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/2024_BURDEN_AMP_{ancestry}/')