## __Single gene analysis__

```GP2 ❤️ Open Science 😍```


__0. Getting Started__
- Loading Python libraries
- Defining functions
- Installing packages

__1. Copy data from workspace to cloud environment__

__2. Extract SLC25A46__

__3. Annotate SLC25A46 variants__

__4. Extract coding/non-syn variants__

__5. Calculate frequency in cases versus controls__

__6. Save out results__



## __Single gene analysis__

```GP2 ❤️ Open Science 😍```


__0. Getting Started__
- Loading Python libraries
- Defining functions
- Installing packages

__1. Copy data from workspace to cloud environment__

__2. Extract SLC25A46__

__3. Annotate SLC25A46 variants__

__4. Extract coding/non-syn variants__

__5. Calculate frequency in cases versus controls__

__6. Save out results__



## Getting Started

### Loading Python libraries

In [1]:
# Use the os package to interact with the environment
import os

# Bring in Pandas for Dataframe functionality
import pandas as pd

# Numpy for basics

import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

#Import Sys
import sys as sys

  from IPython.core.display import display, HTML


### Defining functions

In [2]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    print(f'Executing: {query}', file=sys.stderr)
    return pd.read_gbq(query, project_id=BILLING_PROJECT_ID, dialect='standard')

# Utility routine for display a message and a link
def display_html_link(description, link_text, url):
    html = f'''
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
https://notebooks.firecloud.org/proxy/terra-18d8e41c/saturn-c0307c3a-6cd0-4eca-9068-61afda6b299f/jupyter/notebooks/Endophilin-A/edit/SH3GL2_AMP-PD_ALF.ipynb#    </p>
    '''

    display(HTML(html))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS"""
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Return a DataFrame from the contents of a delimited file in GCS"""
    return pd.read_csv(StringIO(gcs_read_file(path)), sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    url = '{}?{}'.format(
        os.path.join('https://console.cloud.google.com/storage/browser',
                     gcs_path.replace("chrome://","")),
        urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID}))

    display_html_link(description, link_text, url)

### Set paths

In [None]:
# Set up billing project and data path variables
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

## Print the information to check we are in the proper release and billing 
## This will be different for you, the user, depending on the billing project your workspace is on
print('Billing and Workspace')
print(f'Workspace Name: {WORKSPACE_NAME}')
print(f'Billing Project: {BILLING_PROJECT_ID}')
print(f'Workspace Bucket, where you can upload and download data: {WORKSPACE_BUCKET}')
print('')


## GP2 v5.0
## Explicitly define release v5.0 path 
GP2_RELEASE_PATH = 'path/'
GP2_CLINICAL_RELEASE_PATH = f'{GP2_RELEASE_PATH}/clinical_data'
GP2_RAW_GENO_PATH = f'{GP2_RELEASE_PATH}/raw_genotypes'
GP2_IMPUTED_GENO_PATH = f'{GP2_RELEASE_PATH}/imputed_genotypes'
print('GP2 v5.0')
print(f'Path to GP2 v5.0 Clinical Data: {GP2_CLINICAL_RELEASE_PATH}')
print(f'Path to GP2 v5.0 Raw Genotype Data: {GP2_RAW_GENO_PATH}')
print(f'Path to GP2 v5.0 Imputed Genotype Data: {GP2_IMPUTED_GENO_PATH}')


## AMP-PD v3.0
## Explicitly define release v3.0 path 
AMP_RELEASE_PATH = 'path/'
AMP_CLINICAL_RELEASE_PATH = f'{AMP_RELEASE_PATH}/clinical'

AMP_WGS_RELEASE_PATH = 'path/'
AMP_WGS_RELEASE_PLINK_PATH = os.path.join(AMP_WGS_RELEASE_PATH, 'plink')

print('AMP-PD v3.0')
print(f'Path to AMP-PD v3.0 Clinical Data: {AMP_CLINICAL_RELEASE_PATH}')
print(f'Path to AMP-PD v3.0 WGS Data: {AMP_WGS_RELEASE_PLINK_PATH}')
print('')

### Install packages

### Install Plink 1.9 and Plink 2.0

In [4]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink; then
echo "Plink1.9 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink \n    -------"
wget -N http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20190304.zip 
unzip -o plink_linux_x86_64_20190304.zip
echo -e "\n plink downloaded and unzipped in /home/jupyter/tools \n "

fi


if test -e /home/jupyter/tools/plink2; then
echo "Plink2 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink2 \n    -------"
wget -N https://s3.amazonaws.com/plink2-assets/alpha3/plink2_linux_avx2_20220603.zip
unzip -o plink2_linux_avx2_20220603.zip
echo -e "\n plink2 downloaded and unzipped in /home/jupyter/tools \n "

fi

Plink1.9 is already installed in /home/jupyter/tools/
Plink2 is already installed in /home/jupyter/tools/


In [5]:
%%bash
ls /home/jupyter/tools/

annovar
annovar.latest.tar.gz
LICENSE
plink
plink2
plink2_linux_x86_64_20240504.zip
plink_linux_x86_64_20190304.zip
prettify
toy.map
toy.ped


### Remote restrictions

In [6]:
%%bash

# chmod plink 1.9 
chmod u+x /home/jupyter/tools/plink

In [7]:
%%bash

# chmod plink 2.0
chmod u+x /home/jupyter/tools/plink2

### Install ANNOVAR

In [8]:
%%bash

# Install ANNOVAR:
# https://www.openbioinformatics.org/annovar/annovar_download_form.php

if test -e /home/jupyter/tools/annovar; then

echo "annovar is already installed in /home/jupyter/tools/"
else
echo "annovar is not installed"
cd /home/jupyter/tools/

wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz

tar xvfz annovar.latest.tar.gz

fi

annovar is already installed in /home/jupyter/tools/


In [9]:
%%bash
ls /home/jupyter/tools/

annovar
annovar.latest.tar.gz
LICENSE
plink
plink2
plink2_linux_x86_64_20240504.zip
plink_linux_x86_64_20190304.zip
prettify
toy.map
toy.ped


### Install ANNOVAR: Download sources of annotation

In [10]:
%%bash

cd /home/jupyter/tools/annovar/

perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20170905 humandb/
#perl annotate_variation.pl -buildver hg38 -downdb cytoBand humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ensGene humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar exac03 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp147 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp30a humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad211_genome humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ljb26_all humandb/


NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGene.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneMrna.fa.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneVersion.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory
NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20170905.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20170905.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished d

In [11]:
%%bash
ls /home/jupyter/tools/annovar/

annotate_variation.pl
coding_change.pl
convert2annovar.pl
example
humandb
retrieve_seq_from_fasta.pl
table_annovar.pl
variants_reduction.pl


## Copy data from AMP-PD bucket to workspace

In [12]:
# Make a directory called "/home/jupyter/SLC25A46_AMPPD/"
print("Making a working directory")
WORK_DIR = f'/home/jupyter/SLC25A46_AMPPD/'
shell_do(f'mkdir -p {WORK_DIR}') # f' means f-string - contains expressions to execute the code

Making a working directory


Executing: mkdir -p /home/jupyter/SLC25A46_AMPPD/


In [None]:
# Check directory where AMP-PD data is
shell_do(f'gsutil -u {BILLING_PROJECT_ID} ls {AMP_WGS_RELEASE_PLINK_PATH}/pfiles')

In [None]:
#Copy data to directory defined above
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_WGS_RELEASE_PLINK_PATH}/pfiles/chr5* {WORK_DIR}')

### Create a covariate file with AMP-PD data

In [None]:
## Copy file from "AMP_CLINICAL_RELEASE_PATH"
shell_do(f'gsutil -u {BILLING_PROJECT_ID} ls {AMP_CLINICAL_RELEASE_PATH}')

In [None]:
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_RELEASE_PATH}/amp_pd_case_control.csv {WORK_DIR}')
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/Demographics.csv {WORK_DIR}')
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/PD_Medical_History.csv {WORK_DIR}')

In [None]:
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/Family_History_PD.csv {WORK_DIR}')

In [None]:
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/MDS_UPDRS_Part_IV.csv {WORK_DIR}')

In [None]:
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/DaTSCAN_SBR.csv {WORK_DIR}')

In [None]:
pd_medical_history_df = pd.read_csv(f'{WORK_DIR}/PD_Medical_History.csv')
pd_medical_history_df[pd_medical_history_df['participant_id']=='PD-PDUV122BP2']

In [None]:
pd_family_history_df = pd.read_csv(f'{WORK_DIR}/Family_History_PD.csv')
pd_family_history_df[pd_family_history_df['participant_id']=='PD-PDUV122BP2']

In [22]:
pd_medical_history_df = pd_medical_history_df[["participant_id","visit_month","diagnosis","age_at_diagnosis"]]

In [None]:
pd_case_control_df = pd.read_csv(f'{WORK_DIR}/amp_pd_case_control.csv')
pd_case_control_df[pd_case_control_df['participant_id']=='HB-PD_INVUM287GR7']

In [24]:
pd_case_control_latest_df = pd_case_control_df[['participant_id', 'diagnosis_latest', 'case_control_other_latest']].copy()

In [25]:
pd_case_control_latest_df.columns = ['ID', 'LATEST_DX', 'CASE_CONTROL']

In [26]:
case_con_reduced = pd_case_control_latest_df.copy()
case_con_reduced.drop_duplicates(subset=['ID'], inplace=True)

In [27]:
conditions = [
    (case_con_reduced['CASE_CONTROL'] == "Case"),
    (case_con_reduced['CASE_CONTROL'] == "Control")]

In [28]:
choices = [2,1]
case_con_reduced['PHENO'] = np.select(conditions, choices, default=-9).astype(np.int64)

In [29]:
case_con_reduced.reset_index(inplace=True)
case_con_reduced.drop(columns=["index"], inplace=True)

In [None]:
case_con_reduced.drop(columns=['CASE_CONTROL'], inplace=True)
case_con_reduced.head()

In [None]:
demographics_df = pd.read_csv(f'{WORK_DIR}/Demographics.csv')
demographics_df[demographics_df['participant_id']=='PD-PDUV122BP2']

In [32]:
demographics_df.rename(columns = {'participant_id':'ID'}, inplace = True)
demographics_df.rename(columns = {'age_at_baseline':'BASELINE_AGE'}, inplace = True)
demographics_df.rename(columns = {'race':'RACE'}, inplace = True)

In [33]:
demographics_baseline_df = demographics_df \
.sort_values('visit_month', ascending=True) \
.drop_duplicates('ID').sort_index()

In [None]:
demographics_df_casecon = demographics_df.merge(case_con_reduced, on='ID', how='outer')
demographics_df_casecon.head()

In [35]:
conditions = [
     (demographics_df_casecon['sex'] == "Male"),
     (demographics_df_casecon['sex'] == "Female")]

In [36]:
choices = [1,2]
demographics_df_casecon['SEX'] = np.select(conditions, choices, default=None).astype(np.int64)

In [None]:
demographics_df_casecon.drop(columns=['sex'], inplace=True)
demographics_df_casecon.head()

In [38]:
demographics_df_casecon_toKeep_sex = demographics_df_casecon[['ID', 'ID', 'SEX'
                                                          ]].copy()
demographics_df_casecon_toKeep_sex.columns = ['FID','IID', 'SEX']
demographics_df_casecon_toKeep_sex.head()
demographics_df_casecon_toKeep_sex.to_csv(f'{WORK_DIR}/SEX.txt',index=False)

In [39]:
demographics_df_casecon_toKeep_pheno = demographics_df_casecon[['ID', 'ID', 'PHENO'
                                                          ]].copy()
demographics_df_casecon_toKeep_pheno.columns = ['FID','IID', 'PHENO']
demographics_df_casecon_toKeep_pheno.head()
demographics_df_casecon_toKeep_pheno.to_csv(f'{WORK_DIR}/PHENO.txt',index=False)

In [None]:
moca_df = pd.read_csv(f'{WORK_DIR}/MOCA.csv')
moca_df[moca_df['participant_id']=='PD-PDUV122BP2']

In [None]:
moca_df = pd.read_csv(f'{WORK_DIR}/MDS_UPDRS_Part_IV.csv')
moca_df[moca_df['participant_id']=='PD-PDUV122BP2']

In [None]:
dat_df = pd.read_csv(f'{WORK_DIR}/DaTSCAN_visual_interpretation.csv')
dat_df[dat_df['participant_id']=='PD-PDUV122BP2']

In [None]:
datscan_df = pd.read_csv(f'{WORK_DIR}/DaTSCAN_SBR.csv')
datscan_df[datscan_df['participant_id']=='PD-PDUV122BP2']

## Extract SLC25A46

In [40]:
%%bash

WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

## SLC25A46 : gene positions on hg38 (from https://useast.ensembl.org/index.html)
/home/jupyter/tools/plink2 \
--pfile chr5 \
--chr 5 \
--from-bp 110738136 \
--to-bp 110765161 \
--make-bed \
--max-alleles 2 \
--out temp_AMPPD

PLINK v2.00a6LM 64-bit Intel (4 May 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to temp_AMPPD.log.
Options in effect:
  --chr 5
  --from-bp 110738136
  --make-bed
  --max-alleles 2
  --out temp_AMPPD
  --pfile chr5
  --to-bp 110765161

Start time: Thu May 23 02:14:29 2024
3676 MiB RAM detected, ~2428 available; reserving 1838 MiB for main workspace.
Using 1 compute thread.
10418 samples (0 females, 0 males, 10418 ambiguous; 10418 founders) loaded from
chr5.psam.
9161370 out of 9880933 variants loaded from chr5.pvar.
Note: No phenotype data present.
1257 variants remaining after main filters.
Writing temp_AMPPD.fam ... done.
Writing temp_AMPPD.bim ... done.
Writing temp_AMPPD.bed ... done.
End time: Thu May 23 02:16:56 2024


In [41]:
%%bash
WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

head temp_AMPPD.bim

5	.	0	110738161	C	G
5	rs78443964	0	110738170	C	T
5	.	0	110738172	A	G
5	.	0	110738183	C	T
5	.	0	110738210	C	G
5	rs147877745	0	110738249	C	T
5	.	0	110738270	G	A
5	.	0	110738345	C	G
5	rs114440759	0	110738407	T	C
5	.	0	110738408	T	G


In [42]:
bim = pd.read_csv("/home/jupyter/SLC25A46_AMPPD/temp_AMPPD.bim",sep="\t",names=['chr','rsID','pos','bp','A1','A2'])
bim['chr'] = bim['chr'].astype(str)
bim['bp'] = bim['bp'].astype(str)

In [43]:
bim['rsID'] = bim.chr.str.cat(bim.bp, sep=':')

In [44]:
bim.head()

Unnamed: 0,chr,rsID,pos,bp,A1,A2
0,5,5:110738161,0,110738161,C,G
1,5,5:110738170,0,110738170,C,T
2,5,5:110738172,0,110738172,A,G
3,5,5:110738183,0,110738183,C,T
4,5,5:110738210,0,110738210,C,G


In [45]:
bim.to_csv(f'{WORK_DIR}/temp_AMPPD.bim',sep="\t",index=False,header=None)

In [None]:
%%bash
WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

head temp_AMPPD.fam
#Store the first column of IDs in a new file
awk '{ print $1 }' temp_AMPPD.fam > IDs.txt

In [47]:
sex = pd.read_csv(f'{WORK_DIR}/SEX.txt')
pheno = pd.read_csv(f'{WORK_DIR}/PHENO.txt')
ids = pd.read_csv(f'{WORK_DIR}/IDs.txt',names=['FID'])

In [48]:
sex_new = pd.merge(sex,ids,on="FID")
sex_new.to_csv(f'{WORK_DIR}/SEX_new.txt',index=False, sep="\t")

In [49]:
pheno_new = pd.merge(pheno,ids,on="FID")
pheno_new.to_csv(f'{WORK_DIR}/PHENO_new.txt',index=False, sep="\t")

In [50]:
%%bash

WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

# Update sex variable
/home/jupyter/tools/plink \
--bfile temp_AMPPD \
--update-sex SEX_new.txt  \
--pheno PHENO_new.txt \
--make-bed \
--out pheno_SLC25A46

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to pheno_SLC25A46.log.
Options in effect:
  --bfile temp_AMPPD
  --make-bed
  --out pheno_SLC25A46
  --pheno PHENO_new.txt
  --update-sex SEX_new.txt

3676 MB RAM detected; reserving 1838 MB for main workspace.
1257 variants loaded from .bim file.
10418 people (0 males, 0 females, 10418 ambiguous) loaded from .fam.
Ambiguous sex IDs written to pheno_SLC25A46.nosex .
7512 phenotype values present after --pheno.
--update-sex: 10418 people updated, 1 ID not present.
Using 1 thread.
Before main variant filters, 10418 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.999982.
1257 variants and 10418 

### Visualize plink files bim, fam and bed

In [51]:
%%bash

# Visualize bim file
WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

head pheno_SLC25A46.bim

5	5:110738161	0	110738161	C	G
5	5:110738170	0	110738170	C	T
5	5:110738172	0	110738172	A	G
5	5:110738183	0	110738183	C	T
5	5:110738210	0	110738210	C	G
5	5:110738249	0	110738249	C	T
5	5:110738270	0	110738270	G	A
5	5:110738345	0	110738345	C	G
5	5:110738407	0	110738407	T	C
5	5:110738408	0	110738408	T	G


In [None]:
%%bash

# Visualize fam file
WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

head pheno_SLC25A46.fam

In [None]:
%%bash

# Visualize bed file
WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

head pheno_SLC25A46.bed

In [54]:
%%bash

WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

# Turn binary files into VCF
/home/jupyter/tools/plink \
--bfile pheno_SLC25A46 \
--recode vcf-fid \
--allow-no-sex \
--out pheno_SLC25A46

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to pheno_SLC25A46.log.
Options in effect:
  --allow-no-sex
  --bfile pheno_SLC25A46
  --out pheno_SLC25A46
  --recode vcf-fid

3676 MB RAM detected; reserving 1838 MB for main workspace.
1257 variants loaded from .bim file.
10418 people (5782 males, 4636 females) loaded from .fam.
7512 phenotype values loaded from .fam.
Using 1 thread.
Before main variant filters, 10418 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.999982.
1257 variants and 10418 people pass filters and QC.
Among remaining phenotypes, 3359 are cases and 4153 are controls.  (2906
phenotypes are missing.)
--recode vcf-fid to 

## Annotate SLC25A46 variants

In [55]:
%%bash
## 
WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR
export PATH=$PATH:/home/jupyter/tools/rvtests/third/tabix-0.2.6/

### Bgzip and Tabix
bgzip pheno_SLC25A46.vcf

tabix -f -p vcf pheno_SLC25A46.vcf.gz

[bgzip] can't create pheno_SLC25A46.vcf.gz: File exists


In [56]:
%%bash

WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

### Annotate variants using ANNOVAR: https://annovar.openbioinformatics.org/en/latest/ 
perl /home/jupyter/tools/annovar/table_annovar.pl pheno_SLC25A46.vcf.gz /home/jupyter/tools/annovar/humandb/ -buildver hg38 \
-out pheno_SLC25A46.annovar \
-remove -protocol refGene,clinvar_20140902 \
-operation g,f \
--nopolish \
-nastring . \
-vcfinput


NOTICE: Running with system command <convert2annovar.pl  -includeinfo -allsample -withfreq -format vcf4 pheno_SLC25A46.vcf.gz > pheno_SLC25A46.annovar.avinput>
NOTICE: Finished reading 1264 lines from VCF file
NOTICE: A total of 1257 locus in VCF file passed QC threshold, representing 1130 SNPs (764 transitions and 366 transversions) and 127 indels/substitutions
NOTICE: Finished writing allele frequencies based on 11772340 SNP genotypes (7959352 transitions and 3812988 transversions) and 1323086 indels/substitutions for 10418 samples

NOTICE: Running with system command </home/jupyter/tools/annovar/table_annovar.pl pheno_SLC25A46.annovar.avinput /home/jupyter/tools/annovar/humandb/ -buildver hg38 -outfile pheno_SLC25A46.annovar -remove -protocol refGene,clinvar_20140902 -operation g,f --nopolish -nastring . -otherinfo>
-----------------------------------------------------------------
NOTICE: Processing operation=g protocol=refGene

NOTICE: Running with system command <annotate_variati

## Extract coding and non-syn variants

In [62]:
# Visualize multianno file
SLC25A46 = pd.read_csv(f'{WORK_DIR}/pheno_SLC25A46.annovar.hg38_multianno.txt', sep = '\t')
SLC25A46.head

<bound method NDFrame.head of       Chr      Start        End   Ref Alt Func.refGene Gene.refGene  \
0       5  110738161  110738161     G   C         UTR5     SLC25A46   
1       5  110738170  110738170     T   C         UTR5     SLC25A46   
2       5  110738172  110738172     G   A         UTR5     SLC25A46   
3       5  110738183  110738183     T   C         UTR5     SLC25A46   
4       5  110738210  110738210     G   C         UTR5     SLC25A46   
...   ...        ...        ...   ...  ..          ...          ...   
1252    5  110765017  110765017     T   C         UTR3     SLC25A46   
1253    5  110765038  110765038     G   A         UTR3     SLC25A46   
1254    5  110765089  110765092  CTTA   -         UTR3     SLC25A46   
1255    5  110765140  110765140     A   G         UTR3     SLC25A46   
1256    5  110765150  110765150     G   A         UTR3     SLC25A46   

                                     GeneDetail.refGene ExonicFunc.refGene  \
0                                 NM_00

In [63]:
SLC25A46 = SLC25A46[SLC25A46['Gene.refGene']=='SLC25A46']

In [64]:
SLC25A46.to_csv(f'{WORKSPACE_BUCKET}/pheno_SLC25A46_final_AMPPD.annovar.hg38_multianno.txt')

In [66]:
SLC25A46.count()

Chr               1257
Start             1257
End               1257
Ref               1257
Alt               1257
                  ... 
Otherinfo10426    1257
Otherinfo10427    1257
Otherinfo10428    1257
Otherinfo10429    1257
Otherinfo10430    1257
Length: 10441, dtype: int64

In [67]:
SLC25A46['Func.refGene'].value_counts()

Func.refGene
intronic    1030
UTR3         151
exonic        51
UTR5          23
splicing       2
Name: count, dtype: int64

## Filter exonic variants!

In [68]:

coding = SLC25A46[SLC25A46['Func.refGene'] == 'exonic']
coding.head()

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,Otherinfo10421,Otherinfo10422,Otherinfo10423,Otherinfo10424,Otherinfo10425,Otherinfo10426,Otherinfo10427,Otherinfo10428,Otherinfo10429,Otherinfo10430
58,5,110739128,110739130,GCG,-,exonic,SLC25A46,.,nonframeshift deletion,SLC25A46:NM_001303249:exon1:c.9_11del:p.3_4del...,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
59,5,110739131,110739131,G,A,exonic,SLC25A46,.,synonymous SNV,"SLC25A46:NM_001303249:exon1:c.G12A:p.R4R,SLC25...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
60,5,110739163,110739163,G,A,exonic,SLC25A46,.,nonsynonymous SNV,"SLC25A46:NM_001303249:exon1:c.G44A:p.R15Q,SLC2...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
61,5,110739171,110739171,G,T,exonic,SLC25A46,.,nonsynonymous SNV,"SLC25A46:NM_001303249:exon1:c.G52T:p.A18S,SLC2...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
62,5,110739182,110739182,G,T,exonic,SLC25A46,.,nonsynonymous SNV,"SLC25A46:NM_001303249:exon1:c.G63T:p.E21D,SLC2...",...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0


In [69]:
coding['ExonicFunc.refGene'].value_counts()

ExonicFunc.refGene
nonsynonymous SNV          33
synonymous SNV             14
nonframeshift deletion      2
nonframeshift insertion     1
stoploss                    1
Name: count, dtype: int64

In [None]:
coding.to_csv(f'{WORK_DIR}/coding.txt', sep = '\t', index = False)
coding

In [None]:
# Filter intronic variants
intronic = SLC25A46[SLC25A46['Func.refGene'] == 'intronic']
intronic.head()

In [72]:
intronic.count()

Chr               1030
Start             1030
End               1030
Ref               1030
Alt               1030
                  ... 
Otherinfo10426    1030
Otherinfo10427    1030
Otherinfo10428    1030
Otherinfo10429    1030
Otherinfo10430    1030
Length: 10441, dtype: int64

In [None]:
# Filter UTR3 variants
UTR3 = SLC25A46[SLC25A46['Func.refGene'] == 'UTR3']
UTR3.head()

In [74]:
UTR3.count()

Chr               151
Start             151
End               151
Ref               151
Alt               151
                 ... 
Otherinfo10426    151
Otherinfo10427    151
Otherinfo10428    151
Otherinfo10429    151
Otherinfo10430    151
Length: 10441, dtype: int64

In [None]:
# Filter UTR5 variants
UTR5 = SLC25A46[SLC25A46['Func.refGene'] == 'UTR5']
UTR5.head()

In [76]:
UTR5.count()

Chr               23
Start             23
End               23
Ref               23
Alt               23
                  ..
Otherinfo10426    23
Otherinfo10427    23
Otherinfo10428    23
Otherinfo10429    23
Otherinfo10430    23
Length: 10441, dtype: int64

## Calculate freq of coding and non-syn vars in cases versus controls

In [77]:
reduced_coding = coding[["Chr", "Start", "End", "Gene.refGene"]]
reduced_coding.to_csv(f'{WORK_DIR}/reduced_coding.txt', sep = '\t', index = False, header= False)

In [78]:
%%bash

WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

head reduced_coding.txt

5	110739128	110739130	SLC25A46
5	110739131	110739131	SLC25A46
5	110739163	110739163	SLC25A46
5	110739171	110739171	SLC25A46
5	110739182	110739182	SLC25A46
5	110739259	110739259	SLC25A46
5	110739263	110739263	SLC25A46
5	110739266	110739266	SLC25A46
5	110739267	110739267	SLC25A46
5	110739287	110739287	SLC25A46


In [80]:
%%bash

WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

/home/jupyter/tools/plink --bfile pheno_SLC25A46 --extract range reduced_coding.txt --assoc --ci 0.95 --out coding_pheno_SLC25A46 --allow-no-sex

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to coding_pheno_SLC25A46.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile pheno_SLC25A46
  --ci 0.95
  --extract range reduced_coding.txt
  --out coding_pheno_SLC25A46

3676 MB RAM detected; reserving 1838 MB for main workspace.
1257 variants loaded from .bim file.
10418 people (5782 males, 4636 females) loaded from .fam.
7512 phenotype values loaded from .fam.
--extract range: 1208 variants excluded.
--extract range: 49 variants remaining.
Using 1 thread.
Before main variant filters, 10418 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.999996.
49 variants and 10418 people pass f

In [81]:
SLC25A46_freq = pd.read_csv(f'{WORK_DIR}/coding_pheno_SLC25A46.assoc', delim_whitespace=True)
SLC25A46_freq

Unnamed: 0,CHR,SNP,BP,A1,F_A,F_U,A2,CHISQ,P,OR,SE,L95,U95
0,5,5:110739131,110739131,A,0.000149,0.0,G,1.236,0.2662,,,,
1,5,5:110739163,110739163,A,0.000149,0.0,G,1.236,0.2662,,,,
2,5,5:110739171,110739171,T,0.000149,0.000361,G,0.6291,0.4277,0.412,1.155,0.04285,3.962
3,5,5:110739182,110739182,T,0.0,0.00012,G,0.8089,0.3685,0.0,inf,0.0,
4,5,5:110739259,110739259,A,0.000149,0.0,C,1.236,0.2662,,,,
5,5,5:110739263,110739263,C,0.000149,0.0,T,1.236,0.2662,,,,
6,5,5:110739266,110739266,T,0.02397,0.01914,C,4.144,0.04178,1.258,0.113,1.008,1.57
7,5,5:110739267,110739267,A,0.001489,0.000361,C,5.46,0.01946,4.126,0.6585,1.135,15.0
8,5,5:110739287,110739287,T,0.000149,0.0,C,1.236,0.2662,,,,
9,5,5:110739305,110739305,A,0.0,0.0,G,,,,,,


In [82]:
%%bash

WORK_DIR='/home/jupyter/SLC25A46_AMPPD/'
cd $WORK_DIR

/home/jupyter/tools/plink --bfile pheno_SLC25A46 --extract range reduced_coding.txt --recode A --out SLC25A46_AMPPD_EUR_variants

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_AMPPD_EUR_variants.log.
Options in effect:
  --bfile pheno_SLC25A46
  --extract range reduced_coding.txt
  --out SLC25A46_AMPPD_EUR_variants
  --recode A

3676 MB RAM detected; reserving 1838 MB for main workspace.
1257 variants loaded from .bim file.
10418 people (5782 males, 4636 females) loaded from .fam.
7512 phenotype values loaded from .fam.
--extract range: 1208 variants excluded.
--extract range: 49 variants remaining.
Using 1 thread.
Before main variant filters, 10418 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.999996.
49 variants and 10418 people pass filters and QC.

In [None]:
SLC25A46_AMPPD_EUR_variants_recode = pd.read_csv(f'{WORK_DIR}/SLC25A46_AMPPD_EUR_variants.raw', delim_whitespace=True)
SLC25A46_AMPPD_EUR_variants_recode

### SLC25A46 5:110739171_T

In [None]:
SLC25A46_AMPPD_EUR_variants_recode[(SLC25A46_AMPPD_EUR_variants_recode['5:110739171_T']==1)&(SLC25A46_AMPPD_EUR_variants_recode['PHENOTYPE']==2)]

In [None]:
SLC25A46_AMPPD_EUR_variants_recode[(SLC25A46_AMPPD_EUR_variants_recode['5:110739171_T']==1)&(SLC25A46_AMPPD_EUR_variants_recode['PHENOTYPE']==1)]

In [None]:
SLC25A46_AMPPD_EUR_variants_recode[(SLC25A46_AMPPD_EUR_variants_recode['5:110739171_T']==2)&(SLC25A46_AMPPD_EUR_variants_recode['PHENOTYPE']==2)]

In [None]:
SLC25A46_AMPPD_EUR_variants_recode[(SLC25A46_AMPPD_EUR_variants_recode['5:110739171_T']==2)&(SLC25A46_AMPPD_EUR_variants_recode['PHENOTYPE']==1)]

### GCH1 p.Gly203Arg

In [None]:
GCH1_AMPPD_EUR_variants_recode[(GCH1_AMPPD_EUR_variants_recode['14:54845787_T']==1)&(GCH1_AMPPD_EUR_variants_recode['PHENOTYPE']==2)]

In [None]:
GCH1_AMPPD_EUR_variants_recode[(GCH1_AMPPD_EUR_variants_recode['14:54845787_T']==1)&(GCH1_AMPPD_EUR_variants_recode['PHENOTYPE']==1)]

In [None]:
GCH1_AMPPD_EUR_variants_recode[(GCH1_AMPPD_EUR_variants_recode['14:54845787_T']==2)&(GCH1_AMPPD_EUR_variants_recode['PHENOTYPE']==1)]

In [None]:
GCH1_AMPPD_EUR_variants_recode[(GCH1_AMPPD_EUR_variants_recode['14:54845787_T']==2)&(GCH1_AMPPD_EUR_variants_recode['PHENOTYPE']==2)]

### GCH1 p.Gln103His

In [None]:
GCH1_AMPPD_EUR_variants_recode[(GCH1_AMPPD_EUR_variants_recode['14:54902355_G']==2)&(GCH1_AMPPD_EUR_variants_recode['PHENOTYPE']==2)]

In [None]:
GCH1_AMPPD_EUR_variants_recode[(GCH1_AMPPD_EUR_variants_recode['14:54902355_G']==2)&(GCH1_AMPPD_EUR_variants_recode['PHENOTYPE']==1)]

In [None]:
GCH1_AMPPD_EUR_variants_recode[(GCH1_AMPPD_EUR_variants_recode['14:54902355_G']==1)&(GCH1_AMPPD_EUR_variants_recode['PHENOTYPE']==1)]

In [None]:
GCH1_AMPPD_EUR_variants_recode[(GCH1_AMPPD_EUR_variants_recode['14:54902355_G']==1)&(GCH1_AMPPD_EUR_variants_recode['PHENOTYPE']==2)]

## Save out results..!

In [None]:
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR} {WORKSPACE_BUCKET}')