# SLC25A46 - Single gene analysis in GP2 data

## Description


### 0. Getting Started

- Loading Python libraries
- Defining functions
- Installing packages

### 1. Copy data from workspace to cloud environment

### 2. Extract SLC25A46

### 3. Annotate SLC25A46 variants

### 4. Extract coding/non-syn variants

### 5. Calculate frequency in cases versus controls

### 6. Calculate frequency (homozygotes) in cases versus controls

### 7. Save out results

## Getting Started

### Loading Python libraries

In [1]:
# Use the os package to interact with the environment
import os
import sys

# Bring in Pandas for Dataframe functionality
import pandas as pd
from functools import reduce

# Bring some visualization functionality 
import seaborn as sns  

# numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

#Import Sys
import sys as sys

### Defining functions

In [2]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    print(f'Executing: {query}', file=sys.stderr)
    return pd.read_gbq(query, project_id=BILLING_PROJECT_ID, dialect='standard')

# Utility routine for display a message and a link
def display_html_link(description, link_text, url):
    html = f'''
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
    </p>
    '''

    display(HTML(html))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS"""
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Return a DataFrame from the contents of a delimited file in GCS"""
    return pd.read_csv(StringIO(gcs_read_file(path)), sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    url = '{}?{}'.format(
        os.path.join('https://console.cloud.google.com/storage/browser',
                     gcs_path.replace("cloud://","")),
        urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID}))

    display_html_link(description, link_text, url)

### Set paths

In [None]:
# Set up billing project and data path variables
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

## Print the information to check we are in the proper release and billing 
## This will be different for you, the user, depending on the billing project your workspace is on
print('Billing and Workspace')
print(f'Workspace Name @ `WORKSPACE_NAME`: {WORKSPACE_NAME}')
print(f'Billing Project @ `BILLING_PROJECT_ID`: {BILLING_PROJECT_ID}')
print(f'Workspace Bucket, where you can upload and download data @ `WORKSPACE_BUCKET`: {WORKSPACE_BUCKET}')
print('')

## GP2 v7.0
## Explicitly define release v7.0 path 
GP2_RELEASE_PATH = 'path/'
GP2_CLINICAL_RELEASE_PATH = f'{GP2_RELEASE_PATH}/clinical_data'
GP2_RAW_GENO_PATH = f'{GP2_RELEASE_PATH}/raw_genotypes'
GP2_IMPUTED_GENO_PATH = f'{GP2_RELEASE_PATH}/imputed_genotypes'
GP2_META_PATH = f'{GP2_RELEASE_PATH}/meta_data'
GP2_RELEASE_PATH_2 = 'cloud://gp2tier2/release6_21122023'
GP2_WGS_PATH = f'{GP2_RELEASE_PATH_2}/wgs'

print('GP2 v7.0')
print(f'Path to GP2 v7.0 Clinical Data: {GP2_CLINICAL_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Raw Genotype Data: {GP2_RAW_GENO_PATH}')
print(f'Path to GP2 v7.0 Imputed Genotype Data: {GP2_IMPUTED_GENO_PATH}')
print(f'Path to GP2 v7.0 Metadata: {GP2_META_PATH}')
print(f'Path to GP2 v6.0 WGS Data: {GP2_WGS_PATH}')

### Make 'SLC25A46_WGS' for {All Ancestries}

In [4]:
ancestry = 'all_ancestries'
WORK_DIR = f'SLC25A46_WGS_{ancestry}'

! mkdir {WORK_DIR}

### Install packages

#### Install Plink 1.9 and Plink 2.0

In [5]:
%%bash

mkdir -p ~/tools
cd ~/tools

if test -e /home/jupyter/tools/plink; then
echo "Plink1.9 is already installed in /home/jupyter/tools/"

else
echo -e "Downloading plink \n    -------"
wget -N http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20190304.zip 
unzip -o plink_linux_x86_64_20190304.zip
echo -e "\n plink downloaded and unzipped in /home/jupyter/tools \n "

fi

Plink1.9 is already installed in /home/jupyter/tools/


In [6]:
%%bash

# Install plink 2.0
cd /home/jupyter/tools/
if test -e /home/jupyter/tools/plink2; then

echo "Plink2 is already installed in /home/jupyter/tools/"
else
echo "Plink2 is not installed"
cd /home/jupyter/tools/

wget https://s3.amazonaws.com/plink2-assets/plink2_linux_x86_64_20240504.zip

unzip -o plink2_linux_x86_64_20240504.zip

fi

Plink2 is already installed in /home/jupyter/tools/


In [7]:
! ls /home/jupyter/tools/

annovar		       plink2				 toy.map
annovar.latest.tar.gz  plink2_linux_x86_64_20240504.zip  toy.ped
LICENSE		       plink_linux_x86_64_20190304.zip
plink		       prettify


#### Remove restrictions

In [8]:
# chmod to make sure you have permission to run the program
! chmod u+x /home/jupyter/tools/plink
! chmod u+x /home/jupyter/tools/plink2

### Install ANNOVAR

In [9]:
%%bash

# Install ANNOVAR:
# https://www.openbioinformatics.org/annovar/annovar_download_form.php

if test -e /home/jupyter/tools/annovar; then

echo "annovar is already installed in /home/jupyter/tools/"
else
echo "annovar is not installed"
cd /home/jupyter/tools/

wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz

tar xvfz annovar.latest.tar.gz

fi

annovar is already installed in /home/jupyter/tools/


In [10]:
%%bash
ls /home/jupyter/tools/

annovar
annovar.latest.tar.gz
LICENSE
plink
plink2
plink2_linux_x86_64_20240504.zip
plink_linux_x86_64_20190304.zip
prettify
toy.map
toy.ped


#### Install ANNOVAR: Download sources of annotation 

In [11]:
%%bash

cd /home/jupyter/tools/annovar/

perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar clinvar_20140902 humandb/
#perl annotate_variation.pl -buildver hg38 -downdb cytoBand humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ensGene humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar exac03 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp147 humandb/ 
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp30a humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad211_genome humandb/
#perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar ljb26_all humandb/

NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGene.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneMrna.fa.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneVersion.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the 'humandb' directory
NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20140902.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_clinvar_20140902.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished d

## Copy data from GP2 bucket to workspace

In [None]:
# Check directory where GP2 Tier 2 data is
print("List available WGS information in GP2 (broken down by ancestry)")
shell_do(f'gsutil -u {BILLING_PROJECT_ID} ls {GP2_WGS_PATH}')

In [1]:
### Show list contents of GP2 Tier 2

In [None]:
shell_do(f'gsutil -u {BILLING_PROJECT_ID} ls cloud://gp2tier2/')

In [None]:
## Copy WGS genotype GP2 Tier 2 data for SLC25A46 Chromosome 5
ancestry = 'all_ancestries'  
WORK_DIR = f'SLC25A46_WGS_{ancestry}'

shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_WGS_PATH}/var_calling/plink/chr5.psam {WORK_DIR}')
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_WGS_PATH}/var_calling/plink/chr5.pvar {WORK_DIR}')
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_WGS_PATH}/var_calling/plink/chr5.pgen {WORK_DIR}')

In [None]:
## Copy clinical data using master key
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {GP2_WGS_PATH}/var_calling/WGS_meta_key.csv {WORK_DIR}')

In [None]:
## Copy WGS_GP2_PCA.eigenvec file to directory
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {WORKSPACE_BUCKET}/WGS_GP2_PCA.eigenvec {WORK_DIR}')

In [None]:
## Visualize WGS_GP2_PCA.eigenvec using '! head' function
! head {WORK_DIR}/WGS_GP2_PCA.eigenvec

In [18]:
## Check files
! ls {WORK_DIR}

chr5.pgen  chr5.psam  chr5.pvar  WGS_GP2_PCA.eigenvec  WGS_meta_key.csv


### Create a covariate file with GP2 data

In [None]:
# load the master key
key = pd.read_csv(f'{WORK_DIR}/WGS_meta_key.csv')
print(key.shape)
key.head()

In [20]:
# Subsetting to keep only a few columns 
key = key[['GP2sampleID',"GP2ID", 'phenotype', "pheno_for_qc",'sex_for_qc', 'age', 'age_of_onset', 'label']]
# Rename the columns
key.rename(columns = {'GP2sampleID':'IID',"pheno_for_qc":"PHENO",'sex_for_qc':'SEX','age':'AGE','age_of_onset':'AAO'}, inplace = True)

In [None]:
##visualize only columns labled with "PHENO" for all ancestries
key.groupby('label')['PHENO'].value_counts()

In [22]:
##SELECT THE ANCESTRY GROUP YOU WILL BE WORKING WITH
ancestry_GROUP = 'EUR'

In [None]:
## Subset to keep ancestry of interest (in this case "EUR")
covar = key[key['label']==f"{ancestry_GROUP}"].copy()
covar.reset_index(drop=True)

In [None]:
# Load eigenvec file and visualize data from file
PCA = pd.read_csv(f'{WORK_DIR}/WGS_GP2_PCA.eigenvec', sep='\t')
PCA.rename(columns = {"#FID":"FID"}, inplace = True)
PCA.head()

In [25]:
## Summarize covar Dataframe
covar.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1273 entries, 0 to 2062
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   IID        1273 non-null   object 
 1   GP2ID      1273 non-null   object 
 2   phenotype  1273 non-null   object 
 3   PHENO      1273 non-null   int64  
 4   SEX        1273 non-null   int64  
 5   AGE        721 non-null    float64
 6   AAO        687 non-null    float64
 7   label      1273 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 89.5+ KB


In [26]:
# Merge covar file from above cells with PCA file from before
covar_pca= covar.merge(PCA, on='IID', how='inner')
covar_pca.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1273 entries, 0 to 1272
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   IID        1273 non-null   object 
 1   GP2ID      1273 non-null   object 
 2   phenotype  1273 non-null   object 
 3   PHENO      1273 non-null   int64  
 4   SEX        1273 non-null   int64  
 5   AGE        721 non-null    float64
 6   AAO        687 non-null    float64
 7   label      1273 non-null   object 
 8   FID        1273 non-null   object 
 9   PC1        1273 non-null   float64
 10  PC2        1273 non-null   float64
 11  PC3        1273 non-null   float64
 12  PC4        1273 non-null   float64
 13  PC5        1273 non-null   float64
 14  PC6        1273 non-null   float64
 15  PC7        1273 non-null   float64
 16  PC8        1273 non-null   float64
 17  PC9        1273 non-null   float64
 18  PC10       1273 non-null   float64
dtypes: float64(12), int64(2), object(5)
memory usage

In [27]:
#save covar to workspace
covar.to_csv(f'{WORK_DIR}/{WORK_DIR}_COVS.txt', index=False, sep='\t')

In [None]:
## check to make sure file is correct/saved
covar_pca.head()

In [None]:
##
covar_pca_final = covar_pca[['FID','IID', 'SEX', 'AGE', 'PHENO','PC1', 'PC2', 'PC3', 'PC4', 'PC5']].copy()
covar_pca_final
covar_pca_final['PHENO'] = covar_pca_final['PHENO'].fillna(-9)
covar_pca_final['AGE'] = covar_pca_final['AGE'].fillna(-9)
covar_pca_final['SEX'] = covar_pca_final['SEX'].fillna(-9)

covar_pca_final

In [30]:
## Make your covariate file
covar_pca_final.to_csv(f'{WORK_DIR}/{ancestry_GROUP}_covariate_file.txt', sep = '\t', index=False)

## Annotate SLC25A46! Extract using plink and then annotate using annovar

In [31]:
#Basic QC before extracting
WORK_DIR = "SLC25A46_WGS_all_ancestries"
! /home/jupyter/tools/plink2 \
--pfile {WORK_DIR}/chr5  \
--geno 0.05 \
--make-bed \
--max-alleles 2 \
--out {WORK_DIR}/chr5.qc

PLINK v2.00a6LM 64-bit Intel (4 May 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/chr5.qc.log.
Options in effect:
  --geno 0.05
  --make-bed
  --max-alleles 2
  --out SLC25A46_WGS_all_ancestries/chr5.qc
  --pfile SLC25A46_WGS_all_ancestries/chr5

Start time: Tue May 21 21:06:07 2024
3676 MiB RAM detected, ~2420 available; reserving 1838 MiB for main workspace.
Using 1 compute thread.
2078 samples (760 females, 1318 males; 2078 founders) loaded from
SLC25A46_WGS_all_ancestries/chr5.psam.
5214395 out of 5425833 variants loaded from
SLC25A46_WGS_all_ancestries/chr5.pvar.
Note: No phenotype data present.
Calculating allele frequencies... 1012131415161819202122242526272830313233353637383941424344454748495051535455565759606162646566676870717273747677787980828384858788899091939495969799done.
--geno: 143788 variants removed due to missing genotype data.
5070607 variants remaining 

In [32]:
! ls

PP2A_AMP_PD.ipynb		   SLC25A46_CAS
PP2A_GP2_AAC.ipynb		   SLC25A46_EAS
PP2A_GP2_AFR.ipynb		   SLC25A46_EUR
PP2A_GP2_AJ.ipynb		   SLC25A46_GP2_Genotyping_Imputed_AAC.ipynb
PP2A_GP2_AMR.ipynb		   SLC25A46_GP2_Genotyping_Imputed_AFR.ipynb
PP2A_GP2_CAS.ipynb		   SLC25A46_GP2_Genotyping_Imputed_AJ2.0.ipynb
PP2A_GP2_EAS.ipynb		   SLC25A46_GP2_Genotyping_Imputed_AJ.ipynb
PP2A_GP2_EUR.ipynb		   SLC25A46_GP2_Genotyping_Imputed_AMR.ipynb
PP2A_GP2_MDE.ipynb		   SLC25A46_GP2_Genotyping_Imputed_CAS.ipynb
PP2A_GP2_SAS.ipynb		   SLC25A46_GP2_Genotyping_Imputed_EAS.ipynb
Schneider_test_PP2A_GP2_EUR.ipynb  SLC25A46_GP2_Genotyping_Imputed_EUR.ipynb
SLC25A46_AAC			   SLC25A46_GP2_Genotyping_Imputed_MDE.ipynb
SLC25A46_AFR			   SLC25A46_GP2_Genotyping_Imputed_SAS.ipynb
SLC25A46_AJ			   SLC25A46_GP2_WGS.ipynb
SLC25A46_AMP_PD.ipynb		   SLC25A46_MDE
SLC25A46_AMR			   SLC25A46_WGS_all_ancestries


In [None]:
! awk '{print $1, $2, $5}' SLC25A46_WGS_all_ancestries/EUR_covariate_file.txt > pheno.txt
! head pheno.txt

In [34]:
## extract region using plink

WORK_DIR = "SLC25A46_WGS_all_ancestries"
! /home/jupyter/tools/plink2 \
--bfile {WORK_DIR}/chr5.qc \
--chr 5 \
--from-bp 110738136 \
--to-bp 110765161 \
--pheno pheno.txt \
--make-bed \
--out {WORK_DIR}/SLC25A46

PLINK v2.00a6LM 64-bit Intel (4 May 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/SLC25A46.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/chr5.qc
  --chr 5
  --from-bp 110738136
  --make-bed
  --out SLC25A46_WGS_all_ancestries/SLC25A46
  --pheno pheno.txt
  --to-bp 110765161

Start time: Tue May 21 21:08:29 2024
3676 MiB RAM detected, ~2429 available; reserving 1838 MiB for main workspace.
Using 1 compute thread.
2078 samples (760 females, 1318 males; 2078 founders) loaded from
SLC25A46_WGS_all_ancestries/chr5.qc.fam.
5070607 variants loaded from SLC25A46_WGS_all_ancestries/chr5.qc.bim.
1 binary phenotype loaded (924 cases, 229 controls).
735 variants remaining after main filters.
Writing SLC25A46_WGS_all_ancestries/SLC25A46.fam ... done.
Writing SLC25A46_WGS_all_ancestries/SLC25A46.bim ... done.
Writing SLC25A46_WGS_all_ancestries/SLC25A46.bed ... done.
End

In [35]:
# Make a sample list per ancestry_group to keep only that genetic data

sample_list = covar["IID"].tolist()


with open('sample_list.txt', 'w') as file:
    for iid in sample_list:
        file.write('%s\n' % iid)

In [36]:
! cp sample_list.txt SLC25A46_WGS_all_ancestries

In [37]:
! grep -f SLC25A46_WGS_all_ancestries/sample_list.txt  SLC25A46_WGS_all_ancestries/SLC25A46.fam | cut -f1,2 > SLC25A46_WGS_all_ancestries/to_keep.txt

In [38]:
! ls SLC25A46_WGS_all_ancestries

chr5.pgen    chr5.qc.log	     SLC25A46.log
chr5.psam    EUR_covariate_file.txt  SLC25A46_WGS_all_ancestries_COVS.txt
chr5.pvar    sample_list.txt	     to_keep.txt
chr5.qc.bed  SLC25A46.bed	     WGS_GP2_PCA.eigenvec
chr5.qc.bim  SLC25A46.bim	     WGS_meta_key.csv
chr5.qc.fam  SLC25A46.fam


In [39]:
! cat SLC25A46_WGS_all_ancestries/to_keep.txt | wc -l

1273


In [40]:
##Extract only the samples per ancestry

WORK_DIR = "SLC25A46_WGS_all_ancestries"

! /home/jupyter/tools/plink2 \
--bfile {WORK_DIR}/SLC25A46 \
--keep {WORK_DIR}/to_keep.txt \
--make-bed \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46

PLINK v2.00a6LM 64-bit Intel (4 May 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/SLC25A46
  --keep SLC25A46_WGS_all_ancestries/to_keep.txt
  --make-bed
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46

Start time: Tue May 21 21:08:38 2024
3676 MiB RAM detected, ~2465 available; reserving 1838 MiB for main workspace.
Using 1 compute thread.
2078 samples (760 females, 1318 males; 2078 founders) loaded from
SLC25A46_WGS_all_ancestries/SLC25A46.fam.
735 variants loaded from SLC25A46_WGS_all_ancestries/SLC25A46.bim.
1 binary phenotype loaded (924 cases, 229 controls).
--keep: 1273 samples remaining.
1273 samples (457 females, 816 males; 1273 founders) remaining after main
filters.
924 cases and 229 controls remaining after main filters.
Writing SLC25A46_WGS_all_ancestries/EUR_SLC25A4

In [41]:
## Turn binary files into VCF
WORK_DIR = "SLC25A46_WGS_all_ancestries"
! /home/jupyter/tools/plink2 \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--recode vcf id-paste=iid \
--mac 2 \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46

PLINK v2.00a6LM 64-bit Intel (4 May 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --export vcf id-paste=iid
  --mac 2
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46

Start time: Tue May 21 21:08:39 2024
3676 MiB RAM detected, ~2465 available; reserving 1838 MiB for main workspace.
Using 1 compute thread.
1273 samples (457 females, 816 males; 1273 founders) loaded from
SLC25A46_WGS_all_ancestries/EUR_SLC25A46.fam.
735 variants loaded from SLC25A46_WGS_all_ancestries/EUR_SLC25A46.bim.
1 binary phenotype loaded (924 cases, 229 controls).
Calculating allele frequencies... done.
579 variants removed due to allele frequency threshold(s)
(--maf/--max-maf/--mac/--max-mac).
156 variants remaining after main filters.
--export vcf to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.vcf ... 10101112121

In [42]:
### Bgzip and Tabix (zip and index the file)
! bgzip -f {WORK_DIR}/{ancestry_GROUP}_SLC25A46.vcf
! tabix -f -p vcf {WORK_DIR}/{ancestry_GROUP}_SLC25A46.vcf.gz 

## Annotate using ANNOVAR

In [43]:
## annotate using ANNOVAR
! perl /home/jupyter/tools/annovar/table_annovar.pl {WORK_DIR}/{ancestry_GROUP}_SLC25A46.vcf.gz /home/jupyter/tools/annovar/humandb/ -buildver hg38 \
-out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.annovar \
-remove -protocol refGene,clinvar_20140902 \
-operation g,f \
--nopolish \
-nastring . \
-vcfinput


NOTICE: Running with system command <convert2annovar.pl  -includeinfo -allsample -withfreq -format vcf4 SLC25A46_WGS_all_ancestries/EUR_SLC25A46.vcf.gz > SLC25A46_WGS_all_ancestries/EUR_SLC25A46.annovar.avinput>
NOTICE: Finished reading 163 lines from VCF file
NOTICE: A total of 156 locus in VCF file passed QC threshold, representing 140 SNPs (95 transitions and 45 transversions) and 16 indels/substitutions
NOTICE: Finished writing allele frequencies based on 178220 SNP genotypes (120935 transitions and 57285 transversions) and 20368 indels/substitutions for 1273 samples

NOTICE: Running with system command </home/jupyter/tools/annovar/table_annovar.pl SLC25A46_WGS_all_ancestries/EUR_SLC25A46.annovar.avinput /home/jupyter/tools/annovar/humandb/ -buildver hg38 -outfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46.annovar -remove -protocol refGene,clinvar_20140902 -operation g,f --nopolish -nastring . -otherinfo>
-----------------------------------------------------------------
NOTICE: Proc

In [None]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

In [None]:
gene = gene[gene['Gene.refGene'] == 'SLC25A46']
gene

In [None]:
# Filter exonic variants
coding = gene[(gene['Func.refGene'] == 'exonic')]
coding

In [None]:
# Filter exonic and non-synonymous variants
coding_nonsynonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'nonsynonymous SNV')]
coding_nonsynonymous

In [48]:
## EXPORT THIS FOR SUMMARY SHEET

In [49]:
results = [] 

utr5 = gene[gene['Func.refGene']== 'UTR5']
intronic = gene[gene['Func.refGene']== 'intronic']
exonic = gene[gene['Func.refGene']== 'exonic']
utr3 = gene[gene['Func.refGene']== 'UTR3']
coding_nonsynonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] == 'nonsynonymous SNV')]
coding_synonymous = gene[(gene['Func.refGene'] == 'exonic') & (gene['ExonicFunc.refGene'] != 'nonsynonymous SNV')]
print({ancestry})
print('Total variants: ', len(gene))
print("Intronic: ", len(intronic))
print('UTR3: ', len(utr3))
print('UTR5: ', len(utr5))
print("Total exonic: ", len(exonic))
print('  Synonymous: ', len(coding_synonymous))
print("  Nonsynonymous: ", len(coding_nonsynonymous))
results.append((gene, intronic, utr3, utr5, exonic, coding_synonymous, coding_nonsynonymous))
print('\n')
    
#output = pd.DataFrame(results, columns=('Total variants','Intronic', 'UTR3','UTR5', 'Total exonic', "Synonymous", 'Nonsynonymous'))

{'all_ancestries'}
Total variants:  156
Intronic:  119
UTR3:  25
UTR5:  1
Total exonic:  11
  Synonymous:  6
  Nonsynonymous:  5




In [50]:
# Save in PLINK format 
variants_toKeep = coding_nonsynonymous[['Chr', 'Start', 'End', 'Gene.refGene']].copy()
variants_toKeep.to_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_coding_nonsyn.variantstoKeep.txt', sep="\t", index=False, header=False)
variants_toKeep

Unnamed: 0,Chr,Start,End,Gene.refGene
4,5,110739267,110739267,SLC25A46
5,5,110739354,110739354,SLC25A46
47,5,110746300,110746300,SLC25A46
101,5,110756712,110756712,SLC25A46
129,5,110761292,110761292,SLC25A46


In [51]:
variants_toKeep.shape

(5, 4)

In [52]:
# Save in PLINK format 
variants_toKeep2 = coding[['Chr', 'Start', 'End', 'Gene.refGene']].copy()
variants_toKeep2.to_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_coding.variantstoKeep', sep="\t", index=False, header=False)
variants_toKeep2

Unnamed: 0,Chr,Start,End,Gene.refGene
3,5,110739266,110739266,SLC25A46
4,5,110739267,110739267,SLC25A46
5,5,110739354,110739354,SLC25A46
29,5,110743749,110743749,SLC25A46
30,5,110743781,110743781,SLC25A46
47,5,110746300,110746300,SLC25A46
48,5,110746313,110746313,SLC25A46
101,5,110756712,110756712,SLC25A46
128,5,110761239,110761239,SLC25A46
129,5,110761292,110761292,SLC25A46


In [53]:
variants_toKeep2.shape

(11, 4)

In [54]:
## check to make sure file was created and saved
! ls {WORK_DIR}

chr5.pgen
chr5.psam
chr5.pvar
chr5.qc.bed
chr5.qc.bim
chr5.qc.fam
chr5.qc.log
EUR_covariate_file.txt
EUR_SLC25A46.all_coding_nonsyn.variantstoKeep.txt
EUR_SLC25A46.all_coding.variantstoKeep
EUR_SLC25A46.annovar.avinput
EUR_SLC25A46.annovar.hg38_multianno.txt
EUR_SLC25A46.annovar.hg38_multianno.vcf
EUR_SLC25A46.bed
EUR_SLC25A46.bim
EUR_SLC25A46.fam
EUR_SLC25A46.log
EUR_SLC25A46.vcf.gz
EUR_SLC25A46.vcf.gz.tbi
sample_list.txt
SLC25A46.bed
SLC25A46.bim
SLC25A46.fam
SLC25A46.log
SLC25A46_WGS_all_ancestries_COVS.txt
to_keep.txt
WGS_GP2_PCA.eigenvec
WGS_meta_key.csv


In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_coding_nonsyn.variantstoKeep.txt {WORKSPACE_BUCKET}/UHRF1BP1L_WGS_all_ancestries/{ancestry_GROUP}_UHRF1BP1L.all_coding_nonsyn.variantstoKeep.txt ')

## Case/Control Analysis

### Assoc

In [56]:
WORK_DIR = "SLC25A46_WGS_all_ancestries"

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--assoc \
--allow-no-sex \
--ci 0.95 \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --ci 0.95
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all

3676 MB RAM detected; reserving 1838 MB for main workspace.
735 variants loaded from .bim file.
1273 people (816 males, 457 females) loaded from .fam.
1153 phenotype values loaded from .fam.
Using 1 thread.
Before main variant filters, 1273 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.999594.
735 variants and 1273 people pass filters and QC.
Among remaining phenotypes, 924 are 

In [57]:
freq = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.all.assoc', delim_whitespace=True)
sig_all_nonadj = freq[freq['P']<0.05]
sig_all_nonadj

Unnamed: 0,CHR,SNP,BP,A1,F_A,F_U,A2,CHISQ,P,OR,SE,L95,U95
5,5,chr5_110738462_C_T,110738462,T,0.0,0.002183,C,4.037,0.04452,0.0,inf,0.0,
54,5,chr5_110739857_A_T,110739857,T,0.0,0.004367,A,8.077,0.004483,0.0,inf,0.0,
64,5,chr5_110740250_A_ATT,110740250,ATT,0.0,0.002183,A,4.037,0.04452,0.0,inf,0.0,
97,5,chr5_110741050_T_G,110741050,G,0.0,0.002183,T,4.037,0.04452,0.0,inf,0.0,
105,5,chr5_110741329_C_T,110741329,T,0.001623,0.008734,C,6.131,0.01328,0.1846,0.7656,0.04116,0.8275
126,5,chr5_110742104_A_G,110742104,G,0.0,0.002183,A,4.037,0.04452,0.0,inf,0.0,
153,5,chr5_110742876_C_A,110742876,A,0.0,0.002183,C,4.032,0.04464,0.0,inf,0.0,
160,5,chr5_110743095_C_T,110743095,T,0.0,0.002294,C,4.061,0.04387,0.0,inf,0.0,
210,5,chr5_110745264_G_C,110745264,C,0.003254,0.01092,G,4.53,0.0333,0.2958,0.6078,0.08986,0.9734
216,5,chr5_110745378_C_T,110745378,T,0.0,0.002183,C,4.037,0.04452,0.0,inf,0.0,


In [58]:
#Get the IDs to extract
sig_all_nonadj_id = sig_all_nonadj[['SNP']]
sig_all_nonadj_id
sig_all_nonadj_id.to_csv(f'{WORK_DIR}/{ancestry_GROUP}.sig_all_nonadj_id.txt', sep = '\t', index=False, header=None)

In [59]:
## check to make sure file was created and saved
! ls {WORK_DIR}

chr5.pgen
chr5.psam
chr5.pvar
chr5.qc.bed
chr5.qc.bim
chr5.qc.fam
chr5.qc.log
EUR_covariate_file.txt
EUR.sig_all_nonadj_id.txt
EUR_SLC25A46.all.assoc
EUR_SLC25A46.all_coding_nonsyn.variantstoKeep.txt
EUR_SLC25A46.all_coding.variantstoKeep
EUR_SLC25A46.all.log
EUR_SLC25A46.annovar.avinput
EUR_SLC25A46.annovar.hg38_multianno.txt
EUR_SLC25A46.annovar.hg38_multianno.vcf
EUR_SLC25A46.bed
EUR_SLC25A46.bim
EUR_SLC25A46.fam
EUR_SLC25A46.log
EUR_SLC25A46.vcf.gz
EUR_SLC25A46.vcf.gz.tbi
sample_list.txt
SLC25A46.bed
SLC25A46.bim
SLC25A46.fam
SLC25A46.log
SLC25A46_WGS_all_ancestries_COVS.txt
to_keep.txt
WGS_GP2_PCA.eigenvec
WGS_meta_key.csv


In [60]:
#--recode A creates a new text fileset, showing each variant in each case and control for the minor allele (A).
# Also extract the significant variants 
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--extract {WORK_DIR}/{ancestry_GROUP}.sig_all_nonadj_id.txt \
--recode A \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all.nonadj

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all.nonadj.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --extract SLC25A46_WGS_all_ancestries/EUR.sig_all_nonadj_id.txt
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all.nonadj
  --recode A

3676 MB RAM detected; reserving 1838 MB for main workspace.
735 variants loaded from .bim file.
1273 people (816 males, 457 females) loaded from .fam.
1153 phenotype values loaded from .fam.
--extract: 39 variants remaining.
Using 1 thread.
Before main variant filters, 1273 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%3

In [None]:
recode = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.all.nonadj.raw', delim_whitespace=True)
recode.head()

In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry_GROUP}_all-sign.variants-nonadj.txt {WORKSPACE_BUCKET}/SLC25A46_{ancestry}/{ancestry_GROUP}_all-sign.variants-nonadj.txt')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/SLC25A46_{ancestry_GROUP}/')

In [64]:
#Run with covariates
WORK_DIR = "SLC25A46_WGS_all_ancestries"

! /home/jupyter/tools/plink2 \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--glm \
--covar {WORK_DIR}/{ancestry_GROUP}_covariate_file.txt \
--covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5 \
--covar-variance-standardize \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_adj

PLINK v2.00a6LM 64-bit Intel (4 May 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all_adj.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --covar SLC25A46_WGS_all_ancestries/EUR_covariate_file.txt
  --covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5
  --covar-variance-standardize
  --glm
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all_adj

Start time: Tue May 21 21:09:17 2024
3676 MiB RAM detected, ~2414 available; reserving 1838 MiB for main workspace.
Using 1 compute thread.
1273 samples (457 females, 816 males; 1273 founders) loaded from
SLC25A46_WGS_all_ancestries/EUR_SLC25A46.fam.
735 variants loaded from SLC25A46_WGS_all_ancestries/EUR_SLC25A46.bim.
1 binary phenotype loaded (924 cases, 229 controls).
7 covariates loaded from SLC25A46_WGS_all_ancestries/EUR_covariate_file.txt.
--covar-variance-standardize: 7 covariates transformed

In [65]:
assoc = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_adj.PHENO1.glm.logistic.hybrid', delim_whitespace=True)
assoc_add = assoc[assoc['TEST']=="ADD"]
significant = assoc_add[assoc_add['P']<0.05]
significant

Unnamed: 0,#CHROM,POS,ID,REF,ALT,PROVISIONAL_REF?,A1,OMITTED,A1_FREQ,FIRTH?,TEST,OBS_CT,OR,LOG(OR)_SE,Z_STAT,P,ERRCODE
64,5,110738636,chr5_110738636_G_A,G,A,Y,A,G,0.000693,Y,ADD,721,0.003816,1.90978,-2.91581,0.003548,.
72,5,110738642,chr5_110738642_G_A,G,A,Y,A,G,0.000693,Y,ADD,721,0.008948,1.83130,-2.57540,0.010012,.
80,5,110738661,chr5_110738661_G_A,G,A,Y,A,G,0.000693,Y,ADD,721,0.000322,2.33836,-3.43813,0.000586,.
152,5,110739062,chr5_110739062_T_C,T,C,Y,C,T,0.000693,Y,ADD,721,0.000135,2.39997,-3.71251,0.000205,.
168,5,110739083,chr5_110739083_G_A,G,A,Y,A,G,0.000693,Y,ADD,721,0.004173,1.95773,-2.79867,0.005131,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5648,5,110764161,chr5_110764161_T_A,T,A,Y,A,T,0.000693,Y,ADD,721,0.001753,2.06097,-3.07923,0.002075,.
5704,5,110764313,chr5_110764313_T_C,T,C,Y,C,T,0.002080,Y,ADD,721,0.020198,1.72390,-2.26358,0.023600,.
5760,5,110764649,chr5_110764649_T_C,T,C,Y,C,T,0.000693,Y,ADD,721,0.000394,2.23674,-3.50510,0.000456,.
5784,5,110764736,chr5_110764736_T_A,T,A,Y,A,T,0.000693,Y,ADD,721,0.003593,1.96836,-2.85964,0.004241,.


In [66]:
#Get the IDs to extract
sig_all_adj_id = significant[['ID']]
sig_all_adj_id
sig_all_adj_id.to_csv(f'{WORK_DIR}/{ancestry_GROUP}.sig_all_adj_id.txt', sep = '\t', index=False, header=None)

In [67]:
#--recode A creates a new text fileset, showing each variant in each case and control for the minor allele (A).
# Also extract the significant variants 
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--extract {WORK_DIR}/{ancestry_GROUP}.sig_all_adj_id.txt \
--recode A \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all.adj

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all.adj.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --extract SLC25A46_WGS_all_ancestries/EUR.sig_all_adj_id.txt
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all.adj
  --recode A

3676 MB RAM detected; reserving 1838 MB for main workspace.
735 variants loaded from .bim file.
1273 people (816 males, 457 females) loaded from .fam.
1153 phenotype values loaded from .fam.
--extract: 138 variants remaining.
Using 1 thread.
Before main variant filters, 1273 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%

In [None]:
recode = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.all.adj.raw', delim_whitespace=True)
recode

In [69]:
# Make a list from the column names
column_names = recode.columns.tolist()

# Drop the first 6 columns to keep the variants 
variants = column_names[6:]

print(f'Number of variants in {ancestry} for SLC25A46: {len(variants)}')
variants

Number of variants in all_ancestries for SLC25A46: 138


['chr5_110738636_G_A_A',
 'chr5_110738642_G_A_A',
 'chr5_110738661_G_A_A',
 'chr5_110739062_T_C_C',
 'chr5_110739083_G_A_A',
 'chr5_110739114_G_A_A',
 'chr5_110739115_C_A_A',
 'chr5_110739263_T_C_C',
 'chr5_110739296_G_A_A',
 'chr5_110739369_G_A_A',
 'chr5_110739370_G_A_A',
 'chr5_110739397_G_A_A',
 'chr5_110739428_G_A_A',
 'chr5_110739758_A_C_C',
 'chr5_110740023_T_G_G',
 'chr5_110740200_G_A_A',
 'chr5_110740246_A_T_T',
 'chr5_110740263_A_G_G',
 'chr5_110740378_G_A_A',
 'chr5_110740705_A_C_C',
 'chr5_110740722_C_T_T',
 'chr5_110740755_T_C_C',
 'chr5_110740930_C_T_T',
 'chr5_110741279_C_T_T',
 'chr5_110741298_A_G_G',
 'chr5_110741329_C_T_T',
 'chr5_110741773_TG_T_T',
 'chr5_110742329_A_G_G',
 'chr5_110742508_G_A_A',
 'chr5_110742613_A_G_G',
 'chr5_110742802_A_G_G',
 'chr5_110743179_CAAAATAAAT_C_C',
 'chr5_110744486_C_A_A',
 'chr5_110744514_G_A_A',
 'chr5_110744646_T_C_C',
 'chr5_110745009_G_A_A',
 'chr5_110745044_G_A_A',
 'chr5_110745254_T_G_G',
 'chr5_110745394_C_T_T',
 'chr5_11074549

In [70]:
# Pre-filter the dataset
cases_data = recode[recode['PHENOTYPE'] == 2]
controls_data = recode[recode['PHENOTYPE'] == 1]

results = []

for variant in variants:
    # For cases
    hom_cases = cases_data[cases_data[variant] == 2].shape[0]
    het_cases = cases_data[cases_data[variant] == 1].shape[0]
    total_cases = cases_data.shape[0]
    freq_cases = (hom_cases + het_cases) / total_cases

    # For controls
    hom_controls = controls_data[controls_data[variant] == 2].shape[0]
    het_controls = controls_data[controls_data[variant] == 1].shape[0]
    total_controls = controls_data.shape[0]
    freq_controls = (hom_controls + het_controls) / total_controls

    results.append({
        'Variant': variant,
        'Hom Cases': hom_cases,
        'Het Cases': het_cases,
        'Total Cases': total_cases,
        'Carrier freq in Cases': freq_cases,
        'Hom Controls': hom_controls,
        'Het Controls': het_controls,
        'Total Controls': total_controls,
        'Carrier freq in Controls': freq_controls
    })

# Return
df_results = pd.DataFrame(results)
df_results['ID'] = df_results['Variant'].apply(lambda x: x.rsplit('_', 1)[0])

df_results

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Carrier freq in Cases,Hom Controls,Het Controls,Total Controls,Carrier freq in Controls,ID
0,chr5_110738636_G_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110738636_G_A
1,chr5_110738642_G_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110738642_G_A
2,chr5_110738661_G_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110738661_G_A
3,chr5_110739062_T_C_C,0,1,924,0.001082,0,0,229,0.0,chr5_110739062_T_C
4,chr5_110739083_G_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110739083_G_A
...,...,...,...,...,...,...,...,...,...,...
133,chr5_110764161_T_A_A,0,2,924,0.002165,0,0,229,0.0,chr5_110764161_T_A
134,chr5_110764313_T_C_C,0,3,924,0.003247,0,0,229,0.0,chr5_110764313_T_C
135,chr5_110764649_T_C_C,0,1,924,0.001082,0,0,229,0.0,chr5_110764649_T_C
136,chr5_110764736_T_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110764736_T_A


In [71]:
##EXPORT THIS TO THE ALL-glm [p<0.05] sheet

In [72]:
#Merge with the glm file
sig_merge = significant[['ID','A1','A1_FREQ','OBS_CT','OR','LOG(OR)_SE','Z_STAT','P']]
merged = pd.merge(df_results, sig_merge, on='ID', how='right')
merged

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Carrier freq in Cases,Hom Controls,Het Controls,Total Controls,Carrier freq in Controls,ID,A1,A1_FREQ,OBS_CT,OR,LOG(OR)_SE,Z_STAT,P
0,chr5_110738636_G_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110738636_G_A,A,0.000693,721,0.003816,1.90978,-2.91581,0.003548
1,chr5_110738642_G_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110738642_G_A,A,0.000693,721,0.008948,1.83130,-2.57540,0.010012
2,chr5_110738661_G_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110738661_G_A,A,0.000693,721,0.000322,2.33836,-3.43813,0.000586
3,chr5_110739062_T_C_C,0,1,924,0.001082,0,0,229,0.0,chr5_110739062_T_C,C,0.000693,721,0.000135,2.39997,-3.71251,0.000205
4,chr5_110739083_G_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110739083_G_A,A,0.000693,721,0.004173,1.95773,-2.79867,0.005131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,chr5_110764161_T_A_A,0,2,924,0.002165,0,0,229,0.0,chr5_110764161_T_A,A,0.000693,721,0.001753,2.06097,-3.07923,0.002075
134,chr5_110764313_T_C_C,0,3,924,0.003247,0,0,229,0.0,chr5_110764313_T_C,C,0.002080,721,0.020198,1.72390,-2.26358,0.023600
135,chr5_110764649_T_C_C,0,1,924,0.001082,0,0,229,0.0,chr5_110764649_T_C,C,0.000693,721,0.000394,2.23674,-3.50510,0.000456
136,chr5_110764736_T_A_A,0,1,924,0.001082,0,0,229,0.0,chr5_110764736_T_A,A,0.000693,721,0.003593,1.96836,-2.85964,0.004241


In [73]:
## to CSV
merged.to_csv(f'{WORK_DIR}/{ancestry_GROUP}_all-sign.variants-adj.txt', sep = '\t', index=False)

In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry_GROUP}_all-sign.variants-adj.txt {WORKSPACE_BUCKET}/SLC25A46_{ancestry}/{ancestry_GROUP}_all-sign.variants-adj.txt')

CODING

In [75]:
### Assoc

In [76]:
WORK_DIR = "SLC25A46_WGS_all_ancestries"

! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--extract range {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_coding.variantstoKeep \
--assoc \
--allow-no-sex \
--ci 0.95 \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.coding

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.coding.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --ci 0.95
  --extract range SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all_coding.variantstoKeep
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46.coding

3676 MB RAM detected; reserving 1838 MB for main workspace.
735 variants loaded from .bim file.
1273 people (816 males, 457 females) loaded from .fam.
1153 phenotype values loaded from .fam.
--extract range: 724 variants excluded.
--extract range: 11 variants remaining.
Using 1 thread.
Before main variant filters, 1273 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%

In [77]:
freq = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.coding.assoc', delim_whitespace=True)
#freq[freq['P']<0.05]
freq

Unnamed: 0,CHR,SNP,BP,A1,F_A,F_U,A2,CHISQ,P,OR,SE,L95,U95
0,5,chr5_110739266_C_T,110739266,T,0.01569,0.01965,C,0.3548,0.5514,0.7954,0.3852,0.3738,1.692
1,5,chr5_110739267_C_A,110739267,A,0.000541,0.002183,C,1.142,0.2852,0.2474,1.415,0.01545,3.963
2,5,chr5_110739354_G_A,110739354,A,0.003788,0.0,G,1.74,0.1871,,,,
3,5,chr5_110743749_T_C,110743749,C,0.08712,0.1092,T,2.147,0.1429,0.7788,0.171,0.5569,1.089
4,5,chr5_110743781_A_G,110743781,G,0.004329,0.01092,A,2.842,0.09184,0.3939,0.5725,0.1283,1.21
5,5,chr5_110746300_C_A,110746300,A,0.001623,0.0,C,0.7445,0.3882,,,,
6,5,chr5_110746313_C_T,110746313,T,0.008117,0.00655,C,0.1163,0.7331,1.241,0.6346,0.3578,4.305
7,5,chr5_110756712_G_A,110756712,A,0.00487,0.002183,G,0.6136,0.4334,2.237,1.055,0.2826,17.7
8,5,chr5_110761239_G_A,110761239,A,0.06764,0.07205,G,0.1119,0.738,0.9343,0.2031,0.6276,1.391
9,5,chr5_110761292_A_G,110761292,G,0.002706,0.0,A,1.242,0.2651,,,,


In [78]:
#--recode A creates a new text fileset, showing each variant in each case and control for the minor allele (A).
# Also extract the significant variants 
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--extract range {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_coding.variantstoKeep \
--recode A \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.coding.nonadj

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.coding.nonadj.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --extract range SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all_coding.variantstoKeep
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46.coding.nonadj
  --recode A

3676 MB RAM detected; reserving 1838 MB for main workspace.
735 variants loaded from .bim file.
1273 people (816 males, 457 females) loaded from .fam.
1153 phenotype values loaded from .fam.
--extract range: 724 variants excluded.
--extract range: 11 variants remaining.
Using 1 thread.
Before main variant filters, 1273 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%2

In [None]:
recode = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.coding.nonadj.raw', delim_whitespace=True)
recode

In [80]:
# Make a list from the column names
column_names = recode.columns.tolist()

# Drop the first 6 columns to keep the variants 
variants = column_names[6:]

print(f'Number of variants in {ancestry} for SLC25A46: {len(variants)}')
variants

Number of variants in all_ancestries for SLC25A46: 11


['chr5_110739266_C_T_T',
 'chr5_110739267_C_A_A',
 'chr5_110739354_G_A_A',
 'chr5_110743749_T_C_C',
 'chr5_110743781_A_G_G',
 'chr5_110746300_C_A_A',
 'chr5_110746313_C_T_T',
 'chr5_110756712_G_A_A',
 'chr5_110761239_G_A_A',
 'chr5_110761292_A_G_G',
 'chr5_110761460_C_CTTA_CTTA']

In [81]:
# Pre-filter the dataset
cases_data = recode[recode['PHENOTYPE'] == 2]
controls_data = recode[recode['PHENOTYPE'] == 1]

results = []

for variant in variants:
    # For cases
    hom_cases = cases_data[cases_data[variant] == 2].shape[0]
    het_cases = cases_data[cases_data[variant] == 1].shape[0]
    total_cases = cases_data.shape[0]
    freq_cases = (hom_cases + het_cases) / total_cases

    # For controls
    hom_controls = controls_data[controls_data[variant] == 2].shape[0]
    het_controls = controls_data[controls_data[variant] == 1].shape[0]
    total_controls = controls_data.shape[0]
    freq_controls = (hom_controls + het_controls) / total_controls

    results.append({
        'Variant': variant,
        'Hom Cases': hom_cases,
        'Het Cases': het_cases,
        'Total Cases': total_cases,
        'Carrier freq in Cases': freq_cases,
        'Hom Controls': hom_controls,
        'Het Controls': het_controls,
        'Total Controls': total_controls,
        'Carrier freq in Controls': freq_controls
    })

# Return
df_results = pd.DataFrame(results)
df_results['SNP'] = df_results['Variant'].apply(lambda x: x.rsplit('_', 1)[0])

df_results

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Carrier freq in Cases,Hom Controls,Het Controls,Total Controls,Carrier freq in Controls,SNP
0,chr5_110739266_C_T_T,1,27,924,0.030303,0,9,229,0.039301,chr5_110739266_C_T
1,chr5_110739267_C_A_A,0,1,924,0.001082,0,1,229,0.004367,chr5_110739267_C_A
2,chr5_110739354_G_A_A,0,7,924,0.007576,0,0,229,0.0,chr5_110739354_G_A
3,chr5_110743749_T_C_C,10,141,924,0.16342,4,42,229,0.200873,chr5_110743749_T_C
4,chr5_110743781_A_G_G,0,8,924,0.008658,0,5,229,0.021834,chr5_110743781_A_G
5,chr5_110746300_C_A_A,0,3,924,0.003247,0,0,229,0.0,chr5_110746300_C_A
6,chr5_110746313_C_T_T,1,13,924,0.015152,0,3,229,0.0131,chr5_110746313_C_T
7,chr5_110756712_G_A_A,0,9,924,0.00974,0,1,229,0.004367,chr5_110756712_G_A
8,chr5_110761239_G_A_A,6,113,924,0.128788,0,33,229,0.144105,chr5_110761239_G_A
9,chr5_110761292_A_G_G,0,5,924,0.005411,0,0,229,0.0,chr5_110761292_A_G


In [82]:
#Merge with the assoc file
sig_merge = freq[['SNP','A1','F_A','F_U','A2','L95','OR','U95','P']]
merged = pd.merge(df_results, sig_merge, on='SNP', how='right')
merged

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Carrier freq in Cases,Hom Controls,Het Controls,Total Controls,Carrier freq in Controls,SNP,A1,F_A,F_U,A2,L95,OR,U95,P
0,chr5_110739266_C_T_T,1,27,924,0.030303,0,9,229,0.039301,chr5_110739266_C_T,T,0.01569,0.01965,C,0.3738,0.7954,1.692,0.5514
1,chr5_110739267_C_A_A,0,1,924,0.001082,0,1,229,0.004367,chr5_110739267_C_A,A,0.000541,0.002183,C,0.01545,0.2474,3.963,0.2852
2,chr5_110739354_G_A_A,0,7,924,0.007576,0,0,229,0.0,chr5_110739354_G_A,A,0.003788,0.0,G,,,,0.1871
3,chr5_110743749_T_C_C,10,141,924,0.16342,4,42,229,0.200873,chr5_110743749_T_C,C,0.08712,0.1092,T,0.5569,0.7788,1.089,0.1429
4,chr5_110743781_A_G_G,0,8,924,0.008658,0,5,229,0.021834,chr5_110743781_A_G,G,0.004329,0.01092,A,0.1283,0.3939,1.21,0.09184
5,chr5_110746300_C_A_A,0,3,924,0.003247,0,0,229,0.0,chr5_110746300_C_A,A,0.001623,0.0,C,,,,0.3882
6,chr5_110746313_C_T_T,1,13,924,0.015152,0,3,229,0.0131,chr5_110746313_C_T,T,0.008117,0.00655,C,0.3578,1.241,4.305,0.7331
7,chr5_110756712_G_A_A,0,9,924,0.00974,0,1,229,0.004367,chr5_110756712_G_A,A,0.00487,0.002183,G,0.2826,2.237,17.7,0.4334
8,chr5_110761239_G_A_A,6,113,924,0.128788,0,33,229,0.144105,chr5_110761239_G_A,A,0.06764,0.07205,G,0.6276,0.9343,1.391,0.738
9,chr5_110761292_A_G_G,0,5,924,0.005411,0,0,229,0.0,chr5_110761292_A_G,G,0.002706,0.0,A,,,,0.2651


In [83]:
## to CSV
merged.to_csv(f'{WORK_DIR}/{ancestry_GROUP}_coding.variants-nonadj.txt', sep = '\t', index=False)

In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry_GROUP}_coding.variants-nonadj.txt {WORKSPACE_BUCKET}/SLC25A46_{ancestry}/{ancestry_GROUP}_coding.variants-nonadj.txt')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/SLC25A46_{ancestry}/')

In [86]:
#No significant SNPs with/without AGE as covariate
WORK_DIR = "SLC25A46_WGS_all_ancestries"

! /home/jupyter/tools/plink2 \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--extract range {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_coding.variantstoKeep \
--glm \
--covar {WORK_DIR}/{ancestry_GROUP}_covariate_file.txt \
--covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5 \
--covar-variance-standardize \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.coding

PLINK v2.00a6LM 64-bit Intel (4 May 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.coding.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --covar SLC25A46_WGS_all_ancestries/EUR_covariate_file.txt
  --covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5
  --covar-variance-standardize
  --extract range SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all_coding.variantstoKeep
  --glm
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46.coding

Start time: Tue May 21 21:09:33 2024
3676 MiB RAM detected, ~2409 available; reserving 1838 MiB for main workspace.
Using 1 compute thread.
1273 samples (457 females, 816 males; 1273 founders) loaded from
SLC25A46_WGS_all_ancestries/EUR_SLC25A46.fam.
735 variants loaded from SLC25A46_WGS_all_ancestries/EUR_SLC25A46.bim.
1 binary phenotype loaded (924 cases, 229 controls).
--extract bed1: 724 variants 

In [87]:
assoc = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.coding.PHENO1.glm.logistic.hybrid', delim_whitespace=True)
assoc_add = assoc[assoc['TEST']=="ADD"]
#assoc_add[assoc_add['P']<0.05]
assoc_add

Unnamed: 0,#CHROM,POS,ID,REF,ALT,PROVISIONAL_REF?,A1,OMITTED,A1_FREQ,FIRTH?,TEST,OBS_CT,OR,LOG(OR)_SE,Z_STAT,P,ERRCODE
0,5,110739266,chr5_110739266_C_T,C,T,Y,T,C,0.018031,Y,ADD,721,0.348172,1.46798,-0.718714,0.472317,.
8,5,110739267,chr5_110739267_C_A,C,A,Y,A,C,0.000693,Y,ADD,721,0.188917,1.7519,-0.951222,0.341492,.
16,5,110739354,chr5_110739354_G_A,G,A,Y,A,G,0.003467,Y,ADD,721,0.101537,1.56963,-1.45724,0.145049,.
24,5,110743749,chr5_110743749_T_C,T,C,Y,C,T,0.07975,Y,ADD,721,1.36939,1.27448,0.246659,0.805172,.
32,5,110743781,chr5_110743781_A_G,A,G,Y,G,A,0.004854,Y,ADD,721,0.140811,1.57818,-1.24215,0.214181,.
40,5,110746300,chr5_110746300_C_A,C,A,Y,A,C,0.00208,Y,ADD,721,0.007712,1.75738,-2.7683,0.005635,.
48,5,110746313,chr5_110746313_C_T,C,T,Y,T,C,0.009015,Y,ADD,721,0.292589,1.36368,-0.901225,0.367469,.
56,5,110756712,chr5_110756712_G_A,G,A,Y,A,G,0.006241,Y,ADD,721,0.087832,1.5408,-1.57862,0.114424,.
64,5,110761239,chr5_110761239_G_A,G,A,Y,A,G,0.067961,Y,ADD,721,0.920414,1.18426,-0.070028,0.944171,.
72,5,110761292,chr5_110761292_A_G,A,G,Y,G,A,0.000693,Y,ADD,721,0.00075,2.11995,-3.39435,0.000688,.


In [88]:
#--recode A creates a new text fileset, showing each variant in each case and control for the minor allele (A). 
! /home/jupyter/tools/plink \
--bfile {WORK_DIR}/{ancestry_GROUP}_SLC25A46 \
--extract range {WORK_DIR}/{ancestry_GROUP}_SLC25A46.all_coding.variantstoKeep \
--recode A \
--out {WORK_DIR}/{ancestry_GROUP}_SLC25A46.coding

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SLC25A46_WGS_all_ancestries/EUR_SLC25A46.coding.log.
Options in effect:
  --bfile SLC25A46_WGS_all_ancestries/EUR_SLC25A46
  --extract range SLC25A46_WGS_all_ancestries/EUR_SLC25A46.all_coding.variantstoKeep
  --out SLC25A46_WGS_all_ancestries/EUR_SLC25A46.coding
  --recode A

3676 MB RAM detected; reserving 1838 MB for main workspace.
735 variants loaded from .bim file.
1273 people (816 males, 457 females) loaded from .fam.
1153 phenotype values loaded from .fam.
--extract range: 724 variants excluded.
--extract range: 11 variants remaining.
Using 1 thread.
Before main variant filters, 1273 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%

In [None]:
recode = pd.read_csv(f'{WORK_DIR}/{ancestry_GROUP}_SLC25A46.coding.raw', delim_whitespace=True)
recode

In [90]:
# Make a list from the column names
column_names = recode.columns.tolist()

# Drop the first 6 columns to keep the variants 
variants = column_names[6:]

print(f'Number of variants in {ancestry} for SLC25A46: {len(variants)}')
variants

Number of variants in all_ancestries for SLC25A46: 11


['chr5_110739266_C_T_T',
 'chr5_110739267_C_A_A',
 'chr5_110739354_G_A_A',
 'chr5_110743749_T_C_C',
 'chr5_110743781_A_G_G',
 'chr5_110746300_C_A_A',
 'chr5_110746313_C_T_T',
 'chr5_110756712_G_A_A',
 'chr5_110761239_G_A_A',
 'chr5_110761292_A_G_G',
 'chr5_110761460_C_CTTA_CTTA']

In [91]:
# Pre-filter the dataset
cases_data = recode[recode['PHENOTYPE'] == 2]
controls_data = recode[recode['PHENOTYPE'] == 1]

results = []

for variant in variants:
    # For cases
    hom_cases = cases_data[cases_data[variant] == 2].shape[0]
    het_cases = cases_data[cases_data[variant] == 1].shape[0]
    total_cases = cases_data.shape[0]
    freq_cases = (hom_cases + het_cases) / total_cases

    # For controls
    hom_controls = controls_data[controls_data[variant] == 2].shape[0]
    het_controls = controls_data[controls_data[variant] == 1].shape[0]
    total_controls = controls_data.shape[0]
    freq_controls = (hom_controls + het_controls) / total_controls

    results.append({
        'Variant': variant,
        'Hom Cases': hom_cases,
        'Het Cases': het_cases,
        'Total Cases': total_cases,
        'Carrier freq in Cases': freq_cases,
        'Hom Controls': hom_controls,
        'Het Controls': het_controls,
        'Total Controls': total_controls,
        'Carrier freq in Controls': freq_controls
    })

# Return
df_results = pd.DataFrame(results)
df_results['ID'] = df_results['Variant'].apply(lambda x: x.rsplit('_', 1)[0])

df_results

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Carrier freq in Cases,Hom Controls,Het Controls,Total Controls,Carrier freq in Controls,ID
0,chr5_110739266_C_T_T,1,27,924,0.030303,0,9,229,0.039301,chr5_110739266_C_T
1,chr5_110739267_C_A_A,0,1,924,0.001082,0,1,229,0.004367,chr5_110739267_C_A
2,chr5_110739354_G_A_A,0,7,924,0.007576,0,0,229,0.0,chr5_110739354_G_A
3,chr5_110743749_T_C_C,10,141,924,0.16342,4,42,229,0.200873,chr5_110743749_T_C
4,chr5_110743781_A_G_G,0,8,924,0.008658,0,5,229,0.021834,chr5_110743781_A_G
5,chr5_110746300_C_A_A,0,3,924,0.003247,0,0,229,0.0,chr5_110746300_C_A
6,chr5_110746313_C_T_T,1,13,924,0.015152,0,3,229,0.0131,chr5_110746313_C_T
7,chr5_110756712_G_A_A,0,9,924,0.00974,0,1,229,0.004367,chr5_110756712_G_A
8,chr5_110761239_G_A_A,6,113,924,0.128788,0,33,229,0.144105,chr5_110761239_G_A
9,chr5_110761292_A_G_G,0,5,924,0.005411,0,0,229,0.0,chr5_110761292_A_G


In [92]:
#Merge with the glm file
sig_merge = assoc_add[['ID','A1','A1_FREQ','OBS_CT','OR','LOG(OR)_SE','Z_STAT','P']]
merged = pd.merge(df_results, sig_merge, on='ID', how='right')
merged

Unnamed: 0,Variant,Hom Cases,Het Cases,Total Cases,Carrier freq in Cases,Hom Controls,Het Controls,Total Controls,Carrier freq in Controls,ID,A1,A1_FREQ,OBS_CT,OR,LOG(OR)_SE,Z_STAT,P
0,chr5_110739266_C_T_T,1,27,924,0.030303,0,9,229,0.039301,chr5_110739266_C_T,T,0.018031,721,0.348172,1.46798,-0.718714,0.472317
1,chr5_110739267_C_A_A,0,1,924,0.001082,0,1,229,0.004367,chr5_110739267_C_A,A,0.000693,721,0.188917,1.7519,-0.951222,0.341492
2,chr5_110739354_G_A_A,0,7,924,0.007576,0,0,229,0.0,chr5_110739354_G_A,A,0.003467,721,0.101537,1.56963,-1.45724,0.145049
3,chr5_110743749_T_C_C,10,141,924,0.16342,4,42,229,0.200873,chr5_110743749_T_C,C,0.07975,721,1.36939,1.27448,0.246659,0.805172
4,chr5_110743781_A_G_G,0,8,924,0.008658,0,5,229,0.021834,chr5_110743781_A_G,G,0.004854,721,0.140811,1.57818,-1.24215,0.214181
5,chr5_110746300_C_A_A,0,3,924,0.003247,0,0,229,0.0,chr5_110746300_C_A,A,0.00208,721,0.007712,1.75738,-2.7683,0.005635
6,chr5_110746313_C_T_T,1,13,924,0.015152,0,3,229,0.0131,chr5_110746313_C_T,T,0.009015,721,0.292589,1.36368,-0.901225,0.367469
7,chr5_110756712_G_A_A,0,9,924,0.00974,0,1,229,0.004367,chr5_110756712_G_A,A,0.006241,721,0.087832,1.5408,-1.57862,0.114424
8,chr5_110761239_G_A_A,6,113,924,0.128788,0,33,229,0.144105,chr5_110761239_G_A,A,0.067961,721,0.920414,1.18426,-0.070028,0.944171
9,chr5_110761292_A_G_G,0,5,924,0.005411,0,0,229,0.0,chr5_110761292_A_G,G,0.000693,721,0.00075,2.11995,-3.39435,0.000688


In [93]:
## to CSV
merged.to_csv(f'{WORK_DIR}/{ancestry_GROUP}_coding-variants-adj.txt', sep = '\t', index=False)

In [None]:
# Save to workspace bucket (move from VM to workspace bucket)
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} cp -r {WORK_DIR}/{ancestry_GROUP}_coding-variants-adj.txt {WORKSPACE_BUCKET}/SLC25A46_{ancestry}/{ancestry_GROUP}_coding-variants-adj.txt')

In [None]:
## Check workspace bucket
shell_do(f'gsutil -mu {BILLING_PROJECT_ID} ls {WORKSPACE_BUCKET}/SLC25A46_{ancestry_GROUP}/')