CTSB - Single gene analysis in GP2 Neurobooster genotyping data (all ancestries)

Project: GP2 CTSB

Version: Python/3.10.15,  R/4.3.3

Notebook Overview

1. Description
Loading Python libraries
Set paths
Make working directory

2. Installing packages

3. Create a covariate file with GP2 data

4. Annotation of the gene CTSB

5. Association analysis to compare allele frequencies between cases and controls

6. GLM analysis adjusting for gender, age, PC1-5

7. Burden test(SkatO, Skat, cmc,zeggini,mb,fp,cmcWald)

8. Conditional analysis

Loading Python libraries

In [2]:
# Use pathlib for file path manipulation
import pathlib

# Install numpy
import numpy as np

# Install Pandas for tabular data
import pandas as pd

# Install plotnine: a ggplot2-compatible Python plotting package
from plotnine import *

# Always show all columns in a Pandas DataFrame
pd.set_option('display.max_columns', None)

Set paths

In [3]:
REL7_PATH = pathlib.Path(pathlib.Path.home(), 'workspace/gp2_tier2_eu_release7_30042024')
!ls -hal {REL7_PATH}

total 61K
-r--r--r-- 1 jupyter jupyter 61K Jul 23  2024 README_release7_30042024.txt
dr-xr-xr-x 1 jupyter jupyter   0 Jun 12 11:04 clinical_data
dr-xr-xr-x 1 jupyter jupyter   0 Jun 12 11:04 imputed_genotypes
dr-xr-xr-x 1 jupyter jupyter   0 Jun 12 11:04 meta_data
dr-xr-xr-x 1 jupyter jupyter   0 Jun 12 11:04 raw_genotypes


Make working directory

In [4]:
WORK_DIR = "~/workspace/ws_files/CTSB/"

In [5]:
# make sure all tools installed
! ls /home/jupyter/tools

LICENSE		       plink2				rvtests
annovar		       plink2_linux_x86_64_latest.zip	toy.map
annovar.latest.tar.gz  plink_linux_x86_64_20190304.zip	toy.ped
plink		       prettify


In [6]:
# give permission

# chmod to make sure you have permission to run the program
! chmod u+x /home/jupyter/tools/plink
! chmod u+x /home/jupyter/tools/plink2
! chmod 777 /home/jupyter/tools/rvtests/executable/rvtest

In [None]:
%%bash
# making working directory
#Loop over all the ancestries
for ancestry in {'AAC','AFR','AJ','AMR','CAS','EAS','EUR','FIN','MDE','SAS','CAH'} ;
do

#Make a folder for each ancestry
mkdir ~/workspace/ws_files/CTSB/CTSB_"$ancestry"

done

Create a covariate file with GP2 data

In [7]:
CLINICAL_DATA_PATH = pathlib.Path(REL7_PATH, 'clinical_data/master_key_release7_final_vwb.csv')

In [None]:
# Let's load the master key
key = pd.read_csv(CLINICAL_DATA_PATH, low_memory=False)
print(key.shape)
key.head()

In [None]:
# Subsetting to keep only a few columns 
key = key[['GP2sampleID', 'baseline_GP2_phenotype_for_qc', 'biological_sex_for_qc', 'age_at_sample_collection', 'age_of_onset', 'label']]
# Renaming the columns
key.rename(columns = {'GP2sampleID':'IID',
                                     'baseline_GP2_phenotype_for_qc':'phenotype',
                                     'biological_sex_for_qc':'SEX', 
                                     'age_at_sample_collection':'AGE', 
                                     'age_of_onset':'AAO'}, inplace = True)
key

In [11]:
ancestries = {'AAC','AFR','AJ','AMR','CAS','EAS','EUR','FIN','MDE','SAS','CAH'}

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    print(f'WORKING ON: {ancestry}')
    
    ## Subset to keep ancestry of interest 
    ancestry_key = key[key['label']==ancestry].copy()
    ancestry_key.reset_index(drop=True)
    
    # Load information about related individuals in the ancestry analyzed
    related_df = pd.read_csv(f'{REL7_PATH}/meta_data/related_samples/{ancestry}_release7_vwb.related')
    print(f'Related individuals: {related_df.shape}')
    
    # Make a list of just one set of related people
    related_list = list(related_df['IID1'])
    
    # Check value counts of related and remove only one related individual
    ancestry_key = ancestry_key[~ancestry_key["IID"].isin(related_list)]
    
    # Check size
    print(f'Unrelated individuals: {ancestry_key.shape}')
    
    # Convert phenotype to binary (1/2)
    ## Assign conditions so case=2 and controls=1, and -9 otherwise (matching PLINK convention)
    # PD = 2; control = 1
    pheno_mapping = {"PD": 2, "Control": 1}
    ancestry_key['PHENO'] = ancestry_key['phenotype'].map(pheno_mapping).astype('Int64')
    
    # Check value counts of pheno
    ancestry_key['PHENO'].value_counts(dropna=False)
    
    ## Get the PCs
    pcs = pd.read_csv(f'{REL7_PATH}/meta_data/qc_metrics/projected_pcs_vwb.csv')
    
    #Select just first 5 PCs
    selected_columns = ['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    pcs = pd.DataFrame(data=pcs.iloc[:, 1:7].values, columns=selected_columns)
    
    # Drop the first row (since it's now the column names)
    pcs = pcs.drop(0)
    
    # Reset the index to remove any potential issues
    pcs = pcs.reset_index(drop=True)
    
    # Check size
    print(f'PCs: {pcs.shape}')
    
    # Check value counts of SEX
    sex_og_values = ancestry_key['SEX'].value_counts(dropna=False)
    print(f'Sex value counts - original:\n {sex_og_values.to_string()}')
    
    # Convert sex to binary (1/2)
    ## Assign conditions so female=2 and men=1, and -9 otherwise (matching PLINK convention)
    # Female = 2; Male = 1
    sex_mapping = {"Female": 2, "Male": 1}
    ancestry_key['SEX'] = ancestry_key['SEX'].map(sex_mapping).astype('Int64')
    
    # Check value counts of SEX after recoding
    sex_recode_values = ancestry_key['SEX'].value_counts(dropna=False)
    print(f'Sex value counts - recoded:\n{sex_recode_values.to_string()}')
    
    ## Make covariate file
    df = pd.merge(pcs, ancestry_key, on='IID', how='left')
    print(f'Check columns for covariate file: {df.columns}')
    
    #Make additional columns - FID, fatid and matid - these are needed for RVtests!!
    #RVtests needs the first 5 columns to be fid, iid, fatid, matid and sex otherwise it does not run correctly
    #Uppercase column name is ok
    #See https://zhanxw.github.io/rvtests/#phenotype-file
    df['FID'] = 0
    df['FATID'] = 0
    df['MATID'] = 0
    
    ## Clean up and keep columns we need 
    final_df = df[['FID','IID', 'FATID', 'MATID', 'SEX', 'AGE', 'PHENO','PC1', 'PC2', 'PC3', 'PC4', 'PC5']].copy()
    
    ##DO NOT replace missing values with -9 as this is misinterpreted by RVtests - needs to be nonnumeric
    #Leave missing values as NA
    
    #Check number of PD cases missing age
    pd_missAge = final_df[(final_df['PHENO']==2)&(final_df['AGE'].isna())]
    print(f'Number of PD cases missing age: {pd_missAge.shape[0]}')
    
    #Check number of controls missing age
    control_missAge = final_df[(final_df['PHENO']==1)&(final_df['AGE'].isna())]
    print(f'Number of controls missing age: {control_missAge.shape[0]}')
    
    ## Make file of sample IDs to keep 
    samples_toKeep = final_df[['FID', 'IID']].copy()
    samples_toKeep.to_csv(f'~/workspace/ws_files/CTSB/CTSB_{ancestry}/{ancestry}.samplestoKeep', sep = '\t', index=False, header=None)
    
    ## Make your covariate file
    #Included na_rep to write out missing/NA values explicitly as string/text, not as blank otherwise they are misread in RVtests
    final_df.to_csv(f'~/workspace/ws_files/CTSB/CTSB_{ancestry}/{ancestry}_covariate_file.txt', sep = '\t', na_rep='NA', index=False)

WORKING ON: AJ
Related individuals: (245, 9)
Unrelated individuals: (2675, 6)
PCs: (58209, 6)
Sex value counts - original:
 SEX
Male                          1655
Female                        1009
Other/Unknown/Not Reported      11
Sex value counts - recoded:
SEX
1       1655
2       1009
<NA>      11
Check columns for covariate file: Index(['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'phenotype', 'SEX', 'AGE',
       'AAO', 'label', 'PHENO'],
      dtype='object')
Number of PD cases missing age: 95
Number of controls missing age: 195
WORKING ON: MDE
Related individuals: (19, 9)
Unrelated individuals: (589, 6)
PCs: (58209, 6)
Sex value counts - original:
 SEX
Male                          368
Female                        219
Other/Unknown/Not Reported      2
Sex value counts - recoded:
SEX
1       368
2       219
<NA>      2
Check columns for covariate file: Index(['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'phenotype', 'SEX', 'AGE',
       'AAO', 'label', 'PHENO'],
      dtype='object'

Annotation of the gene

Extract the region using PLINK

Extract CTSB gene

CTSB coordinates: Chromosome 8: 11,842,524-11,869,533(GRCh38/hg38)

In [12]:
## extract region using plink
ancestries = {'AAC','AFR','AJ','AMR','CAS','EAS','EUR','FIN','MDE','SAS','CAH'}

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'

    ! /home/jupyter/tools/plink2 \
    --pfile {REL7_PATH}/imputed_genotypes/{ancestry}/chr8_{ancestry}_release7_vwb \
    --chr 8 \
    --from-bp 11792524 \
    --to-bp 11919533 \
    --make-bed \
    --out {WORK_DIR}/{ancestry}_CTSB

PLINK v2.0.0-a.6.0LM 64-bit Intel (11 Nov 2024)    cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB.log.
Options in effect:
  --chr 8
  --from-bp 11792524
  --make-bed
  --out /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB
  --pfile /home/jupyter/workspace/gp2_tier2_eu_release7_30042024/imputed_genotypes/AJ/chr8_AJ_release7_vwb
  --to-bp 11919533

Start time: Thu Jun 12 11:05:56 2025
12984 MiB RAM detected, ~11272 available; reserving 6492 MiB for main
workspace.
Using up to 2 compute threads.
2655 samples (1007 females, 1648 males; 2655 founders) loaded from
/home/jupyter/workspace/gp2_tier2_eu_release7_30042024/imputed_genotypes/AJ/chr8_AJ_release7_vwb.psam.
1308065 variants loaded from
/home/jupyter/workspace/gp2_tier2_eu_release7_30042024/imputed_genotypes/AJ/chr8_AJ_release7_vwb.pvar.
1 binary phenotype loaded (1292 cases, 411 controls).
2136 variants r

In [13]:
# Visualize bim file
WORK_DIR = '/home/jupyter/workspace/ws_files/CTSB/'
! head {WORK_DIR}/CTSB_EUR/EUR_CTSB.bim

8	chr8:11792554:C:T	0	11792554	T	C
8	chr8:11792575:T:C	0	11792575	C	T
8	chr8:11792593:T:G	0	11792593	G	T
8	chr8:11792594:C:G	0	11792594	G	C
8	chr8:11792658:C:G	0	11792658	G	C
8	chr8:11792660:C:G	0	11792660	G	C
8	chr8:11792663:T:C	0	11792663	C	T
8	chr8:11792668:T:G	0	11792668	G	T
8	chr8:11792679:G:C	0	11792679	C	G
8	chr8:11792680:C:T	0	11792680	T	C


In [None]:
# Visualize bim file
! head {WORK_DIR}/CTSB_EUR/EUR_CTSB.fam

In [15]:
for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}/'
    
    ! head -n 1 {WORK_DIR}/{ancestry}_CTSB.fam > {WORK_DIR}/{ancestry}_s1.txt

In [None]:
! head /home/jupyter/workspace/ws_files/CTSB/CTSB_EUR/EUR_s1.txt

Turn binary files into VCF

In [17]:
for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    
    ## Turn binary files into VCF
    ! /home/jupyter/tools/plink2 \
    --bfile {WORK_DIR}/{ancestry}_CTSB \
    --keep {WORK_DIR}/{ancestry}_s1.txt \
    --make-bed \
    --out {WORK_DIR}/{ancestry}_CTSB_v1

PLINK v2.0.0-a.6.0LM 64-bit Intel (11 Nov 2024)    cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1.log.
Options in effect:
  --bfile /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB
  --keep /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_s1.txt
  --make-bed
  --out /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1

Start time: Thu Jun 12 11:13:41 2025
12984 MiB RAM detected, ~11262 available; reserving 6492 MiB for main
workspace.
Using up to 2 compute threads.
2655 samples (1007 females, 1648 males; 2655 founders) loaded from
/home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB.fam.
2136 variants loaded from
/home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB.bim.
1 binary phenotype loaded (1292 cases, 411 controls).
--keep: 1 sample remaining.
1 sample (0 females, 1 male; 1 founder) remaining after main filters.
1 case and 0 controls remainin

In [18]:
for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    
    ## Turn binary files into VCF
    ! /home/jupyter/tools/plink2 \
    --bfile {WORK_DIR}/{ancestry}_CTSB_v1 \
    --recode vcf-fid \
    --out {WORK_DIR}/{ancestry}_CTSB_v1

PLINK v2.0.0-a.6.0LM 64-bit Intel (11 Nov 2024)    cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1.log.
Options in effect:
  --bfile /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1
  --export vcf-fid
  --out /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1

Start time: Thu Jun 12 11:18:34 2025
Note: --export 'vcf-fid' modifier is deprecated.  Use 'vcf' + 'id-paste=fid'.
12984 MiB RAM detected, ~11295 available; reserving 6492 MiB for main
workspace.
Using up to 2 compute threads.
1 sample (0 females, 1 male; 1 founder) loaded from
/home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1.fam.
2136 variants loaded from
/home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1.bim.
1 binary phenotype loaded (1 case, 0 controls).
--export vcf to /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1.vcf
... 1010111112121313141415151616171718181919

In [19]:
### Bgzip and Tabix (zip and index the file)
for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    ! bgzip -f {WORK_DIR}/{ancestry}_CTSB_v1.vcf
    ! tabix -f -p vcf {WORK_DIR}/{ancestry}_CTSB_v1.vcf.gz 

Annotate using ANNOVAR

In [20]:
## annotate using ANNOVAR

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    
    ! perl /home/jupyter/tools/annovar/table_annovar.pl {WORK_DIR}/{ancestry}_CTSB_v1.vcf.gz /home/jupyter/tools/annovar/humandb/ -buildver hg38 \
    -out {WORK_DIR}/{ancestry}_CTSB.annovar \
    -remove -protocol refGene,clinvar_20140902 \
    -operation g,f \
    --nopolish \
    -nastring . \
    -vcfinput


NOTICE: Running with system command <convert2annovar.pl  -includeinfo -allsample -withfreq -format vcf4 /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB_v1.vcf.gz > /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB.annovar.avinput>
NOTICE: Finished reading 2143 lines from VCF file
NOTICE: A total of 2136 locus in VCF file passed QC threshold, representing 2024 SNPs (1187 transitions and 837 transversions) and 112 indels/substitutions
NOTICE: Finished writing allele frequencies based on 2024 SNP genotypes (1187 transitions and 837 transversions) and 112 indels/substitutions for 1 samples

NOTICE: Running with system command </home/jupyter/tools/annovar/table_annovar.pl /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB.annovar.avinput /home/jupyter/tools/annovar/humandb/ -buildver hg38 -outfile /home/jupyter/workspace/ws_files/CTSB/CTSB_AJ/AJ_CTSB.annovar -remove -protocol refGene,clinvar_20140902 -operation g,f --nopolish -nastring . -otherinfo>
---------------------------

In [21]:
WORK_DIR = "/home/jupyter/workspace/ws_files/CTSB"

In [22]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_AAC/AAC_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792650,11792650,G,T,intergenic,NEIL2;FDFT1,dist=5305;dist=2923,.,.,.,0,.,.,8,11792650,chr8:11792650:G:T,G,T,.,.,PR,GT,0/0
2,8,11792688,11792688,G,C,intergenic,NEIL2;FDFT1,dist=5343;dist=2885,.,.,.,0,.,.,8,11792688,chr8:11792688:G:C,G,C,.,.,PR,GT,0/0
3,8,11792691,11792691,C,T,intergenic,NEIL2;FDFT1,dist=5346;dist=2882,.,.,.,0,.,.,8,11792691,chr8:11792691:C:T,C,T,.,.,PR,GT,0/0
4,8,11792730,11792730,C,A,intergenic,NEIL2;FDFT1,dist=5385;dist=2843,.,.,.,0,.,.,8,11792730,chr8:11792730:C:A,C,A,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5346,8,11919417,11919417,T,C,intergenic,CTSB;DEFB136,dist=51330;dist=54520,.,.,.,0,.,.,8,11919417,chr8:11919417:T:C,T,C,.,.,PR,GT,0/0
5347,8,11919431,11919431,T,C,intergenic,CTSB;DEFB136,dist=51344;dist=54506,.,.,.,0,.,.,8,11919431,chr8:11919431:T:C,T,C,.,.,PR,GT,0/0
5348,8,11919450,11919450,G,C,intergenic,CTSB;DEFB136,dist=51363;dist=54487,.,.,.,0,.,.,8,11919450,chr8:11919450:G:C,G,C,.,.,PR,GT,0/0
5349,8,11919498,11919498,C,G,intergenic,CTSB;DEFB136,dist=51411;dist=54439,.,.,.,0,.,.,8,11919498,chr8:11919498:C:G,C,G,.,.,PR,GT,0/0


In [23]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      2872
intergenic    1816
UTR5           193
UTR3           178
upstream       108
downstream     104
exonic          78
splicing         2
Name: count, dtype: int64

In [24]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [25]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
synonymous SNV       39
nonsynonymous SNV    38
stopgain              1
Name: count, dtype: int64

In [26]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_AFR/AFR_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792594,11792594,C,A,intergenic,NEIL2;FDFT1,dist=5249;dist=2979,.,.,.,0,.,.,8,11792594,chr8:11792594:C:A,C,A,.,.,PR,GT,0/0
2,8,11792594,11792594,C,G,intergenic,NEIL2;FDFT1,dist=5249;dist=2979,.,.,.,0,.,.,8,11792594,chr8:11792594:C:G,C,G,.,.,PR,GT,0/0
3,8,11792691,11792691,C,T,intergenic,NEIL2;FDFT1,dist=5346;dist=2882,.,.,.,0,.,.,8,11792691,chr8:11792691:C:T,C,T,.,.,PR,GT,0/0
4,8,11792730,11792730,C,A,intergenic,NEIL2;FDFT1,dist=5385;dist=2843,.,.,.,0,.,.,8,11792730,chr8:11792730:C:A,C,A,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6376,8,11919417,11919417,T,C,intergenic,CTSB;DEFB136,dist=51330;dist=54520,.,.,.,0,.,.,8,11919417,chr8:11919417:T:C,T,C,.,.,PR,GT,0/0
6377,8,11919431,11919431,T,C,intergenic,CTSB;DEFB136,dist=51344;dist=54506,.,.,.,0,.,.,8,11919431,chr8:11919431:T:C,T,C,.,.,PR,GT,0/0
6378,8,11919450,11919450,G,C,intergenic,CTSB;DEFB136,dist=51363;dist=54487,.,.,.,0,.,.,8,11919450,chr8:11919450:G:C,G,C,.,.,PR,GT,0/0
6379,8,11919454,11919454,T,C,intergenic,CTSB;DEFB136,dist=51367;dist=54483,.,.,.,0,.,.,8,11919454,chr8:11919454:T:C,T,C,.,.,PR,GT,0/0


In [27]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      3425
intergenic    2182
UTR5           240
UTR3           203
upstream       119
downstream     119
exonic          92
splicing         1
Name: count, dtype: int64

In [28]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [29]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV    51
synonymous SNV       41
Name: count, dtype: int64

In [30]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_AJ/AJ_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0.0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792661,11792662,TT,-,intergenic,NEIL2;FDFT1,dist=5316;dist=2911,.,.,.,0.0,.,.,8,11792660,chr8:11792660:CTT:C,CTT,C,.,.,PR,GT,0/0
2,8,11792812,11792812,C,T,intergenic,NEIL2;FDFT1,dist=5467;dist=2761,.,.,.,0.0,.,.,8,11792812,chr8:11792812:C:T,C,T,.,.,PR,GT,0/0
3,8,11792935,11792941,ATTAAGT,-,intergenic,NEIL2;FDFT1,dist=5590;dist=2632,.,.,.,0.0,.,.,8,11792934,chr8:11792934:AATTAAGT:A,AATTAAGT,A,.,.,PR,GT,0/0
4,8,11792966,11792966,T,C,intergenic,NEIL2;FDFT1,dist=5621;dist=2607,.,.,.,0.0,.,.,8,11792966,chr8:11792966:T:C,T,C,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2131,8,11918972,11918972,G,A,intergenic,CTSB;DEFB136,dist=50885;dist=54965,.,.,.,0.0,.,.,8,11918972,chr8:11918972:G:A,G,A,.,.,PR,GT,0/0
2132,8,11919062,11919062,A,G,intergenic,CTSB;DEFB136,dist=50975;dist=54875,.,.,.,0.0,.,.,8,11919062,chr8:11919062:A:G,A,G,.,.,PR,GT,0/0
2133,8,11919200,11919200,G,A,intergenic,CTSB;DEFB136,dist=51113;dist=54737,.,.,.,0.0,.,.,8,11919200,chr8:11919200:G:A,G,A,.,.,PR,GT,0/0
2134,8,11919223,11919223,G,A,intergenic,CTSB;DEFB136,dist=51136;dist=54714,.,.,.,0.0,.,.,8,11919223,chr8:11919223:G:A,G,A,.,.,PR,GT,0/0


In [31]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      1131
intergenic     720
UTR5            94
UTR3            65
downstream      49
upstream        46
exonic          31
Name: count, dtype: int64

In [32]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [33]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV    22
synonymous SNV        9
Name: count, dtype: int64

In [34]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_AMR/AMR_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792619,11792619,C,T,intergenic,NEIL2;FDFT1,dist=5274;dist=2954,.,.,.,0,.,.,8,11792619,chr8:11792619:C:T,C,T,.,.,PR,GT,0/0
2,8,11792691,11792691,C,T,intergenic,NEIL2;FDFT1,dist=5346;dist=2882,.,.,.,0,.,.,8,11792691,chr8:11792691:C:T,C,T,.,.,PR,GT,0/0
3,8,11792730,11792730,C,A,intergenic,NEIL2;FDFT1,dist=5385;dist=2843,.,.,.,0,.,.,8,11792730,chr8:11792730:C:A,C,A,.,.,PR,GT,0/0
4,8,11792798,11792798,G,A,intergenic,NEIL2;FDFT1,dist=5453;dist=2775,.,.,.,0,.,.,8,11792798,chr8:11792798:G:A,G,A,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033,8,11919279,11919279,A,G,intergenic,CTSB;DEFB136,dist=51192;dist=54658,.,.,.,0,.,.,8,11919279,chr8:11919279:A:G,A,G,.,.,PR,GT,0/0
3034,8,11919300,11919300,G,T,intergenic,CTSB;DEFB136,dist=51213;dist=54637,.,.,.,0,.,.,8,11919300,chr8:11919300:G:T,G,T,.,.,PR,GT,0/0
3035,8,11919311,11919311,C,T,intergenic,CTSB;DEFB136,dist=51224;dist=54626,.,.,.,0,.,.,8,11919311,chr8:11919311:C:T,C,T,.,.,PR,GT,0/0
3036,8,11919332,11919332,C,T,intergenic,CTSB;DEFB136,dist=51245;dist=54605,.,.,.,0,.,.,8,11919332,chr8:11919332:C:T,C,T,.,.,PR,GT,0/0


In [35]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      1621
intergenic    1016
UTR5           118
UTR3            97
downstream      78
exonic          55
upstream        53
Name: count, dtype: int64

In [36]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [37]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV    35
synonymous SNV       20
Name: count, dtype: int64

In [38]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_CAH/CAH_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792594,11792594,C,G,intergenic,NEIL2;FDFT1,dist=5249;dist=2979,.,.,.,0,.,.,8,11792594,chr8:11792594:C:G,C,G,.,.,PR,GT,0/0
2,8,11792691,11792691,C,T,intergenic,NEIL2;FDFT1,dist=5346;dist=2882,.,.,.,0,.,.,8,11792691,chr8:11792691:C:T,C,T,.,.,PR,GT,0/0
3,8,11792713,11792713,T,C,intergenic,NEIL2;FDFT1,dist=5368;dist=2860,.,.,.,0,.,.,8,11792713,chr8:11792713:T:C,T,C,.,.,PR,GT,0/0
4,8,11792730,11792730,C,A,intergenic,NEIL2;FDFT1,dist=5385;dist=2843,.,.,.,0,.,.,8,11792730,chr8:11792730:C:A,C,A,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4941,8,11919376,11919376,A,T,intergenic,CTSB;DEFB136,dist=51289;dist=54561,.,.,.,0,.,.,8,11919376,chr8:11919376:A:T,A,T,.,.,PR,GT,0/0
4942,8,11919387,11919387,G,A,intergenic,CTSB;DEFB136,dist=51300;dist=54550,.,.,.,0,.,.,8,11919387,chr8:11919387:G:A,G,A,.,.,PR,GT,0/0
4943,8,11919450,11919450,G,C,intergenic,CTSB;DEFB136,dist=51363;dist=54487,.,.,.,0,.,.,8,11919450,chr8:11919450:G:C,G,C,.,.,PR,GT,0/0
4944,8,11919508,11919508,C,G,intergenic,CTSB;DEFB136,dist=51421;dist=54429,.,.,.,0,.,.,8,11919508,chr8:11919508:C:G,C,G,.,.,PR,GT,0/0


In [39]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      2646
intergenic    1677
UTR5           192
UTR3           154
downstream     106
upstream        92
exonic          78
splicing         1
Name: count, dtype: int64

In [40]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [41]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV      40
synonymous SNV         37
frameshift deletion     1
Name: count, dtype: int64

In [42]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_CAS/CAS_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792660,11792660,C,G,intergenic,NEIL2;FDFT1,dist=5315;dist=2913,.,.,.,0,.,.,8,11792660,chr8:11792660:C:G,C,G,.,.,PR,GT,0/0
2,8,11792688,11792688,G,C,intergenic,NEIL2;FDFT1,dist=5343;dist=2885,.,.,.,0,.,.,8,11792688,chr8:11792688:G:C,G,C,.,.,PR,GT,0/0
3,8,11792784,11792784,G,A,intergenic,NEIL2;FDFT1,dist=5439;dist=2789,.,.,.,0,.,.,8,11792784,chr8:11792784:G:A,G,A,.,.,PR,GT,0/0
4,8,11792794,11792794,T,C,intergenic,NEIL2;FDFT1,dist=5449;dist=2779,.,.,.,0,.,.,8,11792794,chr8:11792794:T:C,T,C,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2602,8,11919223,11919223,G,A,intergenic,CTSB;DEFB136,dist=51136;dist=54714,.,.,.,0,.,.,8,11919223,chr8:11919223:G:A,G,A,.,.,PR,GT,0/0
2603,8,11919310,11919310,C,G,intergenic,CTSB;DEFB136,dist=51223;dist=54627,.,.,.,0,.,.,8,11919310,chr8:11919310:C:G,C,G,.,.,PR,GT,0/0
2604,8,11919373,11919373,C,T,intergenic,CTSB;DEFB136,dist=51286;dist=54564,.,.,.,0,.,.,8,11919373,chr8:11919373:C:T,C,T,.,.,PR,GT,0/0
2605,8,11919399,11919399,C,A,intergenic,CTSB;DEFB136,dist=51312;dist=54538,.,.,.,0,.,.,8,11919399,chr8:11919399:C:A,C,A,.,.,PR,GT,0/0


In [43]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      1373
intergenic     873
UTR5           113
UTR3            93
downstream      54
upstream        53
exonic          47
splicing         1
Name: count, dtype: int64

In [44]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [45]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV    31
synonymous SNV       16
Name: count, dtype: int64

In [46]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_EAS/EAS_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792657,11792657,C,T,intergenic,NEIL2;FDFT1,dist=5312;dist=2916,.,.,.,0,.,.,8,11792657,chr8:11792657:C:T,C,T,.,.,PR,GT,0/0
2,8,11792660,11792660,C,G,intergenic,NEIL2;FDFT1,dist=5315;dist=2913,.,.,.,0,.,.,8,11792660,chr8:11792660:C:G,C,G,.,.,PR,GT,0/0
3,8,11792690,11792690,C,G,intergenic,NEIL2;FDFT1,dist=5345;dist=2883,.,.,.,0,.,.,8,11792690,chr8:11792690:C:G,C,G,.,.,PR,GT,0/0
4,8,11792717,11792717,C,T,intergenic,NEIL2;FDFT1,dist=5372;dist=2856,.,.,.,0,.,.,8,11792717,chr8:11792717:C:T,C,T,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,8,11919319,11919319,C,A,intergenic,CTSB;DEFB136,dist=51232;dist=54618,.,.,.,0,.,.,8,11919319,chr8:11919319:C:A,C,A,.,.,PR,GT,0/0
4077,8,11919359,11919359,G,C,intergenic,CTSB;DEFB136,dist=51272;dist=54578,.,.,.,0,.,.,8,11919359,chr8:11919359:G:C,G,C,.,.,PR,GT,0/0
4078,8,11919433,11919433,C,T,intergenic,CTSB;DEFB136,dist=51346;dist=54504,.,.,.,0,.,.,8,11919433,chr8:11919433:C:T,C,T,.,.,PR,GT,0/0
4079,8,11919510,11919510,C,T,intergenic,CTSB;DEFB136,dist=51423;dist=54427,.,.,.,0,.,.,8,11919510,chr8:11919510:C:T,C,T,.,.,PR,GT,0/0


In [47]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      2164
intergenic    1351
UTR5           170
UTR3           137
downstream      93
exonic          84
upstream        80
splicing         2
Name: count, dtype: int64

In [48]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [49]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV      48
synonymous SNV         34
frameshift deletion     2
Name: count, dtype: int64

In [50]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_MDE/MDE_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0.0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792593,11792593,T,G,intergenic,NEIL2;FDFT1,dist=5248;dist=2980,.,.,.,0.0,.,.,8,11792593,chr8:11792593:T:G,T,G,.,.,PR,GT,0/0
2,8,11792688,11792688,G,C,intergenic,NEIL2;FDFT1,dist=5343;dist=2885,.,.,.,0.0,.,.,8,11792688,chr8:11792688:G:C,G,C,.,.,PR,GT,0/0
3,8,11792691,11792691,C,T,intergenic,NEIL2;FDFT1,dist=5346;dist=2882,.,.,.,0.0,.,.,8,11792691,chr8:11792691:C:T,C,T,.,.,PR,GT,0/0
4,8,11792812,11792812,C,T,intergenic,NEIL2;FDFT1,dist=5467;dist=2761,.,.,.,0.0,.,.,8,11792812,chr8:11792812:C:T,C,T,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540,8,11919062,11919062,A,G,intergenic,CTSB;DEFB136,dist=50975;dist=54875,.,.,.,0.0,.,.,8,11919062,chr8:11919062:A:G,A,G,.,.,PR,GT,0/0
2541,8,11919200,11919200,G,A,intergenic,CTSB;DEFB136,dist=51113;dist=54737,.,.,.,0.0,.,.,8,11919200,chr8:11919200:G:A,G,A,.,.,PR,GT,0/0
2542,8,11919223,11919223,G,A,intergenic,CTSB;DEFB136,dist=51136;dist=54714,.,.,.,0.0,.,.,8,11919223,chr8:11919223:G:A,G,A,.,.,PR,GT,0/0
2543,8,11919263,11919263,G,T,intergenic,CTSB;DEFB136,dist=51176;dist=54674,.,.,.,0.0,.,.,8,11919263,chr8:11919263:G:T,G,T,.,.,PR,GT,0/0


In [51]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      1379
intergenic     857
UTR5            90
UTR3            82
downstream      48
upstream        46
exonic          42
splicing         1
Name: count, dtype: int64

In [52]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [53]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV    26
synonymous SNV       16
Name: count, dtype: int64

In [54]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_SAS/SAS_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792683,11792683,C,G,intergenic,NEIL2;FDFT1,dist=5338;dist=2890,.,.,.,0,.,.,8,11792683,chr8:11792683:C:G,C,G,.,.,PR,GT,0/0
2,8,11792784,11792784,G,A,intergenic,NEIL2;FDFT1,dist=5439;dist=2789,.,.,.,0.5,.,.,8,11792784,chr8:11792784:G:A,G,A,.,.,PR,GT,0/1
3,8,11792794,11792794,T,C,intergenic,NEIL2;FDFT1,dist=5449;dist=2779,.,.,.,0,.,.,8,11792794,chr8:11792794:T:C,T,C,.,.,PR,GT,0/0
4,8,11792812,11792812,C,T,intergenic,NEIL2;FDFT1,dist=5467;dist=2761,.,.,.,0,.,.,8,11792812,chr8:11792812:C:T,C,T,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2441,8,11919223,11919223,G,A,intergenic,CTSB;DEFB136,dist=51136;dist=54714,.,.,.,0,.,.,8,11919223,chr8:11919223:G:A,G,A,.,.,PR,GT,0/0
2442,8,11919373,11919373,C,T,intergenic,CTSB;DEFB136,dist=51286;dist=54564,.,.,.,0,.,.,8,11919373,chr8:11919373:C:T,C,T,.,.,PR,GT,0/0
2443,8,11919376,11919376,A,T,intergenic,CTSB;DEFB136,dist=51289;dist=54561,.,.,.,0,.,.,8,11919376,chr8:11919376:A:T,A,T,.,.,PR,GT,0/0
2444,8,11919450,11919450,G,C,intergenic,CTSB;DEFB136,dist=51363;dist=54487,.,.,.,0,.,.,8,11919450,chr8:11919450:G:C,G,C,.,.,PR,GT,0/0


In [55]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      1254
intergenic     871
UTR5            94
UTR3            86
upstream        55
exonic          47
downstream      39
Name: count, dtype: int64

In [56]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [57]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV         31
synonymous SNV            15
nonframeshift deletion     1
Name: count, dtype: int64

In [58]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_FIN/FIN_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
1,8,11792812,11792812,C,T,intergenic,NEIL2;FDFT1,dist=5467;dist=2761,.,.,.,0,.,.,8,11792812,chr8:11792812:C:T,C,T,.,.,PR,GT,0/0
2,8,11792966,11792966,T,C,intergenic,NEIL2;FDFT1,dist=5621;dist=2607,.,.,.,0,.,.,8,11792966,chr8:11792966:T:C,T,C,.,.,PR,GT,0/0
3,8,11793099,11793099,T,C,intergenic,NEIL2;FDFT1,dist=5754;dist=2474,.,.,.,0,.,.,8,11793099,chr8:11793099:T:C,T,C,.,.,PR,GT,0/0
4,8,11793200,11793200,G,C,intergenic,NEIL2;FDFT1,dist=5855;dist=2373,.,.,.,0,.,.,8,11793200,chr8:11793200:G:C,G,C,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002,8,11918551,11918551,G,T,intergenic,CTSB;DEFB136,dist=50464;dist=55386,.,.,.,1,.,.,8,11918551,chr8:11918551:G:T,G,T,.,.,PR,GT,1/1
1003,8,11918618,11918618,T,C,intergenic,CTSB;DEFB136,dist=50531;dist=55319,.,.,.,1,.,.,8,11918618,chr8:11918618:T:C,T,C,.,.,PR,GT,1/1
1004,8,11919200,11919200,G,A,intergenic,CTSB;DEFB136,dist=51113;dist=54737,.,.,.,0,.,.,8,11919200,chr8:11919200:G:A,G,A,.,.,PR,GT,0/0
1005,8,11919263,11919263,G,T,intergenic,CTSB;DEFB136,dist=51176;dist=54674,.,.,.,0,.,.,8,11919263,chr8:11919263:G:T,G,T,.,.,PR,GT,0/0


In [59]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      524
intergenic    358
UTR5           38
UTR3           37
upstream       18
downstream     18
exonic         14
Name: count, dtype: int64

In [60]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [61]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV    8
synonymous SNV       6
Name: count, dtype: int64

In [62]:
# Read in ANNOVAR multianno file
gene = pd.read_csv(f'{WORK_DIR}/CTSB_EUR/EUR_CTSB.annovar.hg38_multianno.txt', sep = '\t')
display(gene)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,clinvar_20140902,Otherinfo1,Otherinfo2,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13
0,8,11792554,11792554,C,T,intergenic,NEIL2;FDFT1,dist=5209;dist=3019,.,.,.,0,.,.,8,11792554,chr8:11792554:C:T,C,T,.,.,PR,GT,0/0
1,8,11792575,11792575,T,C,intergenic,NEIL2;FDFT1,dist=5230;dist=2998,.,.,.,0,.,.,8,11792575,chr8:11792575:T:C,T,C,.,.,PR,GT,0/0
2,8,11792593,11792593,T,G,intergenic,NEIL2;FDFT1,dist=5248;dist=2980,.,.,.,.,.,.,8,11792593,chr8:11792593:T:G,T,G,.,.,PR,GT,./.
3,8,11792594,11792594,C,G,intergenic,NEIL2;FDFT1,dist=5249;dist=2979,.,.,.,.,.,.,8,11792594,chr8:11792594:C:G,C,G,.,.,PR,GT,./.
4,8,11792658,11792658,C,G,intergenic,NEIL2;FDFT1,dist=5313;dist=2915,.,.,.,0,.,.,8,11792658,chr8:11792658:C:G,C,G,.,.,PR,GT,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13476,8,11919475,11919475,T,C,intergenic,CTSB;DEFB136,dist=51388;dist=54462,.,.,.,.,.,.,8,11919475,chr8:11919475:T:C,T,C,.,.,PR,GT,./.
13477,8,11919503,11919503,T,C,intergenic,CTSB;DEFB136,dist=51416;dist=54434,.,.,.,0,.,.,8,11919503,chr8:11919503:T:C,T,C,.,.,PR,GT,0/0
13478,8,11919510,11919510,C,T,intergenic,CTSB;DEFB136,dist=51423;dist=54427,.,.,.,0,.,.,8,11919510,chr8:11919510:C:T,C,T,.,.,PR,GT,0/0
13479,8,11919512,11919512,T,C,intergenic,CTSB;DEFB136,dist=51425;dist=54425,.,.,.,0,.,.,8,11919512,chr8:11919512:T:C,T,C,.,.,PR,GT,0/0


In [63]:
gene["Func.refGene"].value_counts()

Func.refGene
intronic      7065
intergenic    4618
UTR5           537
UTR3           455
upstream       270
downstream     269
exonic         264
splicing         3
Name: count, dtype: int64

In [64]:
# Filter exonic variants
coding = gene[gene['Func.refGene'] == 'exonic']

In [65]:
coding["ExonicFunc.refGene"].value_counts()

ExonicFunc.refGene
nonsynonymous SNV         164
synonymous SNV             91
nonframeshift deletion      6
frameshift deletion         2
frameshift insertion        1
Name: count, dtype: int64

ASSOC

In [66]:
#Run case-control analysis using plink assoc for all variants, not adjusting for any covariates
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'FIN', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_CTSB \
    --keep {WORK_DIR}/{ancestry}.samplestoKeep \
    --assoc \
    --allow-no-sex \
    --ci 0.95 \
    --maf 0.01 \
    --out {WORK_DIR}/{ancestry}_CTSB.all

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC_CTSB.all.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC_CTSB
  --ci 0.95
  --keep /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC.samplestoKeep
  --maf 0.01
  --out /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC_CTSB.all

12984 MB RAM detected; reserving 6492 MB for main workspace.
5351 variants loaded from .bim file.
1111 people (455 males, 656 females) loaded from .fam.
1086 phenotype values loaded from .fam.
--keep: 1111 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 1111 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960

In [67]:
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'FIN', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    print(f'WORKING ON: {ancestry}')
    
    #Look at assoc results, only variants with nominal p-value < 0.05
    freq = pd.read_csv(f'{WORK_DIR}/{ancestry}_CTSB.all.assoc', sep='\s+')
    sig_all_nonadj = freq[freq['P']<0.05]
    
    print(f'Variants with p-value < 0.05: {sig_all_nonadj.shape}')
    
    #Save FREQ to csv
    freq.to_csv(f'{WORK_DIR}/{ancestry}.all_nonadj.csv')

WORKING ON: AAC
Variants with p-value < 0.05: (142, 13)
WORKING ON: AFR
Variants with p-value < 0.05: (101, 13)
WORKING ON: AJ
Variants with p-value < 0.05: (6, 13)
WORKING ON: AMR
Variants with p-value < 0.05: (200, 13)
WORKING ON: CAS
Variants with p-value < 0.05: (167, 13)
WORKING ON: EAS
Variants with p-value < 0.05: (167, 13)
WORKING ON: EUR
Variants with p-value < 0.05: (208, 13)
WORKING ON: FIN
Variants with p-value < 0.05: (25, 13)
WORKING ON: MDE
Variants with p-value < 0.05: (80, 13)
WORKING ON: SAS
Variants with p-value < 0.05: (26, 13)
WORKING ON: CAH
Variants with p-value < 0.05: (53, 13)


GLM

In [68]:
#Run case-control analysis with covariates
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'FIN', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:

    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    
    ! /home/jupyter/tools/plink2 \
    --bfile {WORK_DIR}/{ancestry}_CTSB \
    --keep {WORK_DIR}/{ancestry}.samplestoKeep \
    --allow-no-sex \
    --maf 0.01 \
    --ci 0.95 \
    --glm \
    --covar {WORK_DIR}/{ancestry}_covariate_file.txt \
    --covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5 \
    --covar-variance-standardize \
    --neg9-pheno-really-missing \
    --out {WORK_DIR}/{ancestry}_CTSB.all_adj

PLINK v2.0.0-a.6.0LM 64-bit Intel (11 Nov 2024)    cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC_CTSB.all_adj.log.
Options in effect:
  --allow-no-sex
  --bfile /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC_CTSB
  --ci 0.95
  --covar /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC_covariate_file.txt
  --covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5
  --covar-variance-standardize
  --glm
  --keep /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC.samplestoKeep
  --maf 0.01
  --neg9-pheno-really-missing
  --out /home/jupyter/workspace/ws_files/CTSB/CTSB_AAC/AAC_CTSB.all_adj

Start time: Thu Jun 12 11:30:51 2025
Note: --allow-no-sex no longer has any effect.  (Missing-sex samples are
automatically excluded from association analysis when sex is a covariate, and
treated normally otherwise.)
12984 MiB RAM detected, ~11226 available; reserving 6492 MiB for main
workspace

In [69]:
#Process results from plink glm analysis including covariates
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'FIN', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    print(f'WORKING ON: {ancestry}')
    
    #Read in plink glm results
    assoc = pd.read_csv(f'{WORK_DIR}/{ancestry}_CTSB.all_adj.PHENO1.glm.logistic.hybrid', delim_whitespace=True)
    
    #Filter for additive test only - this is the variant results
    assoc_add = assoc[assoc['TEST']=="ADD"]
    
    #Check if there are any significant (p < 0.05) variants
    significant = assoc_add[assoc_add['P']<0.05]

    print(f'There are {len(significant)} variants with p-value < 0.05 in glm')
    
    #Check if there are any significant (p < 0.05) variants
    GWsignificant = assoc_add[assoc_add['P']<5e-8]

    print(f'There are {len(GWsignificant)} variants with p-value < 5e-8 in glm')
    
    #Save assoc_add to csv
    assoc_add.to_csv(f'{WORK_DIR}/{ancestry}.all_adj.csv')

WORKING ON: AAC
There are 42 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: AFR
There are 126 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: AJ
There are 21 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: AMR
There are 160 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: CAS
There are 44 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: EAS
There are 9 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: EUR
There are 189 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: FIN
There are 6 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: MDE
There are 32 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: SAS
There are 26 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




WORKING ON: CAH
There are 75 variants with p-value < 0.05 in glm
There are 0 variants with p-value < 5e-8 in glm




change format of association file in EUR

In [70]:
WORK_DIR = "~/workspace/ws_files/CTSB/"

In [71]:
ancestries = {'AAC','AFR','AJ','AMR','CAS','EAS','EUR','FIN','MDE','SAS','CAH'}

for ancestry in ancestries:
    WORK_DIR = "~/workspace/ws_files/CTSB/"
    print(f'WORKING ON: {ancestry}')
    
    df = pd.read_csv(f'{WORK_DIR}/CTSB_{ancestry}/{ancestry}.all_adj.csv')
    
    #First just select relevant columns: chromosome, bp position and p-value
    export_ldassoc = df[['#CHROM', 'POS', 'P']].copy()
    
    #Rename the #CHROM column to remove the hashtag as I think this might be confusing LDassoc
    export_ldassoc = export_ldassoc.rename(columns={'#CHROM': 'CHROM'}) 
    
    #Then export as a tab-separated, not comma-separated file

    export_ldassoc.to_csv(f'{WORK_DIR}/CTSB_{ancestry}/{ancestry}.all_adj.formatted.tab', sep = '\t', index=False)

WORKING ON: AJ
WORKING ON: MDE
WORKING ON: EAS
WORKING ON: AMR
WORKING ON: CAS
WORKING ON: FIN
WORKING ON: CAH
WORKING ON: SAS
WORKING ON: AFR
WORKING ON: EUR
WORKING ON: AAC


GCTA conditional analysis

In [72]:
# prepare file for COJO
ancestries = {'AAC','AFR','AJ','AMR','CAS','EAS','EUR','FIN','MDE','SAS','CAH'}

for ancestry in ancestries:
    #Read in summary statistics
    sumstats = pd.read_csv(f'{WORK_DIR}/CTSB_{ancestry}/{ancestry}.all_adj.csv')
    
    #Format summary statistics for GCTA-COJO
    #First get the log odds ratio - this is required for COJO
    #1) For a case-control study, the effect size should be log(odds ratio) with its corresponding standard error.
    sumstats_formatted = sumstats.copy()
    sumstats_formatted['b'] = np.log(sumstats_formatted['OR'])
    
    #Now select just the necessary columns for COJO
    sumstats_export = sumstats_formatted[['ID', 'A1', 'OMITTED', 'A1_FREQ', 'b', 'LOG(OR)_SE', 'P', 'OBS_CT']].copy()
    
    #Rename columns following COJO format
    sumstats_export = sumstats_export.rename(columns = {'ID':'SNP', 'OMITTED':'A2', 'A1_FREQ':'freq', 'LOG(OR)_SE':'se', 'P':'p', 'OBS_CT':'N'})
    
    #Export
    sumstats_export.to_csv(f'{WORK_DIR}/CTSB_{ancestry}/{ancestry}.all_adj.sumstats.ma', sep = '\t', index=False)

In [None]:
ancestries = {'AAC','AFR','AJ','AMR','CAS','EAS','EUR','FIN','MDE','SAS','CAH'}

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    
    #Select multiple associated SNPs based on significance p-value 5e-08
    #Can change the p-value for significance if needed
    #bfile is referring to the full dataset in plink binary format, e.g. GP2 - whatever you used to run the GWAS
    ! /home/jupyter/tools/gcta-1.94.1-linux-kernel-3-x86_64/gcta64 \
    --bfile {WORK_DIR}/{ancestry}_CTSB \
    --maf 0.01 \
    --cojo-file {WORK_DIR}/{ancestry}.all_adj.sumstats.ma \
    --cojo-p 5e-8 \
    --cojo-slct \
    --out {WORK_DIR}/{ancestry}.all_adj.COJO

In [None]:
ancestries = {'AAC','AFR','AJ','AMR','CAS','EAS','EUR','FIN','MDE','SAS','CAH'}

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    
    #Select multiple associated SNPs based on significance p-value 5e-08
    #Can change the p-value for significance if needed
    #bfile is referring to the full dataset in plink binary format, e.g. GP2 - whatever you used to run the GWAS
    ! /home/jupyter/tools/gcta-1.94.1-linux-kernel-3-x86_64/gcta64 \
    --bfile {WORK_DIR}/{ancestry}_CTSB \
    --maf 0.01 \
    --cojo-file {WORK_DIR}/{ancestry}.all_adj.sumstats.ma \
    --cojo-p 4.27e-4 \
    --cojo-slct \
    --out {WORK_DIR}/{ancestry}.all_adj.LDprune.COJO

In [None]:
ancestries = {'AAC','AFR','AJ','AMR','CAS','EAS','EUR','FIN','MDE','SAS','CAH'}

for ancestry in ancestries:
    
    WORK_DIR = f'~/workspace/ws_files/CTSB/CTSB_{ancestry}'
    
    #Select multiple associated SNPs based on significance p-value 5e-08
    #Can change the p-value for significance if needed
    #bfile is referring to the full dataset in plink binary format, e.g. GP2 - whatever you used to run the GWAS
    ! /home/jupyter/tools/gcta-1.94.1-linux-kernel-3-x86_64/gcta64 \
    --bfile {WORK_DIR}/{ancestry}_CTSB \
    --maf 0.01 \
    --cojo-file {WORK_DIR}/{ancestry}.all_adj.sumstats.ma \
    --cojo-top-SNPs 10 \
    --out {WORK_DIR}/{ancestry}.all_adj.top10