In [1]:
import pandas as pd
import numpy as np
import os
import argparse
import sys
import subprocess
import time

In [2]:
def shell_do(command, log=False, return_log=False):
    """Run a command string as a subprocess and optionally print/return its stdout.

    The command is tokenised with ``shlex.split`` so that quoted arguments
    (for example paths containing spaces) reach the program as single tokens.
    It is executed directly, not through a shell, so pipes, redirections and
    globs in ``command`` are not interpreted.

    Parameters
    ----------
    command : str
        The command line to execute.
    log : bool, default False
        If True, print the captured stdout (decoded as UTF-8).
    return_log : bool, default False
        If True, return the captured stdout (decoded as UTF-8).

    Returns
    -------
    str or None
        The decoded stdout when ``return_log`` is True, otherwise None.
    """
    import shlex  # local import so this cell does not depend on edits to the import cell

    print(f'Executing: {" ".join(command.split())}', file=sys.stderr)

    res = subprocess.run(shlex.split(command), stdout=subprocess.PIPE)

    # Report failures instead of ignoring them silently; nothing is raised,
    # so existing callers keep running exactly as before.
    if res.returncode != 0:
        print(f'Warning: command exited with status {res.returncode}', file=sys.stderr)

    # Only decode when the output is actually requested.
    if not (log or return_log):
        return None

    output = res.stdout.decode('utf-8')
    if log:
        print(output)
    return output if return_log else None

In [3]:
# Working directory: the per-chromosome job scripts and the swarm file are written here,
# and snp_metrics_process_subset_variants.py is invoked from this location.
wd = '/path/to/working/directory'

In [7]:
# data paths
## Note: using samples from release 6 since they are processed already
## This will not affect results since samples from release 6 are all included in release 7

# directory holding the SNP metrics data
snp_metrics_data_dir = '/path/to/snp/metrics'
# master key files (full and release 6)
master_key_full_path = '/path/to/full/master/key'
master_key_release_path = '/path/to/release6/master/key'
# fileset prefix passed to plink2 via --pfile
genotype_path = '/path/to/release6/plink/genotypes'
# output directory: per-chromosome position lists, frequency output and metrics tables
metrics_dir = '/path/to/output/directory'
# tab-separated file with a 'SNP' column holding the variants of interest
hackathon_vars_path = 'data/PATHOGENIC_SNPS_TO_CLUSTER.txt'

In [11]:
# read through hackathon vars, loop through chroms and write positions to files
## Note: the list of chromosomes was compiled by a quick manual look through the hackathon vars file
## Note: the list of positions is printed here because it gets copied over to the
## hackathon_rv_extract_snp_metrics.py file

hack_vars = pd.read_csv(hackathon_vars_path, sep='\t')

chroms = [1,2,3,4,6,12,15,16,21,22]

for chrom in chroms:
    # the trailing ':' in the prefix keeps e.g. 'chr1:' from also matching 'chr12:' or 'chr15:'
    hack_vars_chr = hack_vars[hack_vars['SNP'].str.startswith(f'chr{chrom}:')].copy()
    # split each SNP ID on ':' into its four fields
    hack_vars_chr[['chr','pos','ref','alt']] = hack_vars_chr['SNP'].str.split(':', expand=True)
    print(hack_vars_chr.shape)
    print(chrom)
    print(hack_vars_chr['pos'].astype(int).values.tolist())
    # one position per line, no header or index column; pass the documented booleans
    # rather than relying on None being treated as falsy
    hack_vars_chr['pos'].to_csv(f'{metrics_dir}/chr{chrom}_vars.txt', sep=',', header=False, index=False)

(97, 5)
1
[155235197, 155235205, 155235819, 155235843, 155236246, 155236420, 155237394, 155237412, 155237446, 155238141, 155238174, 155238214, 155238228, 155238258, 155238290, 155238291, 155238608, 155238631, 155239639, 155240025, 16986091, 16986235, 16986335, 16986490, 16986526, 16986554, 16986571, 16986610, 16986818, 16986847, 16988138, 16988161, 16988338, 16988444, 16988465, 16989977, 16990132, 16990133, 16990208, 16990254, 16990276, 16991787, 16991795, 16992019, 16992301, 16992322, 16992348, 16992378, 16992564, 16993667, 16993668, 16993690, 16996069, 16996255, 16996293, 16996298, 16996448, 16996487, 16996489, 16997087, 16997135, 16997136, 17000057, 17000099, 17000107, 17000307, 17000494, 17004773, 17005465, 17005548, 17005747, 17005753, 17011732, 20633615, 20633713, 20633815, 20633840, 20637868, 20637888, 20637956, 20638012, 20644540, 20644571, 20644649, 20645615, 20645665, 20645675, 20645695, 20648528, 20648534, 20648612, 20649224, 20650643, 20650673, 65392454, 7962868, 7984985]
(

In [None]:
# get full release freq
# Runs plink2 with --freq on the fileset at genotype_path; output files use the
# {metrics_dir}/full_maf prefix. shell_do echoes the command to stderr; the captured
# stdout is neither printed nor returned here because log/return_log are left at False.

full_freq_cmd = f'plink2 --pfile {genotype_path} --freq --out {metrics_dir}/full_maf'
shell_do(full_freq_cmd)

In [None]:
# write a bash script and submit job for each chromosome
# One snp_metrics_<chrom>.sh script is written per chromosome, and the swarm file
# gets one 'bash <script>' line per script.

with open(f'{wd}/snp_metrics.swarm', 'w') as swarm_file:
    for chrom in chroms:
        script_path = f'{wd}/snp_metrics_{chrom}.sh'
        # the with-block closes the file on exit, so no explicit close() is needed
        with open(script_path, 'w') as script_file:
            script_file.write('#!/usr/bin/env bash\n\n')
            script_file.write('source /path/to/conda.sh\n')
            script_file.write('conda activate env_name\n\n')
            script_file.write(f'python3 {wd}/snp_metrics_process_subset_variants.py --chr {chrom}')

        swarm_file.write(f'bash {script_path}\n')

# the swarm file is closed (and therefore flushed) before it is submitted
swarm_cmd = f'swarm -f {wd}/snp_metrics.swarm -g 50 --time=30:00:00 --module python/3.9'
shell_do(swarm_cmd)

In [15]:
# loop over ancestries and chromosomes to build full SNP metrics df
# Each per-ancestry, per-chromosome CSV is read, tagged with its ancestry label,
# and all tables are stacked row-wise into full_metrics.

ancestries = ['AFR','AAC','AJ','AMR','CAH','CAS','EAS','EUR','FIN','MDE','SAS']

metrics_frames = []

for ancestry in ancestries:
    for chrom in chroms:
        metrics = pd.read_csv(f'{metrics_dir}/{ancestry}/chr{chrom}_metrics.csv', sep=',')
        metrics['ancestry'] = ancestry
        metrics_frames.append(metrics)

# concatenate once at the end: re-concatenating inside the loop copies every
# previously loaded row on each iteration (quadratic in the number of files)
full_metrics = pd.concat(metrics_frames, axis=0)

print(full_metrics.head())
print(full_metrics.shape)
print(full_metrics['chromosome'].value_counts())

                              snpID         R     Theta  GenTrain_Score  GT  \
0  PARK7:NM_007262.4:c.501A>G:p.(.)  1.563070  0.003493        0.813555  AA   
1                          exm22865  0.862394  0.971369        0.859224  BB   
2                          exm22897  1.426340  0.958124        0.802682  BB   
3                       rs144557304  1.004180  0.986961        0.884113  BB   
4                          exm22899  1.026380  0.974530        0.891090  BB   

   chromosome  position            Sample_ID phenotype ancestry  
0           1   7984985  204697840003_R04C01        PD      AFR  
1           1  16986554  204697840003_R04C01        PD      AFR  
2           1  16988138  204697840003_R04C01        PD      AFR  
3           1  16988138  204697840003_R04C01        PD      AFR  
4           1  16988161  204697840003_R04C01        PD      AFR  
(757890, 10)
chromosome
15    370925
1     116290
22    114285
6      66165
12     66165
3       8020
21      8020
16      4010
2

In [16]:
# write full metrics to file
# Saves the combined table as a tab-separated file in metrics_dir, without the index column.

full_metrics.to_csv(f'{metrics_dir}/updated_hackathon_snp_metrics.txt', sep='\t', index=False)