## Notebook to subset cohort genotype callset from larger parent cohort callset

#### import libraries and set notebook variables

In [1]:
import pandas as pd
import os
import threading

In [2]:
# naming: cohort label and genotype release, joined into the output file stem
cohort = 'foundin'
build = 'freeze9'
cohort_build = f'{cohort}.{build}'

# directories: genotypes and sample info both live under the cohort work dir
wrk_dir = f'/labshare/raph/datasets/{cohort}'
genos_dir = f'{wrk_dir}/genotypes'
info_dir = f'{wrk_dir}/sample_info'

# in files
# (cohort_psam_file is written by this notebook and then passed to plink2 --keep)
ori_sample_list = f'{wrk_dir}/foundin.samples.list'
source_pfiles_prefix = '/labshare/raph/datasets/amppd/genotypes/ppmi.freeze9'
parent_psam_file = '/labshare/raph/datasets/amppd/sample_info/ppmi.psam'
cohort_psam_file = f'{info_dir}/{cohort}.psam'

# constants: chromosome labels as strings, autosomes 1-22 followed by X
autosomes = list(map(str, range(1, 23)))
sexomes = ['X']
chromosomes = [*autosomes, *sexomes]

#### threading related functions

In [3]:
#### threading related function

# run a command line process through the shell
# keep this a small function so it can be the target function for threading
def run_bash_cmd(this_cmd):
    """Run one shell command, echo its output, and return its exit status.

    Parameters
    ----------
    this_cmd : str
        Complete command line; it is handed to the shell as-is.

    Returns
    -------
    int
        Exit status of the command (0 means success). Callers that ignore
        the return value, such as a thread target, keep working unchanged.
    """
    # local import keeps this cell self-contained
    import subprocess

    # stderr is merged into stdout so messages come back in the order emitted
    completed = subprocess.run(this_cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT, universal_newlines=True)
    if completed.stdout:
        print(completed.stdout, end='')
    # surface failures explicitly so a failed command does not go unnoticed
    if completed.returncode != 0:
        print(f'command failed with exit status {completed.returncode}: {this_cmd}')
    return completed.returncode

# start one thread per bash command in the list, then wait until all are done
def run_bash_cmds_threaded(cmd_list):
    """Run every command in cmd_list concurrently and block until all finish.

    One thread is created per command with run_bash_cmd as its target; all
    threads are started before any is joined, so there is no cap on how many
    commands run at the same time.
    """
    workers = [threading.Thread(target=run_bash_cmd, args=(one_cmd,))
               for one_cmd in cmd_list]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()

#### make sure the needed output directories exist

In [4]:
# create the genotype and sample-info output directories if they are missing
for needed_dir in (genos_dir, info_dir):
    os.makedirs(needed_dir, exist_ok=True)

#### read the original sample ID list, and re-prefix from old LNG to AMPPD

In [5]:
# load the original sample IDs; the list file is read without a header row
sample_ids = pd.read_csv(ori_sample_list, header=None)
sample_ids.columns = ['ori_id']
print(sample_ids.shape)
# replace the prefix; regex=False makes this a literal substring replacement
# no matter which default the installed pandas version uses
sample_ids['id'] = sample_ids['ori_id'].str.replace('PPMISI', 'PP-', regex=False)
print(sample_ids.shape)
sample_ids.sample(5)

(135, 1)
(135, 2)


Unnamed: 0,ori_id,id
120,PPMISI57277,PP-57277
76,PPMISI51755,PP-51755
44,PPMISI4105,PP-4105
90,PPMISI52828,PP-52828
91,PPMISI52932,PP-52932


#### subset foundin info from parent PPMI set

In [6]:
# read the parent cohort psam (whitespace-delimited)
# raw string so the regex escape is not treated as a Python string escape
psam_df = pd.read_csv(parent_psam_file, sep=r'\s+')
print(psam_df.shape)

# now subset to just the cohort
psam_df = psam_df.loc[psam_df['IID'].isin(sample_ids['id'])]
print(psam_df.shape)

# report requested sample IDs that have no match in the parent psam; the
# subset above would otherwise leave them out without any notice
missing_ids = sample_ids.loc[~sample_ids['id'].isin(psam_df['IID']), 'id'].tolist()
if missing_ids:
    print(f'{len(missing_ids)} sample(s) not found in parent psam: {missing_ids}')

# now save the cohort psam
psam_df.to_csv(cohort_psam_file, index=False, sep='\t')

(1610, 4)
(134, 4)


#### for each chromosome, subset the parent plink2 pfile set to the cohort samples

In [7]:
def frmt_plink2_subset(in_pfiles_prefix, genos_dir, out_name, chrom, 
                       keep_file, min_mac=1):
    """Format the plink2 command that subsets one chromosome's pfile set.

    Parameters
    ----------
    in_pfiles_prefix : str
        Prefix of the source pfiles; '.chr{chrom}' is appended to it.
    genos_dir : str
        Directory that receives the output pfiles.
    out_name : str
        Output file stem; '.chr{chrom}' is appended to it.
    chrom : str or int
        Chromosome label inserted into both the input and output names.
    keep_file : str
        File passed to plink2 --keep.
    min_mac : int, optional
        Value passed to plink2 --mac (default 1).

    Returns
    -------
    str
        The plink2 command line, or '#error' when the input and output
        prefixes are identical. '#error' is only a shell comment, so running
        it does nothing and the source pfiles are never written over.
    """
    in_pfiles = f'{in_pfiles_prefix}.chr{chrom}'
    out_pfiles = f'{genos_dir}/{out_name}.chr{chrom}'
    if in_pfiles == out_pfiles:
        print('in name cannot be same of out name, here')
        # return right away; building the real command here would target the input
        return '#error'

    return (f'plink2 --pfile {in_pfiles} --keep {keep_file} '
            f'--mac {min_mac} --silent --make-pgen --out {out_pfiles}')

# build one plink2 subset command per chromosome
cmds = []
for chrom in chromosomes:
    chrom_cmd = frmt_plink2_subset(source_pfiles_prefix, genos_dir, cohort_build,
                                   chrom, cohort_psam_file)
    cmds.append(chrom_cmd)

# now run the cmds concurrently
run_bash_cmds_threaded(cmds)