## Notebook to subset a cohort genotype callset from a larger parent cohort callset

#### import libraries and set notebook variables

In [1]:
from pandas import read_csv
import threading

In [7]:
# cohort and build identifiers used to name the outputs
cohort = 'foundin'
build = 'amppdv3'
cohort_build = f'{cohort}_{build}'

# working directories for this cohort
wrk_dir = f'/labshare/raph/datasets/{cohort}'
genos_dir = f'{wrk_dir}/genotypes/2022_v3release_1115'
info_dir = f'{wrk_dir}/sample_info'

# file locations: the cohort sample list, the per-chromosome prefix of the
# parent pfiles, and the cohort psam path
sample_list_file = f'{genos_dir}/{cohort}.samples.list'
source_pfiles_prefix = '/labshare/raph/datasets/amppd/genotypes/2022_v3release_1115/chr'
cohort_psam_file = f'{info_dir}/{cohort_build}.psam'

# constants: chromosome labels 1-22 plus X and Y, and the debug-display switch
autosomes = list(map(str, range(1, 23)))
sexomes = ['X', 'Y']
chromosomes = [*autosomes, *sexomes]
DEBUG = True

#### threading related functions

In [3]:
#### threading related function

# run a command line process with the IPython `!` shell magic
# kept as a small function so it can be the target function for threading
def run_bash_cmd(this_cmd):
    """Run one shell command via the IPython `!` shell escape.

    `this_cmd` is interpolated into the `!` line, so this function only
    works inside an IPython/Jupyter session. It has no return statement.
    """
    !{this_cmd}

# launch one thread per bash command, then block until every thread has finished
def run_bash_cmds_threaded(cmd_list):
    """Run each command in `cmd_list` on its own thread and wait for all of them.

    Every command is handed to `run_bash_cmd`. All threads are started before
    any of them is joined, so the commands run concurrently.
    """
    workers = [threading.Thread(target=run_bash_cmd, args=(cmd,))
               for cmd in cmd_list]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

#### read the cohort sample ID list (the IDs are used as-is to match the AMP-PD psam IIDs; no re-prefixing is done in this cell)

In [6]:
# load the header-less sample list and label its single column 'id'
sample_ids = read_csv(sample_list_file, header=None).set_axis(['id'], axis=1)
print(f'sample list shape {sample_ids.shape}')
# in debug mode show a few random rows as a sanity check
if DEBUG:
    display(sample_ids.sample(5))

sample list shape (135, 1)


Unnamed: 0,id
70,PP-50860
6,PP-3422
65,PP-50086
11,PP-3453
67,PP-50219


#### subset FOUNDIN-PD sample info (psam) from the parent AMP-PD set

In [8]:
# read the AMP-PD chr22 psam; the per-chromosome pfile sets are assumed to
# share the same samples, so one chromosome is used -- TODO confirm
in_psam = f'{source_pfiles_prefix}22.psam'
# raw string so the regex reaches pandas unchanged instead of relying on an
# invalid Python string escape (which triggers a warning)
psam_df = read_csv(in_psam, sep=r'\s+')
print(f'AMP-PD chr22 psam shape {psam_df.shape}')

# now subset to just the cohort
psam_df = psam_df.loc[psam_df['IID'].isin(sample_ids['id'])]
print(f'FOUNDIN-PD psam subset shape {psam_df.shape}')

# report listed samples that are absent from the parent psam rather than
# dropping them silently
missing_ids = set(sample_ids['id']) - set(psam_df['IID'])
if missing_ids:
    print(f'warning: {len(missing_ids)} listed samples not found in {in_psam}')

if DEBUG:
    # cap the preview size so a cohort with fewer than 5 samples does not raise
    display(psam_df.sample(min(5, len(psam_df))))

# now save the cohort psam; missing values (e.g. unknown SEX) are written as
# 'NA' instead of an empty field so the file stays a well-formed psam
psam_df.to_csv(cohort_psam_file, index=False, sep='\t', na_rep='NA')

AMP-PD chr22 psam shape (10418, 3)
FOUNDIN-PD psam subset shape(135, 3)


Unnamed: 0,#FID,IID,SEX
9284,PP-41471,PP-41471,
9683,PP-54265,PP-54265,
9586,PP-52530,PP-52530,
8731,PP-3419,PP-3419,
9720,PP-55380,PP-55380,


#### subset FOUNDIN-PD from AMP-PD plink2 pfile set

In [10]:
def frmt_plink2_subset(in_pfiles_prefix, genos_dir, out_name, chrom, 
                       keep_file, min_mac=1):
    """Format the plink2 command that subsets one chromosome's pfile set.

    Args:
        in_pfiles_prefix: source pfile prefix; `chrom` is appended directly.
        genos_dir: directory that receives the output pfiles.
        out_name: base name of the output pfiles ('.chr<chrom>' is appended).
        chrom: chromosome label, e.g. '1', '22', 'X'.
        keep_file: file passed to plink2 `--keep` to select samples.
        min_mac: minimum minor allele count passed to plink2 `--mac`.

    Returns:
        The plink2 command line as a string, or the shell no-op '#error'
        when the output prefix would be the same as the input prefix.
    """
    in_pfiles = f'{in_pfiles_prefix}{chrom}'
    out_pfiles = f'{genos_dir}/{out_name}.chr{chrom}'
    if in_pfiles == out_pfiles:
        # return right away: previously this value was overwritten below, so
        # a command writing over its own input was still returned
        print('in name cannot be same of out name, here')
        return '#error'

    return (f'plink2 --pfile {in_pfiles} --keep {keep_file} '
            f'--mac {min_mac} --silent --make-pgen --out {out_pfiles}')

# build one plink2 subset command per chromosome
cmds = list(map(
    lambda chrom: frmt_plink2_subset(source_pfiles_prefix, genos_dir,
                                     cohort_build, chrom, cohort_psam_file),
    chromosomes))

# run every command concurrently and wait for all of them to finish
run_bash_cmds_threaded(cmds)

Error: No variants remaining after main filters.
