In [1]:
import pandas as pd
import sys
import os
import subprocess

In [2]:
# Suppress pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

In [3]:
def shell_do(command, log=False, return_log=False):
    """Run a command line without a shell and optionally show/return its stdout.

    Parameters
    ----------
    command : str
        Command line to execute. It is tokenised with ``shlex.split``, so
        quoted arguments (e.g. ``-c "print(1 + 1)"``) stay together as one
        argument instead of being broken on every space.
    log : bool, default False
        If True, print the captured stdout.
    return_log : bool, default False
        If True, return the captured stdout as a string.

    Returns
    -------
    str or None
        Decoded (UTF-8) stdout when ``return_log`` is True, otherwise None.

    Notes
    -----
    Only stdout is captured; stderr of the child goes wherever the parent's
    stderr goes. A non-zero exit status does not raise, but is reported on
    stderr so failed submissions are not silently ignored.
    """
    import shlex  # local import keeps this cell self-contained

    # Echo a whitespace-normalised copy of the command for the notebook log.
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    res = subprocess.run(shlex.split(command), stdout=subprocess.PIPE)

    if res.returncode != 0:
        print(f'Command exited with status {res.returncode}', file=sys.stderr)

    output = res.stdout.decode('utf-8')

    if log:
        print(output)
    if return_log:
        return output

In [4]:
# Base working directory (placeholder: replace before running). Later cells
# derive output paths ({wd}/testing/...) and reference-file paths
# ({wd}/ref_files/...) from it.
wd = '/YOUR/DIRECTORY'

The following is a short template showing how CNV_finder can be used, when a broader data release is available, to explore highly probable CNV samples within specific cohorts and regions of interest. More customizable argument flags are available now or are coming soon!

### Path Set-Up

In [None]:
# Load the master table for the data release (placeholder path: replace before
# running) and show how many rows each study contributes.
master = pd.read_csv('/YOUR/MASTER/DIRECTORY')
print(master['study'].value_counts())

In [None]:
# Set up paths
study = 'LCC' # 'all' for no specified cohort
interval = 'PARK2' # if no interval name can provide chr and positions
window_count = 31 # 50 is default
split = 5 # 5 is default

# Upper bound for the test-set size passed to the data-prep step.
# 'all' is documented above as "no specified cohort"; filtering master.study
# on it would match no rows and yield a size of 0, so use the whole table.
# NOTE(review): assumes no study in the master file is literally named 'all'.
if study == 'all':
    size = len(master)
else:
    size = len(master[master.study == study])

# All outputs for this study/interval share one directory and file prefix.
out_dir = f'{wd}/testing/{study}/{interval}'
out_path = f'{out_dir}/GP2_{study}_{interval}'

os.makedirs(out_dir, exist_ok=True)

### ML Data Prep

In [95]:
# Command line for the data-prep step, assembled from one option per entry and
# joined with single spaces.
cmd = ' '.join([
    'python run_data_prep.py',
    f'--interval_name {interval}',
    f'--study_name {study}',
    f'--split_interval {split}',
    f'--total_windows {window_count}',
    f'--training_ids {wd}/ref_files/training_set_IDs.csv',
    f'--out_path {out_path}',
    '--create_testing',
    f'--test_size {size}',  # currently acts as max test size b/c of missing snp metrics
])

In [96]:
conda_env = 'python_3_9'

# Common stem of the wrapper script and the swarm file for this job.
job_base = f'swarm/data_prep_{study}_{interval}'

# open(..., 'w') does not create parent directories and raises
# FileNotFoundError if swarm/ is missing; create it (and the log folder
# passed to --logdir) up front.
os.makedirs('swarm/logs', exist_ok=True)

# Wrapper script: activate the conda environment, then run the command built
# in the previous cell. The with-block closes the file, no explicit close().
with open(f'{job_base}.sh', 'w') as f:
    f.write('#!/usr/bin/env bash\n\n')
    f.write('source /data/$USER/conda/etc/profile.d/conda.sh\n')
    f.write(f'conda activate {conda_env}\n')
    f.write(cmd)

# Swarm file containing the single command that runs the wrapper script.
with open(f'{job_base}.swarm', 'w') as f:
    f.write(f'bash {job_base}.sh')

swarm_cmd = f'swarm -f {job_base}.swarm --g 200 --time=24:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/data_prep.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9


### LSTM Model

In [15]:
# Command line for the LSTM prediction step, assembled from one option per
# entry and joined with single spaces.
cmd = ' '.join([
    'python run_lstm_model.py',
    f'--train_file {wd}/ref_files/final_overlapping_del_training_set_{split}_intervals.csv',
    f'--test_file {out_path}_samples_windows.csv',
    '--feature_names dosage_interval dosage_gene del_dosage std_baf std_lrr iqr_baf iqr_lrr avg_baf avg_lrr',
    f'--model_file {wd}/ref_files/overlapping_binary_lstm_{window_count}_windows.keras',
    '--predict',
    '--print_summary',
    f'--out_path {out_path}',
])

In [16]:
conda_env = 'python_3_9'

# Common stem of the wrapper script and the swarm file for this job.
job_base = f'swarm/ml_model_{study}_{interval}'

# open(..., 'w') does not create parent directories and raises
# FileNotFoundError if swarm/ is missing; create it (and the log folder
# passed to --logdir) up front.
os.makedirs('swarm/logs', exist_ok=True)

# Wrapper script: activate the conda environment, then run the command built
# in the previous cell. The with-block closes the file, no explicit close().
with open(f'{job_base}.sh', 'w') as f:
    f.write('#!/usr/bin/env bash\n\n')
    f.write('source /data/$USER/conda/etc/profile.d/conda.sh\n')
    f.write(f'conda activate {conda_env}\n')
    f.write(cmd)

# Swarm file containing the single command that runs the wrapper script.
with open(f'{job_base}.swarm', 'w') as f:
    f.write(f'bash {job_base}.sh')

swarm_cmd = f'swarm -f {job_base}.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/ml_model.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9


### Local Download & App Prep

In [None]:
# for easier transfer from HPC to local app folder
app_dir = f'{out_dir}/app'
app_out = f'{app_dir}/GP2_{study}_{interval}'

os.makedirs(app_dir, exist_ok=True)

In [24]:
# Command line for the app-prep step, assembled from one option per entry and
# joined with single spaces.
cmd = ' '.join([
    'python run_app_prep.py',
    f'--interval_name {interval}',
    f'--test_set_ids {out_path}_testing_IDs.csv',
    f'--test_set_windows {out_path}_samples_windows.csv',
    f'--test_set_results {out_path}_{window_count}_windows_results.csv',
    f'--out_path {app_out}',
    '--make_app_ready',
])

In [25]:
conda_env = 'python_3_9'

# Common stem of the wrapper script and the swarm file for this job.
job_base = f'swarm/app_prep_{study}_{interval}'

# open(..., 'w') does not create parent directories and raises
# FileNotFoundError if swarm/ is missing; create it (and the log folder
# passed to --logdir) up front.
os.makedirs('swarm/logs', exist_ok=True)

# Wrapper script: activate the conda environment, then run the command built
# in the previous cell. The with-block closes the file, no explicit close().
with open(f'{job_base}.sh', 'w') as f:
    f.write('#!/usr/bin/env bash\n\n')
    f.write('source /data/$USER/conda/etc/profile.d/conda.sh\n')
    f.write(f'conda activate {conda_env}\n')
    f.write(cmd)

# Swarm file containing the single command that runs the wrapper script.
with open(f'{job_base}.swarm', 'w') as f:
    f.write(f'bash {job_base}.sh')

swarm_cmd = f'swarm -f {job_base}.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/app_prep.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9
