In [1]:
import pandas as pd
import sys
import os
import subprocess

In [2]:
# Suppress pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.set_option('mode.chained_assignment', None)

In [3]:
def shell_do(command, log=False, return_log=False):
    """Run a command (without a shell) and optionally print/return its stdout.

    Args:
        command: Full command line as one string. It is tokenized with
            shlex.split, so quoted arguments containing spaces stay together.
        log: If True, print the captured stdout.
        return_log: If True, return the captured stdout as a str.

    Returns:
        The decoded stdout when return_log is True, otherwise None.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import shlex

    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    res = subprocess.run(shlex.split(command), stdout=subprocess.PIPE)

    # Surface failures instead of silently ignoring them; stderr of the child
    # is not captured, so it still goes to the notebook's stderr.
    if res.returncode != 0:
        print(f'Command exited with status {res.returncode}', file=sys.stderr)

    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return res.stdout.decode('utf-8')

In [4]:
# Root working directory (placeholder - replace with your own path).
# Output folders and the reference model files used later are resolved under it.
wd = '/YOUR/DIRECTORY'

The following is a short template showing how CNV_finder can be used, when a broader data release is available, to explore highly probable CNV samples within specific cohorts and regions of interest. More customizable argument flags are available now or are coming soon!

### Path Set-Up

In [None]:
# Load the master file of the data release (placeholder path - replace with
# your own) and show how many rows each study contributes.
master_path = '/YOUR/MASTER/PATH'
master = pd.read_csv(master_path)
print(master['study'].value_counts())

In [None]:
# Set up paths
study = 'LCC' # 'all' for no specified cohort
interval = 'PARK2' # if no interval name can provide chr and positions

model = 'del' # 'del' or 'dup'
window_count = 50 # del: 50; dup: 70
split = 5 # del: 5; dup: 10

cpus = 8

# Number of samples available for the chosen cohort; passed on as the test size.
# 'all' means "no cohort filter", so count every row instead of filtering on a
# study label that matches nothing (which would give a size of 0).
if study == 'all':
    size = len(master)
else:
    size = len(master[master.study == study])

# Outputs are grouped by model type, study and interval under the working directory.
out_dir = f'{wd}/testing/{model}/{study}/{interval}'
out_path = f'{out_dir}/GP2_{study}_{interval}'

os.makedirs(out_dir, exist_ok=True)

### ML Data Prep

In [95]:
# Command line for the data-prep script, assembled one flag per entry and
# joined with single spaces.
cmd = ' '.join([
    'python run_data_prep.py',
    f'--interval_name {interval}',
    f'--study_name {study}',
    f'--split_interval {split}',
    f'--total_windows {window_count}',
    f'--master_file {master_path}',
    f'--cpus {cpus}',
    f'--out_path {out_path}',
    '--create_testing',
    # currently acts as max test size b/c of potential for missing snp metric files
    f'--test_size {size}',
])

In [96]:
# Launch a job with your HPC
conda_env = 'python_3_9'

# The job scripts are written under ./swarm; create it so open() cannot fail
# on a fresh checkout.
os.makedirs('swarm', exist_ok=True)

# Bash script that activates the conda environment and runs the prepared command.
with open(f'swarm/data_prep_{model}_{study}_{interval}.sh', 'w') as f:
    f.write('#!/usr/bin/env bash\n\n')
    f.write('source /data/$USER/conda/etc/profile.d/conda.sh\n')
    f.write(f'conda activate {conda_env}\n')
    f.write(cmd)

# Swarm file with a single line that invokes the script above.
# (No explicit close needed: the with-block closes the file.)
with open(f'swarm/data_prep_{model}_{study}_{interval}.swarm', 'w') as f:
    f.write(f'bash swarm/data_prep_{model}_{study}_{interval}.sh')

swarm_cmd = f'swarm -f swarm/data_prep_{model}_{study}_{interval}.swarm --g 200 --time=24:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/data_prep.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9


### LSTM Model

In [15]:
# parameters to run pre-saved models - can use prelim, updated, or final models with same flags
# The two model types differ only in their feature list and in the "combo" tag
# that appears in the saved model's file name.
if model == 'del':
    feature_names = (
        f'dosage_interval dosage_full {model}_dosage_full {model}_dosage_interval '
        'std_baf std_mid_baf std_lrr iqr_baf iqr_mid_baf iqr_lrr avg_baf avg_mid_baf avg_lrr'
    )
    combo = 'combo4'
elif model == 'dup':
    feature_names = (
        f'dosage_interval {model}_dosage_full '
        'std_baf std_mid_baf std_lrr iqr_baf iqr_mid_baf iqr_lrr avg_baf avg_mid_baf avg_lrr'
    )
    combo = 'combo6'
else:
    # Fail loudly: otherwise `cmd` would still hold the previous cell's command
    # and the launch cell below would resubmit it.
    raise ValueError(f"model must be 'del' or 'dup', got {model!r}")

cmd = ' '.join([
    'python run_lstm_model.py',
    f'--test_file {out_path}_samples_windows.csv',
    f'--feature_names {feature_names}',
    f'--model_file {wd}/ref_files/final_{model}_{split}_{window_count}_{combo}_overlapping_binary_lstm_windows.keras',
    '--predict',
    '--print_summary',
    f'--out_path {out_path}',
])

In [16]:
# Launch a job with your HPC
conda_env = 'python_3_9'

# The job scripts are written under ./swarm; create it so open() cannot fail
# on a fresh checkout.
os.makedirs('swarm', exist_ok=True)

# Script names include the model type (as the data-prep scripts do) so that
# 'del' and 'dup' runs for the same study/interval do not overwrite each other.
with open(f'swarm/ml_model_{model}_{study}_{interval}.sh', 'w') as f:
    f.write('#!/usr/bin/env bash\n\n')
    f.write('source /data/$USER/conda/etc/profile.d/conda.sh\n')
    f.write(f'conda activate {conda_env}\n')
    f.write(cmd)

# Swarm file with a single line that invokes the script above.
# (No explicit close needed: the with-block closes the file.)
with open(f'swarm/ml_model_{model}_{study}_{interval}.swarm', 'w') as f:
    f.write(f'bash swarm/ml_model_{model}_{study}_{interval}.sh')

swarm_cmd = f'swarm -f swarm/ml_model_{model}_{study}_{interval}.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/ml_model.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9


### App Prep & Local Functions

In [None]:
# Dedicated sub-folder for app-ready files, so they can be moved from the HPC
# to the local Streamlit app folder in one go.
app_dir = f'{out_dir}/app'
os.makedirs(app_dir, exist_ok=True)

# Common path prefix for the files written into that folder.
app_out = f'{app_dir}/GP2_{study}_{interval}'

In [24]:
# Command line for the app-prep script, assembled one flag per entry and
# joined with single spaces.
cmd = ' '.join([
    'python run_app_prep.py',
    f'--interval_name {interval}',
    f'--test_set_ids {out_path}_testing_IDs.csv',
    f'--test_set_windows {out_path}_samples_windows.csv',
    f'--test_set_results {out_path}_{window_count}_windows_results.csv',
    f'--cpus {cpus}',
    f'--out_path {app_out}',
    '--make_app_ready',
])

In [25]:
# Launch a job with your HPC
conda_env = 'python_3_9'

# The job scripts are written under ./swarm; create it so open() cannot fail
# on a fresh checkout.
os.makedirs('swarm', exist_ok=True)

# Script names include the model type (as the data-prep scripts do) so that
# 'del' and 'dup' runs for the same study/interval do not overwrite each other.
with open(f'swarm/app_prep_{model}_{study}_{interval}.sh', 'w') as f:
    f.write('#!/usr/bin/env bash\n\n')
    f.write('source /data/$USER/conda/etc/profile.d/conda.sh\n')
    f.write(f'conda activate {conda_env}\n')
    f.write(cmd)

# Swarm file with a single line that invokes the script above.
# (No explicit close needed: the with-block closes the file.)
with open(f'swarm/app_prep_{model}_{study}_{interval}.swarm', 'w') as f:
    f.write(f'bash swarm/app_prep_{model}_{study}_{interval}.sh')

swarm_cmd = f'swarm -f swarm/app_prep_{model}_{study}_{interval}.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/app_prep.swarm --g 200 --time=2:00:00 --logdir swarm/logs --module python/3.9


In [None]:
# Copy files to local for app
! scp {app_dir} CNV_app/data/{model}_final_model

In [None]:
# Run app code: start Streamlit on the app's Home.py script
# (path is relative to the notebook's current working directory).
! streamlit run CNV_app/Home.py