# Running CNV-Finder on Provided Example Data

* **Version:** Python/3.9.16
* **Status:** Complete
* **Last Updated:** 17-NOV-2024

### Notebook Description
The following is a short template for how researchers can use CNV-Finder to explore a large data release for highly probable CNV samples within specific cohorts and regions of interest. 

More customizable argument flags are available; they are either outlined in `docs/parameter_guide.md` or coming soon!

---

### Getting started

In [1]:
# Import necessary packages after pip installing requirements.txt
import sys
import os
import subprocess
import pandas as pd
import numpy as np

from modules.cnv_finder.data_methods import check_interval

In [2]:
# Method to launch jobs in HPC, can otherwise launch commands in terminal
def shell_do(command, log=False, return_log=False):
    """Run a command-line string as a subprocess and optionally surface its stdout.

    The command is echoed to stderr, then executed directly (no shell is
    spawned) with stdout captured.

    Args:
        command: Command line to run, as a single string. It is tokenized with
            shell-style quoting rules, so a quoted argument that contains
            spaces is passed to the program as one argument.
        log: If True, print the captured stdout.
        return_log: If True, return the captured stdout.

    Returns:
        The captured stdout decoded as UTF-8 when ``return_log`` is True,
        otherwise None.
    """
    # Local import keeps this cell self-contained with respect to the import cell.
    import shlex

    # Echo with whitespace collapsed so multi-line commands print on one line.
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    # shlex.split honours quotes; a plain str.split() would break
    # "--flag 'two words'" into separate arguments.
    res = subprocess.run(shlex.split(command), stdout=subprocess.PIPE)
    output = res.stdout.decode('utf-8')

    if log:
        print(output)
    if return_log:
        return output

In [3]:
# Define your working directory: the root from which the example_data/, ref_files/
# and testing/ paths used in the cells below are built
wd = '/YOUR/DIRECTORY'

### Path Set-Up

In [4]:
# Find studies in data release

# Load the master key for the example data
master_path = f'{wd}/example_data/test_master_key.csv'
master = pd.read_csv(master_path)

# Number of samples per study, followed by a preview of the first rows
study_counts = master['study'].value_counts()
print(study_counts)
master.head()

study
TEST1    10
TEST2    10
Name: count, dtype: int64


Unnamed: 0,study,IID,phenotype,age,sex,label
0,TEST1,Sample_1104,PD,70,2,EUR
1,TEST1,Sample_1112,PD,74,1,EUR
2,TEST1,Sample_1735,Control,63,1,EUR
3,TEST1,Sample_1539,AD,84,1,EUR
4,TEST1,Sample_2379,Control,76,2,MDE


In [5]:
# Check that interval exists in reference list (ref_files/glist_hg38_intervals.csv)

# Use 'PARK2' or 'PRKN' for this tutorial
interval = 'PARK2' # if interval name not recognized can provide chromosome and positions in base pairs

# Look up the chromosome and start/stop positions for the chosen interval
chrom, start, stop = check_interval(interval)
print(f'Chromosome: {chrom}, HG38 Start Position: {start}, HG38 Stop Position: {stop}')

Chromosome: 6, HG38 Start Position:, 161347557, HG38 Stop Position: 162727802


In [70]:
# Set up paths
study = 'TEST1' # 'all' for no specified cohort
model = 'dup' # 'del' or 'dup'

# For use with pre-trained models: model -> (split, window_count)
pretrained_settings = {'del': (5, 50), 'dup': (10, 70)}
if model not in pretrained_settings:
    # Fail here rather than continue with undefined (or stale) split/window_count
    raise ValueError(f"model must be 'del' or 'dup', got {model!r}")
split, window_count = pretrained_settings[model]

cpus = 8

# Number of samples passed on as the test-set size. Filtering on study == 'all'
# matches no rows in the master key, so count every sample in that case.
# NOTE(review): assumes 'all' means "use every sample in the master key" downstream - confirm
if study == 'all':
    size = len(master)
else:
    size = len(master[master.study == study])

# Output directory and file prefix for this model / study / interval combination
out_dir = f'{wd}/testing/{model}/{study}/{interval}'
out_path = f'{out_dir}/{study}_{interval}_{split}_{window_count}'

os.makedirs(out_dir, exist_ok=True)

### ML Data Prep

In [44]:
# Command line for the data prep step, assembled one flag per entry
data_prep_args = [
    'python modules/run_data_prep.py',
    f'--interval_name {interval}',
    f'--study_name {study}',
    f'--split_interval {split}',
    f'--total_windows {window_count}',
    f'--master_file {master_path}',
    f'--metrics_path {wd}/example_data/snp_metrics',
    f'--cpus {cpus}',
    f'--out_path {out_path}',
    '--create_testing',
    f'--test_size {size}',
]
cmd = ' '.join(data_prep_args)

In [45]:
# Launch a job with your HPC
conda_env = 'python_3_9' # holds packages from requirements.txt file
os.makedirs('swarm', exist_ok=True)

# Shared path stem for the wrapper script and the swarm file
job_stem = f'swarm/data_prep_{study}_{interval}_{model}'

# Wrapper script: activate the conda environment, then run the prepared command
script_lines = [
    '#!/usr/bin/env bash\n\n',
    'source /data/$USER/conda/etc/profile.d/conda.sh\n',
    f'conda activate {conda_env}\n',
    cmd,
]
with open(f'{job_stem}.sh', 'w') as script_file:
    script_file.writelines(script_lines)

# Swarm file: a single line that runs the wrapper script
with open(f'{job_stem}.swarm', 'w') as swarm_file:
    swarm_file.write(f'bash {job_stem}.sh')

swarm_cmd = f'swarm -f {job_stem}.swarm --g 25 --time=15:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/data_prep_TEST1_PARK2_dup.swarm --g 25 --time=15:00:00 --logdir swarm/logs --module python/3.9


### LSTM Model

In [56]:
# Parameters to run pre-saved models (can use prelim, updated, or final models from ref_files/models with same flags)

# Feature names passed to both the deletion and the duplication model
shared_features = 'std_baf std_mid_baf std_lrr iqr_baf iqr_mid_baf iqr_lrr avg_baf avg_mid_baf avg_lrr'

# model -> (dosage feature names, tag that appears in the saved model's file name)
lstm_settings = {
    'del': ('dosage_interval dosage_full del_dosage_full del_dosage_interval', 'combo4'),
    'dup': ('dosage_interval dup_dosage_full', 'combo6'),
}
if model not in lstm_settings:
    # Without this check `cmd` would keep the data prep command from the earlier
    # cell and the launch cell below would silently resubmit it.
    raise ValueError(f"model must be 'del' or 'dup', got {model!r}")
dosage_features, combo = lstm_settings[model]

# Single command template shared by both models, one flag per entry
cmd = ' '.join([
    'python modules/run_lstm_model.py',
    f'--test_file {out_path}_samples_windows.csv',
    f'--feature_names {dosage_features} {shared_features}',
    f'--model_file {wd}/ref_files/models/final_{model}_{split}_{window_count}_{combo}_lstm.keras',
    '--predict',
    '--print_summary',
    f'--out_path {out_path}',
])

In [57]:
# Launch a job with your HPC
conda_env = 'python_3_9'
os.makedirs('swarm', exist_ok=True)

# Shared path stem for the wrapper script and the swarm file
job_stem = f'swarm/ml_model_{study}_{interval}_{model}'

# Wrapper script: activate the conda environment, then run the prepared command
script_lines = [
    '#!/usr/bin/env bash\n\n',
    'source /data/$USER/conda/etc/profile.d/conda.sh\n',
    f'conda activate {conda_env}\n',
    cmd,
]
with open(f'{job_stem}.sh', 'w') as script_file:
    script_file.writelines(script_lines)

# Swarm file: a single line that runs the wrapper script
with open(f'{job_stem}.swarm', 'w') as swarm_file:
    swarm_file.write(f'bash {job_stem}.sh')

swarm_cmd = f'swarm -f {job_stem}.swarm --g 16 --time=1:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/ml_model_TEST1_PARK2_dup.swarm --g 16 --time=1:00:00 --logdir swarm/logs --module python/3.9


### App Prep & Local Functions

In [71]:
# Easier transfer from HPC to local app folder for Streamlit visuals

# Combine all app-ready files into cohort-specific folders
app_dir_parts = [wd, 'testing', 'app_ready', study, f'final_{model}_model', interval]
app_dir = '/'.join(app_dir_parts)

# File prefix for the app-ready outputs inside that folder
app_out = '/'.join([app_dir, f'{study}_{interval}'])

os.makedirs(app_dir, exist_ok=True)

In [72]:
# Command line for the app prep step, assembled one flag per entry
app_prep_args = [
    'python modules/run_app_prep.py',
    f'--interval_name {interval}',
    f'--test_set_ids {out_path}_testing_IDs.csv',
    f'--test_set_windows {out_path}_samples_windows.csv',
    f'--test_set_results {out_path}_windows_results.csv',
    f'--cpus {cpus}',
    f'--out_path {app_out}',
    '--make_app_ready',
]
cmd = ' '.join(app_prep_args)

In [73]:
# Launch a job with your HPC
conda_env = 'python_3_9'
os.makedirs('swarm', exist_ok=True)

# Shared path stem for the wrapper script and the swarm file
job_stem = f'swarm/app_prep_{study}_{interval}_{model}'

# Wrapper script: activate the conda environment, then run the prepared command
script_lines = [
    '#!/usr/bin/env bash\n\n',
    'source /data/$USER/conda/etc/profile.d/conda.sh\n',
    f'conda activate {conda_env}\n',
    cmd,
]
with open(f'{job_stem}.sh', 'w') as script_file:
    script_file.writelines(script_lines)

# Swarm file: a single line that runs the wrapper script
with open(f'{job_stem}.swarm', 'w') as swarm_file:
    swarm_file.write(f'bash {job_stem}.sh')

swarm_cmd = f'swarm -f {job_stem}.swarm --g 25 --time=2:00:00 --logdir swarm/logs --module python/3.9'
shell_do(swarm_cmd)

Executing: swarm -f swarm/app_prep_TEST1_PARK2_dup.swarm --g 25 --time=2:00:00 --logdir swarm/logs --module python/3.9


In [None]:
# Launch the Streamlit app. The leading `!` runs this line as a shell command from
# the notebook; the same command without the `!` can be run from your terminal instead.
! streamlit run app/Home.py

---

### Good job!
If you generated predictions for _PRKN_, you should notice that the TEST1 cohort has 5 samples with deletions while TEST2 has 5 samples with duplications in this gene region.