# Running CNV-Finder on Provided Example Data

* **Version:** Python/3.9.16
* **Status:** Complete
* **Last Updated:** 02-JUN-2025

### Notebook Description
The following is a short template for how researchers can use CNV-Finder to explore a large data release for highly probable CNV samples within specific cohorts and regions of interest. 

Additional customizable argument flags are available and outlined in `docs/parameter_guide.md`, with more coming soon!

---

### Getting started

In [None]:
! pip install -r requirements.txt

In [1]:
# Import necessary packages after pip installing requirements.txt
import sys
import os
import subprocess
import pandas as pd
import numpy as np

from modules.cnv_finder.data_methods import check_interval

In [None]:
# Working directory used to build the paths below - replace the placeholder with your own
wd = "/YOUR/DIRECTORY"

### Path Set-Up

In [3]:
# Load the master key and list the studies found in the data release

master_path = '{}/example_data/test_master_key.csv'.format(wd)
master = pd.read_csv(master_path, low_memory=False)

# Number of samples per study, followed by a preview of the first rows
print(master['study'].value_counts())
master.head()

study
TEST1    10
TEST2    10
Name: count, dtype: int64


Unnamed: 0,study,IID,phenotype,age,sex,label
0,TEST1,2231_1069,PD,75,1,EUR
1,TEST1,2231_1220,PD,67,2,EUR
2,TEST1,2231_1170,Control,60,2,EUR
3,TEST1,2231_2180,AD,80,2,EUR
4,TEST1,2231_1431,Control,76,1,MDE


In [4]:
# Check that interval exists in reference list (ref_files/glist_hg38_intervals.csv)

# Use 'PARK2' or 'PRKN' for this tutorial.
# If the interval name is not recognized, you can provide the chromosome and positions in base pairs.
interval = 'PARK2'
chrom, start, stop = check_interval(interval)
# Report the resolved coordinates (stray comma after "Start Position:" removed from the message)
print(f'Chromosome: {chrom}, HG38 Start Position: {start}, HG38 Stop Position: {stop}')

Chromosome: 6, HG38 Start Position:, 161347557, HG38 Stop Position: 162727802


In [5]:
# Metadata for the chromosome of interest; it holds the values that repeat per SNP ID
metadata_path = '{root}/ref_files/NBA_metadata/CHROM={chrom}'.format(root=wd, chrom=chrom)
metadata = pd.read_parquet(metadata_path)
display(metadata)

Unnamed: 0,snpID,POS,GenTrain_Score
0,JHU_6.149608,149609,0.521508
1,JHU_6.149635,149636,0.811666
2,JHU_6.149687,149688,0.723705
3,JHU_6.150278,150279,0.887905
4,JHU_6.151281,151282,0.768985
...,...,...,...
130707,JHU_6.170909504,170600417,0.844019
130708,JHU_6.170916939,170607852,0.711168
130709,exm2266360,170610382,0.551874
130710,JHU_6.170982071,170672984,0.661204


In [6]:
# Inspect the SNP metrics of a single sample

sample = '2231_1069'
# The barcode is the part of the sample ID that precedes the first underscore
barcode = sample.partition('_')[0]

metrics_path = '{}/example_data/snp_metrics/{}/{}/chromosome={}'.format(wd, barcode, sample, chrom)
metrics_df = pd.read_parquet(metrics_path)
display(metrics_df)

Unnamed: 0,snpID,BAF,LRR
0,JHU_6.149608,0.453585,0.192400
1,JHU_6.149635,1.026652,0.017977
2,JHU_6.149687,0.949016,-0.049884
3,JHU_6.150278,0.983779,-0.025711
4,JHU_6.151281,0.002548,0.158113
...,...,...,...
128613,JHU_6.170909504,0.889003,-0.088134
128614,JHU_6.170916939,-0.040370,0.083401
128615,exm2266360,1.015652,0.023781
128616,JHU_6.170982071,-0.063196,0.168623


In [7]:
# Set up paths
study = 'TEST2' # 'all' for no specified cohort
model = 'del' # 'del' or 'dup'

# Interval split and window count for use with the pre-trained models
model_presets = {
    'del': {'split': 5, 'window_count': 50},
    'dup': {'split': 10, 'window_count': 70},
}
if model not in model_presets:
    # Fail here with a clear message instead of a NameError on `split` further down
    raise ValueError(f"model must be 'del' or 'dup', got {model!r}")
split = model_presets[model]['split']
window_count = model_presets[model]['window_count']

cpus = 8
# Number of samples passed as --test_size: every row of the master key when no
# cohort is specified ('all'), otherwise only the rows of the chosen study.
# Filtering on study == 'all' would match no rows and yield a size of 0.
size = len(master) if study == 'all' else len(master[master.study == study])
out_dir = f'{wd}/testing/{model}/{study}/{interval}'
out_path = f'{out_dir}/{study}_{interval}_{split}_{window_count}'

# Create the output folder up front; exist_ok keeps the cell re-runnable
os.makedirs(out_dir, exist_ok=True)

### ML Data Prep

In [None]:
# Assemble the data-prep command, one argument per entry, joined by single spaces

cmd = ' '.join([
    f'python {wd}/modules/run_data_prep.py',
    f'--interval_name {interval}',
    f'--study_name {study}',
    f'--split_interval {split}',
    f'--total_windows {window_count}',
    f'--master_file {master_path}',
    f'--metrics_path {wd}/example_data/snp_metrics',
    f'--cpus {cpus}',
    f'--out_path {out_path}',
    '--create_testing',
    f'--test_size {size}',
])

In [None]:
# Launch the data-prep command held in `cmd` through the shell
# Note: this implementation creates a tmp file directory
! {cmd}

### LSTM Model

In [10]:
# Parameters to run pre-saved models (can use prelim, updated, or final models from ref_files/models with same flags)

# Summary-statistic features used by both the deletion and duplication models
shared_features = [
    'std_baf', 'std_mid_baf', 'std_lrr',
    'iqr_baf', 'iqr_mid_baf', 'iqr_lrr',
    'avg_baf', 'avg_mid_baf', 'avg_lrr',
]

# Per-model settings: (tag in the model file name, dosage features placed before the shared ones)
lstm_presets = {
    'del': ('combo4', ['dosage_interval', 'dosage_full', 'del_dosage_full', 'del_dosage_interval']),
    'dup': ('combo6', ['dosage_interval', 'dup_dosage_full']),
}

if model not in lstm_presets:
    # Without this guard `cmd` would keep its previous value and the launch
    # cell below would silently re-run the earlier command.
    raise ValueError(f"model must be 'del' or 'dup', got {model!r}")

combo, dosage_features = lstm_presets[model]
feature_names = ' '.join(dosage_features + shared_features)

# Single command template shared by both model types
cmd = ' '.join([
    f'python {wd}/modules/run_lstm_model.py',
    f'--test_file {out_path}_samples_windows.csv',
    f'--feature_names {feature_names}',
    f'--model_file {wd}/ref_files/models/final_{model}_{split}_{window_count}_{combo}_lstm.keras',
    '--predict',
    '--print_summary',
    f'--out_path {out_path}',
])

In [None]:
# Launch the command currently held in `cmd` (built in the cell above) through the shell
! {cmd}

### App Prep

In [12]:
# Gather all app-ready files into cohort-specific folders, which makes the
# transfer from HPC to the local app folder for Streamlit visuals easier
app_dir = '{}/testing/app_ready/{}/final_{}_model/{}'.format(wd, study, model, interval)
app_out = '{}/{}_{}'.format(app_dir, study, interval)

# Ensure the destination folder exists (safe to re-run)
os.makedirs(app_dir, exist_ok=True)

In [13]:
# Build the app-prep command. The script is addressed through {wd}, like the
# other pipeline commands, so it does not depend on the current directory.
cmd = f'python {wd}/modules/run_app_prep.py \
--interval_name {interval} \
--test_set_ids {out_path}_testing_IDs.csv \
--test_set_windows {out_path}_samples_windows.csv \
--test_set_results {out_path}_windows_results.csv \
--cpus {cpus} \
--out_path {app_out} \
--make_app_ready'

In [14]:
# Launch the app-prep command held in `cmd` through the shell
! {cmd}

In [None]:
# Launch the Streamlit app. The script path is relative, so the current directory must contain app/Home.py.
# The same command (without the leading '!') can be run from your terminal instead.
! streamlit run app/Home.py

---

### Good job!
If you generated predictions for _PRKN_, you should notice that the TEST1 cohort has 5 samples with deletions while TEST2 has 5 samples with duplications in this gene region.