# Analysis 12: Rank Regions

In [None]:
library(tidyverse)


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Inputs

-   `ns_folder` - path to directory containing the NemaScan output
-   `tox_data_file` - path to toxicant metadata file generated by `analysis/organize_tox_metadata.qmd`
-   `qtl_database_file` - path to the CSV file containing the manually curated QTL database
-   `{inbred/loco}_overlap_dir` - path to the directory containing the HDR overlaps for the detected intervals, generated by running `code/hdr_overlaps/DivergentRegion_Overlaps.R`

In [None]:

# Path to the NemaScan output directory
ns_folder <- "data/processed/20231116_Analysis_NemaScan"
# Toxicant metadata CSV
tox_data_file <- "data/processed/tox_data/con_metadata.csv"
# Manually curated QTL database CSV
qtl_database_file <- "data/raw/qtl_database/20230710_2021 QTL_Database_review.csv"
# Directory holding the overlap tables for the inbred mappings
inbred_overlap_dir <- "data/processed/HDR_overlap/All_QTL_HD_overlap/inbred"


# Outputs

In [None]:

# Root folder for the QTL ranking outputs
data_out_dir <- glue::glue("data/processed/qtl_ranking/")

# Make sure the output root and its `inbred` subfolder exist; existing
# folders are left alone and no warning is raised for them.
purrr::walk(
  list(data_out_dir, glue::glue("{data_out_dir}/inbred")),
  function(out_dir) {
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  }
)


# Functions

In [None]:
# Read a single overlap table and discard its `.id` column.
#
# file: path to a delimited text file readable by data.table::fread(); it
#   must contain a column named `.id`.
# Returns the table as read by fread(), without the `.id` column.
read_overlaps <- function(file) {
  raw_overlaps <- data.table::fread(file)
  dplyr::select(raw_overlaps, -`.id`)
}

# Load every overlap table found in a directory and stack them into one table.
#
# overlap_dir: directory searched (non-recursively) for files whose names end
#   in "HD_region_overlap.tsv".
# Returns a single table containing the rows of all matching files, each read
#   with read_overlaps(), in list.files() order.
# Stops with an error when no matching file exists or when the combined table
#   has no rows.
process_overlaps <- function(overlap_dir) {
  overlap_files <- list.files(
    overlap_dir,
    pattern = "HD_region_overlap.tsv$",
    full.names = TRUE
  )

  # Fail before reading anything when the directory has no overlap files.
  if (length(overlap_files) == 0) {
    stop(glue::glue("No overlaps data loaded for {overlap_dir}"))
  }

  # rbindlist() stacks all tables in a single pass instead of re-copying the
  # accumulated result once per file, as a pairwise Reduce(rbind, .) does.
  # use.names = TRUE keeps rbind's behaviour of matching columns by name.
  overlap_data <- data.table::rbindlist(
    purrr::map(overlap_files, read_overlaps),
    use.names = TRUE
  )

  if (nrow(overlap_data) == 0) {
    stop(glue::glue("No overlaps data loaded for {overlap_dir}"))
  }

  overlap_data
}

# Rank QTL by significance, fine-mapping LD structure, and overlap proportion.
#
# qtl_summary_df: data frame with the columns `log10p`,
#   `Fine Mapping LD Structure`, and `prop_alt_overlap`; `n_strains_alt` is
#   also required when `filter = TRUE`.
# filter: when TRUE (default), keep only rows with
#   `n_strains_alt >= min_alt_strains` before ranking (rows where
#   `n_strains_alt` is NA are dropped by the filter as well).
# min_alt_strains: smallest `n_strains_alt` a row may have and still be kept
#   when `filter = TRUE`. Defaults to 20.
# Returns the (optionally filtered) rows sorted by descending `log10p`, with
#   ties broken by descending `Fine Mapping LD Structure` and then descending
#   `prop_alt_overlap`, plus an integer `rank` column numbering the rows in
#   that order starting at 1.
#
# NOTE(review): descending order on a factor puts its last level first. With
# the levels c("<250kb", "250..500kb", ">500kb") assigned in the main section,
# ">500kb" sorts ahead of "<250kb" among ties on `log10p` — confirm that this
# is the intended priority.
rank_qtl <- function(qtl_summary_df, filter = TRUE, min_alt_strains = 20) {
  candidates <- qtl_summary_df

  if (filter) {
    # `.env$` makes sure the threshold argument is used even if the data
    # frame happens to contain a column with the same name.
    candidates <- dplyr::filter(
      candidates,
      n_strains_alt >= .env$min_alt_strains
    )
  }

  ordered <- dplyr::arrange(
    candidates,
    dplyr::desc(log10p),
    dplyr::desc(`Fine Mapping LD Structure`),
    dplyr::desc(prop_alt_overlap)
  )

  dplyr::mutate(ordered, rank = dplyr::row_number())
}


# Main

In [None]:

# Read the toxicant metadata table
con_metadata <- data.table::fread(tox_data_file)

# Per-trait lookup: the trait plus its drug label and class columns
con_key <- dplyr::select(
  con_metadata,
  trait, nice_drug_label2, big_class, moa_class
)

# Locate the aggregated inbred mapping files in the NemaScan output
all.mappings_inbred <- list.files(
  path = glue::glue("{ns_folder}/INBRED/Mapping/Processed"),
  pattern = "AGGREGATE_mapping_inbred",
  recursive = TRUE,
  full.names = TRUE
)

# Stop here with a clear message when nothing matched; otherwise the empty
# list reduces to NULL and the failure only surfaces in a later step.
if (length(all.mappings_inbred) == 0) {
  stop(glue::glue(
    "No inbred mapping files found under {ns_folder}/INBRED/Mapping/Processed"
  ))
}

# Read every mapping file and stack the results row-wise.
# NOTE(review): `combine.mappings` is not defined in the visible part of this
# file — confirm it is defined or sourced before this chunk runs.
inbred.mapping.results <- purrr::map(all.mappings_inbred, combine.mappings) %>%
  Reduce(rbind, .)
# Annotate the mapping results with the condition metadata
mapping.results.class <- inbred.mapping.results %>%
  dplyr::select(marker, trait, log10p, BETA, AF1, var.exp, algorithm) %>%
  # drop every trait whose name contains "CV_"
  dplyr::filter(stringr::str_detect(trait, "CV_", negate = TRUE)) %>%
  dplyr::distinct() %>%
  # attach the drug label and class columns for each trait
  dplyr::left_join(con_key, by = "trait")


# Read the curated QTL database; its `Algorithm` column becomes a lower-case
# `algorithm` column
qtl_database <- data.table::fread(qtl_database_file) %>%
  dplyr::rename(algorithm = Algorithm) %>%
  dplyr::mutate(algorithm = stringr::str_to_lower(algorithm))

# Load and combine the overlap tables for the inbred mappings
inbred_overlaps <- process_overlaps(inbred_overlap_dir)

# Attach the QTL database annotations to each mapping result, matching on
# trait, marker, and (lower-cased) algorithm. Both tables carry `log10p`, so
# the join suffixes them; only the mapping-result value is kept.
mapping_anno <- mapping.results.class %>%
  dplyr::mutate(algorithm = stringr::str_to_lower(algorithm)) %>%
  dplyr::left_join(
    qtl_database,
    by = c("trait" = "Trait", "marker" = "ID", "algorithm" = "algorithm")
  ) %>%
  dplyr::rename(log10p = log10p.x) %>%
  dplyr::select(-log10p.y)

# Add the overlap columns to the annotated QTL and turn the LD structure
# column into a factor with a fixed level order
allQTL.anno.overlap <- mapping_anno %>%
  dplyr::left_join(
    inbred_overlaps,
    by = c(
      "trait" = "trait_name",
      "marker" = "peak_marker",
      "algorithm" = "algorithm"
    )
  ) %>%
  dplyr::mutate(
    `Fine Mapping LD Structure` = factor(
      `Fine Mapping LD Structure`,
      levels = c("<250kb", "250..500kb", ">500kb")
    )
  )


# Rank the inbred QTL with the ALT strain filter applied.
# The `con_key` columns (nice_drug_label2, big_class, moa_class) are already
# present because `con_key` was joined by trait when `mapping.results.class`
# was built, so they are not joined again here: a second join would only add
# `.x`/`.y`-suffixed duplicates of those columns.
toxin_length_inbred_ranked_top <- allQTL.anno.overlap %>%
  dplyr::filter(algorithm == "inbred") %>%
  rank_qtl(filter = TRUE)

# Keep only the columns reported in the inbred QTL ranking table
inbred_ranked_df <- dplyr::select(
  toxin_length_inbred_ranked_top,
  trait, marker, log10p,
  `Fine Mapping LD Structure`, `LD between QTL`,
  n_strains_ref, n_strains_alt, prop_alt_overlap, total_strains,
  rank
)

# Write the ranking as a tab-separated file in the processed data folder
inbred_ranked_df %>%
  data.table::fwrite(
    file = glue::glue("{data_out_dir}/inbred/ranked_qtl.tsv"),
    sep = "\t"
  )
