# Analysis 18: Semantic Similarity for All Genes

In [None]:

library(dplyr)



Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

here() starts at /Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript

$strain_table
$strain_table$html
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S1/table_S1.html"

$strain_table$tsv
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S1/table_S1.tsv.zip"

$strain_table$docx
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S1/table_S1.docx"

$strain_table$csv
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S1/table_S1.csv"


$tox_table_ft
$tox_table_ft$html
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S2/table_S2.html"

$tox_table_ft$tsv
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S2/table_S2.tsv.zip"

$tox_table_ft$docx
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S2/table_S2.docx"

$tox_table_ft$csv
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S2/table_S2.csv"


$trait_table
$trait_table$html
[1] "/Users/ryanmckeown/Desktop/ToxinGWAS_Manuscript/tables/table_S3/table_S3.html"

$trait_table$tsv
[1] "


Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows


Attaching package: 'flextable'

The following objects are masked from 'package:kableExtra':

    as_image, footnote

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ data.table::between()    masks dplyr::between()
✖ purrr::compose()         masks flextable::compose()
✖ dplyr::filter()          masks stats::filter()
✖ data.table::first()      masks dplyr::first()
✖ kableExtra::group_rows() masks dplyr::group_rows()
✖ lubridate::hour()        masks data.table::hour()
✖ lubridate::isoweek()     masks data.table::isoweek()
✖ dplyr::lag()             masks stats::lag()
✖ data.table::last()       masks dplyr::last()
✖ lubridate::mday()        masks data.table::mday()
✖ lubridate::minute()      masks data.table::minute()
✖ lubridate::month()       masks data.table::month()
✖ lubridate::quarter()     masks data.table::quarter()
✖ lubridate::second()      masks data.table::second()
✖ purrr::transpose()       masks data.table::transpose()
✖ lubridate::wday()        masks data.table::wday()
✖ lubridate::week()        masks data.table::week()
✖ lubridate::yday()        masks dat

preparing gene to GO mapping data...
preparing IC data...

# Inputs

In [None]:
# Input locations, given as relative paths.
# NOTE(review): presumably resolved against the project root - confirm the
# working directory used when this notebook is rendered.
# path to NS output directory; the inbred QTL peaks table
# (INBRED/Mapping/Processed/QTL_peaks_inbred.tsv) is read from beneath it
ns_dir <- "data/processed/20231116_Analysis_NemaScan"
# tox metadata CSV, loaded with data.table::fread() in the main section
tox_file <- "data/processed/tox_data/tox_metadata.csv"


# Outputs

In [None]:
# Path to the output directory; it should have the same name as the directory
# that contains this script.
out_dir <- "data/processed/semantic_similarity"
# File name of the RDS output written inside `out_dir`.
out_fn <- "all_genes_semSim.rds"

# Make sure the output directory exists, creating it (and any missing parents)
# when needed.
if (dir.exists(out_dir)) {
  message(glue("Output directory '{out_dir}' already exists."))
} else {
  message(glue("Output directory '{out_dir}' does not exist. Creating it..."))
  # dir.create() reports failure through its return value (plus a warning),
  # so check it instead of announcing success unconditionally.
  created <- dir.create(out_dir, recursive = TRUE)
  # Re-check existence so a directory created concurrently by another process
  # is not treated as a failure.
  if (!isTRUE(created) && !dir.exists(out_dir)) {
    stop(
      glue("Failed to create output directory '{out_dir}'."),
      call. = FALSE
    )
  }
  message(glue("Output directory '{out_dir}' created successfully."))
}


Output directory 'data/processed/semantic_similarity' already exists.

# Functions

In [None]:
# Gather the genes found across all QTL intervals of a single trait.
#
# trait_id: identifier of the trait to process; it is forwarded unchanged to
#           get_genes_all_intervals().
#
# `inbred`, `gff` and `cEGO` are not arguments: they are looked up in the
# enclosing environment at call time, so they must exist before this runs.
#
# Returns whatever get_genes_all_intervals() returns for that trait.
process_trait <- function(trait_id) {
  message(glue("Processing trait: {trait_id}"))
  trait_genes <- get_genes_all_intervals(
    qtl = inbred, trait_id = trait_id, gff_df = gff, cEGO = cEGO
  )
  trait_genes
}


# Main

In [None]:

## Load the tox data
# NOTE(review): `tox` is not referenced again in the visible part of this
# cell; presumably it is used further down - confirm before removing.
tox <- data.table::fread(tox_file)

# Load the dataframe with the inbred interval data
# [ ] - will need a way to connect the trait/drug to the tox metadata
inbred <- data.table::fread(
  glue::glue("{ns_dir}/INBRED/Mapping/Processed/QTL_peaks_inbred.tsv"),
  # Return a plain data.frame. Spelled out as FALSE because `F` is an ordinary
  # variable that can be reassigned.
  data.table = FALSE
) %>%
  # Fix drug names: strip the leading "length_" from the trait name
  dplyr::mutate(
    drug = stringr::str_replace(
      trait,
      pattern = "^length_",
      replacement = ""
    )
  ) %>%
  # Remove the CV_ traits (any drug name containing "CV_")
  dplyr::filter(
    !stringr::str_detect(drug, "CV_")
  ) %>%
  # Keep only the columns needed to describe each interval
  dplyr::select(
    trait,
    drug,
    CHROM,
    marker,
    startPOS,
    endPOS
  ) %>%
  # Create an id (trait + marker) to be used in list
  dplyr::mutate(
    qtl.id = paste(trait, marker, sep = "_")
  )

# Extract the unique values of the 'trait' column
unique_trait_ids <- inbred %>%
  dplyr::pull(trait) %>%
  unique()

# Get number of cores (leave one core free for system).
# detectCores() can return NA when the count cannot be determined and 1 on a
# single-core machine; clamp so the worker count is never NA or 0.
n_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)

# Process all traits in parallel
message(glue("Processing {length(unique_trait_ids)} traits using {n_cores} cores..."))


Processing 16 traits using 7 cores...

All genes semantic similarity saved to data/processed/semantic_similarity/all_genes_semSim.rds