# Analysis 19: Semantic Similarity for Candidate Genes

In [None]:

library(dplyr)



Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ data.table::between() masks dplyr::between()
✖ dplyr::filter()       masks stats::filter()
✖ data.table::first()   masks dplyr::first()
✖ lubridate::hour()     masks data.table::hour()
✖ lubridate::isoweek()  masks data.table::isoweek()
✖ dplyr::lag()          masks stats::lag()
✖ data.table::last()    masks dplyr::last()
✖ lubridate::mday()     masks data.table::mday()
✖ lubridate::minute()   masks data.table::minute()
✖ lubridate::month()    masks data.table::month()
✖ lubridate::quarter()  masks data.table::quarter()
✖ lubridate::second()   masks data.table::second()
✖ purrr::transpose()    masks data.table::transpose()
✖ lubridate::wday()     masks data.table::wday()
✖ lubridate::week()     masks data.table::week()
✖ lubridate::yday()     masks data.table::yday()
✖ lubridate::year()     masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

preparing gene to GO mapping data...
preparing IC data...

# Inputs

In [None]:
# Location of the candidate gene table (gzipped TSV) that the
# `finemapping_curation` script writes out; read in the Main section.
candidate_out_name <- "data/processed/interval_genes/candidate_genes/inbred_candidate_genes.tsv.gz"


# Outputs

In [None]:

# Where the semantic similarity results are stored: directory + file name
out_dir <- "data/processed/semantic_similarity"
out_fn <- "candidate_semSim.rds"

# Create the output directory (including any missing parents) on first use
out_dir_missing <- !dir.exists(out_dir)
if (out_dir_missing) {
  dir.create(out_dir, recursive = TRUE)
}


# Functions

In [None]:

#' Prepare candidate gene clusters for semantic similarity analysis
#'
#' For every unique value of `trait`, collects the unique `WBGeneID` values
#' of that trait and converts them to Entrez IDs with
#' `clusterProfiler::bitr()` (from key type "ENSEMBL" to "ENTREZID" in
#' `org.Ce.eg.db`).
#'
#' @param candidate_df A data frame with candidate gene data; must contain
#'   the columns `trait` and `WBGeneID`.
#' @return A named list with one element per unique trait. Each element is
#'   the vector of Entrez IDs returned by `bitr()` for that trait. An empty
#'   list is returned when `candidate_df` contains no traits.
prepare_trait_clusters <- function(candidate_df) {
  # Start from a real list rather than NULL. `[[<-` on NULL can produce an
  # atomic vector when the first value has length one (R-version dependent),
  # which makes the next multi-gene assignment fail. A list is always safe.
  trait_clusters <- list()

  # Get unique traits
  traits <- unique(candidate_df$trait)

  for (trait_id in traits) {
    # Unique gene IDs annotated to this trait
    genes <- candidate_df %>%
      dplyr::filter(trait == trait_id) %>%
      dplyr::pull(WBGeneID) %>%
      unique()

    # Convert WBGeneID to Entrez IDs; genes without a mapping are dropped
    # by bitr(), so a cluster can be shorter than `genes`.
    genes_entrez <- clusterProfiler::bitr(
      geneID = genes,
      fromType = "ENSEMBL",
      toType = "ENTREZID",
      OrgDb = "org.Ce.eg.db"
    ) %>%
      dplyr::pull(ENTREZID)

    # Store the cluster under the trait name
    trait_clusters[[trait_id]] <- genes_entrez
  }

  return(trait_clusters)
}

#' Calculate semantic similarity between gene clusters
#'
#' Builds the GO annotation data for `org.Ce.eg.db` (Biological Process
#' ontology) and scores every pair of gene clusters with
#' `GOSemSim::mclusterSim()` using the "Wang" measure and the "BMA"
#' combination strategy.
#'
#' @param gene_clusters List of gene clusters by trait (as produced by
#'   `prepare_trait_clusters()`): each element holds the Entrez IDs of one
#'   cluster.
#' @return Matrix of semantic similarity scores between clusters, as returned
#'   by `GOSemSim::mclusterSim()`.
calculate_semantic_similarity <- function(gene_clusters) {
  # Create GO data object. The "Wang" measure is graph-based and does not
  # use information-content (IC) values, so skip the IC computation that
  # godata() performs by default; the similarity scores are unaffected.
  cEGO <- GOSemSim::godata("org.Ce.eg.db", ont = "BP", computeIC = FALSE)

  # Calculate pairwise cluster similarity
  semSim <- GOSemSim::mclusterSim(
    gene_clusters,
    semData = cEGO,
    measure = "Wang",
    combine = "BMA"
  )

  return(semSim)
}

#' Generate summary statistics for GO term coverage
#'
#' For every unique `trait`, counts the unique `WBGeneID` values and how many
#' of them can be converted to an Entrez ID with `clusterProfiler::bitr()`.
#' Note that the statistic is the Entrez mapping rate of the candidate genes
#' (the prerequisite for GO lookup), not a count of GO terms.
#'
#' @param candidate_df A data frame with candidate gene data; must contain
#'   the columns `trait` and `WBGeneID`.
#' @return A data frame with one row per trait (row names are the trait
#'   names) and the columns `trait`, `total_genes`, `genes_with_entrez` and
#'   `proportion_with_entrez`. A zero-row data frame with the same columns is
#'   returned when `candidate_df` contains no traits.
generate_go_coverage_stats <- function(candidate_df) {
  # Get unique traits
  traits <- unique(candidate_df$trait)

  # Nothing to summarise: return an empty table with the usual columns
  # instead of the NULL that rbind-ing an empty list would give.
  if (length(traits) == 0) {
    return(data.frame(
      trait = character(0),
      total_genes = integer(0),
      genes_with_entrez = integer(0),
      proportion_with_entrez = numeric(0)
    ))
  }

  # Initialize results list
  stats <- list()

  for (trait_id in traits) {
    # Unique gene IDs annotated to this trait
    genes <- candidate_df %>%
      dplyr::filter(trait == trait_id) %>%
      dplyr::pull(WBGeneID) %>%
      unique()

    # Convert to Entrez IDs. The underlying lookup raises an error when not
    # a single key is valid; for a coverage report that case is simply
    # "0 genes mapped", so translate exactly that error into an empty
    # mapping and re-raise anything else unchanged.
    mapping <- tryCatch(
      clusterProfiler::bitr(
        geneID = genes,
        fromType = "ENSEMBL",
        toType = "ENTREZID",
        OrgDb = "org.Ce.eg.db"
      ),
      error = function(e) {
        no_valid_keys <- grepl(
          "None of the keys entered are valid keys",
          conditionMessage(e),
          fixed = TRUE
        )
        if (!no_valid_keys) {
          stop(e)
        }
        warning(
          "No gene of trait '", trait_id, "' could be mapped to an Entrez ID",
          call. = FALSE
        )
        data.frame(ENSEMBL = character(0), ENTREZID = character(0))
      }
    )

    # Count distinct input genes that received a mapping. Counting rows
    # would over-count genes that map to several Entrez IDs and could push
    # the proportion above 1.
    n_mapped <- length(unique(mapping[["ENSEMBL"]]))

    # Calculate statistics
    stats[[trait_id]] <- data.frame(
      trait = trait_id,
      total_genes = length(genes),
      genes_with_entrez = n_mapped,
      proportion_with_entrez = n_mapped / length(genes)
    )
  }

  # Combine results; rbind() keeps the list names as row names
  stats_df <- do.call(rbind, stats)
  return(stats_df)
}


# Main

In [None]:

# Load the candidate gene table from the path configured under Inputs
candidate_raw <- data.table::fread(input = candidate_out_name)

# Per-trait summary of how many candidate genes map to an Entrez ID
coverage_stats <- generate_go_coverage_stats(candidate_df = candidate_raw)


'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns

[1] "GO term coverage statistics:"

                                                          trait total_genes
length_2_4_D                                       length_2_4_D           1
length_Aldicarb                                 length_Aldicarb           7
length_Arsenic_trioxide                 length_Arsenic_trioxide          15
length_Atrazine                                 length_Atrazine           3
length_Cadmium_dichloride             length_Cadmium_dichloride           3
length_Carbaryl                                 length_Carbaryl           3
length_Carboxin                                 length_Carboxin           1
length_Chlorothalonil                     length_Chlorothalonil          45
length_Mancozeb                                 length_Mancozeb           3
length_Methyl_mercury                     length_Methyl_mercury          13
length_Nickel_dichloride               length_Nickel_dichloride          14
length_Paraquat_62_5                       length_Paraquat_62_5           8
length_Propo

In [None]:

# Build one Entrez ID cluster per trait as input for the similarity scoring
candidate_gene_clusters <- prepare_trait_clusters(candidate_df = candidate_raw)


'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns

Candidate genes semantic similarity saved to data/processed/semantic_similarity/candidate_semSim.rds