# Analysis 04: Generate Mapping Inputs

# Setup

In [None]:

library(data.table)
library(dplyr)



Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'purrr'

The following object is masked from 'package:data.table':

    transpose


Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows


Attaching package: 'flextable'

The following objects are masked from 'package:kableExtra':

    as_image, footnote

The following object is masked from 'package:purrr':

    compose

# Overview

The cleaned phenotype data is stored in the data cleaning [GitHub repo](https://github.com/AndersenLab/2021_GWA_data_cleaning/tree/main/data/processed). From the cleaned data, we generate mapping inputs for NemaScan.

This script will pull the trait files from the data cleaning repo for the toxicants analyzed in this study, and format them for NemaScan.

The input is a tab-separated file with one row per strain. The first column holds the strain names, and each subsequent column holds the values of one trait to be mapped.

# Inputs

In [None]:

# Path to toxicant metadata file (generated by organize_tox_metadata.qmd)
# Relative path; it is read with data.table::fread() to decide which traits to download
con_metadata_fn <- "data/processed/tox_data/con_metadata.csv"

# GitHub repo details for downloading trait files
# These three values are interpolated into raw.githubusercontent.com URLs as
# <repo>/<branch>/<path>/<filename>
github_repo <- "AndersenLab/2021_GWA_data_cleaning"
github_branch <- "main"
github_traitfiles_path <- "data/processed/traitfiles"

# Expected date prefix in trait file names
# The same prefix is also used to build the name of the cleaned .Rdata file
date_prefix <- "20230322"

# Output path for generated trait file
# Relative path of the aggregated strain-by-trait table
output_fn <- "data/processed/aggregated_toxicant_traits.tsv"


# Load cleaned data

Download the cleaned data stored in the `traitfiles` folder of the data cleaning repo. The trait files are named according to the following convention: `<date>_<trait_name>_traitfile.tsv` (e.g., `20230322_2_4_D_traitfile.tsv` or `20230322_Aldicarb_traitfile.tsv`). Each file has three columns: `strain`, `<trait>_length`, and `<trait>_CV_length`.

The folder also contains data for other chemicals that are not included here, so we download only the files for the toxicants listed in the condition metadata.

In [None]:

# Read the condition metadata; it determines which toxicant traits are fetched
con_metadata <- data.table::fread(con_metadata_fn)

# Build one row per distinct (trait, toxicant) pair and derive the file name
# each trait is expected to have in the data cleaning repo
trait_info <- dplyr::mutate(
  dplyr::distinct(dplyr::select(con_metadata, trait, toxicant)),
  # Drop the leading "length_" so only the name used in file names remains
  trait_filename = stringr::str_replace(trait, "^length_", ""),
  # A trailing "<digits>_<digits>" encodes a decimal dose, so restore the period
  # (e.g., "Paraquat_62_5" -> "Paraquat_62.5"). Names such as "2_4_D" or
  # "Paraquat_250" do not end in two digit groups and are left untouched.
  trait_filename = stringr::str_replace(trait_filename, "_(\\d+)_(\\d+)$", "_\\1.\\2"),
  # Assemble "<date>_<trait>_traitfile.tsv"
  filename = glue::glue("{date_prefix}_{trait_filename}_traitfile.tsv")
)

print(glue::glue("Found {nrow(trait_info)} toxicant traits to download"))


Found 26 toxicant traits to download

                              trait                 toxicant
                             <char>                   <char>
 1:                    length_2_4_D                    2,4-D
 2:                 length_Aldicarb                 Aldicarb
 3:         length_Arsenic_trioxide                  Arsenic
 4:                 length_Atrazine                 Atrazine
 5:       length_Cadmium_dichloride       Cadmium dichloride
 6:                 length_Carbaryl                 Carbaryl
 7:                 length_Carboxin                 Carboxin
 8:             length_Chlorfenapyr             Chlorfenapyr
 9:           length_Chlorothalonil           Chlorothalonil
10:             length_Chlorpyrifos             Chlorpyrifos
11:          length_Copper_chloride          Copper chloride
12:             length_Lead_nitrate        Lead (II) nitrate
13:                length_Malathion                Malathion
14:                 length_Mancozeb                 Mancozeb
15:                 leng

In [None]:

# Download a single trait file from GitHub and parse it.
#
# Arguments:
#   filename: trait file name, e.g. "20230322_Aldicarb_traitfile.tsv"
#   repo:     GitHub "<owner>/<repository>" slug
#   branch:   branch to read from
#   path:     folder inside the repo that holds the trait files
#
# Returns the parsed file as a data.table, or NULL (with a warning) when the
# download fails, whether the request itself errors (e.g. no connection) or
# the server answers with a status other than 200.
download_trait_file <- function(filename, repo = github_repo, branch = github_branch,
                                path = github_traitfiles_path) {
  # Construct raw GitHub URL
  raw_url <- glue::glue("https://raw.githubusercontent.com/{repo}/{branch}/{path}/{filename}")

  message(glue::glue("Downloading: {filename}"))

  # No credentials are passed to this call, so it relies on the file being
  # publicly readable. Transport errors are converted into the same
  # "warn and return NULL" outcome as an HTTP failure, so one unreachable file
  # does not abort a whole batch of downloads.
  response <- tryCatch(
    httr::GET(raw_url),
    error = function(e) {
      warning(glue::glue("  ✗ Failed to download {filename}: {conditionMessage(e)}"))
      NULL
    }
  )
  if (is.null(response)) {
    return(NULL)
  }

  # Anything other than 200 is treated as a failed download
  status <- httr::status_code(response)
  if (status != 200) {
    warning(glue::glue("  ✗ Failed to download {filename}: HTTP {status}"))
    return(NULL)
  }

  # Parse content as text and read as data.table
  content_text <- httr::content(response, "text", encoding = "UTF-8")
  df <- data.table::fread(text = content_text)

  message(glue::glue("  ✓ Successfully downloaded {filename} ({nrow(df)} strains)"))
  df
}

# Fetch every expected trait file, in the order listed in trait_info
trait_files_list <- purrr::map(.x = trait_info$filename, .f = download_trait_file)


Downloading: 20230322_2_4_D_traitfile.tsv

  ✓ Successfully downloaded 20230322_2_4_D_traitfile.tsv (192 strains)

Downloading: 20230322_Aldicarb_traitfile.tsv

  ✓ Successfully downloaded 20230322_Aldicarb_traitfile.tsv (194 strains)

Downloading: 20230322_Arsenic_trioxide_traitfile.tsv

  ✓ Successfully downloaded 20230322_Arsenic_trioxide_traitfile.tsv (194 strains)

Downloading: 20230322_Atrazine_traitfile.tsv

  ✓ Successfully downloaded 20230322_Atrazine_traitfile.tsv (195 strains)

Downloading: 20230322_Cadmium_dichloride_traitfile.tsv

  ✓ Successfully downloaded 20230322_Cadmium_dichloride_traitfile.tsv (190 strains)

Downloading: 20230322_Carbaryl_traitfile.tsv

  ✓ Successfully downloaded 20230322_Carbaryl_traitfile.tsv (194 strains)

Downloading: 20230322_Carboxin_traitfile.tsv

  ✓ Successfully downloaded 20230322_Carboxin_traitfile.tsv (193 strains)

Downloading: 20230322_Chlorfenapyr_traitfile.tsv

  ✓ Successfully downloaded 20230322_Chlorfenapyr_traitfile.tsv (152 strains)

Downloading: 20230322_Chlorothalonil_traitfile.tsv

  ✓ Successfully downloaded 20230322_Chlorothalonil_traitfile.tsv (194 strains)

Downloading: 20230322_Chlorpyrifos_traitfile.tsv

  ✓ Successfully downloaded 20230322_Chlorpyrifos_traitfile.tsv (173 strains)

Downloading: 20230322_Copper_chloride_traitfile.tsv

  ✓ Successfully downloaded 20230322_Copper_chloride_traitfile.tsv (194 strains)

Downloading: 20230322_Lead_nitrate_traitfile.tsv

  ✓ Successfully downloaded 20230322_Lead_nitrate_traitfile.tsv (194 strains)

Downloading: 20230322_Malathion_traitfile.tsv

  ✓ Successfully downloaded 20230322_Malathion_traitfile.tsv (194 strains)

Downloading: 20230322_Mancozeb_traitfile.tsv

  ✓ Successfully downloaded 20230322_Mancozeb_traitfile.tsv (194 strains)

Downloading: 20230322_Methomyl_traitfile.tsv

  ✓ Successfully downloaded 20230322_Methomyl_traitfile.tsv (143 strains)

Downloading: 20230322_Methyl_mercury_traitfile.tsv

  ✓ Successfully downloaded 20230322_Methyl_mercury_traitfile.tsv (193 strains)

Downloading: 20230322_Nickel_dichloride_traitfile.tsv

  ✓ Successfully downloaded 20230322_Nickel_dichloride_traitfile.tsv (193 strains)

Downloading: 20230322_Paraquat_62.5_traitfile.tsv

  ✓ Successfully downloaded 20230322_Paraquat_62.5_traitfile.tsv (155 strains)

Downloading: 20230322_Paraquat_250_traitfile.tsv

  ✓ Successfully downloaded 20230322_Paraquat_250_traitfile.tsv (192 strains)

Downloading: 20230322_Propoxur_traitfile.tsv

  ✓ Successfully downloaded 20230322_Propoxur_traitfile.tsv (194 strains)

Downloading: 20230322_Pyraclostrobin_traitfile.tsv

  ✓ Successfully downloaded 20230322_Pyraclostrobin_traitfile.tsv (194 strains)

Downloading: 20230322_Silver_nitrate_250_traitfile.tsv

  ✓ Successfully downloaded 20230322_Silver_nitrate_250_traitfile.tsv (186 strains)

Downloading: 20230322_Silver_nitrate_7.8_traitfile.tsv

  ✓ Successfully downloaded 20230322_Silver_nitrate_7.8_traitfile.tsv (194 strains)

Downloading: 20230322_Triphenyl_phosphate_6.25_traitfile.tsv

  ✓ Successfully downloaded 20230322_Triphenyl_phosphate_6.25_traitfile.tsv (175 strains)

Downloading: 20230322_Triphenyl_phosphate_50_traitfile.tsv

  ✓ Successfully downloaded 20230322_Triphenyl_phosphate_50_traitfile.tsv (191 strains)

Downloading: 20230322_Zinc_dichloride_traitfile.tsv

  ✓ Successfully downloaded 20230322_Zinc_dichloride_traitfile.tsv (174 strains)

Successfully downloaded 26 out of 26 trait files

# Aggregate trait data

Extract only the `<trait>_length` columns from each file and combine into a single wide-format dataframe where strains are rows and traits are columns.

In [None]:

# Extract the trait-length column from one trait file and name it after the trait.
#
# Arguments:
#   df:         one downloaded trait file; must contain a `strain` column
#   trait_name: name given to the extracted column (e.g. "length_Paraquat_62_5",
#               i.e. with the "length_" prefix and underscores instead of periods)
#
# Returns `df` reduced to `strain` plus one column named `trait_name`, or NULL
# (with a warning) when no length column can be identified.
process_trait_file <- function(df, trait_name) {
  cols <- names(df)

  # Length columns end in "_length". The CV column ends the same way
  # ("<trait>_CV_length"), so checking only for a leading "CV_" does not exclude
  # it and every file looked like it had two length columns. Treat both a
  # leading "CV_" and a trailing "_CV_length" as CV columns.
  is_length <- grepl("_length$", cols)
  is_cv <- grepl("(^CV_)|(_CV_length$)", cols)
  length_col <- cols[is_length & !is_cv]

  if (length(length_col) == 0) {
    warning(glue::glue("No length column found for trait: {trait_name}"))
    return(NULL)
  }

  # Still ambiguous after excluding CV columns: keep the first match, but say so
  if (length(length_col) > 1) {
    warning(glue::glue("Multiple length columns found for trait: {trait_name}, using first"))
    length_col <- length_col[1]
  }

  # A named character vector inside all_of() selects the column and renames it
  # in one step (name = new column name, value = existing column name)
  renamed <- stats::setNames(length_col, trait_name)
  dplyr::select(df, strain, dplyr::all_of(renamed))
}

# Run process_trait_file() over every downloaded file; imap() supplies each
# element as the first argument and its name (or position) as the second
processed_traits <- purrr::imap(trait_files_list, process_trait_file)


length_2_4_D, using first

length_Aldicarb, using first

length_Arsenic_trioxide, using first

length_Atrazine, using first

length_Cadmium_dichloride, using first

length_Carbaryl, using first

length_Carboxin, using first

length_Chlorfenapyr, using first

length_Chlorothalonil, using first

length_Chlorpyrifos, using first

length_Copper_chloride, using first

length_Lead_nitrate, using first

length_Malathion, using first

length_Mancozeb, using first

length_Methomyl, using first

length_Methyl_mercury, using first

length_Nickel_dichloride, using first

length_Paraquat_62_5, using first

length_Paraquat_250, using first

length_Propoxur, using first

length_Pyraclostrobin, using first

length_Silver_nitrate_250, using first

length_Silver_nitrate_7_8, using first

length_Triphenyl_phosphate_6_25, using first

length_Triphenyl_phosphate_50, using first

length_Zinc_dichloride, using first

Aggregated traits data:

  Strains: 195

  Traits: 26

  Columns: strain, length_2_4_D, length_Aldicarb, length_Arsenic_trioxide, length_Atrazine, length_Cadmium_dichloride, length_Carbaryl, length_Carboxin, length_Chlorfenapyr, length_Chlorothalonil, length_Chlorpyrifos, length_Copper_chloride, length_Lead_nitrate, length_Malathion, length_Mancozeb, length_Methomyl, length_Methyl_mercury, length_Nickel_dichloride, length_Paraquat_62_5, length_Paraquat_250, length_Propoxur, length_Pyraclostrobin, length_Silver_nitrate_250, length_Silver_nitrate_7_8, length_Triphenyl_phosphate_6_25, length_Triphenyl_phosphate_50, length_Zinc_dichloride

In [None]:

# Write the aggregated strain-by-trait table to output_fn
# (save_tsv() is defined outside this section)
save_tsv(data = aggregated_traits, output_file = output_fn)

print(glue::glue("Saved aggregated traits to: {output_fn}"))


Saved aggregated traits to: data/processed/aggregated_toxicant_traits.tsv

# Generate pheno.df.rda file

The `pheno.df.rda` file contains the full phenotype data for all wells and is used in various downstream analyses. This section downloads the cleaned data from the data cleaning GitHub repository and filters it to include only toxicant conditions (excluding controls).

## Download cleaned .Rdata file

In [None]:

# Remote location of the cleaned .Rdata file, where it is cached locally, and
# where the derived pheno.df object will be written
rdata_url <- glue::glue("https://raw.githubusercontent.com/{github_repo}/{github_branch}/data/processed/{date_prefix}_FINAL_cleaned_GWA.Rdata")
rdata_local_path <- glue::glue("data/raw/{date_prefix}_FINAL_cleaned_GWA.Rdata")
pheno_df_output <- "data/processed/phenotypes/pheno.df.rda"

# Download .Rdata file if it doesn't exist locally; an existing copy is reused as is
if (!file.exists(rdata_local_path)) {
  message(glue::glue("Downloading {date_prefix}_FINAL_cleaned_GWA.Rdata from GitHub..."))

  response <- httr::GET(rdata_url)

  if (httr::status_code(response) == 200) {
    # writeBin() cannot create missing folders, so make sure the target folder
    # exists first (no-op when it is already there)
    dir.create(dirname(rdata_local_path), recursive = TRUE, showWarnings = FALSE)

    # Write binary content to file
    writeBin(httr::content(response, "raw"), rdata_local_path)
    message(glue::glue("  ✓ Successfully downloaded to {rdata_local_path}"))
  } else {
    # Without this file the rest of the section cannot run, so fail hard
    stop(glue::glue("  ✗ Failed to download .Rdata file: HTTP {httr::status_code(response)}"))
  }
} else {
  message(glue::glue("Using existing file: {rdata_local_path}"))
}


Using existing file: data/raw/20230322_FINAL_cleaned_GWA.Rdata

## Process phenotype data

In [None]:

# Load the .Rdata file
message("Loading cleaned GWA data...")


Loading cleaned GWA data...

Loaded data with 108258 rows

Filtering to 26 toxicants

Processed phenotype data:

  Rows: 53302

  Strains: 195

  Drugs: 26

  Columns: 50

## Save pheno.df.rda

In [None]:

# Make sure the destination folder is present before writing into it
output_dir <- dirname(path = pheno_df_output)
if (!dir.exists(paths = output_dir)) {
  dir.create(path = output_dir, recursive = TRUE)
  message(glue::glue("Created directory: {output_dir}"))
}

# Store the object under its own name, so load() restores a variable called pheno.df
save(list = "pheno.df", file = pheno_df_output)

message(glue::glue("Saved pheno.df to: {pheno_df_output}"))


Saved pheno.df to: data/processed/phenotypes/pheno.df.rda