This notebook processes metadata associated with paired metagenome and metatranscriptome samples. When possible, the ENA BioProject pages were parsed to determine sample pairs.

In [1]:
# Move the working directory up one level so that the relative "inputs/..."
# paths used later in this notebook resolve.
# NOTE(review): presumably the notebook is stored one directory below the
# project root -- confirm. Re-running this cell moves up one more level.
setwd("..")

In [195]:
library(dplyr)
library(readr)
library(janitor)

## PRJNA406858

In [35]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA406858.
prjna406858 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA406858&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [37]:
# Sort the runs by sample alias, then keep only the columns needed for pairing.
prjna406858 <- select(
  arrange(prjna406858, sample_alias),
  study_accession, run_accession, library_name
)

# Preview the first rows.
head(prjna406858)

study_accession,run_accession,library_name
<chr>,<chr>,<chr>
PRJNA406858,SRR6032600,C_2_cDNA_1
PRJNA406858,SRR6032601,B_2_1
PRJNA406858,SRR6032602,C_2_1
PRJNA406858,SRR6032603,E_2_1
PRJNA406858,SRR6032604,B_2_cDNA_1
PRJNA406858,SRR6032605,E_2_cDNA_1


In [38]:
# Derive a shared sample name by removing the "_cDNA" tag from the library name,
# so that a metatranscriptome library and its metagenome get the same key.
prjna406858 <- mutate(prjna406858,
                      sample_name = gsub("_cDNA", "", library_name))

In [115]:
# Metatranscriptome runs: libraries whose name contains "cDNA".
prjna406858_mtx <- prjna406858 %>%
  filter(grepl("cDNA", library_name)) %>%
  rename(mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: every remaining library.
prjna406858_mgx <- prjna406858 %>%
  filter(!grepl("cDNA", library_name)) %>%
  rename(mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(mgx_study_accession, mgx_run_accession, sample_name)

# Attach the matching metagenome to each metatranscriptome and label the table.
prjna406858_mtx_vs_mgx <- prjna406858_mtx %>%
  left_join(prjna406858_mgx, by = "sample_name") %>%
  mutate(sample_type = "activated_sludge")

prjna406858_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
C_2_1,PRJNA406858,SRR6032600,PRJNA406858,SRR6032602,activated_sludge
B_2_1,PRJNA406858,SRR6032604,PRJNA406858,SRR6032601,activated_sludge
E_2_1,PRJNA406858,SRR6032605,PRJNA406858,SRR6032603,activated_sludge


## PRJNA448333

In [48]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA448333.
prjna448333 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA448333&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [49]:
# Sort the runs by sample alias, then keep only the columns needed for pairing.
prjna448333 <- select(
  arrange(prjna448333, sample_alias),
  study_accession, run_accession, library_name, sample_alias
)

# Preview the first rows.
head(prjna448333)

study_accession,run_accession,library_name,sample_alias
<chr>,<chr>,<chr>,<chr>
PRJNA448333,SRR8397906,Metagenome 101,Rumen microbiome of beef cattle 101
PRJNA448333,SRR8399431,Total-RNA-based metatranscriptome 101,Rumen microbiome of beef cattle 101
PRJNA448333,SRR8416057,mRNA-enriched metatranscriptome 101,Rumen microbiome of beef cattle 101
PRJNA448333,SRR8404214,Metagenome 103,Rumen microbiome of beef cattle 103
PRJNA448333,SRR8416058,mRNA-enriched metatranscriptome 103,Rumen microbiome of beef cattle 103
PRJNA448333,SRR8420492,Total-RNA-based metatranscriptome 103,Rumen microbiome of beef cattle 103


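PRJNA448333 has both total-RNA-based and mRNA-enriched metatranscriptomes. The code below pairs only the mRNA-enriched libraries with the metagenomes (the filter matches "mRNA-enriched"), so each metagenome appears once. I don't have a preconceived notion of which library type to run the comparison with; if the total-RNA libraries are added as well, each metagenome will appear twice.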

In [116]:
# Metatranscriptome runs: libraries whose name contains "mRNA-enriched".
prjna448333_mtx <- prjna448333 %>%
  filter(grepl("mRNA-enriched", library_name)) %>%
  rename(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: libraries whose name contains "Metagenome".
prjna448333_mgx <- prjna448333 %>%
  filter(grepl("Metagenome", library_name)) %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

In [197]:
# Pair each metatranscriptome with its metagenome on the shared sample alias,
# then normalise the sample names and label the table.
prjna448333_mtx_vs_mgx <- prjna448333_mtx %>%
  left_join(prjna448333_mgx, by = "sample_name") %>%
  mutate(sample_name = make_clean_names(sample_name)) %>%
  mutate(sample_type = "cattle_rumen")

In [198]:
# Display the paired PRJNA448333 table.
prjna448333_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
rumen_microbiome_of_beef_cattle_101,PRJNA448333,SRR8416057,PRJNA448333,SRR8397906,cattle_rumen
rumen_microbiome_of_beef_cattle_103,PRJNA448333,SRR8416058,PRJNA448333,SRR8404214,cattle_rumen
rumen_microbiome_of_beef_cattle_104,PRJNA448333,SRR8416055,PRJNA448333,SRR8397905,cattle_rumen
rumen_microbiome_of_beef_cattle_105,PRJNA448333,SRR8416056,PRJNA448333,SRR8397904,cattle_rumen
rumen_microbiome_of_beef_cattle_106,PRJNA448333,SRR8416061,PRJNA448333,SRR8397903,cattle_rumen
rumen_microbiome_of_beef_cattle_107,PRJNA448333,SRR8416062,PRJNA448333,SRR8397910,cattle_rumen
rumen_microbiome_of_beef_cattle_112,PRJNA448333,SRR8416059,PRJNA448333,SRR8397909,cattle_rumen
rumen_microbiome_of_beef_cattle_201,PRJNA448333,SRR8416060,PRJNA448333,SRR8397908,cattle_rumen
rumen_microbiome_of_beef_cattle_202,PRJNA448333,SRR8416064,PRJNA448333,SRR8397907,cattle_rumen
rumen_microbiome_of_beef_cattle_203,PRJNA448333,SRR8416065,PRJNA448333,SRR8397902,cattle_rumen


## PRJNA344005

In [77]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA344005 and
# derive a sample name by stripping the nucleic-acid tag from the library name.
# NOTE(review): the pattern "_[DR]NA*." matches "_DN"/"_RN", any number of "A"s
# and then exactly ONE further character. If the intent was to drop everything
# from "_DNA"/"_RNA" onwards it should probably be "_[DR]NA.*" -- confirm against
# the actual library names before changing it.
prjna344005 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA344005&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE) %>%
    mutate(sample_name = gsub("_[DR]NA*.", "", library_name))

In [79]:
# Count runs per sample name and keep the names that occur exactly twice.
prjna344005_pairs <- prjna344005 %>%
  count(sample_name) %>%
  filter(n == 2)

In [80]:
# Restrict to the paired samples and keep only the columns needed downstream.
prjna344005 <- prjna344005 %>%
  filter(sample_name %in% prjna344005_pairs[["sample_name"]]) %>%
  select(study_accession, run_accession, sample_name, library_strategy)

# Metatranscriptome runs: library strategy "RNA-Seq".
prjna344005_mtx <- prjna344005 %>%
  filter(library_strategy %in% "RNA-Seq") %>%
  rename(mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: library strategy "WGS".
prjna344005_mgx <- prjna344005 %>%
  filter(library_strategy %in% "WGS") %>%
  rename(mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

In [191]:
# Attach the matching metagenome to each metatranscriptome and label the table.
prjna344005_mtx_vs_mgx <- prjna344005_mtx %>%
  left_join(prjna344005_mgx, by = "sample_name") %>%
  mutate(sample_type = "groundwater")

prjna344005_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
HiTCE_2d,PRJNA344005,SRR4308224,PRJNA344005,SRR4308227,groundwater
HiTCEB12_2d,PRJNA344005,SRR4308225,PRJNA344005,SRR4308226,groundwater


## PRJNA237345 (mtx) & PRJNA237344 (mgx)

In [166]:
# Download the ENA run-level file reports (TSV) for the metatranscriptome
# project (PRJNA237345) and the metagenome project (PRJNA237344).
prjna237345 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA237345&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)
prjna237344 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA237344&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [194]:
# Metatranscriptome runs: build the sample name from the experiment title by
# removing the "Metatranscriptome " label and everything from the first "(".
tmp1 <- prjna237345 %>%
  select(sample_name = experiment_title,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("Metatranscriptome ", "", sample_name),
         sample_name = gsub("\\(.*", "", sample_name))

# Metagenome runs: same clean-up, removing the "Metagenome " label instead.
tmp2 <- prjna237344 %>%
  select(sample_name = experiment_title,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("Metagenome ", "", sample_name),
         sample_name = gsub("\\(.*", "", sample_name))

# Pair the two projects on the cleaned title. The join key is stated explicitly
# so dplyr does not have to guess it (and does not print a "Joining, by" note).
prjna237345_vs_prjna237344 <- left_join(tmp1, tmp2, by = "sample_name") %>%
  # drop metatranscriptomes that have no matching metagenome
  filter(!is.na(mgx_study_accession)) %>%
  # strip the sequencing-platform prefixes left over from the experiment title
  mutate(sample_name = gsub("Illumina Genome Analyzer IIx sequencing; May-June 2010 ", "", sample_name),
         sample_name = gsub("Illumina HiSeq 2500 sequencing; May 2011 ", "", sample_name)) %>%
  # keep the first row for each sample name
  group_by(sample_name) %>%
  slice(1) %>%
  # ungroup so that make_clean_names() sees the whole column at once and the
  # result is a plain (ungrouped) table
  ungroup() %>%
  mutate(sample_name = make_clean_names(sample_name),
         sample_type = "river")

prjna237345_vs_prjna237344

[1m[22mJoining, by = "sample_name"


sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
amazon_plume_2_0um_from_station_10,PRJNA237345,SRR1193190,PRJNA237344,SRR1205250,river
amazon_plume_2_0um_from_station_2,PRJNA237345,SRR1193177,PRJNA237344,SRR1182511,river
amazon_plume_2_0um_from_station_23,PRJNA237345,SRR1193237,PRJNA237344,SRR1202089,river
amazon_plume_2_0um_from_station_27,PRJNA237345,SRR1193629,PRJNA237344,SRR1183643,river
amazon_plume_2_0um_from_station_3,PRJNA237345,SRR1193226,PRJNA237344,SRR1199272,river
amazon_plume_0_2_2_0um_from_station_10,PRJNA237345,SRR1186930,PRJNA237344,SRR1199271,river
amazon_plume_0_2_2_0um_from_station_2,PRJNA237345,SRR1193205,PRJNA237344,SRR1182512,river
amazon_plume_0_2_2_0um_from_station_23,PRJNA237345,SRR1193632,PRJNA237344,SRR1186214,river
amazon_plume_0_2_2_0um_from_station_25,PRJNA237345,SRR1204579,PRJNA237344,SRR1202090,river
amazon_plume_0_2_2_0um_from_station_27,PRJNA237345,SRR1193627,PRJNA237344,SRR1183650,river


## PRJNA453733

In [130]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA453733 and
# keep only the runs whose library strategy is "OTHER".
prjna453733 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA453733&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE) %>%
    filter(library_strategy == "OTHER")

In [131]:
# Show the columns that distinguish metagenome from metatranscriptome runs.
select(prjna453733,
       study_accession, run_accession, sample_alias,
       library_strategy, library_source)

study_accession,run_accession,sample_alias,library_strategy,library_source
<chr>,<chr>,<chr>,<chr>,<chr>
PRJNA453733,SRR7083928,CS-Sed16-2cmA,OTHER,METATRANSCRIPTOMIC
PRJNA453733,SRR7083929,CS-Sed16-2cmB,OTHER,METAGENOMIC
PRJNA453733,SRR7083930,CS-Br16,OTHER,METAGENOMIC
PRJNA453733,SRR7083931,CS-Sed16-2cmA,OTHER,METAGENOMIC
PRJNA453733,SRR7083934,CS-Sed16-5cm,OTHER,METAGENOMIC


In [132]:
# Count runs per sample alias and keep the aliases that occur exactly twice.
prjna453733_pairs <- prjna453733 %>%
  count(sample_alias) %>%
  filter(n == 2)

prjna453733_pairs

sample_alias,n
<chr>,<int>
CS-Sed16-2cmA,2


In [137]:
# Metatranscriptome runs belonging to a paired sample alias.
prjna453733_mtx <- prjna453733 %>%
  filter(sample_alias %in% prjna453733_pairs[["sample_alias"]],
         library_source %in% "METATRANSCRIPTOMIC") %>%
  rename(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs belonging to a paired sample alias.
prjna453733_mgx <- prjna453733 %>%
  filter(sample_alias %in% prjna453733_pairs[["sample_alias"]],
         library_source %in% "METAGENOMIC") %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

# Attach the matching metagenome to each metatranscriptome and label the table.
prjna453733_mtx_vs_mgx <- prjna453733_mtx %>%
  left_join(prjna453733_mgx, by = "sample_name") %>%
  mutate(sample_type = "lake")
prjna453733_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
CS-Sed16-2cmA,PRJNA453733,SRR7083928,PRJNA453733,SRR7083931,lake


## Microbial metagenomes and metatranscriptomes during a coastal phytoplankton bloom (one PRJNA per sample)
The samples published under the above title each have their own study accession and run accession.
This section of the notebook parses two supplementary files from the publication to get all of the sample bioproject numbers and to determine which bioproject numbers represent pairs of samples.

In [202]:
# Fetch the supplementary archive of the publication into inputs/metadata/.
download.file(
  "https://static-content.springer.com/esm/art%3A10.1038%2Fs41597-019-0132-4/MediaObjects/41597_2019_132_MOESM1_ESM.zip",
  "inputs/metadata/41597_2019_132_MOESM1_ESM.zip"
)

In [204]:
# Extract the supplementary archive next to the downloaded zip file.
unzip(
  zipfile = "inputs/metadata/41597_2019_132_MOESM1_ESM.zip",
  exdir = "inputs/metadata/41597_2019_132_MOESM1_ESM"
)

In [215]:
# Metagenome assay table from the supplementary archive: the assay name is kept
# as the metagenome study accession, and every "D" is removed from the sample
# name so that it can be matched against the metatranscriptome table.
moran010B_mgx <- read_tsv("inputs/metadata/41597_2019_132_MOESM1_ESM/a_Moran010B_dna.txt", show_col_types = FALSE) %>%
  clean_names() %>%
  select(sample_name, mgx_study_accession = assay_name) %>%
  mutate(sample_name = gsub("D", "", sample_name))

# Metatranscriptome assay table: same treatment, removing every "R" instead.
moran010B_mtx <- read_tsv("inputs/metadata/41597_2019_132_MOESM1_ESM/a_Moran010B_rna.txt", show_col_types = FALSE) %>%
  clean_names() %>%
  select(sample_name, mtx_study_accession = assay_name) %>%
  mutate(sample_name = gsub("R", "", sample_name))

# Keep only the samples present in both tables.
moran010B <- inner_join(moran010B_mtx, moran010B_mgx, by = "sample_name") 


FALSE  TRUE 
    7    74 

In [220]:
# Use the moran010B study accessions to programmatically generate ENA URLs and
# fetch the run accessions of each project.
# The per-project tables are collected with lapply() and combined once, instead
# of growing a data frame inside a loop; unique() ensures each project is
# queried (and reported) only once.
# NOTE(review): the metagenome accessions come from moran010B_mgx (all rows of
# the DNA table), not from the joined moran010B table -- confirm this is wanted.
moran010B_run_accessions <- c(moran010B$mtx_study_accession,
                              moran010B_mgx$mgx_study_accession) %>%
  unique() %>%
  lapply(function(study_accession) {
    url <- paste0("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=", study_accession, "&result=read_run&fields=study_accession,run_accession&format=tsv&download=true&limit=0")
    read_tsv(url, show_col_types = FALSE)
  }) %>%
  bind_rows()

In [226]:
# Look up the run accession for each metatranscriptome project, then for each
# metagenome project, and label the resulting table.
moran010B_mtx_vs_mgx <- moran010B %>%
  left_join(moran010B_run_accessions,
            by = c("mtx_study_accession" = "study_accession")) %>%
  rename(mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession,
         mgx_study_accession) %>%
  left_join(moran010B_run_accessions,
            by = c("mgx_study_accession" = "study_accession")) %>%
  rename(mgx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession,
         mgx_study_accession, mgx_run_accession) %>%
  mutate(sample_type = "ocean")

moran010B_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
26,PRJNA502453,SRR8361532,PRJNA467728,SRR7592711,ocean
58_r,PRJNA502454,SRR8361534,PRJNA467772,SRR7609573,ocean
125_r,PRJNA468332,SRR7962480,PRJNA467773,SRR7609569,ocean
8,PRJNA502455,SRR8297879,PRJNA467724,SRR7592287,ocean
85,PRJNA502456,SRR8297845,PRJNA467765,SRR7609362,ocean
76,PRJNA468305,SRR7949679,PRJNA467757,SRR7608731,ocean
40,PRJNA467774,SRR7609608,PRJNA467736,SRR7595425,ocean
53,PRJNA468299,SRR7609574,PRJNA502421,SRR8361352,ocean
73,PRJNA468306,SRR7949683,PRJNA467754,SRR7608223,ocean
23,PRJNA467775,SRR7609632,PRJNA467727,SRR7633009,ocean


## PRJNA603240

In [228]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA603240.
prjna603240 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA603240&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [406]:
# Metatranscriptome runs: metatranscriptomic source with "RANDOM PCR" selection.
prjna603240_mtx <- prjna603240 %>%
  filter(library_source %in% "METATRANSCRIPTOMIC",
         library_selection %in% "RANDOM PCR") %>%
  rename(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: metagenomic source.
prjna603240_mgx <- prjna603240 %>%
  filter(library_source %in% "METAGENOMIC") %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

# Attach the matching metagenome to each metatranscriptome and label the table.
prjna603240_mtx_vs_mgx <- prjna603240_mtx %>%
  left_join(prjna603240_mgx, by = "sample_name") %>%
  mutate(sample_type = "deadwood")

prjna603240_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
sample_106,PRJNA603240,SRR10968230,PRJNA603240,SRR10968263,deadwood
sample_069,PRJNA603240,SRR10968231,PRJNA603240,SRR10968264,deadwood
sample_055,PRJNA603240,SRR10968232,PRJNA603240,SRR10968265,deadwood
sample_049,PRJNA603240,SRR10968233,PRJNA603240,SRR10968266,deadwood
sample_031,PRJNA603240,SRR10968234,PRJNA603240,SRR10968267,deadwood
sample_116,PRJNA603240,SRR10968236,PRJNA603240,SRR10968225,deadwood
sample_110,PRJNA603240,SRR10968237,PRJNA603240,SRR10968226,deadwood
sample_044,PRJNA603240,SRR10968238,PRJNA603240,SRR10968227,deadwood
sample_007,PRJNA603240,SRR10968239,PRJNA603240,SRR10968228,deadwood
sample_006,PRJNA603240,SRR10968240,PRJNA603240,SRR10968229,deadwood


## PRJNA202380

In [240]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA202380.
prjna202380 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA202380&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [252]:
# Keep the Illumina runs and reduce the experiment title to a bare sample name
# by removing, one after another, the platform prefixes and the sample-type
# suffixes (the "[op]" class also catches the "metatranscritomic" misspelling).
prjna202380 <- prjna202380 %>%
  filter(instrument_platform %in% "ILLUMINA") %>%
  mutate(sample_name = gsub("Illumina HiSeq 2000 paired end sequencing; ", "", experiment_title)) %>%
  mutate(sample_name = gsub(" metatranscri[op]tomic sample", "", sample_name)) %>%
  mutate(sample_name = gsub(" metagenomic sample", "", sample_name)) %>%
  mutate(sample_name = gsub("Illumina HiSeq 2000 sequencing; ", "", sample_name)) %>%
  select(study_accession, run_accession, library_source, sample_name)

In [253]:
# Metatranscriptome runs: metatranscriptomic library source.
prjna202380_mtx <- prjna202380 %>%
  filter(library_source %in% "METATRANSCRIPTOMIC") %>%
  rename(mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: metagenomic library source.
prjna202380_mgx <- prjna202380 %>%
  filter(library_source %in% "METAGENOMIC") %>%
  rename(mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

# Attach the matching metagenome to each metatranscriptome and label the table.
prjna202380_mtx_vs_mgx <- prjna202380_mtx %>%
  left_join(prjna202380_mgx, by = "sample_name") %>%
  mutate(sample_type = "sheep_rumen")

prjna202380_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Tag1363-1,PRJNA202380,SRR1138694,PRJNA202380,SRR1267595,sheep_rumen
Tag1111-1,PRJNA202380,SRR1138697,PRJNA202380,SRR1222429,sheep_rumen
Tag1111-2,PRJNA202380,SRR1138702,PRJNA202380,SRR1222431,sheep_rumen
Tag1234-1,PRJNA202380,SRR1206249,PRJNA202380,SRR1206671,sheep_rumen
Tag1494-1,PRJNA202380,SRR873450,PRJNA202380,SRR873595,sheep_rumen
Tag1283-1,PRJNA202380,SRR873451,PRJNA202380,SRR873596,sheep_rumen
Tag1435-1,PRJNA202380,SRR873452,PRJNA202380,SRR873597,sheep_rumen
Tag1494-2,PRJNA202380,SRR873453,PRJNA202380,SRR873598,sheep_rumen
Tag1265-1,PRJNA202380,SRR873454,PRJNA202380,SRR873599,sheep_rumen
Tag1435-2,PRJNA202380,SRR873455,PRJNA202380,SRR873600,sheep_rumen


## PRJNA541981

In [255]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA541981.
prjna541981 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA541981&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [413]:
# Metatranscriptome runs: metatranscriptomic library source.
prjna541981_mtx <- prjna541981 %>%
  filter(library_source %in% "METATRANSCRIPTOMIC") %>%
  rename(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: metagenomic library source sequenced with the "WGS" strategy.
prjna541981_mgx <- prjna541981 %>%
  filter(library_source %in% "METAGENOMIC",
         library_strategy %in% "WGS") %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

# Attach the matching metagenome to each metatranscriptome and label the table.
prjna541981_mtx_vs_mgx <- prjna541981_mtx %>%
  left_join(prjna541981_mgx, by = "sample_name") %>%
  mutate(sample_type = "human_skin")

## PRJNA797778

In [268]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA797778.
prjna797778 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA797778&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [272]:
# Metatranscriptome runs: metatranscriptomic library source.
prjna797778_mtx <- prjna797778 %>%
  filter(library_source %in% "METATRANSCRIPTOMIC") %>%
  rename(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: metagenomic library source.
prjna797778_mgx <- prjna797778 %>%
  filter(library_source %in% "METAGENOMIC") %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

# Attach the matching metagenome to each metatranscriptome and label the table.
prjna797778_mtx_vs_mgx <- prjna797778_mtx %>%
  left_join(prjna797778_mgx, by = "sample_name") %>%
  mutate(sample_type = "human_vagina")

prjna797778_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
UAB088_W2D1,PRJNA797778,SRR17858151,PRJNA797778,SRR17635672,human_vagina
UAB082_W10D2,PRJNA797778,SRR17858152,PRJNA797778,SRR17635674,human_vagina
UAB082_W5D7,PRJNA797778,SRR17858153,PRJNA797778,SRR17635676,human_vagina
UAB082_W3D7,PRJNA797778,SRR17858154,PRJNA797778,SRR17635677,human_vagina
UAB082_W2D5,PRJNA797778,SRR17858155,PRJNA797778,SRR17635678,human_vagina
UAB079_W10D1,PRJNA797778,SRR17858156,PRJNA797778,SRR17635679,human_vagina
UAB079_W7D6,PRJNA797778,SRR17858157,PRJNA797778,SRR17635680,human_vagina
UAB079_W6D7,PRJNA797778,SRR17858158,PRJNA797778,SRR17635681,human_vagina
UAB079_W3D7,PRJNA797778,SRR17858159,PRJNA797778,SRR17635682,human_vagina
UAB079_W2D2,PRJNA797778,SRR17858160,PRJNA797778,SRR17635683,human_vagina


## PRJNA339914

In [274]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA339914.
prjna339914 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA339914&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [281]:
# Metatranscriptome runs: the sample name is the alias with "RNA" removed, so
# that it lines up with the metagenome alias.
prjna339914_mtx <- prjna339914 %>%
  filter(library_source %in% "METATRANSCRIPTOMIC") %>%
  mutate(sample_name = gsub("RNA", "", sample_alias)) %>%
  rename(mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: the alias is used as the sample name unchanged.
prjna339914_mgx <- prjna339914 %>%
  filter(library_source %in% "METAGENOMIC") %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

# Attach the matching metagenome to each metatranscriptome and label the table.
prjna339914_mtx_vs_mgx <- prjna339914_mtx %>%
  left_join(prjna339914_mgx, by = "sample_name") %>%
  mutate(sample_type = "human_gut_microbiome")

prjna339914_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
MV_FEI4_t2Q15,PRJNA339914,SRR4100706,PRJNA339914,SRR4052039,human_gut_microbiome
MV_FEI5_t3Q15,PRJNA339914,SRR4100707,PRJNA339914,SRR4052042,human_gut_microbiome
MV_FEM4_t2Q15,PRJNA339914,SRR4100708,PRJNA339914,SRR4052025,human_gut_microbiome
MV_FEM5_t3Q15,PRJNA339914,SRR4100709,PRJNA339914,SRR4052028,human_gut_microbiome


## PRJEB33889

In [282]:
# Download the ENA run-level file report (TSV) for BioProject PRJEB33889.
prjeb33889 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB33889&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [285]:
# Metatranscriptome runs: the sample name is the alias with every "T" removed.
prjeb33889_mtx <- prjeb33889 %>%
  filter(library_source %in% "METATRANSCRIPTOMIC") %>%
  mutate(sample_name = gsub("T", "", sample_alias)) %>%
  rename(mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  select(sample_name, mtx_study_accession, mtx_run_accession)

# Metagenome runs: aliases containing "16S" are excluded, and the sample name
# is the alias with every "G" removed.
prjeb33889_mgx <- prjeb33889 %>%
  filter(library_source %in% "METAGENOMIC",
         !grepl("16S", sample_alias)) %>%
  mutate(sample_name = gsub("G", "", sample_alias)) %>%
  rename(mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  select(sample_name, mgx_study_accession, mgx_run_accession)

# Attach the matching metagenome to each metatranscriptome and label the table.
prjeb33889_mtx_vs_mgx <- prjeb33889_mtx %>%
  left_join(prjeb33889_mgx, by = "sample_name") %>%
  mutate(sample_type = "mouse_cecum")

prjeb33889_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
M1,PRJEB33889,ERR3473664,PRJEB33889,ERR3473656,mouse_cecum
M2,PRJEB33889,ERR3473665,PRJEB33889,ERR3473657,mouse_cecum
M3,PRJEB33889,ERR3473666,PRJEB33889,ERR3473658,mouse_cecum
M4,PRJEB33889,ERR3473667,PRJEB33889,ERR3473659,mouse_cecum
M5,PRJEB33889,ERR3473668,PRJEB33889,ERR3473660,mouse_cecum
M6,PRJEB33889,ERR3473669,PRJEB33889,ERR3473661,mouse_cecum
M7,PRJEB33889,ERR3473670,PRJEB33889,ERR3473662,mouse_cecum
M8,PRJEB33889,ERR3473671,PRJEB33889,ERR3473663,mouse_cecum


## PRJNA698464

In [287]:
# Download the ENA run-level file report (TSV) for BioProject PRJNA698464.
prjna698464 <- read_tsv("https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA698464&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
                        show_col_types = FALSE)

In [292]:
# Metatranscriptome runs: the pairing key is the experiment title with the
# "Illumina MiSeq sequencing; " prefix and every "RNA" removed, then passed
# through make_clean_names().
prjna698464_mtx <- prjna698464 %>%
  filter(library_source == "METATRANSCRIPTOMIC") %>%
  select(sample_name = experiment_title,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("Illumina MiSeq sequencing; ", "", sample_name),
         sample_name = make_clean_names(gsub("RNA", "", sample_name)))

# Metagenome runs: same key construction, removing "DNA" instead of "RNA".
prjna698464_mgx <- prjna698464 %>%
  filter(library_source == "METAGENOMIC") %>%
  select(sample_name = experiment_title,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("Illumina MiSeq sequencing; ", "", sample_name),
         sample_name = make_clean_names(gsub("DNA", "", sample_name)))

# Pair the two library types on the cleaned key.
# NOTE(review): left_join keeps metatranscriptome rows that find no partner
# (their mgx_* columns would be NA) -- confirm that is intended.
prjna698464_mtx_vs_mgx <- prjna698464_mtx %>%
  left_join(prjna698464_mgx, by = "sample_name") %>%
  mutate(sample_type = "bioreactor")

prjna698464_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
ad_supplemented_with_nitrogen_data_2,PRJNA698464,SRR13618127,PRJNA698464,SRR13618123,bioreactor
ad_supplemented_with_nitrogen_data_1,PRJNA698464,SRR13618128,PRJNA698464,SRR13618124,bioreactor
ad_supplemented_with_hydrogen_data_2,PRJNA698464,SRR13618129,PRJNA698464,SRR13618125,bioreactor
ad_supplemented_with_hydrogen_data_1,PRJNA698464,SRR13618130,PRJNA698464,SRR13618126,bioreactor


## PRJNA396840

In [294]:
# Download the ENA read_run file report (TSV) for BioProject PRJNA396840.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjna396840 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA396840&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [300]:
# Metatranscriptome runs: pairing key is sample_alias with "RNA" removed.
prjna396840_mtx <- prjna396840 %>%
  filter(library_source == "METATRANSCRIPTOMIC") %>%
  select(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("RNA", "", sample_name))

# Metagenome runs: pairing key is sample_alias with "DNA" removed.
prjna396840_mgx <- prjna396840 %>%
  filter(library_source == "METAGENOMIC") %>%
  select(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("DNA", "", sample_name))

# Pair the two library types on the shared key.
# NOTE(review): left_join keeps metatranscriptome rows that find no partner
# (their mgx_* columns would be NA) -- confirm that is intended.
prjna396840_mtx_vs_mgx <- prjna396840_mtx %>%
  left_join(prjna396840_mgx, by = "sample_name") %>%
  mutate(sample_type = "human_oral")

prjna396840_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
X4074,PRJNA396840,SRR5892181,PRJNA396840,SRR5892217,human_oral
X4080,PRJNA396840,SRR5892182,PRJNA396840,SRR5892216,human_oral
X4068,PRJNA396840,SRR5892183,PRJNA396840,SRR5892215,human_oral
X4072,PRJNA396840,SRR5892184,PRJNA396840,SRR5892214,human_oral
X4060,PRJNA396840,SRR5892185,PRJNA396840,SRR5892213,human_oral
X4064,PRJNA396840,SRR5892186,PRJNA396840,SRR5892212,human_oral
X4050,PRJNA396840,SRR5892187,PRJNA396840,SRR5892211,human_oral
X4056,PRJNA396840,SRR5892188,PRJNA396840,SRR5892210,human_oral
X4108,PRJNA396840,SRR5892189,PRJNA396840,SRR5892233,human_oral
X4082,PRJNA396840,SRR5892190,PRJNA396840,SRR5892209,human_oral


## PRJNA492158

In [302]:
# Download the ENA read_run file report (TSV) for BioProject PRJNA492158.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjna492158 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA492158&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [308]:
# Metatranscriptome runs: sample_alias is used unchanged as the pairing key.
prjna492158_mtx <- prjna492158 %>%
  filter(library_source == "METATRANSCRIPTOMIC") %>%
  select(sample_alias, study_accession, run_accession) %>%
  rename(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession)

# Metagenome runs: same key, mgx_-prefixed accession columns.
prjna492158_mgx <- prjna492158 %>%
  filter(library_source == "METAGENOMIC") %>%
  select(sample_alias, study_accession, run_accession) %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession)

# Pair the two library types on sample_name.
# NOTE(review): left_join keeps metatranscriptome rows that find no partner
# (their mgx_* columns would be NA) -- confirm that is intended.
prjna492158_mtx_vs_mgx <- prjna492158_mtx %>%
  left_join(prjna492158_mgx, by = "sample_name") %>%
  mutate(sample_type = "human_gut")

prjna492158_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
L2,PRJNA492158,SRR7880350,PRJNA492158,SRR7880198,human_gut
L3,PRJNA492158,SRR7880351,PRJNA492158,SRR7880201,human_gut
L5,PRJNA492158,SRR7880352,PRJNA492158,SRR7880203,human_gut
L6,PRJNA492158,SRR7880353,PRJNA492158,SRR7880202,human_gut
L7,PRJNA492158,SRR7880354,PRJNA492158,SRR7880205,human_gut
L8,PRJNA492158,SRR7880355,PRJNA492158,SRR7880204,human_gut
L9,PRJNA492158,SRR7880356,PRJNA492158,SRR7880207,human_gut
L11,PRJNA492158,SRR7880357,PRJNA492158,SRR7880227,human_gut
L12,PRJNA492158,SRR7880358,PRJNA492158,SRR7880226,human_gut
O1,PRJNA492158,SRR7880359,PRJNA492158,SRR7880225,human_gut


## PRJNA278075

In [310]:
# Download the ENA read_run file report (TSV) for BioProject PRJNA278075.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjna278075 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA278075&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [313]:
# Metatranscriptome runs: pairing key is sample_alias without
# "_Metatranscriptome".
prjna278075_mtx <- prjna278075 %>%
  filter(library_source == "METATRANSCRIPTOMIC") %>%
  select(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("_Metatranscriptome", "", sample_name))

# Metagenome runs: pairing key is sample_alias without "_Metagenome".
prjna278075_mgx <- prjna278075 %>%
  filter(library_source == "METAGENOMIC") %>%
  select(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("_Metagenome", "", sample_name))

# Pair the two library types on the shared key.
# NOTE(review): left_join keeps metatranscriptome rows that find no partner
# (their mgx_* columns would be NA) -- confirm that is intended.
prjna278075_mtx_vs_mgx <- prjna278075_mtx %>%
  left_join(prjna278075_mgx, by = "sample_name") %>%
  mutate(sample_type = "ocean")

prjna278075_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
3m_Station6_GOM,PRJNA278075,SRR1918203,PRJNA278075,SRR2001210,ocean


## PRJEB38017

In [314]:
# Download the ENA read_run file report (TSV) for BioProject PRJEB38017.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjeb38017 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB38017&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [318]:
# Metatranscriptome runs: build the pairing key by removing "CRF" from the
# alias, turning every remaining "R" into "_", and putting "CRF" back in
# front. Removing "CRF" first keeps its own "R" from being replaced.
prjeb38017_mtx <- prjeb38017 %>%
  filter(library_source == "METATRANSCRIPTOMIC") %>%
  select(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = paste0("CRF",
                              gsub("R", "_", gsub("CRF", "", sample_name))))

# Metagenome runs: the pairing key is the alias with every "D" turned into "_".
prjeb38017_mgx <- prjeb38017 %>%
  filter(library_source == "METAGENOMIC") %>%
  select(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("D", "_", sample_name))

# Pair the two library types on the normalised key.
# NOTE(review): left_join keeps metatranscriptome rows that find no partner
# (their mgx_* columns would be NA) -- confirm that is intended.
prjeb38017_mtx_vs_mgx <- prjeb38017_mtx %>%
  left_join(prjeb38017_mgx, by = "sample_name") %>%
  mutate(sample_type = "cocoa_box_fermentation")

prjeb38017_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
CRF2_7,PRJEB38017,ERR4077213,PRJEB38017,ERR4073751,cocoa_box_fermentation
CRF2_20,PRJEB38017,ERR4077214,PRJEB38017,ERR4073752,cocoa_box_fermentation
CRF2_68,PRJEB38017,ERR4077215,PRJEB38017,ERR4073754,cocoa_box_fermentation


## PRJNA616041

In [322]:
# Download the ENA read_run file report (TSV) for BioProject PRJNA616041.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjna616041 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA616041&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [325]:
# Metatranscriptome runs: sample_alias is used unchanged as the pairing key.
prjna616041_mtx <- prjna616041 %>%
  filter(library_source == "METATRANSCRIPTOMIC") %>%
  select(sample_alias, study_accession, run_accession) %>%
  rename(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession)

# Metagenome runs: same key, mgx_-prefixed accession columns.
prjna616041_mgx <- prjna616041 %>%
  filter(library_source == "METAGENOMIC") %>%
  select(sample_alias, study_accession, run_accession) %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession)

# Pair the two library types on sample_name.
# NOTE(review): left_join keeps metatranscriptome rows that find no partner
# (their mgx_* columns would be NA) -- confirm that is intended.
prjna616041_mtx_vs_mgx <- prjna616041_mtx %>%
  left_join(prjna616041_mgx, by = "sample_name") %>%
  mutate(sample_type = "paddy_soil")

prjna616041_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
HP_AsLow,PRJNA616041,SRR11450577,PRJNA616041,SRR11450583,paddy_soil
SKS_AsHig,PRJNA616041,SRR11450578,PRJNA616041,SRR11450584,paddy_soil
CZ_AsHig,PRJNA616041,SRR11450579,PRJNA616041,SRR11450587,paddy_soil
CL_AsHig,PRJNA616041,SRR11450580,PRJNA616041,SRR11450588,paddy_soil
YCP_AsLow,PRJNA616041,SRR11450585,PRJNA616041,SRR11450581,paddy_soil
LH_AsLow,PRJNA616041,SRR11450586,PRJNA616041,SRR11450582,paddy_soil


## PRJNA393770 (mgx) & PRJNA395125 (mtx)

In [327]:
# Download the ENA read_run file reports (TSV) for the two BioProjects of
# this pair: PRJNA393770 and PRJNA395125.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjna393770 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA393770&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

prjna395125 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA395125&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [334]:
# Metagenome runs from PRJNA393770: sample_alias is used unchanged as the
# pairing key.
prjna393770_mgx <- prjna393770 %>%
  filter(library_source == "METAGENOMIC") %>%
  select(sample_alias, study_accession, run_accession) %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession)

# Metatranscriptome runs from PRJNA395125: pairing key is sample_alias with
# "_MT" removed.
prjna395125_mtx <- prjna395125 %>%
  filter(library_source == "METATRANSCRIPTOMIC") %>%
  select(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("_MT", "", sample_name))

# inner_join: only samples present in both studies are kept.
prjna395125_vs_prjna393770 <- prjna395125_mtx %>%
  inner_join(prjna393770_mgx, by = "sample_name") %>%
  mutate(sample_type = "bioreactor")

## PRJEB12284 (mtx) & PRJEB12083 (mgx)

In [336]:
# Download the ENA read_run file reports (TSV) for the two BioProjects of
# this pair: PRJEB12284 and PRJEB12083.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjeb12284 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB12284&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

prjeb12083 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB12083&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [388]:
# Runs from PRJEB12284, labelled as metatranscriptome (mtx_ columns): the
# pairing key is sample_alias with every literal "." removed.
prjeb12284_mtx <- prjeb12284 %>%
  select(sample_name = sample_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = gsub(".", "", sample_name, fixed = TRUE))

# Runs from PRJEB12083, labelled as metagenome (mgx_ columns): sample_alias
# is used unchanged as the pairing key.
prjeb12083_mgx <- prjeb12083 %>%
  select(sample_alias, study_accession, run_accession) %>%
  rename(sample_name = sample_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession)

# inner_join: only samples present in both studies are kept.
prjeb12284_vs_prjeb12083 <- prjeb12284_mtx %>%
  inner_join(prjeb12083_mgx, by = "sample_name") %>%
  mutate(sample_type = "wastewater")

prjeb12284_vs_prjeb12083

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
AH1,PRJEB12284,ERR1198915,PRJEB12083,ERR1191817,wastewater
AH4,PRJEB12284,ERR1198916,PRJEB12083,ERR1191820,wastewater
AH5,PRJEB12284,ERR1198917,PRJEB12083,ERR1191821,wastewater
AH6,PRJEB12284,ERR1198918,PRJEB12083,ERR1191822,wastewater
DF1,PRJEB12284,ERR1198919,PRJEB12083,ERR1193331,wastewater
DF4,PRJEB12284,ERR1198920,PRJEB12083,ERR1193299,wastewater
DF5,PRJEB12284,ERR1198921,PRJEB12083,ERR1193300,wastewater
DF6,PRJEB12284,ERR1198922,PRJEB12083,ERR1193301,wastewater


## PRJEB32787 (mgx) & PRJEB32788 (mtx)

In [343]:
# Download the ENA read_run file reports (TSV) for the two BioProjects of
# this pair: PRJEB32787 and PRJEB32788.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjeb32787 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB32787&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

prjeb32788 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB32788&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [363]:
# Pairing key for both studies: sample_title with its final "_"-delimited
# field removed.
# The result objects are named "..._vs_prjeb32781" although the metagenome
# study is PRJEB32787; the names are kept because the combine step refers
# to them.
prjeb32787_mgx <- prjeb32787 %>%
  select(sample_name = sample_title,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("_[^_]*$", "", sample_name))

prjeb32788_mtx <- prjeb32788 %>%
  select(sample_name = sample_title,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("_[^_]*$", "", sample_name))

# Pass 1: join on the key as built above and drop rows with no metagenome.
prjeb32788_vs_prjeb32781_1 <- prjeb32788_mtx %>%
  left_join(prjeb32787_mgx, by = "sample_name") %>%
  filter(!is.na(mgx_study_accession))

# Pass 2: remove one more trailing "_" field from the metatranscriptome key
# and join again.
# NOTE(review): pass 2 runs on every metatranscriptome row, including rows
# already paired in pass 1 -- confirm no run can be paired twice.
prjeb32788_mtx_2 <- prjeb32788_mtx %>%
  mutate(sample_name = gsub("_[^_]*$", "", sample_name))

prjeb32788_vs_prjeb32781_2 <- prjeb32788_mtx_2 %>%
  left_join(prjeb32787_mgx, by = "sample_name") %>%
  filter(!is.na(mgx_study_accession))

# Stack the pairs found in both passes.
prjeb32788_vs_prjeb32781 <- prjeb32788_vs_prjeb32781_1 %>%
  bind_rows(prjeb32788_vs_prjeb32781_2) %>%
  mutate(sample_type = "mussel_gill")

prjeb32788_vs_prjeb32781

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
BazoSym_LS-ET_5,PRJEB32788,ERR3342479,PRJEB32787,ERR3342499,mussel_gill
BazoSym_LS-ET_4,PRJEB32788,ERR3342480,PRJEB32787,ERR3342500,mussel_gill
BazoSym_LS-MS_3,PRJEB32788,ERR3342481,PRJEB32787,ERR3342501,mussel_gill
BazoSym_LS-MS_2,PRJEB32788,ERR3342482,PRJEB32787,ERR3342502,mussel_gill
BazoSym_LS-MS_1,PRJEB32788,ERR3342483,PRJEB32787,ERR3342503,mussel_gill
BspSym_Li_5,PRJEB32788,ERR3342467,PRJEB32787,ERR3342486,mussel_gill
BspSym_Li_4,PRJEB32788,ERR3342468,PRJEB32787,ERR3342487,mussel_gill
BspSym_Li_3,PRJEB32788,ERR3342469,PRJEB32787,ERR3342488,mussel_gill
BspSym_Li_2,PRJEB32788,ERR3342470,PRJEB32787,ERR3342489,mussel_gill
BspSym_Li_1,PRJEB32788,ERR3342471,PRJEB32787,ERR3342490,mussel_gill


## iHMP (PRJNA398089)

In [365]:
# Download the ENA read_run file report (TSV) for BioProject PRJNA398089.
# show_col_types is spelled FALSE: the F shorthand is an ordinary variable
# that can be reassigned.
prjna398089 <- read_tsv(
  "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJNA398089&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,submission_accession,tax_id,scientific_name,instrument_platform,instrument_model,library_name,nominal_length,library_layout,library_strategy,library_source,library_selection,read_count,base_count,center_name,first_public,last_updated,experiment_title,study_title,study_alias,experiment_alias,run_alias,fastq_bytes,fastq_md5,fastq_ftp,fastq_aspera,fastq_galaxy,submitted_bytes,submitted_md5,submitted_ftp,submitted_aspera,submitted_galaxy,submitted_format,sra_bytes,sra_md5,sra_ftp,sra_aspera,sra_galaxy,cram_index_ftp,cram_index_aspera,cram_index_galaxy,sample_alias,broker_name,sample_title,nominal_sdev,first_created&format=tsv&download=true&limit=0",
  show_col_types = FALSE
)

In [386]:
# Metatranscriptome runs: METATRANSCRIPTOMIC libraries sequenced as RNA-Seq.
# Pairing key is experiment_alias with "_MTX" removed.
prjna398089_mtx <- prjna398089 %>%
  filter(library_source == "METATRANSCRIPTOMIC",
         library_strategy == "RNA-Seq") %>%
  select(sample_name = experiment_alias,
         mtx_study_accession = study_accession,
         mtx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("_MTX", "", sample_name))

# Metagenome runs: METAGENOMIC libraries sequenced as WGS.
# Pairing key is experiment_alias with "_MGX" removed.
prjna398089_mgx <- prjna398089 %>%
  filter(library_source == "METAGENOMIC",
         library_strategy == "WGS") %>%
  select(sample_name = experiment_alias,
         mgx_study_accession = study_accession,
         mgx_run_accession = run_accession) %>%
  mutate(sample_name = gsub("_MGX", "", sample_name))

# inner_join: only samples with both library types are kept.
prjna398089_mtx_vs_mgx <- prjna398089_mtx %>%
  inner_join(prjna398089_mgx, by = "sample_name") %>%
  mutate(sample_type = "human_gut")

prjna398089_mtx_vs_mgx

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
CSM67UC6,PRJNA398089,SRR5949109,PRJNA398089,SRR5936217,human_gut
CSM67UDY,PRJNA398089,SRR5949110,PRJNA398089,SRR5936216,human_gut
HSM67VI9,PRJNA398089,SRR5949111,PRJNA398089,SRR5936212,human_gut
HSM6XRTQ,PRJNA398089,SRR5949112,PRJNA398089,SRR5936211,human_gut
CSM79HHM,PRJNA398089,SRR5949113,PRJNA398089,SRR5946811,human_gut
CSM79HIR,PRJNA398089,SRR5949114,PRJNA398089,SRR5936215,human_gut
CSM79HJO,PRJNA398089,SRR5949115,PRJNA398089,SRR5947089,human_gut
HSM67VEI,PRJNA398089,SRR5949116,PRJNA398089,SRR5936210,human_gut
MSM79H9K,PRJNA398089,SRR5949117,PRJNA398089,SRR5935975,human_gut
MSM79HAH,PRJNA398089,SRR5949118,PRJNA398089,SRR5935976,human_gut


# Combine everything together

In [414]:
# Stack every per-study pair table, in this order, into a single table.
all_paired_mtx_mgx <- list(
  prjna406858_mtx_vs_mgx,
  prjna448333_mtx_vs_mgx,
  prjna344005_mtx_vs_mgx,
  prjna453733_mtx_vs_mgx,
  prjna237345_vs_prjna237344,
  moran010B_mtx_vs_mgx,
  prjna603240_mtx_vs_mgx,
  prjna202380_mtx_vs_mgx,
  prjna541981_mtx_vs_mgx,
  prjna797778_mtx_vs_mgx,
  prjna339914_mtx_vs_mgx,
  prjeb33889_mtx_vs_mgx,
  prjna698464_mtx_vs_mgx,
  prjna396840_mtx_vs_mgx,
  prjna492158_mtx_vs_mgx,
  prjna278075_mtx_vs_mgx,
  prjeb38017_mtx_vs_mgx,
  prjna616041_mtx_vs_mgx,
  prjna395125_vs_prjna393770,
  prjeb12284_vs_prjeb12083,
  prjeb32788_vs_prjeb32781,
  prjna398089_mtx_vs_mgx
) %>%
  bind_rows()

# Show the first rows, the last rows, and the total row count.
head(all_paired_mtx_mgx)
tail(all_paired_mtx_mgx)
nrow(all_paired_mtx_mgx)

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
C_2_1,PRJNA406858,SRR6032600,PRJNA406858,SRR6032602,activated_sludge
B_2_1,PRJNA406858,SRR6032604,PRJNA406858,SRR6032601,activated_sludge
E_2_1,PRJNA406858,SRR6032605,PRJNA406858,SRR6032603,activated_sludge
rumen_microbiome_of_beef_cattle_101,PRJNA448333,SRR8416057,PRJNA448333,SRR8397906,cattle_rumen
rumen_microbiome_of_beef_cattle_103,PRJNA448333,SRR8416058,PRJNA448333,SRR8404214,cattle_rumen
rumen_microbiome_of_beef_cattle_104,PRJNA448333,SRR8416055,PRJNA448333,SRR8397905,cattle_rumen


sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
PSM6XBQU,PRJNA398089,SRR5963943,PRJNA398089,SRR5936007,human_gut
MSMAPC5Z,PRJNA398089,SRR5963944,PRJNA398089,SRR5946832,human_gut
MSM6J2M3,PRJNA398089,SRR5963945,PRJNA398089,SRR5935963,human_gut
MSM6J2K6,PRJNA398089,SRR5963946,PRJNA398089,SRR5935938,human_gut
MSM6J2Q1,PRJNA398089,SRR5963947,PRJNA398089,SRR5936013,human_gut
MSM6J2PS,PRJNA398089,SRR5963948,PRJNA398089,SRR5936231,human_gut


In [415]:
# Count how many pairs fall under each sample type.
table(all_paired_mtx_mgx[["sample_type"]])


      activated_sludge             bioreactor           cattle_rumen 
                     3                     11                     48 
cocoa_box_fermentation               deadwood            groundwater 
                     3                     10                      2 
             human_gut   human_gut_microbiome             human_oral 
                   787                      4                     30 
            human_skin           human_vagina                   lake 
                    17                    180                      1 
           mouse_cecum            mussel_gill                  ocean 
                     8                     17                     75 
            paddy_soil                  river            sheep_rumen 
                     6                     23                     20 
            wastewater 
                     8 

In [418]:
# Check that no sample name occurs more than once, either from faulty joins
# or from sample identifiers shared across studies.
# First show the number of distinct sample names.
all_paired_mtx_mgx$sample_name %>%
  unique() %>%
  length()

# Sample names that occur in more than one row.
tmp <- all_paired_mtx_mgx %>%
  count(sample_name) %>%
  filter(n > 1)

# Show every row that carries one of those repeated names.
all_paired_mtx_mgx %>%
  semi_join(tmp, by = "sample_name")

sample_name,mtx_study_accession,mtx_run_accession,mgx_study_accession,mgx_run_accession,sample_type
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
