In [None]:
library(httr)
library(data.table)
library(dplyr)
library(jsonlite)
source('../../get_api_studies_for_disease.R')
# Show the working directory; the relative path given to source() above is
# resolved against it.
getwd()
# API key sent as the `x-api-key` header on CTS API v2 requests. Sys.getenv()
# yields "" when the environment variable is not set.
CTS_V2_API_KEY <- Sys.getenv("CTS_V2_API_KEY")


In [None]:
# Default number of trials requested per page (the `size` request field).
PAGE_SIZE <- 50

# Concept codes used as search filters.
BC <- "C4872" # Breast Cancer
STAGE_II_BC_CODES <- c(
  "C7768",
  "C139569",
  "C139538"
) # Stage II Breast Cancer (w/ AJCC versions)

#' Stage IIA and IIB are not part of the filters above, yet the CTS API still
#' returns trials that have those diseases coded at the TRIAL level, so they
#' are listed as expected matches too.
EXPECTED_DISEASE_MATCHES <- c(
  "Stage II Breast Cancer",
  "Stage IIA Breast Cancer",
  "Stage IIB Breast Cancer"
)

# Trial fields requested back from the API (the `include` request field).
INCLUDE_FIELDS <- c(
  "nct_id", "diseases", "current_trial_status",
  "primary_purpose", "sites.recruitment_status"
)

#' Search CTS API v2 trials using maintype/stage filters.
#'
#' Defaults to search maintype = Breast Carcinoma and subtype = Stage II.
#' If only searching active trials, then this works out the same as the SEC
#' POC search when it filters primary purpose of TREATMENT and excludes
#' sites.recruitment_status.
#'
#' @param maintype_codes Code(s) sent as the `maintype` filter.
#' @param stage_codes Code(s) sent as the `stage` filter.
#' @param from Offset of the first result to return (sent as `from`).
#' @param size Number of results per page (sent as `size`).
#' @param timeout_sec Request timeout in seconds.
#' @return The parsed response body as produced by `httr::content()`; callers
#'   in this file read its `data` and `total` elements.
search_like_cancer.gov <- function(
    maintype_codes = BC,
    stage_codes = STAGE_II_BC_CODES,
    from = 0,
    size = PAGE_SIZE,
    timeout_sec = 4) {
  response <- httr::POST(
    "https://clinicaltrialsapi.cancer.gov/api/v2/trials",
    body = list(
      current_trial_status = c(
        "Active"
        # "Approved",
        # "Enrolling by Invitation",
        # "In Review",
        # "Temporarily Closed to Accrual",
        # "Temporarily Closed to Accrual and Intervention"
      ),
      primary_purpose = c(
        "TREATMENT"
      ),
      include = INCLUDE_FIELDS,
      maintype = maintype_codes,
      stage = stage_codes,
      from = from,
      size = size
    ),
    encode = "json",
    httr::add_headers(`x-api-key` = CTS_V2_API_KEY, `Content-Type` = "application/json"),
    httr::timeout(timeout_sec)
  )
  # Check the status before parsing: an error response may carry a body that
  # cannot be parsed, and a parse failure would hide the actual HTTP status.
  status <- httr::status_code(response)
  assertthat::assert_that(status == 200, msg = paste("Response status is", status))
  httr::content(response)
}

#' Search CTS API v2 trials by disease concept code, like the SEC POC does.
#'
#' Defaults to search diseases.nci_thesaurus_concept_id = Stage II Breast
#' Cancer.
#'
#' @param ncit_code Code(s) sent as the `diseases.nci_thesaurus_concept_id`
#'   filter.
#' @param from Offset of the first result to return (sent as `from`).
#' @param size Number of results per page (sent as `size`).
#' @param timeout_sec Request timeout in seconds.
#' @return The parsed response body as produced by `httr::content()`; callers
#'   in this file read its `data` and `total` elements.
search_like_sec_poc <- function(
    ncit_code = STAGE_II_BC_CODES,
    from = 0,
    size = PAGE_SIZE,
    timeout_sec = 4) {
  response <- httr::POST(
    "https://clinicaltrialsapi.cancer.gov/api/v2/trials",
    body = list(
      current_trial_status = c(
        "Active"
      ),
      primary_purpose = c(
        "TREATMENT"
        # "SCREENING"
      ),
      diseases.nci_thesaurus_concept_id = ncit_code,
      # sites.recruitment_status = "ACTIVE",
      include = INCLUDE_FIELDS,
      from = from,
      size = size
    ),
    encode = "json",
    httr::add_headers(`x-api-key` = CTS_V2_API_KEY, `Content-Type` = "application/json"),
    httr::timeout(timeout_sec)
  )
  # Check the status before parsing: an error response may carry a body that
  # cannot be parsed, and a parse failure would hide the actual HTTP status.
  status <- httr::status_code(response)
  assertthat::assert_that(status == 200, msg = paste("Response status is", status))
  httr::content(response)
}

#' Keep requesting pages until the expected number of records is collected.
#'
#' @param paged_data List of records already fetched (typically page one).
#' @param total_expected Total number of records the search reported.
#' @param FUN Function accepting a `from` argument (the offset of the next
#'   record) and returning a list whose `data` element holds that page.
#' @return `paged_data` with the records of every additional page appended.
paginate_cts_api <- function(paged_data, total_expected, FUN) {
  print(paste("    Expecting:", total_expected))
  while (length(paged_data) < total_expected) {
    page <- FUN(from = length(paged_data))$data
    print(paste("          Got:", length(page)))
    if (length(page) == 0) {
      # An empty page never advances the offset, so continuing would repeat
      # the same request forever. Stop and report the shortfall instead.
      warning(
        "Received an empty page after ", length(paged_data), " of ",
        total_expected, " expected records; stopping pagination.",
        call. = FALSE
      )
      break
    }
    paged_data <- append(paged_data, page)
    print(paste("        Total:", length(paged_data)))
  }
  paged_data
}

#' Flatten a list of trials into one data-frame row per (trial, disease) pair.
#'
#' @param trials List of trial records; each is read for `nct_id`,
#'   `primary_purpose`, `current_trial_status`, `sites` and `diseases`.
#' @return A data frame with one row per disease of each trial, or an empty
#'   `data.frame()` when there are no diseases at all. Scalar fields that are
#'   absent from a record are reported as `NA` instead of raising an error.
#'   `sites.recruitment_status` is "ACTIVE" when any value found under the
#'   trial's `sites` equals "ACTIVE", otherwise "NO_ACTIVE".
flatten_trials <- function(trials) {
  # data.frame() fails on zero-length arguments, so substitute NA for fields
  # the API left out.
  value_or_na <- function(x, missing = NA_character_) {
    if (length(x) == 0) missing else x
  }

  rows <- list()
  for (trial in trials) {
    # Site status only depends on the trial, so compute it once per trial.
    has_active_site <- "ACTIVE" %in% unlist(trial$sites, use.names = FALSE)
    site_status <- if (has_active_site) "ACTIVE" else "NO_ACTIVE"

    for (d in trial$diseases) {
      rows[[length(rows) + 1L]] <- data.frame(
        nct_id = value_or_na(trial$nct_id),
        primary_purpose = value_or_na(trial$primary_purpose),
        current_trial_status = toString(trial$current_trial_status),
        sites.recruitment_status = site_status,
        inclusion_indicator = value_or_na(d$inclusion_indicator),
        is_lead_disease = value_or_na(d$is_lead_disease, missing = NA),
        type = toString(d$type),
        name = value_or_na(d$name),
        nci_thesaurus_concept_id = value_or_na(d$nci_thesaurus_concept_id),
        parents = toString(d$parents),
        stringsAsFactors = FALSE
      )
    }
  }

  if (length(rows) == 0) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data frame inside the loop.
  diseases <- do.call(rbind, rows)
  rownames(diseases) <- NULL
  diseases
}


#### Get Disease Maintypes (used in CTS API)

---

In [None]:
# Request every disease term of type "maintype" from the CTS API, asking for
# the count, codes, name and type of each term.
response <- httr::GET(
  paste0(
    "https://clinicaltrialsapi.cancer.gov/api/v2/diseases?type=maintype",
    # "&type_not=subtype&type_not=stage&type_not=grade&type_not=finding",
    "&include=count&include=codes&include=name&include=type"
  ),
  httr::add_headers(
    `x-api-key` = CTS_V2_API_KEY,
    `Content-Type` = "application/json"
  )
)

# Display the HTTP status, then keep the parsed body for the next cell.
httr::status_code(response)
contents <- httr::content(response)


In [None]:
# Build one single-row data frame per disease term, then bind them all at
# once (growing a data frame with rbind() inside a loop copies it each time).
term_rows <- lapply(unname(contents$data), function(x) {
  data.frame(
    name = x$name,
    codes = toString(x$codes),
    type = toString(x$type),
    count = x$count,
    stringsAsFactors = FALSE
  )
})
# do.call(rbind, list()) would give NULL, so fall back to an empty data frame.
terms <- if (length(term_rows) > 0) do.call(rbind, term_rows) else data.frame()
rownames(terms) <- NULL


In [None]:
# Show the maintype terms ordered alphabetically by name.
dplyr::arrange(terms, name)

In [None]:
# Show the maintype terms whose name mentions "lung" (case-insensitive).
terms[grepl("lung", terms$name, ignore.case = TRUE), ]

#### Search Trials by Disease

---

In [None]:
# Run the cancer.gov-style search: fetch the first page, then keep requesting
# pages until the number of trials reported in `total` has been collected.
page1 <- search_like_cancer.gov()
all_pages_from_cancer.gov <- paginate_cts_api(page1$data, page1$total, search_like_cancer.gov)

# Same procedure for the SEC POC-style search (`page1` is reused).
page1 <- search_like_sec_poc()
all_pages_from_sec_poc <- paginate_cts_api(page1$data, page1$total, search_like_sec_poc)

In [None]:
# Flatten each result set into a data frame with one row per
# (trial, disease) pair.
trials_cancer.gov <- flatten_trials(all_pages_from_cancer.gov)
trials_sec_poc <- flatten_trials(all_pages_from_sec_poc)

In [None]:
# Keep only the disease rows whose inclusion indicator is "TRIAL".
trial_only_cancer.gov <- trials_cancer.gov[
  trials_cancer.gov$inclusion_indicator == "TRIAL",
]
trial_only_sec_poc <- trials_sec_poc[
  trials_sec_poc$inclusion_indicator == "TRIAL",
]

# Distinct trial ids returned by each search.
cancer.gov_ids <- unique(trial_only_cancer.gov$nct_id)
sec_poc_ids <- unique(trial_only_sec_poc$nct_id)

# Trials found by both searches, and how many there are.
overlap <- intersect(cancer.gov_ids, sec_poc_ids)
length(overlap)

# Trials found by exactly one of the two searches.
disjoint_cancer.gov <- setdiff(cancer.gov_ids, overlap)
disjoint_sec_poc <- setdiff(sec_poc_ids, overlap)

# For each group, list the distinct combinations of trial status, site
# recruitment status and primary purpose.
print("Overlapping")
unique(
  trial_only_cancer.gov[
    trial_only_cancer.gov$nct_id %in% overlap,
    c("current_trial_status", "sites.recruitment_status", "primary_purpose")
  ]
)

print("Disjoint cancer.gov")
unique(
  trial_only_cancer.gov[
    trial_only_cancer.gov$nct_id %in% disjoint_cancer.gov,
    c("current_trial_status", "sites.recruitment_status", "primary_purpose")
  ]
)
print("Disjoint SEC POC")
unique(
  trial_only_sec_poc[
    trial_only_sec_poc$nct_id %in% disjoint_sec_poc,
    c("current_trial_status", "sites.recruitment_status", "primary_purpose")
  ]
)


> The disjoint cases arise where the filters provided to the CTS API differ. For example, SEC included SCREENING in the results while cancer.gov did not.

> Conversely, cancer.gov included trial statuses other than Active.


In [None]:
# Trial-level rows whose disease name is one of the expected matches.
cancer.gov_exp_disease_t <- trial_only_cancer.gov[
  trial_only_cancer.gov$name %in% EXPECTED_DISEASE_MATCHES,
  c("nct_id", "name")
]
# Keep one row per trial (its first matching disease). The previous
# `which(unique(ids) %in% ids)` was always 1..k and therefore just kept the
# first k rows, where k is the number of distinct trial ids.
cancer.gov_exp_disease_t <- cancer.gov_exp_disease_t[
  !duplicated(cancer.gov_exp_disease_t$nct_id),
]
row.names(cancer.gov_exp_disease_t) <- NULL
str(cancer.gov_exp_disease_t)

# Same selection for the SEC POC search results.
sec_poc_exp_disease_t <- trial_only_sec_poc[
  trial_only_sec_poc$name %in% EXPECTED_DISEASE_MATCHES,
  c("nct_id", "name")
]
sec_poc_exp_disease_t <- sec_poc_exp_disease_t[
  !duplicated(sec_poc_exp_disease_t$nct_id),
]
row.names(sec_poc_exp_disease_t) <- NULL
str(sec_poc_exp_disease_t)
