In [None]:
# Fix the RNG seed so that every run produces the same synthetic data
set.seed(130)

# Build a synthetic "person" table of n rows for a case or control cohort.
#
# Args:
#   start_id:       first person_id; ids run consecutively from this value.
#   n:              number of rows to generate; 0 yields an empty data frame.
#   is_case:        TRUE weights ages 55-75 twice as heavily as ages 20-54;
#                   FALSE weights ages 20-59 at 1.2 against 1 for ages 60-75.
#   reference_year: year the sampled age is subtracted from to obtain the
#                   birth year (defaults to 2024).
#
# Returns:
#   A data.frame with columns person_id, sex_at_birth, date_of_birth
#   (character, formatted "%Y-%m-%d %H:%M:%S %Z" in UTC), race and ethnicity.
generate_random_data <- function(start_id, n, is_case, reference_year = 2024) {
  # The age weights depend only on is_case, so build them once up front.
  # Both vectors have 56 entries, one per age in 20:75.
  age_weights <- if (is_case) {
    c(rep(1, 35), rep(2, 21))  # ages 20-54 weight 1, ages 55-75 weight 2
  } else {
    c(rep(1.2, 40), rep(1, 16))  # ages 20-59 weight 1.2, ages 60-75 weight 1
  }

  birthdate <- character(n)
  # seq_len() makes the loop a no-op for n == 0 (1:n would iterate over 1, 0).
  # The per-row order of random draws (age first, then timestamp) is kept so
  # that results under a fixed seed do not change.
  for (i in seq_len(n)) {
    age <- sample(20:75, 1, replace = TRUE, prob = age_weights)
    year <- reference_year - age
    start_date <- as.POSIXct(paste0(year, "-01-01 00:00:00"), tz = "UTC")
    # Upper bound is midnight at the start of Dec 31, so generated birthdates
    # fall between Jan 1 and Dec 30 of the birth year.
    end_date <- as.POSIXct(paste0(year, "-12-31 00:00:00"), tz = "UTC")
    dob <- as.POSIXct(
      runif(1, as.numeric(start_date), as.numeric(end_date)),
      origin = "1970-01-01",
      tz = "UTC"
    )
    birthdate[i] <- format(dob, "%Y-%m-%d %H:%M:%S %Z")
  }

  # start_id:(start_id + n - 1) counts downwards when n == 0, so handle the
  # empty case explicitly.
  person_id <- if (n > 0) start_id:(start_id + n - 1) else integer(0)

  data.frame(
    person_id = person_id,
    sex_at_birth = sample(
      c("Male", "Female"), n,
      replace = TRUE, prob = c(.40, .60)
    ),
    date_of_birth = birthdate,
    race = sample(
      c("Black or African American", "White", "Asian"), n,
      replace = TRUE, prob = c(.15, .70, .15)
    ),
    ethnicity = sample(
      c("Not Hispanic or Latino", "Hispanic or Latino"), n,
      replace = TRUE, prob = c(.83, .17)
    ),
    stringsAsFactors = FALSE
  )
}


# Produce a case table (n1 rows) and a control table (n2 rows) whose
# person_id ranges do not overlap: cases use ids 1..n1 and controls continue
# from n1 + 1. Returns a list with elements df_case and df_control.
generate_two_dataframes <- function(n1, n2) {
  first_control_id <- n1 + 1

  # list() evaluates its arguments in order, so the case table is generated
  # before the control table.
  list(
    df_case = generate_random_data(1, n1, TRUE),
    df_control = generate_random_data(first_control_id, n2, FALSE)
  )
}

# Build the synthetic cohorts: 1500 case rows and 8000 control rows
datasets <- generate_two_dataframes(1500, 8000)

# Pull each cohort out of the returned list
dataset_12345678_person_df <- datasets[["df_case"]]  # CASE_DF
dataset_23456789_person_df <- datasets[["df_control"]]  # CONTROL_DF

# Draw, without replacement, 843 case ids and 2661 control ids
disease_case <- sample(
  dataset_12345678_person_df[["person_id"]],
  size = 843
)
disease_control <- sample(
  dataset_23456789_person_df[["person_id"]],
  size = 2661
)

# Stack the sampled ids (cases first, then controls) into a one-column table
dataset_34567890_person_df <- data.frame(
  person_id = c(disease_case, disease_control)
)  # CONDITION_DF