In [1]:
library(SBC);
library(cmdstanr);
library(tidyverse);
options(mc.cores = parallel::detectCores());
library(future);
plan(multisession);

options(SBC.min_chunk_size = 5);

cache_dir <- "./SBC_cache"
if(!dir.exists(cache_dir)) {
    dir.create(cache_dir)
}

ALPHA <- 0.05
model_Poisson_v2 <- cmdstanr::cmdstan_model("../models/Poisson_model_v2.stan")

This is cmdstanr version 0.7.1

- CmdStanR documentation and vignettes: mc-stan.org/cmdstanr

- CmdStan path: /Users/igor.michels/.cmdstan/cmdstan-2.34.1

- CmdStan version: 2.34.1


A newer version of CmdStan is available. See ?install_cmdstan() to install it.
To disable this check set option or environment variable CMDSTANR_NO_VER_CHECK=TRUE.

-- [1mAttaching core tidyverse packages[22m ------------------------ tidyverse 2.0.0 --
[32mv[39m [34mdplyr    [39m 1.1.4     [32mv[39m [34mreadr    [39m 2.1.4
[32mv[39m [34mforcats  [39m 1.0.0     [32mv[39m [34mstringr  [39m 1.5.1
[32mv[39m [34mggplot2  [39m 3.5.0     [32mv[39m [34mtibble   [39m 3.2.1
[32mv[39m [34mlubridate[39m 1.9.3     [32mv[39m [34mtidyr    [39m 1.3.1
[32mv[39m [34mpurrr    [39m 1.0.2     
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::

In [2]:
N_SIMS <- 50
N_CLUBS <- 20
N_ITER_WARMUP <- 200
N_ITER_SAMPLING <- 300
N_CHAINS <- 4

In [3]:
data_generator_single_Poisson_v2 <- function(n_clubs, n_players_per_clubs){
    n_seasons <- 1
    clubs <- 1:n_clubs
    force <- abs(rnorm(length(clubs) * n_players_per_clubs))
    df <- data.frame(Club = clubs)
    data <- merge(df, df, by = NULL) %>% filter(Club.x != Club.y)
    data <- data %>% slice(rep(row_number(), n_seasons))
    home_goals <- list()
    away_goals <- list()
    home_players <- array(NA, dim = c(nrow(data), 11))
    away_players <- array(NA, dim = c(nrow(data), 11))
    players <- seq(1, n_players_per_clubs)
    for (i in 1:nrow(data)) {
        home_team <- data[i, 1]
        away_team <- data[i, 2]
        home_players_game <- sample(players)[1:11]
        away_players_game <- sample(players)[1:11]
        home_force <- sum(force[(home_team - 1) *  n_players_per_clubs + home_players_game])
        away_force <- sum(force[(away_team - 1) *  n_players_per_clubs + away_players_game])
        home_goals <- append(home_goals, rpois(1, home_force / away_force))
        away_goals <- append(away_goals, rpois(1, away_force / home_force))
        home_players[i,] <- home_players_game
        away_players[i,] <- home_players_game
    }

    data$home.goals <- home_goals
    data$away.goals <- away_goals
    names(data) <- c("home_name", "away_name", "home_goals", "away_goals")

    list(
        variables = list(
            skills = force
        ),
        generated = list(
            n_games = nrow(data),
            n_teams = n_clubs,
            n_players_per_team = n_players_per_clubs,
            home_team = data$home_name,
            away_team = data$away_name,
            home_score = data$home_goals,
            away_score = data$away_goals,
            home_players = home_players,
            away_players = away_players
        )
    )
}

In [4]:
run_model <- function(n_players_per_clubs){
    set.seed(0)
    data_generator_Poisson_v2 <- SBC_generator_function(data_generator_single_Poisson_v2, n_clubs = N_CLUBS,
                                                        n_players_per_clubs = n_players_per_clubs)

    dataset_Poisson_v2 <- generate_datasets(data_generator_Poisson_v2, N_SIMS)
    
    # max_rank will be iter_sampling * chains / 10 - 1
    backend_Poisson_v2 <- SBC_backend_cmdstan_sample(model_Poisson_v2, iter_warmup = N_ITER_WARMUP,
                                                     iter_sampling = N_ITER_SAMPLING, chains = N_CHAINS)

    results_Poisson_v2 <- compute_SBC(dataset_Poisson_v2, backend_Poisson_v2,
                                      keep_fits = FALSE,
                                      cache_mode = "results",
                                      cache_location = file.path(cache_dir,
                                                                 sprintf("results_Poisson_v2_%02d_players_per_club",
                                                                         n_players_per_clubs)))
    
    write.csv(results_Poisson_v2$stats,
              file = sprintf("SBC_cache/results_Poisson_v2_%02d_players_per_club_stats.csv",
                             n_players_per_clubs))
    
    results_Poisson_v2
}

In [5]:
all_results <- list()
for (n_players_per_clubs in 11:20) {
    results <- run_model(n_players_per_clubs)

    graph <- plot_ecdf(results)
    plot_data <- ggplot_build(graph)$data
    confidence_interval <- plot_data[[1]]
    ecdf <- plot_data[[2]]

    df1 <- merge(select(confidence_interval, - c(colour, fill, group, flipped_aes, linewidth, linetype, alpha, y)),
                 select(ecdf, - c(colour, fill, group, linewidth, linetype, alpha)),
                 by = c("PANEL", "x"), all.x = TRUE) %>%
        group_by(PANEL, x) %>%
        summarize(ymax = max(ymax, na.rm = TRUE),
                  ymin = max(ymin, na.rm = TRUE),
                  y = max(y, na.rm = TRUE))

    df1$out <- (df1$ymax < df1$y) + (df1$ymin > df1$y)
    df1 <- df1 %>% group_by(PANEL) %>% summarise(out_ratio = sum(out), .groups = 'keep')
    df1$out_ratio <- df1$out_ratio / length(unique(ecdf$x))
    df1$out <- df1$out_ratio > ALPHA

    graph <- plot_ecdf_diff(results)
    plot_data <- ggplot_build(graph)$data
    confidence_interval <- plot_data[[1]]
    ecdf <- plot_data[[2]]

    df2 <- merge(select(confidence_interval, - c(colour, fill, group, flipped_aes, linewidth, linetype, alpha, y)),
                 select(ecdf, - c(colour, fill, group, linewidth, linetype, alpha)),
                 by = c("PANEL", "x"), all.x = TRUE) %>%
        group_by(PANEL, x) %>%
        summarize(ymax = max(ymax, na.rm = TRUE),
                  ymin = max(ymin, na.rm = TRUE),
                  y = max(y, na.rm = TRUE))

    df2$out <- (df2$ymax < df2$y) + (df2$ymin > df2$y)
    df2 <- df2 %>% group_by(PANEL) %>% summarise(out_ratio = sum(out), .groups = 'keep')
    df2$out_ratio <- df2$out_ratio / length(unique(ecdf$x))
    df2$out <- df2$out_ratio > ALPHA

    final_df <- merge(df1, df2, by = "PANEL", suffixes = c("", "_diff"))
    key <- paste0(n_players_per_clubs, "_players")
    all_results[[key]] <- c(mean(as.numeric(final_df$out)), mean(as.numeric(final_df$out_diff)))
}

Results loaded from cache file 'results_Poisson_v2_11_players_per_club'

 - 50 (100%) fits had at least one Rhat > 1.01. Largest Rhat was 1.059.

 - 4 (8%) fits had tail ESS undefined or less than half of the maximum rank, potentially skewing 
the rank statistics. The lowest tail ESS was 45.
 If the fits look good otherwise, increasing `thin_ranks` (via recompute_SBC_statistics) 
or number of posterior draws (by refitting) might help.

Not all diagnostics are OK.
You can learn more by inspecting $default_diagnostics, $backend_diagnostics 

[1m[22m`summarise()` has grouped output by 'PANEL'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'PANEL'. You can override using the
`.groups` argument.
Results loaded from cache file 'results_Poisson_v2_12_players_per_club'

 - 50 (100%) fits had at least one Rhat > 1.01. Largest Rhat was 1.04.

 - 3 (6%) fits had tail ESS undefined or less than half of the maximum rank, potentially skewing 
the rank 

In [6]:
print(all_results)

$`11_players`
[1] 0.01818182 0.07272727

$`12_players`
[1] 0.01666667 0.07916667

$`13_players`
[1] 0.02307692 0.06923077

$`14_players`
[1] 0.003571429 0.060714286

$`15_players`
[1] 0.03000000 0.07666667

$`16_players`
[1] 0.021875 0.068750

$`17_players`
[1] 0.02352941 0.05882353

$`18_players`
[1] 0.02222222 0.07222222

$`19_players`
[1] 0.01578947 0.06315789

$`20_players`
[1] 0.0250 0.0875

