In [None]:
# Load the code held in two sibling notebooks into this session.
#
# `jupyter nbconvert --to=script --stdout` prints a notebook as a plain script;
# that text is parsed and evaluated in the global environment, so whatever the
# notebook defines is available to the cells that follow.
#
# path: path to the .ipynb file to convert and evaluate.
# Returns the value of the last evaluated expression of the converted script.
# Stops with an error when `jupyter nbconvert` exits with a non-zero status.
source_notebook <- function(path) {
  script <- system2(
    "jupyter",
    c("nbconvert", path, "--to=script", "--stdout"),
    stdout = TRUE
  )

  # With stdout = TRUE, system2() only warns on a non-zero exit status and
  # records the code in the "status" attribute. Fail loudly here instead of
  # evaluating partial or empty output.
  status <- attr(script, "status")
  if (!is.null(status) && status != 0) {
    stop(
      "jupyter nbconvert failed for '", path, "' (exit status ", status, ")",
      call. = FALSE
    )
  }

  # Inside a function eval() defaults to the function's own frame, so the
  # global environment is requested explicitly.
  eval(parse(text = script), envir = globalenv())
}

f_func <- "./functions.ipynb"
source_notebook(f_func)

f_data_loading <- "./data_loading.ipynb"
source_notebook(f_data_loading)

In [None]:
# Ratio of first-base end-motif frequencies (A. fumigatus : Human) per sample,
# tested against the all-bases reference and drawn as a faceted dot plot.

# Step 1: label every row with the genome it was assigned to.
Afum_data <- mutate(Afum_IS_meta, source = "Aspergillus")
host_nonMT_data <- mutate(host_nonMT_IS_meta, source = "Human")
# MT and non-MT host rows get the same label on purpose so they are summed.
host_MT_data <- mutate(host_MT_IS_meta, source = "Human")

# Step 2: stack the three tables.
combined_data <- bind_rows(Afum_data, host_nonMT_data, host_MT_data)

# Only the first setting ("no") is iterated because of the `[1]` subset.
for (tmp_ss in c("no", "yes")[1]) {
  if (tmp_ss == "no") {
    max_TLEN <- 500
    tmp_title <- "no size selection"
  } else if (tmp_ss == "yes") {
    max_TLEN <- 100
    tmp_title <- "size selection, 35-100 bp"
  }

  # Step 3: per sample and source, sum the counts by the first base of the
  # end motif, spread the bases into columns and turn them into fractions of
  # the A/C/G/T total.
  expanded_result <- combined_data %>%
    filter(origin == "IPA samples", TLEN <= max_TLEN) %>%
    mutate(EndX = substr(EndMotif, 1, 1)) %>%
    group_by(
      sample, type, sample_type, sample_id, source, EndX, origin, read
    ) %>%
    summarise(sumCount = sum(Count), .groups = "keep") %>%
    group_by(sample, type, sample_type, sample_id, source, origin, read) %>%
    filter(EndX %in% c("C", "G", "T", "A")) %>%
    pivot_wider(
      names_from = EndX,
      values_from = sumCount,
      values_fill = 0
    ) %>%
    mutate(TotalCount = rowSums(across(c(C, T, G, A)))) %>%
    # Keep TotalCount last in this list: it is divided by itself and becomes
    # the reference value of 1 wherever the total is non-zero.
    mutate(across(c(C, T, G, A, TotalCount), ~ .x / TotalCount)) %>%
    reshape2::melt() %>%
    # The normalised total is relabelled "." and serves as the reference group.
    mutate(
      variable = ifelse(variable == "TotalCount", ".", as.character(variable))
    )

  # Steps 4-5: one column per source, then the Aspergillus:Human ratio.
  # Rows with a zero Aspergillus value or an undefined ratio are removed.
  sum_wide <- expanded_result %>%
    pivot_wider(names_from = source, values_from = value, values_fill = 0) %>%
    mutate(ratio = ifelse(Human > 0, Aspergillus / Human, NA)) %>%
    filter(Aspergillus != 0, !is.na(ratio))

  # Step 6: one-sided Wilcoxon tests between motif groups within each
  # sample_type / origin / read combination.
  stat.test <- compare_means(
    formula = ratio ~ variable,
    data = as.data.frame(sum_wide),
    group.by = c("sample_type", "origin", "read"),
    method = "wilcox.test",
    alternative = "less",
    p.adjust.method = "bonferroni"
  )

  # Keep the comparisons against the "." reference, attach a fixed bracket
  # position per base, and drop the non-significant ones.
  # NOTE(review): the plot label uses `p.signif`; confirm whether that column
  # reflects the Bonferroni-adjusted p-value or the raw one.
  sel_stat.test <- stat.test %>%
    filter(group2 == ".") %>%
    mutate(
      plot_pvalue = round(p, digits = 3),
      loc_y = unname(
        c(A = 5, C = 5.2, G = 5.4, T = 5.6)[as.character(group1)]
      )
    ) %>%
    filter(p.signif != "ns")

  # Step 7: dot plot of the ratio per motif, one facet row per sample_type
  # and one facet column per read (ggtitle(tmp_title) is intentionally unused).
  ratio_boxplot <- sum_wide %>%
    mutate(
      variable = factor(variable, levels = rev(unique(sum_wide$variable)))
    ) %>%
    ggplot(aes(x = variable, y = ratio, col = sample_type)) +
    stat_pvalue_manual(
      sel_stat.test,
      y.position = "loc_y",
      coord.flip = TRUE,
      label = "p.signif",
      remove.bracket = FALSE,
      tip.length = 0.01
    ) +
    geom_dotplot(
      width = 0.15,
      binaxis = "y",
      stackdir = "center",
      dotsize = 0.2,
      fill = "white"
    ) +
    # Number of distinct sample_id values per sample_type, placed in a corner.
    geom_text(
      data = function(plot_data) {
        plot_data %>%
          group_by(sample_type) %>%
          summarise(unique_count = n_distinct(sample_id), .groups = "drop") %>%
          mutate(label = paste("n = ", unique_count))
      },
      aes(x = Inf, y = Inf, label = label, col = sample_type),
      hjust = 1.5,
      vjust = 1.5,
      size = 3.5
    ) +
    scale_color_manual(
      values = c(
        "dsLP plasma" = "#C6896C",
        "ssLP plasma" = "#A84750",
        "dsLP BAL" = "#7CA2C2",
        "ssLP BAL" = "#6066B6"
      )
    ) +
    facet_grid(rows = vars(sample_type), cols = vars(read)) +
    coord_flip() +
    labs(
      x = "Selected end-motif",
      y = "Frequency ratio\nA. fumigatus : Human",
      color = ""
    ) +
    theme_bw() +
    theme(strip.background = element_blank())

  # Show the plot in the notebook.
  options(repr.plot.width = 14, repr.plot.height = 8)
  print(ratio_boxplot)

  # Write the plot as PNG and PDF.
  ggsave(
    "../../output_figures/Aspergillus_Human_ratioEndMotif_1nt.png",
    plot = ratio_boxplot, width = 14, height = 8
  )
  ggsave(
    "../../output_figures/Aspergillus_Human_ratioEndMotif_1nt.pdf",
    plot = ratio_boxplot, width = 14, height = 8
  )
}

In [None]:
# Ratio of pooled first-base frequencies (two bases summed at a time,
# A. fumigatus : Human) per sample, tested against the all-bases reference
# and drawn as a faceted dot plot.

# Step 1: label every row with the genome it was assigned to.
Afum_data <- mutate(Afum_IS_meta, source = "Aspergillus")
host_nonMT_data <- mutate(host_nonMT_IS_meta, source = "Human")
# MT and non-MT host rows get the same label on purpose so they are summed.
host_MT_data <- mutate(host_MT_IS_meta, source = "Human")

# Step 2: stack the three tables.
combined_data <- bind_rows(Afum_data, host_nonMT_data, host_MT_data)

# Only the first setting ("no") is iterated because of the `[1]` subset.
for (tmp_ss in c("no", "yes")[1]) {
  if (tmp_ss == "no") {
    max_TLEN <- 500
    tmp_title <- "no size selection"
  } else if (tmp_ss == "yes") {
    max_TLEN <- 100
    tmp_title <- "size selection, 35-100 bp"
  }

  # Step 3: per sample and source, sum the counts by the first base of the
  # end motif, add the six two-base sums, and express them as fractions of
  # the A/C/G/T total.
  expanded_result <- combined_data %>%
    filter(origin == "IPA samples", TLEN <= max_TLEN) %>%
    mutate(EndX = substr(EndMotif, 1, 1)) %>%
    group_by(
      sample, type, sample_type, sample_id, source, EndX, origin, read
    ) %>%
    summarise(sumCount = sum(Count), .groups = "keep") %>%
    group_by(sample, type, sample_type, sample_id, source, origin, read) %>%
    filter(EndX %in% c("C", "G", "T", "A")) %>%
    pivot_wider(
      names_from = EndX,
      values_from = sumCount,
      values_fill = 0
    ) %>%
    mutate(
      `C&G` = rowSums(across(c(C, G)), na.rm = TRUE),
      `C&T` = rowSums(across(c(C, T)), na.rm = TRUE),
      `C&A` = rowSums(across(c(C, A)), na.rm = TRUE),
      `G&T` = rowSums(across(c(G, T)), na.rm = TRUE),
      `G&A` = rowSums(across(c(G, A)), na.rm = TRUE),
      `T&A` = rowSums(across(c(T, A)), na.rm = TRUE),
      TotalCount = rowSums(across(c(C, T, G, A)), na.rm = TRUE)
    ) %>%
    # Keep TotalCount last in this list: it is divided by itself and becomes
    # the reference value of 1 wherever the total is non-zero.
    mutate(
      across(
        c("C&G", "C&T", "C&A", "G&T", "G&A", "T&A", TotalCount),
        ~ .x / TotalCount
      )
    ) %>%
    select(-c("A", "C", "G", "T")) %>%
    reshape2::melt() %>%
    # The normalised total is relabelled ".&." and serves as the reference.
    mutate(
      variable = ifelse(variable == "TotalCount", ".&.", as.character(variable))
    )

  # Steps 4-5: one column per source, then the Aspergillus:Human ratio.
  # Rows with a zero Aspergillus value or an undefined ratio are removed.
  sum_wide <- expanded_result %>%
    pivot_wider(names_from = source, values_from = value, values_fill = 0) %>%
    mutate(ratio = ifelse(Human > 0, Aspergillus / Human, NA)) %>%
    filter(Aspergillus != 0, !is.na(ratio))

  # Step 6: one-sided Wilcoxon tests between motif groups within each
  # sample_type / origin / read combination.
  stat.test <- compare_means(
    formula = ratio ~ variable,
    data = as.data.frame(sum_wide),
    group.by = c("sample_type", "origin", "read"),
    method = "wilcox.test",
    alternative = "less",
    p.adjust.method = "bonferroni"
  )

  # Keep the comparisons against the ".&." reference, attach a fixed bracket
  # position per base pair, and drop the non-significant ones.
  # NOTE(review): the plot label uses `p.signif`; confirm whether that column
  # reflects the Bonferroni-adjusted p-value or the raw one.
  sel_stat.test <- stat.test %>%
    filter(group2 == ".&.") %>%
    mutate(
      plot_pvalue = round(p, digits = 3),
      loc_y = unname(
        c(
          "C&G" = 2.6, "C&T" = 2.8, "C&A" = 3.0,
          "G&T" = 3.2, "G&A" = 3.4, "T&A" = 3.6
        )[as.character(group1)]
      )
    ) %>%
    filter(p.signif != "ns")

  # Step 7: dot plot of the ratio per motif, one facet row per sample_type
  # and one facet column per read (ggtitle(tmp_title) is intentionally unused).
  ratio_boxplot <- sum_wide %>%
    mutate(
      variable = factor(variable, levels = rev(unique(sum_wide$variable)))
    ) %>%
    ggplot(aes(x = variable, y = ratio, col = sample_type)) +
    stat_pvalue_manual(
      sel_stat.test,
      y.position = "loc_y",
      coord.flip = TRUE,
      label = "p.signif",
      remove.bracket = FALSE,
      tip.length = 0.01
    ) +
    geom_dotplot(
      binaxis = "y",
      stackdir = "center",
      dotsize = 0.2,
      fill = "white"
    ) +
    # Number of distinct sample_id values per sample_type, placed in a corner.
    geom_text(
      data = function(plot_data) {
        plot_data %>%
          group_by(sample_type) %>%
          summarise(unique_count = n_distinct(sample_id), .groups = "drop") %>%
          mutate(label = paste("n = ", unique_count))
      },
      aes(x = Inf, y = Inf, label = label, col = sample_type),
      hjust = 1.5,
      vjust = 1.5,
      size = 3.5
    ) +
    scale_color_manual(
      values = c(
        "dsLP plasma" = "#C6896C",
        "ssLP plasma" = "#A84750",
        "dsLP BAL" = "#7CA2C2",
        "ssLP BAL" = "#6066B6"
      )
    ) +
    facet_grid(rows = vars(sample_type), cols = vars(read)) +
    coord_flip() +
    labs(
      x = "Selected end-motif",
      y = "Frequency ratio\nA. fumigatus : Human",
      color = ""
    ) +
    theme_bw() +
    theme(strip.background = element_blank())

  # Show the plot in the notebook.
  options(repr.plot.width = 14, repr.plot.height = 8)
  print(ratio_boxplot)

  # Write the plot as PNG and PDF.
  ggsave(
    "../../output_figures/Aspergillus_Human_ratioEndMotif_2x1nt.png",
    plot = ratio_boxplot, width = 14, height = 8
  )
  ggsave(
    "../../output_figures/Aspergillus_Human_ratioEndMotif_2x1nt.pdf",
    plot = ratio_boxplot, width = 14, height = 8
  )
}

In [None]:
# Ratio of dinucleotide end-motif frequencies (A. fumigatus : Human) per
# sample, tested against the all-motifs reference and drawn as a faceted dot
# plot on a log10 axis.

# Step 1: Add a column to distinguish the datasets
Afum_data <- Afum_IS_meta %>%
    mutate(source = "Aspergillus")  # Adding the source label

host_nonMT_data <- host_nonMT_IS_meta %>%
    mutate(source = "Human")  # Adding the source label

host_MT_data <- host_MT_IS_meta %>%
    mutate(source = "Human")  # purposely the same label, so MT and non-MT are summed

# Step 2: Combine the datasets
combined_data <- bind_rows(Afum_data, host_nonMT_data, host_MT_data)

# Only the first setting ("no") is iterated because of the `[1]` subset.
for (tmp_ss in c("no", "yes")[1]){
        if(tmp_ss == "no"){max_TLEN <- 500; tmp_title <- "no size selection"}
        if(tmp_ss == "yes"){max_TLEN <- 100; tmp_title <- "size selection, 35-100 bp"}

        # Step 3: Per sample and source, sum the counts by the first two bases
        # of the end motif, spread the 16 dinucleotides into columns and turn
        # them into fractions of their total.
        expanded_result <- combined_data %>%
            filter(origin == "IPA samples") %>%
            filter(TLEN <= max_TLEN) %>%
            mutate(EndXX = substr(EndMotif, 1, 2)) %>%
            # Keep only A/C/G/T dinucleotides, as the 1-nt cells do for single
            # bases. Any other prefix would become an extra, un-normalised
            # raw-count column after pivot_wider() and be melted into the
            # statistics and the plot next to the normalised frequencies.
            filter(grepl("^[ACGT]{2}$", EndXX)) %>%
            group_by(sample, type, sample_type, sample_id, source, EndXX, origin, read) %>%
            summarise(sumCount = sum(Count), .groups = "keep") %>%
            group_by(sample, type, sample_type, sample_id, source, origin, read) %>%
            pivot_wider(names_from = EndXX, values_from = sumCount, values_fill = 0) %>%
            mutate(TotalCount = rowSums(
                across(c(CC,CG,GC,GT,CA,AT,AC,GA,TA,AG,AA,TT,TC,CT,TG,GG)))) %>%
            # Keep TotalCount last in this list: it is divided by itself and
            # becomes the reference value of 1 wherever the total is non-zero.
            mutate(across(c(CC,CG,GC,GT,CA,AT,AC,GA,TA,AG,AA,TT,TC,CT,TG,GG, TotalCount), ~ .x / TotalCount)) %>%
            reshape2::melt() %>%
            # The normalised total is relabelled ".." and serves as the reference group.
            mutate(variable = ifelse(variable == "TotalCount", yes = "..", no = as.character(variable)))

        # Step 4: Pivot the data to get separate columns for the Aspergillus and Human frequencies
        sum_wide <- expanded_result %>%
            pivot_wider(names_from = source, values_from = value, values_fill = 0)

        # Step 5: Calculate the Aspergillus:Human frequency ratio per sample and motif
        sum_wide <- sum_wide %>%
            mutate(ratio = ifelse(Human > 0, Aspergillus/Human, NA)) %>% # Set ratio to NA when the Human frequency is 0
            filter(Aspergillus != 0) %>%
            filter(!is.na(ratio))

        # Step 6: One-sided Wilcoxon tests between motif groups within each
        # sample_type / origin / read combination
        stat.test <- compare_means(as.data.frame(sum_wide),
                           formula = ratio~variable, group.by = c("sample_type", "origin", "read"),
                           method = "wilcox.test", alternative = "less",
                           p.adjust.method = "bonferroni")

        # Keep the comparisons against the ".." reference, attach a fixed
        # bracket position per dinucleotide, and drop the non-significant ones.
        # NOTE(review): the plot label uses `p.signif`; confirm whether that
        # column reflects the Bonferroni-adjusted p-value or the raw one.
        sel_stat.test <- stat.test[which(stat.test$group2 == ".."),]
        sel_stat.test$plot_pvalue <- round(sel_stat.test$p, digits = 3)
        sel_stat.test <- sel_stat.test %>%
            mutate(loc_y = case_when(
                group1 == "AA" ~ 4.6,
                group1 == "AC" ~ 4.4,
                group1 == "AG" ~ 4.2,
                group1 == "AT" ~ 4.0,
                group1 == "CA" ~ 3.8,
                group1 == "CC" ~ 3.6,
                group1 == "CG" ~ 3.4,
                group1 == "CT" ~ 3.2,
                group1 == "GA" ~ 3.0,
                group1 == "GC" ~ 2.8,
                group1 == "GG" ~ 2.6,
                group1 == "GT" ~ 2.4,
                group1 == "TA" ~ 2.2,
                group1 == "TC" ~ 2.0,
                group1 == "TG" ~ 1.8,
                group1 == "TT" ~ 1.6
            )) %>% filter(p.signif != "ns")

        # Step 7: Dot plot of the Aspergillus:Human ratio per motif, one facet
        # row per sample_type and one facet column per read
        ratio_boxplot <- sum_wide %>%
            ggplot(aes(x = variable, y = ratio, col = sample_type)) +
                scale_color_manual(values = c("dsLP plasma" = "#C6896C", "ssLP plasma" = "#A84750",
                                              "dsLP BAL" = "#7CA2C2", "ssLP BAL" = "#6066B6")) +
                xlab("Selected end-motif") + ylab("Frequency ratio, log10\nA. fumigatus : Human") +
                theme_bw() +
                stat_pvalue_manual(sel_stat.test, y.position = "loc_y", coord.flip = TRUE,
                           label = "p.signif",remove.bracket = FALSE, tip.length = 0.01) +
                facet_grid(rows = vars(sample_type),
                           cols = vars(read)) +
                theme(strip.background = element_blank()) + scale_y_log10() +
                labs(color = "") + #ggtitle(tmp_title) +
                coord_flip() +
                geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.4, fill = "white") +
                # Number of distinct sample_id values per sample_type, placed in a corner
                geom_text(data = . %>% group_by(sample_type) %>%
                              summarise(unique_count = n_distinct(sample_id), .groups = "drop") %>%
                              mutate(label = paste("n = ", unique_count)),
                      aes(x = Inf, y = Inf, label = label, col = sample_type),
                      hjust = 1.5, vjust = 1.5, size = 3.5)

        # Print the ratio plot
        options(repr.plot.width=14, repr.plot.height=15)
        ratio_boxplot %>% print()

        # Save the plot
        ggsave("../../output_figures/Aspergillus_Human_ratioEndMotif_2nt.png",
               plot = ratio_boxplot, width = 14, height = 15)
        ggsave("../../output_figures/Aspergillus_Human_ratioEndMotif_2nt.pdf",
               plot = ratio_boxplot, width = 14, height = 15)
}

In [None]:
# Ratio of cumulative count fractions (A. fumigatus : Human) along TLEN,
# split by the first base of the end motif; R1 reads only, one plot per
# sample value.

# Step 1: Keep R1 rows and add a column to distinguish the datasets
Afum_data <- Afum_IS_meta %>%
    filter(read == "R1") %>%
    mutate(source = "Aspergillus")  # Adding the source label

host_nonMT_data <- host_nonMT_IS_meta %>%
    filter(read == "R1") %>%
    mutate(source = "Human")  # Adding the source label

host_MT_data <- host_MT_IS_meta %>%
    filter(read == "R1") %>%
    mutate(source = "Human")  # purposely the same label, so MT and non-MT are summed

# Step 2: Combine the datasets
combined_data <- bind_rows(Afum_data, host_nonMT_data, host_MT_data)

for (sel_sample in c("plasma", "BAL")){
    # Step 3: Sum the counts per TLEN, then compute the cumulative fraction of
    # counts along TLEN within each sample / type / sample_type / source / EndX.
    # NOTE(review): unlike the end-motif ratio cells, EndX is not restricted to
    # A/C/G/T here, so any other first base gets its own facet - confirm intent.
    cum_sum_data <- combined_data %>%
        mutate(EndX = substr(EndMotif, 1, 1)) %>%
        filter(sample == sel_sample) %>%
        group_by(sample, type, sample_type, TLEN, source, EndX) %>%
        summarise(sumCount = sum(Count), .groups = "keep") %>%
        arrange(TLEN) %>%  # Ensure TLEN is in order for cumulative sums
        group_by(sample, type, sample_type, source, EndX) %>%
        # as.numeric() keeps the running sum in double precision
        mutate(cumSumCount = cumsum(as.numeric(sumCount)) / sum(sumCount))  %>%
        select(-sumCount)

    # Step 4: Pivot the data to get separate columns for Aspergillus and Human cumSumCount.
    # A TLEN present for only one source is filled with 0 here and removed in step 5.
    cum_sum_wide <- cum_sum_data %>%
        pivot_wider(names_from = source, values_from = cumSumCount, values_fill = 0)

    # Step 5: Calculate the Aspergillus:Human ratio of cumulative fractions for each sample_type and TLEN
    cum_sum_wide <- cum_sum_wide %>%
        mutate(ratio = ifelse(Human > 0, Aspergillus/Human, NA)) %>% # Set ratio to NA when Human cumSumCount is 0
        filter(Aspergillus != 0) %>%
        filter(!is.na(ratio))

    # Step 6: Plot the Aspergillus:Human ratio by TLEN for each sample_type,
    # one facet column per first base of the end motif
    ratio_plot <- cum_sum_wide %>%
        ggplot(aes(x = TLEN, y = ratio, col = sample_type)) +
            geom_point(shape = 4) +
            geom_line(size = 0.4) +  # Plot the ratio line
            scale_color_manual(values = c("dsLP plasma" = "#C6896C", "ssLP plasma" = "#A84750",
                                          "dsLP BAL" = "#7CA2C2", "ssLP BAL" = "#6066B6")) +
            # The x axis is TLEN, not the end motif (the motif is the facet variable)
            xlab("Fragment length (bp)") + ylab("Cumulative frequency ratio, log10\nA. fumigatus : Human") +
            #ggtitle("Ratio Aspergillus fumigatus:Human chromosomal\nbased on cumulative frequency") +
            theme_bw() + xlim(0,250) +
            facet_grid(cols = vars(EndX)) +
            theme(strip.background = element_blank()) + scale_y_log10() + labs(color = "")

    # Print the ratio plot
    options(repr.plot.width=15, repr.plot.height=3.5)
    ratio_plot %>% print()

    # Save the plot (currently disabled)
    #ggsave(paste0("../../output_figures/Aspergillus_Human_ratioCumSUm_length_", sel_sample, "_cummulative.png"),
    #       plot = ratio_plot, width = 5, height = 3.5)
    #ggsave(paste0("../../output_figures/Aspergillus_Human_ratioCumSUm_length_", sel_sample, "_cummulative.pdf"),
    #       plot = ratio_plot, width = 5, height = 3.5)

}