In [None]:
# Load the shared helper functions: convert the helper notebook to an R script
# with `jupyter nbconvert` and evaluate the resulting code in this session.
f_func <- "./functions.ipynb"
func_src <- system2('jupyter', c('nbconvert', f_func, '--to=script', '--stdout'), stdout = TRUE)
# With stdout = TRUE, system2() attaches a "status" attribute only when the
# command exits with a non-zero status. Abort here instead of silently
# evaluating empty or partial output.
if (!is.null(attr(func_src, "status"))) {
    stop("jupyter nbconvert failed for ", f_func,
         " (exit status ", attr(func_src, "status"), ")", call. = FALSE)
}
eval(parse(text = func_src))

In [None]:
# Directory holding the nucleotide-frequency tables; the trailing slash is
# required because file names are appended with paste0().
IN_DIR <- "../../output/nucleotideFreqGenomes/"

#' Keep only well-formed nucleotide motifs of a given length.
#'
#' @param data A two-column table (motif, frequency); anything accepted by
#'   `as.data.frame()`. The columns are renamed to `motif` and `freq`.
#' @param nr_nucl Required motif length, in characters.
#' @return A data.frame with columns `motif` (character) and `freq`, holding
#'   only the rows whose motif has exactly `nr_nucl` characters, all of them
#'   one of A, C, G or T (upper case). Row names are renumbered from 1.
func_cleanup_table <- function(data, nr_nucl){
    dat_clean <- as.data.frame(data)
    colnames(dat_clean) <- c("motif", "freq")
    # Coerce once so that factor motif columns work too (nchar() rejects factors).
    dat_clean$motif <- as.character(dat_clean$motif)
    motif <- dat_clean$motif
    # Vectorised validity check: right length and no character outside ACGT.
    # This always yields a logical vector, including for zero-row input.
    keep <- !is.na(motif) &
        nchar(motif) == nr_nucl &
        !grepl("[^ACGT]", motif)
    # which() drops NA conditions instead of producing NA rows.
    dat_clean <- dat_clean[which(keep), , drop = FALSE]
    rownames(dat_clean) <- NULL
    return(dat_clean)
}

In [None]:
# Dinucleotide frequency table from the "_MT" file: read the headerless,
# tab-separated table, keep valid 2-letter motifs and tag the origin.
Afum_dinuclFreq_MT <-
    paste0(IN_DIR, "FungiDB-46_AfumigatusAf293_Genome_cleaned_final_dinuclFreq_MT.txt") %>%
    read.csv(header = FALSE, sep = "\t") %>%
    func_cleanup_table(nr_nucl = 2) %>%
    mutate(source = "Aspergillus")

# Same processing for the "_nonMT" file.
Afum_dinuclFreq_nonMT <-
    paste0(IN_DIR, "FungiDB-46_AfumigatusAf293_Genome_cleaned_final_dinuclFreq_nonMT.txt") %>%
    read.csv(header = FALSE, sep = "\t") %>%
    func_cleanup_table(nr_nucl = 2) %>%
    mutate(source = "Aspergillus")

In [None]:
# Pool the MT and non-MT tables: add up the frequencies per motif, then express
# each motif as a percentage of the pooled total and label the origin.
Afum_dinucl_df <- rbind(Afum_dinuclFreq_MT, Afum_dinuclFreq_nonMT) %>%
    group_by(motif) %>%
    summarise(freq = sum(freq)) %>%
    ungroup() %>%
    mutate(
        norm_freq = freq / sum(freq) * 100,
        source = "Aspergillus fumigatus"
    )

# Show the CG row of the pooled table.
filter(Afum_dinucl_df, motif == "CG")

In [None]:
# Sequence names to pull from the host genome object.
chromosomes <- c(as.character(1:22), "X", "Y", "MT")
# Dinucleotide counts per requested sequence, then summed per dinucleotide.
# NOTE(review): Hsapiens, getSeq() and dinucleotideFrequency() are not defined
# in this notebook section; presumably BSgenome/Biostrings objects loaded
# elsewhere - confirm.
chrom_dinucl_counts <- dinucleotideFrequency(getSeq(Hsapiens, chromosomes))
host_sums_dinucl <- colSums(chrom_dinucl_counts)

# One row per dinucleotide: total count, origin label and percentage of the
# overall total.
host_dinucl_df <- data.frame(
    motif = names(host_sums_dinucl),
    freq = as.numeric(host_sums_dinucl)
) %>%
    mutate(
        source = "host",
        norm_freq = freq / sum(freq) * 100
    )

head(host_dinucl_df)

In [None]:
# Side-by-side bars comparing the percentage of each dinucleotide in the
# fungal and host tables.
rbind(Afum_dinucl_df, host_dinucl_df) %>% 
    ggplot(aes(y = motif, x = norm_freq, fill = source)) +
        geom_bar(stat = "identity", position = "dodge") +  
        labs(title = "Dinucleotide Counts by Origin",
            y = "Dinucleotide",
            x = "Percentage",
            fill = "") +
        # The names must match the values of `source` exactly: the fungal table
        # is labelled "Aspergillus fumigatus", so keying the colour by
        # "Aspergillus" would leave those bars without an assigned fill.
        scale_fill_manual(values = c("Aspergillus fumigatus" = "#A9A9A9", "host" = "#D3D3D3")) +
        theme_minimal() +
        theme(text = element_text(size = 12))

In [None]:
# CG rows of the fungal and host tables, stacked for direct comparison.
filter(rbind(Afum_dinucl_df, host_dinucl_df), motif == "CG")