In [1]:
suppressPackageStartupMessages({
    library(ggplot2)
    library(ggpubr)
})

# Root directory of the current-version results. The plotting functions below
# read their per-species input tables from <path>/<species>/<type>/.
path <- "../2.current_version/"

In [2]:
# Species codes whose result tables are read from `path`. "Bflo" is not listed
# here: get_OR() appends it itself from a separate legacy directory.
species <- c("Hsap","Mmus","Pvit","Pmar")

# Draw per-species boxplots of the `OR` column for WGD vs. SSD paralogs and
# save the figure as "OR.cell_family.summary.<type>.pdf" in the working
# directory.
#
# Args:
#   species: character vector of species codes. For each code the ohnolog and
#            SSD-paralog tables are read from <path>/<species>/<type>/, where
#            `path` is the global defined at the top of the notebook.
#   type:    name of the analysis sub-directory (the notebook calls this with
#            "wilcox" and "roc"); it is also embedded in the output file name.
#
# "Bflo" is always appended from the legacy results directory and contributes
# SSD rows only.
#
# Returns: whatever ggsave() returns; the function is called for its side
# effect of writing the PDF.
get_OR <- function(species, type){
    legacy_root <- "../1.old_versions/2.my_version.v1/"
    bflo <- "Bflo"

    # Read both paralog classes for one species and stack them with labels.
    read_species <- function(s){
        stem <- paste0(path, s, "/", type, "/", s)
        wgd <- read.delim(paste0(stem, ".ohnolog_DEGs.fisher.celltype.txt"), header = TRUE)
        ssd <- read.delim(paste0(stem, ".SSDparalog_DEGs.fisher.celltype.txt"), header = TRUE)

        wgd$type <- "WGD"
        ssd$type <- "SSD"

        OR_info <- rbind(wgd, ssd)
        OR_info$species <- s
        OR_info
    }

    # Bind all species in one call instead of growing the table pairwise.
    df <- do.call(rbind, unname(lapply(species, read_species)))

    pa <- read.delim(paste0(legacy_root, bflo, "/", type, "/", bflo, "_T1.paralog_DEGs.fisher.celltype.txt"), header = TRUE)
    pa$type <- "SSD"
    pa$species <- bflo

    df <- rbind(df, pa)
    # Facet order follows the caller's species order, with Bflo last. Deriving
    # the levels from the argument keeps other species vectors from becoming NA.
    df$species <- factor(df$species, levels = unique(c(species, bflo)))

    my_comparisons <- list(c("WGD", "SSD"))
    p <- ggboxplot(df, x = "type", y = "OR", color = "type", palette = "jco") +
        # NOTE(review): paired = TRUE assumes the WGD and SSD rows of a species
        # line up one-to-one in the same order - confirm against the inputs.
        stat_compare_means(comparisons = my_comparisons, method = "wilcox.test", paired = TRUE, label = "p.signif") +
        # Print each group's median above its box.
        stat_summary(fun = "median", geom = "text", aes(label = round(after_stat(y), 3)), vjust = -1) +
        facet_wrap(~species, nrow = 1) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
    ggsave(filename = paste0("OR.cell_family.summary.", type, ".pdf"), p, width = 5, height = 7)
}


In [3]:
# Produce the odds-ratio figure once per analysis sub-directory.
invisible(lapply(c("wilcox", "roc"), function(method) get_OR(species, method)))

In [4]:
# Species codes passed to get_ratio(); the same four codes as in the
# odds-ratio cell. "Bflo" is added inside the function from a legacy directory.
species <- c("Hsap","Mmus","Pvit","Pmar")

# Draw per-species boxplots of the `paralogs.` column for WGD vs. SSD
# paralogs, with a dashed horizontal line at each group's background ratio,
# and save the figure as "paralog_ratio.cell_family.summary.<type>.pdf".
#
# Args:
#   species: character vector of species codes. For each code the
#            "*_ratio_inDEGs.stats.txt" and "ratio_bg.txt" tables are read
#            from <path>/<species>/<type>/, where `path` is the global defined
#            at the top of the notebook.
#   type:    name of the analysis sub-directory (the notebook calls this with
#            "wilcox" and "roc"); it is also embedded in the output file name.
#
# "Bflo" is always appended from the legacy results directory and contributes
# SSD rows only.
#
# Returns: whatever ggsave() returns; the function is called for its side
# effect of writing the PDF.
get_ratio <- function(species, type){
    legacy_root <- "../1.old_versions/2.my_version.v1/"
    bflo <- "Bflo"
    # Previously a hard-coded vector overwrote `species` here; derive the full
    # list from the argument so the function honours what the caller passed.
    all_species <- unique(c(species, bflo))

    # Read both paralog classes for one species and stack them with labels.
    read_stats <- function(s){
        stem <- paste0(path, s, "/", type, "/", s)
        wgd <- read.delim(paste0(stem, ".ohnolog_ratio_inDEGs.stats.txt"), header = TRUE)
        ssd <- read.delim(paste0(stem, ".SSDparalog_ratio_inDEGs.stats.txt"), header = TRUE)

        wgd$type <- "WGD"
        ssd$type <- "SSD"
        # The two files may name their columns differently; adopt the SSD
        # names so that rbind() can stack them.
        colnames(wgd) <- colnames(ssd)

        info <- rbind(wgd, ssd)
        info$species <- s
        info
    }
    df <- do.call(rbind, unname(lapply(species, read_stats)))

    pa <- read.delim(paste0(legacy_root, bflo, "/", type, "/", bflo, "_T1.paralog_ratio_inDEGs.stats.txt"), header = TRUE)
    pa$type <- "SSD"
    pa$species <- bflo

    df <- rbind(df, pa)
    df$species <- factor(df$species, levels = all_species)

    # Background ratio per species and paralog class (headerless files whose
    # first column is the class label and whose second is the ratio).
    read_bg <- function(s){
        if (s == bflo){
            tmp <- read.delim(paste0(legacy_root, s, "/", type, "/", s, "_T1.ratio_bg.txt"), header = FALSE)
            # The legacy table is treated as SSD regardless of its own label.
            tmp$V1 <- "SSD"
        } else {
            tmp <- read.delim(paste0(path, s, "/", type, "/", s, ".ratio_bg.txt"), header = FALSE)
            tmp <- tmp[tmp$V1 %in% c("ohnologs", "SSDparalogs"), ]
            tmp[tmp$V1 == "SSDparalogs", "V1"] <- "SSD"
            tmp[tmp$V1 == "ohnologs", "V1"] <- "WGD"
        }
        tmp$species <- s
        tmp
    }
    ratio_bg <- do.call(rbind, unname(lapply(all_species, read_bg)))
    colnames(ratio_bg) <- c("type", "bg", "species")

    # Attach the matching background value to every observation.
    df <- merge(df, ratio_bg, by = c("species", "type"))
    df$type <- factor(df$type, levels = c("WGD", "SSD"))

    p <- ggboxplot(df, x = "type", y = "paralogs.", color = "type", palette = "jco") +
        # Print each group's median above its box.
        stat_summary(fun = "median", geom = "text", aes(label = round(after_stat(y), 3)), vjust = -1) +
        geom_hline(aes(yintercept = bg, color = type), linetype = "dashed") +
        facet_wrap(~species, nrow = 1) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
    ggsave(filename = paste0("paralog_ratio.cell_family.summary.", type, ".pdf"), p, width = 5, height = 7)
}


In [5]:
# Produce the paralog-ratio figure once per analysis sub-directory.
invisible(lapply(c("wilcox", "roc"), function(method) get_ratio(species, method)))

In [6]:
# Plot the families-per-paralog ratio (column `families_divided_by_paralogs.`)
# for each species. `species` holds the four codes passed to get_fa_ratio();
# "Bflo" is added inside the function from a legacy directory.
species <- c("Hsap","Mmus","Pvit","Pmar")

# Draw per-species boxplots of the `families_divided_by_paralogs.` column for
# WGD vs. SSD paralogs, with a dashed horizontal line at each group's
# background ratio, and save the figure as
# "paralog_family_ratio.switching_within_species.cell_family.summary.<type>.pdf".
#
# Args:
#   species: character vector of species codes. For each code the
#            "*_ratio_inDEGs.stats.txt" and "family_ratio_bg.txt" tables are
#            read from <path>/<species>/<type>/, where `path` is the global
#            defined at the top of the notebook.
#   type:    name of the analysis sub-directory (the notebook calls this with
#            "wilcox" and "roc"); it is also embedded in the output file name.
#
# "Bflo" is always appended from the legacy results directory and contributes
# SSD rows only.
#
# Returns: whatever ggsave() returns; the function is called for its side
# effect of writing the PDF.
get_fa_ratio <- function(species, type){
    legacy_root <- "../1.old_versions/2.my_version.v1/"
    bflo <- "Bflo"
    # Previously a hard-coded vector overwrote `species` here; derive the full
    # list from the argument so the function honours what the caller passed.
    all_species <- unique(c(species, bflo))

    # Read both paralog classes for one species and stack them with labels.
    read_stats <- function(s){
        stem <- paste0(path, s, "/", type, "/", s)
        wgd <- read.delim(paste0(stem, ".ohnolog_ratio_inDEGs.stats.txt"), header = TRUE)
        ssd <- read.delim(paste0(stem, ".SSDparalog_ratio_inDEGs.stats.txt"), header = TRUE)

        wgd$type <- "WGD"
        ssd$type <- "SSD"
        # The two files may name their columns differently; adopt the SSD
        # names so that rbind() can stack them.
        colnames(wgd) <- colnames(ssd)

        info <- rbind(wgd, ssd)
        info$species <- s
        info
    }
    df <- do.call(rbind, unname(lapply(species, read_stats)))

    pa <- read.delim(paste0(legacy_root, bflo, "/", type, "/", bflo, "_T1.paralog_ratio_inDEGs.stats.txt"), header = TRUE)
    pa$type <- "SSD"
    pa$species <- bflo

    df <- rbind(df, pa)
    df$species <- factor(df$species, levels = all_species)

    # Background family ratio per species and paralog class (headerless files
    # whose first column is the class label and whose second is the ratio).
    read_bg <- function(s){
        if (s == bflo){
            tmp <- read.delim(paste0(legacy_root, s, "/", type, "/", s, "_T1.family_ratio_bg.txt"), header = FALSE)
            # The legacy table is treated as SSD regardless of its own label.
            tmp$V1 <- "SSD"
        } else {
            tmp <- read.delim(paste0(path, s, "/", type, "/", s, ".family_ratio_bg.txt"), header = FALSE)
            tmp <- tmp[tmp$V1 %in% c("ohnologs", "SSDparalogs"), ]
            tmp[tmp$V1 == "SSDparalogs", "V1"] <- "SSD"
            tmp[tmp$V1 == "ohnologs", "V1"] <- "WGD"
        }
        tmp$species <- s
        tmp
    }
    ratio_bg <- do.call(rbind, unname(lapply(all_species, read_bg)))
    colnames(ratio_bg) <- c("type", "bg", "species")

    # Attach the matching background value to every observation.
    df <- merge(df, ratio_bg, by = c("species", "type"))
    df$type <- factor(df$type, levels = c("WGD", "SSD"))

    # Rows whose ratio is non-finite are dropped by ggplot2 with a warning
    # (presumably from a zero denominator upstream - TODO confirm).
    p <- ggboxplot(df, x = "type", y = "families_divided_by_paralogs.", color = "type", palette = "jco") +
        # Print each group's median above its box.
        stat_summary(fun = "median", geom = "text", aes(label = round(after_stat(y), 3)), vjust = -1) +
        geom_hline(aes(yintercept = bg, color = type), linetype = "dashed") +
        facet_wrap(~species, nrow = 1) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
    ggsave(filename = paste0("paralog_family_ratio.switching_within_species.cell_family.summary.", type, ".pdf"), p, width = 5, height = 7)
}

In [7]:
# Produce the family-ratio figure once per analysis sub-directory.
invisible(lapply(c("wilcox", "roc"), function(method) get_fa_ratio(species, method)))

“[1m[22mRemoved 5 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 5 rows containing non-finite outside the scale range
(`stat_summary()`).”


In [None]:
# One-sample t-test p-values were computed in the cells below and then added to the plots.


In [6]:
# Show the distinct background ratio (`bg`) for each species/type pair; these
# are the values used as `mu` in the one-sample t-tests in the following cells.
# NOTE(review): in the code shown, `ratio_bg` is only assigned inside
# get_ratio()/get_fa_ratio(), so this cell presumably relies on the function
# body having been run interactively - confirm which function it came from.
unique(ratio_bg)

Unnamed: 0_level_0,type,bg,species
Unnamed: 0_level_1,<chr>,<dbl>,<chr>
1,WGD,0.26,Hsap
2,SSD,0.35,Hsap
12,WGD,0.26,Mmus
21,SSD,0.31,Mmus
15,SSD,0.28,Pvit
22,WGD,0.29,Pvit
16,WGD,0.18,Pmar
23,SSD,0.29,Pmar
17,SSD,0.44,Bflo


In [14]:
# One-sample, two-sided t-tests on the SSD rows: for each species, test whether
# the mean of the `paralogs.` column differs from that species' SSD background
# ratio. Each `mu` is copied by hand from the `bg` column printed by
# unique(ratio_bg) in the preceding cell.
# NOTE(review): `df` is not assigned at top level in the code shown; it is
# presumably the merged table from get_ratio() run interactively - confirm.
t.test(df[df$type == 'SSD' & df$species == 'Bflo', 'paralogs.'], mu = 0.44, alternative = "two.sided")
t.test(df[df$type == 'SSD' & df$species == 'Pmar', 'paralogs.'], mu = 0.29, alternative = "two.sided")
t.test(df[df$type == 'SSD' & df$species == 'Pvit', 'paralogs.'], mu = 0.28, alternative = "two.sided")
t.test(df[df$type == 'SSD' & df$species == 'Mmus', 'paralogs.'], mu = 0.31, alternative = "two.sided")
t.test(df[df$type == 'SSD' & df$species == 'Hsap', 'paralogs.'], mu = 0.35, alternative = "two.sided")


	One Sample t-test

data:  df[df$type == "SSD" & df$species == "Bflo", "paralogs."]
t = -8.6739, df = 53, p-value = 9.531e-12
alternative hypothesis: true mean is not equal to 0.44
95 percent confidence interval:
 0.3121751 0.3601888
sample estimates:
mean of x 
 0.336182 



	One Sample t-test

data:  df[df$type == "SSD" & df$species == "Pmar", "paralogs."]
t = -13.837, df = 89, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0.29
95 percent confidence interval:
 0.2567284 0.2650842
sample estimates:
mean of x 
0.2609063 



	One Sample t-test

data:  df[df$type == "SSD" & df$species == "Pvit", "paralogs."]
t = -5.4552, df = 84, p-value = 4.84e-07
alternative hypothesis: true mean is not equal to 0.28
95 percent confidence interval:
 0.2401816 0.2614566
sample estimates:
mean of x 
0.2508191 



	One Sample t-test

data:  df[df$type == "SSD" & df$species == "Mmus", "paralogs."]
t = 0.52207, df = 109, p-value = 0.6027
alternative hypothesis: true mean is not equal to 0.31
95 percent confidence interval:
 0.3039144 0.3204382
sample estimates:
mean of x 
0.3121763 



	One Sample t-test

data:  df[df$type == "SSD" & df$species == "Hsap", "paralogs."]
t = -4.9783, df = 84, p-value = 3.374e-06
alternative hypothesis: true mean is not equal to 0.35
95 percent confidence interval:
 0.3178254 0.3361930
sample estimates:
mean of x 
0.3270092 


In [15]:
# One-sample, two-sided t-tests on the WGD rows: for each species, test whether
# the mean of the `paralogs.` column differs from that species' WGD background
# ratio. Each `mu` is copied by hand from the `bg` column printed by
# unique(ratio_bg) earlier in the notebook. Bflo is absent because it has no
# WGD rows in that table.
# NOTE(review): `df` is not assigned at top level in the code shown; it is
# presumably the merged table from get_ratio() run interactively - confirm.
t.test(df[df$type == 'WGD' & df$species == 'Pmar', 'paralogs.'], mu = 0.18, alternative = "two.sided")
t.test(df[df$type == 'WGD' & df$species == 'Pvit', 'paralogs.'], mu = 0.29, alternative = "two.sided")
t.test(df[df$type == 'WGD' & df$species == 'Mmus', 'paralogs.'], mu = 0.26, alternative = "two.sided")
t.test(df[df$type == 'WGD' & df$species == 'Hsap', 'paralogs.'], mu = 0.26, alternative = "two.sided")


	One Sample t-test

data:  df[df$type == "WGD" & df$species == "Pmar", "paralogs."]
t = 33.888, df = 89, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0.18
95 percent confidence interval:
 0.3436814 0.3640714
sample estimates:
mean of x 
0.3538764 



	One Sample t-test

data:  df[df$type == "WGD" & df$species == "Pvit", "paralogs."]
t = 11.272, df = 84, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0.29
95 percent confidence interval:
 0.4092496 0.4603384
sample estimates:
mean of x 
 0.434794 



	One Sample t-test

data:  df[df$type == "WGD" & df$species == "Mmus", "paralogs."]
t = 16.888, df = 109, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0.26
95 percent confidence interval:
 0.3501383 0.3741079
sample estimates:
mean of x 
0.3621231 



	One Sample t-test

data:  df[df$type == "WGD" & df$species == "Hsap", "paralogs."]
t = 16.95, df = 84, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0.26
95 percent confidence interval:
 0.3779659 0.4093260
sample estimates:
mean of x 
 0.393646 
