In [1]:
# considering variance > 0.25 as rule to classify gene categories
suppressMessages(suppressWarnings({
    library(dplyr)
    library(ggplot2)
    library(tidyr)
    library(ggvenn)
    library(ggpubr)
}))

In [2]:
get_res <- function(exp, type){
    for (species in c('Hsap', 'Mmus', 'Pvit', 'Pmar')){
        results <- read.delim(paste0(species,'.', type ,'.LMM_variance_decomp.cell_vs_location.txt'))
        results$Category <- "None"
        results[results$location > 0.25 & results$celltype < 0.25, "Category"] <- "High across location"
        results[results$location < 0.25 & results$celltype > 0.25, "Category"] <- "High across cell type family"
        results[results$location > 0.25 & results$celltype > 0.25, "Category"] <- "Both to location and cell type"
        write.table(results, paste0(species,'.', type ,'.LMM_variance_decomp.cell_vs_location.v2.txt'), quote = F, sep = '\t', row.names = F, col.names = T)
        
        p <- ggplot(results, aes(x = celltype, y = location, color = Category)) +
            geom_point(alpha = 0.6, size = 2) +
            scale_color_manual(values = c("High across location" = "#A4CB9E",
                                    "High across cell type family" = "#E17327",
                                    "Both to location and cell type" = "#6D65A3",
                                    "None" = "#A3A5A6"
                                    )) +
            labs(x = "Fraction of variance across cell type family", y = "Fraction of variance across location", color = "Category") +
            theme_minimal() + theme( legend.position = "top", text = element_text(size = 12)) + coord_fixed(ratio = 1)
        ggsave(paste0("figures/", species,'.',type,".LMM_variance_decomp.cell_vs_location.pdf"), p, width = 6, height = 6)    
    }
}

get_res(exp_AST_GABA, 'AST_GABA')
get_res(exp_AST_Glut, 'AST_Glut')

In [3]:
# background genes (protein-coding genes) in our datasets
Hsap_bg <- read.table("/mnt/data01/yuanzhen/01.Vertebrate_cell_evo/04.ohno_para_significance/4.GO_enrichment/Hsap_bg")$V1
Mmus_bg <- read.table("/mnt/data01/yuanzhen/01.Vertebrate_cell_evo/04.ohno_para_significance/4.GO_enrichment/Mmus_bg")$V1
Pvit_bg <- read.table("/mnt/data01/yuanzhen/01.Vertebrate_cell_evo/04.ohno_para_significance/4.GO_enrichment/Pvit_bg")$V1
Pmar_bg <- read.table("/mnt/data01/yuanzhen/01.Vertebrate_cell_evo/04.ohno_para_significance/4.GO_enrichment/Pmar_bg")$V1

In [130]:
# Function to calculate overlap and p-value
calc_pvalue <- function(set1, set2, universe_size) {
  overlap <- length(intersect(set1, set2))
  matrix_vals <- matrix(c(overlap, length(set1) - overlap, length(set2) - overlap, universe_size - length(set1) - length(set2) + overlap), 
                        nrow = 2)
    if(fisher.test(matrix_vals)$estimate <= 1){
        return(1)
    } else {
        fisher.test(matrix_vals)$p.value
    }
}


get_res <- function(species, universe_size){
    x1 = read.delim(paste0(species, '.AST_GABA.LMM_variance_decomp.cell_vs_location.txt'), header = T)
    x2 = read.delim(paste0(species, '.AST_Glut.LMM_variance_decomp.cell_vs_location.txt'), header = T)
    
    gene_sets <- list(
        "AST-Glut Location" = x2[x2$location > 0.25, 1],
        "AST-GABA Location" = x1[x1$location > 0.25, 1],
        "AST-Glut CellType" = x2[x2$celltype > 0.25, 1],
        "AST-GABA CellType" = x1[x1$celltype > 0.25, 1]
    )
    # Calculate p-values matrix
    p_values <- matrix(NA, nrow = 4, ncol = 4)
    rownames(p_values) <- colnames(p_values) <- names(gene_sets)
    for (i in 1:4) {
        for (j in 1:4) {
            p_values[i, j] <- calc_pvalue(gene_sets[[i]], gene_sets[[j]], universe_size)
        }
    }
    
    # Convert to -log10 scale
    log_p_values <- -log10(p_values)

    # Melt for ggplot2
    log_p_values_melt <- reshape2::melt(log_p_values, na.rm = TRUE)
    log_p_values_melt$number <- vapply(1:nrow(log_p_values_melt), FUN = function(row){
        var1 = log_p_values_melt[row, 1]
        var2 = log_p_values_melt[row, 2]
        return(length(intersect(gene_sets[[var1]], gene_sets[[var2]])))
    }, FUN.VALUE = numeric(1))
    
    log_p_values_melt[is.infinite(log_p_values_melt$value), 'value'] = 200 # set a large value to represent -log(0)
    log_p_values_melt[which(log_p_values_melt$Var1 == log_p_values_melt$Var2), 'value'] <- NA
    p <- ggplot(log_p_values_melt, aes(Var1, Var2, fill = value)) + 
        geom_tile(color = "white") + 
        scale_fill_gradient(low = "grey", high = "red", na.value = "white", 
                            name = expression(-log[10](P)), limits = c(0, 10), oob = scales::squish) +
        geom_text(aes(label = number), color = "black") + theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
        labs(x = "", y = "", title = "Gene List Comparison Matrix")
    ggsave(p, filename = paste0('figures/', species, '.comparison_mat.pdf'), width = 6, height = 5)
    
}

In [5]:
get_res("Hsap", length(Hsap_bg))
get_res("Mmus", length(Mmus_bg))
get_res("Pvit", length(Pvit_bg))
get_res("Pmar", length(Pmar_bg))

In [6]:
# find conserved regionalisation genes
find_location_genes <- function (file){
    AST_n <- read.delim(file, header = T)
    AST_n <- AST_n[AST_n$location > 0.25, 'Gene']
    return(AST_n)
}

regionalisation_genes = list(Hsap_AST_GABA = find_location_genes('Hsap.AST_GABA.LMM_variance_decomp.cell_vs_location.txt'),
                             Hsap_AST_Glut = find_location_genes('Hsap.AST_Glut.LMM_variance_decomp.cell_vs_location.txt'),
                             Mmus_AST_GABA = find_location_genes('Mmus.AST_GABA.LMM_variance_decomp.cell_vs_location.txt'),
                             Mmus_AST_Glut = find_location_genes('Mmus.AST_Glut.LMM_variance_decomp.cell_vs_location.txt'),
                             Pvit_AST_GABA = find_location_genes('Pvit.AST_GABA.LMM_variance_decomp.cell_vs_location.txt'),
                             Pvit_AST_Glut = find_location_genes('Pvit.AST_Glut.LMM_variance_decomp.cell_vs_location.txt'),
                             Pmar_AST_GABA = find_location_genes('Pmar.AST_GABA.LMM_variance_decomp.cell_vs_location.txt'),
                             Pmar_AST_Glut = find_location_genes('Pmar.AST_Glut.LMM_variance_decomp.cell_vs_location.txt'))


In [7]:
# load orthogroups
orthogroups <- read.delim('/mnt/data01/yuanzhen/01.Vertebrate_cell_evo/02.gene_relationships/run4/results/Ortho_pipeline/OrthoFinder/Orthogroups/Orthogroups.tsv')
# *** at least one copy for at least one species
orthogroups <- orthogroups %>% select(c('Orthogroup', 'Pmar', 'Pvit', 'Mmus', 'Hsap'))  %>% 
    filter(Pmar != '' | Pvit != ''  | Mmus != '' | Hsap != '')
# having at least one copy for each species scoped, for below GO background 
orthogroups2 <- orthogroups %>% select(c('Orthogroup', 'Pmar', 'Pvit', 'Mmus', 'Hsap'))  %>% 
    filter(Pmar != '' & Pvit != ''  & Mmus != '' & Hsap != '')

In [8]:
# find shared orthogroups for AST-GABA regionalisation
Hsap_AST_GABA_region_orthogroups <- unique(unlist(orthogroups %>% separate_rows(Hsap, sep = ", ") %>% 
    filter(Hsap %in% regionalisation_genes[['Hsap_AST_GABA']]) %>% distinct() %>% select(Orthogroup)))
Mmus_AST_GABA_region_orthogroups <- unique(unlist(orthogroups %>% separate_rows(Mmus, sep = ", ") %>% 
    filter(Mmus %in% regionalisation_genes[['Mmus_AST_GABA']]) %>% distinct() %>% select(Orthogroup)))
Pvit_AST_GABA_region_orthogroups <- unique(unlist(orthogroups %>% separate_rows(Pvit, sep = ", ") %>% 
    filter(Pvit %in% regionalisation_genes[['Pvit_AST_GABA']]) %>% distinct() %>% select(Orthogroup)))
Pmar_AST_GABA_region_orthogroups <- unique(unlist(orthogroups %>% separate_rows(Pmar, sep = ", ") %>% 
    filter(Pmar %in% regionalisation_genes[['Pmar_AST_GABA']]) %>% distinct() %>% select(Orthogroup)))

# Amniote conserved
Amniote_AST_GABA_region_orthogroups <- intersect(intersect(Hsap_AST_GABA_region_orthogroups, Mmus_AST_GABA_region_orthogroups), Pvit_AST_GABA_region_orthogroups)
# Vertebrate conserved
Vertebrate_AST_GABA_region_orthogroups <- intersect(Amniote_AST_GABA_region_orthogroups, Pmar_AST_GABA_region_orthogroups)


In [9]:
# find shared orthogroups for AST-Glut regionalisation
Hsap_AST_Glut_region_orthogroups <- unique(unlist(orthogroups %>% separate_rows(Hsap, sep = ", ") %>% 
    filter(Hsap %in% regionalisation_genes[['Hsap_AST_Glut']]) %>% distinct() %>% select(Orthogroup)))
Mmus_AST_Glut_region_orthogroups <- unique(unlist(orthogroups %>% separate_rows(Mmus, sep = ", ") %>% 
    filter(Mmus %in% regionalisation_genes[['Mmus_AST_Glut']]) %>% distinct() %>% select(Orthogroup)))
Pvit_AST_Glut_region_orthogroups <- unique(unlist(orthogroups %>% separate_rows(Pvit, sep = ", ") %>% 
    filter(Pvit %in% regionalisation_genes[['Pvit_AST_Glut']]) %>% distinct() %>% select(Orthogroup)))
Pmar_AST_Glut_region_orthogroups <- unique(unlist(orthogroups %>% separate_rows(Pmar, sep = ", ") %>% 
    filter(Pmar %in% regionalisation_genes[['Pmar_AST_Glut']]) %>% distinct() %>% select(Orthogroup)))

# Amniote conserved
Amniote_AST_Glut_region_orthogroups <- intersect(intersect(Hsap_AST_Glut_region_orthogroups, Mmus_AST_Glut_region_orthogroups), Pvit_AST_Glut_region_orthogroups)
# Vertebrate conserved
Vertebrate_AST_Glut_region_orthogroups <- intersect(Amniote_AST_Glut_region_orthogroups, Pmar_AST_Glut_region_orthogroups)


In [10]:
# regionalisation genes in the vertebrate-conserved AST-Glut/AST-GABA regionalisation orthogroups
Hsap_genes_in_region_orthogroups = unlist(orthogroups %>% filter(Orthogroup %in% c(Vertebrate_AST_Glut_region_orthogroups, Vertebrate_AST_GABA_region_orthogroups)) %>% separate_rows(Hsap, sep = ", ") %>%  
    filter(Hsap %in% c(regionalisation_genes[['Hsap_AST_Glut']], regionalisation_genes[['Hsap_AST_GABA']])) %>% distinct() %>% select(Hsap))
Mmus_genes_in_region_orthogroups = unlist(orthogroups %>% filter(Orthogroup %in% c(Vertebrate_AST_Glut_region_orthogroups, Vertebrate_AST_GABA_region_orthogroups)) %>% separate_rows(Mmus, sep = ", ") %>%  
    filter(Mmus %in% c(regionalisation_genes[['Mmus_AST_Glut']], regionalisation_genes[['Mmus_AST_GABA']])) %>% distinct() %>% select(Mmus))
Pvit_genes_in_region_orthogroups = unlist(orthogroups %>% filter(Orthogroup %in% c(Vertebrate_AST_Glut_region_orthogroups, Vertebrate_AST_GABA_region_orthogroups)) %>% separate_rows(Pvit, sep = ", ") %>%  
    filter(Pvit %in% c(regionalisation_genes[['Pvit_AST_Glut']], regionalisation_genes[['Pvit_AST_GABA']])) %>% distinct() %>% select(Pvit))
Pmar_genes_in_region_orthogroups = unlist(orthogroups %>% filter(Orthogroup %in% c(Vertebrate_AST_Glut_region_orthogroups, Vertebrate_AST_GABA_region_orthogroups)) %>% separate_rows(Pmar, sep = ", ") %>%  
    filter(Pmar %in% c(regionalisation_genes[['Pmar_AST_Glut']], regionalisation_genes[['Pmar_AST_GABA']])) %>% distinct() %>% select(Pmar))

In [11]:
# check and plot overalp across species by Fisher's exact test, comparing at orthogroup level
get_res2 <- function(type){
    species <- c('Hsap', 'Mmus', 'Pvit', 'Pmar')
    # Calculate p-values matrix
    p_values <- matrix(NA, nrow = 4, ncol = 4)
    intersect_numbers <- matrix(NA, nrow = 4, ncol = 4)
    rownames(p_values) <- colnames(p_values) <- species
    rownames(intersect_numbers) <- colnames(intersect_numbers) <- species
    for (i in species) {
        for (j in species) {
            if (i != j){
                set1 = get(paste0(i,'_AST_', type, '_region_orthogroups'))
                set2 = get(paste0(j,'_AST_', type, '_region_orthogroups'))
                
                p_values[i, j] <- calc_pvalue(set1, set2, nrow(orthogroups))
                intersect_numbers[i,j] <- length(intersect(set1, set2))
            } else {
                p_values[i, j] <- 1
                set1 = get(paste0(i,'_AST_', type, '_region_orthogroups'))
                intersect_numbers[i,j] <- length(set1)
            }
        }
    }
    
    # Convert to -log10 scale
    log_p_values <- -log10(p_values)

    # Melt for ggplot2
    log_p_values_melt <- reshape2::melt(log_p_values, na.rm = TRUE)
    log_p_values_melt$number <- vapply(1:nrow(log_p_values_melt), FUN = function(row){
        var1 = log_p_values_melt[row, 1]
        var2 = log_p_values_melt[row, 2]
        return(intersect_numbers[var1, var2])
    }, FUN.VALUE = numeric(1))
    
    log_p_values_melt[is.infinite(log_p_values_melt$value), 'value'] = 200 # set a large value to represent -log(0)
    log_p_values_melt[which(log_p_values_melt$Var1 == log_p_values_melt$Var2), 'value'] <- NA
    p <- ggplot(log_p_values_melt, aes(Var1, Var2, fill = value)) + 
        geom_tile(color = "white") + 
        scale_fill_gradient(low = "grey", high = "red", na.value = "white", 
                            name = expression(-log[10](P)), limits = c(0, 10), oob = scales::squish) +
        geom_text(aes(label = number), color = "black") + theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
        labs(x = "", y = "", title = "Orthogroup Comparison Matrix")
    ggsave(p, filename = paste0('figures/Vertebrate.AST_', type, '.comparison_mat_across_species_orthogroup_level.pdf'), width = 6, height = 5)
    
}

In [12]:
get_res2('GABA')
get_res2('Glut')

In [13]:
# GO enrichment plot 
suppressPackageStartupMessages({
    library(org.Hs.eg.db)
    library(org.Mm.eg.db)
    library(org.Pvitticeps.eg.db)
    library(org.Pmarinus.eg.db)
    library(clusterProfiler)
    library(enrichplot)
    library(ggplot2)
})
options(enrichplot.colours = c("red","blue"))

“package ‘Biobase’ was built under R version 4.3.3”
“package ‘IRanges’ was built under R version 4.3.3”
“package ‘S4Vectors’ was built under R version 4.3.3”


In [14]:
Hsap_bg2 <- Hsap_bg[Hsap_bg %in% unlist(orthogroups2 %>% separate_rows(Hsap, sep = ", ") %>% select(Hsap))]
Hsap_ego2 <- enrichGO(gene         = Hsap_genes_in_region_orthogroups,
                 OrgDb         = org.Hs.eg.db,
                 keyType = "ENSEMBL",
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 universe = Hsap_bg2,
                 ont  = "BP")

In [15]:
Mmus_bg2 <- Mmus_bg[Mmus_bg %in% unlist(orthogroups2 %>% separate_rows(Mmus, sep = ", ") %>% select(Mmus))]
Mmus_ego2 <- enrichGO(gene         = Mmus_genes_in_region_orthogroups,
                 OrgDb         = org.Mm.eg.db,
                 keyType = "SYMBOL",
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 universe = Mmus_bg2,
                 ont  = "BP")

In [16]:
Pvit_bg2 <- Pvit_bg[Pvit_bg %in% unlist(orthogroups2 %>% separate_rows(Pvit, sep = ", ") %>% select(Pvit))]
Pvit_ego2 <- enrichGO(gene         = Pvit_genes_in_region_orthogroups,
                 OrgDb         = org.Pvitticeps.eg.db,
                 keyType = "GID",
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 universe = Pvit_bg2,
                 ont  = "BP")

In [17]:
Pmar_bg2 <- Pmar_bg[Pmar_bg %in% unlist(orthogroups2 %>% separate_rows(Pmar, sep = ", ") %>% select(Pmar))]
Pmar_ego2 <- enrichGO(gene         = Pmar_genes_in_region_orthogroups,
                 OrgDb         = org.Pmarinus.eg.db,
                 keyType = "GID",
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 universe = Pmar_bg2,
                 ont  = "BP")

In [18]:
# plot selected terms
#selected <- c('regionalization', 'forebrain development', 'embryonic organ development', 'forebrain regionalization', 'telencephalon development',
#              'dorsal/ventral pattern formation', 'proximal/distal pattern formation', 
#              'pattern specification process', 'neuron fate determination', 'anterior/posterior pattern specification',
#             'embryonic organ development', 'axon development', 'diencephalon development', 'renal tubule development')
#fig <- dotplot(Hsap_ego2, showCategory = selected)
#ggsave(filename = "figures/Hsap.conserved_regionalisation_genes.selectedGO.pdf", fig, width = 5, height = 6)
#
#selected <- c('pattern specification process', 'anterior/posterior pattern specification','forebrain regionalization', 'embryonic organ development', 'cell fate determination',
#             'regulation of nervous system development','regulation of neuron differentiation', 'hindbrain development',
#             'midbrain development', 'dorsal/ventral pattern formation', 'proximal/distal pattern formation',
#             'embryonic brain development', 'sensory system development', 'visual system development')
#fig <- dotplot(Mmus_ego2, showCategory = selected)
#ggsave(filename = "figures/Mmus.conserved_regionalisation_genes.selectedGO.pdf", fig, width = 5, height = 6)
#
#selected <- c('neural crest cell development', 'forebrain development', 'hindbrain development', 'telencephalon development',
#              'neuron fate commitment','forebrain regionalization','pattern specification process', 'regionalization',
#             'regulation of nervous system development', 'cell fate determination','embryonic brain development', 'dorsal/ventral pattern formation', 
#              'anterior/posterior pattern specification','proximal/distal pattern formation', 'stem cell development')
#fig <- dotplot(Pvit_ego2, showCategory = selected)
#ggsave(filename = "figures/Pvit.conserved_regionalisation_genes.selectedGO.pdf", fig, width = 5, height = 6)
#
#selected <- c('regionalization', 'pattern specification process', 'anterior/posterior pattern specification', 'dorsal/ventral pattern formation',
#              'forebrain regionalization', 'midbrain-hindbrain boundary development', 'proximal/distal pattern formation', 'embryonic organ development',
#              'cell fate commitment', 'midbrain development', 'forebrain development', 'diencephalon development', 'axonogenesis', 'telencephalon development')
#fig <- dotplot(Pmar_ego2, showCategory = selected)
#ggsave(filename = "figures/Pmar.conserved_regionalisation_genes.selectedGO.pdf", fig, width = 5, height = 6)

In [19]:
# select genes for showing in Fig. 4
Hsap_genes_in_regionalisation <- intersect(regionalisation_genes[['Hsap_AST_Glut']], regionalisation_genes[['Hsap_AST_GABA']])
Mmus_genes_in_regionalisation <- intersect(regionalisation_genes[['Mmus_AST_Glut']], regionalisation_genes[['Mmus_AST_GABA']])
Pvit_genes_in_regionalisation <- intersect(regionalisation_genes[['Pvit_AST_Glut']], regionalisation_genes[['Pvit_AST_GABA']])
Pmar_genes_in_regionalisation <- intersect(regionalisation_genes[['Pmar_AST_Glut']], regionalisation_genes[['Pmar_AST_GABA']])


Hsap_ego2 <- enrichGO(gene         = Hsap_genes_in_regionalisation,
                 OrgDb         = org.Hs.eg.db,
                 keyType = "ENSEMBL",
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 universe = Hsap_bg2,
                 ont  = "BP")

Mmus_ego2 <- enrichGO(gene         = Mmus_genes_in_regionalisation,
                 OrgDb         = org.Mm.eg.db,
                 keyType = "SYMBOL",
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 universe = Mmus_bg2,
                 ont  = "BP")

Pvit_ego2 <- enrichGO(gene         = Pvit_genes_in_regionalisation,
                 OrgDb         = org.Pvitticeps.eg.db,
                 keyType = "GID",
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 universe = Pvit_bg2,
                 ont  = "BP")

Pmar_ego2 <- enrichGO(gene         = Pmar_genes_in_regionalisation,
                 OrgDb         = org.Pmarinus.eg.db,
                 keyType = "GID",
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 universe = Pmar_bg2,
                 ont  = "BP")

In [166]:
"Wdr19" %in% Mmus_genes_in_region_orthogroups
"Wdr19" %in% Mmus_genes_in_regionalisation


In [30]:
# select genes for showing case in Fig. 4e
unique(Reduce(c, lapply(Mmus_ego2@result[grepl('dien', Mmus_ego2@result$Description), "geneID"], FUN = function(x){
    strsplit(x, split = '/')[[1]]
})))

In [81]:
Mmus_ego2@result[grepl('hind', Mmus_ego2@result$Description), ]

Unnamed: 0_level_0,ID,Description,GeneRatio,BgRatio,pvalue,p.adjust,qvalue,geneID,Count
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<int>
GO:0030902,GO:0030902,hindbrain development,19/1250,162/13183,0.1954459,1,1,Aldh1a2/Cbln1/Ctnnb1/Dab1/Dlc1/En1/En2/Foxa2/Hoxa2/Hoxb2/Hoxb3/Igf1r/Kat2a/Neurod1/Neurod2/Ophn1/Prox1/Rnf7/Zbtb18,19
GO:0035137,GO:0035137,hindlimb morphogenesis,6/1250,46/13183,0.267792,1,1,Ctnnb1/Gdf5/Mecom/Tbx3/Tfap2b/Twist1,6
GO:0021533,GO:0021533,cell differentiation in hindbrain,4/1250,29/13183,0.2937841,1,1,Cbln1/Foxa2/Ophn1/Prox1,4
GO:0021932,GO:0021932,hindbrain radial glia guided cell migration,2/1250,13/13183,0.3531847,1,1,Dab1/Rnf7,2
GO:0035116,GO:0035116,embryonic hindlimb morphogenesis,4/1250,35/13183,0.4272707,1,1,Ctnnb1/Mecom/Tbx3/Twist1,4
GO:0021535,GO:0021535,cell migration in hindbrain,2/1250,20/13183,0.5781239,1,1,Dab1/Rnf7,2
GO:0021575,GO:0021575,hindbrain morphogenesis,5/1250,57/13183,0.6391508,1,1,Cbln1/Dab1/Dlc1/Ophn1/Prox1,5


In [20]:
# select genes for showing case in Fig. 4e
unique(Reduce(c, lapply(Mmus_ego2@result[grepl('regionalization|development', Mmus_ego2@result$Description), "geneID"], FUN = function(x){
    strsplit(x, split = '/')[[1]]
})))

In [21]:
# perfertial retention of orthogroups
# calculate number of genes by calcualting commas in it
count_commas <- function(x) {
  sapply(gregexpr(",", x), function(match) ifelse(match[1] == -1, 0, length(match)))
}
number_genes <- data.frame(apply(orthogroups[,-1], c(1,2), count_commas))
         
number_genes$Pmar[orthogroups$Pmar != ""] <- number_genes$Pmar[orthogroups$Pmar != ""] + 1
number_genes$Pvit[orthogroups$Pvit != ""] <- number_genes$Pvit[orthogroups$Pvit != ""] + 1
number_genes$Mmus[orthogroups$Mmus != ""] <- number_genes$Mmus[orthogroups$Mmus != ""] + 1
number_genes$Hsap[orthogroups$Hsap != ""] <- number_genes$Hsap[orthogroups$Hsap != ""] + 1
number_genes$Orthogroup <- orthogroups$Orthogroup

In [22]:
Hsap_region_orthogroups <- intersect(Hsap_AST_GABA_region_orthogroups, Hsap_AST_Glut_region_orthogroups)
Mmus_region_orthogroups <- intersect(Mmus_AST_GABA_region_orthogroups, Mmus_AST_Glut_region_orthogroups)
Pvit_region_orthogroups <- intersect(Pvit_AST_GABA_region_orthogroups, Pvit_AST_Glut_region_orthogroups)
Pmar_region_orthogroups <- intersect(Pmar_AST_GABA_region_orthogroups, Pmar_AST_Glut_region_orthogroups)

In [52]:
# test to show size of orthogroups
number_of_orthogroups <- reshape2::melt(number_genes)
number_of_orthogroups$type <- 'bg'

tmp1 <- number_of_orthogroups %>% filter(Orthogroup %in% Hsap_region_orthogroups & variable == 'Hsap')
tmp2 <- number_of_orthogroups %>% filter(Orthogroup %in% Mmus_region_orthogroups & variable == 'Mmus') 
tmp3 <- number_of_orthogroups %>% filter(Orthogroup %in% Pvit_region_orthogroups & variable == 'Pvit') 
tmp4 <- number_of_orthogroups %>% filter(Orthogroup %in% Pmar_region_orthogroups & variable == 'Pmar') 

tmp <- rbind(tmp1, tmp2, tmp3, tmp4)
tmp$type <- 'regionalisation'
number_of_orthogroups <- rbind(number_of_orthogroups, tmp)
number_of_orthogroups$species <- factor(number_of_orthogroups$variable, levels = species)
number_of_orthogroups$type <- factor(number_of_orthogroups$type, levels = c("regionalisation", "bg"))
p <- gghistogram(number_of_orthogroups, x = "value", fill = "type", color = "type",
   add = "mean",  binwidth = 1,  boundary = 0) +  facet_wrap(~species, ncol = 1) + xlim(0,10)
p

Using Orthogroup as id variables



In [66]:
t.test(number_genes[number_genes$Orthogroup %in% Mmus_region_orthogroups, "Mmus"],
            number_genes$Mmus)
wilcox.test(number_genes[number_genes$Orthogroup %in% Mmus_region_orthogroups, "Mmus"],
            number_genes$Mmus)


	Welch Two Sample t-test

data:  number_genes[number_genes$Orthogroup %in% Mmus_region_orthogroups, "Mmus"] and number_genes$Mmus
t = 9.4956, df = 1411, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.142704 1.737762
sample estimates:
mean of x mean of y 
 2.954785  1.514552 



	Wilcoxon rank sum test with continuity correction

data:  number_genes[number_genes$Orthogroup %in% Mmus_region_orthogroups, "Mmus"] and number_genes$Mmus
W = 13354856, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0


In [24]:
t.test(number_genes[number_genes$Orthogroup %in% Hsap_region_orthogroups, "Hsap"],
            number_genes$Hsap)
wilcox.test(number_genes[number_genes$Orthogroup %in% Hsap_region_orthogroups, "Hsap"],
            number_genes$Hsap)


	Welch Two Sample t-test

data:  number_genes[number_genes$Orthogroup %in% Hsap_region_orthogroups, "Hsap"] and number_genes$Hsap
t = 12.936, df = 1213.1, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.683672 2.285690
sample estimates:
mean of x mean of y 
 3.394247  1.409566 



	Wilcoxon rank sum test with continuity correction

data:  number_genes[number_genes$Orthogroup %in% Hsap_region_orthogroups, "Hsap"] and number_genes$Hsap
W = 12338644, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0


In [25]:
t.test(number_genes[number_genes$Orthogroup %in% Pvit_region_orthogroups, "Pvit"],
            number_genes$Pvit)
wilcox.test(number_genes[number_genes$Orthogroup %in% Pvit_region_orthogroups, "Pvit"],
            number_genes$Pvit)


	Welch Two Sample t-test

data:  number_genes[number_genes$Orthogroup %in% Pvit_region_orthogroups, "Pvit"] and number_genes$Pvit
t = 14.218, df = 2786.3, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.083212 1.429784
sample estimates:
mean of x mean of y 
 2.549296  1.292798 



	Wilcoxon rank sum test with continuity correction

data:  number_genes[number_genes$Orthogroup %in% Pvit_region_orthogroups, "Pvit"] and number_genes$Pvit
W = 25539590, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0


In [26]:
t.test(number_genes[number_genes$Orthogroup %in% Pmar_region_orthogroups, "Pmar"],
            number_genes$Pmar)
wilcox.test(number_genes[number_genes$Orthogroup %in% Pmar_region_orthogroups, "Pmar"],
            number_genes$Pmar)


	Welch Two Sample t-test

data:  number_genes[number_genes$Orthogroup %in% Pmar_region_orthogroups, "Pmar"] and number_genes$Pmar
t = 11.961, df = 1296.9, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.439044 2.003691
sample estimates:
mean of x mean of y 
 2.988076  1.266709 



	Wilcoxon rank sum test with continuity correction

data:  number_genes[number_genes$Orthogroup %in% Pmar_region_orthogroups, "Pmar"] and number_genes$Pmar
W = 13433921, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0


In [134]:
# test whether these genes are significantly associated with WGD or SS
WGD_SSD <- readRDS('../../../../10.dNdS/Combined.SSD_WGD.pairs.rds')

# only OR > 1 results were retained in calc_pvalue function
species <- c('Hsap', 'Mmus', 'Pvit', 'Pmar')
type <- c('WGD', 'SSD')

OR_WGD_SSD_region <- c()
for (s in species){
    tmp1 <- get(paste0(s, '_genes_in_region_orthogroups'))
    for (t in type){
        tmp2 <- WGD_SSD[[s]] %>% filter(type == t)
        tmp2 <- unique(c(tmp2$Dup1, tmp2$Dup2))
        tmp3 <- length(get(paste0(s, '_bg')))
        OR_WGD_SSD_region <- rbind(OR_WGD_SSD_region, c(s, t,  length(intersect(tmp1, tmp2)),
                                                        calc_OR(tmp1, tmp2, tmp3),
                                                        calc_pvalue(tmp1, tmp2, tmp3) ))
    }
}
OR_WGD_SSD_region <- as.data.frame(OR_WGD_SSD_region)
colnames(OR_WGD_SSD_region) <- c('species', 'type', 'number','ORs','pvalue')
OR_WGD_SSD_region$species <- factor(OR_WGD_SSD_region$species, levels = species)
OR_WGD_SSD_region$log_p_values <- -log10(as.double(OR_WGD_SSD_region$pvalue))
OR_WGD_SSD_region[is.infinite(OR_WGD_SSD_region$log_p_values), 'log_p_values'] = 200
OR_WGD_SSD_region$text <- paste(OR_WGD_SSD_region$number,  round(as.double(OR_WGD_SSD_region$ORs),2), sep = '_')
p <- ggplot(OR_WGD_SSD_region, aes(species, type, fill = log_p_values)) + 
        geom_tile(color = "white") + 
        scale_fill_gradient(low = "grey", high = "red", na.value = "white", 
                            name = expression(-log[10](P)), limits = c(0, 15), oob = scales::squish) +
        geom_text(aes(label = text), color = "black") + theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
        labs(x = "", y = "", title = "Fishers between oh/SSDpa to regionalisation genes")
ggsave(p, filename = paste0('figures/Vertebrate.WGD_SSD_comparison_mat_across_species_regionalisation_genes.pdf'), width = 6, height = 3)


In [131]:
calc_OR <- function(set1, set2, universe_size) {
  overlap <- length(intersect(set1, set2))
  matrix_vals <- matrix(c(overlap, length(set1) - overlap, length(set2) - overlap, universe_size - length(set1) - length(set2) + overlap), 
                        nrow = 2)
    return(fisher.test(matrix_vals)$estimate)
}

In [133]:
OR_WGD_SSD_region

species,type,number,ORs,pvalue,log_p_values
<fct>,<chr>,<chr>,<chr>,<chr>,<dbl>
Hsap,WGD,194,2.20898790650062,9.25666991916475e-16,15.0335452
Hsap,SSD,219,1.60967718680702,5.53617893061472e-07,6.2567899
Mmus,WGD,192,2.33491182432456,1.96862782512772e-17,16.7058364
Mmus,SSD,198,1.576384252371,3.12117704650904e-06,5.5056816
Pvit,WGD,230,2.11378610058441,7.36637556681034e-16,15.1327461
Pvit,SSD,223,2.14050738607569,3.94852106060383e-16,15.4035655
Pmar,WGD,156,3.40781973491784,2.32627492899882e-28,27.633339
Pmar,SSD,158,1.15656932523699,0.171869198210773,0.7648019
