### Import and Setup


In [None]:
# Name of the Bioconductor annotation package used for ID mapping; it is
# loaded by name further down via library(organism, character.only = TRUE).
organism <- "org.At.tair.db"

# DESeq2 results table (created in exploration.ipynb).
DEGs <- "results.csv"

In [None]:
library("ggplot2")
library("dplyr")
library("DESeq2")
library("clusterProfiler")
library("pathview")
library("enrichplot")
library(organism, character.only = TRUE)

In [None]:
# Read the DESeq2 results table whose path is stored in `DEGs`.
df <- read.csv(DEGs, header = TRUE)

# Ranking metric for GSEA: the log2 fold change of every gene, named by
# gene symbol. `original_gene_list` is kept unfiltered because the KEGG
# preparation step reuses its names.
original_gene_list <- df$log2FoldChange
names(original_gene_list) <- df$Symbol

# Keep only entries that can actually be ranked and looked up:
#   * the fold change must not be NA, and
#   * the gene must have a non-missing, non-empty symbol.
has_value <- !is.na(original_gene_list)
has_symbol <- !is.na(names(original_gene_list)) &
  nzchar(names(original_gene_list))
gene_list <- original_gene_list[has_value & has_symbol]

# A symbol that occurs on several rows would give duplicated names in the
# ranked vector, which the GSEA backend warns about. Keep the first
# occurrence of each symbol (same convention as the `!duplicated()` step in
# the KEGG preparation).
gene_list <- gene_list[!duplicated(names(gene_list))]

# Sort in decreasing order (required for clusterProfiler).
gene_list <- sort(gene_list, decreasing = TRUE)

In [None]:
# Check available keytypes: lists the identifier types the annotation
# package exposes. Useful for confirming that "SYMBOL" (used as keyType in
# gseGO and fromType in bitr) and "ENTREZID" (toType in bitr) are available.
keytypes(org.At.tair.db)  

### GSEA

In [None]:
# Gene-set enrichment analysis against Gene Ontology, driven by the ranked
# log2 fold-change vector `gene_list` (names are gene symbols).
gse <- gseGO(
  geneList = gene_list,
  OrgDb = organism,        # annotation package, given by name
  keyType = "SYMBOL",      # identifier type used in names(gene_list)
  ont = "ALL",
  minGSSize = 3,           # smallest gene set tested
  maxGSSize = 800,         # largest gene set tested
  pvalueCutoff = 0.05,
  verbose = TRUE
)

In [None]:
# library() stops with an error if DOSE is missing; require() would only
# warn and return FALSE, letting the cell carry on in a half-loaded state.
library(DOSE)

# Top 10 GO terms, split and faceted by the `.sign` column of the result.
dotplot(gse, showCategory = 10, split = ".sign") +
  facet_grid(. ~ .sign)

In [None]:
# library() stops with an error if ggridges is missing; require() would
# only warn and return FALSE, deferring the failure to the plotting call.
library(ggridges)

# Ridgeline plot of the GO GSEA result. The wider plot margin, smaller
# y-axis text and clip = "off" leave room for long term labels.
ridgeplot(gse) +
  labs(x = "enrichment distribution") +
  theme(
    plot.margin = margin(1, 1, 1, 1, "cm"),
    axis.text.y = element_text(size = 8)
  ) +
  coord_cartesian(clip = "off")

### KEGG

In [None]:
# Prepare the ranked input for gseKEGG(), which is run on Entrez gene IDs
# rather than gene symbols.
# Some genes are lost here because not every symbol can be converted.
ids <- bitr(
  names(original_gene_list),
  fromType = "SYMBOL",
  toType = "ENTREZID",
  OrgDb = organism
)

# A symbol may map to more than one Entrez ID; keep its first mapping only.
dedup_ids <- ids[!duplicated(ids[c("SYMBOL")]), ]

# df2 contains only the genes that bitr() mapped successfully.
df2 <- df[df$Symbol %in% dedup_ids$SYMBOL, ]

# New column Y: the Entrez ID corresponding to each row's symbol.
df2$Y <- dedup_ids$ENTREZID[match(df2$Symbol, dedup_ids$SYMBOL)]

# Ranking metric for the gene universe, named by Entrez ID.
kegg_gene_list <- df2$log2FoldChange
names(kegg_gene_list) <- df2$Y

# Omit genes without a fold change.
kegg_gene_list <- na.omit(kegg_gene_list)

# If a symbol appears on several rows of the results table, its Entrez ID
# would appear several times in the ranked vector. Duplicated names are a
# problem for the GSEA backend, so keep the first occurrence of each ID.
kegg_gene_list <- kegg_gene_list[!duplicated(names(kegg_gene_list))]

# Sort in decreasing order (required for clusterProfiler).
kegg_gene_list <- sort(kegg_gene_list, decreasing = TRUE)

In [None]:

# KEGG organism code passed to gseKEGG().
kegg_organism <- "ath"

# Gene-set enrichment analysis against KEGG pathways, using the ranked
# vector named by Entrez (NCBI) gene IDs.
# NOTE: the former `nPerm = 10000` argument is intentionally not passed.
# It is the deprecated permutation-count argument, and the gseGO() call in
# this notebook already has it disabled; leaving it out lets both analyses
# use the same default estimation method.
kk <- gseKEGG(
  geneList = kegg_gene_list,
  organism = kegg_organism,
  keyType = "ncbi-geneid",
  minGSSize = 3,
  maxGSSize = 800,
  pvalueCutoff = 0.05
)


In [None]:
# Top 10 KEGG pathways, split and faceted by the `.sign` column.
dotplot(
  kk,
  showCategory = 10,
  title = "Enriched Pathways",
  split = ".sign"
) +
  facet_grid(. ~ .sign)

In [None]:
# Ridgeline plot of the KEGG GSEA result.
ridgeplot(kk) +
  labs(x = "enrichment distribution")

### WebGestaltR Over Representation Analysis

In [None]:
# Load WebGestaltR and the helper packages used below (this cell does not install them; they must already be installed)
library(WebGestaltR)
library(dplyr)
library(stringr)

In [None]:
# Build the three gene lists WebGestaltR needs from the DESeq2 table `df`.

# Significant genes: adjusted p-value below 0.05 and |log2FC| above 1.
significant_genes <- df[
  with(df, !is.na(padj) & padj < 0.05 & abs(log2FoldChange) > 1),
]

# Upregulated subset (log2FC > 1), as symbols with missing symbols removed.
up_genes_webgestalt <- significant_genes[
  significant_genes$log2FoldChange > 1,
  "Symbol"
]
up_genes_webgestalt <- up_genes_webgestalt[!is.na(up_genes_webgestalt)]

# Downregulated subset (log2FC < -1), as symbols with missing symbols removed.
down_genes_webgestalt <- significant_genes[
  significant_genes$log2FoldChange < -1,
  "Symbol"
]
down_genes_webgestalt <- down_genes_webgestalt[!is.na(down_genes_webgestalt)]

# Reference set: every gene in the dataset that has a symbol.
background_genes <- df$Symbol[!is.na(df$Symbol)]

# Report the size of each list.
cat("Number of upregulated genes:", length(up_genes_webgestalt), "\n")
cat("Number of downregulated genes:", length(down_genes_webgestalt), "\n")
cat("Total background genes:", length(background_genes), "\n")

#### Gene Ontology Analysis - Upregulated Genes

In [None]:
# Over-representation analysis (ORA) of the upregulated genes against
# GO Biological Process, with all genes in the dataset as the reference.
webgestalt_up_BP <- WebGestaltR(
  # What to test, and against what
  enrichMethod = "ORA",
  organism = "athaliana",  # Arabidopsis thaliana
  enrichDatabase = "geneontology_Biological_Process",
  # Gene lists and their identifier type
  interestGene = up_genes_webgestalt,
  interestGeneType = "genesymbol",
  referenceGene = background_genes,
  referenceGeneType = "genesymbol",
  # Category size limits and significance threshold
  minNum = 5,
  maxNum = 500,
  fdrThr = 0.05,
  # NOTE(review): per the WebGestaltR docs, topThr applies only when
  # sigMethod = "top" and reportNum only affects the generated report, so
  # neither should influence this call (default sigMethod, isOutput = FALSE).
  topThr = 20,
  reportNum = 20,
  # Output handling: keep the result in memory, write nothing to disk
  projectName = "WebGestalt_UP_BP",
  isOutput = FALSE
)

print("WebGestaltR analysis for upregulated genes (BP) completed")

In [None]:
# Dot plot of the GO Biological Process ORA results for upregulated genes.
# is.data.frame() also covers the NULL case and guards against a result
# that is not a table at all (nrow() would then return NULL and break `if`).
if (is.data.frame(webgestalt_up_BP) && nrow(webgestalt_up_BP) > 0) {
  # Prepare data for plotting.
  plot_data_up <- webgestalt_up_BP %>%
    # Sort by FDR *before* truncating so the 15 rows kept are the 15 most
    # significant terms regardless of the order of the input table.
    arrange(FDR) %>%
    head(15) %>%
    mutate(
      # Fraction of each category's genes found in the gene list.
      GeneRatio = overlap / size,
      # An FDR of exactly 0 would give -log10(0) = Inf, which cannot be
      # placed on the colour gradient; floor it at machine precision.
      # NOTE(review): assumes a reported 0 means "below double precision".
      negLog10FDR = -log10(pmax(FDR, .Machine$double.eps)),
      # Wrap long term names so the y-axis stays readable.
      Description = stringr::str_wrap(description, width = 50)
    )

  # Terms on the y-axis are ordered by significance; point size is the
  # number of overlapping genes, colour is -log10(FDR).
  webgestalt_up_dotplot <- ggplot(
    plot_data_up,
    aes(x = GeneRatio, y = reorder(Description, negLog10FDR))
  ) +
    geom_point(aes(size = overlap, color = negLog10FDR)) +
    scale_color_gradient(low = "blue", high = "red", name = "-log10(FDR)") +
    scale_size_continuous(name = "Gene Count", range = c(2, 8)) +
    labs(
      title = "GO Biological Process - Upregulated Genes",
      x = "Gene Ratio",
      y = "GO Terms"
    ) +
    theme_bw() +
    theme(
      axis.text.y = element_text(size = 9),
      plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
      legend.text = element_text(size = 8),
      legend.title = element_text(size = 9)
    )

  print(webgestalt_up_dotplot)
} else {
  print("No significant GO terms found for upregulated genes")
}

#### Gene Ontology Analysis - Downregulated Genes

In [None]:
# Over-representation analysis (ORA) of the downregulated genes against
# GO Biological Process, with all genes in the dataset as the reference.
webgestalt_down_BP <- WebGestaltR(
  # What to test, and against what
  enrichMethod = "ORA",
  organism = "athaliana",  # Arabidopsis thaliana
  enrichDatabase = "geneontology_Biological_Process",
  # Gene lists and their identifier type
  interestGene = down_genes_webgestalt,
  interestGeneType = "genesymbol",
  referenceGene = background_genes,
  referenceGeneType = "genesymbol",
  # Category size limits and significance threshold
  minNum = 5,
  maxNum = 500,
  fdrThr = 0.05,
  # NOTE(review): per the WebGestaltR docs, topThr applies only when
  # sigMethod = "top" and reportNum only affects the generated report, so
  # neither should influence this call (default sigMethod, isOutput = FALSE).
  topThr = 20,
  reportNum = 20,
  # Output handling: keep the result in memory, write nothing to disk
  projectName = "WebGestalt_DOWN_BP",
  isOutput = FALSE
)

print("WebGestaltR analysis for downregulated genes (BP) completed")

In [None]:
# Dot plot of the GO Biological Process ORA results for downregulated genes.
# is.data.frame() also covers the NULL case and guards against a result
# that is not a table at all (nrow() would then return NULL and break `if`).
if (is.data.frame(webgestalt_down_BP) && nrow(webgestalt_down_BP) > 0) {
  # Prepare data for plotting.
  plot_data_down <- webgestalt_down_BP %>%
    # Sort by FDR *before* truncating so the 15 rows kept are the 15 most
    # significant terms regardless of the order of the input table.
    arrange(FDR) %>%
    head(15) %>%
    mutate(
      # Fraction of each category's genes found in the gene list.
      GeneRatio = overlap / size,
      # An FDR of exactly 0 would give -log10(0) = Inf, which cannot be
      # placed on the colour gradient; floor it at machine precision.
      # NOTE(review): assumes a reported 0 means "below double precision".
      negLog10FDR = -log10(pmax(FDR, .Machine$double.eps)),
      # Wrap long term names so the y-axis stays readable.
      Description = stringr::str_wrap(description, width = 50)
    )

  # Terms on the y-axis are ordered by significance; point size is the
  # number of overlapping genes, colour is -log10(FDR).
  webgestalt_down_dotplot <- ggplot(
    plot_data_down,
    aes(x = GeneRatio, y = reorder(Description, negLog10FDR))
  ) +
    geom_point(aes(size = overlap, color = negLog10FDR)) +
    scale_color_gradient(low = "blue", high = "red", name = "-log10(FDR)") +
    scale_size_continuous(name = "Gene Count", range = c(2, 8)) +
    labs(
      title = "GO Biological Process - Downregulated Genes",
      x = "Gene Ratio",
      y = "GO Terms"
    ) +
    theme_bw() +
    theme(
      axis.text.y = element_text(size = 9),
      plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
      legend.text = element_text(size = 8),
      legend.title = element_text(size = 9)
    )

  print(webgestalt_down_dotplot)
} else {
  print("No significant GO terms found for downregulated genes")
}

In [None]:
# Optional: print the first five rows (or fewer, if the table is shorter)
# of each WebGestaltR result, restricted to the most informative columns.
if (!is.null(webgestalt_up_BP) && nrow(webgestalt_up_BP) > 0) {
  cat("Top 5 GO terms for UPREGULATED genes:\n")
  print(head(
    webgestalt_up_BP[, c("description", "enrichmentRatio", "FDR", "overlap")],
    n = 5
  ))
}

if (!is.null(webgestalt_down_BP) && nrow(webgestalt_down_BP) > 0) {
  cat("\nTop 5 GO terms for DOWNREGULATED genes:\n")
  print(head(
    webgestalt_down_BP[, c("description", "enrichmentRatio", "FDR", "overlap")],
    n = 5
  ))
}