Read in tables with significant DEGs and mapped annotations like GO or KEGG. Create vector of Kegg id's to match them to a descriptive name. 

In [6]:
library('tidyr')
library('ggplot2')
library('stringr')
library('dplyr')
library("RCurl")
library('tibble')
library('KEGGREST')
library('RCurl')
#library('enrichplot')
#library('data.table')


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘RCurl’


The following object is masked from ‘package:tidyr’:

    complete




# Read in kegg annotations from eggnog
## make a list of all unique kegg_ko's from all annotations 


In [7]:
get.eggnog <- function(org){
    dir2  <- paste("/work/nclab/lucy/SAB/Annotation/eggnog/", org, sep="")
    
    emap <- read.csv(file.path(dir2, paste(org,".emapper.annotations", sep="")),
                 sep = "\t",
                 comment.char = "#",
                 header = FALSE,
                 na.strings = "-")
    
    colnames(emap) <- c(
        'orfs', 'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category', 'Description', 
        'Preferred_name', 'GOs', 'EC', 'ko_id', 'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 
        'KEGG_rclass',  'BRITE', 'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'PFAMs')
    
    emap[, c("orfs", "seed_ortholog", "EC","GOs","ko_id",'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 
       'KEGG_rclass',  'BRITE', 'KEGG_TC', 'CAZy', 'BiGG_Reaction',"PFAMs")]
    }

emap4 <- get.eggnog('04')
emap8 <- get.eggnog('08')
emap6 <- get.eggnog('06')
emap13 <- get.eggnog('13')

In [2]:
length(na.omit(emap4$ko_id))


Create df for each organism with annotation of choice, removing the rows with NA's

In [8]:
get.anno <- function(emap, anno){
    #extract query column and annotation of choice, ex: pfams
    vars <- c("orfs", anno)
    df <- select(emap, all_of(vars))
    df <- df %>% filter(is.na(df[,ncol(df)])==FALSE)
    }

pfam4 <- get.anno(emap4, "PFAMs")
pfam8 <- get.anno(emap8, "PFAMs")
pfam6 <- get.anno(emap6, "PFAMs")
pfam13 <- get.anno(emap13, "PFAMs")

ko4_ls <- get.anno(emap4,"ko_id")
ko8_ls <- get.anno(emap8,"ko_id")
ko6_ls <- get.anno(emap6,"ko_id")
ko13_ls <- get.anno(emap13,"ko_id")

go4 <- get.anno(emap4, "GOs")
go8 <- get.anno(emap8, "GOs")
go6 <- get.anno(emap6, "GOs")
go13 <- get.anno(emap13, "GOs")

paths4 <- get.anno(emap4, c('KEGG_Module', 'KEGG_Pathway','BRITE'))
paths8 <- get.anno(emap8, c('KEGG_Module', 'KEGG_Pathway','BRITE'))
paths6 <- get.anno(emap6, c('KEGG_Module', 'KEGG_Pathway','BRITE'))
paths13 <- get.anno(emap13, c('KEGG_Module', 'KEGG_Pathway','BRITE'))

In [None]:
print('# kos')
dim(ko4_ls)
dim(ko8_ls)
dim(ko6_ls)
dim(ko13_ls)
print('# pfams')
dim(pfam4)
dim(pfam8)
dim(pfam6)
dim(pfam13)
print('# gos')
dim(go4)
dim(go8)
dim(go6)
dim(go13)


## Create a ko-to-orf mapping table
#### Split columns with mulitple annotations and pivot into a long table with repeating orfs
This will give us a 3 column table with columns 'orfs', 'ko_id', and 'ko_iteration'; which repeats orfs for rows from original table with multiple eggnog assignments. The ko_iteration increases from 1 to n for n eggnog assignmnet for a particular orf. 

In [10]:
split_into_multiple <- function(column, pattern = ",", into_prefix){
    #adapted from post on stack overflow
    cols <- str_split_fixed(column, pattern, n = Inf)
  # Replace empty matrix indicies with NA's 
    cols[which(cols == "")] <- NA
  # turn matrix into a table with unique but arbitraty column names  
    cols <- as_tibble(cols, .name_repair = make.names)
  # where m = # columns in tibble 'cols'
    m <- dim(cols)[2]
  # assign column names as 'into_prefix_1', 'into_prefix_2', ..., 'into_prefix_m' 
    names(cols) <- paste(into_prefix, 1:m, sep = "_")
    print('# of values in matrix w/o NA:')
    print(sum(!is.na(cols)))
    return(cols)
}



clean_ko <- function(df, org){
  # split up ko_id's into multiple columns naming each column 
  # ko:_1 to ko:_n 
  # (remember at this point we have the same number of rows but 
  # far more columns)
    ko_iterations <- split_into_multiple(df$ko_id, ",", "ko:")
  # select the orfs column from original df and bind to to 
  # split columns 
    df = df %>% select(orfs) %>% bind_cols(ko_iterations)
  # now combine all ko:_n columns so all ko_id's 
  # become one column called ko_id and each column name, 
  # 'ko:_1'...'ko:_n' becomes one column called 'ko_iteration'. 
  # This tells us how many ko_id's were assigned to a particular orf
  # the column 'orfs' will repeat values for rows with > 1 ko_iteration
  # Drop the values from the matrix which were NA
    df_clean = pivot_longer(df, cols = !orfs, values_drop_na = T,
                            values_to = 'ko_id', names_to = 'ko_iteration')
  # finally, clean up the df a bit by removing the ko_id prefixes 'ko:'
  # we now have a 3 column table with orfs repeated in for each instance 
  # a ko_id was assigned to it by eggnog!
    df_clean$ko_id <- gsub('ko:', '', df_clean$ko_id)
  # the final df should have same number of rows as sum(!na(cols))
    print('# rows in final df:')
    print(nrow(df_clean))
    write.csv(df_clean, paste('../kegg_names/ko',org, '_ls.csv', sep=''), row.names=F)
    df_clean
    }

ko4 <- clean_ko(ko4_ls,'4')
ko6 <- clean_ko(ko6_ls,'6')
ko8 <- clean_ko(ko8_ls,'8')
ko13 <- clean_ko(ko13_ls,'13')

[1] "# of values in matrix w/o NA:"
[1] 14281
[1] "# rows in final df:"
[1] 14281
[1] "# of values in matrix w/o NA:"
[1] 35251
[1] "# rows in final df:"
[1] 35251
[1] "# of values in matrix w/o NA:"
[1] 12765
[1] "# rows in final df:"
[1] 12765
[1] "# of values in matrix w/o NA:"
[1] 30911
[1] "# rows in final df:"
[1] 30911


Create a list of all ko's found across all organisms. We will use the list to find the name and symbol of the ko_id's and map it back to the organism-specific tables made above.  

In [11]:
#create a df of all unique ko's found for all annotations
all.ko <- bind_rows(ko4, ko6,ko8,ko13) %>% select(ko_id)

all.ko <- distinct(all.ko, ko_id, .keep_all = T)
#so that bash will read my file correctly, I need to start with ko= 
#and surround all ko_id's by a single quote
kk=c('ko= ')
q=c('"')
all.ko=rbind(kk, q, all.ko, q)
#write to a .txt file with ko_id's separated by a space

write.table(all.ko, "../kegg_names/all.ko.txt", quote=F, sep=" ", eol=" ", row.names = FALSE, col.names = FALSE)

# a total of 7218 rows, or ko_id's 

In [12]:
#read in table with name and symbol matches to kegg ko's
#make tidy
ko_def <- read.delim("../kegg_names/ko_pathways.txt", head=F,sep = ";")
id <- str_extract(ko_def$V1,'K[[:digit:]]*')
sym <- str_remove(ko_def$V1, 'ko:K[[:digit:]]*')
ko_def <- data.frame(ko_id = id, symbol=sym, name=ko_def$V2)
isip=data.frame(ko_id=c('isip1a','isip_2a','isip_3'), 
                symbol=c('isip_1a','isip_2a','isip_3'),
                name=c('Iron stress induced protein 1a',
                       'Iron stress induced protein 2a', 'Iron stress induced protein 3'))
ko_def=bind_rows(ko_def, isip)
print(paste(nrow(ko_def), "    =   all unique ko's found across organisms"))
write.csv(ko_def,'../kegg_names/ko_def.csv', row.names=F)
tail(ko_def)

[1] "7191     =   all unique ko's found across organisms"


Unnamed: 0_level_0,ko_id,symbol,name
Unnamed: 0_level_1,<chr>,<chr>,<chr>
7186,K11287,TSPYL4,TSPY-like 4
7187,K11288,TSPYL5,TSPY-like 5
7188,K11289,TSPYL6,TSPY-like 6
7189,isip1a,isip_1a,Iron stress induced protein 1a
7190,isip_2a,isip_2a,Iron stress induced protein 2a
7191,isip_3,isip_3,Iron stress induced protein 3


# Now we have tidy df of query and annotation for mapping back to counts. We need to add the ISIP annotations though. 
## Read in isip isoform annotations and make a tidy df of all isip isoforms for each organism
Check that isip's were also counted by Salmon, remove any which were not

In [13]:
## read in the files
dir <- "/work/nclab/lucy/SAB/Annotation/eggnog/isip/pep_hit/"

read.isip <- function(org, prot){
    f = t(read.delim(file=paste(dir, org, 'isip', prot,'_hits.txt', sep=""),
                 sep = ' ',header = F))
    if (nrow(f) >=1){
    f <- data.frame("orfs"=f, 
                    "ko_iteration"="ko:_1",
                    "ko_id"=paste("isip_", prot, sep=""))}
}

i41 <- read.isip("04", "1")
i41a <- read.isip("04", "1a")
i41b <- read.isip("04", "1b")
i42a <- read.isip("04", "2a")
i43 <- read.isip("04", "3")
isip4 <- rbind(i41a, i41b, i42a, i43)
isip4 <- distinct(isip4, orfs, .keep_all = T)

i81 <- read.isip("08", "1")
i81a <- read.isip("08", "1a")
i81b <- read.isip("08", "1b")
i82a <- read.isip("08", "2a")
i83 <- read.isip("08", "3")
isip8 <- rbind( i81a, i81b, i82a, i83)  
isip8 <- distinct(isip8, orfs, .keep_all = T)


i61 <- read.isip("06", "1")
i61a <- read.isip("06", "1a")
i61b <- read.isip("06", "1b")
i62a <- read.isip("06", "2a")
i63 <- read.isip("06", "3")
isip6 <- rbind(i61a, i61b, i62a, i63)
isip6 <- distinct(isip6, orfs, .keep_all = T)

i132a <- read.isip("13", "2a")
i133 <- read.isip("13", "3")
isip13 <- rbind(i132a, i133)
isip13 <- distinct(isip13, orfs, .keep_all = T)

## Create df with ko_id and names mapped to ORF's of each organism
add the isip proteins to ko dataframe, checking not to repeat if the orf was already annotated. (should I replace annotation with isip?)

1. Merge full ko definition list with ko's in organism using many-to-one merge relationship.  Organism's ko df have repeated ko_id, as the same annotation could be matched to multiple ORFs. So many rows in organisms's df may match to one row in full ko definition df. The column ko_id is named the same for both objects and will be used to merge the two dataframes. For this we use dplyr's left-join with relationship=many-to-one. 

2. Isip df and ko df have column query in common. We create a list of ORF's that the isip df and ko df have in common. isip rows are added to ko df for those ORFs not in common.

In [14]:

combine_save <- function(isip, ko_df, org){
    
    ko_fin <- bind_rows(ko_df, isip)
 
    write.csv(ko_fin, paste("../kegg_names/ko", org,"_ls.csv", sep=""), row.names=F)
    print(nrow(ko_fin))
    print(nrow(ko_df)+nrow(isip))
}


ko4_def <- combine_save(isip4, ko4, "4")
ko8_def <- combine_save(isip8, ko8, "8")
ko6_def <- combine_save(isip6, ko6, "6")
ko13_def <- combine_save(isip13, ko13, "13")

[1] 14312
[1] 14312
[1] 12787
[1] 12787
[1] 35262
[1] 35262
[1] 30920
[1] 30920


# Now we have a df with all possible ko annotations, including the isip proteins, associated with orf's 

## this will be now used in creating figures. 


In [None]:
emap.ko4 <- emap4[,c('orfs', 'KEGG_ko')]
ko4_def$name <- gsub(" \\[EC.*\\]", "", ko4_def$name)
ko.sum <- ko4_def %>% group_by(orfs)%>% summarize('symbol'=str_flatten(symbol, collaps="; "), 'name'= str_flatten(name, collaps="; "))
ko.na <- data.frame('orfs'=emap.ko4[(emap.ko4$orfs %in% ko.sum$orfs)==FALSE, 'orfs'], 'symbol'=NA, 'name'=NA)
full.anno <- rbind(ko.sum, ko.na )
nrow(emap4)
nrow(full.anno)

In [None]:
#read in table with name and symbol matches to kegg ko's
#make tidy
br_def <- read.csv("../kegg_names/brite_def.csv", header = FALSE)
head(br_def)
id <- str_extract(br_def$V1,'K[[:digit:]]*')
sym <- str_remove(br_def$V1, 'ko:K[[:digit:]]*')
br_def <- data.frame(br_id = id, symbol=sym, name=br_def$V2)

#write.csv(ko_def,"../kegg_names/br_def.csv", row.names=F)

print(paste(nrow(br_def), "     =   all unique BRITEs found across organisms"))
dim(br_def)
head(br_def)

In [None]:
clean_br <- function(df){
    df <- df %>% bind_cols(split_into_multiple(.$BRITE, ",", "br:")) %>%
           select("orfs", starts_with("br:_", ))
paste(head(df))
    df_clean <- pivot_longer(df, cols = !orfs, names_to = 'br_iteration',
                         values_to ='br_id' , values_drop_na=TRUE)
        df_clean$br_id <- gsub('br:', '', df_clean$br_id)
    df_clean
    }

clean_mods <- function(df){
    df <- df %>% bind_cols(split_into_multiple(.$KEGG_Module, ",", "mo:")) %>%
           select("orfs", starts_with("mo:_", ))
paste(head(df))
    df_clean <- pivot_longer(df, cols = !orfs, names_to = 'mo_iteration',
                         values_to ='mo_id' , values_drop_na=TRUE)
        df_clean$mo_id <- gsub('mo:', '', df_clean$mo_id)
    df_clean
    }

clean_paths <- function(df){
    df <- df %>% bind_cols(split_into_multiple(.$KEGG_Pathway, ",", "path:")) %>%
           select("orfs", starts_with("path:_", ))
paste(head(df))
    df_clean <- pivot_longer(df, cols = !orfs, names_to = 'path_iteration',
                         values_to ='path_id' , values_drop_na=TRUE)
        df_clean$path_id <- gsub('path:', '', df_clean$path_id)
    df_clean
    }

br4 <- clean_br(paths4)
br8 <- clean_br(paths8)
br6 <- clean_br(paths6)
br13 <- clean_br(paths13)

all.br <- bind_rows(br4, br6, br8, br13) %>% select(br_id)
all.br <- distinct(all.br, br_id)
#write to a .txt file with path_id's separated by a space 
 write.table(paste("brite=", paste(all.br$br_id, sep=" ", collapse = " ")), "all.brite.txt", row.names = FALSE, col.names = FALSE)


mo4 <- clean_mods(paths4)
mo8 <- clean_mods(paths8)
mo6 <- clean_mods(paths6)
mo13 <- clean_mods(paths13)

all.mo <- bind_rows(mo4, mo6, mo8, mo13) %>% select(mo_id)
all.mo <- distinct(all.mo, mo_id)
#write to a .txt file with path_id's separated by a space 
 write.table(paste("module=", paste(all.mo$mo_id, sep=" ", collapse = " ")), "all.modules.txt", row.names = FALSE, col.names = FALSE)


pa4 <- clean_paths(paths4)
pa8 <- clean_paths(paths8)
pa6 <- clean_paths(paths6)
pa13 <- clean_paths(paths13)

all.path <- bind_rows(pa4, pa6,pa8,pa13) %>% select(path_id)
all.path <- distinct(all.path, path_id)
#write to a .txt file with path_id's separated by a space 
 write.table(paste("pathway=", paste(all.path$path_id, sep=" ", collapse = " ")), "all.path.txt", row.names = FALSE, col.names = FALSE)


# Using Kegg api for Kegg annotations 
Imported csv files of modules, paths, and individual ko's, which are not associated with a path or module, have associated subcategories and broad categories which I am interested in. To get the module or path name, all of the ko's and thier names and symbols from the module or path code, we loop through each module/path code and using Kegg api, extract this information. Because pathways and modules are associated with one annother, we will keep these tables seperate
## After running each loop, we should have a dataframe for each original file with a columns:
- ko_id
- name
- symbol
- pathway/module
- sub_category
- broad_category
## Extract all ko's from tables which appear in at least one sample
We will use the resulting tables for subsetting data later in making heat maps. We can create heat maps based on pathway, module, or category. 

In [15]:
mod_man=read.csv('../kegg_names//subcategories_ko.csv')[,1:3]
head(mod_man)

Unnamed: 0_level_0,ko_id,sub_category,broad_category
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,K11959,Urea transporter,Nitrogen Metabolism
2,K08907,Light harvesting complex I,Photosynthesis
3,K08908,Light harvesting complex I,Photosynthesis
4,K08909,Light harvesting complex I,Photosynthesis
5,K08910,Light harvesting complex I,Photosynthesis
6,K08911,Light harvesting complex I,Photosynthesis


In [16]:
#1. for manually made modules ko's
## find name and symbol for each
## add name and symbol to dataframe
mod_man=read.csv('../kegg_names//subcategories_ko.csv')[,1:3]
dim(mod_man)
url=c("https://rest.kegg.jp/")
symbol=list()
ko_id=list()
name=list()

for (i in mod_man$ko_id){
    #m=c(keggFind('ko',i))
    m = getURL(paste(url,'find/ko/', i, sep=''))
    m = str_remove(m, 'ko:K[[:digit:]]{5}\t')
    ko_id=c(ko_id,i)
    symbol=c(symbol, str_remove(str_extract(string = m, pattern = '.+.*;'),';'))
    name=c(name,str_remove(str_extract(string = m, pattern = '; [[:alnum:]]+.*'),'; '))
    }

mod_ko=data.frame('ko_id'=unlist(ko_id),
                  'symbol'= unlist(symbol),
                  'name'= str_remove(unlist(name),'\n'))
mod_man=left_join(mod_man,mod_ko, by='ko_id')
mod_man=mutate(mod_man, Path=NA, path_name=mod_man$sub_category)
head(mod_man)
tail(mod_man)

Unnamed: 0_level_0,ko_id,sub_category,broad_category,symbol,name,Path,path_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>
1,K11959,Urea transporter,Nitrogen Metabolism,urtA,urea transport system substrate-binding protein,,Urea transporter
2,K08907,Light harvesting complex I,Photosynthesis,LHCA1,light-harvesting complex I chlorophyll a/b binding protein 1,,Light harvesting complex I
3,K08908,Light harvesting complex I,Photosynthesis,LHCA2,light-harvesting complex I chlorophyll a/b binding protein 2,,Light harvesting complex I
4,K08909,Light harvesting complex I,Photosynthesis,LHCA3,light-harvesting complex I chlorophyll a/b binding protein 3,,Light harvesting complex I
5,K08910,Light harvesting complex I,Photosynthesis,LHCA4,light-harvesting complex I chlorophyll a/b binding protein 4,,Light harvesting complex I
6,K08911,Light harvesting complex I,Photosynthesis,LHCA5,light-harvesting complex I chlorophyll a/b binding protein 5,,Light harvesting complex I


Unnamed: 0_level_0,ko_id,sub_category,broad_category,symbol,name,Path,path_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>
21,K26552,Nirate/Nitrite transport,Nitrogen Metabolism,"NPF2, GTR, NRT1","MFS transporter, POT/PTR family, nitrate/chloride/glucosinolate transporter",,Nirate/Nitrite transport
22,K26553,Nirate/Nitrite transport,Nitrogen Metabolism,"NPF3, NITR, NRT1","MFS transporter, POT/PTR family, nitrite transporter",,Nirate/Nitrite transport
23,K26548,Nirate/Nitrite transport,Nitrogen Metabolism,"NPF4, NRT1","MFS transporter, POT/PTR family, nitrate transporter",,Nirate/Nitrite transport
24,K26562,Nirate/Nitrite transport,Nitrogen Metabolism,"NPF5, NRT1, NAET","MFS transporter, POT/PTR family, peptide/nitrate/nicotianamine transporter",,Nirate/Nitrite transport
25,K26549,Nirate/Nitrite transport,Nitrogen Metabolism,"NPF6, NRT1","MFS transporter, POT/PTR family, nitrate transporter",,Nirate/Nitrite transport
26,K26554,Nirate/Nitrite transport,Nitrogen Metabolism,"NPF7, NRT1","MFS transporter, POT/PTR family, low-affinity nitrate transporter",,Nirate/Nitrite transport


In [4]:
#2. Kegg modules
## Get name for each module
## add module name to dataframe
## get list of all ko's for each module
## get ko name and symbol for each module
sub_module=read.csv("../kegg_names/subcategories_module.csv")
nrow(sub_module)
url=c("https://rest.kegg.jp/")

find.mod=list()
for (i in sub_module$Module){
    #p=keggFind('module', i)
    p=getURL(paste(url,'find/module/', i, sep=''))
    find.mod=unname(c(find.mod,str_remove(p, ", .*")))
    #module=unname(c(module,str_remove(p, ", .*"))) #str_extract(p, '[[:alpha:]]+.*'))
}

module_name=data.frame('module_name'=str_remove(find.mod,'md:M[[:digit:]]{5}\t'))
sub_mod=bind_cols(sub_module, module_name)
colnames(sub_mod)[1]='Module'

p=list()
for (i in sub_mod$Module){
    link.ko=getURL(paste(url,'link/ko/', i, sep=''))
    p=c(p,link.ko)
    #p=c(p,keggLink('ko', i))
}

p=data.frame(ko_id=unlist(str_extract_all(p,'ko:K[[:digit:]]+')), 
               module=(unlist(str_extract_all(p, 'M[[:digit:]]{5}'))))

symbol=list()
name=list()
for (i in p$ko_id){
     g=getURL(paste(url,'find/ko/', i, sep=''))
   # g=keggFind('ko', i)
    symbol=c(symbol,str_remove(str_extract(unname(g), '\t.+;'), ';'))
    name=c(name, str_remove(str_extract(unname(g),'; .*'), '; '))
}

mod_ko=data.frame('Module'=p$module,
                  'ko_id'=p$ko_id,
                  'symbol'=str_remove(unlist(symbol),'\t'),
                  'name'=unlist(name))
sub_mod=left_join(mod_ko, sub_mod, by='Module')
head(sub_mod)

Unnamed: 0_level_0,Module,ko_id,symbol,name,sub_category,broad_category,module_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,M00161,ko:K02703,psbA,photosystem II P680 reaction center D1 protein [EC:1.10.3.9],Photosystem II,photosynthesis,Photosystem II
2,M00161,ko:K02706,psbD,photosystem II P680 reaction center D2 protein [EC:1.10.3.9],Photosystem II,photosynthesis,Photosystem II
3,M00161,ko:K02705,psbC,photosystem II CP43 chlorophyll apoprotein,Photosystem II,photosynthesis,Photosystem II
4,M00161,ko:K02704,psbB,photosystem II CP47 chlorophyll apoprotein,Photosystem II,photosynthesis,Photosystem II
5,M00161,ko:K02707,psbE,photosystem II cytochrome b559 subunit alpha,Photosystem II,photosynthesis,Photosystem II
6,M00161,ko:K02708,psbF,photosystem II cytochrome b559 subunit beta,Photosystem II,photosynthesis,Photosystem II


## Pulling out pathways used for heatmaps in paper to make loop faster

In [17]:
sub_cat_path=read.csv('../kegg_names/subcategories_path.csv')

heatMapPath = filter(sub_cat_path, sub_category %in% c(
    'Carbon fixation in photosynthetic organisms', 'Photosynthesis', 
    'Photosynthesis-antenna proteins', 'Nitrogen metabolism', 'Carotenoid biosynthesis'))
heatMapPath

Pathway,sub_category,broad_category
<chr>,<chr>,<chr>
map00710,Carbon fixation in photosynthetic organisms,Carbon Metabolism
map00195,Photosynthesis,Photosynthesis
map00196,Photosynthesis-antenna proteins,Photosynthesis
map00910,Nitrogen metabolism,Nitrogen Metabolism
map00906,Carotenoid biosynthesis,Photosynthesis


In [33]:
#3. pathways
## get name for each pathway
## add pathway name to dataframe
## get list of all ko's for each pathway
## get name and symbol for each ko


url=c("https://rest.kegg.jp/")
find.path=list()
for (i in heatMapPath$Pathway){
    p=getURL(paste(url,'find/pathway/', i, sep=''))
    find.path=unname(c(find.path,p))
}

path_name=data.frame('path_name'=str_remove(find.path,'path:map[[:digit:]]{5}\t'))
path_name$path_name=str_remove(path_name$path_name, '\n')
head(path_name)
sub_path=bind_cols(heatMapPath, path_name)
colnames(sub_path)[1]='Path'

p=list()
for (i in sub_path$Path){
    link.ko=getURL(paste(url,'link/ko/', i, sep=''))
    #link.ko=str_extract(p, 'ko:K[[:digit:]]+')
    p=c(p,link.ko)
}

p=data.frame(ko_id=unlist(str_extract_all(p,'K[[:digit:]]+')), 
               path=(unlist(str_extract_all(p, 'map[[:digit:]]{5}')))) 
symbol=list()
name=list()
for (i in p$ko_id){
    g=getURL(paste(url,'find/ko/', i, sep=''))
    symbol=c(symbol,str_remove(str_extract(unname(g), '\t.+;'), ';'))
    name=c(name, str_remove(str_extract(unname(g),'; .*'), '; '))
}
path_ko=data.frame('Path'=p$path,
                   'ko_id'=p$ko_id,
                   'symbol'=str_remove(unlist(symbol),'\t'),
                   'name'= str_remove(unlist(name), '\\[EC:.*\\]'))

sub_path=left_join(path_ko, sub_path, by='Path', relationship='many-to-many')

head(sub_path)
dim(sub_path)

## add fld genes 
flavodoxin=data.frame(Path='map00195', ko_id=c('K03839','K03840','K21567','K00528'), 
                      symbol=c('fldA, nifF, isiB','fldB','fnr','fpr'), 
                      name=c('flavodoxin I','flavodoxin II','ferredoxin/flavodoxin---NADP+ reductase',
                            'ferredoxin/flavodoxin---NADP+ reductase'), sub_category='Photosynthesis',
                     broad_category='photosynthesis', path_name='Photosynthesis')
sub_path=bind_rows(sub_path, flavodoxin) 

tail(sub_path)
dim(sub_path)
filter(sub_path, path_name=='Photosynthesis')

Unnamed: 0_level_0,path_name
Unnamed: 0_level_1,<chr>
1,Carbon fixation in photosynthetic organisms
2,Photosynthesis
3,Photosynthesis - antenna proteins
4,Nitrogen metabolism
5,Carotenoid biosynthesis


Unnamed: 0_level_0,Path,ko_id,symbol,name,sub_category,broad_category,path_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,map00710,K00024,mdh,malate dehydrogenase,Carbon fixation in photosynthetic organisms,Carbon Metabolism,Carbon fixation in photosynthetic organisms
2,map00710,K00025,MDH1,malate dehydrogenase,Carbon fixation in photosynthetic organisms,Carbon Metabolism,Carbon fixation in photosynthetic organisms
3,map00710,K00026,MDH2,malate dehydrogenase,Carbon fixation in photosynthetic organisms,Carbon Metabolism,Carbon fixation in photosynthetic organisms
4,map00710,K00028,E1.1.1.39,malate dehydrogenase (decarboxylating),Carbon fixation in photosynthetic organisms,Carbon Metabolism,Carbon fixation in photosynthetic organisms
5,map00710,K00029,maeB,malate dehydrogenase (oxaloacetate-decarboxylating)(NADP+),Carbon fixation in photosynthetic organisms,Carbon Metabolism,Carbon fixation in photosynthetic organisms
6,map00710,K00051,E1.1.1.82,malate dehydrogenase (NADP+),Carbon fixation in photosynthetic organisms,Carbon Metabolism,Carbon fixation in photosynthetic organisms


Unnamed: 0_level_0,Path,ko_id,symbol,name,sub_category,broad_category,path_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
263,map00906,K25073,cruO,1'-hydroxy-gamma-carotene C-4' ketolase,Carotenoid biosynthesis,Photosynthesis,Carotenoid biosynthesis
264,map00906,K25074,crtU,carotenoid chi-ring synthase,Carotenoid biosynthesis,Photosynthesis,Carotenoid biosynthesis
265,map00195,K03839,"fldA, nifF, isiB",flavodoxin I,Photosynthesis,photosynthesis,Photosynthesis
266,map00195,K03840,fldB,flavodoxin II,Photosynthesis,photosynthesis,Photosynthesis
267,map00195,K21567,fnr,ferredoxin/flavodoxin---NADP+ reductase,Photosynthesis,photosynthesis,Photosynthesis
268,map00195,K00528,fpr,ferredoxin/flavodoxin---NADP+ reductase,Photosynthesis,photosynthesis,Photosynthesis


Path,ko_id,symbol,name,sub_category,broad_category,path_name
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
map00195,K02108,"ATPF0A, atpB",F-type H+-transporting ATPase subunit a,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02109,"ATPF0B, atpF",F-type H+-transporting ATPase subunit b,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02110,"ATPF0C, atpE",F-type H+-transporting ATPase subunit c,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02111,"ATPF1A, atpA",F-type H+/Na+-transporting ATPase subunit alpha,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02112,"ATPF1B, atpD",F-type H+/Na+-transporting ATPase subunit beta,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02113,"ATPF1D, atpH",F-type H+-transporting ATPase subunit delta,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02114,"ATPF1E, atpC",F-type H+-transporting ATPase subunit epsilon,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02115,"ATPF1G, atpG",F-type H+-transporting ATPase subunit gamma,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02634,petA,apocytochrome f,Photosynthesis,Photosynthesis,Photosynthesis
map00195,K02635,petB,cytochrome b6,Photosynthesis,Photosynthesis,Photosynthesis


In [127]:
sub_path$path_name[sub_path$path_name=='Photosynthesis - antenna proteins']='Photosynthesis'

sub_path[str_detect(sub_path$symbol, "DUF"), 'sub_category'] =  "NADH dehydrogenase"
sub_path[str_detect(sub_path$symbol, "COX|cox|CYC"), 'sub_category'] = "cytochrome c"
sub_path[str_detect(sub_path$symbol, "ATP.*V"),'sub_category'] = "V-Type ATP-ase"
sub_path[str_detect(sub_path$symbol, "ATP.*F"), 'sub_category'] = "F-Type ATP-ase"

heat.path=(filter(sub_path, (ko_id %in% ko_def$ko_id)==T))

heat.path$name= str_replace_all(heat.path$name, c('light-harvesting complex I '='LHCA ','light-harvesting complex II'='LHCB',  
                                 'chlorophyll'='Chl','photosystem I '='PSI ', 'photosystem II'='PSII',
                                 'F-type .* subunit '='F-Type ATP-ase ', 'isip_'='Iron starvation induced protein ',
                                 'fructose-bisphosphate aldolase'='FBA', 'ribulose-bisphosphate carboxylase'='RuBisCO',
                                               'MFS transporter, NNP family, '=''))

urea_cycle = filter(ko_def, ko_id%in%c('K00611','K01940','K01755','K01476'))
urea_cycle = mutate(urea_cycle, 'sub_category'='Urea cycle', 'broad_category'='Nitrogen metabolism','path_name'='Nitrogen metabolism', 'Path'='map00910')
heat.path = full_join(heat.path, urea_cycle)

[1m[22mJoining with `by = join_by(Path, ko_id, symbol, name, sub_category, broad_category, path_name)`


In [128]:
heat.path[str_detect(heat.path$symbol, 'MDH1'), 'name'] = 'malate dehydrogenase 1'
heat.path[str_detect(heat.path$symbol, 'MDH2'), 'name'] = 'malate dehydrogenase 2'
heat.path[str_detect(heat.path$symbol, 'maeB|ppdK'), 'sub_category'] = 'CAM light'
heat.path[str_detect(heat.path$symbol, 'MDH1|MDH2|mdh|ppc'), 'sub_category'] = 'CAM dark'
heat.path[heat.path$ko_id %in% 
          c('K00855','K00927','K01100','K01601','K01623','K01624','K01783',
            'K01803','K01807','K01808','K02446','K03841','K11532','K00134',
            'K00615'), 'sub_category'] = 'Calvin cycle' 
heat.path$sub_category = str_replace(heat.path$sub_category, 'Carbon fixation in photosynthetic organisms', 'C4 Dicarboxilic acid cycle')
heat.path$name = str_remove(heat.path$name, '.(phosphorylating).')
filter(heat.path, path_name=='Carbon fixation in photosynthetic organisms')


Path,ko_id,symbol,name,sub_category,broad_category,path_name
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
map00710,K00024,mdh,malate dehydrogenase,CAM dark,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00025,MDH1,malate dehydrogenase 1,CAM dark,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00026,MDH2,malate dehydrogenase 2,CAM dark,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00028,E1.1.1.39,malate dehydrogenase (decarboxylating),C4 Dicarboxilic acid cycle,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00029,maeB,malate dehydrogenase (oxaloacetate-decarboxylating)(NADP+),CAM light,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00134,"GAPDH, gapA",glyceraldehyde 3-phosphate dehydrogenase,Calvin cycle,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00615,"E2.2.1.1, tktA, tktB",transketolase,Calvin cycle,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00814,"GPT, ALT",alanine transaminase,C4 Dicarboxilic acid cycle,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00855,"PRK, prkB",phosphoribulokinase,Calvin cycle,Carbon Metabolism,Carbon fixation in photosynthetic organisms
map00710,K00927,"PGK, pgk",phosphoglycerate kinase,Calvin cycle,Carbon Metabolism,Carbon fixation in photosynthetic organisms


In [129]:
heat.path[str_detect(heat.path$name, 'carbonic anhydrase'), 'sub_category'] = 'Carbonic anhydrase'
heat.path[str_detect(heat.path$name, 'nitrate/nitrite transport'), 'sub_category'] = 'Nitrogen transporters'
heat.path[str_detect(heat.path$name, 'glutamate|glutamine'), 'sub_category'] = 'GS/GOGAT and GDH'
heat.path[str_detect(heat.path$name, 'nitrite reductase'), 'sub_category'] = 'Nitrite reductase'
heat.path[str_detect(heat.path$name, 'nitrate reductase'), 'sub_category'] = 'Nitrite reductase'
heat.path[str_detect(heat.path$symbol, 'CPS1'), 'sub_category'] = 'Urea cycle'
heat.path$name = str_remove(heat.path$name, '\\[.*\\]')
filter(heat.path, path_name=='Nitrogen metabolism')


Path,ko_id,symbol,name,sub_category,broad_category,path_name
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
map00910,K00261,"GLUD1_2, gdhA",glutamate dehydrogenase (NAD(P)+),GS/GOGAT and GDH,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00262,"E1.4.1.4, gdhA",glutamate dehydrogenase (NADP+),GS/GOGAT and GDH,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00264,GLT1,glutamate synthase (NADH),GS/GOGAT and GDH,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00265,gltB,glutamate synthase (NADPH) large chain,GS/GOGAT and GDH,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00266,gltD,glutamate synthase (NADPH) small chain,GS/GOGAT and GDH,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00284,"GLU, gltS",glutamate synthase (ferredoxin),GS/GOGAT and GDH,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00362,nirB,nitrite reductase (NADH) large subunit,Nitrite reductase,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00366,nirA,ferredoxin-nitrite reductase,Nitrite reductase,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00368,nirK,nitrite reductase (NO-forming),Nitrite reductase,Nitrogen Metabolism,Nitrogen metabolism
map00910,K00372,"nasC, nasA",assimilatory nitrate reductase catalytic subunit,Nitrite reductase,Nitrogen Metabolism,Nitrogen metabolism


In [130]:
heat.path[str_detect(heat.path$symbol, 'psa'), 'sub_category'] = 'PSI'
heat.path[str_detect(heat.path$symbol, 'psb'), 'sub_category'] = 'PSII'
heat.path[str_detect(heat.path$symbol, 'LHCA'), 'sub_category'] = 'LHCA'
heat.path[str_detect(heat.path$symbol, 'LHCB'), 'sub_category'] = 'LHCB'
heat.path$sub_category = str_replace(heat.path$sub_category, 'Photosynthesis', 'Electron transport chain')
heat.path[str_detect(heat.path$sub_category, 'F-Type ATP-ase'), 'path_name'] = 'F-Type ATP-ase'

filter(heat.path, path_name=='Photosynthesis')

Path,ko_id,symbol,name,sub_category,broad_category,path_name
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
map00195,K02634,petA,apocytochrome f,Electron transport chain,Photosynthesis,Photosynthesis
map00195,K02636,petC,cytochrome b6-f complex iron-sulfur subunit,Electron transport chain,Photosynthesis,Photosynthesis
map00195,K02639,petF,ferredoxin,Electron transport chain,Photosynthesis,Photosynthesis
map00195,K02641,petH,ferredoxin--NADP+ reductase,Electron transport chain,Photosynthesis,Photosynthesis
map00195,K02689,psaA,psa P700 Chl a apoprotein A1,PSI,Photosynthesis,Photosynthesis
map00195,K02690,psaB,psa P700 Chl a apoprotein A2,PSI,Photosynthesis,Photosynthesis
map00195,K02692,psaD,psa subunit II,PSI,Photosynthesis,Photosynthesis
map00195,K02693,psaE,psa subunit IV,PSI,Photosynthesis,Photosynthesis
map00195,K02698,psaK,psa subunit X,PSI,Photosynthesis,Photosynthesis
map00195,K02703,psbA,psb P680 reaction center D1 protein,PSII,Photosynthesis,Photosynthesis


In [131]:
heat.path[str_detect(heat.path$name, 'violaxanthin|zeaxanthin'), 'path_name'] = 'Photosynthesis'
heat.path[str_detect(heat.path$name, 'violaxanthin|zeaxanthin'), 'sub_category'] = 'Xanthophyll cycle'

filter(heat.path, path_name=='Photosynthesis')

Path,ko_id,symbol,name,sub_category,broad_category,path_name
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
map00195,K02634,petA,apocytochrome f,Electron transport chain,Photosynthesis,Photosynthesis
map00195,K02636,petC,cytochrome b6-f complex iron-sulfur subunit,Electron transport chain,Photosynthesis,Photosynthesis
map00195,K02639,petF,ferredoxin,Electron transport chain,Photosynthesis,Photosynthesis
map00195,K02641,petH,ferredoxin--NADP+ reductase,Electron transport chain,Photosynthesis,Photosynthesis
map00195,K02689,psaA,psa P700 Chl a apoprotein A1,PSI,Photosynthesis,Photosynthesis
map00195,K02690,psaB,psa P700 Chl a apoprotein A2,PSI,Photosynthesis,Photosynthesis
map00195,K02692,psaD,psa subunit II,PSI,Photosynthesis,Photosynthesis
map00195,K02693,psaE,psa subunit IV,PSI,Photosynthesis,Photosynthesis
map00195,K02698,psaK,psa subunit X,PSI,Photosynthesis,Photosynthesis
map00195,K02703,psbA,psb P680 reaction center D1 protein,PSII,Photosynthesis,Photosynthesis


In [132]:
unique(heat.path$path_name)
write.csv(heat.path, '../kegg_names/pathwaysHeatMap.csv', row.names=F)

In [115]:
all(heat.path$ko_id %in% ko_def$ko_id)
dim(heat.path)

## Combine kegg module and manual module tables 

### Now I have mod/path code and name, sub and broad categorie and ko_id, name and symbol for each
using the ko_id find matches in my data, some ko's might be repeated in different categories so make sure they are repeated in each category

## in the end I want a df with links of ko's in my data, name, symbol, path/module code and name, subcategory, broad category
i can use this when refering to data for plotting
## Make sure to check that genes i am interested in like lhp and ferredoxin actually showed up! check differences in length between original list and those in data.

In [8]:
ko4=read.csv('../kegg_names/ko4_ls.csv')
ko8=read.csv('../kegg_names/ko8_ls.csv')
ko6=read.csv('../kegg_names/ko6_ls.csv')
ko13=read.csv('../kegg_names/ko13_ls.csv')
head(ko13)

Unnamed: 0_level_0,orfs,ko_iteration,ko_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,NODE_10000_length_2085_cov_180.263917_g1941_i2.p2,ko:_1,K19364
2,NODE_10000_length_2085_cov_180.263917_g1941_i2.p3,ko:_1,K19364
3,NODE_10002_length_2085_cov_50.039761_g2374_i1.p2,ko:_1,K01142
4,NODE_10002_length_2085_cov_50.039761_g2374_i1.p2,ko:_2,K01930
5,NODE_10002_length_2085_cov_50.039761_g2374_i1.p2,ko:_3,K10771
6,NODE_10005_length_2085_cov_13.522366_g594_i1.p1,ko:_1,K00850


In [116]:
a=unique(ko4$ko_id)
a=a[a%in%heat.path$ko_id]

b=unique(ko8$ko_id)
b=b[b%in%heat.path$ko_id]

c=unique(ko6$ko_id)
c=c[c%in%heat.path$ko_id]

d=unique(ko13$ko_id)
d=d[d%in%heat.path$ko_id]


length(b)
length(c)
length(d)
length(a)
x=union(a,b)
y=union(c,d)
z=union(x,y)

length(x)
length(y)
length(z)
st=heat.path[(heat.path$ko_id)%in%z,]
nrow(st)
length(unique(heat.path$ko_id))

In [7]:
mod.path=function(df,ko){
    inner_join(x = ko, y=df,by = 'ko_id', relationship='many-to-many')
    }

path4=mod.path(sub_path, ko4)
path8=mod.path(sub_path, ko8)
path6=mod.path(sub_path, ko6)
path13=mod.path(sub_path, ko13)

write.csv(path4,'../kegg_names/path4.csv',row.names=FALSE)
write.csv(path8,'../kegg_names/path8.csv',row.names=FALSE)
write.csv(path6,'../kegg_names/path6.csv',row.names=FALSE)
write.csv(path13,'../kegg_names/path13.csv',row.names=FALSE)

sub_path=bind_rows(path4,path8,path6,path13)
sub_path=sub_path[,-c(1:2)] %>% distinct()

write.csv(all.path,'../kegg_names/all.path.csv', row.names=F)

In [None]:
mod4=mod.path(sub_mod_all, ko4)
mod8=mod.path(sub_mod_all, ko8)
mod6=mod.path(sub_mod_all, ko6)
mod13=mod.path(sub_mod_all, ko13)

write.csv(mod4,'../kegg_names/mod4.csv',row.names=FALSE)
write.csv(mod8,'../kegg_names/mod8.csv',row.names=FALSE)
write.csv(mod6,'../kegg_names/mod6.csv',row.names=FALSE)
write.csv(mod13,'../kegg_names/mod13.csv',row.names=FALSE)

#now rewriting sub_cat table so only genes found in my organisms show up
all.mod=bind_rows(mod4,mod8,mod6,mod13)
all.mod=all.mod[,-c(1:2)] %>% distinct()

write.csv(all.mod,'../kegg_names/all.mod.csv', row.names=F)

----
# Old way of extracting pathways of interest

In [5]:
## List of pathways of interest with ko-id's associated. 
## find orfs and counts within pathways and plot on heatmap
## add name of pathway in df by joinging pathways dataframes

select.paths <- data.frame(Name=c('Nitrogen Metabolism', 
                                'Photosynthesis', 
                                'Oxidative Phosphorylation', 
                                'Photosynthesis-antenna Proteins', 
                                'Carbon Fixation',
                               'Methan Metabolism', 
                                'Sulfur Metabolism',
                                'Citrate Cycle', 
                                'Glycolysis', 
                                'ABC Transport', 
                                'Phosphotransferase System'),
                      path_id=c('map00910', 'map00195', 'map00190','map00196', 'map00710',
                              'map00680', 'map00920', 'map00020', 'map00010', 'map02010', 'map02060'))
#keggLink('ko', 'map00910')

code <- list(select.paths$path_id)
head(code)
for (i in code){
   #print(keggList(i))
    pathways <- as.matrix(keggLink('ko', i))
}
pathways <- data.frame('ko_id'=pathways) %>% rownames_to_column('path_id')  
pathways$path_id <- gsub('path.', '', pathways$path_id)
pathways$ko_id <- gsub('ko:','', pathways$ko_id)
pathways$path_id <- str_remove(pathways$path_id, '\\.[[:digit:]]+')
pathways <- pathways %>% group_by(path_id)

select.pathways <- left_join(pathways, select.paths, by='path_id')
write.csv(select.pathways, '../kegg_names/select.pathways.csv', row.names=FALSE)