In [1]:
import sys
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter('ignore')

In [2]:
# Add the relative directory ../../oats to the module search path so that the
# `oats` imports below can be resolved (presumably a local checkout of the
# package -- confirm the layout if this notebook is moved).
sys.path.append("../../oats")
from oats.biology.groupings import Groupings
# NOTE(review): save_to_pickle is not referenced by any cell visible in this notebook.
from oats.utils.utils import save_to_pickle
from oats.nlp.preprocess import replace_delimiter, concatenate_with_delim

Paths to input files from databases or papers. Some others are specified below in the dictionaries as well.

In [3]:
# Both cleaned supplemental tables live in the same directory: Table S1 is read
# for the ID-to-name hierarchy and Table S2 for the gene-to-group mappings.
lloyd_meinke_cleaned_dir = "../papers/lloyd_meinke_2012/versions_cleaned_by_me/"
lloyd_meinke_cleaned_supplemental_table_path_hierarchy = lloyd_meinke_cleaned_dir + "192393Table_S1_Final.csv"
lloyd_meinke_cleaned_supplemental_table_path_mappings = lloyd_meinke_cleaned_dir + "192393Table_S2_Final_Revised.csv"

Paths to the csv files that are created for each type of data for grouping genes.

In [4]:
# One output CSV per grouping source, all written to the same directory.
grouping_output_dir = "../reshaped_data/"
lloyd_meinke_subsets_output_path = grouping_output_dir + "lloyd_meinke_subsets.csv"
lloyd_meinke_classes_output_path = grouping_output_dir + "lloyd_meinke_classes.csv"
kegg_pathways_output_path = grouping_output_dir + "kegg_pathways.csv"
plantcyc_pathways_output_path = grouping_output_dir + "plantcyc_pathways.csv"

Paths to the csv files that are created to specify the mappings between IDs and names for each group.

In [5]:
# One "<source>_name_map.csv" file per grouping source, holding the group ID to
# group name table for that source.
name_mapping_dir = "../reshaped_data/"
name_mapping_suffix = "_name_map.csv"
lloyd_meinke_subsets_name_mapping_path = name_mapping_dir + "lloyd_meinke_subsets" + name_mapping_suffix
lloyd_meinke_classes_name_mapping_path = name_mapping_dir + "lloyd_meinke_classes" + name_mapping_suffix
kegg_pathways_name_mapping_path = name_mapping_dir + "kegg_pathways" + name_mapping_suffix
plantcyc_pathways_name_mapping_path = name_mapping_dir + "plantcyc_pathways" + name_mapping_suffix

PlantCyc

Mapping between species codes and files downloaded from PlantCyc.

In [6]:
# Species code paired with the file-name prefix of its PlantCyc download.
# The order of the pairs fixes the key order of the dictionary built below.
plantcyc_file_prefixes = (
    ("ath", "aracyc"),
    ("zma", "corncyc"),
    ("mtr", "mtruncatulacyc"),
    ("osa", "oryzacyc"),
    ("gmx", "soycyc"),
    ("sly", "tomatocyc"),
)
# Every file shares this suffix (presumably the date of the PlantCyc download -- TODO confirm).
plantcyc_file_suffix = "_pathways.20180702"
plantcyc_paths_dictionary = {
    species_code: "../databases/plantcyc/" + file_prefix + plantcyc_file_suffix
    for species_code, file_prefix in plantcyc_file_prefixes
}

Create and save the pathways object using PlantCyc.

In [7]:
# Get the PlantCyc dataframe from Groupings for the per-species files and save it as CSV.
plantcyc_df = Groupings.get_dataframe_for_plantcyc(paths=plantcyc_paths_dictionary)
plantcyc_df.to_csv(plantcyc_pathways_output_path, index=False)

# Pathway ID -> pathway name lookup. If an ID appears on several rows, the name
# from the last of those rows is the one that is kept.
plantcyc_name_mapping = {}
for pathway_row in plantcyc_df.itertuples():
    plantcyc_name_mapping[pathway_row.pathway_id] = pathway_row.pathway_name

# Save the lookup as a two-column table.
plantcyc_name_table = pd.DataFrame(plantcyc_name_mapping.items(), columns=["group_id","group_name"])
plantcyc_name_table.to_csv(plantcyc_pathways_name_mapping_path, index=False)

KEGG

Mapping between species codes and files saved using another script that uses the KEGG REST API.


kegg_paths_dictionary = {<br>
    "ath":"/Users/irbraun/plant-data/databases/kegg/ath_pathway_files_from_api",<br>
    "zma":"/Users/irbraun/plant-data/databases/kegg/zma_pathway_files_from_api",<br>
    "osa":"/Users/irbraun/plant-data/databases/kegg/osa_pathway_files_from_api",<br>
    "mtr":"/Users/irbraun/plant-data/databases/kegg/mtr_pathway_files_from_api",<br>
    "gmx":"/Users/irbraun/plant-data/databases/kegg/gmx_pathway_files_from_api",<br>
    "sly":"/Users/irbraun/plant-data/databases/kegg/sly_pathway_files_from_api",<br>
    "hsa":"/Users/irbraun/plant-data/databases/kegg/hsa_pathway_files_from_api",<br>
}


In [8]:
# Directory that holds one KEGG pathway folder per species. The trailing slash is
# required: the per-species paths are built from it by plain string concatenation.
path_to_kegg = "../databases/kegg/"

In [9]:
# One folder per species code, each named "<code>_pathway_files_from_api" and
# located directly under path_to_kegg.
ath, zma, osa, mtr, gmx, sly, hsa = (
    str(path_to_kegg) + species_code + "_pathway_files_from_api"
    for species_code in ("ath", "zma", "osa", "mtr", "gmx", "sly", "hsa")
)

In [10]:
# Species code -> folder of KEGG pathway files for that species, in the same
# key order as before (ath, zma, osa, mtr, gmx, sly, hsa).
kegg_paths_dictionary = dict(zip(
    ("ath", "zma", "osa", "mtr", "gmx", "sly", "hsa"),
    map(str, (ath, zma, osa, mtr, gmx, sly, hsa)),
))

In [11]:
# Get the KEGG dataframe from Groupings for the per-species folders and save it as CSV.
kegg_df = Groupings.get_dataframe_for_kegg(paths=kegg_paths_dictionary)
kegg_df.to_csv(kegg_pathways_output_path, index=False)

# Pathway ID -> pathway name lookup. If an ID appears on several rows, the name
# from the last of those rows is the one that is kept.
kegg_name_mapping = {}
for pathway_row in kegg_df.itertuples():
    kegg_name_mapping[pathway_row.pathway_id] = pathway_row.pathway_name

# Save the lookup as a two-column table.
kegg_name_table = pd.DataFrame(kegg_name_mapping.items(), columns=["group_id","group_name"])
kegg_name_table.to_csv(kegg_pathways_name_mapping_path, index=False)

../databases/kegg/ath_pathway_files_from_api/path_ath00901.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00740.txt
../databases/kegg/ath_pathway_files_from_api/path_ath01100.txt
../databases/kegg/ath_pathway_files_from_api/path_ath04070.txt
../databases/kegg/ath_pathway_files_from_api/path_ath01210.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00450.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00906.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00270.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00908.txt
../databases/kegg/ath_pathway_files_from_api/path_ath01250.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00410.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00071.txt
../databases/kegg/ath_pathway_files_from_api/path_ath03040.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00941.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00590.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00

../databases/kegg/ath_pathway_files_from_api/path_ath00970.txt
../databases/kegg/ath_pathway_files_from_api/path_ath04122.txt
../databases/kegg/ath_pathway_files_from_api/path_ath03410.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00040.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00942.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00072.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00945.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00290.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00620.txt
../databases/kegg/ath_pathway_files_from_api/path_ath03450.txt
../databases/kegg/ath_pathway_files_from_api/path_ath00310.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00730.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00380.txt
../databases/kegg/zma_pathway_files_from_api/path_zma04075.txt
../databases/kegg/zma_pathway_files_from_api/path_zma02010.txt
../databases/kegg/zma_pathway_files_from_api/path_zma01

../databases/kegg/zma_pathway_files_from_api/path_zma00563.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00750.txt
../databases/kegg/zma_pathway_files_from_api/path_zma01110.txt
../databases/kegg/zma_pathway_files_from_api/path_zma04136.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00511.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00601.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00260.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00053.txt
../databases/kegg/zma_pathway_files_from_api/path_zma04120.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00909.txt
../databases/kegg/zma_pathway_files_from_api/path_zma03008.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00500.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00900.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00790.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00480.txt
../databases/kegg/zma_pathway_files_from_api/path_zma00

../databases/kegg/osa_pathway_files_from_api/path_osa00601.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00053.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00440.txt
../databases/kegg/osa_pathway_files_from_api/path_osa01200.txt
../databases/kegg/osa_pathway_files_from_api/path_osa03010.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00564.txt
../databases/kegg/osa_pathway_files_from_api/path_osa04144.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00563.txt
../databases/kegg/osa_pathway_files_from_api/path_osa01110.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00750.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00220.txt
../databases/kegg/osa_pathway_files_from_api/path_osa03022.txt
../databases/kegg/osa_pathway_files_from_api/path_osa01232.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00130.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00710.txt
../databases/kegg/osa_pathway_files_from_api/path_osa00

../databases/kegg/mtr_pathway_files_from_api/path_mtr00073.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00860.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00240.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00944.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00232.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr03030.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00460.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00770.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00592.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00543.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr04141.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00561.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr03015.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00100.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr04146.txt
../databases/kegg/mtr_pathway_files_from_api/path_mtr00

../databases/kegg/gmx_pathway_files_from_api/path_gmx00965.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx04130.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00052.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx03250.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx03018.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00510.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00562.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00910.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx04145.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00780.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00330.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00020.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00565.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00640.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx03430.txt
../databases/kegg/gmx_pathway_files_from_api/path_gmx00

../databases/kegg/sly_pathway_files_from_api/path_sly03420.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00996.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00360.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00940.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00591.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00998.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00650.txt
../databases/kegg/sly_pathway_files_from_api/path_sly01232.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00220.txt
../databases/kegg/sly_pathway_files_from_api/path_sly03022.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00130.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00710.txt
../databases/kegg/sly_pathway_files_from_api/path_sly03050.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00061.txt
../databases/kegg/sly_pathway_files_from_api/path_sly01240.txt
../databases/kegg/sly_pathway_files_from_api/path_sly00

../databases/kegg/hsa_pathway_files_from_api/path_hsa04024.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04217.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05321.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04913.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa00280.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa00630.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04961.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa03440.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa00471.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa00010.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04390.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04720.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05160.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05167.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04727.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04

../databases/kegg/hsa_pathway_files_from_api/path_hsa04923.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04130.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa00600.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa01040.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04068.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa00052.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa03250.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05235.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04014.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa00020.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04710.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05150.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04371.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04066.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04142.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05

../databases/kegg/hsa_pathway_files_from_api/path_hsa00360.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04740.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05100.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05217.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa03420.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04664.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04810.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05222.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa03008.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05135.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa00500.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04934.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05016.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04933.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa04120.txt
../databases/kegg/hsa_pathway_files_from_api/path_hsa05

Lloyd and Meinke et al., 2012

Some preprocessing of the supplemental file from the Lloyd and Meinke (2012) paper to extract the columns that are used.

In [None]:
def combine_columns(row, columns):
    """Pass the values of the given columns of one row to concatenate_with_delim, with "|" as the delimiter."""
    return concatenate_with_delim("|", [row[name] for name in columns])


# Load the cleaned mappings table and turn every missing cell into an empty string.
df = pd.read_csv(lloyd_meinke_cleaned_supplemental_table_path_mappings).fillna("")

# Switch the alias symbols from ";" to the "|" delimiter.
df["Alias Symbols"] = df["Alias Symbols"].apply(
    lambda aliases: replace_delimiter(text=aliases, old_delim=";", new_delim="|")
)

# Collect every column that names the gene into a single "gene_identifiers" column.
identifier_columns = ["Locus", "Gene Symbol", "Alias Symbols", "Full Gene Name"]
df["gene_identifiers"] = df.apply(lambda row: combine_columns(row, identifier_columns), axis=1)

Specific to classes (more general).

In [None]:
# Build the class-level grouping table as a new DataFrame rather than adding a
# column to `df[[...]]`. Writing into such a slice of `df` is the chained-assignment
# pattern that pandas reports as SettingWithCopyWarning (the warning is hidden in
# this notebook because all warnings are ignored globally).
df_class = pd.DataFrame({
    "species": "ath",  # the same species code is used for every row
    "group_ids": df["Phenotype Classb"],
    "gene_identifiers": df["gene_identifiers"],
})
df_class.to_csv(lloyd_meinke_classes_output_path, index=False)

Specific to subsets (more specific).

In [None]:
def _normalize_subset_ids(raw_ids):
    """Turn one raw "Phenotype Subsetsb" string into "|"-delimited subset IDs.

    The "W:" and "S:" markers are removed, parentheses and semicolons are turned
    into commas, and the comma-delimited result is then handed to
    replace_delimiter to switch the delimiter to "|".
    """
    comma_delimited = (
        raw_ids.replace("W:", "")
        .replace("S:", "")
        .replace("(", ",")
        .replace(")", ",")
        .replace(";", ",")
    )
    return replace_delimiter(text=comma_delimited, old_delim=",", new_delim="|")


# Build the subset-level grouping table as a new DataFrame rather than writing
# columns into `df[[...]]`. Writing into such a slice of `df` is the
# chained-assignment pattern that pandas reports as SettingWithCopyWarning (the
# warning is hidden in this notebook because all warnings are ignored globally).
df_subset = pd.DataFrame({
    "species": "ath",  # the same species code is used for every row
    "group_ids": df["Phenotype Subsetsb"].apply(_normalize_subset_ids),
    "gene_identifiers": df["gene_identifiers"],
})
df_subset.to_csv(lloyd_meinke_subsets_output_path, index=False)

Provide a mapping from subset or class IDs to the longer names that define them.

In [None]:
df = pd.read_csv(lloyd_meinke_cleaned_supplemental_table_path_hierarchy)

# itertuples() places the row index at position 0, so positions 3/4 and 5/7 of
# each tuple are the 3rd/4th and 5th/7th columns of the table. When an ID occurs
# on several rows, the name from the last of those rows is the one that is kept.
subset_id_to_name_dict = {}
class_id_to_name_dict = {}
for hierarchy_row in df.itertuples():
    subset_id_to_name_dict[hierarchy_row[5]] = hierarchy_row[7]
    class_id_to_name_dict[hierarchy_row[3]] = hierarchy_row[4]

# Save each lookup as a two-column table.
subset_name_table = pd.DataFrame(subset_id_to_name_dict.items(), columns=["group_id","group_name"])
subset_name_table.to_csv(lloyd_meinke_subsets_name_mapping_path, index=False)
class_name_table = pd.DataFrame(class_id_to_name_dict.items(), columns=["group_id","group_name"])
class_name_table.to_csv(lloyd_meinke_classes_name_mapping_path, index=False)

Briefly check whether Groupings objects can be built successfully from the created files.<br>
Create actual oats Groupings objects from the CSV files written above and take a quick look at their contents.

In [None]:
# Rebuild a Groupings object from each saved CSV, paired with its ID-to-name
# mapping, and print its description.
groupings_to_check = (
    (lloyd_meinke_subsets_output_path, subset_id_to_name_dict),
    (lloyd_meinke_classes_output_path, class_id_to_name_dict),
    (kegg_pathways_output_path, kegg_name_mapping),
    (plantcyc_pathways_output_path, plantcyc_name_mapping),
)
for groupings_csv_path, id_to_name in groupings_to_check:
    print(Groupings(path=groupings_csv_path, name_mapping=id_to_name).describe())