In [None]:
from IPython.display import display
import pandas as pd
import os

try:
    import CBRdb
except ImportError:
    import sys
    import os
    sys.path.append(os.path.abspath('../'))
    import CBRdb

# Stage switches for this cell. Each flag gates one optional step further down;
# all four are off by default, so the download and manual-add steps are skipped
# unless a flag is flipped to True here.
download_data = False
manually_add_compounds = False
preprocess_reactions = False
preprocess_compounds = False

# download data
if download_data:
    # One download call per target, issued in this order.
    for _target in ("R", "C_full", "C"):
        CBRdb.download_data(target=_target)

# manually add compounds without mol files
if manually_add_compounds:
    CBRdb.compounds_manual_add()

# Reaction tables: reuse the cached CSV archives unless a fresh preprocessing
# run was requested via `preprocess_reactions`.
if not preprocess_reactions:
    # Cached path: the atlas table is read with its first column as the index.
    kegg_data_R = pd.read_csv('../data/kegg_data_R.csv.zip', header=0)
    atlas_data_R = pd.read_csv('../data/atlas_data_R.csv.zip', header=0, index_col=0)
else:
    # Rebuild path: regenerate both reaction tables through CBRdb.
    kegg_data_R = CBRdb.preprocess(target="R")
    atlas_data_R = CBRdb.clean_atlas(f_exclude_kegg=True)

# Compound tables: reuse the cached CSV archives unless a fresh preprocessing
# run was requested via `preprocess_compounds`.
if not preprocess_compounds:
    C_meta = pd.read_csv('../data/kegg_data_C_metadata.csv.zip', header=0)
    C_main = pd.read_csv('../data/kegg_data_C.csv.zip', header=0)
    # NOTE(review): this list is only bound on the cached-load path, and several
    # IDs appear more than once -- confirm both are intended before relying on it.
    x_group_compounds = [
        'C00462', 'C01322', 'C01706', 'C01365', 'C03122', 'C01872',
        'C00462', 'C01322', 'C01706', 'C01365', 'C03122', 'C01872',
        'C02103', 'C01812', 'C01813', 'C02103', 'C15564', 'C01812',
        'C01813',
    ]
else:
    # Rebuild path: CBRdb.preprocess returns the metadata and main tables as a pair.
    C_meta, C_main = CBRdb.preprocess(target="C")

def _remap_reaction_tokens(reactions, id_map):
    """Return *reactions* with every whitespace-separated token passed through *id_map*.

    Each reaction string is split on whitespace into one column per token,
    tokens that match a key of *id_map* are replaced by the mapped value, and
    the tokens are re-joined with single spaces. Rows shorter than the longest
    reaction are padded with empty strings before joining, so the result is
    stripped to remove the padding.

    Args:
        reactions: pandas Series of reaction strings.
        id_map: mapping accepted by ``DataFrame.replace`` (old token -> new token).

    Returns:
        pandas Series of remapped reaction strings, aligned with *reactions*.
    """
    tokens = reactions.str.split(expand=True).replace(id_map).fillna('')
    return tokens.apply(' '.join, axis=1).str.strip()


# identify and remove shortcuts and other suspect reactions
# NOTE(review): this rebinds kegg_data_R, discarding whatever was loaded into
# that name earlier in the cell; presumably remove_suspect_reactions() reads its
# own input from disk -- confirm.
kegg_data_R = CBRdb.remove_suspect_reactions()
# identify duplicate compounds
dupemap = CBRdb.merge_data_sets.dedupe_compound_files(data_folder='../data/')
# replace duplicate compounds with their dupe-group identifier
kegg_data_R['reaction'] = _remap_reaction_tokens(kegg_data_R['reaction'], dupemap)
atlas_data_R['reaction'] = _remap_reaction_tokens(atlas_data_R['reaction'], dupemap)
kegg_data_R.to_csv('../data/kegg_data_R.csv.zip', compression='zip', encoding='utf-8', index=False)
# The atlas table is loaded with index_col=0, so its first CSV column lives in
# the index. Writing the index back keeps the file round-trippable; dropping it
# would lose one more leading column on every load/save cycle.
atlas_data_R.to_csv('../data/atlas_data_R.csv.zip', compression='zip', encoding='utf-8', index=True)

# Show the first three rows of each table as a quick sanity check.
for _table in (kegg_data_R, atlas_data_R, C_meta, C_main):
    display(_table.head(3))


Next step: dedupe reactions.