# Main compound analysis after the compound discovery step

Requirements: 
- Exported compounds database in Excel format
- Exported ChemSpider results database in Excel format


In [1]:
import os

import pandas as pd
from chemspi_local_db import ChemspiLocalDB
from chemspi_web_db import ChemspiWebDB

# Input: folder holding the exported tables, the two input workbooks and the
# workbook every later step writes its results to.
folder = "D:/SharedLAN/plasma_positive_tables/"
compounds_database_file = folder + "positive_plasma_melanoma_triplicate_S7.xlsx"
chemspider_results_db_file = folder + "positive_plasma_melanoma_triplicate_S7_CSID.xlsx"

export_file = folder + "positive_plasma_melanoma_triplicate_S7_removed_duplicates_lower.xlsx"

# ChemSpider web API key. Set the CHEMSPIDER_API_KEY environment variable to
# supply it without editing the notebook; an unset or empty variable falls back
# to the literal below so existing setups keep working.
# NOTE(review): the fallback literal is a credential committed to source --
# it should be rotated and removed once the environment variable is in use.
web_api_key = os.environ.get("CHEMSPIDER_API_KEY") or "6aAKBdgMGALeExc41btfo6GC48wQSSvH"


# init resources and databases
compounds_table = pd.read_excel(compounds_database_file)
chempider_local_db = ChemspiLocalDB(chemspider_results_db_file)
chempider_web_db = ChemspiWebDB(web_api_key)

# remove not needed columns; errors='ignore' keeps this cell working for
# exports that do not contain a 'Checked' column instead of raising KeyError
compounds_table.drop(columns=['Checked'], inplace=True, errors='ignore')


print("Sucess reading databases")
#display(compounds_table)

Sucess reading databases


## Step 1: Removing duplicated entries
- Sort by Name
- Sort by area
- Sort by retention time
- From each group of duplicated names, keep one entry: highest area first, then retention time

In [2]:
import compound_analisis_utils as utils

# Replace the table with its de-duplicated version. The selection rule lives
# in the project helper (see the Step 1 notes for the intended behaviour).
compounds_table = utils.remove_duplicated_entries(compounds_table)
print("Done")

# Persist the intermediate result to the export workbook.
compounds_table.to_excel(excel_writer=export_file)
# To inspect the table inline, run: display(compounds_table)

Done


## Step 2: Find the CSID from the ChemSpider databases
- Try the local database first
- Otherwise, search online

In [None]:
# Build one CSID entry per row of compounds_table. Each entry is:
#   - the CSID of the local-database hit when its lower-cased name equals the
#     lower-cased compound name,
#   - otherwise the single CSID returned by the web search,
#   - or a ";"-terminated string of every CSID when the web search returns several,
#   - or 'ND' when the web search returns nothing.
csid_values = []
for row_index, row_data in compounds_table.iterrows():
    name_lower = row_data.Name.lower()
    mass = row_data['Molecular Weight']
    local_hit = chempider_local_db.find_compound_by_name(name_lower)

    # Accept the local result only on an exact (case-insensitive) name match.
    if local_hit is not None and local_hit.Name.lower() == name_lower:
        csid_values.append(local_hit.CSID)
        continue

    print("Compound(", name_lower, ") not found in local database, trying online...")
    web_csids, web_warnings = chempider_web_db.find_compounds_by_name_mass(name_lower, mass)
    if len(web_warnings) > 0:
        print("Warnings:", web_warnings)

    hit_count = len(web_csids)
    if hit_count == 0:
        csid_values.append('ND')
    elif hit_count == 1:
        csid_values.append(web_csids[0])
    else:
        # Keep every candidate; each CSID is followed by ";" (trailing one included).
        joined_csids = "".join(str(candidate) + ";" for candidate in web_csids)
        print("Found csids: ", joined_csids)
        csid_values.append(joined_csids)

# Put the new column first, show the table and save it to the export workbook.
compounds_table.insert(0, 'CSID', csid_values)
display(compounds_table)
compounds_table.to_excel(export_file)

## Step 3: Find external references via the ChemSpider web search

In [None]:
# For every compound, query the ChemSpider web service for references in the
# listed external databases and add one column per database to the table.
external_databases = ['Human Metabolome Database', 'KEGG', 'LipidMAPS']

# Column data: database name -> one ";"-terminated string per table row.
external_db_info = {db: [] for db in external_databases}

for row_index, row_data in compounds_table.iterrows():
    # search for external references from known databases
    print("Finding external references for ", row_data.Name)
    refs_for_row = {db: [] for db in external_databases}

    # A row may carry several candidate CSIDs; pool the references of all of them.
    for csid in utils.parse_generated_CSID(row_data.CSID):
        print("CSID: ", csid)
        external_refs, ref_warnings = chempider_web_db.find_external_references(csid, external_databases)
        display(external_refs)

        if len(ref_warnings) > 0:
            display(ref_warnings)

        for db, found in external_refs.items():
            refs_for_row[db].extend(found)

    # Flatten each database's references into "a;b;" form (empty string when none).
    for db, found in refs_for_row.items():
        external_db_info[db].append("".join(str(ref) + ";" for ref in found))

# Insert the new columns starting at position 1, in dictionary order.
for insert_pos, (db_name, column_values) in enumerate(external_db_info.items(), start=1):
    compounds_table.insert(insert_pos, db_name, column_values)

display(compounds_table)
compounds_table.to_excel(export_file)
print("Done!")