In [1]:
# Import modules
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Set variables
plantTablePath = Path("../LOTUS/lotus_NP_synonyms.tsv")
endogenousTablePath = Path("./pre_hmdb_endogenous.tsv")
plantTableOutputPath = Path("../LOTUS/natural_product_list.tsv")

In [3]:
# Read table containing plant compounds
plantTable = pd.read_csv(plantTablePath, sep="\t", header=0)

# Read table containing endogenous compounds
endogenousTable = pd.read_csv(endogenousTablePath, sep="\t", header=0)

# Show Tables
print(plantTable, "\n", endogenousTable)

                                                     Name          ID
0                                            Dasytrichone  LTS0000005
1                                            CHEMBL191819  LTS0000005
2                                             151655-69-5  LTS0000005
3                                          desmotumotin B  LTS0000005
4                                                  C10035  LTS0000005
...                                                   ...         ...
892539  1,8a-dimethyl-4-[(2-methylbut-2-enoyl)oxy]-6-o...  LTS0276514
892540  (1s,2s,4r,7e,9s,10r,11r)-9-hydroxy-4,8-dimethy...  LTS0276515
892541  (3ar,4s,6ar,7r,8s,9r,9as,9bs)-7,8-dihydroxy-3,...  LTS0276516
892542  (4as,6as,6br,8as,10s,12ar,12bs,14bs)-10-{[(2s,...  LTS0276517
892543  (1r,4as,5s,8ar)-5-[(3z)-4-carboxy-3-methylbut-...  LTS0276518

[892544 rows x 2 columns] 
                                                   Name      HMDB_ID
0                              ​17-methyloctadecanoate  HMDB003

In [4]:
# Convert plantTable into numpy matrix
    # Remove \u200b unicode character (empty character)
plantMatrix = plantTable.to_numpy()
plantMatrix[:, 0] = [name.replace('\u200b', '').lower().strip() for name in plantMatrix[:, 0]]


# Get numpy array with endogenous metabolites
    # Remove \u200b unicode character (empty character)
endogenousArray = np.array([name.replace('\u200b', '').lower().strip() for name in endogenousTable.loc[:, 'Name'].to_list()])


#### Save both tables: Testing one (plant) and Endogenous. This is to make removal in C++

In [5]:
pd.DataFrame(plantMatrix).to_csv("testing_table.tsv", sep="\t", index=False, header=False)
pd.DataFrame(endogenousArray).to_csv("endogenous_list.tsv", sep="\t", index=False, header=False)

#### Make processing (removal) in Notebook

In [None]:
# Get ID of plant compounds contained in endogenous array
plantEndogenousIDArray = plantMatrix[np.isin(plantMatrix[:, 0], endogenousArray), 1]

In [19]:
# Get ID without repeats
plantEndogenousIDArrayUniq = np.array(list(set(plantEndogenousIDArray)))

In [47]:
# Obtain plantTable with rows not contained in endogenousTable
plantTableFiltered = plantTable.loc[~np.isin(plantTable.loc[:, 'PlantCyc_ID'].to_numpy(), plantEndogenousIDArrayUniq), :]

In [52]:
plantTableFiltered

Unnamed: 0,Name,PlantCyc_ID
0,4'-demethyldeoxypodophyllotoxin,CPD-18756
1,3590-93-0,CPD-18756
2,4'-demethyldesoxypodophyllotoxin,CPD-18756
3,a 80198,CPD-18756
4,chebi:1729,CPD-18756
...,...,...
52554,violdelphin,CPD-16561
52555,wighteone,CPD-6644
52556,wogonin,CPD-12727
52558,zeinoxanthin,CPD-5661


In [51]:
# Convert compound names to lower case
plantTableFiltered["Name"] = plantTableFiltered["Name"].str.lower()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plantTableFiltered["Name"] = plantTableFiltered["Name"].str.lower()


In [53]:
# Write output table
plantTableFiltered.to_csv(plantTableOutputPath, sep="\t", index=False)