In [1]:
# Import modules
import pandas as pd
import numpy as np
from pathlib import Path

In [26]:
# Set variables
plantTablePath = Path("./plant_pre3_database.tsv")
endogenousTablePath = Path("./pre_hmdb_endogenous.tsv")
plantTableOutputPath = Path("./plant_database.tsv")

In [3]:
# Read table containing plant compounds
plantTable = pd.read_csv(plantTablePath, sep="\t", header=0)

# Read table containing endogenous compounds
endogenousTable = pd.read_csv(endogenousTablePath, sep="\t", header=0)

# Show Tables
print(plantTable, "\n", endogenousTable)

                                   Name PlantCyc_ID
0       4'-Demethyldeoxypodophyllotoxin   CPD-18756
1                             3590-93-0   CPD-18756
2      4'-Demethyldesoxypodophyllotoxin   CPD-18756
3                               A 80198   CPD-18756
4                            CHEBI:1729   CPD-18756
...                                 ...         ...
52556                           wogonin   CPD-12727
52557                       xanthohumol    CPD-7119
52558                      zeinoxanthin    CPD-5661
52559                         zerumbone   CPD-11421
52560                       zymosterone    CPD-4581

[52561 rows x 2 columns] 
                                                   Name      HMDB_ID
0                              ​17-methyloctadecanoate  HMDB0037397
1                          ​17-Methyloctadecanoic acid  HMDB0037397
2            ​2,3,4,5-tetrahydro-2-pyridinecarboxylate  HMDB0012130
3        ​2,3,4,5-Tetrahydro-2-pyridinecarboxylic acid  HMDB0012130
4       

In [4]:
# Convert plantTable into numpy matrix
    # Remove \u200b unicode character (empty character)
plantMatrix = plantTable.to_numpy()
plantMatrix[:, 0] = [name.replace('\u200b', '') for name in plantMatrix[:, 0]]


# Get numpy array with endogenous metabolites
    # Remove \u200b unicode character (empty character)
endogenousArray = np.array([name.replace('\u200b', '') for name in endogenousTable.loc[:, 'Name'].to_list()])

In [6]:
# Get ID of plant compounds contained in endogenous array
plantEndogenousIDArray = plantMatrix[np.isin(plantMatrix[:, 0], endogenousArray), 1]

In [19]:
# Get ID without repeats
plantEndogenousIDArrayUniq = np.array(list(set(plantEndogenousIDArray)))

In [47]:
# Obtain plantTable with rows not contained in endogenousTable
plantTableFiltered = plantTable.loc[~np.isin(plantTable.loc[:, 'PlantCyc_ID'].to_numpy(), plantEndogenousIDArrayUniq), :]

In [52]:
plantTableFiltered

Unnamed: 0,Name,PlantCyc_ID
0,4'-demethyldeoxypodophyllotoxin,CPD-18756
1,3590-93-0,CPD-18756
2,4'-demethyldesoxypodophyllotoxin,CPD-18756
3,a 80198,CPD-18756
4,chebi:1729,CPD-18756
...,...,...
52554,violdelphin,CPD-16561
52555,wighteone,CPD-6644
52556,wogonin,CPD-12727
52558,zeinoxanthin,CPD-5661


In [51]:
# Convert compound names to lower case
plantTableFiltered["Name"] = plantTableFiltered["Name"].str.lower()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plantTableFiltered["Name"] = plantTableFiltered["Name"].str.lower()


In [53]:
# Write output table
plantTableFiltered.to_csv(plantTableOutputPath, sep="\t", index=False)