# Clean Goodcents dataset

In [179]:
import pyrfume
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

```
Info on goodcents dataset in Pyrfume

[raw]
"data_rw_opl.csv" = "Odor labels for molecules as originally compiled by John Leffingwell and cleaned by Sanchez-Lengeling et al"
"data_rw_odor.csv" = "Information about the dataset"
"data_rw_contents.csv" = "..."
LICENSE = "Licensing information and use restrictions according to the terms of The Goodscents Company"
"README.md" = "Additional information about this archive"

[parsed]
"opl.csv" = "Molecular structure data"

[processed]
"molecules.csv" = "Information about odorant molecules used"
"stimuli.csv" = "Mappings between PubChem IDs and GoodScents IDs, Concentration, and Solvent"
"behavior.csv" = "Odor descriptors for each molecule"
```

In [180]:
# Load goodcents datasets from pyrfume
molecules = pyrfume.load_data('goodscents/molecules.csv', remote=True)
behavior = pyrfume.load_data('goodscents/behavior.csv', remote=True)
stimuli = pyrfume.load_data('goodscents/stimuli.csv', remote=True)
raw_opl_data = pyrfume.load_data('goodscents/data_rw_opl.csv', remote=True)

# Load goodcents datasets locally
# molecules = pd.read_csv("./goodscents/molecules.csv", delimiter=',', index_col='CID')
# behavior = pd.read_csv("./goodscents/behavior.csv", delimiter=',', index_col='Stimulus')
# stimuli = pd.read_csv("./goodscents/stimuli.csv", delimiter=',', index_col='Stimulus')


In [181]:
"""
Required descriptors based on the preprint:

Brian K. Lee, Emily J. Mayhew, Benjamin Sanchez-Lengeling,
Jennifer N. Wei, Wesley W. Qian, Kelsie Little, Matthew Andres,
Britney B. Nguyen, Theresa Moloy, Jane K. Parker, Richard C. Gerkin,
Joel D. Mainland, Alexander B. Wiltschko

`A Principal Odor Map Unifies Diverse Tasks in Human Olfactory Perception preprint
<https://www.biorxiv.org/content/10.1101/2022.09.01.504602v4>`_.
"""

required_desc = [
'alcoholic', 'aldehydic', 'alliaceous', 'almond', 'amber', 'animal',
'anisic', 'apple', 'apricot', 'aromatic', 'balsamic', 'banana', 'beefy',
'bergamot', 'berry', 'bitter', 'black currant', 'brandy', 'burnt',
'buttery', 'cabbage', 'camphoreous', 'caramellic', 'cedar', 'celery',
'chamomile', 'cheesy', 'cherry', 'chocolate', 'cinnamon', 'citrus', 'clean',
'clove', 'cocoa', 'coconut', 'coffee', 'cognac', 'cooked', 'cooling',
'cortex', 'coumarinic', 'creamy', 'cucumber', 'dairy', 'dry', 'earthy',
'ethereal', 'fatty', 'fermented', 'fishy', 'floral', 'fresh', 'fruit skin',
'fruity', 'garlic', 'gassy', 'geranium', 'grape', 'grapefruit', 'grassy',
'green', 'hawthorn', 'hay', 'hazelnut', 'herbal', 'honey', 'hyacinth',
'jasmin', 'juicy', 'ketonic', 'lactonic', 'lavender', 'leafy', 'leathery',
'lemon', 'lily', 'malty', 'meaty', 'medicinal', 'melon', 'metallic',
'milky', 'mint', 'muguet', 'mushroom', 'musk', 'musty', 'natural', 'nutty',
'odorless', 'oily', 'onion', 'orange', 'orangeflower', 'orris', 'ozone',
'peach', 'pear', 'phenolic', 'pine', 'pineapple', 'plum', 'popcorn',
'potato', 'powdery', 'pungent', 'radish', 'raspberry', 'ripe', 'roasted',
'rose', 'rummy', 'sandalwood', 'savory', 'sharp', 'smoky', 'soapy',
'solvent', 'sour', 'spicy', 'strawberry', 'sulfurous', 'sweaty', 'sweet',
'tea', 'terpenic', 'tobacco', 'tomato', 'tropical', 'vanilla', 'vegetable',
'vetiver', 'violet', 'warm', 'waxy', 'weedy', 'winey', 'woody'
]

### Analysis of molecules.csv

In [182]:
molecules.head()

Unnamed: 0_level_0,MolecularWeight,IsomericSMILES,IUPACName,name
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,75.11,CC(CN)O,1-aminopropan-2-ol,1-aminopropan-2-ol
49,116.11,CC(C)C(=O)C(=O)O,3-methyl-2-oxobutanoic acid,3-methyl-2-oxobutanoic acid
51,146.1,C(CC(=O)O)C(=O)C(=O)O,2-oxopentanedioic acid,2-ketoglutaric acid
58,102.09,CCC(=O)C(=O)O,2-oxobutanoic acid,2-oxobutanoic acid
70,130.14,CC(C)CC(=O)C(=O)O,4-methyl-2-oxopentanoic acid,4-methyl-2-oxopentanoic acid


In [183]:
molecules.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4565 entries, 4 to 152743294
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MolecularWeight  4565 non-null   float64
 1   IsomericSMILES   4565 non-null   object 
 2   IUPACName        4559 non-null   object 
 3   name             4565 non-null   object 
dtypes: float64(1), object(3)
memory usage: 178.3+ KB


In [184]:
assert len(molecules['IsomericSMILES'].drop_duplicates()) == len(molecules)
# check for duplicates in molecules

In [185]:
from rdkit import Chem

def canonical_smiles(smiles):
    """
    Function return canonical smiles for a given smiles
    """
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), isomericSmiles = True)

In [186]:
total_molecules = len(molecules['IsomericSMILES'].value_counts())
total_molecules

4565

In [187]:
molecules['canonicalSMILES'] = molecules['IsomericSMILES'].progress_apply(lambda x: canonical_smiles(x))
assert len(molecules['canonicalSMILES'].value_counts()) == total_molecules

# all isomeric smiles are in canonical from
molecules = molecules.drop(columns=['canonicalSMILES'])

  0%|          | 0/4565 [00:00<?, ?it/s]

100%|██████████| 4565/4565 [00:00<00:00, 16200.81it/s]


### Analysis of behavior.csv

In [188]:
behavior.head()

Unnamed: 0_level_0,Descriptors
Stimulus,Unnamed: 1_level_1
100-06-1,sweet;vanilla;cherry maraschino cherry;powdery...
100-09-4,phenolic;animal;fecal;medicinal
100-42-5,sweet;plastic;floral;balsamic
100-51-6,sweet;floral;rose;fruity;phenolic;balsamic;che...
100-52-7,sweet;cherry;cherry maraschino cherry;nutty;fr...


In [189]:
behavior.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4626 entries, 100-06-1 to NF0825
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Descriptors  4622 non-null   object
dtypes: object(1)
memory usage: 72.3+ KB


In [190]:
# check for nans
behavior_nan = behavior[behavior.isna().any(axis=1)]
behavior_nan

Unnamed: 0_level_0,Descriptors
Stimulus,Unnamed: 1_level_1
1523-19-9,
53584-56-8,
6698-82-0,
78183-56-9,


In [191]:
# remove nans
behavior_clean = behavior.dropna()
behavior_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4622 entries, 100-06-1 to NF0825
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Descriptors  4622 non-null   object
dtypes: object(1)
memory usage: 72.2+ KB


### Analysis of stimuli.csv

In [192]:
stimuli.head()

Unnamed: 0_level_0,TGSC ID,CID,Concentration %,Solvent
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100-06-1,1000111,7476,100.0,
100-09-4,1031871,7478,10.0,dipropylene glycol
100-42-5,1009281,7501,0.1,triacetin
100-51-6,1001651,244,100.0,
100-52-7,1001491,240,10.0,dipropylene glycol


In [193]:
# check for duplicates based on CID column

stimuli['CID'].duplicated(keep=False).sum()

119

Analysis of duplicates

In [194]:
cid_behaviors = pd.merge(behavior_clean['Descriptors'], stimuli, how='inner', left_index=True, right_index=True)
cid_behaviors.head(3)

Unnamed: 0_level_0,Descriptors,TGSC ID,CID,Concentration %,Solvent
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100-06-1,sweet;vanilla;cherry maraschino cherry;powdery...,1000111,7476,100.0,
100-09-4,phenolic;animal;fecal;medicinal,1031871,7478,10.0,dipropylene glycol
100-42-5,sweet;plastic;floral;balsamic,1009281,7501,0.1,triacetin


In [195]:
dup = cid_behaviors[cid_behaviors['CID'].duplicated(keep=False)]
index_set = set()
index_set.update(dup['CID'].to_list())
for i in index_set:
    print(dup[dup['CID'] == i])
    print()

                                                 Descriptors  TGSC ID     CID  \
Stimulus                                                                        
2623-23-6  tea;earthy;mentholic;herbal;floral;fruity;wood...  1002031  220674   
89-48-5                   tea;mentholic;fruity;cooling;minty  1046271  220674   

           Concentration % Solvent  
Stimulus                            
2623-23-6            100.0     NaN  
89-48-5              100.0     NaN  

                               Descriptors  TGSC ID       CID  \
Stimulus                                                        
91069-39-5           creamy;peach;lactonic  1537621  18598146   
91069-40-8  creamy;spicy;cinnamon;lactonic  1537631  18598146   

            Concentration %             Solvent  
Stimulus                                         
91069-39-5              1.0  dipropylene glycol  
91069-40-8              1.0  dipropylene glycol  

                     Descriptors  TGSC ID     CID  Concentration

It can be observed that the duplicate CIDs have different 'Descriptors' in many cases. Upon further investigation, it was found that duplicates exist because new entries were made to the Goodcents website for same molecules few years later after inital entry without deleting the original entry for the molecules.

```
Examples:

    TGSC ID
    1002031    2021.0
    1046271       NaN
    1046271    1989.0
    Name: Source Year, dtype: float64
    TGSC ID
    1002031    2623-23-6
    1046271      89-48-5
    Name: CAS Number, dtype: object

    TGSC ID
    1537621   NaN
    1537631   NaN
    Name: Source Year, dtype: float64
    TGSC ID
    1537621    91069-39-5
    1537631    91069-40-8
    Name: CAS Number, dtype: object (#E-Z)

    TGSC ID
    1587131   NaN
    1593881   NaN
    Name: Source Year, dtype: float64
    TGSC ID
    1587131    137886-38-5
    1593881     22451-49-6
    1593881     22451-50-9
    Name: CAS Number, dtype: object (#more info on cis-trans one)

    TGSC ID
    1412121   NaN
    1011091   NaN
    Name: Source Year, dtype: float64
    TGSC ID
    1412121    29759-11-3
    1011091       93-92-5 (#-outdated link (older data))
    Name: CAS Number, dtype: object 

    TGSC ID
    1008311    1985.0
    1008311    1997.0
    1337381       NaN
    Name: Source Year, dtype: float64
    TGSC ID
    1008311    110-93-0
    1337381    409-02-9
    Name: CAS Number, dtype: object

    TGSC ID
    1010451    1987.0
    1010451    2000.0
    1007031       NaN
    Name: Source Year, dtype: float64
    TGSC ID
    1010451      141-25-3
    1007031      106-22-9
    1007031    26489-01-0
    Name: CAS Number, dtype: object
```

In some cases, the source year was explicitly mentioned, while in other cases its was not directly apparent. It was noticed that a lot of newer TGSC IDs had updated safety references with issue years as recent as 2023. These TGSC ID were assumed to be the latest entries. There were still few cases in which it was difficult to identify the most recent entries.

To solve this issue of duplication, a dictionary was created to filter out the newer entries. For entries with no clear distinction in source years, all duplicate entries were taken and descriptors were merged together per set of duplicates.

In [196]:
CAS_number_src_year = {'2623-23-6': 2021,
 '22451-49-6': None,
 '29759-11-3': None,
 '110-93-0': 1997,
 '141-25-3': 2000,
 '1321-89-7': 2017,
 '25773-40-4': None,
 '93905-03-4': None,
 '6728-26-3': 2007,
 '41199-19-3': None,
 '71832-76-3': None,
 '1334-82-3': 2020,
 '65505-16-0': 2015,
 '1118-39-4': 2015,
 '546-79-2': 2009,
 '99-83-2': 1991,
 '72881-27-7': 2021,
 '1365-19-1': 2016,
 '9003-73-0': 2016,
 '143-14-6': 2021,
 '13952-84-6': None,
 '513-49-5': None,
 '13828-37-0': 2017,
 '39872-57-6': None,
 '70266-48-7': None,
 '63450-30-6': 2011,
 '1335-86-0': None,
 '591-49-1': None,
 '139-33-3': None,
 '6381-92-6': None,
 '25524-95-2': 1997,
 '80417-97-6': 2018,
 '23696-85-7': 1991,
 '80722-28-7': None,
 '80957-74-0': None,
 '16423-19-1': None,
 '620-23-5': 2007,
 '491-07-6': 2016,
 '127-42-4': 1994,
 '2102-59-2': None,
 '67883-79-8': 2000,
 '68259-31-4': None,
 '93904-56-4': None,
 '8000-41-7': 2015,
 '5524-05-0': 2022,
 '30207-98-8': 2023,
 '1866-31-5': 2022,
 '1866-31-5 (N)': 2022,
 '1866-31-5 (Z)': 2022,
 '98-52-2': 2022,
 '23787-90-8': 1984,
 '89-78-1': 2023,
 '58985-18-5': 2015,
 '133-37-9': None,
 '526-83-0': None,
 '106-21-8': 2023,
 '32210-23-4': 2022,
 '85508-08-3': None,
 '58625-95-9': None,
 '18368-91-7': 2023,
 '13092-66-5': None,
 '2882-20-4': 2023,
 '4180-23-8': 2023,
 '2396-77-2': None,
 '2396-78-3': 2022,
 '91069-39-5': None,
 '67859-99-8': None,
 '111-60-4': None,
 '9004-99-3': None,
 '39638-67-0': None,
 '80041-01-6': None,
 '124071-42-7': None,
 '134769-33-8': None,
 '7757-87-1': None,
 '91069-40-8': None,
 '67874-69-5': None}

In [197]:
set_acceptable_tgsc_id_from_dups = set()
available_tgsc_ids = dup['TGSC ID'].to_list()
def get_acceptable_tgsc_id_from_dups(CAS_num):
    tgsc_id_list_raw = list(raw_opl_data[raw_opl_data['CAS Number']==CAS_num].index)
    tgsc_id_list = []
    for tgsc_id in tgsc_id_list_raw:
        if tgsc_id in available_tgsc_ids:
            tgsc_id_list.append(tgsc_id)
    set_acceptable_tgsc_id_from_dups.update(tgsc_id_list)

for CAS_num in CAS_number_src_year.keys():
    get_acceptable_tgsc_id_from_dups(CAS_num)

print("available_tgsc_ids: ", len(set(available_tgsc_ids)))
print("acceptable_tgsc_id_from_dups: ", len(set_acceptable_tgsc_id_from_dups))

set_tgsc_ids_to_remove = set.difference(set(available_tgsc_ids), set_acceptable_tgsc_id_from_dups)

print("tgsc_id_to_remove: ", len(set_tgsc_ids_to_remove))

available_tgsc_ids:  119
acceptable_tgsc_id_from_dups:  75
tgsc_id_to_remove:  44


In [198]:
# remove outdated tgsc ids
cid_behaviors = cid_behaviors.drop(cid_behaviors[cid_behaviors['TGSC ID'].isin(list(set_tgsc_ids_to_remove))].index)

In [199]:
# remaining duplicates
cid_behaviors[cid_behaviors['CID'].duplicated(keep=False)]

Unnamed: 0_level_0,Descriptors,TGSC ID,CID,Concentration %,Solvent
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
111-60-4,mild;waxy,1299941,24762,100.0,
124071-42-7,mandarin,1123891,6450538,1.0,dipropylene glycol
13092-66-5,odorless,1609991,24439,100.0,
133-37-9,caramellic;mild,1060791,875,100.0,
1335-86-0,citrus,1506631,11574,10.0,dipropylene glycol
134769-33-8,sweet;aldehydic;tangerine;fresh;watery;citrus,1030521,6450538,10.0,dipropylene glycol
139-33-3,odorless,1131691,8759,100.0,
13952-84-6,fishy;ammonia,1056631,24874,0.1,propylene glycol
1866-31-5N,spicy;plum;fruity;pineapple;apricot;cinnamyl;b...,1002311,15834,100.0,
1866-31-5Z,fruity,1117241,15834,100.0,


In [200]:
# merge descriptors from remaining duplicates
dup = cid_behaviors[cid_behaviors['CID'].duplicated(keep=False)]
index_set = set()
index_set.update(dup['CID'].to_list())
list_idx_of_duplicates = []
for i in index_set:
    # print(dup[dup['CID'] == i])
    list_idx_of_duplicates.append(list(dup[dup['CID'] == i].index))

for dup_set in list_idx_of_duplicates:
    desc_set = set()
    desc_string = ';'.join(cid_behaviors['Descriptors'][dup_set].to_list())
    for desc in desc_string.split(';'):
        if desc == 'odorless':
            continue
        desc_set.update([desc])
    if len(desc_set) == 0:
        desc_set.update(['odorless'])
    new_desc_string = ';'.join(desc_set)
    cid_behaviors['Descriptors'][dup_set] = new_desc_string

cid_behaviors = cid_behaviors.drop_duplicates('CID', keep='first')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cid_behaviors['Descriptors'][dup_set] = new_desc_string
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cid_behaviors['Descriptors'][dup_set] = new_desc_string
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cid_behaviors['Descriptors'][dup_set] = new_desc_string
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [201]:
assert len(cid_behaviors[cid_behaviors['CID'].duplicated(keep=False)]) == 0 # no duplicates remaining
cid_behaviors.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4560 entries, 100-06-1 to NF0825
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Descriptors      4560 non-null   object 
 1   TGSC ID          4560 non-null   int64  
 2   CID              4560 non-null   int64  
 3   Concentration %  4080 non-null   float64
 4   Solvent          1825 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 213.8+ KB


Now we merge `cid_behaviors` and `molecules` to get new dataframe

In [202]:
cid_behaviors_mol = pd.merge(molecules['IsomericSMILES'], cid_behaviors[['Descriptors', 'CID']],how='inner', on='CID')
cid_behaviors_mol.head()

Unnamed: 0,CID,IsomericSMILES,Descriptors
0,4,CC(CN)O,fishy
1,49,CC(C)C(=O)C(=O)O,fruity
2,51,C(CC(=O)O)C(=O)C(=O)O,odorless
3,58,CCC(=O)C(=O)O,sweet;caramellic;creamy;brown;lactonic
4,70,CC(C)CC(=O)C(=O)O,fruity


### Analysis of cid_behaviors_mol

In [203]:
cid_behaviors_mol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4560 entries, 0 to 4559
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CID             4560 non-null   int64 
 1   IsomericSMILES  4560 non-null   object
 2   Descriptors     4560 non-null   object
dtypes: int64(1), object(2)
memory usage: 107.0+ KB


In [204]:
# get list of odors currently in cid_behaviors_mol
def get_odors(desc, odor_list):
    odors = desc.split(';')
    for i in odors:
        if i not in odor_list:
            odor_list.append(i)

odor_list = []
cid_behaviors_mol['Descriptors'].apply(lambda x: get_odors(x, odor_list))
odor_list.sort()

print("no of odor descriptors: ", len(odor_list))

odors_df = pd.DataFrame(odor_list, columns=['desc'])
odors_df

no of odor descriptors:  663


Unnamed: 0,desc
0,absinthe
1,absolute
2,acacia
3,acetaldehyde
4,acetic
...,...
658,worty
659,yeasty
660,ylang
661,yogurt


### Clean odor descriptors

Steps of cleaning:
1. merge descriptors containing required descriptor words
    ```
    Example:
    'almond': ['almond',
      'almond bitter almond',
      'almond roasted almond',
      'almond toasted almond']
    ```
2. merge descriptors containing stems of the required descriptor words
    ```
    Example:
    'balsamic': ['balsam', 'balsamic', 'tolu balsam']
    ```
3. merge descriptors containing root prefix of the required descriptor words (in this case `required_desc[:-2]` or `required_desc[:-3]`)
    ```
    Example:
    'milky': ['milk', 'milky']
    'floral': ['floral','flower','flowers','flowery']
    ```

In [205]:
# code for step1:

# def desc_analyser():
#     for desc in required_desc:
#         odor_set = odors_df[odors_df['desc'].str.match(f'.* {desc}.*|.*{desc} .*|{desc}')]
#         if len(odor_set) > 1:
#             yield(desc, odor_set)
# gen = desc_analyser()

# merger_dict = {}
# for out in desc_analyser():
#     merger_dict[out[0]] = list(out[1]['desc'])
# merger_dict

In [206]:
# code for step2:

# import nltk
# nltk.download('snowball_data')
# from nltk.stem import SnowballStemmer

# # Initialize the SnowballStemmer for English
# stemmer = SnowballStemmer('english')

# def desc_stem_analyser():
#     for desc in required_desc:
#         root = stemmer.stem(desc)
#         odor_set = odors_df[odors_df['desc'].str.match(f'.*{root}.*')]
#         if len(odor_set) > 1:
#             yield(desc, odor_set)
# gen = desc_stem_analyser()

# merger_dict_v2 = {}
# for out in desc_stem_analyser():
#     merger_dict_v2[out[0]] = list(out[1]['desc'])
# merger_dict_v2

In [207]:
# code for step3:

# def desc_suffix_analyser():
#     for desc in required_desc:
#         root = desc[:-2]
#         odor_set = odors_df[odors_df['desc'].str.match(f'.*{root}.*')]
#         if len(odor_set) > 1:
#             yield(desc, odor_set)
# gen = desc_suffix_analyser()

# to_be_checked = []
# for out in desc_suffix_analyser():
#     # manual selection
#     choice = input(out)
#     if choice == '1':
#         to_be_checked.append(out[0])
# merger_dict_v3 = {}
# for out in desc_suffix_analyser():
#     if out[0] in to_be_checked:
#         merger_dict_v3[out[0]] = list(out[1]['desc'])
# merger_dict_v3

Based on these cleaning steps and manual curation, a dictionary for cleaning was made:

(Note that 'fruit skin' is not originally in goodcents. It was added based on peels and skins of fruits.)

In [208]:
merger_dict = \
{'almond': ['almond','almond bitter almond','almond roasted almond','almond toasted almond'],
 'amber': ['amber', 'ambergris'],
 'apple': ['apple','apple cooked apple','apple dried apple','apple green apple','apple skin'],
 'banana': ['banana','banana peel','banana ripe banana','banana unripe banana'],
 'beefy': ['beefy roasted beefy', 'beef', 'beef juice', 'beefy'],
 'berry': ['berry', 'berry ripe berry'],
 'bitter': ['almond bitter almond', 'bitter', 'orange bitter orange'],
 'black currant': ['currant black currant', 'currant bud black currant bud'],
 'burnt': ['burnt', 'sugar burnt sugar', 'woody burnt wood'],
 'cedar': ['cedar', 'cedarwood'],
 'cheesy': ['cheese','cheesy limburger cheese','cheesy parmesan cheese','cheesy','cheesy roquefort cheese','cheesy feta cheese','cheesy bleu cheese'],
 'cherry': ['cherry', 'cherry maraschino cherry'],
 'chocolate': ['chocolate', 'chocolate dark chocolate'],
 'citrus': ['citralva','citric','citronellal','citrus rind','citronella','citral','citrus','citrus peel'],
 'coffee': ['coffee', 'coffee roasted coffee'],
 'cucumber': ['cucumber', 'cucumber skin'],
 'fresh': ['fresh', 'fresh outdoors', 'freshly'],
 'grape': ['concord grape', 'grape', 'grape skin'],
 'grapefruit': ['grapefruit', 'grapefruit peel', 'grapfruit'],
 'green': ['apple green apple','bean green bean','green','clover','pea green pea','tea green tea'],
 'hay': ['hay', 'hay new mown hay', 'new mown hay'],
 'hazelnut': ['hazelnut', 'hazelnut roasted hazelnut'],
 'honey': ['honey', 'honeydew', 'honeysuckle'],
 'juicy': ['juicy', 'juicy fruit'],
 'lemon': ['lemon peel', 'lime', 'lemon', 'lemongrass'],
 'lily': ['lily of the valley', 'lily-of-the-valley', 'lily', 'lilial'],
 'meaty': ['meaty roasted meaty', 'meat', 'meaty'],
 'melon': ['melon', 'melon rind', 'melon unripe melon', 'watermelon rind'],
 'mint': ['cornmint', 'peppermint', 'mint', 'minty', 'spearmint'],
 'musk': ['musky', 'musk', 'ambrette', 'nitromusk'],
 'odorless': ['almost odorless', 'odorless'],
 'onion': ['onion', 'onion cooked onion', 'onion green onion'],
 'orange': ['orange', 'orange bitter orange', 'orange peel', 'orange rind'],
 'pear': ['pear', 'pear skin'],
 'plum': ['plum', 'plum skin'],
 'potato': ['potato','potato baked potato','potato chip','potato raw potato'],
 'ripe': ['banana ripe banana','banana unripe banana','berry ripe berry','fruit overripe fruit','fruit ripe fruit','ripe'],
 'roasted': ['barley roasted barley','chicken roasted chicken','grain toasted grain','almond roasted almond','peanut roasted peanut','hazelnut roasted hazelnut','coffee roasted coffee','beefy roasted beefy','meaty roasted meaty','roasted'],
 'rose': ['rose red rose','rose','rose tea rose','rose dried rose','bois de rose','rosy','rosey'],
 'tea': ['rose tea rose', 'tea', 'tea black tea', 'tea green tea'],
 'vegetable': ['vegetable', 'vegetables'],
 'woody': ['wood','woody','woody-lactone','woody old wood','woody burnt wood'],
 'grassy': ['lemongrass', 'grassy', 'grass'],
 'lactonic': ['woody-lactone', 'lactonic', 'lactone'],
 'leafy': ['leafy', 'tomato leaf', 'leaf', 'violet leaf'],
 'fruit skin': ['apple skin','grape skin','pear skin','plum skin','orange peel','banana peel','citrus peel','grapefruit peel','lemon peel'],
 'fruity': ['fruit','fruit ripe fruit','juicy fruit','fruit overripe fruit','fruit tropical fruit','fruit dried fruit','fruity'],
 'dry': ['dried','dry','fruit dried fruit','rose dried rose','apple dried apple'],
 'spicy': ['allspice', 'spicy', 'nutmeg', 'spice', 'rosemary'],
 'anisic': ['anise', 'anisic'],
 'balsamic': ['balsam', 'balsamic', 'tolu balsam'],
 'camphoreous': ['camphor', 'camphoreous'],
 'caramellic': ['caramel', 'caramellic'],
 'cooling': ['cool', 'cooling'],
 'coumarinic': ['coumarin', 'coumarinic'],
 'ethereal': ['ether', 'ethereal'],
 'metallic': ['metal', 'metallic'],
 'sulfurous': ['sulfurous', 'sulfury'],
 'terpenic': ['terpene', 'terpenic', 'terpentine'],
 'buttery': ['butter', 'buttery'],
 'cinnamon': ['cinnamon', 'cinnamyl'],
 'fatty': ['chicken fat', 'fatty'],
 'fishy': ['fish', 'fishy', 'shellfish'],
 'herbal': ['herb', 'herbaceous', 'herbal'],
 'milky': ['milk', 'milky'],
 'nutty': ['nut', 'nut flesh', 'nut skin', 'nutty'],
 'oily': ['oil', 'oily'],
 'radish': ['horseradish', 'radish'],
 'rummy': ['rum', 'rummy'],
 'smoky': ['sausage smoked sausage', 'smoky'],
 'sweaty': ['sweat', 'sweaty'],
 'winey': ['wine', 'winey'],
 'floral': ['floral', 'flower', 'flowers', 'flowery']}

In [209]:
# merge operation based on merger dict
from collections import defaultdict
reverse_merger_dict = defaultdict(list)
for key, values in merger_dict.items():
    for value in values:
        reverse_merger_dict[value].append(key)

def update_desc(desc_string):
    desc_list = desc_string.split(';')
    new_odors = set()
    flag = 0
    for desc in desc_list:
        flag = 0
        for old_desc, new_desc in reverse_merger_dict.items():
            if desc == old_desc:
                new_odors.update(new_desc)
                flag = 1
                break
        if flag==0:
            new_odors.update([desc])
    return ';'.join(new_odors)

cid_behaviors_mol['Updated_Desc'] = cid_behaviors_mol['Descriptors'].apply(lambda x: update_desc(x))

odor_list = []
cid_behaviors_mol['Updated_Desc'].apply(lambda x: get_odors(x, odor_list))
odor_list.sort()

print("no of odor descriptors: ", len(odor_list))

odors_df = pd.DataFrame(odor_list, columns=['desc'])
odors_df

no of odor descriptors:  521


Unnamed: 0,desc
0,absinthe
1,absolute
2,acacia
3,acetaldehyde
4,acetic
...,...
516,worty
517,yeasty
518,ylang
519,yogurt


### Get required odor descriptors

In [210]:
# check for required desc not in odor list (previously, fruit skin was one such case)
missing_desc = []
for req in required_desc:
    if req not in odor_list:
        missing_desc.append(req)
missing_desc

[]

In [211]:
# Get required odor descriptors
def get_req_desc(desc_string):
    desc_list = desc_string.split(';')
    desc_set = set()
    for desc in desc_list:
        if desc in required_desc:
            desc_set.update([desc])
    return ';'.join(desc_set)

cid_behaviors_mol['Updated_Desc_v2'] = cid_behaviors_mol['Updated_Desc'].apply(lambda x: get_req_desc(x))

odor_list = []
cid_behaviors_mol['Updated_Desc_v2'].apply(lambda x: get_odors(x, odor_list))
odor_list.sort()

print("no of odor descriptors: ", len(odor_list))

odors_df = pd.DataFrame(odor_list, columns=['desc'])
odors_df

no of odor descriptors:  139


Unnamed: 0,desc
0,
1,alcoholic
2,aldehydic
3,alliaceous
4,almond
...,...
134,warm
135,waxy
136,weedy
137,winey


In [212]:
# encode the descriptors
odor_dummies = cid_behaviors_mol['Updated_Desc_v2'].str.get_dummies(sep=';')
cid_behaviors_mol_encoded = pd.concat([cid_behaviors_mol, odor_dummies], axis=1)
cid_behaviors_mol_encoded = cid_behaviors_mol_encoded.drop(columns=['CID', 'Descriptors', 'Updated_Desc'])

In [213]:
cid_behaviors_mol_encoded.head()

Unnamed: 0,IsomericSMILES,Updated_Desc_v2,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,apple,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CC(CN)O,fishy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CC(C)C(=O)C(=O)O,fruity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C(CC(=O)O)C(=O)C(=O)O,odorless,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCC(=O)C(=O)O,creamy;caramellic;lactonic;sweet,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CC(C)CC(=O)C(=O)O,fruity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [214]:
# check for molecules with no descriptors
required_encoded = cid_behaviors_mol_encoded.drop(columns=['IsomericSMILES', 'Updated_Desc_v2'])
no_desc_df = pd.DataFrame(required_encoded.sum(axis=1).sort_values(), columns=['count']).query('count==0')
no_desc_df

Unnamed: 0,count
3712,0
2342,0
4330,0
872,0
4328,0
...,...
90,0
2633,0
1024,0
1642,0


In [215]:
# remove molecules with no descriptors
required_goodscents_dataset = cid_behaviors_mol_encoded.drop(no_desc_df.index).reset_index(drop=True)
required_goodscents_dataset.head()

Unnamed: 0,IsomericSMILES,Updated_Desc_v2,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,apple,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CC(CN)O,fishy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CC(C)C(=O)C(=O)O,fruity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C(CC(=O)O)C(=O)C(=O)O,odorless,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCC(=O)C(=O)O,creamy;caramellic;lactonic;sweet,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CC(C)CC(=O)C(=O)O,fruity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [216]:
required_goodscents_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4392 entries, 0 to 4391
Columns: 140 entries, IsomericSMILES to woody
dtypes: int64(138), object(2)
memory usage: 4.7+ MB


In [217]:
# frequency per descriptor in the required_goodscents_dataset
required_encoded_v2 = required_goodscents_dataset.drop(columns=['IsomericSMILES', 'Updated_Desc_v2'])
required_encoded_v2.sum().sort_values(ascending=False)

fruity       1257
green        1078
sweet        1001
floral        918
woody         621
             ... 
ketonic        18
brandy         16
chamomile      14
gassy          12
malty          10
Length: 138, dtype: int64

In [219]:
# save the curated dataset
required_goodscents_dataset.to_csv('./curated_datasets/curated_goodcents.csv', index=False)