# Clean Leffingwell dataset

In [86]:
import pyrfume
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

```
Info on leffingwell dataset in Pyrfume

[raw]
"leffingwell_data.csv" = "Odor labels for molecules as originally compiled by John Leffingwell and cleaned by Sanchez-Lengeling et al"
"leffingwell_readme.pdf" = "Information about the dataset"
LICENSE = "Licensing information and use restrictions according to the terms of John Leffingwell and Google"

[processed]
"molecules.csv" = "Information about odorant molecules used"
"behavior.csv" = "Odor labels for each molecule (one column per label)"
"behavior_sparse.csv" = "Odor labels for each molecule (single column)"
"stimuli.csv" = "Maps stimulus to CID, negative numbers assigned to compounds without CIDs"
```

In [87]:
# Load leffingwell datasets from pyrfume

molecules = pyrfume.load_data('leffingwell/molecules.csv', remote=True)
behavior = pyrfume.load_data('leffingwell/behavior.csv', remote=True)
behavior_sparse = pyrfume.load_data('leffingwell/behavior_sparse.csv', remote=True)

In [88]:
"""
Required descriptors based on the preprint:

Brian K. Lee, Emily J. Mayhew, Benjamin Sanchez-Lengeling,
Jennifer N. Wei, Wesley W. Qian, Kelsie Little, Matthew Andres,
Britney B. Nguyen, Theresa Moloy, Jane K. Parker, Richard C. Gerkin,
Joel D. Mainland, Alexander B. Wiltschko

`A Principal Odor Map Unifies Diverse Tasks in Human Olfactory Perception preprint
<https://www.biorxiv.org/content/10.1101/2022.09.01.504602v4>`_.
"""

required_desc = [
'alcoholic', 'aldehydic', 'alliaceous', 'almond', 'amber', 'animal',
'anisic', 'apple', 'apricot', 'aromatic', 'balsamic', 'banana', 'beefy',
'bergamot', 'berry', 'bitter', 'black currant', 'brandy', 'burnt',
'buttery', 'cabbage', 'camphoreous', 'caramellic', 'cedar', 'celery',
'chamomile', 'cheesy', 'cherry', 'chocolate', 'cinnamon', 'citrus', 'clean',
'clove', 'cocoa', 'coconut', 'coffee', 'cognac', 'cooked', 'cooling',
'cortex', 'coumarinic', 'creamy', 'cucumber', 'dairy', 'dry', 'earthy',
'ethereal', 'fatty', 'fermented', 'fishy', 'floral', 'fresh', 'fruit skin',
'fruity', 'garlic', 'gassy', 'geranium', 'grape', 'grapefruit', 'grassy',
'green', 'hawthorn', 'hay', 'hazelnut', 'herbal', 'honey', 'hyacinth',
'jasmin', 'juicy', 'ketonic', 'lactonic', 'lavender', 'leafy', 'leathery',
'lemon', 'lily', 'malty', 'meaty', 'medicinal', 'melon', 'metallic',
'milky', 'mint', 'muguet', 'mushroom', 'musk', 'musty', 'natural', 'nutty',
'odorless', 'oily', 'onion', 'orange', 'orangeflower', 'orris', 'ozone',
'peach', 'pear', 'phenolic', 'pine', 'pineapple', 'plum', 'popcorn',
'potato', 'powdery', 'pungent', 'radish', 'raspberry', 'ripe', 'roasted',
'rose', 'rummy', 'sandalwood', 'savory', 'sharp', 'smoky', 'soapy',
'solvent', 'sour', 'spicy', 'strawberry', 'sulfurous', 'sweaty', 'sweet',
'tea', 'terpenic', 'tobacco', 'tomato', 'tropical', 'vanilla', 'vegetable',
'vetiver', 'violet', 'warm', 'waxy', 'weedy', 'winey', 'woody'
]

### Analysis of molecules.csv

In [89]:
molecules.head()

Unnamed: 0_level_0,MolecularWeight,IsomericSMILES,IUPACName,name
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-955348933095,240.387,CCCCC=COC(=O)CCCCCCCC,,Hexenyl nonanoate
-923209957509,196.29,CC(=O)OCC1C=CC(C(C)C)CC1,,Tetrahydrocuminyl acetate
-874408321546,244.331,CCCCCCCCC(OC(C)=O)C(=O)OC,,Methyl acetoxydecanoate
-873963935677,198.306,CCCCC=COC(=O)C(C)CCC,,Hexenyl methylvalerate
-862841422647,148.271,CCCC(S)COCC,,Ethoxymethylbutanethiol


In [90]:
molecules.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3522 entries, -955348933095 to 162353069
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MolecularWeight  3522 non-null   float64
 1   IsomericSMILES   3522 non-null   object 
 2   IUPACName        3486 non-null   object 
 3   name             3522 non-null   object 
dtypes: float64(1), object(3)
memory usage: 137.6+ KB


In [91]:
assert len(molecules['IsomericSMILES'].drop_duplicates()) == len(molecules)
# check for duplicates in molecules

In [92]:
from rdkit import Chem

def canonical_smiles(smiles):
    """
    Function return canonical smiles for a given smiles
    """
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), isomericSmiles = True)

In [93]:
total_molecules = len(molecules['IsomericSMILES'].value_counts())
total_molecules

3522

In [94]:
molecules['canonicalSMILES'] = molecules['IsomericSMILES'].progress_apply(lambda x: canonical_smiles(x))
assert len(molecules['canonicalSMILES'].value_counts()) == total_molecules

# all isomeric smiles are in canonical from
molecules = molecules.drop(columns=['canonicalSMILES'])

100%|██████████| 3522/3522 [00:00<00:00, 19212.45it/s]


### Analysis of behavior.csv

In [95]:
behavior.head()

Unnamed: 0_level_0,alcoholic,aldehydic,alliaceous,almond,animal,anisic,apple,apricot,aromatic,balsamic,...,tobacco,tomato,tropical,vanilla,vegetable,violet,warm,waxy,winey,woody
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-955348933095,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
-923209957509,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
-874408321546,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
-873963935677,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
-862841422647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
behavior.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3522 entries, -955348933095 to 162353069
Columns: 113 entries, alcoholic to woody
dtypes: int64(113)
memory usage: 3.1 MB


In [97]:
odors = list(behavior.columns)
len(odors)

113

In [98]:
# check for molecules with no descriptors
pd.DataFrame(behavior.sum(axis=1).sort_values(), columns=['count']).query('count==0')

Unnamed: 0_level_0,count
Stimulus,Unnamed: 1_level_1


### Analysis of behavior_sparse.csv

In [99]:
behavior_sparse.head()

Unnamed: 0_level_0,Raw Labels,Labels
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1
-955348933095,"Herbal-green, waxy, oily, fruity","['green', 'oily', 'fruity', 'waxy', 'herbal']"
-923209957509,"Herbaceous, woody, slight spicy fruity odor","['woody', 'spicy', 'fruity', 'herbal']"
-874408321546,"delta-decalactone precursor; peach, apricot, b...","['peach', 'apricot', 'buttery']"
-873963935677,"Green, fruity, apple-like","['green', 'fruity', 'apple', 'tropical']"
-862841422647,Catty urine; cassis in dilution,['catty']


In [100]:
behavior_sparse.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3522 entries, -955348933095 to 162353069
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Raw Labels  3522 non-null   object
 1   Labels      3522 non-null   object
dtypes: object(2)
memory usage: 82.5+ KB


In [101]:
# analyse types of data in raw label
info_counts = {}
def find_counts(string):
    if len(string.split(';')) in info_counts.keys():
        info_counts[len(string.split(';'))]+=1
    else:
        info_counts[len(string.split(';'))] = 1

behavior_sparse['Raw Labels'].apply(lambda x: find_counts(x))
print(info_counts)

from collections import defaultdict
raw_labels_store = defaultdict(list)
def get_raw_label_count_based(string):
    raw_labels_store[len(string.split(';'))].append(string)

behavior_sparse['Raw Labels'].apply(lambda x: get_raw_label_count_based(x))

for key, value in raw_labels_store.items():
    print(f"info type {key}")
    print(value)

{1: 2450, 2: 1011, 3: 60, 4: 1}
info type 1
['Herbal-green, waxy, oily, fruity', 'Herbaceous, woody, slight spicy fruity odor', 'Green, fruity, apple-like', 'Fatty, fruity, Chinese quince-like', 'Catty, green, lemon, thyme, sulfury, tropical', 'Sulfurous, roasted meaty', 'Green, apple-like', 'Green, sweet, tropical fruit, faintly caramellic', 'Green, tropical fruity', 'Sulfurous, catty, black currant and tropical fruity, roasted on dilution', 'A precursor for butter flavor (i.e. delta-decalactone)', 'Sharp, fruity-ethereal odor with apple, plum notes', 'Hint of herbaceous citrus peel', 'Roasted meat, a grilled or burned-meat flavor', 'Fruity, berry, jammy, cooked fruit notes', 'Mild citrus-fruity on dilution', 'Sulfurous, roasted meaty notes', 'Ethereal, green, alcoholic, cognac', 'Green, tropical fruiy with waxy melon & apple aspects', 'Mild floral and sweet, slightly phenolic', 'Sulfurous, green herbal, spicy, somewhat tropical', 'Sulfurous, meaty, liver, onion (See Comments)', 'Plea

It can be seen that raw labels contain upto 4 types of information based on `;` in the raw label string.

It is observed that in most cases:
- 1st value in raw labels is odor information.
- Others are flavour info and additional comments 

In [102]:
# make descriptors format similar to goodcents
behavior_sparse['descriptors'] = behavior_sparse['Labels'].apply(lambda x: ';'.join(eval(x)))
behavior_sparse.head()

Unnamed: 0_level_0,Raw Labels,Labels,descriptors
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-955348933095,"Herbal-green, waxy, oily, fruity","['green', 'oily', 'fruity', 'waxy', 'herbal']",green;oily;fruity;waxy;herbal
-923209957509,"Herbaceous, woody, slight spicy fruity odor","['woody', 'spicy', 'fruity', 'herbal']",woody;spicy;fruity;herbal
-874408321546,"delta-decalactone precursor; peach, apricot, b...","['peach', 'apricot', 'buttery']",peach;apricot;buttery
-873963935677,"Green, fruity, apple-like","['green', 'fruity', 'apple', 'tropical']",green;fruity;apple;tropical
-862841422647,Catty urine; cassis in dilution,['catty'],catty


In [103]:
# check if given labels are accurately extracted from raw labels

# get list of odors currently in descriptors
def get_odors(desc, odor_list):
    odors = desc.split(';')
    for i in odors:
        if i not in odor_list:
            odor_list.append(i)

odor_list = []
behavior_sparse['descriptors'].apply(lambda x: get_odors(x, odor_list))
odor_list.sort()

print("no of odor descriptors: ", len(odor_list))

# check for specific descriptors
print("reuired desc not in descriptors (labels): ")
missing_required_desc = []
for desc in required_desc:
    if desc not in odor_list:
        print(desc)
        missing_required_desc.append(desc)

no of odor descriptors:  113
reuired desc not in descriptors (labels): 
amber
bergamot
bitter
cedar
celery
clean
clove
cooked
cooling
cortex
fruit skin
gassy
geranium
hawthorn
hyacinth
jasmin
juicy
lactonic
lavender
lily
muguet
natural
orangeflower
ozone
powdery
raspberry
rummy
sandalwood
soapy
sweaty
terpenic
vetiver
weedy


In [104]:
# find desc in raw labels
def find(string, word):
    string = string.split(';')[0]
    if word in string:
        print(string)

for desc in missing_required_desc:
    print(f"for {desc}: ")
    behavior_sparse['Raw Labels'].apply(lambda x: find(x, desc))
    print()

for amber: 
Phenolic, slightly terpeny, leather spicy amber odor
Dry, fruity-amber like, fatty odor
Woody-cedar-amber with slight fruity character
Mild balsamic, amber, slightly woody & citrus
Aldehydic, amber, lemon, green, waxy
Strong and substantive ambergris note, earthy and animalistic
Woody, floral, orris, slightly ambery, fruity
Grassy-sweet, minty, somewhat amber like odor
Balsamic-amber, sweet odor
Woody-cedar-amber with slight fruity character

for bergamot: 
Floral-tuberose, citrus-bergamot-fruity odor
Fresh, citrus, herbal-bergamot odor
Sweet fruity floral (rose-bergamot) odor
Refreshing, sweet herbaceous fruity with bergamot notes
Citrus, floral complex reminiscent of bergamot, lime and lavender
Herbaceous, sweet spicy bergamot odor
Fresh floral (bergamot- and muguet-like), with herbal aspects

for bitter: 
Odor of bitter almond oil
Odorless, slight bitter taste
Aromatic, weak benzoic odor with a sour bitter taste
Faint, sweet, balsamic with slight bitter, fruity notes
Odo

It can observed that the most of the missing descriptors can be found in raw labels.

Note: There are lot of spelling errors in the raw labels.

### Correct spell errors in raw labels

In [105]:
# code to get probable spell errors

# v1
# import enchant

# # Create an instance of the English dictionary
# dictionary = enchant.Dict("en_US")

# # Check if a word is spelled correctly
# def spellcheck(word):
#     if dictionary.check(word):
#         # print(f"{word} is spelled correctly.")
#         pass
#     else:
#         if word in required_desc: # ignore spell error if word in required desc
#             return
#         for root in root_req_desc:
#             if root in word:    # only find spell error for words that have the certain root words
#                 suggestions = dictionary.suggest(word)
#                 if suggestions:
#                     print(f"{word} is misspelled. Suggestions: {', '.join(suggestions)}")
#                 else:
#                     print(f"{word} is misspelled, but no suggestions available.")
#                 break

# import re
# words_set = set()
# def get_spell_errors(string):
#     string = string.lower()
#     string = string.split(';')[0] #choose odor and not flavor
#     list_desc=re.sub(r'[!"#$%&\'()*\-+,./:;<=>?@\[\]^_`{|}~]'," ",string).split(" ")

#     # # handle black currant
#     # if "black" in list_desc:
#     #     idx = list_desc.index('black')
#     #     if idx<len(list_desc)-1:
#     #         if list_desc[idx+1] == 'currant':
#     #             list_desc.pop(idx) #removes black
#     #             list_desc.pop(idx) # removes currant
#     #             list_desc.append("black currant")
    
#     for word in list_desc:
#         if len(word) == 0:
#             continue
#         elif word == ' ':
#             continue
#         else:
#             words_set.update([word])
#     # odor_set = set()
#     # for req_odor in required_desc:
#     #     if req_odor in list_desc:
#     #         odor_set

# behavior_sparse['Raw Labels'].apply(lambda x: get_spell_errors(x))
# list_of_words = list(words_set)
# list_of_words.sort()
# for word in list_of_words:
#     spellcheck(word)

# v2

# suff_root_req_desc = required_desc.copy()
# for i in range(len(suff_root_req_desc)):
#     suff_root_req_desc[i] = suff_root_req_desc[i][(int(0.4*len(suff_root_req_desc[i]))):]

# # tried with different values like: 0.4, 0.6, 0.3

# def spellcheck(word):
#     if dictionary.check(word):
#         # print(f"{word} is spelled correctly.")
#         pass
#     else:
#         if word in required_desc: # ignore spell error if word in required desc
#             return
#         for root in suff_root_req_desc:
#             if root in word:    # only find spell error for words that have the certain root words
#                 suggestions = dictionary.suggest(word)
#                 if suggestions:
#                     print(f"{word} is misspelled. Suggestions: {', '.join(suggestions)}")
#                 else:
#                     print(f"{word} is misspelled, but no suggestions available.")
#                 break

# import re
# words_set_v2 = set()
# def get_spell_errors_v2(string):
#     string = string.lower()
#     list_desc=string.split(" ")

#     # handle black currant
#     if "black" in list_desc:
#         idx = list_desc.index('black')
#         if idx<len(list_desc)-1:
#             if list_desc[idx+1] == 'currant':
#                 list_desc.pop(idx) #removes black
#                 list_desc.pop(idx) # removes currant
#                 list_desc.append("black currant")

#     # handle orange flower
#     if "orange" in list_desc:
#         idx = list_desc.index('orange')
#         if idx<len(list_desc)-1:
#             if list_desc[idx+1] == 'flower':
#                 list_desc.pop(idx) #removes orange
#                 list_desc.pop(idx) # removes flower
#                 list_desc.append("orangeflower")
    
#     # handle fruit skin
#     for idx in range(len(list_desc)):
#         if list_desc[idx] == 'skin' or list_desc[idx] == 'peel':
#             if idx != 0:
#                 if list_desc[idx-1] in ['citrus', 'orange', 'apple', 'banana', 'lemon', 'pear', 'fruit', 'peach']:
#                     list_desc.append("fruit skin")
    
#     for word in list_desc:
#         if len(word) == 0:
#             continue
#         elif word == ' ':
#             continue
#         else:
#             words_set_v2.update([word])
#     # odor_set = set()
#     # for req_odor in required_desc:
#     #     if req_odor in list_desc:
#     #         odor_set

# behavior_sparse['cleaned raw labels'].apply(lambda x: get_spell_errors_v2(x))
# list_of_words = list(words_set_v2)
# list_of_words.sort()
# for word in list_of_words:
#     spellcheck(word)

In [106]:
# manually curated spellcheck and filter based on above code

spellcorrect = {}
spellcorrect['acetaldehyde'] = ['acetataldehyde']
spellcorrect['alliaceous'] = ['alliaceious', 'alliacious', 'alliacous']
spellcorrect['amber'] = ['ambery']
spellcorrect['animal'] = ['animalic', 'animalistic', 'animallic']
spellcorrect['apple'] = ['applle']
spellcorrect['balsamic'] = ['balsum']
spellcorrect['black currant'] = ['blackurrant']
spellcorrect['buttery'] = ['butterry']
spellcorrect['camphoreous'] = ['camhorous', 'camphaceous', 'camphoraceous', 'camphoraceousl', 'camphorous']
spellcorrect['caramellic'] = ['carameellic', 'caramelic', 'caramellized']
spellcorrect['cedar'] = ['cedary']
spellcorrect['chamomile'] = ['camomille', 'chamomille']
spellcorrect['cheesy'] = ['cheesey']
spellcorrect['cocoa'] = ['tcocoa']
spellcorrect['cinnamon'] = ['cinnamate', 'cinnamic', 'cinnamyl']
spellcorrect['citrus'] = ['cistus', 'citral', 'citronellal', 'citronellol', 'citrusy']
spellcorrect['coumarinic'] = ['coumarin']
spellcorrect['creamy'] = ['ceamy']
spellcorrect['earthy'] = ['eartthy', 'eathy']
spellcorrect['ethereal'] = ['etheral', 'etthereal']
spellcorrect['fermented'] = ['fermenty', 'ferrmented']
spellcorrect['fruity'] = ['frui', 'fruiity', 'fruiti', 'fruitti', 'fruiy', 'frutti', 'fruty', 'fuity', 'ruity']
spellcorrect['floral'] = ['floralodor', 'flouve', 'foral']
spellcorrect['green'] = ['freen', 'grreen', 'geen']
spellcorrect['grapefruit'] = ['grapefuit']
spellcorrect['hawthorn'] = ['hawthorne']
spellcorrect['hay'] = ['haylike']
spellcorrect['herbal'] = ['hebal', 'herby', 'hebaceous']
spellcorrect['honey'] = ['honeylike']
spellcorrect['jasmin'] = ['jasmone']
spellcorrect['lemon'] = ['iime', 'limonene']
spellcorrect['lactonic'] = ['decalactone', 'dodecalactone', 'iactonic', 'lactone', 'octalactone']
spellcorrect['malty'] = ['maltol']
spellcorrect['milky'] = ['mildl', 'milkfat']
spellcorrect['mushroom'] = ['mushoom']
spellcorrect['musk'] = ['ambrette']
spellcorrect['onion'] = ['oinion']
spellcorrect['odorless'] = ['odourless']
spellcorrect['orange blossom'] = ['orangeblossom']
spellcorrect['orange'] = ['ornge'] 
spellcorrect['ozone'] = ['ozonic']
spellcorrect['passion fruit'] = ['passionfruit']
spellcorrect['pineapple'] = ['pienapple', 'pieneapple']
spellcorrect['popcorn'] = ['poocorn']
spellcorrect['pungent'] = ['pumgent']
spellcorrect['pine'] = ['pune', 'pne']
spellcorrect['roasted'] = ['roasty']
spellcorrect['rose'] = ['rosemay', 'rosey']
spellcorrect['savory'] = ['savoury']
spellcorrect['solvent'] = ['solventy']
spellcorrect['strawberry'] = ['strawberr']
spellcorrect['sulfurous'] = ['sufurous', 'sulfuraceos', 'sulfuraceous', 'sulfureous', 'sulfury', 'sulphur', 'sulphurol', 'sulphurous', 'sulphury']
spellcorrect['spicy'] = ['sicy']
spellcorrect['tobacco'] = ['tabac']
spellcorrect['terpenic'] = ['terpene', 'terpenes', 'terpentine', 'terpeny', 'terpineol', 'terpy']
spellcorrect['tropical'] = ['ltropical', 'tropicall']
spellcorrect['vanilla'] = ['vanillic', 'vanillin', 'vanlla']
spellcorrect['vegetable'] = ['vegetal']
spellcorrect['vetiver'] = ['vertivert', 'vetivert', 'vetiveryl']

In [107]:
# correct spell errors
from collections import defaultdict
reverse_spellcorrect = defaultdict(list)
for key, values in spellcorrect.items():
    for value in values:
        reverse_spellcorrect[value].append(key)

import re
def correct_spell_errors_v1(string):
    string = string.lower()
    string = string.split(';')[0] #choose odor and not flavor
    list_desc=re.sub(r'[!"#$%&\'()*\-+,./:;<=>?@\[\]^_`{|}~]'," ",string).split(" ")

    # handle black currant
    if "black" in list_desc:
        idx = list_desc.index('black')
        if idx<len(list_desc)-1:
            if list_desc[idx+1] == 'currant':
                list_desc.pop(idx) #removes black
                list_desc.pop(idx) # removes currant
                list_desc.append("black currant")
    
    # handle orange flower
    if "orange" in list_desc:
        idx = list_desc.index('orange')
        if idx<len(list_desc)-1:
            if list_desc[idx+1] == 'flower':
                list_desc.pop(idx) #removes orange
                list_desc.pop(idx) # removes flower
                list_desc.append("orangeflower")
    
    # handle fruit skin
    for idx in range(len(list_desc)):
        if list_desc[idx] == 'skin' or list_desc[idx] == 'peel':
            if idx != 0:
                if list_desc[idx-1] in ['citrus', 'orange', 'apple', 'banana', 'lemon', 'pear', 'fruit', 'peach']:
                    list_desc.append("fruit skin")


    words_set = set()
    for word in list_desc:
        flag = 0
        if len(word) == 0:
            continue
        elif word == ' ':
            continue
        else:
            for mispelled, corrected in reverse_spellcorrect.items():
                if word == mispelled:
                    words_set.update(corrected)
                    flag = 1
                    break
            if flag == 0:
                words_set.update([word])
    new_string = " ".join(words_set)
    return new_string


In [108]:
# test spell correct function

print(correct_spell_errors_v1('sulfuraceous, ltropical-over-ripe fruity, strawberry, cream & cheese notes'))
print(correct_spell_errors_v1('sulfury, alliaceous, roasted, savoury, truffle, pne'))

sulfurous notes strawberry cheese fruity ripe cream tropical over
roasted alliaceous savory sulfurous truffle pine


In [109]:
# spell correct

behavior_sparse['cleaned raw labels'] = behavior_sparse['Raw Labels'].apply(lambda x: correct_spell_errors_v1(x))
behavior_sparse.head()

Unnamed: 0_level_0,Raw Labels,Labels,descriptors,cleaned raw labels
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-955348933095,"Herbal-green, waxy, oily, fruity","['green', 'oily', 'fruity', 'waxy', 'herbal']",green;oily;fruity;waxy;herbal,fruity herbal oily green waxy
-923209957509,"Herbaceous, woody, slight spicy fruity odor","['woody', 'spicy', 'fruity', 'herbal']",woody;spicy;fruity;herbal,spicy woody herbaceous fruity slight odor
-874408321546,"delta-decalactone precursor; peach, apricot, b...","['peach', 'apricot', 'buttery']",peach;apricot;buttery,lactonic delta precursor
-873963935677,"Green, fruity, apple-like","['green', 'fruity', 'apple', 'tropical']",green;fruity;apple;tropical,green apple fruity like
-862841422647,Catty urine; cassis in dilution,['catty'],catty,urine catty


### Clean and get new descriptors from 'cleaned raw labels

In [110]:
# code to get cleaned desc root combination

# from collections import defaultdict
# roots_dict = defaultdict(set)
# def connect_roots(cleaned_raw_labels):
#     raw_list = cleaned_raw_labels.split(' ')
#     for desc in required_desc:
#         root_set = set()
#         for word in raw_list:
#             if word == desc:
#                 root_set.update([word])
#             elif re.match(f'{desc[:-(int(0.6*len(desc)))]}.*', word):
#                 root_set.update([word])
#         roots_dict[desc].update(list(root_set))

# behavior_sparse['cleaned raw labels'].apply(lambda x: connect_roots(x))

In [111]:
# manually cleaned desc root combination based above code
merger_root_dict = {}
merger_root_dict['alcoholic'] = ['alcohol', 'alcoholic']
merger_root_dict['aldehydic'] = ['aldehydic']
merger_root_dict['alliaceous'] = ['alliaceous', 'allium']
merger_root_dict['almond'] = ['almond', 'almonds']
merger_root_dict['amber'] = ['amber', 'ambergris']
merger_root_dict['animal'] = ['animal']
merger_root_dict['anisic'] = ['anise', 'anisic']
merger_root_dict['apple'] = ['apple']
merger_root_dict['apricot'] = ['apricot']
merger_root_dict['aromatic'] = ['aromatic']
merger_root_dict['balsamic'] = ['balsam', 'balsamic']
merger_root_dict['banana'] = ['banana']
merger_root_dict['beefy'] = ['beef', 'beefsteak', 'beefy']
merger_root_dict['bergamot'] = ['bergamot']
merger_root_dict['berry'] = ['berry']
merger_root_dict['bitter'] = ['bitter', 'bitterness', 'bittersweet']
merger_root_dict['black currant'] = ['black currant']
merger_root_dict['brandy'] = ['brandy']
merger_root_dict['burnt'] = ['burned', 'burning', 'burnt']
merger_root_dict['buttery'] = ['butter', 'buttery']
merger_root_dict['cabbage'] = ['cabbage']
merger_root_dict['camphoreous'] = ['camphor', 'camphoreous']
merger_root_dict['caramellic'] = ['caramel', 'caramelized', 'caramellic']
merger_root_dict['cedar'] = ['cedar']
merger_root_dict['celery'] = ['celery']
merger_root_dict['chamomile'] = ['chamomile']
merger_root_dict['cheesy'] = ['cheese', 'cheesy']
merger_root_dict['cherry'] = ['cherries', 'cherry']
merger_root_dict['chocolate'] = ['chocolate']
merger_root_dict['cinnamon'] = ['cinnamon']
merger_root_dict['citrus'] = ['citronella', 'citrus']
merger_root_dict['clean'] = ['clean']
merger_root_dict['clove'] = ['clove', 'clover']
merger_root_dict['cocoa'] = ['cocoa']
merger_root_dict['coconut'] = ['coconut']
merger_root_dict['coffee'] = ['coffee']
merger_root_dict['cognac'] = ['cognac']
merger_root_dict['cooked'] = ['cooked']
merger_root_dict['cooling'] = ['cool', 'cooling', 'coolness']
merger_root_dict['cortex'] = ['cortex']
merger_root_dict['coumarinic'] = ['coumarinic']
merger_root_dict['creamy'] = ['cream', 'creamy']
merger_root_dict['cucumber'] = ['cucumber']
merger_root_dict['dairy'] = ['dairy']
merger_root_dict['dry'] = ['dried', 'dry']
merger_root_dict['earthy'] = ['earth', 'earthy']
merger_root_dict['ethereal'] = ['ethereal']
merger_root_dict['fatty'] = ['fat', 'fats', 'fatty']
merger_root_dict['fermented'] = ['fermented', 'fermenting']
merger_root_dict['fishy'] = ['fish', 'fishy']
merger_root_dict['floral'] = ['flora', 'floral', 'flower', 'flowery']
merger_root_dict['fresh'] = ['fresh', 'freshly', 'freshness']
merger_root_dict['fruit skin'] = ['fruit skin']
merger_root_dict['fruity'] = ['fruit', 'fruitier', 'fruitiness', 'fruits', 'fruity']
merger_root_dict['garlic'] = ['garlic']
merger_root_dict['gassy'] = ['gassy']
merger_root_dict['geranium'] = ['geranium']
merger_root_dict['grape'] = ['grape', 'grapes']
merger_root_dict['grapefruit'] = ['grapefruit']
merger_root_dict['grassy'] = ['grass', 'grassy']
merger_root_dict['green'] = ['green', 'greenery', 'greenish']
merger_root_dict['hawthorn'] = ['hawthorn']
merger_root_dict['hay'] = ['hay']
merger_root_dict['hazelnut'] = ['hazel', 'hazelnut', 'hazelnuts']
merger_root_dict['herbal'] = ['herb', 'herbaceous', 'herbal']
merger_root_dict['honey'] = ['honey', 'honeydew', 'honeysuckle']
merger_root_dict['hyacinth'] = ['hyacinth']
merger_root_dict['jasmin'] = ['jasmin', 'jasmine']
merger_root_dict['juicy'] = ['juice', 'juiciness', 'juicy']
merger_root_dict['ketonic'] = ['ketonic']
merger_root_dict['lactonic'] = ['lactonic']
merger_root_dict['lavender'] = ['lavender']
merger_root_dict['leafy'] = ['leaf', 'leafy', 'lettuce']
merger_root_dict['leathery'] = ['leather', 'leathery']
merger_root_dict['lemon'] = ['lemon', 'lemony', 'lime']
merger_root_dict['lily'] = ['lilac', 'lily']
merger_root_dict['malty'] = ['malt', 'malty']
merger_root_dict['meaty'] = ['meat', 'meats', 'meaty']
merger_root_dict['medicinal'] = ['medicinal']
merger_root_dict['melon'] = ['melon']
merger_root_dict['metallic'] = ['metallic']
merger_root_dict['milky'] = ['milk', 'milky']
merger_root_dict['mint'] = ['mint', 'minty']
merger_root_dict['muguet'] = ['muguet']
merger_root_dict['mushroom'] = ['mushroom']
merger_root_dict['musk'] = ['musk', 'musky']
merger_root_dict['musty'] = ['musty']
merger_root_dict['natural'] = ['natural']
merger_root_dict['nutty'] = ['nut', 'nuts', 'nutty']
merger_root_dict['odorless'] = ['odorless']
merger_root_dict['oily'] = ['oil', 'oils', 'oily']
merger_root_dict['onion'] = ['onion', 'onions']
merger_root_dict['orange'] = ['orange']
merger_root_dict['orangeflower'] = ['orangeflower']
merger_root_dict['orris'] = ['orris']
merger_root_dict['ozone'] = ['ozone']
merger_root_dict['peach'] = ['peach', 'peachy']
merger_root_dict['pear'] = ['pear']
merger_root_dict['phenolic'] = ['phenol', 'phenolc', 'phenolic']
merger_root_dict['pine'] = ['pine', 'piney']
merger_root_dict['pineapple'] = ['pineapple']
merger_root_dict['plum'] = ['plum']
merger_root_dict['popcorn'] = ['popcorn']
merger_root_dict['potato'] = ['potato']
merger_root_dict['powdery'] = ['powdery']
merger_root_dict['pungent'] = ['pungency', 'pungent']
merger_root_dict['radish'] = ['radish', 'horseradish']
merger_root_dict['raspberry'] = ['raspberry']
merger_root_dict['ripe'] = ['ripe', 'ripened']
merger_root_dict['roasted'] = ['roast', 'roasted']
merger_root_dict['rose'] = ['rose', 'rosemary', 'rosy']
merger_root_dict['rummy'] = ['rum', 'rummy']
merger_root_dict['sandalwood'] = ['sandalwood']
merger_root_dict['savory'] = ['savory']
merger_root_dict['sharp'] = ['sharp']
merger_root_dict['smoky'] = ['smoke', 'smoked', 'smokey', 'smoky']
merger_root_dict['soapy'] = ['soapy']
merger_root_dict['solvent'] = ['solvent']
merger_root_dict['sour'] = ['sour']
merger_root_dict['spicy'] = ['spice', 'spicy']
merger_root_dict['strawberry'] = ['strawberry']
merger_root_dict['sulfurous'] = ['sulfide', 'sulfur', 'sulfuric', 'sulfurous']
merger_root_dict['sweaty'] = ['sweat','sweaty']
merger_root_dict['sweet'] = ['sweet', 'sweeter', 'sweetish', 'sweetness']
merger_root_dict['tea'] = ['tea']
merger_root_dict['terpenic'] = ['terpenic']
merger_root_dict['tobacco'] = ['tobacco', 'tobaccos']
merger_root_dict['tomato'] = ['tomato']
merger_root_dict['tropical'] = ['tropical']
merger_root_dict['vanilla'] = ['vanilla']
merger_root_dict['vegetable'] = ['vegetable', 'vegetables', 'vegetative']
merger_root_dict['vetiver'] = ['vetiver']
merger_root_dict['violet'] = ['violet']
merger_root_dict['warm'] = ['warm', 'warming']
merger_root_dict['waxy'] = ['waxy']
merger_root_dict['weedy'] = ['weedy']
merger_root_dict['winey'] = ['wine', 'wines', 'winey']
merger_root_dict['woody'] = ['wood', 'woody']

In [112]:
from collections import defaultdict
reverse_merger_root_dict = defaultdict(list)
for key, values in merger_root_dict.items():
    for value in values:
        reverse_merger_root_dict[value].append(key)

def clean_desc(cleaned_raw_labels):
    desc_set = set()
    raw_list = cleaned_raw_labels.split(' ')

    # handle black currant
    if "black" in raw_list:
        idx = raw_list.index('black')
        if idx<len(raw_list)-1:
            if raw_list[idx+1] == 'currant':
                raw_list.pop(idx) #removes black
                raw_list.pop(idx) # removes currant
                raw_list.append("black currant")
    
    # handle orange flower
    if "orange" in raw_list:
        idx = raw_list.index('orange')
        if idx<len(raw_list)-1:
            if raw_list[idx+1] == 'flower':
                raw_list.pop(idx) #removes orange
                raw_list.pop(idx) # removes flower
                raw_list.append("orangeflower")
    
    # handle fruit skin
    for idx in range(len(raw_list)):
        if raw_list[idx] == 'skin' or raw_list[idx] == 'peel':
            if idx != 0:
                if raw_list[idx-1] in ['citrus', 'orange', 'apple', 'banana', 'lemon', 'pear', 'fruit', 'peach']:
                    raw_list.append("fruit skin")

    for word in raw_list:
        for root, desc in reverse_merger_root_dict.items():
            if word == root:
                desc_set.update(desc)
                break
    if len(desc_set) == 0:
        return None
    return ';'.join(desc_set)


In [113]:
# test clean descriptors function

clean_desc('fruity odorless herbaceous spicy odor slight woody')

'spicy;woody;fruity;herbal;odorless'

In [114]:
behavior_sparse['new_descriptors'] = behavior_sparse['cleaned raw labels'].apply(lambda x: clean_desc(x))
behavior_sparse.head()

Unnamed: 0_level_0,Raw Labels,Labels,descriptors,cleaned raw labels,new_descriptors
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-955348933095,"Herbal-green, waxy, oily, fruity","['green', 'oily', 'fruity', 'waxy', 'herbal']",green;oily;fruity;waxy;herbal,fruity herbal oily green waxy,fruity;herbal;oily;green;waxy
-923209957509,"Herbaceous, woody, slight spicy fruity odor","['woody', 'spicy', 'fruity', 'herbal']",woody;spicy;fruity;herbal,spicy woody herbaceous fruity slight odor,fruity;spicy;herbal;woody
-874408321546,"delta-decalactone precursor; peach, apricot, b...","['peach', 'apricot', 'buttery']",peach;apricot;buttery,lactonic delta precursor,lactonic
-873963935677,"Green, fruity, apple-like","['green', 'fruity', 'apple', 'tropical']",green;fruity;apple;tropical,green apple fruity like,green;apple;fruity
-862841422647,Catty urine; cassis in dilution,['catty'],catty,urine catty,


In [115]:
# check for nans in new descriptors
behavior_sparse[behavior_sparse['new_descriptors'].isna()]


Unnamed: 0_level_0,Raw Labels,Labels,descriptors,cleaned raw labels,new_descriptors
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-862841422647,Catty urine; cassis in dilution,['catty'],catty,urine catty,
-848964121442,See comments; components of Milk lactone,['milky'],milky,see comments,
-86422482515,"Unstable; citrus, fatty, tropical","['fatty', 'citrus', 'tropical']",fatty;citrus;tropical,unstable,
356,Hydrocarbon odor (gasoline-like),['gasoline'],gasoline,gasoline like hydrocarbon odor,
4837,"Mild salty taste; Somewhat fishy, ammonia odor",['fishy'],fishy,taste salty mild,
6736,"Strong, offensive fecal odor; diluted - floral...","['fruity', 'animal', 'floral']",fruity;animal;floral,offensive fecal strong odor,
7363,"Powerful; burnt, coffee-like, somewhat caramel...","['alliaceous', 'burnt', 'caramellic', 'coffee']",alliaceous;burnt;caramellic;coffee,powerful,
7438,Caraway-like; sweet spearmint-like odor,['sweet'],sweet,caraway like,
7894,"Ammoniacal odor; fishy taste, on dilution a bl...",['cheesy'],cheesy,ammoniacal odor,
8141,Hydrocarbon odor (gasoline-like),['gasoline'],gasoline,gasoline like hydrocarbon odor,


### merge `descriptors` and `new_descriptors`

In [116]:
def update_desc(descriptors, new_descriptors):
    desc_set = set()
    if new_descriptors is None:
        return descriptors
    else:
        concat_desc = descriptors+';'+new_descriptors
        desc_set.update(concat_desc.split(';'))
    return ';'.join(desc_set)

# test update_desc

print(update_desc('alliaceous;burnt;caramellic;coffee', None))
print(update_desc('green;fruity;apple;tropical', 'green;fruity;apple'))

alliaceous;burnt;caramellic;coffee
green;tropical;apple;fruity


In [117]:
behavior_sparse['new_descriptors'] = behavior_sparse.apply(lambda x: update_desc(x['descriptors'], x['new_descriptors']), axis=1)
behavior_sparse.head()

Unnamed: 0_level_0,Raw Labels,Labels,descriptors,cleaned raw labels,new_descriptors
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-955348933095,"Herbal-green, waxy, oily, fruity","['green', 'oily', 'fruity', 'waxy', 'herbal']",green;oily;fruity;waxy;herbal,fruity herbal oily green waxy,fruity;herbal;oily;green;waxy
-923209957509,"Herbaceous, woody, slight spicy fruity odor","['woody', 'spicy', 'fruity', 'herbal']",woody;spicy;fruity;herbal,spicy woody herbaceous fruity slight odor,spicy;fruity;herbal;woody
-874408321546,"delta-decalactone precursor; peach, apricot, b...","['peach', 'apricot', 'buttery']",peach;apricot;buttery,lactonic delta precursor,peach;lactonic;apricot;buttery
-873963935677,"Green, fruity, apple-like","['green', 'fruity', 'apple', 'tropical']",green;fruity;apple;tropical,green apple fruity like,green;tropical;apple;fruity
-862841422647,Catty urine; cassis in dilution,['catty'],catty,urine catty,catty


In [118]:
# check for odorless mix up
behavior_sparse[behavior_sparse['new_descriptors'].str.match('.*odorless.*')]

Unnamed: 0_level_0,Raw Labels,Labels,descriptors,cleaned raw labels,new_descriptors
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
135,Nearly odorless; slight tingling taste; sweetn...,['odorless'],odorless,nearly odorless,odorless
196,Pleasant sour taste; odorless,['odorless'],odorless,taste sour pleasant,odorless;sour
311,"Odorless with a clean, acid, tart taste",['odorless'],odorless,tart acid taste odorless with clean a,odorless;clean
453,"Odorless with a pleasant, sweet taste",['odorless'],odorless,taste odorless sweet with pleasant a,odorless;sweet
525,Almost odorless with a sharp tart acidic taste,['odorless'],odorless,acidic sharp tart almost odorless taste with a,sharp;odorless
...,...,...,...,...,...
54675810,Odorless with a clean acidulous taste,['odorless'],odorless,taste odorless with acidulous clean a,odorless;clean
54676860,"Odorless with a sharp, pleasant acid taste",['odorless'],odorless,sharp acid taste odorless with pleasant a,sharp;odorless
57353225,Nearly odorless; Delicate floral waxy with ros...,"['rose', 'odorless', 'waxy', 'floral']",rose;odorless;waxy;floral,nearly odorless,floral;odorless;waxy;rose
67120007,"Virtually odorless, waxy","['waxy', 'odorless']",waxy;odorless,virtually waxy odorless,odorless;waxy


In [119]:
# handle odorless
def handle_odorless(desc_string):
    desc_list = desc_string.split(';')
    desc_set = set()
    for desc in desc_list:
        if desc == 'odorless':
            continue
        desc_set.update([desc])
    if len(desc_set) == 0:
        desc_set.update(['odorless'])
    return ';'.join(desc_set)

# test handle_odorless
print(handle_odorless('fruity;spicy;odorless;herbal;woody'))
print(handle_odorless('odorless'))

spicy;fruity;woody;herbal
odorless


In [120]:
behavior_sparse['new_descriptors'] = behavior_sparse['new_descriptors'].apply(lambda x: handle_odorless(x))
behavior_sparse.head()

Unnamed: 0_level_0,Raw Labels,Labels,descriptors,cleaned raw labels,new_descriptors
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-955348933095,"Herbal-green, waxy, oily, fruity","['green', 'oily', 'fruity', 'waxy', 'herbal']",green;oily;fruity;waxy;herbal,fruity herbal oily green waxy,fruity;herbal;oily;green;waxy
-923209957509,"Herbaceous, woody, slight spicy fruity odor","['woody', 'spicy', 'fruity', 'herbal']",woody;spicy;fruity;herbal,spicy woody herbaceous fruity slight odor,spicy;fruity;woody;herbal
-874408321546,"delta-decalactone precursor; peach, apricot, b...","['peach', 'apricot', 'buttery']",peach;apricot;buttery,lactonic delta precursor,peach;lactonic;apricot;buttery
-873963935677,"Green, fruity, apple-like","['green', 'fruity', 'apple', 'tropical']",green;fruity;apple;tropical,green apple fruity like,green;apple;tropical;fruity
-862841422647,Catty urine; cassis in dilution,['catty'],catty,urine catty,catty


### Get required odor descriptors

In [121]:
mol_behaviors = pd.merge(molecules['IsomericSMILES'], behavior_sparse['new_descriptors'], how='inner', left_index=True, right_index=True)
mol_behaviors = mol_behaviors.rename(columns={'new_descriptors':'descriptors'})
mol_behaviors.head()

Unnamed: 0_level_0,IsomericSMILES,descriptors
CID,Unnamed: 1_level_1,Unnamed: 2_level_1
-955348933095,CCCCC=COC(=O)CCCCCCCC,fruity;herbal;oily;green;waxy
-923209957509,CC(=O)OCC1C=CC(C(C)C)CC1,spicy;fruity;woody;herbal
-874408321546,CCCCCCCCC(OC(C)=O)C(=O)OC,peach;lactonic;apricot;buttery
-873963935677,CCCCC=COC(=O)C(C)CCC,green;apple;tropical;fruity
-862841422647,CCCC(S)COCC,catty


In [122]:
odor_list = []
mol_behaviors['descriptors'].apply(lambda x: get_odors(x, odor_list))
odor_list.sort()

print("no of odor descriptors: ", len(odor_list))

odors_df = pd.DataFrame(odor_list, columns=['desc'])
odors_df

no of odor descriptors:  145


Unnamed: 0,desc
0,alcoholic
1,aldehydic
2,alliaceous
3,almond
4,amber
...,...
140,warm
141,waxy
142,weedy
143,winey


In [123]:
# check for required desc not in odor list
missing_desc = []
for req in required_desc:
    if req not in odor_list:
        missing_desc.append(req)
missing_desc

['cortex']

In [124]:
# update required desc

new_required_desc = required_desc
new_required_desc.remove('cortex')

In [125]:
# Get required odor descriptors
def get_req_desc(desc_string):
    desc_list = desc_string.split(';')
    desc_set = set()
    for desc in desc_list:
        if desc in new_required_desc:
            desc_set.update([desc])
    return ';'.join(desc_set)

mol_behaviors['Updated_Desc'] = mol_behaviors['descriptors'].apply(lambda x: get_req_desc(x))

odor_list = []
mol_behaviors['Updated_Desc'].apply(lambda x: get_odors(x, odor_list))
odor_list.sort()

print("no of odor descriptors: ", len(odor_list))

odors_df = pd.DataFrame(odor_list, columns=['desc'])
odors_df

no of odor descriptors:  138


Unnamed: 0,desc
0,
1,alcoholic
2,aldehydic
3,alliaceous
4,almond
...,...
133,warm
134,waxy
135,weedy
136,winey


In [131]:
# encode the descriptors
odor_dummies = mol_behaviors['Updated_Desc'].str.get_dummies(sep=';')
mol_behaviors_encoded = pd.concat([mol_behaviors, odor_dummies], axis=1)
mol_behaviors_encoded = mol_behaviors_encoded.drop(columns=['descriptors'])
mol_behaviors_encoded.head()

Unnamed: 0_level_0,IsomericSMILES,Updated_Desc,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,apple,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-955348933095,CCCCC=COC(=O)CCCCCCCC,fruity;herbal;oily;green;waxy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
-923209957509,CC(=O)OCC1C=CC(C(C)C)CC1,spicy;fruity;herbal;woody,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
-874408321546,CCCCCCCCC(OC(C)=O)C(=O)OC,peach;lactonic;apricot;buttery,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-873963935677,CCCCC=COC(=O)C(C)CCC,green;apple;tropical;fruity,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
-862841422647,CCCC(S)COCC,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
# check for molecules with no descriptors
required_encoded = mol_behaviors_encoded.drop(columns=['IsomericSMILES', 'Updated_Desc'])
no_desc_df = pd.DataFrame(required_encoded.sum(axis=1).sort_values(), columns=['count']).query('count==0')
no_desc_df

Unnamed: 0_level_0,count
CID,Unnamed: 1_level_1
175606,0
12388,0
8141,0
-862841422647,0
11511,0
31285,0
11507,0
15600,0
12389,0
356,0


In [133]:
# remove molecules with no descriptors
required_leffingwell_dataset = mol_behaviors_encoded.drop(no_desc_df.index).reset_index(drop=True)
required_leffingwell_dataset.head()

Unnamed: 0,IsomericSMILES,Updated_Desc,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,apple,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CCCCC=COC(=O)CCCCCCCC,fruity;herbal;oily;green;waxy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,CC(=O)OCC1C=CC(C(C)C)CC1,spicy;fruity;herbal;woody,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,CCCCCCCCC(OC(C)=O)C(=O)OC,peach;lactonic;apricot;buttery,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCCCC=COC(=O)C(C)CCC,green;apple;tropical;fruity,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,CCCCCCCC=CC(=O)OC(CCCCCCCC)C(=O)O,milky,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [134]:
required_leffingwell_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3510 entries, 0 to 3509
Columns: 139 entries, IsomericSMILES to woody
dtypes: int64(137), object(2)
memory usage: 3.7+ MB


In [135]:
# frequency per descriptor in the required_leffingwell_dataset
required_encoded_v2 = required_leffingwell_dataset.drop(columns=['IsomericSMILES', 'Updated_Desc'])
required_encoded_v2.sum().sort_values(ascending=False)

fruity          1413
green            913
sweet            842
floral           558
fatty            412
                ... 
sandalwood         4
powdery            3
orangeflower       3
soapy              3
weedy              1
Length: 137, dtype: int64

In [136]:
# save the curated dataset
required_leffingwell_dataset.to_csv('./curated_datasets/curated_leffingwell.csv', index=False)