# Merge cleaned Leffingwell and Goodcents datasets

In [1]:
import pandas as pd
from rdkit import Chem

In [2]:
# load cleaned datasets
goods_df = pd.read_csv('curated_goodcents.csv')
leff_df = pd.read_csv('curated_leffingwell.csv')

In [3]:
goods_df.head()

Unnamed: 0,IsomericSMILES,Updated_Desc_v2,acidic,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CC(CN)O,fishy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CC(C)C(=O)C(=O)O,fruity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C(CC(=O)O)C(=O)C(=O)O,odorless,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCC(=O)C(=O)O,brown;sweet;caramellic;creamy;lactonic,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CC(C)CC(=O)C(=O)O,fruity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
goods_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Columns: 154 entries, IsomericSMILES to woody
dtypes: int64(152), object(2)
memory usage: 5.2+ MB


In [5]:
leff_df.head()

Unnamed: 0,IsomericSMILES,Updated_Desc,acidic,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CCCCC=COC(=O)CCCCCCCC,waxy;herbal;oily;fruity;green,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,CC(=O)OCC1C=CC(C(C)C)CC1,herbal;woody;fruity;spicy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,CCCCCCCCC(OC(C)=O)C(=O)OC,lactonic;buttery;apricot;peach,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCCCC=COC(=O)C(C)CCC,tropical;fruity;green;apple,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,CCCCCCCC=CC(=O)OC(CCCCCCCC)C(=O)O,milky,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
leff_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3510 entries, 0 to 3509
Columns: 153 entries, IsomericSMILES to woody
dtypes: int64(151), object(2)
memory usage: 4.1+ MB


In [7]:
goods_df = goods_df.rename(columns={'Updated_Desc_v2': 'descriptorsG'})

In [8]:
leff_df = leff_df.rename(columns={'Updated_Desc': 'descriptorsL'})

### Merge descriptors from smiles common in both datasets

In [9]:
common_smileslist = (leff_df['IsomericSMILES'][leff_df['IsomericSMILES'].isin(goods_df['IsomericSMILES'])])

In [10]:
list_idx_common_leff = list(common_smileslist.index)
# list_idx_common_leff

In [11]:
common_smileslist = (goods_df['IsomericSMILES'][goods_df['IsomericSMILES'].isin(leff_df['IsomericSMILES'])])

In [12]:
list_idx_common_goods = list(common_smileslist.index)
# list_idx_common_goods

In [13]:
# inner join common smiles
inner_join_df = pd.merge(leff_df[['IsomericSMILES', 'descriptorsL']], goods_df[['IsomericSMILES', 'descriptorsG']], how='inner', on='IsomericSMILES')

In [14]:
inner_join_df.head()

Unnamed: 0,IsomericSMILES,descriptorsL,descriptorsG
0,CC(CN)O,fishy,fishy
1,CCC(=O)C(=O)O,creamy;caramellic;brown;fatty;sweet,brown;sweet;caramellic;creamy;lactonic
2,C1=CC=C(C=C1)CCC(=O)O,sweet;balsamic,floral;sweet;cinnamon;rose;musk;fatty
3,C1=CC(=CC=C1CO)O,medicinal;nutty;phenolic;sweet;fruity,sweet;fruity;almond;bitter;coconut
4,C1=CC(=CC=C1C=O)O,almond;nutty;phenolic;vanilla;hay;woody;balsamic,balsamic;metallic;nutty;brown;sweet;almond;hon...


In [15]:
inner_join_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197 entries, 0 to 2196
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  2197 non-null   object
 1   descriptorsL    2197 non-null   object
 2   descriptorsG    2197 non-null   object
dtypes: object(3)
memory usage: 51.6+ KB


In [16]:
inner_join_df = inner_join_df.reset_index(names='index', drop=False)
inner_join_df = inner_join_df.set_index('index')

In [17]:
inner_join_df

Unnamed: 0_level_0,IsomericSMILES,descriptorsL,descriptorsG
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,CC(CN)O,fishy,fishy
1,CCC(=O)C(=O)O,creamy;caramellic;brown;fatty;sweet,brown;sweet;caramellic;creamy;lactonic
2,C1=CC=C(C=C1)CCC(=O)O,sweet;balsamic,floral;sweet;cinnamon;rose;musk;fatty
3,C1=CC(=CC=C1CO)O,medicinal;nutty;phenolic;sweet;fruity,sweet;fruity;almond;bitter;coconut
4,C1=CC(=CC=C1C=O)O,almond;nutty;phenolic;vanilla;hay;woody;balsamic,balsamic;metallic;nutty;brown;sweet;almond;hon...
...,...,...,...
2192,CC1=C(C=CO1)SSCSC,beefy;meaty;sulfurous;roasted,meaty;brown
2193,CCCCCCCC=CC(=O)OCCCC,woody;fatty,peach;clean;fruity;oily;apricot;dairy;citrus;f...
2194,CCC=CCCCC1CCCC(=O)O1,fatty;dairy;waxy,fatty;fruity
2195,CC1C(=O)C=C2C1(C=CCC2(C)C)C,fruity,fruity


In [18]:
# merge descriptors from both the datasets
def merge_desc(desc_combined):
    desc_list = desc_combined.split(';')
    new_desc_set = set()
    for desc in desc_list:
        if desc == 'odorless':
            continue
        new_desc_set.update([desc])
    if len(new_desc_set) == 0:
        new_desc_set.update(['odorless'])
    return ';'.join(new_desc_set)

In [19]:
# test merge_desc()
print(merge_desc('odorless;phenolic;nutty;nutty'))
print(merge_desc('odorless;odorless'))

phenolic;nutty
odorless


In [20]:
inner_join_df['merged_descriptors'] = inner_join_df.apply(lambda x: merge_desc(x['descriptorsL']+';'+x['descriptorsG']), axis=1)

In [21]:
inner_join_df

Unnamed: 0_level_0,IsomericSMILES,descriptorsL,descriptorsG,merged_descriptors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,CC(CN)O,fishy,fishy,fishy
1,CCC(=O)C(=O)O,creamy;caramellic;brown;fatty;sweet,brown;sweet;caramellic;creamy;lactonic,creamy;fatty;lactonic;brown;caramellic;sweet
2,C1=CC=C(C=C1)CCC(=O)O,sweet;balsamic,floral;sweet;cinnamon;rose;musk;fatty,balsamic;musk;rose;floral;fatty;cinnamon;sweet
3,C1=CC(=CC=C1CO)O,medicinal;nutty;phenolic;sweet;fruity,sweet;fruity;almond;bitter;coconut,almond;phenolic;coconut;nutty;fruity;sweet;med...
4,C1=CC(=CC=C1C=O)O,almond;nutty;phenolic;vanilla;hay;woody;balsamic,balsamic;metallic;nutty;brown;sweet;almond;hon...,vanilla;honey;phenolic;balsamic;woody;metallic...
...,...,...,...,...
2192,CC1=C(C=CO1)SSCSC,beefy;meaty;sulfurous;roasted,meaty;brown,roasted;sulfurous;meaty;brown;beefy
2193,CCCCCCCC=CC(=O)OCCCC,woody;fatty,peach;clean;fruity;oily;apricot;dairy;citrus;f...,green;clean;woody;apricot;citrus;fatty;fruity;...
2194,CCC=CCCCC1CCCC(=O)O1,fatty;dairy;waxy,fatty;fruity,waxy;fatty;dairy;fruity
2195,CC1C(=O)C=C2C1(C=CCC2(C)C)C,fruity,fruity,fruity


In [22]:
common_merged_df = inner_join_df[['IsomericSMILES', 'merged_descriptors']]
common_merged_df = common_merged_df.rename(columns={'merged_descriptors':'descriptors'})
common_merged_df.head()

Unnamed: 0_level_0,IsomericSMILES,descriptors
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CC(CN)O,fishy
1,CCC(=O)C(=O)O,creamy;fatty;lactonic;brown;caramellic;sweet
2,C1=CC=C(C=C1)CCC(=O)O,balsamic;musk;rose;floral;fatty;cinnamon;sweet
3,C1=CC(=CC=C1CO)O,almond;phenolic;coconut;nutty;fruity;sweet;med...
4,C1=CC(=CC=C1C=O)O,vanilla;honey;phenolic;balsamic;woody;metallic...


In [23]:
common_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2197 entries, 0 to 2196
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  2197 non-null   object
 1   descriptors     2197 non-null   object
dtypes: object(2)
memory usage: 51.5+ KB


### Merge other smiles from both datasets

In [24]:
uncommon_goods = goods_df[['IsomericSMILES', 'descriptorsG']].drop(list_idx_common_goods)
uncommon_goods = uncommon_goods.rename(columns={'descriptorsG':'descriptors'})
uncommon_goods.head()

Unnamed: 0,IsomericSMILES,descriptors
1,CC(C)C(=O)C(=O)O,fruity
2,C(CC(=O)O)C(=O)C(=O)O,odorless
4,CC(C)CC(=O)C(=O)O,fruity
5,C1=CC(=C(C=C1C(=O)O)O)O,balsamic;phenolic
7,C(CC(=O)O)CN,savory;meaty


In [25]:
uncommon_goods.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2201 entries, 1 to 4397
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  2201 non-null   object
 1   descriptors     2201 non-null   object
dtypes: object(2)
memory usage: 51.6+ KB


In [26]:
uncommon_leff = leff_df[['IsomericSMILES', 'descriptorsL']].drop(list_idx_common_leff)
uncommon_leff = uncommon_leff.rename(columns={'descriptorsL':'descriptors'})
uncommon_leff.head()

Unnamed: 0,IsomericSMILES,descriptors
0,CCCCC=COC(=O)CCCCCCCC,waxy;herbal;oily;fruity;green
1,CC(=O)OCC1C=CC(C(C)C)CC1,herbal;woody;fruity;spicy
2,CCCCCCCCC(OC(C)=O)C(=O)OC,lactonic;buttery;apricot;peach
3,CCCCC=COC(=O)C(C)CCC,tropical;fruity;green;apple
4,CCCCCCCC=CC(=O)OC(CCCCCCCC)C(=O)O,milky


In [27]:
uncommon_leff.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1313 entries, 0 to 3509
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  1313 non-null   object
 1   descriptors     1313 non-null   object
dtypes: object(2)
memory usage: 30.8+ KB


In [28]:
# combine dfs with uncommon smiles along with df with common smiles
combined_dataset = pd.concat([common_merged_df, uncommon_leff, uncommon_goods])
combined_dataset.head()

Unnamed: 0,IsomericSMILES,descriptors
0,CC(CN)O,fishy
1,CCC(=O)C(=O)O,creamy;fatty;lactonic;brown;caramellic;sweet
2,C1=CC=C(C=C1)CCC(=O)O,balsamic;musk;rose;floral;fatty;cinnamon;sweet
3,C1=CC(=CC=C1CO)O,almond;phenolic;coconut;nutty;fruity;sweet;med...
4,C1=CC(=CC=C1C=O)O,vanilla;honey;phenolic;balsamic;woody;metallic...


In [29]:
combined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5711 entries, 0 to 4397
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  5711 non-null   object
 1   descriptors     5711 non-null   object
dtypes: object(2)
memory usage: 133.9+ KB


In [30]:
# duplicate check
combined_dataset[combined_dataset.duplicated(keep=False)]

Unnamed: 0,IsomericSMILES,descriptors


In [31]:
combined_dataset = combined_dataset.reset_index(drop=True)

### Handle stereo isomers


In [32]:
def canonical_smiles(smiles):
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), isomericSmiles = True)

def remove_stereo(smiles):
    smiles = smiles.replace('@','')
    smiles = smiles.replace('/','')
    smiles = smiles.replace('\\','')
    return canonical_smiles(smiles)

# test remove_stereo
print(remove_stereo('CC/C=C\CC(C)OC(=O)@C'))
print(remove_stereo('CCC/C=C(\C)/C(=O)O'))
print(remove_stereo('CC1=CC(=C(O1)C)SC(=O)CC(C)C'))

CCC=CCC(C)OC(C)=O
CCCC=C(C)C(=O)O
Cc1cc(SC(=O)CC(C)C)c(C)o1


In [33]:
combined_dataset['nonStereoSMILES'] = combined_dataset['IsomericSMILES'].apply(lambda x: remove_stereo(x))

Now we can merge descriptors of duplicates that are created after removing stereochemistry.

In [34]:
combined_dataset[combined_dataset['nonStereoSMILES'].duplicated(keep=False)]


Unnamed: 0,IsomericSMILES,descriptors,nonStereoSMILES
27,C(C(C(=O)O)O)C(=O)O,acidic;sharp,O=C(O)CC(O)C(=O)O
30,CC(C(=O)O)O,buttery;sour,CC(O)C(=O)O
37,CC(C)CC(C(=O)O)N,bitter,CC(C)CC(N)C(=O)O
38,C(C(C(=O)O)O)(C(=O)O)O,caramellic,O=C(O)C(O)C(O)C(=O)O
39,CSCCC(C(=O)O)N,acidic;sulfurous,CSCCC(N)C(=O)O
...,...,...,...
5664,C[C@@H](C=O)[C@@H]1CC[C@](O1)(C)C=C,floral;fresh,C=CC1(C)CCC(C(C)C=O)O1
5671,CC[C@]1([C@@]2(CC[C@@H](C2)C1(C)C)C)O,earthy,CCC1(O)C2(C)CCC(C2)C1(C)C
5674,CC(C)C=C(CSC)C=O,sulfurous,CSCC(C=O)=CC(C)C
5686,C1COC(O1)/C=C\C2=CC=CC=C2,cinnamon;spicy;warm,C(=CC1OCCO1)c1ccccc1


In [35]:
# get duplicates
dup = combined_dataset[['nonStereoSMILES', 'descriptors']][combined_dataset['nonStereoSMILES'].duplicated(keep=False)]
index_set = set()
index_set.update(dup['nonStereoSMILES'].to_list())
for i in index_set:
    print(dup[dup['nonStereoSMILES'] == i])
    print()

               nonStereoSMILES                descriptors
1407  C=C(C)C1CC(=O)C2(C)OC2C1                       mint
2133  C=C(C)C1CC(=O)C2(C)OC2C1  mint;sweet;spicy;ethereal

          nonStereoSMILES                        descriptors
3108  CC=CC=CCOC(=O)C(C)C    tropical;fruity;pineapple;sweet
4059  CC=CC=CCOC(=O)C(C)C  apple;fruity;pear;pineapple;green
4947  CC=CC=CCOC(=O)C(C)C  apple;fruity;pear;pineapple;green

      nonStereoSMILES descriptors
3532  NCCCCC(N)C(=O)O    odorless
3547  NCCCCC(N)C(=O)O    odorless

     nonStereoSMILES                                  descriptors
4504   C=CC(O)CC=CCC  leafy;geranium;mushroom;marine;earthy;green
4989   C=CC(O)CC=CCC                                       earthy
5018   C=CC(O)CC=CCC               mushroom;metallic;green;marine

     nonStereoSMILES                 descriptors
166   CCC(O)c1ccccc1  sweet;floral;balsamic;rose
4685  CCC(O)c1ccccc1                      floral
4746  CCC(O)c1ccccc1                      floral

     nonStereoS

In [36]:
dup = combined_dataset[['nonStereoSMILES', 'descriptors']][combined_dataset['nonStereoSMILES'].duplicated(keep=False)]
index_set = set()
index_set.update(dup['nonStereoSMILES'].to_list())
list_idx_of_duplicates = []
for i in index_set:
    # print(list(dup[dup['CID'] == i].index))
    list_idx_of_duplicates.append(list(dup[dup['nonStereoSMILES'] == i].index))
# list_idx_of_duplicates


In [37]:
combined_dataset['descriptors_old'] = combined_dataset.loc[:, 'descriptors']
combined_dataset.head()

Unnamed: 0,IsomericSMILES,descriptors,nonStereoSMILES,descriptors_old
0,CC(CN)O,fishy,CC(O)CN,fishy
1,CCC(=O)C(=O)O,creamy;fatty;lactonic;brown;caramellic;sweet,CCC(=O)C(=O)O,creamy;fatty;lactonic;brown;caramellic;sweet
2,C1=CC=C(C=C1)CCC(=O)O,balsamic;musk;rose;floral;fatty;cinnamon;sweet,O=C(O)CCc1ccccc1,balsamic;musk;rose;floral;fatty;cinnamon;sweet
3,C1=CC(=CC=C1CO)O,almond;phenolic;coconut;nutty;fruity;sweet;med...,OCc1ccc(O)cc1,almond;phenolic;coconut;nutty;fruity;sweet;med...
4,C1=CC(=CC=C1C=O)O,vanilla;honey;phenolic;balsamic;woody;metallic...,O=Cc1ccc(O)cc1,vanilla;honey;phenolic;balsamic;woody;metallic...


In [38]:
for dup_set in list_idx_of_duplicates:
    desc_set = set()
    desc_string = ';'.join(combined_dataset['descriptors'][dup_set].to_list())
    for desc in desc_string.split(';'):
        if desc == 'odorless':
            continue
        desc_set.update([desc])
    if len(desc_set) == 0:
        desc_set.update(['odorless'])
    new_desc_string = ';'.join(desc_set)
    combined_dataset['descriptors'][dup_set] = new_desc_string

combined_dataset_cleaned = combined_dataset.drop_duplicates('nonStereoSMILES', keep='first').reset_index(drop=True)

In [39]:
combined_dataset_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986 entries, 0 to 4985
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   IsomericSMILES   4986 non-null   object
 1   descriptors      4986 non-null   object
 2   nonStereoSMILES  4986 non-null   object
 3   descriptors_old  4986 non-null   object
dtypes: object(4)
memory usage: 155.9+ KB


In [40]:
# check for duplicates
combined_dataset_cleaned[combined_dataset_cleaned['nonStereoSMILES'].duplicated(keep=False)]

Unnamed: 0,IsomericSMILES,descriptors,nonStereoSMILES,descriptors_old


In [41]:
combined_dataset_v2 = combined_dataset_cleaned[['nonStereoSMILES', 'descriptors']]
combined_dataset_v2.head()

Unnamed: 0,nonStereoSMILES,descriptors
0,CC(O)CN,fishy
1,CCC(=O)C(=O)O,creamy;fatty;lactonic;brown;caramellic;sweet
2,O=C(O)CCc1ccccc1,balsamic;musk;rose;floral;fatty;cinnamon;sweet
3,OCc1ccc(O)cc1,almond;phenolic;coconut;nutty;fruity;sweet;med...
4,O=Cc1ccc(O)cc1,vanilla;honey;phenolic;balsamic;woody;metallic...


In [42]:
combined_dataset_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986 entries, 0 to 4985
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   nonStereoSMILES  4986 non-null   object
 1   descriptors      4986 non-null   object
dtypes: object(2)
memory usage: 78.0+ KB


In [43]:
# encode descriptors
odor_dummies = combined_dataset_v2['descriptors'].str.get_dummies(sep=';')
combined_dataset_v2_encoded = pd.concat([combined_dataset_v2, odor_dummies], axis=1)

In [44]:
combined_dataset_v2_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986 entries, 0 to 4985
Columns: 154 entries, nonStereoSMILES to woody
dtypes: int64(152), object(2)
memory usage: 5.9+ MB


In [45]:
combined_dataset_v2_encoded.head()

Unnamed: 0,nonStereoSMILES,descriptors,acidic,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CC(O)CN,fishy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CCC(=O)C(=O)O,creamy;fatty;lactonic;brown;caramellic;sweet,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,O=C(O)CCc1ccccc1,balsamic;musk;rose;floral;fatty;cinnamon;sweet,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,OCc1ccc(O)cc1,almond;phenolic;coconut;nutty;fruity;sweet;med...,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,O=Cc1ccc(O)cc1,vanilla;honey;phenolic;balsamic;woody;metallic...,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1


Refernce from:
Brian K. Lee, Emily J. Mayhew, Benjamin Sanchez-Lengeling,
Jennifer N. Wei, Wesley W. Qian, Kelsie Little, Matthew Andres,
Britney B. Nguyen, Theresa Moloy, Jane K. Parker, Richard C. Gerkin,
Joel D. Mainland, Alexander B. Wiltschko

`A Principal Odor Map Unifies Diverse Tasks in Human Olfactory Perception preprint
<https://www.biorxiv.org/content/10.1101/2022.09.01.504602v4>`_.

The GoodScents (http://www.thegoodscentscompany.com/) and Leffingwell PMP 2001
(https://zenodo.org/record/4085098#.YqoYk8jMIUE) datasets each contain odorant molecules and
corresponding odor descriptors. Variations and misspellings of odor descriptors were merged, and
`any odor descriptor with <=30 occurrences in the dataset were discarded`.

In [46]:
encoded_combined_v2 = combined_dataset_v2_encoded.drop(columns=['nonStereoSMILES', 'descriptors'])
odors_df = pd.DataFrame(encoded_combined_v2.sum().sort_values(ascending=False), columns=['sum'])
odors_df.query('sum>0')

Unnamed: 0,sum
fruity,1902
green,1446
sweet,1429
floral,1100
herbal,756
...,...
tonka,28
mentholic,26
fungal,20
mossy,19


Below are test molecules from pre-print

In [47]:
# 2-methyl-2-hexenoic acid
print('2-methyl-2-hexenoic acid\n', combined_dataset_v2_encoded[['nonStereoSMILES', 'descriptors']][combined_dataset_v2_encoded['nonStereoSMILES']=='CCCC=C(C)C(=O)O'])
print()
# 2,5-dimethyl-3-thioisovalerylfuran
print('2,5-dimethyl-3-thioisovalerylfuran\n', combined_dataset_v2_encoded[['nonStereoSMILES', 'descriptors']][combined_dataset_v2_encoded['nonStereoSMILES']=='Cc1cc(SC(=O)CC(C)C)c(C)o1'])
print()
# 1-methyl-3-hexenyl acetate
print('1-methyl-3-hexenyl acetate\n', combined_dataset_v2_encoded[['nonStereoSMILES', 'descriptors']][combined_dataset_v2_encoded['nonStereoSMILES']=='CCC=CCC(C)OC(C)=O'])


2-methyl-2-hexenoic acid
       nonStereoSMILES    descriptors
4381  CCCC=C(C)C(=O)O  cheesy;fruity

2,5-dimethyl-3-thioisovalerylfuran
                nonStereoSMILES           descriptors
893  Cc1cc(SC(=O)CC(C)C)c(C)o1  creamy;cocoa;roasted

1-methyl-3-hexenyl acetate
         nonStereoSMILES   descriptors
2933  CCC=CCC(C)OC(C)=O  fruity;green


In [48]:
# size of final dataset
size = len(combined_dataset_v2_encoded)
size

4986

In [49]:

combined_dataset_v2_encoded.to_csv(f'curated_GS_LF_merged_{size}.csv', index=False)