# Merge cleaned Leffingwell and Goodcents datasets

In [158]:
import pandas as pd
from rdkit import Chem

In [159]:
# load cleaned datasets
goods_df = pd.read_csv('./curated_datasets/curated_goodcents.csv')
leff_df = pd.read_csv('./curated_datasets/curated_leffingwell.csv')

In [160]:
goods_df.head()

Unnamed: 0,IsomericSMILES,Updated_Desc_v2,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,apple,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CC(CN)O,fishy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CC(C)C(=O)C(=O)O,fruity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C(CC(=O)O)C(=O)C(=O)O,odorless,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCC(=O)C(=O)O,creamy;caramellic;lactonic;sweet,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CC(C)CC(=O)C(=O)O,fruity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
goods_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4392 entries, 0 to 4391
Columns: 140 entries, IsomericSMILES to woody
dtypes: int64(138), object(2)
memory usage: 4.7+ MB


In [162]:
leff_df.head()

Unnamed: 0,IsomericSMILES,Updated_Desc,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,apple,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CCCCC=COC(=O)CCCCCCCC,fruity;herbal;oily;green;waxy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,CC(=O)OCC1C=CC(C(C)C)CC1,spicy;fruity;herbal;woody,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,CCCCCCCCC(OC(C)=O)C(=O)OC,peach;lactonic;apricot;buttery,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCCCC=COC(=O)C(C)CCC,green;apple;tropical;fruity,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,CCCCCCCC=CC(=O)OC(CCCCCCCC)C(=O)O,milky,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
leff_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3510 entries, 0 to 3509
Columns: 139 entries, IsomericSMILES to woody
dtypes: int64(137), object(2)
memory usage: 3.7+ MB


In [164]:
goods_df = goods_df.rename(columns={'Updated_Desc_v2': 'descriptorsG'})

In [165]:
leff_df = leff_df.rename(columns={'Updated_Desc': 'descriptorsL'})

### Merge descriptors from smiles common in both datasets

In [166]:
common_smileslist = (leff_df['IsomericSMILES'][leff_df['IsomericSMILES'].isin(goods_df['IsomericSMILES'])])

In [167]:
list_idx_common_leff = list(common_smileslist.index)
# list_idx_common_leff

In [168]:
common_smileslist = (goods_df['IsomericSMILES'][goods_df['IsomericSMILES'].isin(leff_df['IsomericSMILES'])])

In [169]:
list_idx_common_goods = list(common_smileslist.index)
# list_idx_common_goods

In [170]:
# inner join common smiles
inner_join_df = pd.merge(leff_df[['IsomericSMILES', 'descriptorsL']], goods_df[['IsomericSMILES', 'descriptorsG']], how='inner', on='IsomericSMILES')

In [171]:
inner_join_df.head()

Unnamed: 0,IsomericSMILES,descriptorsL,descriptorsG
0,CC(CN)O,fishy,fishy
1,CCC(=O)C(=O)O,creamy;fatty;caramellic;sweet,creamy;caramellic;lactonic;sweet
2,C1=CC=C(C=C1)CCC(=O)O,sweet;balsamic,cinnamon;floral;rose;musk;fatty;sweet
3,C1=CC(=CC=C1CO)O,phenolic;medicinal;fruity;sweet;nutty,almond;coconut;bitter;fruity;sweet
4,C1=CC(=CC=C1C=O)O,almond;phenolic;woody;balsamic;vanilla;hay;nutty,almond;nutty;woody;honey;balsamic;metallic;van...


In [172]:
inner_join_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2195 entries, 0 to 2194
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  2195 non-null   object
 1   descriptorsL    2195 non-null   object
 2   descriptorsG    2195 non-null   object
dtypes: object(3)
memory usage: 51.6+ KB


In [173]:
inner_join_df = inner_join_df.reset_index(names='index', drop=False)
inner_join_df = inner_join_df.set_index('index')

In [174]:
inner_join_df

Unnamed: 0_level_0,IsomericSMILES,descriptorsL,descriptorsG
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,CC(CN)O,fishy,fishy
1,CCC(=O)C(=O)O,creamy;fatty;caramellic;sweet,creamy;caramellic;lactonic;sweet
2,C1=CC=C(C=C1)CCC(=O)O,sweet;balsamic,cinnamon;floral;rose;musk;fatty;sweet
3,C1=CC(=CC=C1CO)O,phenolic;medicinal;fruity;sweet;nutty,almond;coconut;bitter;fruity;sweet
4,C1=CC(=CC=C1C=O)O,almond;phenolic;woody;balsamic;vanilla;hay;nutty,almond;nutty;woody;honey;balsamic;metallic;van...
...,...,...,...
2190,CC1=C(C=CO1)SSCSC,beefy;meaty;sulfurous;roasted,meaty
2191,CCCCCCCC=CC(=O)OCCCC,fatty;woody,apricot;clean;peach;oily;dairy;green;fruity;ci...
2192,CCC=CCCCC1CCCC(=O)O1,fatty;dairy;waxy,fruity;fatty
2193,CC1C(=O)C=C2C1(C=CCC2(C)C)C,fruity,fruity


In [175]:
# merge descriptors from both the datasets
def merge_desc(desc_combined):
    desc_list = desc_combined.split(';')
    new_desc_set = set()
    for desc in desc_list:
        if desc == 'odorless':
            continue
        new_desc_set.update([desc])
    if len(new_desc_set) == 0:
        new_desc_set.update(['odorless'])
    return ';'.join(new_desc_set)

In [176]:
# test merge_desc()
print(merge_desc('odorless;phenolic;nutty;nutty'))
print(merge_desc('odorless;odorless'))

nutty;phenolic
odorless


In [177]:
inner_join_df['merged_descriptors'] = inner_join_df.apply(lambda x: merge_desc(x['descriptorsL']+';'+x['descriptorsG']), axis=1)

In [178]:
inner_join_df

Unnamed: 0_level_0,IsomericSMILES,descriptorsL,descriptorsG,merged_descriptors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,CC(CN)O,fishy,fishy,fishy
1,CCC(=O)C(=O)O,creamy;fatty;caramellic;sweet,creamy;caramellic;lactonic;sweet,lactonic;sweet;creamy;caramellic;fatty
2,C1=CC=C(C=C1)CCC(=O)O,sweet;balsamic,cinnamon;floral;rose;musk;fatty;sweet,floral;musk;balsamic;sweet;rose;cinnamon;fatty
3,C1=CC(=CC=C1CO)O,phenolic;medicinal;fruity;sweet;nutty,almond;coconut;bitter;fruity;sweet,nutty;phenolic;medicinal;sweet;coconut;fruity;...
4,C1=CC(=CC=C1C=O)O,almond;phenolic;woody;balsamic;vanilla;hay;nutty,almond;nutty;woody;honey;balsamic;metallic;van...,hay;nutty;phenolic;balsamic;vanilla;woody;hone...
...,...,...,...,...
2190,CC1=C(C=CO1)SSCSC,beefy;meaty;sulfurous;roasted,meaty,roasted;sulfurous;beefy;meaty
2191,CCCCCCCC=CC(=O)OCCCC,fatty;woody,apricot;clean;peach;oily;dairy;green;fruity;ci...,green;oily;citrus;peach;apricot;woody;fruity;d...
2192,CCC=CCCCC1CCCC(=O)O1,fatty;dairy;waxy,fruity;fatty,dairy;waxy;fatty;fruity
2193,CC1C(=O)C=C2C1(C=CCC2(C)C)C,fruity,fruity,fruity


In [179]:
common_merged_df = inner_join_df[['IsomericSMILES', 'merged_descriptors']]
common_merged_df = common_merged_df.rename(columns={'merged_descriptors':'descriptors'})
common_merged_df.head()

Unnamed: 0_level_0,IsomericSMILES,descriptors
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CC(CN)O,fishy
1,CCC(=O)C(=O)O,lactonic;sweet;creamy;caramellic;fatty
2,C1=CC=C(C=C1)CCC(=O)O,floral;musk;balsamic;sweet;rose;cinnamon;fatty
3,C1=CC(=CC=C1CO)O,nutty;phenolic;medicinal;sweet;coconut;fruity;...
4,C1=CC(=CC=C1C=O)O,hay;nutty;phenolic;balsamic;vanilla;woody;hone...


In [180]:
common_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2195 entries, 0 to 2194
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  2195 non-null   object
 1   descriptors     2195 non-null   object
dtypes: object(2)
memory usage: 51.4+ KB


### Merge other smiles from both datasets

In [181]:
uncommon_goods = goods_df[['IsomericSMILES', 'descriptorsG']].drop(list_idx_common_goods)
uncommon_goods = uncommon_goods.rename(columns={'descriptorsG':'descriptors'})
uncommon_goods.head()

Unnamed: 0,IsomericSMILES,descriptors
1,CC(C)C(=O)C(=O)O,fruity
2,C(CC(=O)O)C(=O)C(=O)O,odorless
4,CC(C)CC(=O)C(=O)O,fruity
5,C1=CC(=C(C=C1C(=O)O)O)O,phenolic;balsamic
7,C(CC(=O)O)CN,savory;meaty


In [182]:
uncommon_goods.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2197 entries, 1 to 4391
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  2197 non-null   object
 1   descriptors     2197 non-null   object
dtypes: object(2)
memory usage: 51.5+ KB


In [183]:
uncommon_leff = leff_df[['IsomericSMILES', 'descriptorsL']].drop(list_idx_common_leff)
uncommon_leff = uncommon_leff.rename(columns={'descriptorsL':'descriptors'})
uncommon_leff.head()

Unnamed: 0,IsomericSMILES,descriptors
0,CCCCC=COC(=O)CCCCCCCC,fruity;herbal;oily;green;waxy
1,CC(=O)OCC1C=CC(C(C)C)CC1,spicy;fruity;herbal;woody
2,CCCCCCCCC(OC(C)=O)C(=O)OC,peach;lactonic;apricot;buttery
3,CCCCC=COC(=O)C(C)CCC,green;apple;tropical;fruity
4,CCCCCCCC=CC(=O)OC(CCCCCCCC)C(=O)O,milky


In [184]:
uncommon_leff.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1315 entries, 0 to 3509
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  1315 non-null   object
 1   descriptors     1315 non-null   object
dtypes: object(2)
memory usage: 30.8+ KB


In [185]:
# combine dfs with uncommon smiles along with df with common smiles
combined_dataset = pd.concat([common_merged_df, uncommon_leff, uncommon_goods])
combined_dataset.head()

Unnamed: 0,IsomericSMILES,descriptors
0,CC(CN)O,fishy
1,CCC(=O)C(=O)O,lactonic;sweet;creamy;caramellic;fatty
2,C1=CC=C(C=C1)CCC(=O)O,floral;musk;balsamic;sweet;rose;cinnamon;fatty
3,C1=CC(=CC=C1CO)O,nutty;phenolic;medicinal;sweet;coconut;fruity;...
4,C1=CC(=CC=C1C=O)O,hay;nutty;phenolic;balsamic;vanilla;woody;hone...


In [186]:
combined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5707 entries, 0 to 4391
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IsomericSMILES  5707 non-null   object
 1   descriptors     5707 non-null   object
dtypes: object(2)
memory usage: 133.8+ KB


In [187]:
# duplicate check
combined_dataset[combined_dataset.duplicated(keep=False)]

Unnamed: 0,IsomericSMILES,descriptors


In [188]:
combined_dataset = combined_dataset.reset_index(drop=True)

### Handle stereo isomers


In [189]:
def canonical_smiles(smiles):
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), isomericSmiles = True)

def remove_stereo(smiles):
    smiles = smiles.replace('@','')
    smiles = smiles.replace('/','')
    smiles = smiles.replace('\\','')
    return canonical_smiles(smiles)

# test remove_stereo
print(remove_stereo('CC/C=C\CC(C)OC(=O)@C'))
print(remove_stereo('CCC/C=C(\C)/C(=O)O'))
print(remove_stereo('CC1=CC(=C(O1)C)SC(=O)CC(C)C'))

CCC=CCC(C)OC(C)=O
CCCC=C(C)C(=O)O
Cc1cc(SC(=O)CC(C)C)c(C)o1


In [190]:
combined_dataset['nonStereoSMILES'] = combined_dataset['IsomericSMILES'].apply(lambda x: remove_stereo(x))

Now we can merge descriptors of duplicates that are created after removing stereochemistry.

In [191]:
combined_dataset[combined_dataset['nonStereoSMILES'].duplicated(keep=False)]


Unnamed: 0,IsomericSMILES,descriptors,nonStereoSMILES
27,C(C(C(=O)O)O)C(=O)O,sharp,O=C(O)CC(O)C(=O)O
30,CC(C(=O)O)O,buttery;sour,CC(O)C(=O)O
37,CC(C)CC(C(=O)O)N,bitter,CC(C)CC(N)C(=O)O
38,C(C(C(=O)O)O)(C(=O)O)O,caramellic,O=C(O)C(O)C(O)C(=O)O
39,CSCCC(C(=O)O)N,sulfurous,CSCCC(N)C(=O)O
...,...,...,...
5660,C[C@@H](C=O)[C@@H]1CC[C@](O1)(C)C=C,fresh;floral,C=CC1(C)CCC(C(C)C=O)O1
5667,CC[C@]1([C@@]2(CC[C@@H](C2)C1(C)C)C)O,earthy,CCC1(O)C2(C)CCC(C2)C1(C)C
5670,CC(C)C=C(CSC)C=O,sulfurous,CSCC(C=O)=CC(C)C
5682,C1COC(O1)/C=C\C2=CC=CC=C2,spicy;warm;cinnamon,C(=CC1OCCO1)c1ccccc1


In [192]:
# get duplicates
dup = combined_dataset[['nonStereoSMILES', 'descriptors']][combined_dataset['nonStereoSMILES'].duplicated(keep=False)]
index_set = set()
index_set.update(dup['nonStereoSMILES'].to_list())
for i in index_set:
    print(dup[dup['nonStereoSMILES'] == i])
    print()

           nonStereoSMILES descriptors
3133  CCC=CCC=CC=CC(=O)OCC  green;pear
4754  CCC=CCC=CC=CC(=O)OCC       fatty

                            nonStereoSMILES  \
2804  CC(=O)OC(C)(C)C1CCC(C)C2=C(C1)C(C)CC2   
3890  CC(=O)OC(C)(C)C1CCC(C)C2=C(C1)C(C)CC2   

                                    descriptors  
2804         floral;woody;rose;sweet;earthy;tea  
3890  spicy;woody;balsamic;green;rose;fatty;tea  

           nonStereoSMILES             descriptors
446   CC1=CC(O)C(C(C)C)CC1  mint;sour;musty;herbal
4049  CC1=CC(O)C(C(C)C)CC1                  herbal
4050  CC1=CC(O)C(C(C)C)CC1                  herbal

         nonStereoSMILES                                      descriptors
3187  CC=CCCC1CCCC(=O)O1                 floral;fruity;peach;oily;apricot
4382  CC=CCCC1CCCC(=O)O1  apricot;oily;peach;floral;coconut;fruity;jasmin

     nonStereoSMILES                                 descriptors
1870      CCCCCC=CCO  green;oily;citrus;vegetable;cucumber;fatty
1972      CCCCCC=CCO    green;f

In [193]:
dup = combined_dataset[['nonStereoSMILES', 'descriptors']][combined_dataset['nonStereoSMILES'].duplicated(keep=False)]
index_set = set()
index_set.update(dup['nonStereoSMILES'].to_list())
list_idx_of_duplicates = []
for i in index_set:
    # print(list(dup[dup['CID'] == i].index))
    list_idx_of_duplicates.append(list(dup[dup['nonStereoSMILES'] == i].index))
# list_idx_of_duplicates


In [194]:
combined_dataset['descriptors_old'] = combined_dataset.loc[:, 'descriptors']
combined_dataset.head()

Unnamed: 0,IsomericSMILES,descriptors,nonStereoSMILES,descriptors_old
0,CC(CN)O,fishy,CC(O)CN,fishy
1,CCC(=O)C(=O)O,lactonic;sweet;creamy;caramellic;fatty,CCC(=O)C(=O)O,lactonic;sweet;creamy;caramellic;fatty
2,C1=CC=C(C=C1)CCC(=O)O,floral;musk;balsamic;sweet;rose;cinnamon;fatty,O=C(O)CCc1ccccc1,floral;musk;balsamic;sweet;rose;cinnamon;fatty
3,C1=CC(=CC=C1CO)O,nutty;phenolic;medicinal;sweet;coconut;fruity;...,OCc1ccc(O)cc1,nutty;phenolic;medicinal;sweet;coconut;fruity;...
4,C1=CC(=CC=C1C=O)O,hay;nutty;phenolic;balsamic;vanilla;woody;hone...,O=Cc1ccc(O)cc1,hay;nutty;phenolic;balsamic;vanilla;woody;hone...


In [195]:
for dup_set in list_idx_of_duplicates:
    desc_set = set()
    desc_string = ';'.join(combined_dataset['descriptors'][dup_set].to_list())
    for desc in desc_string.split(';'):
        if desc == 'odorless':
            continue
        desc_set.update([desc])
    if len(desc_set) == 0:
        desc_set.update(['odorless'])
    new_desc_string = ';'.join(desc_set)
    combined_dataset['descriptors'][dup_set] = new_desc_string

combined_dataset_cleaned = combined_dataset.drop_duplicates('nonStereoSMILES', keep='first').reset_index(drop=True)

In [196]:
combined_dataset_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4983 entries, 0 to 4982
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   IsomericSMILES   4983 non-null   object
 1   descriptors      4983 non-null   object
 2   nonStereoSMILES  4983 non-null   object
 3   descriptors_old  4983 non-null   object
dtypes: object(4)
memory usage: 155.8+ KB


In [197]:
# check for duplicates
combined_dataset_cleaned[combined_dataset_cleaned['nonStereoSMILES'].duplicated(keep=False)]

Unnamed: 0,IsomericSMILES,descriptors,nonStereoSMILES,descriptors_old


In [198]:
combined_dataset_v2 = combined_dataset_cleaned[['nonStereoSMILES', 'descriptors']]
combined_dataset_v2.head()

Unnamed: 0,nonStereoSMILES,descriptors
0,CC(O)CN,fishy
1,CCC(=O)C(=O)O,lactonic;sweet;creamy;caramellic;fatty
2,O=C(O)CCc1ccccc1,floral;musk;balsamic;sweet;rose;cinnamon;fatty
3,OCc1ccc(O)cc1,nutty;phenolic;medicinal;sweet;coconut;fruity;...
4,O=Cc1ccc(O)cc1,hay;nutty;phenolic;balsamic;vanilla;woody;hone...


In [199]:
combined_dataset_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4983 entries, 0 to 4982
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   nonStereoSMILES  4983 non-null   object
 1   descriptors      4983 non-null   object
dtypes: object(2)
memory usage: 78.0+ KB


In [200]:
# encode descriptors
odor_dummies = combined_dataset_v2['descriptors'].str.get_dummies(sep=';')
combined_dataset_v2_encoded = pd.concat([combined_dataset_v2, odor_dummies], axis=1)

In [201]:
combined_dataset_v2_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4983 entries, 0 to 4982
Columns: 140 entries, nonStereoSMILES to woody
dtypes: int64(138), object(2)
memory usage: 5.3+ MB


In [202]:
combined_dataset_v2_encoded.head()

Unnamed: 0,nonStereoSMILES,descriptors,alcoholic,aldehydic,alliaceous,almond,amber,animal,anisic,apple,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
0,CC(O)CN,fishy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CCC(=O)C(=O)O,lactonic;sweet;creamy;caramellic;fatty,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,O=C(O)CCc1ccccc1,floral;musk;balsamic;sweet;rose;cinnamon;fatty,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,OCc1ccc(O)cc1,nutty;phenolic;medicinal;sweet;coconut;fruity;...,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,O=Cc1ccc(O)cc1,hay;nutty;phenolic;balsamic;vanilla;woody;hone...,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


Refernce from:
Brian K. Lee, Emily J. Mayhew, Benjamin Sanchez-Lengeling,
Jennifer N. Wei, Wesley W. Qian, Kelsie Little, Matthew Andres,
Britney B. Nguyen, Theresa Moloy, Jane K. Parker, Richard C. Gerkin,
Joel D. Mainland, Alexander B. Wiltschko

`A Principal Odor Map Unifies Diverse Tasks in Human Olfactory Perception preprint
<https://www.biorxiv.org/content/10.1101/2022.09.01.504602v4>`_.

The GoodScents (http://www.thegoodscentscompany.com/) and Leffingwell PMP 2001
(https://zenodo.org/record/4085098#.YqoYk8jMIUE) datasets each contain odorant molecules and
corresponding odor descriptors. Variations and misspellings of odor descriptors were merged, and
`any odor descriptor with <=30 occurrences in the dataset were discarded`.

In [203]:
encoded_combined_v2 = combined_dataset_v2_encoded.drop(columns=['nonStereoSMILES', 'descriptors'])
odors_df = pd.DataFrame(encoded_combined_v2.sum().sort_values(ascending=False), columns=['sum'])
odors_df.query('sum>0')

Unnamed: 0,sum
fruity,1902
green,1446
sweet,1429
floral,1100
herbal,756
...,...
hawthorn,34
malty,33
orangeflower,33
bergamot,32


Below are test molecules from pre-print

In [204]:
# 2-methyl-2-hexenoic acid
print('2-methyl-2-hexenoic acid\n', combined_dataset_v2_encoded[['nonStereoSMILES', 'descriptors']][combined_dataset_v2_encoded['nonStereoSMILES']=='CCCC=C(C)C(=O)O'])
print()
# 2,5-dimethyl-3-thioisovalerylfuran
print('2,5-dimethyl-3-thioisovalerylfuran\n', combined_dataset_v2_encoded[['nonStereoSMILES', 'descriptors']][combined_dataset_v2_encoded['nonStereoSMILES']=='Cc1cc(SC(=O)CC(C)C)c(C)o1'])
print()
# 1-methyl-3-hexenyl acetate
print('1-methyl-3-hexenyl acetate\n', combined_dataset_v2_encoded[['nonStereoSMILES', 'descriptors']][combined_dataset_v2_encoded['nonStereoSMILES']=='CCC=CCC(C)OC(C)=O'])


2-methyl-2-hexenoic acid
       nonStereoSMILES    descriptors
4379  CCCC=C(C)C(=O)O  cheesy;fruity

2,5-dimethyl-3-thioisovalerylfuran
                nonStereoSMILES           descriptors
892  Cc1cc(SC(=O)CC(C)C)c(C)o1  roasted;cocoa;creamy

1-methyl-3-hexenyl acetate
         nonStereoSMILES   descriptors
2933  CCC=CCC(C)OC(C)=O  green;fruity


In [205]:
# size of final dataset
size = len(combined_dataset_v2_encoded)
size

4983

In [157]:

combined_dataset_v2_encoded.to_csv(f'./curated_datasets/curated_GS_LF_merged_{size}.csv', index=False)