# A dataset of laymen olfactory perception for 74 monomolecular odors

Antonie L. Bierling, Alexander Croy, Tim Jesgarzewsky, Maria Rommel, Gianaurelio Cuniberti, Thomas Hummel and Ilona Croy

The molecular structure of an odor determines whether and how it is perceived by humans. However, the principles of how odorant chemistry links to perceptual patterns remain largely unknown and are primarily studied using odor rating datasets from highly trained olfactory experts, such as perfumers. This limits our knowledge of typical odor perception and its variability over individuals. We provide a dataset featuring free descriptions, evaluative ratings, and qualitative labels for 74 chemically diverse mono-molecular odors, rated by a large sample of young adults. A total of 1,227 participants described and rated the odors, and completed questionnaires covering their demographic background, personality traits, and the role of olfaction in their daily lives. The dataset offers a valuable foundation for research aimed at understanding the fundamentals of olfactory perception.

Import packages and load files

In [1]:
import pandas as pd
import numpy as np
from numpy import linalg as LA
import re
import pubchempy as pcp
from scipy import stats
from scipy.stats import levene, skew, kurtosis, linregress, pearsonr, spearmanr, pearsonr
from scipy.spatial import distance
from scipy.io import loadmat

import statsmodels.api as sm
from statsmodels.formula.api import mixedlm

import pingouin as pg
from pingouin import mixed_anova, read_dataset

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib import rcParams

# Set font size and family globally for readability
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams.update({'font.size': 8, 'font.family': 'Open Sans'})

import seaborn as sns
import selfies
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

import requests, time
from typing import Iterable, List, Dict
from rdkit import Chem
import re

In [2]:
def to_canonical(smi):
    """Return the canonical, non-isomeric SMILES for `smi`.

    Returns None when `smi` is missing (per `pd.isna`) or when RDKit
    cannot parse it into a molecule.
    """
    # Missing input short-circuits before RDKit is ever called.
    molecule = None if pd.isna(smi) else Chem.MolFromSmiles(smi)
    if molecule is None:
        return None
    return Chem.MolToSmiles(molecule, canonical=True, isomericSmiles=False)

In [3]:
def to_isosmiles(cid):
    """Look up a compound by CID via pubchempy and return its isomeric SMILES.

    The SMILES is also printed. Returns None when `cid` is missing
    (per `pd.isna`) or when the lookup yields None.
    """
    if not pd.isna(cid):
        compound = pcp.Compound.from_cid(cid)
        if compound is not None:
            iso_smiles = compound.isomeric_smiles
            print(iso_smiles)
            return iso_smiles
    return None

In [4]:
!ls '../ds'

CID-SMILES.gz


In [5]:
# CID -> SMILES lookup table: a gzip-compressed, tab-separated file without a
# header row, so the two column names are supplied here.
# (Earlier location of the file: "../../../KINGSTON/CID-SMILES.gz")
cid_smiles_read_options = dict(
    sep="\t",
    header=None,
    names=["cid", "IsomericSMILES"],
    compression="gzip",
    dtype={"cid": "int64", "IsomericSMILES": "string"},
)
cid_smiles_pubchem = pd.read_csv("../ds/CID-SMILES.gz", **cid_smiles_read_options)


In [None]:
# cid_smiles_pubchem = pd.read_csv(
#     # "../../../KINGSTON/CID-SMILES.gz",
#     sep="\t",
#     header=None,
#     names=["cid", "IsomericSMILES"],
#     compression="gzip",
#     dtype={"cid": "int64", "IsomericSMILES": "string"}
# )


TypeError: read_csv() missing 1 required positional argument: 'filepath_or_buffer'

## Bierling 2025

In [10]:
## Load the main data file (main study, retest and patient rows share one table)
data_file = pd.read_excel("rawdatasets/bierling2025/data.xlsx")
data_all = data_file  # same object under a second name, not a copy

# Row filters used to split the table
in_main_study = data_all["study"] == "main"
is_included = data_file["inclusion"] == 1
not_patient = data_file["sampling_group"] != "pat"

# Main-study rows of included, non-patient participants
data = data_all.loc[in_main_study & is_included & not_patient]
# Retest rows and patient rows, each taken from the full table
retest = data_all.loc[data_all["study"] == "retest"]
patients = data_all.loc[data_all["sampling_group"] == "pat"]

# Rows of excluded participants
excluded = data_all.loc[data_all["inclusion"] == 0]

## Load odors and intensity piloting data
odors = pd.read_excel("rawdatasets/bierling2025/odors.xlsx")
pilot = pd.read_excel("rawdatasets/bierling2025/intensity_piloting.xlsx")

# Lower-case every column name of the odor table
odors.columns = odors.columns.str.lower()

# Rating columns are renamed to the names used across the datasets
new_column_names = {
    'intensive': 'intensity',
    'pleasant': 'pleasantness',
    'familiar': 'familiarity',
    'ammonia/urinous': 'ammonia',
}
data = data.rename(columns=new_column_names)

In [11]:
# Step 1: Harmonise the odor table (column names, CID, molecular encodings,
# merged odor groups), then write the odor, response and merged tables to disk.
odors_copy = odors.copy()
odors_copy = odors_copy.rename(columns={"smiles": "IsomericSMILES","concentration_final": "concentration"})

# Clean the CID column into plain digit strings.
# A float-typed column stringifies as '8785.0'; without removing the trailing
# '.0' first, the digits-only filter below would turn it into '87850'.
odors_copy["cid"] = (
    odors_copy["cid"]
        .astype(str)
        .str.replace("\u00A0", "", regex=False)          # strip NBSPs
        .str.strip()
        .str.replace(r"\.0+$", "", regex=True)           # '8785.0' -> '8785'
        .str.replace(r"[^\d]", "", regex=True)           # keep digits only (drops 'CID', commas, spaces)
)

# Convert to numeric; invalids become <NA> (nullable integer)
odors_copy["cid"] = pd.to_numeric(odors_copy["cid"], errors="coerce").astype("Int64")


# Add canonical SMILES plus SELFIES encodings of both SMILES variants;
# rows without a SMILES get NaN in the SELFIES columns
odors_copy['CanonicalSMILES'] = odors_copy['IsomericSMILES'].apply(to_canonical)
odors_copy['IsomericSELFIES'] = odors_copy['IsomericSMILES'].apply(lambda x: selfies.encoder(x) if pd.notnull(x) else np.nan)
odors_copy['CanonicalSELFIES'] = odors_copy['CanonicalSMILES'].apply(lambda x: selfies.encoder(x) if pd.notnull(x) else np.nan)

# Merge all odor_group values of one molcode into a sorted, comma-separated string
odors_copy['odor_group_all'] = (
    odors_copy.groupby('molcode')['odor_group']
       .transform(lambda s: ','.join(sorted({str(x) for x in s.dropna()})))
)
# Keep one row per molcode (the first occurrence)
odors_copy_unique = odors_copy.drop_duplicates(subset='molcode', keep='first').reset_index(drop=True)
# The per-row odor_group column is superseded by odor_group_all
odors_copy_unique = odors_copy_unique.drop(columns=['odor_group'])

# Use the shared participant identifier column name
data = data.rename(columns={"code": "participant_id"})
data.to_csv("datasets/bierling2025/bierling2025_responses.csv", index=False)
odors_copy_unique.to_csv("datasets/bierling2025/bierling2025_odors.csv", index=False)
# Attach the odor metadata to every response row; the merged table uses lower-case column names
all_data= pd.merge(data, odors_copy_unique, on='molcode', how='left')
all_data.columns = all_data.columns.str.lower()
all_data.to_csv("datasets/bierling2025/bierling2025_data.csv", index=False)

# Sanity check: read the odor table back from disk and display it
pd.read_csv("datasets/bierling2025/bierling2025_odors.csv")


Unnamed: 0,molcode,name,iupac names,purity,IsomericSMILES,cid,cas,catalogue,concentration,volume_final,CanonicalSMILES,IsomericSELFIES,CanonicalSELFIES,odor_group_all
0,Benzyl,benzyl acetate,benzyl acetate,0.990,CC(=O)OCC1=CC=CC=C1,8785,140-11-4,8031811000,1/100,0.5ml,CC(=O)OCc1ccccc1,[C][C][=Branch1][C][=O][O][C][C][=C][C][=C][C]...,[C][C][=Branch1][C][=O][O][C][C][=C][C][=C][C]...,0
1,Decan,4-decanolide,5-hexyloxolan-2-one,0.980,CCCCCCC1CCC(=O)O1,12813,706-14-9,SAFAW236004-1KG-K,1/10,0.5ml,CCCCCCC1CCC(=O)O1,[C][C][C][C][C][C][C][C][C][C][=Branch1][C][=O...,[C][C][C][C][C][C][C][C][C][C][=Branch1][C][=O...,0
2,Benzal,benzaldehyde,benzaldehyde,0.990,O=CC1=CC=CC=C1,240,100-52-7,SAFF12010-250ML-F,1/100,0.5ml,O=Cc1ccccc1,[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1],[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1],1
3,Isocinna,isobutyl cinnamate,2-methylpropyl (E)-3-phenylprop-2-enoate,0.980,CC(C)COC(=O)\C=C\C1=CC=CC=C1,778574,122-67-8,SAFA537152-100G,undiluted,0.5ml,CC(C)COC(=O)C=Cc1ccccc1,[C][C][Branch1][C][C][C][O][C][=Branch1][C][=O...,[C][C][Branch1][C][C][C][O][C][=Branch1][C][=O...,1
4,Anisa,p-anisaldehyde,4-methoxybenzaldehyde,0.975,COC1=CC=C(C=O)C=C1,31244,123-11-5,8223140250,1/100,0.5ml,COc1ccc(C=O)cc1,[C][O][C][=C][C][=C][Branch1][Ring1][C][=O][C]...,[C][O][C][=C][C][=C][Branch1][Ring1][C][=O][C]...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,1Unde,1-undecanol,undecan-1-ol,0.980,CCCCCCCCCCCO,8184,112-42-5,ACRO140681000,1/10,0.5 ml,CCCCCCCCCCCO,[C][C][C][C][C][C][C][C][C][C][C][O],[C][C][C][C][C][C][C][C][C][C][C][O],109
70,Capron,caproic acid,hexanoic acid,0.980,CCCCCC(O)=O,8892,142-62-1,A13789AE,1/10,0.75 ml,CCCCCC(=O)O,[C][C][C][C][C][C][Branch1][C][O][=O],[C][C][C][C][C][C][=Branch1][C][=O][O],109
71,E2Hex,E-hex-2-enylacetat,E-hex-2-enylacetat,0.980,[H]\C(CCC)=C(\[H])COC(C)=O,17243,2497-18-9,A18786.18,1/10,0.5 ml,CCCC=CCOC(C)=O,[H][\C][Branch1][Ring2][C][C][C][=C][Branch1][...,[C][C][C][C][=C][C][O][C][Branch1][C][C][=O],109
72,Rcamph,(R)-(+)-camphor,"(1R,4R)-1,7,7-trimethylbicyclo[2.2.1]heptan-2-one",0.980,CC1(C)[C@H]2CC[C@@]1(C)C(=O)C2,159055,464-49-3,A10708.14,2g/10ml,0.75 ml,CC12CCC(CC1=O)C2(C)C,[C][C][Branch1][C][C][C@H1][C][C][C@@][Ring1][...,[C][C][C][C][C][Branch1][=Branch1][C][C][Ring1...,109


In [16]:
! ls 

Makefile                      [34membeddings[m[m
Mol.py                        extract_representations.ipynb
[34mdatasets[m[m                      llm_response.py
[31mds_curation.ipynb[m[m             [34mrawdatasets[m[m
ds_prep.ipynb                 requirements-conda.txt
ds_rep.ipynb                  requirements.txt
ds_utils.py                   ttt.py


## Keller 2016

In [6]:
# Read the raw Keller ratings workbook, skipping its first two rows
keller_vosshall_raw = pd.read_excel(
    "rawdatasets/keller2016/12868_2016_287_MOESM1_ESM.xlsx",
    skiprows=2,
)

## Rename columns to match format of the dataset
# Mapping from source header to harmonised column name
new_column_names = {
    # odor metadata
    'C.A.S.': 'cas',
    'Odor': 'name',
    'Odor dilution': 'concentration',
    # participant metadata
    'Subject # (this study) ': 'id',
    'Gender': 'sex',
    'Age': 'age',
    # ratings
    'HOW STRONG IS THE SMELL?': 'intensity',
    'HOW PLEASANT IS THE SMELL?': 'pleasantness',
    'HOW FAMILIAR IS THE SMELL?': 'familiarity',
    'subject # (dream challenge)': 'participant_id',
    'ammonia/urinous': 'ammonia',
}

# Apply the mapping, then trim and lower-case every column label
keller_vosshall_raw = keller_vosshall_raw.rename(columns=new_column_names)
keller_vosshall_raw.columns = list(
    map(lambda label: label.strip().lower(), keller_vosshall_raw.columns)
)

# Function to convert concentration string to float
def convert_concentration(concentration_str):
    """Convert a dilution written as a fraction (e.g. '1/1,000') to a float.

    Thousands separators (commas) are removed from both the numerator and
    the denominator before conversion.

    Args:
        concentration_str: Text of the form 'numerator/denominator', or a
            missing value (None / NaN / pd.NA).

    Returns:
        numerator / denominator as a float, or NaN for a missing value.

    Raises:
        ValueError: If the text does not contain exactly one '/' or a part
            is not a number.
        ZeroDivisionError: If the denominator is zero.
    """
    # Missing cells pass through as NaN instead of failing on `.split`.
    if pd.isna(concentration_str):
        return np.nan
    num, denom = concentration_str.split('/')
    return float(num.replace(',', '')) / float(denom.replace(',', ''))

# Apply conversion to each concentration in the DataFrame
keller_vosshall_raw['concentration'] = keller_vosshall_raw['concentration'].apply(convert_concentration)
# 3) Merge to get Isomeric SMILES
df = pd.merge(keller_vosshall_raw,cid_smiles_pubchem, on="cid", how="left")
# 4) Generate Canonical SMILES from Isomeric SMILES
df["CanonicalSMILES"] = df["IsomericSMILES"].apply(to_canonical)
df["IsomericSELFIES"] = df["IsomericSMILES"].apply(lambda x: selfies.encoder(x) if pd.notnull(x) else np.nan)
df["CanonicalSELFIES"] = df["CanonicalSMILES"].apply(lambda x: selfies.encoder(x) if pd.notnull(x) else np.nan)
df.columns = df.columns.str.lower()
df = df.rename(columns=new_column_names)
df.to_csv("datasets/keller2016/keller2016_data.csv", index=False)


In [10]:
# Display the column labels of `df` to check the harmonised names
df.columns

Index(['cas', 'catalogue #*', 'cid', 'name', 'concentration', 'id',
       'participant_id', 'sex',
       'race\n ("unknown" indicates\n that the subject did not\n wish to specify)',
       'ethnicity', 'age', 'vial #', 'can or can't smell',
       'know or don't know the smell', 'the odor is:', 'intensity',
       'pleasantness', 'familiarity', 'edible', 'bakery', 'sweet', 'fruit',
       'fish', 'garlic', 'spices', 'cold', 'sour', 'burnt', 'acid', 'warm',
       'musky', 'sweaty', 'ammonia', 'decayed', 'wood', 'grass', 'flower',
       'chemical', 'isomericsmiles', 'canonicalsmiles', 'isomericselfies',
       'canonicalselfies'],
      dtype='object')

## Sagar 2023

In [6]:

# ---------- Canonical vocabulary ----------
# Descriptor names that define the fixed set (and order) of rating columns.
# Every entry is passed through sanitize() before use, so letter case and
# surrounding whitespace (e.g. the leading space in ' Chemical') do not
# affect the resulting column names.
CANONICAL = [
    'Intensity', 'Pleasantness', 'Fishy', 'Burnt', 'Sour', 'Decayed', 'Musky',
    'Fruity', 'Sweaty', 'Cool', 'Floral', 'Sweet', 'Warm', 'Bakery', 'Spicy',
    'Ammonia', 'Edible', 'Familiarity',' Chemical', 'Garlic', 'Acid'
]

def sanitize(name: str) -> str:
    """Turn a descriptor label into a lower-case identifier.

    The label is stringified, trimmed and lower-cased; en/em dashes are
    treated as hyphens; runs of '-' or '/' and runs of whitespace each
    become a single underscore; any character other than 0-9, a-z or '_'
    is removed. An empty result is replaced by 'percept'.
    """
    text = re.sub(r'\s+', ' ', str(name).strip())        # collapse inner spaces
    text = text.replace('–', '-').replace('—', '-')      # normalize dashes
    text = text.strip().lower()
    substitutions = (
        (r'[-/]+', '_'),
        (r'\s+', '_'),
        (r'[^0-9a-z_]', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text if text else 'percept'

# Sanitized form of every canonical descriptor, in the order of CANONICAL
CANONICAL_S = list(map(sanitize, CANONICAL))

# Known spelling variants (already sanitized) -> canonical sanitized name
VARIANT_TO_CANON = dict(
    flowery='floral',
    bakery_like='bakery',
    spicey='spicy',
    familiar='familiarity',
    garlic_like='garlic',
    chemical_like='chemical',
    acidic='acid',
)

def norm_to_name(label: str):
    """Sanitize `label` and replace a known variant with its canonical name.

    Labels that are not keys of VARIANT_TO_CANON are returned in their
    sanitized form unchanged.
    """
    key = sanitize(label)
    return VARIANT_TO_CANON.get(key, key)

def get_field(behav, *names):
    """Return the value of the first of `names` listed in `behav._fieldnames`.

    Names are tried in the order given. Returns None when `behav` has no
    `_fieldnames` attribute or when none of the names is listed there.
    """
    if not hasattr(behav, "_fieldnames"):
        return None
    for candidate in names:
        if candidate in behav._fieldnames:
            return getattr(behav, candidate)
    return None

# Build one dataframe per subject from the per-subject .mat rating files.
subject_ids = [1, 2, 3]   # adjust if needed
dfs = []         # one dataframe per subject, appended in loop order

for subject_id in subject_ids:
    # Load the subject's file; the 'behav' entry is accessed by attribute below
    mat = loadmat(
        f'rawdatasets/sagar2023/behav_ratings_NEMO0{subject_id}.mat',
        squeeze_me=True, struct_as_record=False
    )
    behav = mat['behav']

    ratings = np.asarray(behav.ratings)                 # (n_trials, n_percepts)
    n_trials, n_percepts = ratings.shape

    # percept labels -> normalized names
    percepts_raw = [str(x) for x in np.atleast_1d(behav.percepts).tolist()]
    names = [norm_to_name(p) for p in percepts_raw]

    # group column indices by normalized name; a name that maps to more than
    # one column makes collapse_cols raise further down
    idx_groups = {}
    for j, nm in enumerate(names):
        idx_groups.setdefault(nm, []).append(j)

    # NOTE(review): disabled code kept for reference; `extra_union` is not
    # defined anywhere in this cell, so it cannot be re-enabled as is.
    # # split into canonical vs extras
    # is_canon = {nm: (nm in CANONICAL_S) for nm in idx_groups}
    # extras_here = [nm for nm in idx_groups if not is_canon[nm]]
    # # grow global union of extras
    # for nm in extras_here:
    #     if nm not in extra_union:
    #         extra_union.append(nm)

    # other fields, flattened to 1-D; missing detect / rt_detect become all-NaN
    # vectors of length n_trials
    cid = np.asarray(behav.cid).reshape(-1) 
    detect = np.asarray(behav.detect).reshape(-1) if hasattr(behav, "detect") else np.full(n_trials, np.nan)
    rt_val = get_field(behav, "rt_detect")
    rt_detect = np.asarray(rt_val).reshape(-1) if rt_val is not None else np.full(n_trials, np.nan)

    # reliability vector (per percept), read from "reliability" or else "rel";
    # None when neither field exists
    rel_vec_val = get_field(behav, "reliability", "rel")
    rel_vec = np.asarray(rel_vec_val).reshape(-1) if rel_vec_val is not None else None

    # Return the single rating column of a descriptor. Duplicated descriptors
    # are NOT averaged: more than one column raises ValueError.
    def collapse_cols(cols):
        if len(cols) == 1:
            return ratings[:, cols[0]]
        else:
            raise ValueError(
                f"Multiple columns {cols} for subject {subject_id} in {names[cols[0]]}"
            )

    # Return the reliability of the descriptor's first column, or NaN when the
    # subject has no reliability vector.
    def collapse_rel(cols):
        if rel_vec is None:
            return np.nan
        return float(rel_vec[cols[0]])

    # base df: one row per trial
    df = pd.DataFrame({
        "participant_id": np.full(n_trials, subject_id, dtype=int),
        "cid": cid,
        "detect": detect,
        "rt_detect": rt_detect
    })

    # ---- add canonical ratings in fixed canonical order; fill NaN if missing ----
    # each descriptor also gets a rel_<name> column holding one reliability
    # value repeated on every row
    for cname in CANONICAL_S:
        cols = idx_groups.get(cname, [])
        print(f"Subject {subject_id}: {cname} -> cols {cols}")
        df[cname] = collapse_cols(cols) if cols else np.nan
        df[f"rel_{cname}"] = collapse_rel(cols) if cols else np.nan

    # ---- non-canonical descriptors are not added; only canonical columns are kept ----
    dfs.append(df)

# ---------- give every subject frame the same columns in the same order ----------
ALL_RATING_COLS = CANONICAL_S
ALL_REL_COLS = ["rel_" + rating_col for rating_col in ALL_RATING_COLS]
BASE_COLS = ["participant_id", "cid", "detect", "rt_detect"]

aligned = []
for df in dfs:
    # add any rating or reliability column the frame lacks, filled with NaN
    # (ratings are checked first, then reliabilities; the frame in `dfs` is modified)
    for expected in ALL_RATING_COLS + ALL_REL_COLS:
        if expected not in df.columns:
            df[expected] = np.nan
    # fixed layout: base columns, then ratings, then reliabilities
    df = df[BASE_COLS + ALL_RATING_COLS + ALL_REL_COLS]
    aligned.append(df)

# ---------- stack the subjects, attach odor metadata, write to disk ----------
responses = pd.concat(aligned, ignore_index=True)

# Odor name table, with its columns renamed to the shared names
odor_names = pd.read_excel("rawdatasets/sagar2023/odor_names.xlsx")
odor_names = odor_names.rename(
    columns={"CID_NEMO": "cid", "Odor": "name", "Concentration": "concentration"}
)

# Left-join the CID -> SMILES table to attach the isomeric SMILES
odor_names = odor_names.merge(cid_smiles_pubchem, on='cid', how='left')

# Canonical SMILES, then SELFIES for both SMILES variants (NaN where SMILES is null)
encode_or_nan = lambda smi: selfies.encoder(smi) if pd.notnull(smi) else np.nan
odor_names["CanonicalSMILES"] = odor_names["IsomericSMILES"].apply(to_canonical)
odor_names["IsomericSELFIES"] = odor_names["IsomericSMILES"].apply(encode_or_nan)
odor_names["CanonicalSELFIES"] = odor_names["CanonicalSMILES"].apply(encode_or_nan)

# Every response row gets the metadata of its odor; labels are lower-cased before saving
all_data = responses.merge(odor_names, on='cid', how='left')
all_data.columns = all_data.columns.str.lower()
all_data.to_csv("datasets/sagar2023/sagar2023_data.csv", index=False)


Subject 1: intensity -> cols [0]
Subject 1: pleasantness -> cols [1]
Subject 1: fishy -> cols [2]
Subject 1: burnt -> cols [3]
Subject 1: sour -> cols [4]
Subject 1: decayed -> cols [5]
Subject 1: musky -> cols [6]
Subject 1: fruity -> cols [7]
Subject 1: sweaty -> cols [8]
Subject 1: cool -> cols [9]
Subject 1: floral -> cols [11]
Subject 1: sweet -> cols [12]
Subject 1: warm -> cols [13]
Subject 1: bakery -> cols [14]
Subject 1: spicy -> cols [16]
Subject 1: ammonia -> cols []
Subject 1: edible -> cols []
Subject 1: familiarity -> cols []
Subject 1: chemical -> cols [10]
Subject 1: garlic -> cols [15]
Subject 1: acid -> cols [17]
Subject 2: intensity -> cols [0]
Subject 2: pleasantness -> cols [1]
Subject 2: fishy -> cols [2]
Subject 2: burnt -> cols [3]
Subject 2: sour -> cols [4]
Subject 2: decayed -> cols [5]
Subject 2: musky -> cols [6]
Subject 2: fruity -> cols [7]
Subject 2: sweaty -> cols [8]
Subject 2: cool -> cols [9]
Subject 2: floral -> cols [10]
Subject 2: sweet -> cols [