In [1]:
import pyrfume
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from scipy.io import loadmat
import numpy as np
import pubchempy as pcp
from rdkit import Chem


In [2]:
def find_smiles(x):
    """Return the isomeric SMILES for a compound ID, passing the -1 sentinel through.

    Parameters
    ----------
    x : int-like
        Compound ID; it is converted with ``int(x)`` before the lookup.
        The value ``-1`` is treated as "no compound".

    Returns
    -------
    The ``isomeric_smiles`` attribute of the compound obtained from
    ``pcp.Compound.from_cid``, or ``-1`` when ``x`` is ``-1``.
    """
    # Sentinel: nothing to look up.
    if x == -1:
        return -1

    compound = pcp.Compound.from_cid(int(x))
    return compound.isomeric_smiles
    
    
def canonical_smiles(smiles):
    """Round-trip a SMILES string through RDKit and return the regenerated SMILES.

    The string is parsed with ``Chem.MolFromSmiles`` and written back with
    ``Chem.MolToSmiles(..., isomericSmiles=True)``.
    """
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol, isomericSmiles=True)

def remove_stereo(smiles):
    """Strip stereo markers from a SMILES string and re-canonicalise it.

    Parameters
    ----------
    smiles : str or -1
        SMILES string, or the sentinel ``-1`` meaning "no structure".

    Returns
    -------
    The result of ``canonical_smiles`` applied to ``smiles`` with every
    ``@``, ``/`` and ``\\`` character removed, or ``-1`` when the input is ``-1``.
    """
    # Sentinel: propagate unchanged.
    if smiles == -1:
        return -1

    stripped = smiles
    for marker in ('@', '/', '\\'):
        stripped = stripped.replace(marker, '')
    return canonical_smiles(stripped)

In [8]:
# Load the four processed tables of the dravnieks_1985 archive, in this order:
# molecules, stimuli, behavior_1, behavior_2.
molecules, stimuli, behavior_1, behavior_2 = map(
    pyrfume.load_data,
    (
        'dravnieks_1985/molecules.csv',
        'dravnieks_1985/stimuli.csv',
        'dravnieks_1985/behavior_1.csv',
        'dravnieks_1985/behavior_2.csv',
    ),
)

In [21]:
# Display the archive manifest (source metadata plus descriptions of the raw
# and processed files) as the cell output.
pyrfume.load_manifest('dravnieks_1985')

{'source': {'doi': '10.1520/DS61-EB',
  'title': 'Atlas of Odor Character Profiles',
  'authors': 'A Dravnieks',
  'tags': 'human;odorCharacter;academic',
  'extra': "Originally compiled by Dravnieks, digitization performed by Alice Roche (molecules, Bensafi lab) and Rafi Haddad (behavior, Sobel lab) to produce 'raw' material; mixtures subsequently removed in processing pipeline.  See also Dravnieks, 1982 (Science) for description of the 'applicability' and 'use' measures."},
 'raw': {'Dravnieks_molecules.xlsx': 'Information about stimuli in Dravnieks, 1985 as digitized by Bensafi lab',
  'DravnieksGrid.xlsx': 'Information about behavioral data in Dravnieks, 1985 as diitized by Sobel lab'},
 'processed': {'molecules.csv': 'Molecules used',
  'stimuli.csv': 'Stimuls mapped to CID and experimental conditions',
  'behavior_1.csv': 'Applicability of descriptors (average across panelists)',
  'behavior_2.csv': 'Use of descriptors (average across panelists)'},
 'code': {'main.py': 'Processin

In [18]:
# Preview the first five rows of the molecules table (head() defaults to 5).
molecules.head()

Unnamed: 0_level_0,MolecularWeight,IsomericSMILES,IUPACName,name
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
240,106.12,C1=CC=C(C=C1)C=O,benzaldehyde,benzaldehyde
263,74.12,CCCCO,butan-1-ol,1-butanol
264,88.11,CCCC(=O)O,butanoic acid,butyric acid
323,146.14,C1=CC=C2C(=C1)C=CC(=O)O2,chromen-2-one,coumarin
326,148.2,CC(C)C1=CC=C(C=C1)C=O,4-propan-2-ylbenzaldehyde,4-isopropylbenzaldehyde


In [19]:
# Preview the first five rows of the behavior_1 table (head() defaults to 5).
behavior_1.head()

Unnamed: 0_level_0,"FRUITY,CITRUS",LEMON,GRAPEFRUIT,ORANGE,"FRUITY,OTHER THAN CITRUS",PINEAPPLE,GRAPE JUICE,STRAWBERRY,"APPLE, FRUIT",PEAR,...,"PUTRID, FOUL, DECAYED","FECAL, LIKE MANURE",CADAVEROUS,SICKENING,"DRY, POWDERY",CHALKY,LIGHT,HEAVY,"COOL,COOLING",WARM
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abhexone_high,0.0,0.0,0.0,0.0,0.66,0.0,0.33,0.0,0.47,0.0,...,1.87,0.33,0.47,7.44,4.19,0.0,13.83,17.58,2.58,8.17
Acetophenone_high,1.57,0.52,0.26,0.41,8.29,0.52,1.26,1.74,0.0,1.02,...,1.63,0.58,0.68,10.14,4.08,0.0,12.06,29.34,9.89,17.95
AcetylPyridine_high,0.0,0.32,0.0,0.0,0.45,0.0,0.0,1.24,0.0,0.0,...,6.26,1.01,1.46,13.18,11.31,2.58,4.18,23.51,0.0,13.79
Adoxal_high,5.0,3.66,1.09,3.95,3.53,3.27,0.0,0.77,0.39,1.44,...,2.56,0.0,0.0,20.13,3.23,1.16,15.65,12.77,5.45,9.9
AldehydeC-16highcon_high,10.15,3.39,0.0,6.39,52.76,6.64,10.35,32.24,0.79,5.11,...,0.35,0.0,0.0,1.84,0.5,0.61,20.2,5.11,11.56,4.26


In [20]:
# Preview the first five rows of the stimuli table (head() defaults to 5).
stimuli.head()

Unnamed: 0_level_0,CAS,CID,Conc,Name
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abhexone_high,698-10-2,61199.0,high,Abhexone
Acetophenone_high,98-86-2,7410.0,high,Acetophenone
AcetylPyridine_high,1122-62-9,14286.0,high,AcetylPyridine
Adoxal_high,141-13-9,98403.0,high,Adoxal
AldehydeC-16highcon_high,77-83-8,6501.0,high,AldehydeC-16highcon


In [33]:
def _attach_molecules(beh_stim, molecules):
    """Join molecule properties onto a behavior+stimuli table, keeping its index.

    ``pd.merge(..., on='CID')`` matches the left ``CID`` column against the
    right ``CID`` index level and returns a fresh integer index, which would
    discard the stimulus labels. The left index is therefore moved into a
    column for the merge and restored afterwards.

    Parameters
    ----------
    beh_stim : pd.DataFrame
        Table indexed by stimulus and containing a ``CID`` column.
    molecules : pd.DataFrame
        Table whose ``CID`` is available as an index level or a column.

    Returns
    -------
    pd.DataFrame
        Inner join of the two tables on ``CID``, indexed like ``beh_stim``.
    """
    index_name = beh_stim.index.name or 'Stimulus'
    left = beh_stim.rename_axis(index_name).reset_index()
    return pd.merge(left, molecules, on='CID').set_index(index_name)


# Join each behavior table with the stimuli table on the shared Stimulus index,
# then attach molecule properties via CID while keeping Stimulus as the index.
beh_stim = pd.merge(behavior_1, stimuli, left_index=True, right_index=True)
beh_stim_mol_1 = _attach_molecules(beh_stim, molecules)


beh_stim = pd.merge(behavior_2, stimuli, left_index=True, right_index=True)
beh_stim_mol_2 = _attach_molecules(beh_stim, molecules)




In [34]:
# Add a stereo-free SMILES column to each merged table, derived from that
# same table's IsomericSMILES column.
beh_stim_mol_1['nonStereoSMILES'] = beh_stim_mol_1['IsomericSMILES'].apply(remove_stereo)
beh_stim_mol_2['nonStereoSMILES'] = beh_stim_mol_2['IsomericSMILES'].apply(remove_stereo)

In [37]:
# Write both merged tables to CSV files in the current working directory.
for frame, path in (
    (beh_stim_mol_1, './dravnieks1985_applicability_1.csv'),
    (beh_stim_mol_2, './dravnieks1985_use_2.csv'),
):
    frame.to_csv(path)