In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)

In [2]:
data_artificial = pd.read_csv("data/20210205_all_expt_data_no_duplicates_solvent_calcs.csv")


In [3]:
# several molecules have 0 in experimetnal absorption results
data_artificial = data_artificial[data_artificial["peakwavs_max"] != 0]

In [4]:
# convert nm to ev
def nm2ev(wv):
    if wv is not None:
        return 1239.8 / wv
    else:
        return wv

In [5]:
# check problematic pigments which have short SMILES strings and 
# have a strong difference between experimental and computed data
# it is highly likely they are artifacts of text mining

# slope and intercept are the parameters colaculated by Greenman et al.
# to remove systematic error of TD-DFT calculation
slope = 1.82
intercept = 226.8

data_short_smiles = data_artificial[data_artificial["smiles"].apply(len) < 15]
difference = abs(data_short_smiles["peakwavs_max"] - slope * data_short_smiles["wb97xd3_def2svpd_orca_vac"] + intercept)

# we set a threshhold for difference between experimental and predicted values
suspicious_SMILES = data_short_smiles[difference.isna() | (difference > 100.)]["smiles"]
len(suspicious_SMILES)

46

In [6]:
# Examine where calculated data is different more than 100 nm from the experiment
# In this case there are potentially molecules which are not recognized correctly
# add these data to previously selected suspicious smiles

difference = abs(data_artificial["peakwavs_max"] - slope * data_artificial["wb97xd3_def2svpd_orca_vac"] + intercept)
suspicious_SMILES = data_artificial[difference > 100.]["smiles"] + suspicious_SMILES
len(suspicious_SMILES)

1815

In [7]:
# delete these suspicious SMILES string
data_artificial = data_artificial[~data_artificial["smiles"].isin(suspicious_SMILES)] 
len(data_artificial)

21144

In [8]:
# nm to eV conversion

data_artificial["peakwavs_max"] = data_artificial["peakwavs_max"].apply(nm2ev)

In [9]:
# !!! a set of natural compounds with wB97XD4 with TDA and solvation model calculated vertical excitation energies
# !!! this file does not come with the repo
# !!! To get it the input files has to be generated, and calculations should be run to get lowest excitation energies 
# !!! and merged with the dataset of natural compounds

natural_compounds = pd.read_csv("data/pigments_wb97xd4_tda_solv.csv")

In [10]:
# wb97xd4 - tda demonstrates a systematic error to remove it use slope and intercept obtained from linear models
# this is necessary to obtain linear unbiased estimation of mean absolute errors

slope = 0.71839382
intercept = 0.3203683403913211

natural_compounds["mae"] = abs(intercept + slope * natural_compounds['wavelength_wb97xd_tda_solvent'] - natural_compounds['lambda_max'])

In [11]:
# absorption energies are here in electron-volts;
# remove outliers from natural compounds dataset

len(natural_compounds[natural_compounds["mae"] < 0.5])

595

In [12]:
natural_compounds = natural_compounds[natural_compounds["mae"] < 0.5]
# rename column 'lambda_max' to "peakwavs_max"
natural_compounds['peakwavs_max'] = natural_compounds['lambda_max']
del natural_compounds['lambda_max']

In [14]:
# create train - test splitting for both ariticial and natural compounds
train_artificial = data_artificial.sample(frac = 0.9, random_state=42)
test_artificial = data_artificial.drop(train_artificial.index)
train_natural = natural_compounds.sample(frac = 0.9, random_state=4200)
test_natural = natural_compounds.drop(train_natural.index)

# add training label column for chemprop 2.0
train_artificial["split"] = ["train"] * len(train_artificial)
test_artificial["split"] = ["val"] * len(test_artificial)
train_natural["split"] = ["train"] * len(train_natural)
test_natural["split"] = ["test"] * len(test_natural)

# union train data from artificial and natural compounds
train_artificial = train_artificial[['smiles', 'solvent', 'peakwavs_max', 'split']]
test_artificial = test_artificial[['smiles', 'solvent', 'peakwavs_max', 'split']]
train_natural = train_natural[['smiles', 'solvent', 'peakwavs_max', 'split']]
test_natural = test_natural[['smiles', 'solvent', 'peakwavs_max', 'split']]

train_all = pd.concat((train_artificial, train_natural))
# data for training for chemprop 2.0
data_all = pd.concat((train_artificial, train_natural, test_artificial, test_natural))

In [15]:
# save all training data in csv format 
train_all.to_csv("data/train_all.csv")
test_artificial.to_csv("data/test_artificial.csv")
test_natural.to_csv("data/test_natural.csv")
data_all.to_csv("data_all.csv")