In [1]:
from fcd_torch import FCD
import torch
import pandas as pd
from sklearn.metrics import mean_squared_error as MSE

In [2]:
# Pick the compute device: use CUDA when torch reports it is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# FCD object from fcd_torch, bound to the chosen device and created with
# n_jobs=1. Later cells call `fcd.get_predictions(...)` on it.
fcd = FCD(device, n_jobs=1)

Load the dataframe containing the SMILES string and the corresponding InChIKey (column `InCHIKey`) for each chemical in the dataset.

In [3]:
# Read the SMILES / InCHIKey lookup table from disk. The displayed preview
# shows one row per chemical with a `SMILES` and an `InCHIKey` column.
inchikey_smiles_df = pd.read_csv(
    filepath_or_buffer='/home/cmdunham/ChemicalDataGeneration/data/inchikey_smiles.csv',
)

# Preview the first five rows as the cell output.
inchikey_smiles_df.head(n=5)

Unnamed: 0,SMILES,InCHIKey
0,CO/N=C(/C1=CC=CO1)\C(=O)N[C@H]2[C@@H]3N(C2=O)C...,JFPVXVDWJQMJEE-IZRZKJBUSA-N
1,c(c4)cnc(n4)N(C1)CCN(CCCCN(C(=O)2)C(=O)CC(C3)(...,QWCRAEMEVRGPNT-UHFFFAOYSA-N
2,CC(N)COc(c(C)1)c(C)ccc1,VLPIATFUUWWMKC-UHFFFAOYSA-N
3,CC[C@H]1CN2CC[C@H]1C[C@@H]2[C@H](C3=C4C=C(C=CC...,LJOQGZACKSYWCH-LHHVKLHASA-N
4,NC(=O)CS(=O)(=O)C(c(c2)cccc2)c(c1)cccc1,ZESNOWZYHYRSRY-UHFFFAOYSA-N


In [4]:
# Compute one embedding per chemical with `fcd.get_predictions`, keyed by
# InCHIKey. A separate accumulator is used so `chemception_embeddings` only
# ever refers to the final DataFrame.
embeddings_by_inchikey = {}

# InCHIKey -> exception raised, for every chemical whose embedding could not
# be computed. Kept so skipped rows can be inspected instead of vanishing.
embedding_failures = {}

for inchikey, smiles in zip(inchikey_smiles_df['InCHIKey'], inchikey_smiles_df['SMILES']):
    try:
        # One SMILES per call, so a failure on one entry only drops that entry;
        # `[0]` takes the single result back out of the returned batch.
        embeddings_by_inchikey[inchikey] = pd.Series(fcd.get_predictions([smiles])[0])
    except Exception as err:
        # Best-effort: skip entries that raise, but record why. Catching
        # `Exception` rather than using a bare `except` lets KeyboardInterrupt
        # and SystemExit still stop the loop.
        embedding_failures[inchikey] = err

if embedding_failures:
    print(
        f'Skipped {len(embedding_failures)} of {len(inchikey_smiles_df)} chemicals '
        f'whose embedding could not be computed; see `embedding_failures`.'
    )

# Dict of Series -> DataFrame: one column per InCHIKey, one row per embedding
# component.
chemception_embeddings = pd.DataFrame(embeddings_by_inchikey)
chemception_embeddings.head()

[17:02:05] Can't kekulize mol.  Unkekulized atoms: 2 4 6
[17:02:08] SMILES Parse Error: syntax error while parsing: InChI=1S/C15H10O3/c16-11-6-7-12-13(17)9-14(18-15(12)8-11)10-4-2-1-3-5-10/h1-9,16H
[17:02:08] SMILES Parse Error: Failed parsing SMILES 'InChI=1S/C15H10O3/c16-11-6-7-12-13(17)9-14(18-15(12)8-11)10-4-2-1-3-5-10/h1-9,16H' for input: 'InChI=1S/C15H10O3/c16-11-6-7-12-13(17)9-14(18-15(12)8-11)10-4-2-1-3-5-10/h1-9,16H'
[17:02:50] SMILES Parse Error: syntax error while parsing: COMPOUND
[17:02:50] SMILES Parse Error: Failed parsing SMILES 'COMPOUND' for input: 'COMPOUND'


Unnamed: 0,JFPVXVDWJQMJEE-IZRZKJBUSA-N,QWCRAEMEVRGPNT-UHFFFAOYSA-N,VLPIATFUUWWMKC-UHFFFAOYSA-N,LJOQGZACKSYWCH-LHHVKLHASA-N,ZESNOWZYHYRSRY-UHFFFAOYSA-N,YFGHCGITMMYXAQ-UHFFFAOYSA-N,QARQPIWTMBRJFX-UHFFFAOYSA-N,DGBIGWXXNGSACT-UHFFFAOYSA-N,JYGXADMDTFJGBT-VWUMJDOOSA-N,OGDVEMNWJVYAJL-LEPYJNQMSA-N,...,KLAONOISLHWJEE-QWRGUYRKSA-N,LZLREEUGSYITMX-JQWIXIFHSA-N,MFEVVAXTBZELLL-UHFFFAOYSA-N,MYVYPSWUSKCCHG-UHFFFAOYSA-N,NALWOULWGHTVDA-UWVGGRQHSA-N,NLKUJNGEGZDXGO-XVKPBYJWSA-N,NQIHMZLGCZNZBN-PXNSSMCTSA-N,NYQBYASWHVRESG-MIMYLULJSA-N,OHUXOEXBXPZKPT-STQMWFEESA-N,XAEWTDMGFGHWFK-IMJSIDKUSA-N
0,0.222976,-0.434184,-0.597442,-0.0529,0.046144,0.058485,0.076839,0.144537,0.002038,0.060651,...,0.218137,0.177137,0.479563,0.0979,0.358348,0.545235,0.242986,0.293669,0.100975,0.331567
1,0.055356,0.02926,0.002669,0.021672,0.003977,0.001022,0.003735,-0.000841,0.087391,0.052309,...,0.011965,0.005647,0.017953,0.004789,0.017625,0.011805,0.008076,0.01096,0.007847,0.011442
2,-0.188069,0.026573,-0.70276,0.055534,0.058832,-0.061164,0.113597,-0.63998,0.162651,0.083952,...,-0.304719,-0.322129,-0.609638,-0.085713,-0.52123,-0.66579,-0.652809,-0.444798,-0.492892,-0.11416
3,0.15591,0.477781,-0.352659,-0.276342,0.425317,0.613295,0.545351,0.481708,-0.324684,-0.110154,...,0.416946,0.754774,0.30349,0.741421,0.560687,0.268697,0.537932,0.302214,0.680996,0.172203
4,-0.13277,-0.004187,-0.040629,-0.278918,-0.383982,-0.09052,-0.291823,-0.030842,-0.336347,-0.160723,...,-0.078485,-0.04809,-0.073228,-0.096477,-0.081546,-0.042135,-0.023268,-0.057141,-0.15148,-0.034832


In [5]:
# Destination CSV for the embeddings table (one column per InCHIKey).
save_file = '/home/cmdunham/ChemicalDataGeneration/data/embeddings_df.csv'

# Pass the path directly so pandas opens and closes the file itself. A handle
# from a plain `open(path, 'w')` lacks `newline=''`, which pandas asks for when
# given a file object and which can otherwise double line endings on some
# platforms. `index=False` omits the row index, so only the InCHIKey columns
# are written.
chemception_embeddings.to_csv(save_file, index=False)