# Conversion of reaction SMILES into text descriptions

In [1]:
import pandas as pd
import pubchempy
from tqdm import tqdm
from STOUT import translate_forward

## Create a set of molecules from all datasets

In [11]:
df1 = pd.read_csv('USPTO_R_smiles.csv')
df2 = pd.read_csv('ORD_R_smiles.csv')
df3 = pd.read_csv('USPTO_C_smiles.csv')
df4 = pd.read_csv('ORD_C_smiles.csv')

In [16]:
def reaction_to_molecules(r):
  
  """"This function converts reaction SMILES into a list of molecules"""
  
  all = []
  reactants = r.split('>')[0].split('.')
  product = r.split('>')[-1].split('.')
  all.extend(reactants)
  all.extend(product)
  if r.split('>')[1] != '':
    agents = r.split('>')[1].split('.')
    all.extend(agents)
  return all

In [17]:
reaction_to_molecules('CC(C)C[C@H](NC(=O)OC(C)(C)C)C(=O)O.O=C1CCC(=O)N1O>>CC(C)C[C@H](NC(=O)OC(C)(C)C)C(=O)ON1C(=O)CCC1=O')

['CC(C)C[C@H](NC(=O)OC(C)(C)C)C(=O)O',
 'O=C1CCC(=O)N1O',
 'CC(C)C[C@H](NC(=O)OC(C)(C)C)C(=O)ON1C(=O)CCC1=O']

In [18]:
def create_mols_list(df):
  
  """This function iterates over all reaction SMILES in a dataset and creates a list of molecules for the whole dataset"""
  
  mols_list = []
  for i in df.smiles.tolist():
    mols_list.extend(reaction_to_molecules(i))
  return mols_list

In [19]:
mols_list_1 = create_mols_list(df1)
mols_list_2 = create_mols_list(df2)
mols_list_3 = create_mols_list(df3)
mols_list_4 = create_mols_list(df4)

In [22]:
mols_list = mols_list_1 + mols_list_2 + mols_list_3 + mols_list_4
mols_set = list(set(mols_list))
len(mols_set)

66426

In [None]:
df_mols = pd.DataFrame({'molecule': mols_set})
df_mols

Unnamed: 0,molecule
0,NCC(=O)NCC(F)(F)F
1,O=C1CC(c2cccc(Cl)c2)C2(C(=O)Nc3cc(Cl)ccc32)C(c...
2,C=CCOC(=O)C1=C(c2ncc3sccn23)[C@H](C)[C@@H]2[C@...
3,CCOC(=O)CNCCN
4,O=C(O)C[C@@H]1Cc2cc(Br)c3[nH]nc(Br)c3c2CN(CC(F...
...,...
64345,Oc1cc(F)ccc1Br
64346,[CH3]C([CH3])(O)C1=[CH][CH]=[CH]C2=C(C3=[CH][C...
64347,N#CC1=[CH]C(C2=NC(N3[CH2][CH2]O[CH2][CH2]3)=NC...
64348,O=CC1=CCCCC1


In [None]:
df_mols.to_csv('all_molecules.csv', index=False)

## Convert SMILES to IUPAC names with PubChem API

In [23]:
df = pd.read_csv('all_molecules.csv')

In [3]:
# Convert molecular SMILES into IUPAC names with PubChem API and STOUT
for idx in tqdm(df.index):
    try:
        name = pubchempy.get_compounds(df.loc[idx, 'molecule'], 'smiles')[0].iupac_name
    except:
        name = translate_forward(df.loc[idx, 'molecule'])
    df.loc[idx, 'name'] = name
df.to_csv('mols_names.csv', index=False)

  0%|          | 0/5001 [00:00<?, ?it/s]

100%|██████████| 5001/5001 [4:00:29<00:00,  2.89s/it]      


## Create sentences

In [3]:
names = pd.read_csv('mols_names.csv')
names

Unnamed: 0,molecule,name
0,NCC(=O)NCC(F)(F)F,"2-amino-N-(2,2,2-trifluoroethyl)acetamide"
1,O=C1CC(c2cccc(Cl)c2)C2(C(=O)Nc3cc(Cl)ccc32)C(c...,6'-[3-bromo-2-fluoro-6-(2-hydroxyethoxy)phenyl...
2,C=CCOC(=O)C1=C(c2ncc3sccn23)[C@H](C)[C@@H]2[C@...,"prop-2-enyl (4S,5R,6S)-6-[(1R)-1-hydroxyethyl]..."
3,CCOC(=O)CNCCN,ethyl 2-(2-aminoethylamino)acetate
4,O=C(O)C[C@@H]1Cc2cc(Br)c3[nH]nc(Br)c3c2CN(CC(F...,"2-[(7S)-1,4-dibromo-8-oxo-9-(2,2,2-trifluoroet..."
...,...,...
66421,[CH3]N([CH2]C(=O)[OH])C1=[CH][CH]=[CH][CH]=[CH]1,2-(N-methylanilino)acetic acid
66422,CC(=N)NO,N'-hydroxyethanimidamide
66423,CC(C)(C)[Si](C)(C)O[C@H]1CCN(CC#CCn2ccnc2)C1=O,(3S)-3-[tert-butyl(dimethyl)silyl]oxy-1-(4-imi...
66424,Cc1ccc2nnc(Sc3ccc4ncc(Br)cc4c3)n2n1,"3-bromo-6-[(6-methyl-[1,2,4]triazolo[4,3-b]pyr..."


In [5]:
# Create a dictionary with SMILES and corresponding IUPAC names
names_dict = dict(zip(names.molecule.tolist(), names.name.tolist()))
names_dict

{'NCC(=O)NCC(F)(F)F': '2-amino-N-(2,2,2-trifluoroethyl)acetamide',
 'O=C1CC(c2cccc(Cl)c2)C2(C(=O)Nc3cc(Cl)ccc32)C(c2c(OCCO)ccc(Br)c2F)N1': "6'-[3-bromo-2-fluoro-6-(2-hydroxyethoxy)phenyl]-6-chloro-4'-(3-chlorophenyl)spiro[1H-indole-3,5'-piperidine]-2,2'-dione",
 'C=CCOC(=O)C1=C(c2ncc3sccn23)[C@H](C)[C@@H]2[C@@H]([C@@H](C)O)C(=O)N12': 'prop-2-enyl (4S,5R,6S)-6-[(1R)-1-hydroxyethyl]-3-imidazo[5,1-b][1,3]thiazol-5-yl-4-methyl-7-oxo-1-azabicyclo[3.2.0]hept-2-ene-2-carboxylate',
 'CCOC(=O)CNCCN': 'ethyl 2-(2-aminoethylamino)acetate',
 'O=C(O)C[C@@H]1Cc2cc(Br)c3[nH]nc(Br)c3c2CN(CC(F)(F)F)C1=O': '2-[(7S)-1,4-dibromo-8-oxo-9-(2,2,2-trifluoroethyl)-2,6,7,10-tetrahydropyrazolo[3,4-i][2]benzazepin-7-yl]acetic acid',
 'O=C1COc2cc(F)ccc2N1': '7-fluoro-4H-1,4-benzoxazin-3-one',
 'O=C1[NH][CH](C(=O)[OH])C2=[CH][CH]=[CH][CH]=C12': '3-oxo-1,2-dihydroisoindole-1-carboxylic acid',
 '[Al+3]': 'aluminum(3+)',
 'CC(C)(C)OC[C@H](CP(c1ccccc1)c1ccccc1)P(c1ccccc1)c1ccccc1': '[(2R)-1-diphenylphosphanyl-3-[(2-met

In [6]:
def reaction_to_dict(r):
  
  """This function converts reaction SMILES into a dictionary of the format {'reactants': [...], 'agents': [...], 'product': [...]}"""
  
  d = {}
  reactants = r.split('>')[0].split('.')
  d['reactants'] = [names_dict[i] for i in reactants]
  products = r.split('>')[-1].split('.')
  d['product'] = [names_dict[i] for i in products]
  if r.split('>')[1] != '':
    agents = r.split('>')[1].split('.')
    d['agents'] = [names_dict[i] for i in agents]
  return d

In [8]:
def create_sentance(d):
  
  """This function creates a coherent reaction description from the IUPAC names of molecules"""
  
  reactsnts = d['reactants']
  if len(reactsnts) > 1:
    part1 = f"{reactsnts[0]} and {reactsnts[1]} react together"
  else:
    part1 = f'{reactsnts[0]} reacts'

  if 'agents' in list(d.keys()):
    agents = ', '.join(d['agents'])
    part2 = f' in the presence of {agents}'
  else:
    part2 = ''
  product = d['product'][0]
  part3 = f' to produce {product}.'
  prompt = part1 + part2 + part3

  return prompt.capitalize()

In [9]:
def create_text_df(df):
    
    """This function creates a new dataframe with reaction descriptions instead of SMILES strings"""
    
    dicts = [reaction_to_dict(r) for r in df.smiles.tolist()]
    df['sentences'] = [create_sentance(d) for d in dicts]
    df = df[['sentences', 'split', 'not_high_yielding', 'high_yielding']]
    return df

In [12]:
df11 = create_text_df(df1)
df11

Unnamed: 0,sentences,split,not_high_yielding,high_yielding
0,(2s)-4-methyl-2-[(2-methylpropan-2-yl)oxycarbo...,train,0,1
1,Piperidine and tert-butyl 3-iodo-5-(methylsulf...,train,1,0
2,[1-(4-fluoronaphthalen-1-yl)-3-methyl-1-oxobut...,train,1,0
3,[6-[2-[[2-(3-fluoro-4-hydroxyphenyl)-2-oxoethy...,train,1,0
4,7-(4-fluoro-2-methoxyphenyl)-8-(hydroxymethyl)...,train,1,0
...,...,...,...,...
11295,2-(benzotriazol-1-yloxy)-4-[(1-methylindol-4-y...,val,1,0
11296,Acetyl acetate and 1-azabicyclo[2.2.2]octan-3-...,val,1,0
11297,1-chloro-2-methylpropan-2-ol and 4-hydroxybenz...,val,0,1
11298,"1-(bromomethyl)-3,5-difluorobenzene and 1-(6-c...",val,1,0


In [None]:
df22 = create_text_df(df2)
df33 = create_text_df(df3)
df44 = create_text_df(df4)

In [16]:
df11.to_csv('USPTO_R_text.csv', index=False)
df22.to_csv('ORD_R_text.csv', index=False)
df33.to_csv('USPTO_C_text.csv', index=False)
df44.to_csv('ORD_C_text.csv', index=False)