# Processing of the ORD data
We used [the ORD schema](https://github.com/open-reaction-database/ord-schema) to access the ORD reactions data and covert it to the dataframe format. The resulting dataset can be downloaded from here (the link to the dataset will be provided upon paper acceptance due to the anonymity restrictions).

In [55]:
import pandas as pd
import numpy as np
from rdkit.Chem.rdChemReactions import RemoveMappingNumbersFromReactions
from rdkit.Chem import AllChem

In [34]:
df_orig = pd.read_csv('ord.tsv', sep='\t')
df_orig.drop(columns=df_orig.columns[0], inplace=True)
df_orig.head()

  df_orig = pd.read_csv('/content/drive/MyDrive/SYNCO/reactions_datasets/ord.tsv', sep='\t')


Unnamed: 0,dataset_id,reaction_id,reaction_type,reaction_smiles,inputs_REACTANT,inputs_REAGENT,inputs_SOLVENT,inputs_CATALYST,inputs_INTERNAL,inputs_UNSPECIFIED,...,outcomes_UNSPECIFIED,temperature_value,temperature_units,temperature_control_type,pressure_value,pressure_units,pressure_control_type,atmosphere,specific_conditions,yields
0,ord_dataset-00005539a1e04c809a9a78647bea649c,ord-56b1f4bfeebc4b8ab990b9804e798aa7,1.3.1 [N-arylation with Ar-X] Bromo Buchwald-H...,,"CC(C)N1CCNCC1, CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C...",C(=O)([O-])[O-].[Cs+].[Cs+],,C1=CC=C(C=C1)P(C2=CC=CC=C2)C3=C(C4=CC=CC=C4C=C...,,,...,,110.0,1,0,1.0,2,0,0,,65.38999938964844
1,ord_dataset-00005539a1e04c809a9a78647bea649c,ord-1169cbe9fa064a879ac34b2e524a4e69,1.3.4 [N-arylation with Ar-X] Iodo Buchwald-Ha...,,"C1=CC=C(C=C1)I, CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC",C(=O)([O-])[O-].[Cs+].[Cs+],COC1=CC=CC=C1,CC1(C2=C(C(=CC=C2)P(C3=CC=CC=C3)C4=CC=CC=C4)OC...,,,...,,100.0,1,0,1.0,2,0,0,,57.470001220703125
2,ord_dataset-00005539a1e04c809a9a78647bea649c,ord-13992005c22d4673aa802b5e140076e8,1.3.4 [N-arylation with Ar-X] Iodo Buchwald-Ha...,,"C1=CC=C(C=C1)I, CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC",C(=O)([O-])[O-].[Cs+].[Cs+],COC1=CC=CC=C1,CC1(C2=C(C(=CC=C2)P(C3=CC=CC=C3)C4=CC=CC=C4)OC...,,,...,,100.0,1,0,1.0,2,0,0,,65.43000030517578
3,ord_dataset-00005539a1e04c809a9a78647bea649c,ord-a36b48917c9942d1a34637511773ee1f,1.3.4 [N-arylation with Ar-X] Iodo Buchwald-Ha...,,"C1=CC=C(C=C1)I, CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC",C(=O)([O-])[O-].[Cs+].[Cs+],COC1=CC=CC=C1,CC1(C2=C(C(=CC=C2)P(C3=CC=CC=C3)C4=CC=CC=C4)OC...,,,...,,100.0,1,0,1.0,2,0,0,,75.06999969482422
4,ord_dataset-00005539a1e04c809a9a78647bea649c,ord-5fc624fd97b7430eafbe8dcc049d170b,0.0 [Unassigned] Unrecognized,,"CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C, C1=CC(=CC=C...",C(=O)([O-])[O-].[Cs+].[Cs+],CC(=O)N(C)C,CC1(C2=C(C(=CC=C2)P(C3=CC=CC=C3)C4=CC=CC=C4)OC...,,,...,,150.0,1,0,1.0,2,0,0,,46.31999969482422


In [91]:
df = df_orig.copy()

In [92]:
df.shape

(2271134, 24)

In [93]:
# Delete reactions with unreported yields
df = df[df.yields.isna()==False]
df.shape

(198950, 24)

In [94]:
# Make a list of yields where multiple yields are reported
df["yields"] = df["yields"].apply(lambda x: x.split(','))

In [95]:
# Delete reactions with multile NaN yields
idxs = []
for idx in df.index:
  if all(df.loc[idx, 'yields']) and 'None' in df.loc[idx, 'yields']:
    idxs.append(idx)
df = df.drop(index=idxs)
df.shape

(99718, 24)

In [96]:
# Leave only reactions with one reported yield
df['yields'] = df.yields.apply(lambda x: [i for i in x if i!=' None'])
df['len'] = df.yields.apply(lambda x: len(x))
df = df[df.len==1]
df.drop(columns=['len'], inplace=True)
df.shape

(98805, 24)

In [97]:
# Delete reactions with unreported SMILES
df = df[df.reaction_smiles.isna()==False]
df.shape

(85663, 24)

In [98]:
# Convert yields to floats and round to the whole number
df.yields = df.yields.apply(lambda x: np.round(float(x[0]), 1))

In [99]:
def regenerate_reaction_smiles(smiles):

  """This function regenerates and canonicalizes reaction SMILES"""

  rxn = AllChem.ReactionFromSmarts(smiles)
  RemoveMappingNumbersFromReactions(rxn)
  new_smiles = AllChem.ReactionToSmiles(rxn)
  return new_smiles

In [100]:
df['standard_reaction_smiles'] = df.reaction_smiles.apply(regenerate_reaction_smiles)

In [101]:
# Drop duplicates
ord_df = df[['standard_reaction_smiles', 'yields']]
ord_df.drop_duplicates(subset=['standard_reaction_smiles', 'yields'], inplace=True)
ord_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ord_df.drop_duplicates(subset=['standard_reaction_smiles', 'yields'], inplace=True)


(55586, 2)

In [71]:
# The datasets obtained in the 1_uspto_processing.ipynb notebook
df_train = pd.read_csv('US_patents_1976-Sep2016_1product_reactions_yield_ok_cropped_data_train.csv', sep='\t')
df_test = pd.read_csv('US_patents_1976-Sep2016_1product_reactions_yield_ok_cropped_data_test.csv', sep='\t')
df_val = pd.read_csv('US_patents_1976-Sep2016_1product_reactions_yield_ok_cropped_data_valid.csv', sep='\t')

In [102]:
df_uspto = pd.concat([df_train, df_test, df_val])[['CanonicalizedReaction', 'Yield']]
df_uspto.reset_index(drop=True, inplace=True)
df_uspto.head()

Unnamed: 0,CanonicalizedReaction,Yield
0,CCOC(=O)CC1(O)Cc2ccccc2N(C)c2ccc(SCC)cc21>CCO....,82.0
1,CCOC(=O)C=C1Cc2ccccc2N(C)c2ccc(SCC)cc21>CCO.Cl...,78.1
2,CCSc1ccc2c(c1)C(CC(=O)N(C)C)=Cc1ccccc1N2C>C1CC...,82.1
3,CC(C)(C)NNC(C)(C#N)C1CC1>BrBr.ClCCl.O>CC(C)(C)...,81.9
4,CC(C)(C)NNC(C)(C#N)C1CC1.O=C1CCCCCCC1>CC(=O)C1...,100.0


In [103]:
# Find reactions that overlap with the USPTO dataset
ord = set(df.standard_reaction_smiles.tolist())
uspto = set(df_uspto.CanonicalizedReaction.tolist())
both = uspto.intersection(ord)
both

{'FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)I.[Br-]>Br[Cu]Br>FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)Br'}

In [104]:
# This reaction has the same yield, so we can drop it from the ORD data
print(df_uspto[df_uspto.CanonicalizedReaction==list(both)[0]].Yield)
print(df[df.standard_reaction_smiles==list(both)[0]].yields)

48123    83.5
Name: Yield, dtype: float64
870363    83.5
Name: yields, dtype: float64


In [105]:
# Drop the overlapping reaction
ord_df = ord_df[ord_df.standard_reaction_smiles != list(both)[0]]
ord_df.shape

(55585, 2)

In [106]:
# Leave only yields in (0;100] range
ord_df = ord_df[(ord_df.yields > 0) & (ord_df.yields <= 100)]
ord_df.shape

(55493, 2)

In [108]:
ord_df.reset_index(drop=True, inplace=True)
ord_df

Unnamed: 0,standard_reaction_smiles,yields
0,Cl.[CH3]O[CH2][C@H]([NH2])C1=[CH][CH]=[CH][CH]...,22.1
1,Cl.[CH3]O[CH2][C@H]([NH2])C1=[CH][CH]=[CH][CH]...,57.3
2,CC(=O)[O-].[CH3]C(=O)C1=[CH]C2=[CH][CH]=[CH][C...,89.0
3,CO.[CH]1=[CH][CH]=C(OC2=[CH][CH]=C(C3=[CH][CH2...,66.0
4,Cl.Cl[CH2][CH2][NH][CH2][CH2]Cl.[NH2]C1=[CH][C...,75.0
...,...,...
55488,CCCCCCCCCCC[CH2]OS(=O)(=O)[O-].O=COC=O.[NH2]C1...,69.0
55489,CC(=O)OC(C)=O.CCCCCCCCCCCCOS(=O)(=O)[O-].[NH2]...,73.0
55490,CCCCCCCCCCC[CH2]OS(=O)(=O)[O-].O=COC=O.[NH2]C1...,71.0
55491,CC(C)(C)OC(=O)N1[CH2][CH2][CH2][C@H]1C(=O)[NH]...,57.0


In [None]:
ord_df.to_csv('ord_to_add.tsv', sep='\t', index=False)