# Processing of the USPTO data

We used Schwaller's version of the USPTO dataset, which was originally introduced in [this paper](https://pubs.rsc.org/en/content/articlelanding/2018/SC/C8SC02339E). The datasets can be downloaded from the [link](https://ibm.ent.box.com/v/ReactionSeq2SeqDataset) provided in that paper.

In [None]:
import pandas as pd
import numpy as np

In [12]:
# Load the three splits (test, train, valid). Every file is tab-separated and
# its first two lines are skipped before the header row is read.
df_test, df_train, df_valid = [
  pd.read_csv(csv_path, sep='\t', skiprows=2)
  for csv_path in (
    'US_patents_1976-Sep2016_1product_reactions_test.csv',
    'US_patents_1976-Sep2016_1product_reactions_train.csv',
    'US_patents_1976-Sep2016_1product_reactions_valid.csv',
  )
]
df_test.head()

Unnamed: 0,Source,Target,CanonicalizedReaction,OriginalReaction,PatentNumber,ParagraphNum,Year,TextMinedYield,CalculatedYield
0,C S ( = O ) ( = O ) Cl . O C C C Br > A_CCN(CC...,C S ( = O ) ( = O ) O C C C Br,CS(=O)(=O)Cl.OCCCBr>CCN(CC)CC.CCOCC>CS(=O)(=O)...,[Br:1][CH2:2][CH2:3][CH2:4][OH:5].[CH3:6][S:7]...,US03930836,,1976,,
1,C ( = N C 1 C C C C C 1 ) = N C 1 C C C C C 1 ...,C C S c 1 c c c 2 c ( c 1 C ) C ( C ( = O ) O ...,C(=NC1CCCCC1)=NC1CCCCC1.CCOC(C)=O.CCSc1ccc2c(c...,[CH2:1]([S:3][C:4]1[CH:23]=[CH:22][C:7]2[N:8](...,US03931151,,1976,,
2,C C ( C ) ( C ) N N C ( C ) ( C # N ) C 1 C C ...,C C ( C ) ( C ) N N C 1 ( C # N ) C C C C C C 1,CC(C)(C)NNC(C)(C#N)C1CC1.O=C1CCCCCC1>CC(=O)C1C...,[C:1]([NH:5][NH:6][C:7]([C:12]#[N:13])([CH:9]1...,US03931143,,1976,95%,
3,N c 1 c c c ( C ( = O ) O ) c c 1 . O = C ( Cl...,O = C ( O ) c 1 c c c ( N C ( = O ) C 2 C C C ...,Nc1ccc(C(=O)O)cc1.O=C(Cl)C1CCC1>>O=C(O)c1ccc(N...,[NH2:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])...,US03931153,,1976,,
4,N c 1 c c c ( C ( = O ) O ) c ( [N+] ( = O ) [...,O = C ( O ) c 1 c c c ( N C ( = O ) C 2 C C C ...,Nc1ccc(C(=O)O)c([N+](=O)[O-])c1.O=C(Cl)C1CCC1>...,[NH2:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])...,US03931153,,1976,,


In [13]:
def curate_yields(row):
  """Pick a single yield value for one reaction row.

  Reads ``row["TextMinedYield"]`` and ``row["CalculatedYield"]``. A value is
  kept only when it lies in the (0;100] range; anything else counts as 0
  (this includes NaN, for which the range comparison is false). The larger
  of the two kept values is returned, so a result of 0 means that neither
  column held a usable yield.
  """
  text_yield = row["TextMinedYield"]
  calc_yield = row["CalculatedYield"]

  # Zero out every value that falls outside the (0;100] range
  if not 0 < text_yield <= 100:
    text_yield = 0
  if not 0 < calc_yield <= 100:
    calc_yield = 0

  # Prefer CalculatedYield only when it is strictly higher than TextMinedYield
  return calc_yield if calc_yield > text_yield else text_yield

In [18]:
def prepare_dataset(data, output_name):
  """Clean the yield columns of a reaction table and save the result as TSV.

  Processing steps:
    1. Add a ``myID`` column ("ID" followed by the zero-padded, 8-digit row
       position) and make it the first column. IDs are assigned before any
       filtering, so they refer to row positions in the input table.
    2. Drop rows in which both TextMinedYield and CalculatedYield are missing.
    3. Drop the PatentNumber, ParagraphNum and Year columns.
    4. Strip "~", "%", ">=", ">", "<" and the leading "<number> to " part of
       a range from the yield text, then parse it as a number. Missing or
       unparseable values become 0.
    5. Reduce both yield columns to a single ``Yield`` column with
       ``curate_yields`` and drop rows whose final yield is 0.
    6. Write the table to ``output_name`` (tab-separated, without the index).

  Args:
    data: pandas DataFrame holding at least the columns TextMinedYield,
      CalculatedYield, PatentNumber, ParagraphNum and Year. It is not
      modified; all work is done on a copy.
    output_name: destination path handed to ``DataFrame.to_csv``.

  Returns:
    None. The cleaned table is only written to disk.

  Raises:
    KeyError: if one of the required columns is absent from ``data``.
  """

  # Work on a copy: the caller's frame must not silently gain a "myID" column,
  # and later column assignments must not hit a view of the input.
  data = data.copy()

  # Create ID for reactions and move it to the front
  id_prefix = "ID"
  data["myID"] = [id_prefix + "{0:0>8}".format(i) for i in range(len(data))]
  data = data[["myID"] + [col for col in data.columns if col != "myID"]]

  # Remove rows where TextMinedYield and CalculatedYield are both NaN
  data = data.dropna(subset=["TextMinedYield", "CalculatedYield"], how="all")

  # Remove unnecessary columns
  data = data.drop(columns=["PatentNumber", "ParagraphNum", "Year"])

  # Remove all signs such as >, %, etc. Casting to str first keeps the .str
  # accessor usable even when a column is entirely NaN or already numeric.
  calc_text = data["CalculatedYield"].astype(str).str.rstrip("%")
  mined_text = data["TextMinedYield"].astype(str).str.lstrip("~").str.rstrip("%")
  # Order matters: ">=" has to go before ">"; the last pattern turns a range
  # such as "10 to 20" into its upper bound "20".
  for pattern in (">=", ">", "<", r"\d{1,2}\sto\s"):
    mined_text = mined_text.str.replace(pattern, "", regex=True)

  # Parse first, then fill: missing and unparseable entries both end up as 0,
  # which curate_yields treats as "no usable yield".
  data = data.assign(
    TextMinedYield=pd.to_numeric(mined_text, errors="coerce").fillna(0).astype(float),
    CalculatedYield=pd.to_numeric(calc_text, errors="coerce").fillna(0).astype(float),
  )

  # Curate yields and leave only the final yield value
  if len(data) == 0:
    # Row-wise apply on an empty frame does not return a Series
    final_yield = pd.Series(index=data.index, dtype=float)
  else:
    final_yield = data[["TextMinedYield", "CalculatedYield"]].apply(curate_yields, axis=1)
  data = data.assign(Yield=final_yield)
  data = data.drop(columns=["TextMinedYield", "CalculatedYield"])
  data = data[data["Yield"] != 0]

  # Save the dataset
  data.to_csv(output_name, sep="\t", index=False)

  return None

In [None]:
# Clean each split and write it next to the raw files under a
# "yield_ok_cropped_data" name.
for split_frame, cleaned_path in (
  (df_test, 'US_patents_1976-Sep2016_1product_reactions_yield_ok_cropped_data_test.csv'),
  (df_train, 'US_patents_1976-Sep2016_1product_reactions_yield_ok_cropped_data_train.csv'),
  (df_valid, 'US_patents_1976-Sep2016_1product_reactions_yield_ok_cropped_data_valid.csv'),
):
  prepare_dataset(split_frame, cleaned_path)