In [None]:
import json
import os
import copy
# Raw API results; the cells below parse this file one JSON document per line.
file_path = './Larrea/requests/result_uspto_requests.json'
# Request file; default destination of write_over_original_file().
file_path2 = './Larrea/requests/uspto_requests.json'
# Directory prefix for uspto_multiple_step.json and uspto_invalid.json.
output_path = './Training_Info/'

In [10]:
# Parse the raw API result file: every line holds one JSON document.
with open(file_path, 'r') as raw_response:
    lines = raw_response.readlines()
total_data = list(map(json.loads, lines))

def extract_description(example_data):
    """Return the body of every reaction description found in the request prompt.

    The prompt is read from ``example_data[0]['messages'][1]['content']`` and cut
    at each ``'[Input]\\nReaction '`` marker; the text before the first marker is
    discarded. For every remaining chunk the first line is dropped and the rest
    is returned, in prompt order.

    Raises:
        ValueError: if a chunk contains no newline.
    """
    prompt_text = example_data[0]['messages'][1]['content']
    chunks = prompt_text.split('[Input]\nReaction ')[1:]
    # Keep everything after the first newline of each chunk.
    return [chunk[chunk.index('\n') + 1:] for chunk in chunks]

# Preview: run the description extractor on the first parsed record.
example_data = total_data[0]

extract_description(example_data)

['To a suspension of glycinamide hydrochloride (0.55 g, 5 mmol) in DMF (10 mL) was added triethylamine (1.7 mL, 12 mmol) and the resulting mixture was stirred for 15 min at room temperature. This was added to a solution of 5-chloro-3-methoxy-[1,2,4]thiadiazole (0.38 g, 2.5 mmol, prepared according to the procedure by Goerdeler, J. et al. in Chem. Ber. 1955, 88, 843) in DMF (15 mL). Then, tetra-butylammonium bromide (50 mg) was added and the resulting suspension was heated at 80-85° C. for 1.5 h. On cooling to room temperature, the reaction mixture was diluted with water and ethyl acetate. The organic layer was collected, washed with brine, sat. Sodium bicarbonate solution, brine, dried (sodium sulfate), filtered and concentrated in vacuo. Purification by column chromatography on silica gel using a solvent mixture of dichloromethane and methanol (95/5 and 9/1) afforded 2-(3-methoxy-[1,2,4]thiadiazol-5-ylamino)-acetamide (75 mg, 16%) as a white solid. 1H-NMR (MeOD) δ 4.03 (s, 2H, NCH2), 

## LLM response extraction
Return the reaction processes, together with their corresponding reaction IDs, extracted from the LLM response.

In [11]:
def extract_gpt_response(example_data):
    """Extract the reaction processes written by the LLM, paired with their IDs.

    Args:
        example_data: one parsed record. ``example_data[0]`` is the request (the
            prompt is read from ``['messages'][1]['content']``) and
            ``example_data[1]`` is the API response (the answer is read from
            ``['choices'][0]['message']['content']``).

    Returns:
        ``(reaction_process, reaction_id)``: two parallel lists. An answer is
        kept only when its ID appears in the prompt and has not been seen yet.

    Raises:
        ValueError: if a prompt chunk lacks the ``' description'`` marker.
    """
    answer_text = example_data[1]['choices'][0]['message']['content']
    prompt_text = example_data[0]['messages'][1]['content']

    # One chunk per answered reaction, with formatting noise stripped. All spaces
    # are removed here, so no later whitespace clean-up is required.
    total_response = []
    for chunk in answer_text.split('Reaction ')[1:]:
        for noise in ('ID', ' ', '*', '`', ':'):
            chunk = chunk.replace(noise, '')
        total_response.append(chunk)

    # IDs that were actually requested (set: O(1) membership tests).
    known_ids = {
        chunk[:chunk.index(' description')]
        for chunk in prompt_text.split('[Input]\nReaction ')[1:]
    }

    reaction_id = []
    reaction_process = []
    seen_ids = set()
    for response in total_response:
        if 'description' in response:
            # API wrongly generates the whole description instead of the reaction process
            continue
        try:
            first_split = response.index('\n')
            if 'P1' not in response:
                # Modify errors where 'P1' is not the final product
                last_split = response.rindex('>') + 1
                response = response[:last_split] + 'P1'
            else:
                last_split = response.index('P1')
        except ValueError:
            # Only the "no newline" / "no '>'" cases are skipped on purpose; a bare
            # except would also hide KeyboardInterrupt and genuine bugs.
            continue

        current_id = response[:first_split]
        if current_id not in known_ids or current_id in seen_ids:
            continue
        seen_ids.add(current_id)
        reaction_id.append(current_id)
        # The slice ends two characters past last_split so it includes 'P1'.
        reaction_process.append(response[first_split + 1:last_split + 2])

    return reaction_process, reaction_id

# Preview: extract the reaction processes and IDs from the first parsed record.
example_data = total_data[0]
extract_gpt_response(example_data)

(['R2.S4.R3>T2.M1\nM1.R1.S3.E1>T1>M2\nM2.C1.E3.T2>M3\nM3.>S2>M4\nM4>>P1',
  'R1.R2.S1>R3.T1.M1\nM1.S4.R4.T2>M2\nM2.>M3\nM3>>P1',
  'R2.S1>R1.E1.T1.M1\nM1.E2.T2>S1>M2\nM2>>P1'],
 ['20021022-US06468977B1-0002',
  '20120329-US20120077781A1-0719',
  '20130411-US20130090327A1-0338'])

## Pattern Repair
Ex. Wrong Format in LLM response
> `R1>S1.T1>E1>P1` -> `R1>S1.T1.E1>P1`

Ex. Incorrect inclusion of post-processes
> `R1>S1.T1.E1>M1\nM1>>P1` -> `R1>S1.T1.E1>P1`

In [12]:
def pattern_repair(answer):
    """Rewrite substeps into the ``reactants>conditions>products`` shape.

    ``answer`` is split on newlines after all spaces are removed. Each substep
    with at least two ``'>'`` is rebuilt as ``<head>><conditions>><tail>`` when
    the condition span (S/T/E/C) and the product position (M/P) can be located;
    otherwise the substep is passed through unchanged.

    The "no conditions" state is tracked per substep. It used to be one flag
    shared by the whole loop and reset only on the rewrite path, so a
    no-condition substep that was passed through unchanged made the next
    substep lose its conditions.

    Known limitations kept from the original heuristic: positions are taken
    from the FIRST occurrence of each of S, T, E and C, and the condition slice
    ends one character after the last located letter (single-digit indices).

    Args:
        answer: newline-separated substeps, e.g. ``"R1>S1.T1>E1>P1"``.

    Returns:
        The repaired substeps joined with ``'\\n'``.
    """
    INF = float('inf')

    def first_index(text, symbol):
        # Position of the first `symbol` in `text`, or infinity when absent.
        return text.index(symbol) if symbol in text else INF

    repaired = []
    for substep in answer.replace(' ', '').split('\n'):
        # Fewer than two '>' : the substep is fine, or not easily fixable.
        if substep.count('>') < 2:
            repaired.append(substep)
            continue

        first_arrow = substep.index('>')
        last_arrow = substep.rindex('>')

        # First occurrence of every condition letter that is present.
        condition_positions = [
            position
            for position in (first_index(substep, symbol) for symbol in 'STEC')
            if position != INF
        ]

        m_index = first_index(substep, 'M')
        if 'P' in substep:
            p_index = substep.index('P')
        elif 'M' in substep and substep.rindex('M') > last_arrow:
            # An intermediate after the last '>' plays the product role.
            p_index = substep.rindex('M')
        else:
            p_index = INF

        # Local to this substep on purpose: this is the former shared `flag`.
        no_conditions = not condition_positions
        if no_conditions:
            begin_condition = first_arrow + 1
            end_condition = last_arrow - 1
        else:
            begin_condition = min(condition_positions)
            end_condition = max(condition_positions)

        if m_index == INF and p_index == INF:
            begin_product = last_arrow + 1
        else:
            candidates = [
                position for position in (m_index, p_index)
                if position != INF and position > end_condition
            ]
            if not candidates:
                repaired.append(substep)
                continue
            begin_product = min(candidates)

        # A reactant after the conditions (e.g. R1.R2.S1.E1.T1>R4>P1) would need
        # splitting into two substeps; that is not handled here.
        if 'R' in substep and substep.rindex('R') > end_condition:
            repaired.append(substep)
            continue

        # Invalid syntax: leave the substep untouched.
        if begin_condition >= begin_product:
            repaired.append(substep)
            continue

        dotted = substep.replace('>', '.')
        head = dotted[:begin_condition - 1]
        tail = dotted[begin_product:]
        if no_conditions:
            repaired.append(head + '>' + '>' + tail)
        else:
            # +2 keeps the last condition letter and the one character after it.
            conditions = dotted[begin_condition:end_condition + 2]
            repaired.append(head + '>' + conditions + '>' + tail)

    return '\n'.join(repaired)

# Sample inputs kept for manual experiments with pattern_repair(); none of
# test1..test8 is passed to the function in this cell. Only the last expression
# below is evaluated and displayed.
test1 = "R1.R2.R3.C1>S1.E1.T1>M1  \nM1>P1"  # Condition 1 and 5
test2 = "R1.R3.S1>T1>E1>M1  \nM1>R4.T2>>P1" # Condition 1 and 4
test3 = "R1.R2.S1.E1.T1 > R4 > P1"          # Condition 3
test4 = "R1.M1>>>> P1"                      # Condition 2
test5 = "R1.R2.R3.R4>S1.E1.T1>M1\nM1>S2.E2.T2>P1" # Normal data
test6 = "R1.M1>>P1"
test7 = "R3.R2.R4>S1.E1.T1>M1  \nR1>>M1  \nM1>>P1"
test8 = "R1.C1.S1.E1.T1.M1\nM1.R3.R4.S2.T4>M2\nM2.S5.T5>P1"
pattern_repair("R1.R3.R4>S1.T1>M1\nM1>R2>S2.T2>M2\nM2>>P1")

'R1.R3.R4>S1.T1>M1\nM1.R2>S2.T2>M2\nM2>>P1'

In [13]:
def check_invalid_answer(answer)->bool:
    """Return True when ``answer`` is a well-formed reaction graph.

    Despite the name, ``True`` means the answer is VALID.

    ``answer`` looks like ``"R1.R2.R4>S1.E1.T1>M1\\nM1.R3>E2.T2>P1"``; each line is
    one substep ``reactants>conditions>products``. A substep is rejected when:

    * it does not contain exactly two ``'>'``;
    * it contains a character other than ``R M S E T C P > .`` or a digit;
    * its last ``R`` lies after the first ``'>'``;
    * its first ``M`` lies strictly between the two ``'>'``;
    * its first ``P`` lies before the last ``'>'``;
    * the first occurrence of ``S``, ``E``, ``T`` or ``C`` lies outside the two ``'>'``;
    * an entity is repeated inside the substep (``"R3.R2.R2>E1.T1>P1"``).

    Across substeps, no entity other than the intermediates (those containing
    ``'M'``) may be repeated. Empty tokens produced by empty sections such as
    ``"M1>>M2"`` are not entities and are ignored in this cross-step check;
    previously two such substeps were reported as a repetition.
    """
    entity_pool = []
    for substep in answer.split('\n'):
        # Exactly two '>' separate reactants, conditions and products.
        if substep.count('>') != 2:
            return False
        # Only entity letters, separators and digits are allowed.
        if any(char not in 'RMSETCP>.' and not char.isdigit() for char in substep):
            return False

        first_arrow = substep.index('>')
        last_arrow = substep.rindex('>')

        # Reactants must stay in front of the first '>'.
        if 'R' in substep and substep.rindex('R') > first_arrow:
            return False
        # The first intermediate must not sit in the condition section.
        if 'M' in substep and first_arrow < substep.index('M') < last_arrow:
            return False
        # Products must come after the last '>'.
        if 'P' in substep and substep.index('P') < last_arrow:
            return False
        # Conditions must sit between the two '>' (first occurrence is checked).
        for symbol in 'SETC':
            if symbol in substep:
                position = substep.index(symbol)
                if position < first_arrow or position > last_arrow:
                    return False

        # Repetition inside one substep (empty tokens included, as before).
        step_entities = substep.replace('>', '.').split('.')
        if len(step_entities) != len(set(step_entities)):
            return False
        entity_pool += step_entities

    # Intermediates legitimately recur across substeps; empty tokens are not entities.
    entity_pool = [entity for entity in entity_pool if entity and 'M' not in entity]
    return len(entity_pool) == len(set(entity_pool))

# Sanity check on a well-formed two-step answer; True means the answer is valid.
check_invalid_answer('R1.R2.R3>S1.E1.T1>M1\nM1>S2.T2>P1')

True

In [14]:
# Prompt fragment restating the expected "reactants>conditions>products" format.
# NOTE(review): not referenced by any code visible in this part of the notebook.
emphasis = "A standard format is \"Rx.Ry>Sx.Cx.Ex.Tx>Mx\" with at most two \">\", separating reactants, reaction conditions, and products!"

# Prompt fragment asking for graphs only.
# NOTE(review): not referenced by any code visible in this part of the notebook.
clarification = "Give me heterogeneous graphs only with no introduction or explanation."

def modified_reaction_string(reaction_id, total_description, response, mode=0):
    """Prepare a description either for the dataset (mode 0) or for a retry prompt (mode 1).

    mode 0 (valid reaction): return ``total_description`` cut just before the
        first ``"\\n\\n[Output]:"``; ``reaction_id`` and ``response`` are unused.
    mode 1 (invalid reaction): cut ``total_description`` before any earlier
        ``"(Do not make the mistake of"`` hint, then wrap it as a new
        ``[Input]`` entry ending with a hint that quotes ``response``.
    Any other mode returns ``None``.
    """
    if mode == 0:
        # str.partition yields the text before the first marker, or the whole
        # string when the marker is absent.
        return total_description.partition("\n\n[Output]:")[0]

    if mode == 1:
        without_hint = total_description.partition("(Do not make the mistake of")[0]
        header = "[Input]\nReaction " + reaction_id + " description:\n"
        hint = "(Do not make the mistake of '" + response + "')"
        return header + without_hint + hint

    return None

# Sample description ending with the "\n\n[Output]:" marker that mode 0 cuts at.
test_string = "Thiosemicarbazide (9.11 g, 0.1 mol) was added under nitrogen to a solution 4-chloro-2'-butyrothienone (16.2 mL, 0.1 mol) in 350 mL of methanol plus 27 mL 1N HCl plus 25 mL of water. After stirring at room temperature for approximately 2 hours, all of the solid had dissolved. The reaction was then stirred at room temperature for 24 hours (overnight). By TLC starting material remained. An additional 27 mL of 1N HCl was added and the reaction stirred at room temperature for 6 hours. The solid formed was removed by filtration and dried under high vacuum to give 14.87 g (57%) of the title compound as a brown solid, mp 120-122° C.\n\nREACTANTs:\nR1: 4-chloro-2'-butyrothienone,75;\nR2: Thiosemicarbazide,0;\n\n\nPRODUCTs:\nP1: title compound,585;\n\n\nSOLVENTs:\nS1: methanol,134;\nS2: water,175;\nS3: HCl,157;\nS4: HCl,413;\n\n\nCATALYSTs:\nNone\n\nTIMEs:\nT1: approximately 2 hours,221;\nT2: 24 hours,330;\nT3: overnight,340;\nT4: 6 hours,476;\n\n\nTEMPERATUREs:\nE1: room temperature,200;\nE2: room temperature,309;\nE3: room temperature,455;\n\n\nYIELD:\nY1: 57%,573;\n\n[Output]:\n"

# mode=0 ignores the reaction id and the response; only the description is trimmed.
modified_reaction_string("123",test_string, "a>b>c", mode=0)

"Thiosemicarbazide (9.11 g, 0.1 mol) was added under nitrogen to a solution 4-chloro-2'-butyrothienone (16.2 mL, 0.1 mol) in 350 mL of methanol plus 27 mL 1N HCl plus 25 mL of water. After stirring at room temperature for approximately 2 hours, all of the solid had dissolved. The reaction was then stirred at room temperature for 24 hours (overnight). By TLC starting material remained. An additional 27 mL of 1N HCl was added and the reaction stirred at room temperature for 6 hours. The solid formed was removed by filtration and dried under high vacuum to give 14.87 g (57%) of the title compound as a brown solid, mp 120-122° C.\n\nREACTANTs:\nR1: 4-chloro-2'-butyrothienone,75;\nR2: Thiosemicarbazide,0;\n\n\nPRODUCTs:\nP1: title compound,585;\n\n\nSOLVENTs:\nS1: methanol,134;\nS2: water,175;\nS3: HCl,157;\nS4: HCl,413;\n\n\nCATALYSTs:\nNone\n\nTIMEs:\nT1: approximately 2 hours,221;\nT2: 24 hours,330;\nT3: overnight,340;\nT4: 6 hours,476;\n\n\nTEMPERATUREs:\nE1: room temperature,200;\nE2: 

In [15]:
def output_structured_data(request_in_each_line = 0, threshold = 3, selected_lines = 10000, repair = True):
    """Split the raw API results into valid graphs and retry requests.

    Reads ``file_path`` (one JSON record per line), then for every extracted
    answer:

    * valid (``check_invalid_answer`` returns True): append one JSON line with
      ``reaction_id``, ``description`` and ``response`` to
      ``output_path + "uspto_multiple_step.json"``;
    * invalid: add a retry entry to a request that reuses the first record's
      request as a template; every ``threshold`` entries the request is written
      as one line of ``output_path + "uspto_invalid.json"``.

    Descriptions are looked up by reaction ID. They used to be taken by
    position in the list of kept answers, which paired an ID with the wrong
    description whenever the model skipped, reordered or repeated a reaction.

    Both output files are opened once in append mode, so they are created even
    when nothing is written to them.

    Args:
        request_in_each_line: number of entries already counted for the current
            retry request (normally 0).
        threshold: number of retry entries grouped into one request line.
        selected_lines: process at most this many records from the start.
        repair: run ``pattern_repair`` on each answer before validating it.
    """
    # Read raw response first
    with open(file_path, 'r') as raw_response:
        total_data = [json.loads(line) for line in raw_response.readlines()]

    # Fixed prompt: the first request with its user message emptied.
    modified_request = copy.deepcopy(total_data[0][0])
    modified_request['messages'][1]['content'] = ""

    valid_path = output_path + "uspto_multiple_step.json"
    invalid_path = output_path + "uspto_invalid.json"

    with open(valid_path, 'a') as structured_data, open(invalid_path, 'a') as invalid_data:
        # Slicing already copes with selected_lines > len(total_data).
        for dataline in total_data[:selected_lines]:
            descriptions = extract_description(dataline)
            prompt_chunks = dataline[0]['messages'][1]['content'].split('[Input]\nReaction ')[1:]
            # Same ID rule as extract_gpt_response: the text before ' description'.
            description_by_id = {
                chunk[:chunk.index(' description')]: description
                for chunk, description in zip(prompt_chunks, descriptions)
            }
            total_response, reaction_id = extract_gpt_response(dataline)

            for current_id, response in zip(reaction_id, total_response):
                # extract_gpt_response only returns IDs that occur in the prompt.
                description = description_by_id[current_id]
                if repair:
                    response = pattern_repair(response)

                if check_invalid_answer(response):
                    record = {
                        'reaction_id': current_id,
                        'description': modified_reaction_string(current_id, description, response, mode=0),
                        'response': response,
                    }
                    structured_data.write(json.dumps(record) + '\n')
                    continue

                # Invalid answer: queue it for another API call.
                if request_in_each_line >= threshold:
                    # The current request is full; write it out and start a new one.
                    invalid_data.write(json.dumps(modified_request) + '\n')
                    modified_request['messages'][1]['content'] = ""
                    request_in_each_line = 0
                modified_request['messages'][1]['content'] += modified_reaction_string(current_id, description, response, mode=1)
                request_in_each_line += 1

        # Less than threshold requests in one line, still need to write them
        if request_in_each_line != 0:
            invalid_data.write(json.dumps(modified_request) + '\n')

    # If no error arise, print success
    print("Structured data output success!")


In [16]:
def write_over_original_file(
        read_file = output_path + "uspto_invalid.json",
        write_path = file_path2
        ):
    """Replace the content of ``write_path`` with the lines of ``read_file``.

    ``read_file`` is loaded completely before ``write_path`` is opened in
    ``'w'`` mode (which truncates it). The defaults are evaluated when the
    function is defined.
    """
    with open(read_file, 'r') as invalid_file:
        pending_lines = invalid_file.readlines()
    with open(write_path, 'w') as raw_file:
        raw_file.writelines(pending_lines)


##### (This block is optional)
Delete all the relevant files.

In [34]:
# Remove every generated file AND both request files.
# NOTE: file_path holds the raw API results; deleting it discards them.
for stale_path in (
    output_path + "uspto_multiple_step.json",
    output_path + "uspto_invalid.json",
    file_path,    # result_uspto_requests.json
    file_path2,   # uspto_requests.json
):
    if os.path.exists(stale_path):
        os.remove(stale_path)
print("all relevant files removed.")

all relevant files removed.


##### First, run the block below to extract our first-time structured data of reaction process
Two files are generated
> `uspto_multiple_step.json`: structured reaction processes (heterogeneous graphs).
> `uspto_invalid.json`: invalid structured reaction processes; the API must be called again for these.

In [64]:
# Start from a clean slate: output_structured_data() opens its files in append mode.
for stale_output in ("uspto_multiple_step.json", "uspto_invalid.json"):
    if os.path.exists(output_path + stale_output):
        os.remove(output_path + stale_output)
output_structured_data(repair=True)

Structured data output success!


##### Second, run the block below to modify `uspto_invalid.json` into API-callable format
`uspto_invalid.json` is copied over `uspto_requests.json` so that the API can be called again on the reactions that failed validation.

In [65]:
# Drop the previous API results (file_path) before preparing the retry requests.
if os.path.exists(file_path):
    os.remove(file_path)
    # Report only after the removal has actually succeeded.
    print("result_uspto_request.json removed!")
else:
    # The file that was checked is the result file, not the request file.
    print("result_uspto_request.json does not exist!")

print("Writing uspto_invalid.json over uspto_request.json for API-recalling")
write_over_original_file()


# API calling on the new uspto_request.json
# Getting the new result_uspto_request.json

result_uspto_request.json removed!
Writing uspto_invalid.json over uspto_request.json for API-recalling


##### Before running the block below, call the API on the modified `uspto_requests.json` from the previous step to obtain a new `result_uspto_requests.json`.

Run the block below to append new valid structured reaction processes to `uspto_multiple_step.json`.

In [66]:
# Drop the previous retry file; valid results keep being appended to
# uspto_multiple_step.json, which is deliberately not removed here.
invalid_json = output_path + "uspto_invalid.json"
if os.path.exists(invalid_json):
    os.remove(invalid_json)
output_structured_data()

Structured data output success!


A final structured reaction process is generated in `uspto_multiple_step.json` file.

In [32]:
import json
# NOTE(review): absolute, machine-specific paths; the notebook cannot be re-run
# elsewhere without editing them. Consider deriving them from a configurable directory.
uspto_multiple = "/Users/gongshukai/Desktop/result_uspto_request backup/valid_reactions/uspto_multiple_step.json"
uspto_multiple_fixed = "/Users/gongshukai/Desktop/result_uspto_request backup/valid_reactions/uspto_multiple_step_fixed.json"

def check_dataset_validity(filepath):
    """Print every record of a JSON-lines file whose response fails validation.

    Each line of ``filepath`` is parsed as JSON; a record is reported with its
    ``reaction_id`` and ``response`` when ``check_invalid_answer`` returns False
    for the response.
    """
    with open(filepath, 'r') as raw_response:
        records = [json.loads(row) for row in raw_response.readlines()]

    for record in records:
        if check_invalid_answer(record['response']):
            continue
        print("Invalid Reaction:", record['reaction_id'], record['response'])

def check_dataset_repetition(filepath):
    """Print every reaction ID that occurs again after its first appearance.

    Each line of ``filepath`` is parsed as JSON and its ``reaction_id`` is
    tracked in a set; an ID seen three times is therefore reported twice.
    """
    with open(filepath, 'r') as raw_response:
        records = [json.loads(row) for row in raw_response.readlines()]

    seen_ids = set()
    for record in records:
        current_id = record['reaction_id']
        if current_id in seen_ids:
            print("Repeated Reaction ID:", current_id)
            continue
        seen_ids.add(current_id)

In [23]:
# Load the stored dataset explicitly instead of relying on a leftover global
# `total_data`: the only `total_data` defined in this notebook holds raw
# request/response pairs, which have no 'response' key.
# NOTE(review): assumes `uspto_multiple` is the intended input - confirm.
with open(uspto_multiple, 'r') as source_data:
    stored_records = [json.loads(line) for line in source_data if line.strip()]

# Re-run the repair on every stored response and keep only those that validate.
with open(uspto_multiple_fixed, 'w') as structured_data:
    for data in stored_records:
        fixed_data = copy.deepcopy(data)
        fixed_data['response'] = pattern_repair(data['response'])
        if check_invalid_answer(fixed_data['response']):
            structured_data.write(json.dumps(fixed_data) + '\n')
check_dataset_validity(uspto_multiple_fixed)

In [33]:
# List reaction IDs that occur more than once in the repaired dataset.
check_dataset_repetition(uspto_multiple_fixed)

Repeated Reaction ID: 20140805-US08796295B2-0506
Repeated Reaction ID: 20160609-US20160159842A1-0133
Repeated Reaction ID: 20100722-US20100184815A1-0918
Repeated Reaction ID: 20151231-US20150376185A1-0115
Repeated Reaction ID: 20150625-US20150175601A1-2272
Repeated Reaction ID: 20160324-US20160083346A1-0257
Repeated Reaction ID: 20160218-US20160046631A1-0642
Repeated Reaction ID: 20160505-US20160122331A1-1407
Repeated Reaction ID: 20121023-US08293735B2-0384
Repeated Reaction ID: 20110301-US07897595B2-0312
Repeated Reaction ID: 20150421-US09012642B2-0425
Repeated Reaction ID: 20150922-US09139565B2-0103
Repeated Reaction ID: 20100826-US20100216750A1-0099
Repeated Reaction ID: 20120306-US08129376B2-0530
Repeated Reaction ID: 20111101-US08048909B2-1416
Repeated Reaction ID: 20141125-US08895712B2-0312
Repeated Reaction ID: 20110421-US20110092554A1-0364
Repeated Reaction ID: 20160517-US09340525B2-0249
Repeated Reaction ID: 20150409-US20150099781A1-0524
Repeated Reaction ID: 20150702-US201501