In [1]:
import json
from tqdm import tqdm_notebook

In [2]:
def get_reaction(json_path: str)->list:
    """Load reaction records from a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON document.

    Args:
        json_path: Path to a text file holding one JSON document per line.

    Returns:
        The parsed documents, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    reactions = []
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, encoding='utf-8') as fstream:
        for line in fstream:
            # Tolerate blank lines (e.g. an extra empty line at the end of the
            # file) instead of letting json.loads fail on them.
            if not line.strip():
                continue
            reactions.append(json.loads(line))
    return reactions

def get_vocab_from_tokenized(reations: list):
    """Count how often each whitespace-separated token occurs in the reactions.

    For every reaction, the tokens of ``reaction_core``, of every entry in
    ``reactants`` and of the first entry in ``products`` are counted.

    Args:
        reations: Reaction dicts to count tokens over. The misspelled
            parameter name is kept so existing keyword callers keep working.

    Returns:
        A ``collections.Counter`` mapping each token to its occurrence count.
    """
    from collections import Counter
    counter = Counter()
    # Iterate over the argument itself, not the notebook-global ``reactions``.
    for react in tqdm_notebook(reations):
        counter.update(react['reaction_core'].split())
        for reag in react['reactants']:
            counter.update(reag.split())
        # Only the first product contributes to the counts.
        counter.update(react['products'][0].split())
    return counter
        
def get_reaction_str(reaction: dict)->str:
    """Render a reaction as ``"<reactant> . <reactant> ... >> <product>"``.

    All reactants are joined with ``" . "``; only the first product is used.

    Args:
        reaction: Dict with a ``'reactants'`` sequence of strings and a
            ``'products'`` sequence whose first entry is a string.

    Returns:
        The reactant side, ``" >> "``, then the first product.

    Raises:
        IndexError: If ``reactants`` or ``products`` is empty.
        KeyError: If either key is missing.
    """
    reactants = reaction['reactants']
    if not reactants:
        # Indexing the first reactant used to fail here; keep the same error type.
        raise IndexError("reaction has no reactants")
    # Join every reactant so reactions with three or more are not truncated.
    return " . ".join(reactants) + " >> " + reaction['products'][0]
    
def filter_by_tokens(reactions: list, tokens: set)->list:
    """Select the reactions whose tokens are all contained in ``tokens``.

    Args:
        reactions: Reaction dicts to examine; a progress bar is shown while
            they are scanned.
        tokens: The set of permitted tokens.

    Returns:
        A new list with the reactions that passed, in their original order.
    """
    return [
        candidate
        for candidate in tqdm_notebook(reactions)
        if get_tokenset(candidate) <= tokens
    ]

def get_tokenset(reaction: dict)->set:
    """Collect the distinct whitespace-separated tokens of one reaction.

    Tokens are taken from ``reaction_core``, from every entry in
    ``reactants`` and from the first entry in ``products``.

    Args:
        reaction: Dict providing the three keys named above.

    Returns:
        The set of all tokens found.
    """
    tokens = set(reaction['reaction_core'].split())
    for reactant in reaction['reactants']:
        tokens.update(reactant.split())
    tokens.update(reaction['products'][0].split())
    return tokens

def vocab_to_tokenset(vocab: dict, threshold: int=0)->set:
    """Return the keys of ``vocab`` whose value is at least ``threshold``.

    Args:
        vocab: Mapping from token to a comparable value (e.g. a count).
        threshold: Inclusive lower bound a value must reach to be kept.

    Returns:
        The set of keys that met the bound.
    """
    return {token for token, count in vocab.items() if count >= threshold}

def to_jsonl(reactions: list, outfile: str):
    """Write ``reactions`` to ``outfile`` as one JSON document per line.

    Any existing file at ``outfile`` is overwritten.

    Args:
        reactions: JSON-serialisable objects, written in iteration order.
        outfile: Path of the file to create.
    """
    with open(outfile, 'w') as sink:
        for record in reactions:
            json.dump(record, sink)
            sink.write('\n')

In [3]:
# Read every reaction record from the line-delimited JSON file into memory.
reactions = get_reaction('tokenized_reactions.json')

In [8]:
# Peek at the reagent metadata stored with the first reaction.
reactions[0]['meta']['reagents']

[{'reaxys_id': 1098214,
  'molecule_name': 'hydrogenchloride',
  'smiles': '[H]Cl'}]

In [46]:
# Token counts over the loaded reactions, plus the same data as
# (token, count) pairs ordered from the lowest count to the highest.
vocab = get_vocab_from_tokenized(reactions)
vocab_list = list(vocab.items())
vocab_list.sort(key=lambda pair: pair[1])

HBox(children=(IntProgress(value=0, max=380733), HTML(value='')))




In [47]:
# Relative frequency of each token: its count divided by the sum of all counts.
total_chars = sum(vocab.values())
vocab_freq = {token: count / total_chars for token, count in vocab.items()}

In [54]:
# Keep only the tokens whose count in `vocab` is at least 10000.
filter_set = vocab_to_tokenset(vocab, 10000)

In [58]:
# Drop every reaction that uses a token outside `filter_set`.
filtered = filter_by_tokens(reactions, filter_set)

HBox(children=(IntProgress(value=0, max=380733), HTML(value='')))




In [59]:
# How many reactions passed the token filter.
len(filtered)

374394

In [62]:
# Persist the filtered reactions, one JSON document per line.
to_jsonl(filtered, 'filtered.json')

In [66]:
# Preview the reactant side of the first reaction: all of its reactants
# joined with " . " (a str cannot be concatenated with the `reactions` list).
" . ".join(reactions[0]['reactants'])

'[H] c 1 c ( [H] ) c ( [H] ) c ( N ( [H] ) C ( = O ) C ( [H] ) ( [H] ) [H] ) c ( - c 2 c ( [H] ) c 3 c ( [H] ) c ( [H] ) c ( [H] ) c ( [H] ) c 3 n 2 [H] ) c 1 [H]'

In [74]:
# Render one reaction in "reactants >> product" form to eyeball the format.
get_reaction_str(reactions[3])

'[H] c 1 c ( [H] ) c ( [H] ) c ( N ( [H] ) [H] ) c ( C # C c 2 c ( [H] ) c ( Cl ) c ( [H] ) c ( [H] ) c 2 N ( [H] ) C ( = O ) C ( [H] ) ( [H] ) [H] ) c 1 [H] >> [H] c 1 c ( Cl ) c ( [H] ) c 2 c ( n c ( C ( [H] ) ( [H] ) [H] ) n 3 c 4 c ( [H] ) c ( [H] ) c ( [H] ) c ( [H] ) c 4 c ( [H] ) c 2 3 ) c 1 [H]'

In [None]:
rea