In [1]:
###################################################
# 0. Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from tqdm.notebook import tqdm
import re
import pickle as pkl

from PIL import Image
pd.set_option('max_colwidth', None)

# https://iupac.org/100/stories/what-on-earth-is-inchi/
# tokenizer -> https://www.kaggle.com/yasufuminakama/inchi-preprocess-2
# atom count -> https://www.kaggle.com/tuckerarrants/inchi-allowed-external-data
###################################################

In [2]:
###################################################
# 1. Paths

PATH_DATA = '../01_Data/'
PATH_MODELS = '../03_Models/'

PATH_DATA_TRAIN = PATH_DATA + 'train/'
PATH_DATA_TEST = PATH_DATA + 'test/'

# Images sit three directory levels below the train/ and test/ folders.
list_imgs_train_paths = glob.glob(PATH_DATA_TRAIN + '/*/*/*/*')
list_imgs_test_paths = glob.glob(PATH_DATA_TEST + '/*/*/*/*')

print(f'Num Imgs train: {len(list_imgs_train_paths)}')
print(f'Num Imgs test: {len(list_imgs_test_paths)}')


def _image_id(path):
    """Return the image id of `path`: its file name up to the first '.'.

    The path is split on both '\\' and '/' so the result is the same whether
    glob returned Windows-style or POSIX-style separators. Splitting on '\\'
    alone leaves the whole relative path intact on POSIX, and because that
    path starts with '..', the subsequent split on '.' yields an empty id.
    """
    return re.split(r'[\\/]', path)[-1].split('.')[0]


list_imgs_train = [_image_id(path) for path in tqdm(list_imgs_train_paths)]
list_imgs_test = [_image_id(path) for path in tqdm(list_imgs_test_paths)]

# Reuse the ids computed above instead of parsing every path a second time.
# Pairing is positional, so each id maps to the path it was derived from.
dict_imgs_train_paths = dict(zip(list_imgs_train, list_imgs_train_paths))
dict_imgs_test_paths = dict(zip(list_imgs_test, list_imgs_test_paths))


# Suffix used in the file names of the artifacts saved by this notebook.
VERSION = '001'

###################################################

Num Imgs train: 2424186
Num Imgs test: 1616107


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2424186.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2424186.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))




In [3]:
###################################################
# 2. Load Data

# Read the training labels; the preview below shows one row per image_id
# together with its target InChI string.
df_train_labels = pd.read_csv(PATH_DATA + 'train_labels.csv') 
print(df_train_labels.shape)
df_train_labels.head()

# An InChI string is a sequence of '/'-separated layers (after the "InChI=1S" prefix):
# Layer1 - Formula
# Layer2 - Connectivity of the atoms in the molecule
# Layer3 - Position of the hydrogen atoms in the molecule
# Layer4 - Stereochemical information in the molecule
# Layer5 - more info
# ...

###################################################

(2424186, 2)


Unnamed: 0,image_id,InChI
0,000011a64c74,"InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4)14/h5-7,9,11,14H,8H2,1-4H3"
1,000019cc0cd2,"InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(10-14)11-17(23)19-15-4-5-18(24)21(15,3)9-7-16(19)20/h13-16,19H,4-11H2,1-3H3/t13-,14+,15+,16-,19-,20+,21+/m1/s1"
2,0000252b6d2b,"InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-12-10-20(28)30)27-11-9-16-21(23(25)31)26-29(22(16)24(27)32)18-5-3-4-6-19(18)33-2/h3-8,13H,9-12H2,1-2H3,(H2,25,31)"
3,000026b49b7e,"InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-24-14)11-15(21)19-17(16(22)23)8-4-2-3-5-9-17/h6-7,10,13H,2-5,8-9,11H2,1H3,(H,18,20)(H,19,21)(H,22,23)"
4,000026fc6c36,"InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7-8)5-2-3-9(11)16/h8H,2-7H2,1H3,(H2,11,16)(H,12,14)"


In [4]:
###################################################
# 3. Global Functions

def split_form(form):
    """Tokenise a chemical formula into space-separated symbols and counts.

    The formula is cut at every upper-case letter. Within each piece the
    leading run of non-digit characters is taken as the element symbol and
    whatever remains as its count, e.g. 'C13H20OS' -> 'C 13 H 20 O S'.

    Args:
        form: formula string, e.g. the first layer of an InChI.

    Returns:
        The tokens joined by single spaces, with trailing spaces removed.
    """
    pieces = []
    for chunk in re.findall(r"[A-Z][^A-Z]*", form):
        symbol = re.match(r"\D+", chunk).group()
        count = chunk.replace(symbol, "")
        # An element without an explicit count contributes only its symbol.
        pieces.append(symbol if count == "" else f"{symbol} {count}")
    return ' '.join(pieces).rstrip(' ')


def split_form2(form):
    """Tokenise the non-formula InChI layers into space-separated tokens.

    The input is cut at every lower-case letter, which is treated as the
    layer identifier. Each layer is emitted as '/<letter>' followed by its
    tokens: every run of digits is one token and every other character is a
    token of its own. '/' separators inside a layer body are discarded.
    Example: 'c1-9(2)8/h5-7,9H' -> '/c 1 - 9 ( 2 ) 8 /h 5 - 7 , 9 H'.

    Characters that appear between the layer letter and the first digit
    (e.g. the '(H' in '/h(H4,2,3,4)') are kept as tokens. Skipping them would
    make the tokenisation lossy, so the original string could not be rebuilt
    by concatenating the tokens.

    Args:
        form: the InChI layers after the formula, joined by '/'.

    Returns:
        The tokenised string with trailing spaces removed.
    """
    string = ''
    for layer in re.findall(r"[a-z][^a-z]*", form):
        elem = layer[0]
        # Only the first character of a layer can be lower-case, so slicing it
        # off is equivalent to removing every occurrence of `elem`.
        body = layer[1:].replace('/', "")
        # Digit runs stay together; any other character becomes its own token.
        tokens = re.findall(r"[0-9]+|[^0-9]", body)
        string += f"/{elem} " + ''.join(f"{token} " for token in tokens)
    return string.rstrip(' ')

###################################################

In [5]:
###################################################
# 4. Classes

class Tokenizer(object):
    """Vocabulary that maps space-separated tokens to integer ids and back.

    Attributes are plain dicts:
      stoi -- token string -> integer id ('<pad>' is always id 0)
      itos -- integer id -> token string
    """

    def __init__(self):
        self.stoi = {}
        self.itos = {}

    def __len__(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.stoi)

    def fit_on_texts(self, texts):
        """Build the vocabulary from an iterable of space-separated texts.

        '<pad>' gets id 0; all other tokens (including '<sos>' and '<eos>')
        get ids 1..N in sorted order. Sorting makes the ids reproducible:
        iterating the raw set would depend on string-hash randomisation and
        could assign different ids on every kernel start. Any mapping from a
        previous fit is discarded, so refitting cannot leave stale or
        colliding ids behind.
        """
        vocab = {'<pad>', '<sos>', '<eos>'}
        for text in texts:
            vocab.update(text.split(' '))
        vocab.discard('<pad>')

        stoi = {'<pad>': 0}
        for index, token in enumerate(sorted(vocab), start=1):
            stoi[token] = index
        self.stoi = stoi
        self.itos = {index: token for token, index in stoi.items()}

    def text_to_sequence(self, text):
        """Encode one text as [<sos>, token ids..., <eos>].

        Raises:
            KeyError: if a token is not in the fitted vocabulary.
        """
        sequence = [self.stoi['<sos>']]
        sequence.extend(self.stoi[s] for s in text.split(' '))
        sequence.append(self.stoi['<eos>'])
        return sequence

    def texts_to_sequences(self, texts):
        """Encode every text in `texts`; returns a list of id lists."""
        return [self.text_to_sequence(text) for text in texts]

    def sequence_to_text(self, sequence):
        """Decode ids to their tokens and concatenate them without separators."""
        return ''.join(self.itos[i] for i in sequence)

    def sequences_to_texts(self, sequences):
        """Decode every id sequence in `sequences`; returns a list of strings."""
        return [self.sequence_to_text(sequence) for sequence in sequences]

    def predict_caption(self, sequence):
        """Decode ids up to (not including) the first '<eos>' or '<pad>'.

        Tokens are concatenated without separators.
        """
        pieces = []
        for i in sequence:
            if i == self.stoi['<eos>'] or i == self.stoi['<pad>']:
                break
            pieces.append(self.itos[i])
        return ''.join(pieces)

    def predict_captions(self, sequences):
        """Apply `predict_caption` to every sequence; returns a list of strings."""
        return [self.predict_caption(sequence) for sequence in sequences]

###################################################

In [6]:
# Take the first '/'-separated field after the "InChI=1S" prefix; the output
# below shows this is the molecular formula (e.g. C13H20OS).
# The same column is assigned again at the start of the Data Generation cell.
df_train_labels['l_0'] = df_train_labels['InChI'].apply(lambda x: x.split('/')[1])
df_train_labels['l_0']

0                C13H20OS
1                C21H30O4
2              C24H23N5O4
3             C17H24N2O4S
4             C10H19N3O2S
                ...      
2424181      C10H12F2N2O3
2424182     C19H20F2N4O3S
2424183    C22H26Cl2N2O4S
2424184       C17H26N2O6S
2424185      C10H18N2O9P2
Name: l_0, Length: 2424186, dtype: object

In [14]:
###################################################
# 5. Data Generation

# Formula layer: the first '/'-separated field after the "InChI=1S" prefix.
df_train_labels['l_0'] = df_train_labels['InChI'].apply(lambda inchi: inchi.split('/')[1])

# Target text: tokenised formula, a space, then the tokenised remaining layers.
df_train_labels['text'] = (
    df_train_labels['l_0'].apply(split_form)
    + ' '
    + df_train_labels['InChI']
        .apply(lambda inchi: '/'.join(inchi.split('/')[2:]))
        .apply(split_form2)
        .values
)

# Fit the vocabulary on every target text and persist it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train_labels['text'].values)

with open(f'{PATH_MODELS}tokenizer_experiment_{VERSION}.pkl', 'wb') as tokenizer_file:
    pkl.dump(tokenizer, tokenizer_file, protocol=pkl.HIGHEST_PROTOCOL)
    print('Saved tokenizer.')

# Token count per sample; the 2 subtracted are the <sos>/<eos> ids that
# text_to_sequence adds around the encoded tokens.
lengths = [
    len(tokenizer.text_to_sequence(text)) - 2
    for text in tqdm(df_train_labels['text'].values, total=len(df_train_labels))
]
df_train_labels['length'] = lengths

# Persist the labels frame together with the derived columns.
with open(f'{PATH_MODELS}df_train_labels_{VERSION}.pkl', 'wb') as labels_file:
    pkl.dump(df_train_labels, labels_file, protocol=pkl.HIGHEST_PROTOCOL)
    print('Saved df_train_labels.')

###################################################

Saved tokenizer.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2424186.0), HTML(value='')))


Saved df_train_labels.


In [13]:
# Preview the first rows, including the derived 'l_0' and 'text' columns.
df_train_labels.head()

Unnamed: 0,image_id,InChI,l_0,text
0,000011a64c74,"InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4)14/h5-7,9,11,14H,8H2,1-4H3",C13H20OS,"C 13 H 20 O S /c 1 - 9 ( 2 ) 8 - 15 - 13 - 6 - 5 - 10 ( 3 ) 7 - 12 ( 13 ) 11 ( 4 ) 14 /h 5 - 7 , 9 , 11 , 14 H , 8 H 2 , 1 - 4 H 3"
1,000019cc0cd2,"InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(10-14)11-17(23)19-15-4-5-18(24)21(15,3)9-7-16(19)20/h13-16,19H,4-11H2,1-3H3/t13-,14+,15+,16-,19-,20+,21+/m1/s1",C21H30O4,"C 21 H 30 O 4 /c 1 - 12 ( 22 ) 25 - 14 - 6 - 8 - 20 ( 2 ) 13 ( 10 - 14 ) 11 - 17 ( 23 ) 19 - 15 - 4 - 5 - 18 ( 24 ) 21 ( 15 , 3 ) 9 - 7 - 16 ( 19 ) 20 /h 13 - 16 , 19 H , 4 - 11 H 2 , 1 - 3 H 3 /t 13 - , 14 + , 15 + , 16 - , 19 - , 20 + , 21 + /m 1 /s 1"
2,0000252b6d2b,"InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-12-10-20(28)30)27-11-9-16-21(23(25)31)26-29(22(16)24(27)32)18-5-3-4-6-19(18)33-2/h3-8,13H,9-12H2,1-2H3,(H2,25,31)",C24H23N5O4,"C 24 H 23 N 5 O 4 /c 1 - 14 - 13 - 15 ( 7 - 8 - 17 ( 14 ) 28 - 12 - 10 - 20 ( 28 ) 30 ) 27 - 11 - 9 - 16 - 21 ( 23 ( 25 ) 31 ) 26 - 29 ( 22 ( 16 ) 24 ( 27 ) 32 ) 18 - 5 - 3 - 4 - 6 - 19 ( 18 ) 33 - 2 /h 3 - 8 , 13 H , 9 - 12 H 2 , 1 - 2 H 3 , ( H 2 , 25 , 31 )"
3,000026b49b7e,"InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-24-14)11-15(21)19-17(16(22)23)8-4-2-3-5-9-17/h6-7,10,13H,2-5,8-9,11H2,1H3,(H,18,20)(H,19,21)(H,22,23)",C17H24N2O4S,"C 17 H 24 N 2 O 4 S /c 1 - 12 ( 20 ) 18 - 13 ( 14 - 7 - 6 - 10 - 24 - 14 ) 11 - 15 ( 21 ) 19 - 17 ( 16 ( 22 ) 23 ) 8 - 4 - 2 - 3 - 5 - 9 - 17 /h 6 - 7 , 10 , 13 H , 2 - 5 , 8 - 9 , 11 H 2 , 1 H 3 , ( H , 18 , 20 ) ( H , 19 , 21 ) ( H , 22 , 23 )"
4,000026fc6c36,"InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7-8)5-2-3-9(11)16/h8H,2-7H2,1H3,(H2,11,16)(H,12,14)",C10H19N3O2S,"C 10 H 19 N 3 O 2 S /c 1 - 15 - 10 ( 14 ) 12 - 8 - 4 - 6 - 13 ( 7 - 8 ) 5 - 2 - 3 - 9 ( 11 ) 16 /h 8 H , 2 - 7 H 2 , 1 H 3 , ( H 2 , 11 , 16 ) ( H , 12 , 14 )"


In [7]:
# img = Image.open(dict_imgs_train_paths['000011a64c74']) #000011a64c74
# img = np.asarray(img)# /255.
# plt.imshow(img, cmap='gray')
# plt.show()