# Note
* The initial data preprocessing code (e.g., SMILES tokenization and `.npy` generation) was provided as part of the course materials. I used it to filter the dataset and create a smaller subset for training. The code that creates `train.pkl`, which is used for evaluation, is my own work.

# Imports

In [None]:
import gzip
import numpy as np
import pickle

# Lang class

In [None]:
class Lang:
    """Fixed character <-> index vocabulary for the reduced SMILES alphabet.

    The mapping is predefined and must be used exactly as given. Index 0 is
    '$' (end of sequence) and index 1 is '^' (start of sequence, which
    should never be generated).
    """

    def __init__(self):
        # The position of each symbol in this string is its vocabulary index.
        alphabet = "$^C(=O)[-]N+1P234S#567HIBF89"
        self.chartoindex = {
            symbol: position for position, symbol in enumerate(alphabet)
        }
        self.indextochar = dict(enumerate(alphabet))
        self.nchars = len(alphabet)

    def indexesFromSMILES(self, smiles_str):
        """Encode a SMILES string as a uint8 array ending with the '$' index.

        Raises KeyError if the string contains a character outside the
        alphabet.
        """
        encoded = [self.chartoindex[symbol] for symbol in smiles_str]
        end_token = self.chartoindex["$"]
        return np.array(encoded + [end_token], dtype=np.uint8)

    def indexToSmiles(self,indices):
        """Decode a sequence of indices into a SMILES string.

        Every index is decoded first; the result is then cut at the first
        end-of-sequence position (value 0), so only the characters before
        it are returned.
        """
        pieces = []
        for value in indices:
            if value == 0.0:
                # 'E' is not in the alphabet, so it safely marks the cut point.
                pieces.append('E')
            else:
                pieces.append(self.indextochar[int(value)])
        decoded = ''.join(pieces)
        return decoded.partition('E')[0]


# SMILES Filtering: Count and Select Short Molecules for Training

In [None]:
# Width of each row in the encoded training array; an encoded molecule
# (including its trailing end-of-sequence token) has to fit in this many slots.
max_length = 20
# Character <-> index mapping used to encode the SMILES strings.
language = Lang()
# Gzip-compressed text file that is read line by line, one SMILES string per line.
data_path = 'training_zip_file'

In [None]:
# Count how many SMILES strings are short enough to be kept for training.
#   tot_N -- total number of lines in the file (sanity check only)
#   N     -- number of lines shorter than max_length; used to size the
#            encoded array, so the threshold is tied to max_length rather
#            than repeated as a literal.
N = 0
tot_N = 0

with gzip.open(data_path, 'rt') as f:
    for line in f:
        tot_N += 1
        # A string of at most max_length - 1 characters still leaves room
        # for the end-of-sequence token appended during encoding.
        if len(line.rstrip()) < max_length:
            N += 1

print(tot_N)
print(N)

## Create npy

In [None]:
# Encode every SMILES string shorter than max_length into one fixed-width row
# of token indices. Rows are zero-padded on the right; 0 is also the index of
# the '$' end-of-sequence token.
examples = np.zeros((N, max_length), dtype=np.uint8)
with gzip.open(data_path, 'rt') as f:
    idx = 0  # next free row in `examples`
    for line in f:
        example = line.rstrip()

        # This threshold must be the same one used to count N. With a smaller
        # cut-off here, the rows reserved for the longer molecules would stay
        # all-zero and be saved as bogus empty examples.
        if len(example) < max_length:
            ex = language.indexesFromSMILES(example)
            examples[idx][:len(ex)] = ex
            idx += 1

# Drop rows that were never filled, so a stale N (e.g. from an earlier
# notebook run) cannot leave all-zero examples in the array.
examples = examples[:idx]

print('Preprocessed data shape:',examples.shape)

In [None]:
# Write the encoded `examples` array to disk as train_data.npy.
np.save('train_data.npy',examples)

# Create a pickle

In [None]:
# Collect every distinct, non-empty SMILES string from the data file
# (no length filtering is applied here).
smiles_set = set()

with gzip.open(data_path, 'rt') as source:
    # Strip surrounding whitespace from each line and discard blank results.
    smiles_set.update(filter(None, map(str.strip, source)))

# Serialize the set of unique SMILES strings to train.pkl.
with open('train.pkl', 'wb') as sink:
    pickle.dump(smiles_set, sink)