In [1]:
import pandas as pd
import numpy as np

import os

os.chdir('../')
print(os.getcwd())

/Users/gbemidebe/Documents/GitHub/SolutionTransformer


### load data

In [2]:
data = pd.read_csv('data/processed/cleaned_Brouwer_2021.csv')
print(data.shape)
data.head()

(20870, 4)


Unnamed: 0,solute,solvent,T,log_gamma
0,C,CCCCCCCCCCCCCCCC,40.0,-0.261365
1,C,CCCCCCCCCCCCCCCC,70.0,-0.287682
2,C,CCCCCCCCCCCCCCCC,90.0,-0.301105
3,CC,CCCCCCCCCCCCCCCC,40.0,-0.235722
4,CC,CCCCCCCCCCCCCCCC,70.0,-0.248461


## Split Data

In [4]:
import numpy as np
import pandas as pd

class SmilesData:
    def __init__(self, data):
        self.data = data

    def get_split(self, train_ratio=0.8, seed=None):
        n = len(self.data)
        indices = np.arange(n)
        if seed is not None:
            np.random.seed(seed)
        np.random.shuffle(indices)
        train_size = int(train_ratio * n)
        train_indices = indices[:train_size]
        test_indices = indices[train_size:]
        train_data = self.data.iloc[train_indices].reset_index(drop=True)
        test_data = self.data.iloc[test_indices].reset_index(drop=True)
        return train_data, test_data

In [5]:
train_data, test_data = SmilesData(data).get_split()

In [7]:
len(train_data) / (len(train_data) + len(test_data))

0.8

### tokenizer data

In [12]:
from transformers import AutoTokenizer #, 
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
tokenizer_name = 'seyonec/PubChem10M_SMILES_BPE_450k'
# load in the tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [14]:
data.columns.tolist()

['solute', 'solvent', 'T', 'log_gamma']

In [18]:
import torch
from torch.utils.data import Dataset
class Input(Dataset):
    def __init__(self, data, tokenizer, column_names = ['solute', 'solvent', 'T', 'log_gamma']):
        '''
        data: pandas dataframe with columns "solute", "solvent", "T", "log_gamma
        tokenizer: tokenizer to use'''

        self.data = data
        self.tokenizer = tokenizer
        self.column_names = column_names
        self.max_length = self.max_len(column_names[0]) + self.max_len(column_names[1]) + 3 # The total character length including [CLS], [SEP], and [PAD]

    def __len__(self):
        '''Returns the length of the dataset'''
        return len(self.data)

    def max_len(self, idx):
        """
        Returns the maximum length of the input sequence
        """
        return max([len(x) for x in self.data[idx]])

    def __getitem__(self, idx):
        """
        Returns the input data as a dictionary with keys "solute", "solvent", "input_ids", "attention_mask", "Temp", and "label"
        """
        solute = self.data.iloc[idx][self.column_names[0]]
        solvent = self.data.iloc[idx][self.column_names[0]]
        inputs = self.tokenizer(solute, solvent, return_tensors="pt", padding='max_length', truncation=True,
                                max_length=self.max_length)
        
        sample = {
                    'solute': solute,
                    'solvent': solvent,
                    'input_ids': inputs["input_ids"].squeeze(0),
                    'attention_mask': inputs["attention_mask"].squeeze(0),
                    'Temp': torch.tensor(self.data.iloc[idx][self.column_names[2]], dtype=torch.float).unsqueeze(0),
                    'label': torch.tensor(self.data.iloc[idx][self.column_names[3]], dtype=torch.float).unsqueeze(0)
                    }
        return sample
       

In [19]:
TrainData = Input(train_data, tokenizer)

In [20]:
TrainData[0]

{'solute': 'CCCCCC',
 'solvent': 'CCCCCC',
 'input_ids': tensor([  0, 329,   2,   2, 329,   2,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [22]:
TestData = Input(test_data, tokenizer)


In [23]:
TestData[0]

{'solute': 'CCCC(C)C',
 'solvent': 'CCCC(C)C',
 'input_ids': tensor([  0, 273,  12,  39,  13,  39,   2,   2, 273,  12,  39,  13,  39,   2,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 