In [8]:
import pandas as pd
import numpy as np

import os

print(f'Current working directory: {os.getcwd()}')
os.chdir('../')
print(f'Changed working directory: {os.getcwd()}')

Current working directory: /
Changed working directory: /


### load data

In [9]:
data = pd.read_csv('/content/sample_data/cleaned_Brouwer_2021.csv')
print(data.shape)
data.head()

(20870, 4)


Unnamed: 0,solute,solvent,T,log_gamma
0,C,CCCCCCCCCCCCCCCC,40.0,-0.261365
1,C,CCCCCCCCCCCCCCCC,70.0,-0.287682
2,C,CCCCCCCCCCCCCCCC,90.0,-0.301105
3,CC,CCCCCCCCCCCCCCCC,40.0,-0.235722
4,CC,CCCCCCCCCCCCCCCC,70.0,-0.248461


## Split Data

In [10]:
!pip install datasets



In [11]:
!pip install accelerate -U



In [12]:
from datasets import Dataset, DatasetDict

In [13]:
data.columns = ['solute', 'solvent', 'T', 'label']

# convert to dataset
dataset = Dataset.from_pandas(data, preserve_index=False)

# using split function to split dataset into train, test, and validation sets
train_testvalid = dataset.train_test_split(test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['solute', 'solvent', 'T', 'label'],
        num_rows: 16696
    })
    test: Dataset({
        features: ['solute', 'solvent', 'T', 'label'],
        num_rows: 2087
    })
    valid: Dataset({
        features: ['solute', 'solvent', 'T', 'label'],
        num_rows: 2087
    })
})

In [14]:
from transformers import AutoTokenizer, AutoConfig

In [15]:
# load in the tokenizer
model_name = 'seyonec/PubChem10M_SMILES_BPE_450k'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# get maximum length of input sequence
def max_len(data, idx):
    """
    Returns the maximum length of the input sequence"""
    return max([len(x) for x in data[idx]])

maximum_length=max_len(data, "solute") + max_len(data, "solvent") + 3
print(maximum_length)

# tokenize function
def tokenize_function(examples):
    '''
    Tokenizes the input sequence'''
    return tokenizer(examples["solute"], examples["solvent"], padding='max_length', truncation=True, max_length=maximum_length)

tokenized_datasets = train_test_valid_dataset.map(tokenize_function, batched=True).remove_columns(["solute", 'solvent', 'T'])

train_data = tokenized_datasets["train"]
eval_data = tokenized_datasets["valid"]
test_data = tokenized_datasets['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


136


Map:   0%|          | 0/16696 [00:00<?, ? examples/s]

Map:   0%|          | 0/2087 [00:00<?, ? examples/s]

Map:   0%|          | 0/2087 [00:00<?, ? examples/s]

In [16]:
train_data

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 16696
})

In [17]:
# key references
"""
# trainer
https://github.com/huggingface/transformers/blob/v4.36.1/src/transformers/trainer.py
# roberta
https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/modeling_roberta.py
# modeling_outputs
https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_outputs.py
"""

'\n# trainer\nhttps://github.com/huggingface/transformers/blob/v4.36.1/src/transformers/trainer.py\n# roberta\nhttps://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/modeling_roberta.py\n# modeling_outputs\nhttps://github.com/huggingface/transformers/blob/main/src/transformers/modeling_outputs.py\n'

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput

from typing import List, Optional, Tuple, Union

class ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, num_labels=1):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class STM(nn.Module):
    def __init__(self,
                 model_name = "seyonec/PubChem10M_SMILES_BPE_450k", # model name
                 n_classes=1):
        super(STM, self).__init__()
        # pretrained model
        config = AutoConfig.from_pretrained(model_name) # configuration
        self.transformer = RobertaModel.from_pretrained(model_name, config=config, add_pooling_layer=False) # load in the model
        self.classifier = ClassificationHead(config, n_classes)


    def forward(self,
                input_ids: Optional[torch.LongTensor] = None,
                attention_mask: Optional[torch.FloatTensor] = None,
                temperature: Optional[torch.LongTensor] = None,
                labels: Optional[torch.LongTensor] = None,
                output_hidden_states: Optional[bool] = None,
                output_attentions: Optional[bool] = None,
                return_dict: Optional[bool] = None,)-> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        """
        :param input_ids: SMILES encodings
        :param attention_mask: attention mask (1 for non-padding token and 0 for padding)
        :param temperature: temperature
        """

        outputs = self.transformer(input_ids = input_ids,
                                    attention_mask = attention_mask)
        # transformer output
        sequence_output = outputs[0]
        # NN output
        output = self.classifier(sequence_output)
        # loss
        loss_fct = nn.MSELoss()
        loss = loss_fct(output.squeeze(), labels.squeeze())

        return SequenceClassifierOutput(
            loss=loss,
            logits=output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

In [19]:
from transformers import  Trainer, EvalPrediction
from transformers.training_args import TrainingArguments

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [20]:
def compute_metrics(output: EvalPrediction):
    """
    Computes the metrics for evaluation"""
    preds, labels = output

    return {
        "mse": mean_squared_error(labels, preds),
        "mae": mean_absolute_error(labels, preds),
        "r2": r2_score(labels, preds)}


In [21]:
# model
testingModel = STM()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [22]:
# choose arguments for training
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=1e-4, evaluation_strategy="epoch", save_strategy="epoch",
    do_train=True, do_eval=True, optim='adamw_torch', load_best_model_at_end=True,)

In [23]:
# trainer for training
trainer = Trainer(
    model=STM, args=training_args, compute_metrics=compute_metrics,
    train_dataset=train_data, eval_dataset=eval_data,)

# train the model
trainer.train()

AttributeError: 'torch.device' object has no attribute '_apply'