In [32]:
# %env CUDA_LAUNCH_BLOCKING=1

In [33]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_dataset
import datasets
import evaluate
from scipy.special import softmax

In [34]:
dataset = load_dataset("csv", data_files={"train": "data/train.csv", "val": "data/validation.csv", "test": "data/test.csv"})
dataset = dataset.select_columns(["SMILES", "activity"])
dataset = dataset.rename_columns({"SMILES": "text", "activity": "label"})
dataset = dataset.cast_column("label", datasets.ClassLabel(names=["No.", "Yes."]))

Found cached dataset csv (/home/ubuntu/.cache/huggingface/datasets/csv/default-f807572a0460dc1f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 3/3 [00:00<00:00, 366.93it/s]
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/csv/default-f807572a0460dc1f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-d90ccb92f12d2c50.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/csv/default-f807572a0460dc1f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-e2440dd5761cb571.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/csv/default-f807572a0460dc1f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-6f2d1fc9cae3cec9.arrow


In [35]:
tokenizer = RobertaTokenizerFast.from_pretrained("DeepChem/ChemBERTa-77M-MTR", model_max_length=512)

model = RobertaForSequenceClassification.from_pretrained("DeepChem/ChemBERTa-77M-MTR")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length = 512)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/csv/default-f807572a0460dc1f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-22362df55a3d49ec.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/csv/default-f807572a0460dc1f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-71513d0770ad9b82.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/csv/default-f807572a0460dc1f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-188645472dd71aa5.arrow


In [37]:
auc = evaluate.load("roc_auc")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    pred_scores = softmax(predictions, axis=-1)[:, 1]
    return auc.compute(prediction_scores=pred_scores, references=labels)

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1000,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [39]:
preds, labels, metrics = trainer.predict(tokenized_dataset["test"])
pred_scores = softmax(preds, axis=-1)[:, 1]
auc = evaluate.load("roc_auc")
print(auc.compute(prediction_scores=pred_scores, references=labels))



{'roc_auc': 0.9635416666666667}


In [40]:
trainer.save_model("../models/chemberta")