### Installing and importing prerequisites:

In [None]:
# Install the third-party packages this notebook needs into the kernel's environment.
# NOTE(review): atomInSmiles is not version-pinned, unlike the other packages — consider pinning it.
%pip install atomInSmiles
# Swap whatever pandas is preinstalled for 1.5.3. Presumably needed because moses still calls
# DataFrame.append (see the `_mcf.append(_pains, sort=True)` warning in the import cell's output),
# which newer pandas releases no longer provide — TODO confirm.
%pip uninstall pandas -y
%pip install pandas==1.5.3
# Presumably the distribution that provides the `moses` module imported below — TODO confirm.
%pip install molsets==0.1.0


In [2]:
from moses import metrics
import atomInSmiles as AIS
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import copy
import json
import os
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
import collections
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM , AutoConfig
from datasets import load_dataset
from datasets import DatasetDict
import datasets
from transformers import DataCollatorForLanguageModeling 
from huggingface_hub.hf_api import HfFolder
from AISTokenizer import AISTokenizer


  _mcf.append(_pains, sort=True)['smarts'].values]
2024-04-12 19:07:51.197290: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 19:07:51.197390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 19:07:51.321545: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Path to the vocabulary file handed to the project-local AISTokenizer.
vocabfile = "/kaggle/input/dpotest2/vocab(2).txt" 
# Chunk length (in tokens) used when packing training samples; also passed to the tokenizer.
context_length = 72
# Shared tokenizer instance used by the tokenize(), data collator and generation cells below.
tokenizer = AISTokenizer(vocabfile,context_length)

### Load and encode the finetuning dataset

In [5]:
import json
from datasets import Dataset

# Read the serialized pairs from disk. Each entry is indexed as a two-element
# pair; presumably both members are SMILES strings (they are fed to AIS.encode).
with open("/kaggle/input/dpotest2/unique_pairs_cpu_optimized.json", "r") as file:
    data = json.load(file)

# Flatten the pairs into a single "text" column: first member, then second
# member, each converted to its Atom-in-SMILES representation.
encoded_texts = []
for pair in data:
    encoded_texts.extend((AIS.encode(pair[0]), AIS.encode(pair[1])))

dataset_dict = {'text': encoded_texts}




In [6]:
# Wrap the encoded strings in a Hugging Face Dataset and shuffle the rows.
# The shuffle is seeded so that the row order — and therefore the packed
# chunks produced by the batched tokenize() mapping — is the same on every
# Restart & Run All.
train_dataset = Dataset.from_dict(dataset_dict).shuffle(seed=42)


In [7]:
# Display the dataset summary (feature names and row count) as the cell output.
train_dataset

Dataset({
    features: ['text'],
    num_rows: 56810
})

In [8]:
def tokenize(element):
    """Tokenize a batch of samples and pack them into fixed-size chunks.

    Intended to be used with a batched ``Dataset.map``. Every sample in the
    batch is tokenized, the resulting id lists are joined end to end into one
    long sequence, and that sequence is cut into consecutive pieces of
    ``context_length`` ids. Because most samples are shorter than
    ``context_length``, packing them this way avoids spending most of each
    training row on padding. The tokenizer is expected to mark sample
    boundaries with its EOS token, which lets the model learn where a
    sequence ends.

    Args:
        element: Mapping with a ``"text"`` entry holding the batch of strings.

    Returns:
        ``{'input_ids': chunks}`` where ``chunks`` is a list of id lists. Only
        ``input_ids`` is returned (no token_type_ids / attention_mask). The
        final chunk of a batch may be shorter than ``context_length``.
    """
    encoded = tokenizer(element["text"])

    # Join the per-sample id lists into one flat sequence.
    flat_ids = []
    for sample_ids in encoded['input_ids']:
        flat_ids.extend(sample_ids)

    # Slice the flat sequence into windows of at most context_length ids.
    chunks = []
    for start in range(0, len(flat_ids), context_length):
        chunks.append(flat_ids[start:start + context_length])

    return {'input_ids': chunks}


In [9]:
# Apply tokenize() to the dataset in batches (batched=True passes many rows per call, which is
# what lets tokenize() pack several samples into each chunk) and drop the raw "text" column so
# only the packed input_ids remain.
t_dataset = train_dataset.map(tokenize,batched=True,remove_columns=["text"])

  0%|          | 0/57 [00:00<?, ?ba/s]

### Define a function to benchmark the model produced by each hyperparameter combination

In [10]:
from moses import metrics

# Collect both members of every pair in the fine-tuning data into one set.
# Generated samples are looked up in this set to decide whether they are novel.
allgenerated = {molecule for pair in data for molecule in (pair[0], pair[1])}

def calc_unique(test_model, n_samples=250, max_new_tokens=72, temperature=0.5):
    """Sample molecules from ``test_model`` and score the batch.

    Each sample is generated unconditionally, decoded by the tokenizer into an
    Atom-in-SMILES string and converted back with ``AIS.decode``.

    Args:
        test_model: Model exposing ``generate``, ``eval``, ``train`` and the
            ``training`` flag (e.g. a Hugging Face causal LM).
        n_samples: Number of sequences to sample. Defaults to 250.
        max_new_tokens: Generation length cap passed to ``generate``.
        temperature: Sampling temperature passed to ``generate``.

    Returns:
        Tuple ``(uniqueness_percentage, novelty_percentage, unique_elements,
        intdiv)``:
        * percentage of distinct samples among all samples,
        * percentage (relative to all samples) of distinct samples that are
          not in ``allgenerated``,
        * the set of distinct decoded samples,
        * ``metrics.internal_diversity`` of the full sample list.

    Raises:
        ValueError: If ``n_samples`` is not positive (the percentages would
            otherwise divide by zero).
    """
    if n_samples <= 0:
        raise ValueError("n_samples must be a positive integer")

    # The training loop calls this right after Trainer.train(), which leaves
    # the model in training mode, and generate() does not switch modes itself.
    # Sample in eval mode so dropout does not perturb the distribution, then
    # restore whatever mode the caller had.
    was_training = test_model.training
    test_model.eval()
    samples = []
    try:
        for _ in range(n_samples):
            generated = test_model.generate(
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
            )
            # Note: model.generate includes the leading/trailing EOS tokens, so
            # we have to remove them ourselves with [1:-1].
            # NOTE(review): if generation stops at max_new_tokens without
            # emitting EOS, this slice drops a real token — confirm acceptable.
            samples.append(AIS.decode(tokenizer.decode(generated[0][1:-1])))
    finally:
        test_model.train(was_training)

    unique_elements = set(samples)
    novel_elements = {sample for sample in unique_elements if sample not in allgenerated}

    uniqueness_percentage = (len(unique_elements) / len(samples)) * 100
    novelty_percentage = (len(novel_elements) / len(samples)) * 100
    intdiv = metrics.internal_diversity(samples)
    return uniqueness_percentage, novelty_percentage, unique_elements, intdiv

In [11]:

# The collator builds the attention masks and labels for each batch. Note: do NOT set pad_token to eos_token as many guides do;
# that causes the data collator to set the label of the eos_token to -100, so it is ignored in the loss calculation and
# the resulting model never learns to end a sequence.
# mlm=False selects causal (next-token) language modeling rather than masked language modeling.
data_collator = DataCollatorForLanguageModeling (tokenizer,mlm=False)

In [12]:
# Sanity check: pull the first two packed rows and run them through the collator to inspect the batch it builds.
# NOTE(review): the original note said the datasets are iterable-only and cannot be indexed with []. t_dataset here
# comes from Dataset.from_dict(...).map(...), which looks like a regular map-style Dataset, so that restriction may
# be stale — confirm. Iterating works either way.
it = iter(t_dataset)
test = data_collator([next(it) for _ in range(2)])
test

{'input_ids': tensor([[   0,  511,   15,  211,    7,  270,  829,    3,  505, 1083,    8, 1130,
         1130, 1130, 1082,    3,  474,  103,    3,   15,  511,    4, 1078,    9,
         1130, 1130, 1130, 1082,   10, 1147, 1096,    3,  308,    3,  387,    4,
          387,    4, 1160, 1082,    9,   10,    4, 1130,    8,    4,  495,    7,
            0,  511,   15,  103,    3,  818,    7,  270,  167,    7,    8,  270,
          270,  270,    8,    4,  431,    7,  271,  270,  270,  884,    8,    3],
        [ 270,  271,  431,    3, 1096,    9, 1130, 1130, 1130, 1090,    3,  387,
            4, 1147,    9,    4,  271,    8,    4,  271,    7,    0,  286,  240,
          240,  240,  244,  474,  103,    3,   15,  511,    4,  103,    3,   15,
          511,    4,  474,  794,    3,  286,    4,  240,  240,  295,    3,  286,
            4,  286,    0,  286,  240, 1082,    7, 1078,    3,  103,    3,   15,
          511,    4,  474,  822,    8,  270,  270,  271,  431,    3, 1096,    9]]), 'attention

### Train!

In [13]:
from transformers import Trainer, TrainingArguments
from transformers.utils import logging
logging.set_verbosity_error()

# Hyperparameter grid: every (number of epochs, learning rate) combination is
# trained from the same starting checkpoint and then benchmarked.
epochs = [1, 2, 3]
lrs = [1e-4, 3e-4, 5e-4, 1e-3]

for epoch in epochs:
    for lr in lrs:
        # Reload the starting weights so runs do not build on each other.
        model = AutoModelForCausalLM.from_pretrained(
            "victornica/AIS_3", # location of saved SFT model
        )
        # NOTE(review): model_ref is loaded on every iteration but never used
        # in this cell; kept in case a later cell relies on it — confirm and
        # remove if it is dead code.
        model_ref = AutoModelForCausalLM.from_pretrained(
            "victornica/AIS_3", # location of saved SFT model
        )
        args = TrainingArguments(
            output_dir="/kaggle/working/sft",
            per_device_train_batch_size=128,
            per_device_eval_batch_size=128,
            num_train_epochs=epoch,
            weight_decay=0.1,
            learning_rate=lr,
            lr_scheduler_type="linear",
            report_to="none"
        )
        trainer = Trainer(
            model=model,
            tokenizer=tokenizer,
            args=args,
            data_collator=data_collator,
            train_dataset=t_dataset
        )

        trainer.train()

        uniqueness, novelty, unique_elems, intdiv = calc_unique(model)
        # Only keep the samples of runs that are (almost) free of duplicates.
        # The file name records the hyperparameters and the resulting scores.
        if uniqueness >= 98:
            # Context manager guarantees the handle is closed even if the
            # write raises.
            with open(f"{epoch}-{lr}-{uniqueness}-{novelty}-{intdiv}.txt", "w") as file:
                file.write(f"{unique_elems}")
        print("finished a tuning")

config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.8M [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 107.7398, 'train_samples_per_second': 293.169, 'train_steps_per_second': 1.151, 'train_loss': 0.6257256538637223, 'epoch': 1.0}
finished a tuning
{'train_runtime': 108.5611, 'train_samples_per_second': 290.951, 'train_steps_per_second': 1.142, 'train_loss': 0.6318123725152784, 'epoch': 1.0}
finished a tuning
{'train_runtime': 108.4449, 'train_samples_per_second': 291.263, 'train_steps_per_second': 1.143, 'train_loss': 0.646789058562248, 'epoch': 1.0}
finished a tuning
{'train_runtime': 108.5211, 'train_samples_per_second': 291.059, 'train_steps_per_second': 1.143, 'train_loss': 0.686693130000945, 'epoch': 1.0}
finished a tuning
{'train_runtime': 216.5225, 'train_samples_per_second': 291.757, 'train_steps_per_second': 1.145, 'train_loss': 0.6115672819076046, 'epoch': 2.0}
finished a tuning
{'train_runtime': 216.1577, 'train_samples_per_second': 292.25, 'train_steps_per_second': 1.147, 'train_loss': 0.6118397866525958, 'epoch': 2.0}
finished a tuning
{'train_runtime': 2

[20:05:37] Can't kekulize mol.  Unkekulized atoms: 12 13 14 16 18 19 20 21 22


finished a tuning
{'train_runtime': 324.1062, 'train_samples_per_second': 292.367, 'train_steps_per_second': 1.148, 'train_loss': 0.6029669238675025, 'epoch': 3.0}


[20:13:24] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 28


finished a tuning
{'train_runtime': 323.9606, 'train_samples_per_second': 292.499, 'train_steps_per_second': 1.148, 'train_loss': 0.6306084663637223, 'epoch': 3.0}
finished a tuning


[20:21:11] SMILES Parse Error: unclosed ring for input: 'Cc1cccc(-c2cccc(CNC(=O)c3ccc4c(=O)n5c(nc4c3)CCC4)c2)c1'
