In [1]:
# Make the parent directory importable from this notebook.
import sys
sys.path.append('../')

# GPU selection: these variables are set here, before torch is imported below.
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2, 3"

import pandas as pd
import numpy as np
import csv
import torch
import torch.nn as nn

from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftType
import torch

# The first call's return value is discarded; the device is chosen on the next line.
torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, PretrainedConfig, Trainer, TrainingArguments
from transformers import default_data_collator, get_linear_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, get_constant_schedule
from tqdm import tqdm
# from torch.utils.tensorboard import SummaryWriter
import logging
import random
import time

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
# Pick the target species for this LoRA adapter out of the species table.
SPECIES_CSV = '/data/hansol/jroot/data_DBAASP/Genomes_35/species.csv'
SPECIES_INDEX = 9  # position of the target species in the 'Species Name' column

species_df = pd.read_csv(SPECIES_CSV)
species_list = species_df['Species Name'].to_numpy(copy=True)
species = species_list[SPECIES_INDEX]
species

'Micrococcus luteus'

In [3]:
class TrainConfig:
    """Paths and preprocessing settings for LoRA fine-tuning on one species.

    Parameters
    ----------
    species_name : str, optional
        Species whose train/validation CSV files are used. When omitted, the
        notebook-level ``species`` variable is used, so ``TrainConfig()``
        keeps its previous behaviour.

    Attributes
    ----------
    model_name_or_path, tokenizer_name_or_path : str
        Directory of the pre-trained ProtGPT2 model and tokenizer.
    train_path, test_path : str
        Per-species training and validation CSV files.
    config : PretrainedConfig
        A default-constructed ``PretrainedConfig``.
    task_type : str
        Label for this run type.
    text_column : str
        Name of the CSV column read by the preprocessing step.
    max_length : int
        Fixed length every example is padded/truncated to.
    batch_size : int
        Batch size used by the notebook's DataLoaders.
    """

    def __init__(self, species_name=None):
        # Fall back to the notebook global only when no species is given, so
        # the dependency on hidden notebook state is explicit and overridable.
        if species_name is None:
            species_name = species

        # directory for pre-trained ProtGPT2 model and tokenizer
        self.model_name_or_path = '/data/hansol/jroot/ProtGPT2'
        self.tokenizer_name_or_path = '/data/hansol/jroot/ProtGPT2'

        # directory for train/validation data of each target species
        data_dir = '/data/hansol/jroot/data_DBAASP/separated_by_species/'
        self.train_path = data_dir + species_name + '_train_mol_mic64_cleaned.csv'
        self.test_path = data_dir + species_name + '_val_mol_mic64_cleaned.csv'

        self.config = PretrainedConfig()
        self.task_type = 'lora_fine-tune'
        self.text_column = 'sequence'
        self.max_length = 100
        self.batch_size = 64

In [4]:
train_config = TrainConfig()

# Load the per-species train/validation CSV files as a DatasetDict with
# "train" and "test" splits.
data_files = {"train": train_config.train_path, "test": train_config.test_path}
dataset = load_dataset('csv', data_files=data_files)

# Load the tokenizer from its own configured path. Previously this read
# model_name_or_path, which left tokenizer_name_or_path unused; both currently
# point at the same directory, so the loaded tokenizer is unchanged.
tokenizer = AutoTokenizer.from_pretrained(train_config.tokenizer_name_or_path)
# If the tokenizer defines no pad token, reuse EOS so fixed-length padding works.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [5]:
def preprocess_function(examples, *, tok=None, max_length=None, text_column=None):
    """Tokenize a batch of sequences into fixed-length causal-LM examples.

    Each sequence is tokenized, wrapped in EOS tokens on both sides, truncated
    to ``max_length`` and then right-padded to exactly ``max_length``.

    Parameters
    ----------
    examples : mapping
        Batch as passed by ``Dataset.map(..., batched=True)``; must contain
        ``text_column`` mapped to a list of strings.
    tok : callable, optional
        Tokenizer exposing ``eos_token_id`` and ``pad_token_id`` and returning
        a mapping with ``"input_ids"`` when called on a list of strings.
        Defaults to the notebook-level ``tokenizer``.
    max_length : int, optional
        Target length. Defaults to ``train_config.max_length``.
    text_column : str, optional
        Column to read. Defaults to ``train_config.text_column``.

    Returns
    -------
    mapping
        The tokenizer output with ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"`` replaced by lists of 1-D tensors of length ``max_length``.
        Padding positions carry ``pad_token_id`` in ``input_ids``, ``0`` in
        ``attention_mask`` and ``-100`` in ``labels``.
    """
    # Resolve notebook-level defaults only when no override was supplied.
    tok = tokenizer if tok is None else tok
    max_length = train_config.max_length if max_length is None else max_length
    text_column = train_config.text_column if text_column is None else text_column

    # Tokenize once: labels are derived from the same ids as the inputs.
    model_inputs = tok(list(examples[text_column]))
    eos_id = tok.eos_token_id
    pad_id = tok.pad_token_id

    input_ids, attention_mask, labels = [], [], []
    for token_ids in model_inputs["input_ids"]:
        # Frame with EOS on both sides, then cut to the fixed length; an
        # over-long sequence therefore loses its closing EOS.
        framed = ([eos_id] + list(token_ids) + [eos_id])[:max_length]
        n_pad = max_length - len(framed)

        input_ids.append(torch.tensor(framed + [pad_id] * n_pad))
        attention_mask.append(torch.tensor([1] * len(framed) + [0] * n_pad))
        # -100 marks padded positions in the labels, separately from the real
        # EOS labels (which matters when pad_token_id equals eos_token_id).
        labels.append(torch.tensor(framed + [-100] * n_pad))

    model_inputs["input_ids"] = input_ids
    model_inputs["attention_mask"] = attention_mask
    model_inputs["labels"] = labels
    return model_inputs

In [6]:
# Tokenize and pad both splits. The original CSV columns are removed so only
# the fields returned by preprocess_function remain; the cache is bypassed so
# the mapping is recomputed on every run.
raw_columns = dataset["train"].column_names
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=raw_columns,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["test"]

Running tokenizer on dataset:   0%|          | 0/733 [00:00<?, ? examples/s]

733


Running tokenizer on dataset: 100%|██████████| 733/733 [00:00<00:00, 1198.38 examples/s]
Running tokenizer on dataset: 100%|██████████| 82/82 [00:00<00:00, 1133.98 examples/s]

82





In [7]:
# DataLoaders over the processed splits; only the training loader shuffles.
# NOTE(review): the Trainer call visible later in this notebook is given the
# datasets directly, not these loaders - confirm whether they are still needed.
loader_kwargs = dict(
    collate_fn=default_data_collator,
    batch_size=train_config.batch_size,
    pin_memory=True,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(eval_dataset, **loader_kwargs)

In [8]:
# Spot-check the preprocessing: decoded text of one validation example and the
# raw token ids of another.
# NOTE(review): the decoded example is index 1 but the raw ids are index 0 -
# confirm this mismatch is intended.
decoded_example = tokenizer.decode(eval_dataset[1]['input_ids'])
print(decoded_example)
print(eval_dataset[0]['input_ids'])

<|endoftext|>FLWGLIPGAISAVTSLIKK<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endo

In [9]:
# choose rank and alpha hyperparameter values, and batch size per available GPU device here
r = 32
a = int(2*r)
batch_size = 16   # total batch size; split across devices in the training cell
dev_size = 4      # NOTE(review): hard-coded device count - confirm it matches the GPUs actually visible


def _build_lora_config():
    """Return the LoRA configuration built from the notebook-level ``r`` and ``a``.

    Single source of truth for the adapter settings, shared by the parameter
    preview and by ``model_init`` so the two cannot drift apart.
    """
    return LoraConfig(
        task_type="CAUSAL_LM",
        r=r,
        lora_alpha=a,
        # target_modules=["q", "v"],
        lora_dropout=0.1,
        bias="none",
    )


# create configuration for LoRA fine-tuning that can be integrated into huggingface's trainer function
def model_init():
    """Load the pre-trained model and wrap it with a fresh LoRA adapter.

    Returns
    -------
    The PEFT-wrapped causal LM loaded from ``train_config.model_name_or_path``.
    """
    base_model = AutoModelForCausalLM.from_pretrained(train_config.model_name_or_path)
    return get_peft_model(base_model, _build_lora_config())


# show number of trainable parameters for the chosen set of rank and alpha values
config = _build_lora_config()
model = model_init()
model.print_trainable_parameters()



trainable params: 5,898,240 || all params: 779,928,320 || trainable%: 0.7563


In [11]:
# training and testing

# The learning rate is written once as a string so that the value passed to
# the trainer and the tag embedded in the run directory cannot drift apart.
lr_tag = '1e-5'

# Single run directory used for checkpoints, logs and the final saved model.
run_dir = (
    '/data/hansol/jroot/lora_fine-tune_dbaasp/' + species
    + '/r' + str(r) + '_alpha' + str(a) + '_lr' + lr_tag + '_batch' + str(batch_size)
)

# Total batch size split evenly over the devices.
per_device_batch_size = batch_size // dev_size

training_args = TrainingArguments(
    output_dir=run_dir,
    num_train_epochs=100,              # total number of training epochs
    learning_rate=float(lr_tag),
    per_device_train_batch_size=per_device_batch_size,   # batch size per device during training
    per_device_eval_batch_size=per_device_batch_size,    # batch size per device during evaluation
    logging_dir=run_dir,             # directory for storing logs
    logging_strategy='epoch',
    do_train=True,                   # Perform training
    do_eval=True,                    # Perform evaluation
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",     # evaluate after each epoch
    save_strategy="epoch",
    gradient_accumulation_steps=1,   # batches whose gradients are accumulated before each optimizer update
    fp16=True,                       # Use mixed precision
    seed=0,
    load_best_model_at_end = True
)

trainer = Trainer(
    model_init=model_init,                # builds a fresh LoRA-wrapped model
    args=training_args,                   # training arguments, defined above
    train_dataset=train_dataset,          # training dataset
    eval_dataset=eval_dataset,            # evaluation dataset
)

trainer.train()
# Both calls write into the same run directory, as in the original cell.
trainer.save_model(run_dir)
trainer.model.save_pretrained(run_dir)



Epoch,Training Loss,Validation Loss
1,11.9009,12.108963
2,11.4318,11.607847
3,11.0377,11.056172
4,10.5641,10.539885
5,10.1641,10.119022
6,9.8536,9.85003
7,9.6778,9.693913
8,9.5389,9.575465
9,9.4634,9.466384
10,9.3781,9.358882


