In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint that is loaded and fine-tuned throughout this notebook.
model_id = "facebook/opt-1.3b"

# bitsandbytes quantization settings: 4-bit NF4 weights with double
# quantization enabled and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map={"": 0} maps the root module (the empty key) to device index 0.
# The closing parenthesis was missing here, which made the whole cell a SyntaxError.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"":0}
)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the loaded model before it is prepared for training.
model.gradient_checkpointing_enable()
# Rebind `model` to whatever PEFT's k-bit preparation helper returns.
# NOTE(review): presumably this readies the quantized layers for training — confirm against the PEFT docs.
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyper-parameters, gathered in one mapping and expanded into LoraConfig.
lora_settings = {
    'r': 4,
    'lora_alpha': 32,
    # Names of the modules that receive adapters.
    'target_modules': ['k_proj', 'v_proj', 'q_proj', 'out_proj'],
    'lora_dropout': 0.05,
    'bias': 'none',
    'task_type': 'CAUSAL_LM',
}
config = LoraConfig(**lora_settings)

# Rebind `model` to the PEFT-wrapped model returned by get_peft_model.
model = get_peft_model(model, config)

In [1]:
def cap_len(col, max_len):
    """Truncate, in place, every entry of ``col`` that is longer than ``max_len``.

    Args:
        col: Mutable, index-assignable sequence of sliceable items
            (for example a list of strings).
        max_len: Maximum length an entry may keep.

    Returns:
        None. ``col`` itself is modified; entries that already fit are left
        untouched.
    """
    for idx, item in enumerate(col):
        if len(item) <= max_len:
            continue
        # Keep only the leading ``max_len`` elements of an over-long entry.
        col[idx] = item[:max_len]

In [3]:
def import_and_process_data(dataset_id, fields, data_split = 'train', max_len = 2048):
    """Load a dataset, cap the length of its text fields and tokenize them.

    Args:
        dataset_id: Identifier passed straight to ``load_dataset``.
        fields: Names of the text columns. They are handed to the
            module-level ``tokenizer`` as positional arguments, in this order.
        data_split: Name of the split whose text fields are capped before
            tokenization.
        max_len: Maximum number of characters kept per text entry
            (defaults to the previously hard-coded 2048).

    Returns:
        The mapped dataset: every split of the loaded dataset with the
        tokenizer's output columns added.

    Raises:
        KeyError: If ``data_split`` is not a split of the loaded dataset.
    """
    # Imported locally because no visible cell imports it; without this the
    # function raises NameError on a fresh kernel.
    from datasets import load_dataset

    data_set = load_dataset(dataset_id)

    # Reading a column (data_set[split][field]) hands back a materialized
    # copy, so truncating that copy in place never reached the data that is
    # tokenized below. Rewrite the columns through map() so the cap sticks.
    data_set[data_split] = data_set[data_split].map(
        lambda samples: {
            field: [text[:max_len] for text in samples[field]]
            for field in fields
        },
        batched=True
    )

    # Tokenize every split; the selected fields are passed positionally.
    processed_data = data_set.map(
        lambda samples: tokenizer(
            *[samples[field] for field in fields]
        ),
        batched=True
    )
    return processed_data

In [None]:
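# Uncomment exactly ONE of the dataset blocks below before running the
# training cell, which reads `data['train']`.
# Mental Health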
#data = import_and_process_data(
#    'Amod/mental_health_counseling_conversations',
#    ['Context', 'Response']
#)

# Physics
#data = import_and_process_data(
#    'camel-ai/physics',
#    ['message_1', 'message_2']
#)

# Biology
#data = import_and_process_data(
#    'camel-ai/biology',
#    ['message_1', 'message_2']
#)

In [None]:
import transformers

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Optimisation settings: 1 sample per device x 4 accumulation steps per
# optimizer update, for 1500 optimizer steps in total.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    max_steps=1500,
    warmup_steps=5,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim='paged_adamw_8bit',
    fp16=True,
    logging_steps=1
)

# Collator built with mlm=False, i.e. masked-language-model masking is off.
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    data_collator=lm_collator
)

# Disable the model's cache flag before training starts.
model.config.use_cache = False
trainer.train()

In [None]:
# Persist the trained model to 'outputs', the same directory the Trainer uses as output_dir.
# NOTE(review): `model` was wrapped by get_peft_model, so this presumably writes only the
# LoRA adapter files rather than the full base weights — confirm against the PEFT docs.
model.save_pretrained('outputs')