# Setup

In [18]:
!pip install -qqq bitsandbytes==0.39.0 
!pip install -qqq -U git+https://github.com/huggingface/transformers.git@e03a9cc 
!pip install -qqq -U git+https://github.com/huggingface/peft.git@42a184f 
!pip install -qqq -U git+https://github.com/huggingface/accelerate.git@c9fbb71 
!pip install -qqq datasets==2.12.0 
!pip install -qqq loralib==0.1.1 
!pip install -qqq einops==0.6.1 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [7]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [8]:
MODEL_NAME = "vilsonrodrigues/falcon-7b-instruct-sharded"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of FalconForCausalLM were not initialized from the model checkpoint at vilsonrodrigues/falcon-7b-instruct-sharded and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
def print_trainable_parameters(model):
  """
  Prints the number of trainable parameters in the model.
  """
  trainable_params = 0
  all_param = 0
  for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
      trainable_params += param.numel()
  print(
      f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
  )

In [22]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 4718592 || all params: 3613463424 || trainables%: 0.13058363808693696


In [23]:
prompt = """
<human>: midjourney prompt for a girl sit on the mountain
<assistant>:
""".strip()

In [24]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [12]:
%%time
device = "cuda:0"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

<human>: midjourney prompt for a boy running in the snow
<assistant>: A young boy running in the snow, with a backpack, and a red scarf, by the artist "Miyu" --ar 16:9 --w 3000 --h 5000 --no-repeat --no-stretch --ar 16:9 --w 3000 --h 5000 --no-repeat --ar 16:9 --w 3000 --h 5000 --no-repeat --ar 16:9 --w 3000 --h 5000 --no-repeat --ar 16:9 --w 3000 --h 5000 --no-repeat --ar 16:9 --w 3000 --h 5000 --no-repeat --ar 16:9 --w 3000 --h 5000 --no-repeat
CPU times: user 16.8 s, sys: 15.4 ms, total: 16.8 s
Wall time: 16.8 s


In [26]:
data = load_dataset("csv", data_files="midjourney_prompt_dataset.csv")

Found cached dataset csv (/home/andy/.cache/huggingface/datasets/csv/default-7510911a5294dc2e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [27]:
data["train"][0]

{'User': '"midjourney prompt for a female character in a futuristic setting"',
 'Prompt': '"< yoshida akihiko art, pixiv art, patreon art, girl art, painting by Yoshida Akihiko, Nier Automata 2B, Nier Automata, r-18, Nier Automata concept art, Akihiko Yoshida concept art, painting by Akihiko Yoshida"'}

In [28]:
data

DatasetDict({
    train: Dataset({
        features: ['User', 'Prompt'],
        num_rows: 289
    })
})

In [29]:
def generate_prompt(data_point):
  return f"""
<human>: {data_point["User"]}
<assistant>: {data_point["Prompt"]}
""".strip()

def generate_and_tokenize_prompt(data_point):
  full_prompt = generate_prompt(data_point)
  tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
  return tokenized_full_prompt

In [30]:
data = data["train"].shuffle().map(generate_and_tokenize_prompt)

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

In [31]:
data

Dataset({
    features: ['User', 'Prompt', 'input_ids', 'attention_mask'],
    num_rows: 289
})

# Train

In [32]:
training_args = transformers.TrainingArguments(
      per_device_train_batch_size=1,
      gradient_accumulation_steps=4,
      num_train_epochs=1,
      learning_rate=2e-4,
      fp16=True,
      save_total_limit=3,
      logging_steps=1,
      output_dir="experiments",
      optim="paged_adamw_8bit",
      lr_scheduler_type="cosine",
      warmup_ratio=0.05,
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=72, training_loss=2.8763046430216894, metrics={'train_runtime': 51.4732, 'train_samples_per_second': 5.615, 'train_steps_per_second': 1.399, 'total_flos': 426189740580864.0, 'train_loss': 2.8763046430216894, 'epoch': 1.0})

# Save

In [35]:
model.save_pretrained("trained-model")

In [9]:
config = PeftConfig.from_pretrained('./trained-model')
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, './trained-model')

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of FalconForCausalLM were not initialized from the model checkpoint at vilsonrodrigues/falcon-7b-instruct-sharded and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Run

In [10]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [13]:
%%time
device = "cuda:0"

prompt = """
<human>: midjourney prompt for a boy running in the sand
<assistant>:
""".strip()

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

<human>: midjourney prompt for a boy running in the sand
<assistant>: A young boy running in the sand, with a bucket of water, on a beach, in the middle of a storm, --ar 16:9 --w 3000 --h 5000 --no people --no shadows --ar 16:9 --w 3000 --h 5000 --no people --ar 16:9 --w 3000 --h 5000 --ar 16:9 --w 3000 --h 5000 --ar 16:9 --w 3000 --h 5000 --ar 16:9 --w 3000 --h 5000 --ar 16:9 --w 3000 --h 5000 --ar 16:9 --w 3000 --h 5000 --ar 16:9 --w 
CPU times: user 16 s, sys: 12.1 ms, total: 16 s
Wall time: 16 s
