In [None]:
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U accelerate
!pip install -U loralib
!pip install -U torch
!pip install -U datasets
!pip install -q git+https://github.com/huggingface/peft.git # This is to import PEFT
!pip install -U scipy

In [None]:
import inspect

import torch
import torch.nn as nn
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import BitsAndBytesConfig

In [None]:
# Base checkpoint that gets fine-tuned.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Token budget per tokenized dialogue sample.
max_token = 64
# Device that inference inputs are moved to: the GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Hugging Face Login

In [None]:
# Authenticate with the Hugging Face Hub. The token you enter must be one that
# has access to the Llama 2 checkpoints.
notebook_login()

## Data preprocessing

In [None]:
# Build the tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Llama 2 has no default padding token, so register '[PAD]' as one
tokenizer.add_special_tokens(dict(pad_token='[PAD]'))
# Batch collator for causal language modeling (mlm=False turns off masked-LM masking)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Tokenization functions
def dialog_to_string(dialog: list[str]) -> str:
    """
    Render the first two turns of a dialogue as one training string.

    The first turn is labelled ``<User>``, the second ``<Assistant>``, and the
    result is terminated with ``</s>``. Any turns after the second are ignored.

    Raises IndexError when ``dialog`` holds fewer than two turns.
    """
    user_turn, assistant_turn = dialog[0], dialog[1]
    pieces = ('<User> ', user_turn, '\n<Assistant> ', assistant_turn, "</s>")
    return "".join(pieces)

def tokenize_function(row):
    """
    Format and tokenize one dataset row.

    The row's ``dialog`` entry is replaced in place by its formatted string
    (see ``dialog_to_string``), and that string is tokenized with truncation
    to ``max_token`` tokens. Returns the tokenizer's output.
    """
    formatted = dialog_to_string(row["dialog"])
    row["dialog"] = formatted
    return tokenizer(formatted, truncation=True, max_length=max_token)

def is_shorter_than_max_token(row):
    """
    Tell whether a tokenized row fits within the token budget.

    Returns True when the row's ``input_ids`` holds at most ``max_token``
    entries, and False when it holds more.
    """
    token_count = len(row['input_ids'])
    return not token_count > max_token

In [None]:
# Load DailyDialog, tokenize every row, then keep only the rows that
# is_shorter_than_max_token accepts.
# NOTE(review): tokenize_function already truncates to max_token, so as written this
# filter appears unable to reject any row; confirm whether truncation or filtering
# is the intended way to enforce the length limit.
dataset = (
    load_dataset("daily_dialog")
    .map(tokenize_function)
    .filter(is_shorter_than_max_token)
)

## Training

In [None]:
class FP32Output(nn.Sequential):
    """
    Single-module container that casts the wrapped module's output to float32.

    The wrapped module is stored as element ``0`` of the sequence.
    """

    def __init__(self, model: nn.Sequential):
        """Wrap ``model`` as the only stage of this sequence."""
        super().__init__(model)

    def forward(self, tensor: torch.Tensor):
        """Run the wrapped module on ``tensor`` and return the result as float32."""
        raw_output = super().forward(tensor)
        return raw_output.float()

In [None]:
# Instantiate the base model with 8-bit weights. The quantization settings are passed
# as a BitsAndBytesConfig because the bare `load_in_8bit=True` keyword of
# from_pretrained is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
    use_cache=False,  # generation cache is switched off for training
)
# The tokenizer was extended with a '[PAD]' token, so resize the embeddings to its vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model) # Freeze the weight of the model and some floating point changes.
# NOTE(review): depending on the installed PEFT version, prepare_model_for_kbit_training
# may already perform the next two steps; they are kept explicit here.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.lm_head = FP32Output(model.lm_head) # Change to fp32 for more stable back propagation.

In [None]:
# LoRA adapter hyper-parameters
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,  # rank of the low-rank update matrices
    lora_alpha=32,  # alpha scaling
    lora_dropout=0.05,  # dropout applied on the LoRA layers
    bias="none",  # bias terms are not trained
)

# Wrap the prepared base model with the LoRA adapter described by `config`
model = get_peft_model(model, config)

In [None]:
# Newer transformers releases renamed TrainingArguments' `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` to `processing_class`. The install cell
# pulls an unpinned version, so pick whichever keyword the installed classes accept.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# NOTE(review): batch size 100 x 10 accumulation steps gives 1000 sequences per device
# per optimizer step; confirm that warmup_steps=150 is small relative to the total
# number of optimizer steps this dataset yields over 15 epochs.
training_args = TrainingArguments(
    output_dir="output_dir",
    per_device_train_batch_size=100,
    gradient_accumulation_steps=10,
    num_train_epochs=15,
    learning_rate=2e-4,
    warmup_steps=150,
    weight_decay=1e-3,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    **{eval_strategy_kwarg: "epoch"},  # evaluate on the validation split after every epoch
)

trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    args=training_args,
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)
trainer.train()

In [None]:
def talk_with_llm(tweet: str) -> str:
    """
    Generate a reply for the prompt ``tweet`` with the current ``model``.

    The prompt is tokenized, its tensors are moved to ``device``, and up to
    256 new tokens are generated. Returns the first generated sequence decoded
    to text with special tokens skipped.
    """
    # Tokenize the prompt, then place every tensor on the target device.
    batch = tokenizer(tweet, return_tensors='pt')
    on_device = {}
    for name, tensor in batch.items():
        on_device[name] = tensor.to(device)

    generated = model.generate(**on_device, max_new_tokens=256)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Try the model on a prompt laid out like dialog_to_string's output, stopping after the
# "<Assistant>" tag so the model writes the assistant turn.
talk_with_llm("<User> Yo, what's up \n<Assistant>")

## Push the model to Hugging Face

In [None]:
# Upload `model` to the Hub repository "danjie/Chadgpt-Llama2-7b".
# private=False requests a publicly visible repository.
model.push_to_hub("danjie/Chadgpt-Llama2-7b", commit_message="first draft", private=False)

## Load the model from Hugging Face

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

peft_model_id = "danjie/Chadgpt-Llama2-7b"
# The adapter's config names the base checkpoint that is loaded below.
config = PeftConfig.from_pretrained(peft_model_id)
# Load that base model with 8-bit weights. The quantization settings are passed as a
# BitsAndBytesConfig because the bare `load_in_8bit=True` keyword is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)