In [1]:
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U accelerate
!pip install -U loralib
!pip3 install torch torchvision torchaudio
!pip install -U datasets
!pip install -U peft # This is to import PEFT
!pip install -U scipy

[0m

In [2]:
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch
import torch.nn as nn
from datasets import load_dataset, DatasetDict, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [3]:
# Hub id of the base checkpoint that the tokenizer and model cells load.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Upper bound on tokens per example; longer rows are filtered out of the dataset.
max_token = 128
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Log in to the Hugging Face Hub from inside the notebook.
# You need a huggingface token that can access llama2 (the checkpoint named in
# `model_name`); the later from_pretrained calls rely on this login.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Data preprocessing

In [5]:
# Instantiate the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Register '[PAD]' as the padding token so batches can be padded. The model cell
# later calls resize_token_embeddings(len(tokenizer)) to account for it.
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # LLAMA2 does not have default padding token
# mlm=False: collate batches for causal language modeling, not masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [6]:
# Tokenization functions
def tokenize_function(row):
    """
    Tokenize the "dialog" text of one dataset row with the notebook tokenizer.

    Truncation is switched off, so the encoding keeps every token of the text.
    """
    text = row["dialog"]
    encoding = tokenizer(text, truncation=False, max_length=max_token)
    return encoding

def is_shorter_than_max_token(row):
    """
    Return True when the row's `input_ids` holds at most `max_token` tokens.

    Used as a filter predicate: rows for which this is False are dropped.
    """
    token_count = len(row['input_ids'])
    return not token_count > max_token

def split_conversation(conversation):
    """
    Expand a conversation into its growing prefixes, one per completed exchange.

    The prefixes have lengths 2, 4, 6, ... so each one ends on the second turn
    of a pair; a trailing unpaired turn never ends a prefix.
    """
    prefixes = []
    for end in range(2, len(conversation) + 1, 2):
        prefixes.append(conversation[:end])
    return prefixes

def format_conversation(conversation: list[str]) -> str:
    """
    Render a list of alternating turns as a single tagged prompt string.

    The final two turns are tagged "<User>" and "<Assistant>". Every earlier
    turn is tagged "<Past User>" (even index) or "<Past Assistant>" (odd index).
    Turns are separated by newlines with no trailing newline. Fewer than two
    turns yields an empty string.
    """
    if len(conversation) < 2:
        return ""

    past_tags = ("<Past User>", "<Past Assistant>")
    # Everything before the last exchange is history; tag it by index parity.
    tagged_turns = [
        past_tags[position % 2] + turn
        for position, turn in enumerate(conversation[:-2])
    ]
    tagged_turns.append("<User>" + conversation[-2])
    tagged_turns.append("<Assistant>" + conversation[-1])
    return "\n".join(tagged_turns)

def convert_to_conversation(row):
    """
    Turn a row's list of turns into one tagged training string.

    The turns in row["dialog"] are rendered with format_conversation, the
    tokenizer's end-of-sequence marker is appended so the model can learn where
    a reply stops, and surrounding whitespace is stripped.

    Returns a dict with the single key "dialog" holding the resulting string.
    """
    conversation = format_conversation(row["dialog"])
    # Use the loaded tokenizer's own EOS string rather than a hard-coded
    # "<|endoftext|>": that literal is only a special token for GPT-2 style
    # vocabularies and would be tokenized as ordinary text by other tokenizers.
    conversation += tokenizer.eos_token or ""
    return {"dialog": conversation.strip()}

In [7]:
# Load and tokenize dataset
dataset = load_dataset("daily_dialog")

# Split into multiple turns of conversation: each dialog is replaced by the list
# of growing prefixes returned by split_conversation.
split_dataset = dataset.map(lambda x: {'dialog': split_conversation(x['dialog'])})

# Flatten dataset: every prefix of every dialog becomes its own example.
flatten_dataset_train = [item for row in split_dataset["train"]["dialog"] for item in row]
flatten_dataset_valid = [item for row in split_dataset["validation"]["dialog"] for item in row]
flatten_dataset_test = [item for row in split_dataset["test"]["dialog"] for item in row]

# Wrap the flattened lists back into Dataset objects with a single 'dialog' column.
flatten_dataset_train = Dataset.from_dict({'dialog': flatten_dataset_train})
flatten_dataset_valid = Dataset.from_dict({'dialog': flatten_dataset_valid})
flatten_dataset_test = Dataset.from_dict({'dialog': flatten_dataset_test})

# Rebind `dataset` to the flattened splits; the raw download is no longer referenced.
dataset = DatasetDict({
    'train': flatten_dataset_train,
    'validation': flatten_dataset_valid,
    'test': flatten_dataset_test
})

# Change to conversational manner: list of turns -> one tagged string per example.
dataset = dataset.map(convert_to_conversation)

# Tokenize dataset (no truncation is applied at this step).
dataset = dataset.map(tokenize_function)

# Filter out conversations longer than the token limit (max_token).
dataset = dataset.filter(is_shorter_than_max_token)

Map:   0%|          | 0/41637 [00:00<?, ? examples/s]

Map:   0%|          | 0/3851 [00:00<?, ? examples/s]

Map:   0%|          | 0/3700 [00:00<?, ? examples/s]

Map:   0%|          | 0/41637 [00:00<?, ? examples/s]

Map:   0%|          | 0/3851 [00:00<?, ? examples/s]

Map:   0%|          | 0/3700 [00:00<?, ? examples/s]

Filter:   0%|          | 0/41637 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3851 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3700 [00:00<?, ? examples/s]

## Training

In [8]:
class FP32Output(nn.Sequential):
    """Wrap a module so that whatever it returns is cast to float32."""

    def __init__(self, model: nn.Sequential):
        # The wrapped module becomes the only child of this Sequential.
        super().__init__(model)

    def forward(self, tensor: torch.Tensor):
        """Run the wrapped module on `tensor` and return the result as float32."""
        wrapped_output = super().forward(tensor)
        return wrapped_output.float()

In [9]:
# Instantiate the model
# load_in_8bit=True requests 8-bit weights and device_map='auto' leaves device
# placement to the library. use_cache=False is forwarded to the model config
# (presumably to disable the generation cache while training — confirm).
# NOTE(review): the recorded run of this cell stopped with
# "OSError: [Errno 28] No space left on device" while downloading the 9.98G
# first shard; make sure the cache directory has enough free disk space.
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map='auto', use_cache=False)
# Match the embedding table to the tokenizer, which gained a '[PAD]' token.
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model) # Freeze the weight of the model and some floating point changes.
# NOTE(review): prepare_model_for_kbit_training may already enable gradient
# checkpointing and input gradients by default, making the next two calls
# redundant — confirm against the installed PEFT version.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.lm_head = FP32Output(model.lm_head) # Change to fp32 for more stable back propagation.

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
# LORA config
# NOTE(review): no target_modules are listed, so PEFT presumably falls back to
# its built-in defaults for this architecture — confirm which layers get adapters.
config = LoraConfig(
    r=16, # rank of the low-rank LoRA update matrices (not the number of attention heads)
    lora_alpha=32, # alpha scaling factor applied to the LoRA update
    lora_dropout=0.05, # dropout applied inside the LoRA layers
    bias="none", # do not train any bias terms
    task_type="CAUSAL_LM"
)

# Wrap the base model with the LoRA adapters described by `config`; `model` is
# rebound to the wrapped model from here on.
model = get_peft_model(model, config)

In [None]:
# Training hyper-parameters. With 25 sequences per device and 40 accumulation
# steps, each optimizer step sees 25 * 40 = 1000 sequences per device.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version, since the installs
# in this notebook are unpinned.
training_args = TrainingArguments(
    output_dir="output_dir",
    per_device_train_batch_size=25,
    gradient_accumulation_steps=40,
    num_train_epochs=10,
    learning_rate=2e-4,
    evaluation_strategy="epoch",
    warmup_steps=50,
    fp16=True,
    weight_decay=1e-3,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
)

# Train on the filtered 'train' split and evaluate on 'validation'; batches are
# assembled by the causal-LM data collator created with the tokenizer.
trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer
)
# Runs the full training loop; this is the long-running step of the notebook.
trainer.train()

In [40]:
# Alternating user / assistant turns of the running chat, oldest first.
conversation_history = []

def talk_with_llm(chat: str) -> str:
    """
    Send one user message to the model and return the model's reply.

    The prompt is built from the stored history plus `chat` and an empty
    assistant turn, so it ends with the "<Assistant>" tag for the model to
    complete. On success both `chat` and the reply are appended to
    `conversation_history`; if tokenization or generation raises, the history
    is left exactly as it was.

    Args:
        chat: The user's new message.

    Returns:
        The decoded text of the newly generated tokens only.
    """
    # Work on a copy so a failure below cannot leave a dangling "" placeholder
    # turn in the shared history.
    pending_turns = conversation_history + [chat, ""]
    conversation = format_conversation(pending_turns)

    # Encode and move tensors to the selected device (cuda if applicable).
    encoded_input = tokenizer(conversation, return_tensors='pt')
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    prompt_length = encoded_input["input_ids"].shape[1]

    output = model.generate(**encoded_input, max_new_tokens=256)

    # The generated sequence starts with the prompt tokens. Slice them off by
    # token count instead of by character offset: decoding does not always
    # reproduce the prompt text byte-for-byte (special tokens are skipped and
    # whitespace may be cleaned up), which would shift a character-based cut.
    generated_ids = output[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    conversation_history.extend([chat, response])
    return response

## Conversation example

In [41]:
talk_with_llm("Yo what's up?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


" I'm looking for a new job. "

In [42]:
talk_with_llm("Interesting, what kinda job are you looking for?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


" I'm looking for a job in the advertising industry. "

## Push the model to Huggingface

In [None]:
# Upload the current `model` to a public (private=False) Hub repository.
# NOTE(review): the repo id mentions "gpt2-xl" while `model_name` in this
# notebook is a Llama 2 checkpoint — confirm the intended repository name.
model.push_to_hub("danjie/Chadgpt-gpt2-xl-conversation", commit_message="first draft", private=False)

## Load the model from Huggingface

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository holding the pushed adapter; its PEFT config records which base
# checkpoint the adapter belongs to (config.base_model_name_or_path).
peft_model_id = "danjie/Chadgpt-gpt2-xl-conversation"
config = PeftConfig.from_pretrained(peft_model_id)
# Reload the base model in 8-bit, letting the library choose device placement.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
# NOTE(review): this tokenizer comes straight from the base checkpoint, so it
# lacks the '[PAD]' token the training section added, and the base model is not
# resized here — confirm this matches how the adapter was trained.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)