In [13]:
import json
import os
import re
from pprint import pprint

# Must be set before torch initialises CUDA so that only this GPU is visible.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)
from trl import SFTTrainer

# CUDA_VISIBLE_DEVICES is restricted to a single GPU above, so that GPU is "cuda:0" here.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "microsoft/phi-2"

# SECURITY: never paste an access token into a notebook cell. A token that has
# appeared in a saved notebook must be treated as leaked and revoked.
# Authentication is taken from the environment when available (so
# Restart & Run All works unattended); otherwise prompt interactively.
if not (os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")):
    notebook_login()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/dchenbs/.cache/huggingface/token
Login successful


In [14]:
# Load the base checkpoint without any quantization config, then move it to DEVICE.
# NOTE(review): `model` is rebound further down by create_model_and_tokenizer(),
# so on a top-to-bottom run this load looks redundant — confirm it is still needed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
model = model.to(DEVICE)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.91it/s]


## Data

In [3]:
# Load the "TweetSumm" configuration of the Salesforce/dialogstudio dataset.
# The result is indexed by split name ("train", "validation", "test") below.
dataset = load_dataset(path="Salesforce/dialogstudio", name="TweetSumm")

# Instruction placed at the top of every training prompt.
DEFAULT_SYSTEM_PROMPT = (
    "Below is a conversation between a human and an AI agent. "
    "Write a summary of the conversation."
)


def generate_training_prompt(
    conversation: str, summary: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Assemble one instruction / input / response training example.

    Args:
        conversation: Dialogue transcript; surrounding whitespace is stripped.
        summary: Target summary, placed under the response header.
        system_prompt: Instruction text; defaults to DEFAULT_SYSTEM_PROMPT.

    Returns:
        The three sections separated by blank lines, with leading and
        trailing whitespace of the whole prompt removed.
    """
    sections = (
        f"### Instruction: {system_prompt}",
        "",
        "### Input:",
        conversation.strip(),
        "",
        "### Response:",
        summary,
    )
    return "\n".join(sections).strip()

def clean_text(text):
    """Remove links, @-mentions and caret-prefixed tokens, collapsing whitespace.

    The substitutions run in order; the result is not stripped, so a single
    leading or trailing space may remain where a token was removed.

    Args:
        text: Raw utterance string.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r"http\S+", ""),  # anything starting with "http" up to the next whitespace
        (r"@[^\s]+", ""),  # @handles
        (r"\s+", " "),  # collapse every whitespace run to one space
        (r"\^[^ ]+", ""),  # caret-prefixed tokens (presumably agent initials such as ^JK)
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def create_conversation_text(data_point):
    """Render a dialogue log as alternating "user:" / "agent:" lines.

    Args:
        data_point: Mapping whose "log" entry is a sequence of turns, each
            with "user utterance" and "system response" strings.

    Returns:
        One newline-terminated line per utterance, user first within each
        turn, with every utterance cleaned by clean_text and stripped.
    """
    lines = []
    for turn in data_point["log"]:
        user_utterance = clean_text(turn["user utterance"]).strip()
        agent_response = clean_text(turn["system response"]).strip()
        lines.append(f"user: {user_utterance}\n")
        lines.append(f"agent: {agent_response}\n")
    return "".join(lines)

def generate_text(data_point):
    """Build the conversation, summary and full training prompt for one example.

    The "original dialog info" field is a JSON string; the first entry of its
    abstractive summaries is a list of strings that is joined with spaces.

    Args:
        data_point: One dataset row with "original dialog info" and "log".

    Returns:
        Dict with keys "conversation", "summary" and "text".
    """
    dialog_info = json.loads(data_point["original dialog info"])
    first_summary = dialog_info["summaries"]["abstractive_summaries"][0]
    summary = " ".join(first_summary)

    conversation = create_conversation_text(data_point)
    prompt = generate_training_prompt(conversation, summary)
    return {"conversation": conversation, "summary": summary, "text": prompt}

In [4]:
def process_dataset(data: Dataset):
    """Shuffle a split, add the prompt columns and drop the raw source columns.

    Args:
        data: One dataset split containing the raw DialogStudio columns.

    Returns:
        The split shuffled with a fixed seed (42), with the columns produced
        by generate_text added and the listed raw columns removed.
    """
    raw_columns = [
        "original dialog id",
        "new dialog id",
        "dialog index",
        "original dialog info",
        "log",
        "prompt",
    ]
    shuffled = data.shuffle(seed=42)
    with_prompts = shuffled.map(generate_text)
    return with_prompts.remove_columns(raw_columns)

# Replace every split with its processed version.
# This overwrites the raw splits, so the cell cannot be re-run without reloading `dataset`.
for split_name in ("train", "validation", "test"):
    dataset[split_name] = process_dataset(dataset[split_name])

## Model

In [5]:
def create_model_and_tokenizer():
    """Load MODEL_NAME as a 4-bit quantized causal LM together with its tokenizer.

    Returns:
        Tuple (model, tokenizer). The tokenizer's pad token is set to its
        end-of-sequence token.
    """
    # 4-bit NF4 weights with double quantization; compute runs in float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    # device_map {"": 0} places the whole model on device index 0.
    quantized_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization,
        device_map={"": 0},
        trust_remote_code=True,
    )

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    tok.pad_token = tok.eos_token

    return quantized_model, tok

# Build the quantized model and tokenizer; this rebinds any earlier `model`.
model, tokenizer = create_model_and_tokenizer()
# Turn off the generation cache on the model config before training
# (presumably because it is only useful at inference time — confirm).
model.config.use_cache = False

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# LoRA adapter settings: rank-16 adapters on the "Wqkv" and "out_proj" modules,
# no bias training, 5% dropout on the adapter path.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "out_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report how many parameters will train.
# Re-running this cell wraps the already-wrapped model again.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 7,864,320 || all params: 2,787,548,160 || trainable%: 0.2821231974697076


## Training

In [7]:
# OUTPUT_DIR = "experiments"

# %load_ext tensorboard
# %tensorboard --logdir experiments/runs

In [8]:
# Short smoke-test run: the recorded output below stops at global_step=10
# (epoch 0.05), i.e. max_steps bounds the run rather than num_train_epochs.
training_arguments = TrainingArguments(
    # where checkpoints go, and whether to upload them
    output_dir="runs/phi-2-finetuned-dialogstudio",
    save_strategy="epoch",
    push_to_hub=True,
    # batch and schedule
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    max_steps=10,
    num_train_epochs=1,
    # optimisation
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # log the loss at every step
    logging_steps=1,
)

In [9]:
# Supervised fine-tuning on the prompt stored in each example's "text" column.
# `trl` has no `MultimodalTrainer`; `SFTTrainer` is the trainer that accepts
# `dataset_text_field` and `peft_config`.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
)
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.7921
2,2.9987
3,2.9525
4,2.9336
5,3.2104
6,2.9184
7,3.0683
8,3.2524
9,3.1701
10,2.798


TrainOutput(global_step=10, training_loss=3.009433913230896, metrics={'train_runtime': 8.7967, 'train_samples_per_second': 4.547, 'train_steps_per_second': 1.137, 'total_flos': 268856639201280.0, 'train_loss': 3.009433913230896, 'epoch': 0.05})

In [10]:
# Evaluate on the eval_dataset given to the trainer; the returned metrics dict
# (eval_loss, runtime, throughput, epoch) is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 2.8287675380706787,
 'eval_runtime': 7.3264,
 'eval_samples_per_second': 15.014,
 'eval_steps_per_second': 1.911,
 'epoch': 0.05}

## Inference

In [11]:
# from peft import PeftModel
# from transformers import AutoModelForCausalLM
# import torch
# model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True, torch_dtype=torch.float32)
# peft_model = PeftModel.from_pretrained(model, "runs/phi-2-finetuned-dialogstudio", from_transformers=True)
# model = peft_model.merge_and_unload().to(DEVICE)

In [12]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer

# # model = AutoModelForCausalLM.from_pretrained("ashishpatel26/phi-1_5-finetuned-dialogstudio", trust_remote_code=True, torch_dtype=torch.float32)
# tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
# inputs = tokenizer(f'''{dataset["test"]['text'][0]}''', return_tensors="pt", return_attention_mask=False).to(DEVICE)

# outputs = model.generate(**inputs, max_length=512)
# text = tokenizer.batch_decode(outputs)[0]
# print(text)