In [None]:
# Upgrade pip, then install pinned versions of torch, transformers, datasets,
# peft, bitsandbytes and trl (progress bars disabled to keep the output short).
!pip install -Uqqq  pip --progress-bar off
!pip install -qqq  torch==2.0.1 --progress-bar off
!pip install transformers==4.32.1 --progress-bar off
!pip install datasets==2.14.4 --progress-bar off
!pip install peft==0.5.0 --progress-bar off
!pip install bitsandbytes==0.41.1 --progress-bar off
!pip install trl==0.7.1 --progress-bar off


[0m

In [None]:
# NOTE(review): peft==0.5.0 is already installed by the first install cell;
# this repeat looks redundant and can probably be removed.
!pip install peft==0.5.0

[0m

In [None]:
import json
import os
import re
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer
# Hugging Face Hub id of the base chat model loaded later in the notebook.
Model = "meta-llama/Llama-2-7b-chat-hf"

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
  DEVICE = "cuda:0"
else:
  DEVICE = "cpu"

In [None]:
# Load the "TweetSumm" configuration of the Salesforce/dialogstudio dataset;
# the bare `dataset` expression displays the loaded object as the cell output.
dataset = load_dataset('Salesforce/dialogstudio', "TweetSumm")
dataset

In [None]:
# Instruction placed at the top of every prompt unless the caller overrides it.
DEFAULT_SYSTEM_PROMPT = """
Below is the conversation between a human and an AI agent. Write a summary of the conversation.
""".strip()


def generate_training_prompt(
    conversation: str, summary: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
  """Assemble one supervised training example as a single string.

  Layout: an instruction header carrying ``system_prompt``, a blank line, an
  ``### Input:`` section holding the whitespace-stripped ``conversation``, and
  a ``### Response:`` section holding ``summary``. Leading/trailing whitespace
  of the assembled text is removed.

  Args:
    conversation: dialog text to summarize.
    summary: reference summary used as the target.
    system_prompt: instruction text; defaults to DEFAULT_SYSTEM_PROMPT.

  Returns:
    The complete prompt string.
  """
  sections = [
      f"###Instruction: {system_prompt}",
      "",
      "### Input:",
      f"{conversation.strip()}",
      "### Response:",
      f"{summary}",
  ]
  return "\n".join(sections).strip()


In [None]:
def clean_text(text):
  """Remove tweet noise from ``text``.

  URLs (``http...``) and ``@``-mentions are replaced by a space, runs of
  whitespace are collapsed to a single space, and finally every ``^``-prefixed
  token is deleted. Leading/trailing whitespace is left for the caller to strip.

  Args:
    text: raw utterance string.

  Returns:
    The cleaned string.
  """
  text = re.sub(r"http\S+", " ", text)
  text = re.sub(r"@[^\s]+", " ", text)
  text = re.sub(r"\s+", " ", text)
  # Drop "^XY"-style tokens (presumably agent signature initials - TODO confirm).
  return re.sub(r"\^[^ ]+", "", text)


def create_conversation_text(data_point):
  """Render every turn of a dialog as alternating ``user:`` / ``agent:`` lines.

  Args:
    data_point: mapping with a ``"log"`` list whose items carry the keys
      ``"user utterance"`` and ``"system response"``.

  Returns:
    One string with two newline-terminated lines per turn; an empty string when
    the log has no turns.
  """
  lines = []
  for item in data_point["log"]:
    user = clean_text(item["user utterance"]).strip()
    agent = clean_text(item["system response"]).strip()
    # Same "<speaker>: <text>" shape for both speakers.
    lines.append(f"user: {user}\n")
    lines.append(f"agent: {agent}\n")
  # Return only after ALL turns are rendered; returning inside the loop would
  # truncate the conversation to its first turn.
  return "".join(lines)


In [None]:
def generate_text(data_point):
  """Turn one raw dataset row into the fields used for fine-tuning.

  The row's ``"original dialog info"`` JSON is parsed, the first entry of
  ``summaries -> abstractive_summaries`` is joined with spaces into the
  reference summary, and the dialog is rendered via create_conversation_text.

  Args:
    data_point: raw dataset row (mapping).

  Returns:
    Dict with ``"conversation"``, ``"summary"`` and ``"text"`` (the full
    training prompt built by generate_training_prompt).
  """
  dialog_info = json.loads(data_point["original dialog info"])
  first_summary = dialog_info["summaries"]["abstractive_summaries"][0]
  reference = " ".join(first_summary)
  dialog = create_conversation_text(data_point)
  prompt = generate_training_prompt(dialog, reference)
  return {"conversation": dialog, "summary": reference, "text": prompt}

In [None]:
# Sanity check: build the training record for the first row of the train split.
example = generate_text(dataset["train"][0])

In [None]:
# Show the `summary` field of the example record.
print(example["summary"])

In [None]:
# Show the `text` field (the full training prompt) of the example record.
print(example["text"])

In [None]:
def process_dataset(data: Dataset):
  """Shuffle a split, add the generated prompt fields and drop the raw columns.

  Args:
    data: dataset split to process.

  Returns:
    The result of shuffling with seed 42, mapping generate_text over the rows
    and removing the raw columns listed below.
  """
  raw_columns = [
      "original dialog id",
      "new dialog id",
      "dialog index",
      "original dialog info",
      "log",
      "prompt",
  ]
  shuffled = data.shuffle(seed=42)
  with_prompts = shuffled.map(generate_text)
  return with_prompts.remove_columns(raw_columns)

In [None]:
# Replace each split with its processed version (same order as before:
# train, test, validation).
for split in ("train", "test", "validation"):
  dataset[split] = process_dataset(dataset[split])

In [None]:
# Display the dataset object after the splits were reassigned.
dataset

In [None]:
# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of hardcoding a secret in the notebook (empty string when unset).
# SECURITY: the token that used to be committed in this cell must be treated
# as leaked and revoked in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN", "")

In [None]:

def create_model_and_tokenier():
  """Load the base model in 4-bit precision together with its tokenizer.

  The model id comes from the notebook-level ``Model`` constant. The model is
  loaded with an NF4 4-bit bitsandbytes configuration (float16 compute dtype)
  and ``device_map="auto"``.

  Returns:
    Tuple ``(model, tokenizer)``.
  """
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.float16,
  )
  model = AutoModelForCausalLM.from_pretrained(
      Model,
      use_safetensors=True,
      quantization_config=bnb_config,
      # NOTE(review): trust_remote_code permits running code shipped with the
      # checkpoint; confirm it is actually required for this model.
      trust_remote_code=True,
      device_map="auto",
  )
  tokenizer = AutoTokenizer.from_pretrained(Model)
  # Reuse the EOS token for padding and pad on the right.
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.padding_side = "right"
  return model, tokenizer


# Correctly spelled alias so callers can use either spelling of the name.
create_model_and_tokenizer = create_model_and_tokenier

In [None]:
!pip install huggingface-cli

In [None]:
!huggingface-cli login --token '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'

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load the 4-bit quantized base model and its tokenizer for fine-tuning.
model, tokenizer = create_model_and_tokenier()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Turn off `use_cache` on the model config ahead of training, then display the
# quantization settings the model was loaded with.
model.config.use_cache = False
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>,
 'load_in_8bit': False,
 'load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': False,
 'bnb_4bit_compute_dtype': 'float16'}

In [None]:
# LoRA hyper-parameters, kept as notebook-level names so they are easy to tune.
lora_r = 16
lora_alpha = 32
lora_dropout = 0.05

# Adapter configuration handed to the trainer: causal-LM task, no bias training.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)




In [None]:
# Directory used as the trainer's output_dir and, later, as the location the
# fine-tuned adapter is loaded from.
Output = "experiments"


In [None]:
# Training configuration, grouped by concern.
training_arguments = TrainingArguments(
    # Output, logging and checkpointing.
    output_dir=Output,
    report_to="tensorboard",
    logging_steps=1,
    save_strategy="epoch",
    save_safetensors=True,
    # Batching: 4 samples per device x 4 accumulation steps per optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimisation schedule.
    optim="paged_adamw_32bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    num_train_epochs=2,
    fp16=True,
    # Evaluation; NOTE(review): a float eval_steps < 1 is presumably read as a
    # fraction of total training steps - confirm in the TrainingArguments docs.
    evaluation_strategy="steps",
    eval_steps=0.2,
    # Reproducibility.
    seed=42,
)

In [None]:
# Supervised fine-tuning trainer: trains on the "text" column of the train
# split, evaluates on the validation split, and applies the LoRA config.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    max_seq_length=4096,
)



In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
22,2.5678,2.478146
44,2.1272,2.067457
66,1.8303,1.914237


In [None]:
# (Re)load the TensorBoard notebook extension and open it on experiments/runs.
%reload_ext tensorboard
%tensorboard --logdir experiments/runs

In [None]:
# Persist the trained model via the trainer (no explicit path is passed here).
trainer.save_model()

In [None]:
# Display the model object held by the trainer.
trainer.model

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the fine-tuned PEFT model from the `Output` directory.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    Output,
    low_cpu_mem_usage=True,
)
# Merge the adapter of the model that was just reloaded (`trained_model`),
# not the notebook's training-time `model` variable.
merged_model = trained_model.merge_and_unload()
# Save the merged weights (safetensors) and the tokenizer side by side.
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_prompt(
    conversation: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
)-> str:
  """Build the inference prompt for one conversation.

  The text is identical to what generate_training_prompt produces up to and
  including the ``### Response:`` header, so the model is queried with exactly
  the format it was fine-tuned on; the summary itself is left for the model to
  generate.

  Args:
    conversation: dialog text to summarize (surrounding whitespace is stripped).
    system_prompt: instruction text; defaults to DEFAULT_SYSTEM_PROMPT.

  Returns:
    The prompt string ending with the ``### Response:`` header.
  """
  # The header must stay byte-identical to the training prompt's
  # "###Instruction: " header.
  return f"""###Instruction: {system_prompt}

### Input:
{conversation.strip()}
### Response:
""".strip()

In [None]:
# Build a small evaluation table from the first five rows of the test split.
# The split was already processed above: the raw "original dialog info" and
# "log" columns are gone, while the "summary" and "conversation" fields derived
# from them are stored on every row - so those stored fields are reused here.
examples = [
    {
        "summary": data_point["summary"],
        "conversation": data_point["conversation"],
        "prompt": generate_prompt(data_point["conversation"]),
    }
    for data_point in dataset["test"].select(range(5))
]
test_df = pd.DataFrame(examples)
test_df

In [None]:
# Reload the base model and tokenizer for inference. The loader is defined
# earlier in this notebook under the name `create_model_and_tokenier`.
model, tokenizer = create_model_and_tokenier()

In [None]:
def summarize(model, text: str):
  """Generate a continuation for ``text`` and return only the new part.

  Uses the notebook-level ``tokenizer`` and ``DEVICE``.

  Args:
    model: object exposing ``generate(**inputs, ...)``.
    text: full prompt to tokenize and feed to the model.

  Returns:
    The decoded tokens that follow the prompt, with special tokens skipped.
  """
  inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
  # Number of prompt tokens; used below to slice the prompt off the output.
  input_length = len(inputs["input_ids"][0])
  with torch.inference_mode():
    # NOTE(review): temperature only matters when sampling is enabled; confirm
    # against the generate() docs whether do_sample should be set as well.
    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0001)
  return tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)


# Backward-compatible alias for the original (misspelled) function name.
summerize = summarize

In [None]:
# Pick the first evaluation row and print its conversation.
example = test_df.iloc[0]
print(example.conversation)

In [None]:
# Print the reference summary stored for the selected row.
print(example.summary)

In [None]:
%%time
# Summarize the selected example with the currently loaded `model`.
summary = summarize(model, example.prompt)

In [None]:
# Pretty-print the generated summary.
pprint(summary)

In [None]:
# Pick the second evaluation row and print its conversation.
example = test_df.iloc[1]
print(example.conversation)

In [None]:
# Print the reference summary stored for the selected row.
print(example.summary)


In [None]:
# Wrap the loaded model with the PEFT weights stored under `Output`.
model = PeftModel.from_pretrained(model, Output)

In [None]:
# Go back to the first evaluation row.
example = test_df.iloc[0]
# NOTE(review): `summary` has not been regenerated yet at this point, so this
# prints the value assigned by the earlier summarize call.
pprint(summary)

In [None]:
# Print the conversation of the selected row.
print(example.conversation)

In [None]:
%%time
# Summarize the selected example with `model`, which now carries the PEFT weights.
summary = summarize(model, example.prompt)

In [None]:
# Pretty-print the newly generated summary.
pprint(summary)

In [None]:
# Keep only the first line of the stripped output and pretty-print it.
pprint(summary.strip().split("\n")[0])
