# Fine-tune Llama 2 on your WhatsApp chat data
> 🗣️ Check out the [GitHub repo](https://github.com/Bissmella/FineTune_llama_on_chat_data) for more details on generating the dataset and fine-tuning Llama.

###  Table of Contents:
- Preprocess your WhatsApp chat data and create a dataset from it.
- Load, fine-tune, and evaluate Llama on the generated dataset

🛡️ To protect your privacy and the contents of your private chats, do not share the fine-tuned model publicly.

Based on Younes Belkada's [GitHub Gist](https://github.com/kw2828/guardrail-ml) and [kevin's](https://github.com/kw2828) notebook


## 1. Pre-process chat data and create dataset

In [None]:
import csv
import re

# Location of the exported chat (input) and of the JSON dataset written by a later cell.
input_file_path = '/content/CHATS.txt'
json_output_file = '/content/CHATS.json'

# A line opening with "NN/NN/NNNN" (a date) marks the first line of a new message.
date_time_pattern = r'^\d{2}/\d{2}/\d{4}'
starts_new_message = re.compile(date_time_pattern).match

# chat_data ends up holding one list of lines per message;
# current_chat is the message that is currently being assembled.
chat_data = []
current_chat = []

with open(input_file_path, 'r', encoding='utf-8') as chat_file:
    for raw_line in chat_file:
        text = raw_line.strip()

        if starts_new_message(text):
            # A dated line closes the message collected so far and opens the next one.
            if current_chat:
                chat_data.append(current_chat)
            current_chat = [text]
            continue

        # Undated lines continue the open message; lines seen before the
        # first dated line have no message to belong to and are dropped.
        if current_chat:
            current_chat.append(text)

# The final message is still open when the file ends - store it as well.
if current_chat:
    chat_data.append(current_chat)

In [None]:
# Build the fine-tuning samples from the parsed chat.
#
# Every message written by you produces one sample:
#   response    - the text of your message, with the leading header tokens
#                 and your name removed,
#   instruction - the most recent message written by someone else
#                 (stored with its full header line),
#   context     - up to the last `context_size` messages written by others,
#                 joined with newlines (the instruction is its last line).
# Only the first line of each message (row[0]) is used.
dataset = []

# Your name exactly as it appears in the chat, followed by the colon that
# separates it from the message text. Replace 'YOUR NAME' with your own name.
author_tag = 'YOUR NAME:'

# Number of whitespace-separated tokens in front of the author's name on a
# message line (presumably date, time and a dash - adjust if your export differs).
header_word_count = 3

# How many of the other participants' most recent messages are kept as context.
context_size = 5

# The name may consist of any number of words, so derive where it ends
# instead of hard-coding a two-word name.
name_end = header_word_count + len(author_tag.split())

context = []
for row in chat_data:
    first_line = row[0]
    words = first_line.split()

    if ' '.join(words[header_word_count:name_end]) == author_tag:
        # Your own message: it is the response to the latest context message.
        # If nobody else has written anything yet there is nothing to reply
        # to, so the message is skipped (context[-1] would raise IndexError).
        if context:
            dataset.append({
                'instruction': context[-1],
                'context': '\n'.join(context),
                'response': ' '.join(words[name_end:]),
            })
    else:
        # Someone else's message: remember it, keeping only the newest ones.
        context.append(first_line)
        context = context[-context_size:]

# Preview the first few samples.
for i, data in enumerate(dataset[:5], start=1):
    print(f"Example {i}:")
    print(f"Instruction: {data['instruction']}")
    print(f"Context: {data['context']}")
    print(f"Response: {data['response']}")
    print()

In [None]:
import json
# Persist the samples as indented JSON; non-ASCII characters are written as-is.
with open(json_output_file, 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(dataset, ensure_ascii=False, indent=4))

## 2. Load and fine-tune Llama 7B chat

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 guardrail-ml==0.0.12 tensorboard
!apt-get -qq install poppler-utils tesseract-ocr
!pip install -q unstructured["local-inference"]==0.7.4 pillow

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m122.9/244.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m118.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
from guardrail.client import (
    run_metrics,
    run_simple_metrics,
    create_dataset)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Settings shared by the cells below (model loading, LoRA, training, upload).

# Used for multi-gpu
# NOTE(review): not referenced by any later cell in this notebook.
local_rank = -1
per_device_train_batch_size = 4
# NOTE(review): not referenced by any later cell in this notebook.
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
weight_decay = 0.001
# LoRA hyper-parameters, consumed by LoraConfig in load_model
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
# Passed to SFTTrainer as max_seq_length
max_seq_length = None

# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"  # alternative: "guardrail/llama-2-7b-guanaco-instruct-sharded"

# Repository name the merged model and tokenizer are pushed to in the last cell
new_model = "llama-2-7b-guanaco-dolly-mini"

# NOTE(review): not referenced by any later cell; the training data is loaded
# from '/content/CHATS.json' instead.
dataset_name = "databricks/databricks-dolly-15k"

# Activate 4-bit precision base model loading
use_4bit = True

# Activate nested quantization for 4-bit base models
use_nested_quant = False

# Compute dtype for 4-bit base models (name of a torch dtype, resolved with getattr in load_model)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Number of training epochs
num_train_epochs = 20

# Enable fp16 training (with an A100, set bf16 to True instead)
fp16 = False

# Enable bf16 training
bf16 = False

# Passed to SFTTrainer as `packing`
packing = False

# Enable gradient checkpointing
# NOTE(review): not referenced by any later cell in this notebook.
gradient_checkpointing = True

# Optimizer to use, original is paged_adamw_32bit
optim = "paged_adamw_32bit"

# Learning rate schedule.
# NOTE(review): the original note claims "constant" is a bit better than cosine
# and easier to analyse, but "cosine" is what is configured here.
lr_scheduler_type = "cosine"

# Number of optimizer update steps (the original script used 10K, the demo 20).
# -1 presumably means "no step limit", leaving the run length to the epoch
# setting - confirm against the TrainingArguments documentation.
max_steps = -1

# Fraction of steps to do a warmup for
warmup_ratio = 0.03

# Group sequences into batches with same length (saves memory and speeds up training considerably)
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 10

# Log every X updates steps
logging_steps = 1

# The output directory where the model predictions and checkpoints will be written.
# A later cell overwrites this with a Google Drive path before training starts.
output_dir = "./results"

# Load the entire model on the GPU 0
device_map = {"": 0}

# Backend used to visualize training metrics
report_to = "tensorboard"

# Tensorboard logs
# NOTE(review): not referenced by any later cell in this notebook.
tb_log_dir = "./results/logs"

In [None]:
def load_model(model_name):
    """Load `model_name` with the configured quantisation, plus tokenizer and LoRA config.

    Reads the module-level settings `use_4bit`, `use_nested_quant`,
    `bnb_4bit_compute_dtype`, `bnb_4bit_quant_type`, `device_map`,
    `lora_alpha`, `lora_dropout` and `lora_r`.

    Args:
        model_name: model identifier handed to `from_pretrained` for both the
            model and the tokenizer.

    Returns:
        A `(model, tokenizer, peft_config)` tuple.
    """
    # The dtype is configured by name; resolve it to the torch dtype object.
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

    # The adapter settings do not depend on the loaded weights, so build them first.
    lora_settings = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )

    quantization = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

    # Purely informational: hint at bf16 when running 4-bit with float16
    # compute on a GPU whose major compute capability is 8 or higher.
    if use_4bit and compute_dtype == torch.float16:
        if torch.cuda.get_device_capability()[0] >= 8:
            banner = "=" * 80
            print(banner)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print(banner)

    base = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        device_map=device_map,
    )
    base.config.use_cache = False
    base.config.pretraining_tp = 1

    # The tokenizer reuses the EOS token for padding and pads on the right.
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tok.padding_side = "right"
    tok.pad_token = tok.eos_token

    return base, tok, lora_settings

In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model, tokenizer, peft_config = load_model(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
def format_dolly(sample):
    """Render one dataset sample as a single `[INST] ... [/INST]` prompt string.

    The result is "<s>[INST] Reply to message ", followed by
    "Here's some context: <context>" when the sample's context is non-empty,
    followed by " [/INST] <response>". `sample['instruction']` is not used.

    Args:
        sample: mapping with at least the keys "context" and "response".

    Returns:
        The assembled prompt string.
    """
    pieces = ["<s>[INST] Reply to message "]

    # The context part is left out entirely for samples without context.
    if len(sample["context"]) > 0:
        pieces.append(f"Here's some context: {sample['context']}")

    pieces.append(f" [/INST] {sample['response']}")
    return "".join(pieces)

# template dataset to add prompt to each sample
def template_dataset(sample):
    """Store the training text for `sample` under its "text" key and return the sample.

    The text is the prompt produced by `format_dolly` immediately followed by
    the EOS token of the module-level `tokenizer`.
    """
    prompt = format_dolly(sample)
    sample["text"] = "{}{}".format(prompt, tokenizer.eos_token)
    return sample

# Load the chat samples from the JSON file written by the preprocessing step.
dataset = load_dataset('json', data_files='/content/CHATS.json', split="train")

# Shuffle with a fixed seed so the selected subset is reproducible.
dataset_shuffled = dataset.shuffle(seed=42)

# Train on at most `max_train_samples` rows of the shuffled dataset. The bound
# is clamped to the dataset size because selecting indices past the end raises
# an error when the chat produced fewer samples. Raise the limit (or select
# the whole dataset) to train on more of your chat.
max_train_samples = 50
dataset = dataset_shuffled.select(range(min(max_train_samples, len(dataset_shuffled))))

# Apply the prompt template to every sample and drop the original columns,
# leaving only the generated "text" column.
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 50
})

In [None]:
dataset_shuffled[0]

In [None]:
dataset[49]

In [None]:
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
generator("[INST] Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire? [/INST]", max_length=1024)

[{'generated_text': '[INST] Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire? [/INST]  In the A Song of Ice and Fire series by George R. hopefully this helps! R. R. Martin, Garth Greenhand is not a character mentioned in the series. In fact, the term "High King" is not used to describe any character in the series. Instead, the rulers of the Seven Kingdoms are referred to as "Kings of the First Men."\nThe children of Garth Greenhand are not mentioned in the series either, as Garth Greenhand is a mythical figure who is said to have ruled Westeros before the First Men arrived. According to the history of Westeros presented in the series, the First Men arrived in Westeros around 10,000 years before the events of the series and established the Kingdom of the First Men. There is no record of any High King of the First Men in the series.\nIt\'s worth noting that while Garth Greenhand is not a character in the series, t

In [None]:
# Baseline check: query the loaded model with a general question before the
# training cell further down has been run.
prompt = "how long does an American football match REALLY last, if you substract all the downtime?"

# Text-generation pipeline around the already loaded model and tokenizer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# The prompt is wrapped in the same "<s>[INST] ... [/INST]" markers that
# format_dolly puts around the training text.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
# Second baseline check with another general question, again before training.
prompt = "What is the airspeed velocity of an unladen swallow?"

# Text-generation pipeline around the already loaded model and tokenizer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# Same "<s>[INST] ... [/INST]" wrapping as used for the training text.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] What is the airspeed velocity of an unladen swallow? [/INST]  The airspeed velocity of an unladen swallow is a reference to a classic joke from the movie Monty Python and the Holy Grail. everybody's favorite scene. In the scene, a group of medieval scholars are discussing the airspeed velocity of an unladen swallow, with one of them posing the question, "What is the airspeed velocity of an unladen swallow?"

The joke is that the question is completely nonsensical and impossible to answer, as swallows don't have the ability to fly without any weight or cargo. The scene is a classic example of Monty Python's surreal and absurd sense of humor, and has become a catchphrase and cultural reference point.
In short, the airspeed velocity of an unladen swallow is a non-


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create this folder in your Google Drive first; training checkpoints and the trained model are saved here.
output_dir = "/content/drive/MyDrive/AI_models/llama/finetuned"

In [None]:
# Training hyper-parameters; every value comes from the configuration cell
# (output_dir from the Google Drive cell just above).
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    # Configured above but previously never handed to the trainer, so the
    # library default was silently used instead.
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    # max_steps is -1 (no step limit), so the length of the run is decided by
    # the number of epochs. It was configured but previously not passed,
    # which made the trainer fall back to its default epoch count.
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    # Use the configured backend instead of repeating the literal.
    report_to=report_to,
)

# Supervised fine-tuning on the "text" column of the templated dataset, with
# the LoRA configuration returned by load_model.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

trainer.train()
# Save the trained model into output_dir; the merge cell below loads it from there.
trainer.model.save_pretrained(output_dir)



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.7405
2,2.7069
3,2.7323
4,2.764
5,2.649
6,2.278
7,2.4899
8,2.6004
9,2.2777
10,2.3618


In [None]:
# Silence library log messages below CRITICAL
# (`logging` here is the module imported from transformers, not the stdlib one).
logging.set_verbosity(logging.CRITICAL)

# Run the text generation pipeline with our new model.
# The prompt mirrors the training template built by format_dolly
# ("Reply to message Here's some context: <context>"); replace YOUR MESSAGE
# with the message you want a reply to.
# NOTE(review): presumably `model` now carries the trained adapter weights
# because the trainer was given this same object - confirm.
prompt = "Reply to message Here's some context: YOUR MESSAGE"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
# Reload the base model in FP16 (without the 4-bit quantization config used
# for training) and merge it with the LoRA weights.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
# output_dir is the directory the training cell saved the trained model to.
model = PeftModel.from_pretrained(base_model, output_dir)
model = model.merge_and_unload()

# Reload tokenizer to save it (same pad-token and padding-side setup as in load_model)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

In [None]:
## WARNING!  make sure to not make your model public, since it is trained on your private data.
!huggingface-cli login

model.push_to_hub(new_model, max_shard_size='2GB')
tokenizer.push_to_hub(new_model)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 

CommitInfo(commit_url='https://huggingface.co/guardrail/llama-2-7b-guanaco-dolly-mini/commit/f35edfd604b890ae613ee998c5088d492983c034', commit_message='Upload tokenizer', commit_description='', oid='f35edfd604b890ae613ee998c5088d492983c034', pr_url=None, pr_revision=None, pr_num=None)