In [1]:
# install the required libraries

! pip install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.30.2 trl==0.4.7 --quiet

In [2]:
# Silence Python warnings and switch off the Weights & Biases integration

import os
import warnings

os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")

### Dataset preparation

In [3]:
# Fetch the DialogSum dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/knkarthick/dialogsum

from datasets import load_dataset

dataset_name = "knkarthick/dialogsum"

ds = load_dataset(path=dataset_name)

# display the available splits
ds

Downloading and preparing dataset csv/knkarthick--dialogsum to /root/.cache/huggingface/datasets/csv/knkarthick--dialogsum-1aed23a5f481e688/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/knkarthick--dialogsum-1aed23a5f481e688/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [4]:
# Load the complete train split together with a 200-row slice of the test split

train_ds, test_ds = load_dataset(
    dataset_name,
    split=["train", "test[0:200]"],
)

  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Convert both splits to pandas DataFrames so the instruction-tuning prompts can be built

import pandas as pd

train_df, test_df = map(pd.DataFrame, (train_ds, test_ds))

In [6]:
# Preview the first five rows of the training dataframe

train_df.head(n=5)

Unnamed: 0,id,dialogue,summary,topic
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance


Write a concise summary of the following text which starts with ### Input: \n
Return your response in bullet points which covers the key points of the text.

In [7]:
# instruction finetuning data preparation function

def prepare_dataset(df,split='train'):
    """Add a ``text`` column holding the instruction-formatted prompt of each row.

    Every prompt consists of an ``### Instruction:`` section, an ``### Input:``
    section containing the row's ``dialogue`` and a trailing ``### Response :``
    header. For the training split the row's ``summary`` is appended after that
    header; for any other split the prompt ends at the header, leaving the
    response to be generated.

    Args:
        df: DataFrame with a ``dialogue`` column, plus a ``summary`` column when
            ``split == 'train'``. The frame is modified in place.
        split: ``'train'`` appends the summary to the prompt; any other value
            builds a prompt without it.

    Returns:
        The same ``df`` object, now carrying the ``text`` column.
    """
    instruction = """Write a concise summary of the below input text. """ # change instruction according to the task
    # Both prompt variants share the same header and response marker.
    header = "### Instruction: \n" + instruction + "\n### Input: \n"
    footer = "\n### Response :\n"

    if split == 'train':
        # keep the reference summary after the response marker
        text_col = [header + dialogue + footer + summary
                    for dialogue, summary in zip(df["dialogue"], df["summary"])]
    else:
        # no summary: the prompt stops right after the response marker
        text_col = [header + dialogue + footer for dialogue in df["dialogue"]]

    df.loc[:,'text'] = text_col
    return df

In [8]:
# Build the prompt column: with summaries for training, without them for testing
train_df = prepare_dataset(train_df, split='train')
test_df = prepare_dataset(test_df, split='test')

In [9]:
# Preview the training dataframe again: it now carries the new `text` column
train_df.head(n=5)

Unnamed: 0,id,dialogue,summary,topic,text
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up,### Instruction: \nWrite a concise summary of ...
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines,### Instruction: \nWrite a concise summary of ...
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys,### Instruction: \nWrite a concise summary of ...
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend,### Instruction: \nWrite a concise summary of ...
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance,### Instruction: \nWrite a concise summary of ...


In [10]:
# Print the full prompt of the first training row to check its layout
print(train_df.loc[0, 'text'])

### Instruction: 
Write a concise summary of the below input text. 
### Input: 
#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.
#Person2#: Ok, tha

In [11]:
# Preview the test dataframe: it now carries the new `text` column as well
test_df.head(n=5)

Unnamed: 0,id,dialogue,summary,topic,text
0,test_0_1,"#Person1#: Ms. Dawson, I need you to take a di...",Ms. Dawson helps #Person1# to write a memo to ...,communication method,### Instruction: \nWrite a concise summary of ...
1,test_0_2,"#Person1#: Ms. Dawson, I need you to take a di...",In order to prevent employees from wasting tim...,company policy,### Instruction: \nWrite a concise summary of ...
2,test_0_3,"#Person1#: Ms. Dawson, I need you to take a di...",Ms. Dawson takes a dictation for #Person1# abo...,dictation,### Instruction: \nWrite a concise summary of ...
3,test_1_1,#Person1#: You're finally here! What took so l...,#Person2# arrives late because of traffic jam....,public transportation,### Instruction: \nWrite a concise summary of ...
4,test_1_2,#Person1#: You're finally here! What took so l...,#Person2# decides to follow #Person1#'s sugges...,transportation,### Instruction: \nWrite a concise summary of ...


In [12]:
# Print the prompt of the first test row; it ends at the response marker, without a summary
print(test_df.loc[0, 'text'])

### Instruction: 
Write a concise summary of the below input text. 
### Input: 
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging

In [13]:
# Convert the prepared training dataframe into a Hugging Face Dataset for fine-tuning
from datasets import Dataset

dataset = Dataset.from_pandas(df=train_df)

In [14]:
# Display the training Dataset (features and row count) built from train_df
dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic', 'text'],
    num_rows: 12460
})

### Loading the sharded Llama-2 model in Quantized format

In [15]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Sharded Llama-2 7B checkpoint on the Hugging Face Hub
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

# Quantization config: 4-bit NF4 weights with float16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# Load the model with the quantization config; device_map='auto' lets the
# library decide where the weights are placed.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to run locally - confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map='auto'
)
# Turn off the key/value cache on the model config before training
model.config.use_cache = False

Downloading config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00014.bin:   0%|          | 0.00/981M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00014.bin:   0%|          | 0.00/847M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [16]:
# Build the tokenizer that belongs to the Llama-2 checkpoint

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    return_token_type_ids=False,
    trust_remote_code=True,
)
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

Downloading tokenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

### QLoRA Configuration

In [17]:
from peft import LoraConfig, get_peft_model

lora_r = 8 # rank
lora_alpha = 16
lora_dropout = 0.05

# LoRA configuration for parameter-efficient fine-tuning

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    # adapters are created only for the q and v projections of the attention module
    target_modules=["q_proj", "v_proj"],
)

### Finetuning Process

In [18]:
# Hyperparameters of the fine-tuning run

from transformers import TrainingArguments, Trainer

# The run length is controlled by max_steps alone: a positive max_steps overrides
# num_train_epochs, so the previous num_train_epochs=3 never took effect
# (the recorded run stops at step 100, epoch 0.13).
# NOTE(review): the recorded training loss spikes (up to ~493) and ends at 0.0,
# which points to numerical instability under fp16. Candidate mitigations to
# evaluate: a warmup phase, tighter gradient clipping, a lower learning rate or
# bf16 on supporting hardware - TODO confirm which one resolves it.
training_arguments = TrainingArguments(
        output_dir="llama2_qlora_finetuned",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,  # 4 x 4 = 16 samples per optimizer step and device
        optim="paged_adamw_8bit",
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        save_strategy="epoch",
        logging_steps=10,
        max_steps=100,
        fp16=True,
        push_to_hub=False,
        report_to="none"
    )

In [19]:
# Create the supervised fine-tuning trainer from the training arguments

from trl import SFTTrainer

trainer = SFTTrainer(
        model=model,
        args=training_arguments,      # training arguments defined above
        train_dataset=dataset,
        dataset_text_field="text",    # column that holds the full prompt
        max_seq_length=512,
        packing=False,
        tokenizer=tokenizer,
        peft_config=peft_config,      # LoRA configuration
    )

  0%|          | 0/13 [00:00<?, ?ba/s]

In [20]:
# Cast every module whose qualified name contains "norm" to float32 for more stable training

for layer_name, layer in trainer.model.named_modules():
    if "norm" not in layer_name:
        continue
    # nn.Module.to converts the module in place, so the result need not be kept
    layer.to(torch.float32)

In [21]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result

trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,45.2657
20,6.2113
30,6.2605
40,37.2808
50,13.2073
60,6.2332
70,6.1017
80,7.6953
90,493.3201
100,0.0


TrainOutput(global_step=100, training_loss=62.157594718933105, metrics={'train_runtime': 3921.3378, 'train_samples_per_second': 0.408, 'train_steps_per_second': 0.026, 'total_flos': 1.2988137631481856e+16, 'train_loss': 62.157594718933105, 'epoch': 0.13})

### Save the LoRA adapters (they can also be pushed to the Hugging Face model hub for future inference)

In [22]:
# Unwrap the model when it sits inside a distributed/parallel wrapper, then store it in "outputs"
model_to_save = getattr(trainer.model, 'module', trainer.model)
model_to_save.save_pretrained("outputs")

In [23]:
# Re-attach the trained LoRA adapters saved in "outputs" to the Llama-2 model.
# PeftModel.from_pretrained reads the adapter weights together with their
# configuration. LoraConfig.from_pretrained followed by get_peft_model only
# re-creates freshly initialised adapters from the configuration and never
# loads the trained weights.
# NOTE(review): in a fresh session, load the quantized base model first.

from peft import PeftModel

lora_config = LoraConfig.from_pretrained('outputs')  # kept so the adapter settings stay inspectable
model = PeftModel.from_pretrained(model, 'outputs')
model.eval()  # inference only: switches off dropout, including the LoRA dropout

### Inference using Llama2 + QLoRA adapters

In [24]:
# Pick the prompt of the first test row as the inference input and show it
text = test_df.loc[0, 'text']
print(text)

### Instruction: 
Write a concise summary of the below input text. 
### Input: 
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging

### <b>Update:</b>

Added `repetition_penalty=1.2` to keep the model from repeating the input text in its output.

In [25]:
# Tokenize the prompt and move the tensors to the model's device, so generation
# does not receive CPU inputs while the model is placed via device_map.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=100 ,repetition_penalty=1.2)
# The decoded sequence contains the prompt followed by the generated continuation
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

### Instruction: 
Write a concise summary of the below input text. 
### Input: 
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging