In [1]:
%%capture
!pip install transformers==4.36.2
!pip install accelerate==0.25.0
!pip install evaluate==0.4.1
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3
!pip install trl
# !pip install tqdm==4.66.1

In [None]:
import os
from huggingface_hub import login
from huggingface_hub import notebook_login, login
import wandb
from datasets import Dataset, load_dataset

# Authenticate against the Hugging Face Hub (interactive login) before the dataset and
# model are fetched in the cells below.
notebook_login()

# Loading the dataset

In [3]:
from datasets import load_dataset

# Alternative kept for reference: read the dataset id from the DATASET_NAME environment variable.
# dataset=load_dataset(os.getenv('DATASET_NAME'))
# Load the 'kmfoda/booksum' dataset; it provides train, validation and test splits.
dataset = load_dataset('kmfoda/booksum')

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating validation split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [4]:
# Keep the experiment small: take the leading rows of each split
# (1000 for training, 200 each for validation and test).
smaller_training, smaller_validation, smaller_test = (
    dataset[split_name].select(range(row_count))
    for split_name, row_count in (('train', 1000), ('validation', 200), ('test', 200))
)

# Put the truncated splits back so every later cell works on the subsets only.
dataset['train'], dataset['validation'], dataset['test'] = (
    smaller_training,
    smaller_validation,
    smaller_test,
)

dataset

DatasetDict({
    train: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 200
    })
    test: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 200
    })
})

In [5]:
import re
import json
from datasets import Dataset, load_dataset

# Instruction sentence placed on the first line of every prompt built in this notebook.
DEFAULT_SYSTEM_PROMPT = (
    "Below is a chapter from a book and its summary. Write a summary of the chapter."
)


def generate_training_prompt(
    chapter: str, summary_text: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Assemble one training example in the instruction / input / response layout.

    Args:
        chapter: Chapter text; surrounding whitespace is stripped before insertion.
        summary_text: Reference summary placed under the response header.
        system_prompt: Instruction shown after the "### Instruction:" header.

    Returns:
        The assembled prompt with leading and trailing whitespace removed.
    """
    sections = (
        f"### Instruction: {system_prompt}",
        "",
        "### Input:",
        chapter.strip(),
        "",
        "### Response:",
        summary_text,
        "",
    )
    # The final strip drops the trailing newline and any whitespace ending the summary.
    return "\n".join(sections).strip()


def clean_text(text):
    """Remove URLs, @-mentions and ^-prefixed tokens, and collapse whitespace runs.

    The substitutions run in a fixed order: "http..." tokens, "@..." tokens, whitespace
    runs (each replaced by one space), then "^..." tokens. Because the caret tokens are
    removed after whitespace has been collapsed, their removal can leave two adjacent
    spaces. Leading and trailing spaces are not trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r"http\S+", ""),
        (r"@[^\s]+", ""),
        (r"\s+", " "),
        (r"\^[^ ]+", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def generate_text(data_point):
    """Clean one dataset row and attach its training prompt.

    Args:
        data_point: Mapping with at least the 'chapter' and 'summary_text' keys.

    Returns:
        A new dict with the cleaned 'chapter', the cleaned 'summary_text', and 'text',
        the training prompt built from those two cleaned values.
    """
    cleaned = {
        field: clean_text(data_point[field]) for field in ("chapter", "summary_text")
    }
    cleaned["text"] = generate_training_prompt(cleaned["chapter"], cleaned["summary_text"])
    return cleaned

def process_dataset(data: Dataset) -> Dataset:
    """
    Shuffle a split, build its prompts, and keep only the columns used downstream.

    The rows are shuffled with a fixed seed (42), `generate_text` is mapped over them to
    clean 'chapter' / 'summary_text' and add the 'text' prompt, and every other column
    is removed.

    Args:
        data: A split providing 'chapter' and 'summary_text' columns.

    Returns:
        A new Dataset whose only columns are 'chapter', 'summary_text', and 'text'.
    """
    kept_columns = ("chapter", "summary_text", "text")
    prepared = data.shuffle(seed=42).map(generate_text)
    # Build the drop list from the columns actually present instead of a fixed list of
    # names: a missing metadata column no longer raises, and an unexpected extra column
    # no longer survives, so the result always matches the contract stated above.
    dropped_columns = [name for name in prepared.column_names if name not in kept_columns]
    return prepared.remove_columns(dropped_columns) if dropped_columns else prepared

In [6]:
# Run the shuffle / prompt-building / column-pruning step over each split,
# replacing the split inside the DatasetDict with its processed version.
for split_name in ('train', 'validation', 'test'):
    dataset[split_name] = process_dataset(dataset[split_name])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
MODEL_NAME = "microsoft/phi-2"

def create_model_and_tokenizer():
    """Load MODEL_NAME as a 4-bit quantized causal LM together with its tokenizer.

    Returns:
        A `(model, tokenizer)` tuple. The tokenizer is configured to pad on the left
        and reuses its EOS token as the padding token.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        # The parameter is `bnb_4bit_compute_dtype`. The earlier spelling
        # (`bnb_4bit_compute_type`) is not a BitsAndBytesConfig parameter, so the
        # requested float16 compute dtype was never applied.
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_enable_fp32_cpu_offload=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map='auto',
        quantization_config=bnb_config,
        # Solving the issue: ValueError: PhiForCausalLM does not support `device_map='auto'`. To implement support, the model class needs to implement the `_no_split_modules` attribute.
        trust_remote_code=True,
        # attn_implementation="flash_attention_2",  # not supported in this environment
        torch_dtype=torch.float16,
    )

    from transformers import AutoTokenizer
    # see https://github.com/huggingface/transformers/issues/18388 for description about padding
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        padding_side='left',
        add_eos_token=True,
        add_bos_token=True,
        use_fast=False,
    )
    # No dedicated padding token is configured here, so EOS doubles as the pad token.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [8]:
# Load the 4-bit quantized model and its tokenizer.
model, tokenizer = create_model_and_tokenizer()
# Disable the generation cache for training.
# NOTE(review): presumably because caching conflicts with the gradient checkpointing enabled in a later cell — confirm.
model.config.use_cache = False

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Freezing the model's parameters

In [9]:
from peft import prepare_model_for_kbit_training

# Save memory: turn on gradient checkpointing, then run PEFT's preparation step for
# k-bit (quantized) training on the model.
# NOTE(review): per the PEFT documentation this helper freezes the base weights — confirm for peft 0.7.1.
model.gradient_checkpointing_enable()
model=prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
# Display the module tree; the Linear4bit entries show which layers were quantized.
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layern

In [10]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 32, 5% adapter dropout,
# no bias training. Adapters are attached to the attention projections
# (q_proj / k_proj / v_proj / dense) and to both MLP layers (fc1 / fc2).
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=['q_proj', 'k_proj', 'v_proj', 'dense', 'fc1', 'fc2'],
)

# Wrap the prepared model with the adapters and report how many parameters will train.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 23,592,960 || all params: 2,803,276,800 || trainable%: 0.8416207775129448


In [32]:
import os
import time
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer

# Checkpoint directory. It can be redirected with the OUTPUT_DIR environment variable so
# the notebook also runs on machines where this user-specific home directory does not
# exist; when the variable is unset the original location is used unchanged.
OUTPUT_DIR = os.getenv('OUTPUT_DIR', '/home/shantanu20118/work/exper')

training_args=TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=5, # Effective train batch per device: 2 * 5 = 10 sequences
    gradient_checkpointing=True,  # Enable gradient checkpointing
    gradient_checkpointing_kwargs={"use_reentrant": False},
    warmup_steps=50,
    max_steps=100, # Total number of training steps; a positive value takes precedence over num_train_epochs
    num_train_epochs=2, # Number of training epochs (has no effect while max_steps > 0)
    learning_rate=5e-5, # Learning rate
    weight_decay=0.01, # Weight decay
    optim="paged_adamw_8bit", # Keep the optimizer state and quantize it
#     bf16=True, # Not supported in the Kaggle environment (requires an Ampere-class GPU)
    fp16=True, # use fp16 16bit(mixed) precision training instead of 32-bit training.
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="steps",
    save_steps=100, # Kept a multiple of eval_steps, which load_best_model_at_end relies on
    save_total_limit=2, # Limit the total number of checkpoints
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True, # Load the best model at the end of training,
#     report_to="wandb",
#     run_name=os.getenv("WANDB_NAME")
)

peft_model.config.use_cache=False

# NOTE(review): no `tokenizer=` argument is passed, so the trainer does not receive the
# tokenizer configured by create_model_and_tokenizer() — confirm this is intended.
trainer=SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    dataset_text_field="text", # Column holding the full prompt + summary string
    max_seq_length=2048,
#     data_collator=data_collator
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning and measure its wall-clock duration.
start_time=time.time()
trainer.train()
end_time=time.time()
# Elapsed time in seconds (difference of two time.time() readings).
training_time=end_time-start_time
print(f"Training completed in {training_time} seconds.")

In [13]:
import pandas as pd

def generate_prompt(
    conversation: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Build an inference prompt that stops at the response header.

    The layout is the same instruction / input / response layout used for the training
    text, except that nothing follows "### Response:".

    Args:
        conversation: Input text; surrounding whitespace is stripped before insertion.
        system_prompt: Instruction shown after the "### Instruction:" header.

    Returns:
        The prompt, ending with "### Response:" (the trailing newline is stripped).
    """
    sections = (
        f"### Instruction: {system_prompt}",
        "",
        "### Input:",
        conversation.strip(),
        "",
        "### Response:",
        "",
    )
    return "\n".join(sections).strip()

# Pull the first five rows of the processed test split into a DataFrame for inspection.
test_df = pd.DataFrame(dataset['test'][:5])
test_df

Unnamed: 0,chapter,summary_text,text
0,Mrs. Dashwood was surprised only for a moment...,Mrs. Dashwood is pleasantly surprised by Edwar...,### Instruction: Below is a chapter from a boo...
1,Commencing then with the first of the above-n...,"Machiavelli's rule number 670,979,843,101: Don...",### Instruction: Below is a chapter from a boo...
2,Mrs. Jennings came immediately to their room ...,"Mrs. Jennings returns, with news of Willoughby...",### Instruction: Below is a chapter from a boo...
3,Having discoursed particularly on the charact...,Having discussed the different types of states...,### Instruction: Below is a chapter from a boo...
4,But the difficulties occur in a new principal...,New principalities always cause problems for t...,### Instruction: Below is a chapter from a boo...


# Evaluating

In [24]:
def inference(model, prompt, max_length=200):
    """Sample a completion for `prompt` and return the decoded sequences.

    Args:
        model: Causal LM exposing `generate`. When it has a `device` attribute, the
            tokenized prompt is moved there; otherwise "cuda" is used as before.
        prompt: Text to tokenize and feed to the model.
        max_length: Upper bound on the number of newly generated tokens (forwarded as
            `max_new_tokens`), not on the total sequence length.

    Returns:
        The list produced by `tokenizer.batch_decode` on the generated sequences, with
        special tokens kept.

    Note:
        Uses the notebook-level `tokenizer` for both encoding and decoding.
    """
    # Follow the model's own device instead of assuming "cuda", so the inputs end up
    # where the model lives (e.g. a non-default GPU index, or CPU).
    device = getattr(model, 'device', 'cuda')
    tokens = tokenizer(prompt, return_tensors='pt').to(device)
    res = model.generate(
        **tokens,
        max_new_tokens=max_length,
        do_sample=True,
        num_return_sequences=1,
        temperature=0.1,
        num_beams=1,
        top_p=0.95,
        # Pad with EOS during generation, matching tokenizer.pad_token = eos_token.
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.batch_decode(res, skip_special_tokens=False)

In [28]:
# First test example: the chapter to summarise and its reference summary.
first_example = test_df.iloc[0]
chapter = first_example.chapter
summary = first_example.summary_text

# NOTE(review): this prompt uses an "Instruct: / Output:" layout, whereas the training
# text built by generate_training_prompt uses "### Instruction / ### Input / ### Response"
# (and generate_prompt is never called) — confirm which layout is intended here.
prompt = f'Instruct:  Write a summary of the chapter.\n{chapter}\nOutput:\n'

peft_model_res = inference(peft_model, prompt, 200)
# The decoded text is split on the 'Output:\n' marker; the segment right after the first
# marker is treated as the model's answer.
peft_model_output = peft_model_res[0].split('Output:\n')[1]

# Cut the answer at the first '###': `prefix` is the text before it, `success` is the
# separator ('' when absent) and `result` is whatever follows.
prefix, success, result = peft_model_output.partition('###')

# 99 dashes (joining 100 empty strings with '-' yields 99 separators).
dashline = '-' * 99
print(prompt)
print(dashline)

Instruct:  Write a summary of the chapter.
 Mrs. Dashwood was surprised only for a moment at seeing him; for his coming to Barton was, in her opinion, of all things the most natural. Her joy and expression of regard long outlived her wonder. He received the kindest welcome from her; and shyness, coldness, reserve could not stand against such a reception. They had begun to fail him before he entered the house, and they were quite overcome by the captivating manners of Mrs. Dashwood. Indeed a man could not very well be in love with either of her daughters, without extending the passion to her; and Elinor had the satisfaction of seeing him soon become more like himself. His affections seemed to reanimate towards them all, and his interest in their welfare again became perceptible. He was not in spirits, however; he praised their house, admired its prospect, was attentive, and kind; but still he was not in spirits. The whole family perceived it, and Mrs. Dashwood, attributing it to some wa

In [29]:
# Text after the first '###' in the generation; empty when the model produced no '###'.
result

''

In [30]:
# Generated text up to the first '###' (the whole generation when no '###' is present).
prefix

'\nPossible output:\n\nEdward and Marianne Dashwood were sitting in the drawing-room of their house, waiting for their guests to arrive. They had invited their friends, the Ferrars, to join them for dinner and conversation. Edward was a lawyer, and Marianne was a writer. They had met at a party a few months ago, and had fallen in love. They had not told their families, and had kept their relationship a secret. They had planned to elope, but had changed their minds at the last minute. They had decided to marry in secret, and to keep their marriage a secret. They had not told their families, and had kept their marriage a secret. They had decided to marry in secret, and to keep their marriage a secret. They had decided to marry in secret, and to keep their marriage a secret. They had decided to marry in secret, and to keep their marriage a secret. They had decided to marry in secret, and to keep'

In [31]:
# Reference summary from the dataset, for comparison with the generated text.
summary

"Mrs. Dashwood is pleasantly surprised by Edward's appearance - but not too surprised, since she takes it for granted that he's in love with Elinor. Under her affectionate gaze, he can't help but become more like his previous self, and Elinor is relieved that he's back to normal. Mrs. Dashwood asks a rather sensitive question - what are Mrs. Ferrars's plans for her eldest son? Are her expectations still too high? Edward tells them that he still doesn't have any ambition, except to live moderately and happily. Elinor and Marianne have another spat, this time about how much money one requires to live well - Elinor's estimated sum is about half of what her sister requires. Marianne describes her reasons for needing two thousand pounds a year, including horses for hunting - and they match up exactly with what she and Willoughby would require at his home, Combe Magna. Margaret comes up with a great solution - someone should come along and give them all a huge fortune each. Margaret and Mrs.