In [2]:
import torch
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer

In [11]:
from huggingface_hub import create_repo

# Create the Hub repository that will receive the fine-tuned model.
# exist_ok=True makes this cell idempotent: re-running the notebook after the
# repo has been created returns the existing RepoUrl instead of raising.
create_repo("clementdevarieux/PA4A", exist_ok=True)

RepoUrl('https://huggingface.co/clementdevarieux/PA4A', endpoint='https://huggingface.co', repo_type='model', repo_id='clementdevarieux/PA4A')

In [2]:
# Target device name and the base checkpoint that is fine-tuned below.
device = "cuda"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Fast tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# Reuse the unknown token as the padding token; both the token string and its
# id are assigned explicitly.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

In [3]:
# Dtype used for computation on top of the 4-bit quantized weights.
compute_dtype = torch.float16
print(compute_dtype)

# bitsandbytes settings: 4-bit NF4 weights with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

torch.float16


In [4]:
# Load the base model with the 4-bit quantization config defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # `use_flash_attention_2` is deprecated; the attention backend is selected
    # with `attn_implementation`. "sdpa" matches the MistralSdpaAttention
    # modules shown when the model is printed. Use "flash_attention_2" instead
    # on GPUs that support it (e.g. A100) with flash-attn installed.
    attn_implementation="sdpa",
    # Place the whole model on GPU 0; the original author notes that
    # device_map="auto" causes a problem during training.
    device_map={"": 0},
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Print the module tree of the loaded model to see which layers are quantized
# (Linear4bit) and which projection names are available as LoRA targets.
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [6]:
# LoRA adapter settings for causal language modelling: rank 16, alpha 16,
# 5% dropout, no bias training. Adapters are attached to the attention
# projections, the MLP projections and the output head.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "k_proj",
        "q_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "down_proj",
        "up_proj",
        "lm_head",
    ],
)

In [7]:
# Prepare the quantized model for k-bit training (casts some modules to fp32).
model = prepare_model_for_kbit_training(model)

# The KV cache is disabled because gradient checkpointing (used by default)
# is not compatible with caching.
model.config.use_cache = False

# Make the model config agree with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

In [8]:
# Hyperparameters for the QLoRA fine-tuning run.
training_arguments = TrainingArguments(
    output_dir="./results",  # directory in which checkpoints are saved
    evaluation_strategy="epoch",  # set to "steps" to evaluate every eval_steps instead
    optim="paged_adamw_8bit",  # paged 8-bit AdamW, used with QLoRA
    per_device_train_batch_size=4,  # training batch size per device
    per_device_eval_batch_size=4,  # evaluation batch size per device
    # Number of batches whose gradients are accumulated before one optimizer
    # step. Careful: this changes the size of a "step", so logging, evaluation
    # and saving intervals are counted in optimizer steps, each covering
    # gradient_accumulation_steps * per_device_train_batch_size examples per device.
    gradient_accumulation_steps=1,
    log_level="debug",  # alternatives: "info", "warning", "error", "critical"
    save_steps=500,  # number of steps between checkpoints
    logging_steps=20,  # number of steps between loss logs; adapt to the dataset size
    learning_rate=4e-4,  # worth trying other values for this hyperparameter
    num_train_epochs=1,
    warmup_steps=100,
    # The plain "constant" schedule ignores warmup_steps, so the warmup
    # requested above never happened. "constant_with_warmup" ramps the learning
    # rate up over warmup_steps and then holds it constant.
    lr_scheduler_type="constant_with_warmup",
)

In [3]:
# Load the booksum-short dataset from the Hugging Face Hub. The resulting
# DatasetDict (printed in the next cell) has train / validation / test splits.
dataset = load_dataset("pszemraj/booksum-short")

In [10]:
# Show the available splits, their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 5912
    })
    validation: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 1012
    })
    test: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 988
    })
})


In [6]:
import pandas as pd

# Pull the chapter text and the plain-text summary out of the training split.
train_data = dataset["train"]
chapters, summaries = train_data["chapter"], train_data["summary_text"]

# Two-column frame (chapter, summary) for quick inspection in later cells.
df = pd.DataFrame(dict(chapter=chapters, summary=summaries))

In [7]:
# Preview the first rows of the chapter/summary frame.
df.head()

Unnamed: 0,chapter,summary
0,"\n ""Before these fields were shorn and tilled...",In another part of the forest by the river a f...
1,"\n ""Well, go thy way: thou shalt not from thi...",When the mounted party from Fort Howard approa...
2,"\n ""In such a night\n Di...","The pursuit of Magua is unsuccessful, but Hawk..."
3,"\n ""Those strains that once did sweet in Zion...",Heyward and the girls are uneasy and Gamut is ...
4,"\n ""Be gay securely...","In the stillness that follows, Heyward finds i..."


In [8]:
# Display the raw chapter text of the first training example.
train_data['chapter'][0]

'\n  "Before these fields were shorn and tilled,\n    Full to the brim our rivers flowed;\n  The melody of waters filled\n    The fresh and boundless wood;\n  And torrents dashed, and rivulets played,\n    And fountains spouted in the shade."\n\n  BRYANT.\n\n\nLeaving the unsuspecting Heyward and his confiding companions to\npenetrate still deeper into a forest that contained such treacherous\ninmates, we must use an author\'s privilege, and shift the scene a few\nmiles to the westward of the place where we have last seen them.\n\nOn that day, two men were lingering on the banks of a small but rapid\nstream, within an hour\'s journey of the encampment of Webb, like those\nwho awaited the appearance of an absent person, or the approach of some\nexpected event. The vast canopy of woods spread itself to the margin of\nthe river overhanging the water, and shadowing its dark current with a\ndeeper hue. The rays of the sun were beginning to grow less fierce, and\nthe intense heat of the day 

In [10]:
# (rows, columns) of the chapter/summary frame.
df.shape

(5912, 2)

In [9]:
# Display the plain-text summary of the first training example.
train_data['summary_text'][0]

'In another part of the forest by the river a few miles to the west, Hawkeye and Chingachgook appear to be waiting for someone as they talk with low voices. It is now afternoon. The Indian and the scout are attired according to their forest habits: Chingachgook with his semi-nude, war-painted body and scalping tuft of hair, his tomahawk, scalping knife, and short rifle; Hawkeye with his hunting shirt, skin cap, buckskin leggings, knife, pouch and horn, and long rifle. They discuss their respective forefathers, and Chingachgook relates the slow demise of his tribe of Mohicans so that only he and his son Uncas now remain. At the mention of his name, Uncas, a youthful warrior dressed much like Hawkeye, appears and says that he has been on the trail of the Maquas, another name for the Mengwe or Iroquois, their natural enemies. The antlers of a deer are seen in the distance, and Hawkeye is about to shoot the animal for food when the warrior warns him that a shot will warn the enemy. Just as

In [11]:
# Supervised fine-tuning trainer: wraps the quantized model with the LoRA
# adapters from peft_config and trains on the text column named below.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    # Evaluate on the validation split during training so the test split
    # stays held out for the final, unbiased evaluation.
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    # NOTE(review): the exploration cells inspect the "summary_text" column as
    # the plain-text summary, while training reads "summary" — confirm which
    # column (or a formatted chapter + summary prompt) is actually intended.
    dataset_text_field="summary",
    # Optional knobs left disabled by the original author:
    # packing=True,
    # max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
# Launch the training.
# NOTE(review): the saved output of this cell ends with
# "RuntimeError: CUDA error: unknown error"; as that message suggests, re-run
# with CUDA_LAUNCH_BLOCKING=1 to locate the failing call.
trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 5,912
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1,478
  Number of trainable parameters = 42,520,576
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
