In [1]:
import const
import pandas as pd
import torch

from datasets import Dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, TrainingArguments

In [2]:
MODEL_OUTPUT_NAME = const.MODELS_FOLDER + "aletheianomous_ai-keyword_extractor-v0.1"

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

In [4]:
training_df = pd.read_csv(const.DATASETS_FOLDER + "squad_ds_keyword_train.csv")
training_df = training_df[0:1000]
training_df = training_df.sample(frac=1)
zephyr = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-alpha", quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
training_df.head()

Unnamed: 0.1,Unnamed: 0,source,topic,paragraph,question,question_id,is_impossible,answers,expanded_answers,keyword,possible_keywords,is_searchable,text
313,313,Squad 2.0,Beyoncé,At the 57th Annual Grammy Awards in February 2...,Beyoncé lost the Album of the Year award to wh...,56d4ceac2ccc5a1400d83242,False,Beck,rd was won by Beck for his Morning Phase album...,Entertainer that Beyoncé lost the Album of the...,['Beyoncé lost the Album of the Year award to ...,True,<|system|>\nYou are a chatbot that assists in ...
682,682,Squad 2.0,Beyoncé,"In October 2014, it was announced that Beyoncé...",Where is Topshop located?,56d4f5d32ccc5a1400d83382,False,London,for Topshop to break into the activewear marke...,Topshop located,"['Topshop located', 'Topshop', 'located']",True,<|system|>\nYou are a chatbot that assists in ...
77,77,Squad 2.0,Beyoncé,"At age eight, Beyoncé and childhood friend Kel...","Who signed the girl group on October 5, 1995?",56d45abf2ccc5a1400d830e9,False,Dwayne Wiggins's Grass Roots Entertainment,"er seeing the group, R&B producer Arne Frager ...",signed the girl group on October,"['signed the girl group on October', 'girl gro...",False,<|system|>\nYou are a chatbot that assists in ...
781,781,Squad 2.0,Frédéric_Chopin,At the age of 21 he settled in Paris. Thereaft...,What year did he gain citizenship in France?,56ce0d8662d2951400fa69eb,False,1835,in high demand. Chopin formed a friendship wi...,year did he gain citizenship in France,"['year did he gain citizenship in France', 'ga...",False,<|system|>\nYou are a chatbot that assists in ...
938,938,Squad 2.0,Frédéric_Chopin,"In September 1828 Chopin, while still a studen...",Who was Frédéric a guest of during his stay in...,56cbeb396d243a140015edeb,False,Prince Antoni Radziwiłł,"to Berlin, he was a guest of Prince Antoni Ra...",Frédéric a guest of during his stay in Berlin,['Frédéric a guest of during his stay in Berli...,True,<|system|>\nYou are a chatbot that assists in ...


In [6]:
training_ds = Dataset.from_pandas(training_df)

In [7]:
len(training_ds)

1000

In [8]:
len(training_df)

1000

In [9]:
zephyr.config.use_cache = False
zephyr.config.pretraining_tp = 1
zephyr.gradient_checkpointing_enable()

In [10]:
zephyr

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm(

In [11]:
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
zephyr = prepare_model_for_kbit_training(zephyr)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)

In [12]:
zephyr = get_peft_model(zephyr, peft_config)

In [13]:
training_args = TrainingArguments(
    output_dir = const.MODELS_FOLDER + "/keyword-extractor",
    per_device_train_batch_size = 4,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm = 0.3,
    max_steps=-1,
    group_by_length=True,
    lr_scheduler_type="constant",
    num_train_epochs=3,
)

trainer = SFTTrainer(
    model=zephyr,
    train_dataset=training_ds, 
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_args,
    packing=False
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()



Step,Training Loss
25,0.7754
50,0.4392
75,0.5753
100,0.4007
125,0.5555
150,0.3957
175,0.5481
200,0.3724
225,0.5351
250,0.3726




TrainOutput(global_step=750, training_loss=0.3590689779917399, metrics={'train_runtime': 1570.1231, 'train_samples_per_second': 1.911, 'train_steps_per_second': 0.478, 'total_flos': 1.0075234681946112e+16, 'train_loss': 0.3590689779917399, 'epoch': 3.0})

In [15]:
trainer.model.save_pretrained(MODEL_OUTPUT_NAME)
zephyr.config.use_cache = True