In [1]:
import const
import pandas as pd
import torch

from datasets import Dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, TrainingArguments

In [2]:
MODEL_OUTPUT_NAME = const.MODELS_FOLDER + "aletheianomous_ai-keyword_extractor-v0.2"

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

In [4]:
training_df = pd.read_csv(const.DATASETS_FOLDER + "squad_ds_keyword_train.csv")
training_df = training_df[0:1000]
training_df = training_df.sample(frac=1)
zephyr = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-alpha", quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
training_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,source,topic,paragraph,question,question_id,is_impossible,answers,expanded_answers,keyword,possible_keywords,is_searchable,text
17,17,17,Squad 2.0,Beyoncé,Following the disbandment of Destiny's Child i...,"After her second solo album, what other entert...",56be86cf3aeaaa14008c9076,False,acting,"s (2006), and starring roles in The Pink Panth...",entertainment venture did Beyonce explore,"['entertainment venture did Beyonce explore', ...",True,<|system|>\nYou are a chatbot that assists in ...
489,501,501,Squad 2.0,Beyoncé,Beyoncé has received praise for her stage pres...,Who chose her as number one on his list of Bes...,56bfc281a10cfb14005512b6,False,Jarett Wieselman,"emale artist she's seen onstage, while Alice J...",chose her as number one on his list of Best si...,['chose her as number one on his list of Best ...,False,<|system|>\nYou are a chatbot that assists in ...
188,188,188,Squad 2.0,Beyoncé,"On April 4, 2008, Beyoncé married Jay Z. She p...",Which prominent star felt the 2009 Female Vide...,56d4bf242ccc5a1400d831c2,False,Kanye West,8 in the United States. The album formally int...,Person that felt Female Video of the Year awar...,['Female Video of the Year award should have w...,True,<|system|>\nYou are a chatbot that assists in ...
843,855,855,Squad 2.0,Frédéric_Chopin,"In October 1810, six months after Fryderyk's b...",What language did Chopin's father teach?,56d1d4d3e7d4791d00902276,False,French,violin; the mother played the piano and gave ...,language did Chopin father teach,"['language did Chopin father teach', 'Chopin f...",True,<|system|>\nYou are a chatbot that assists in ...
291,291,291,Squad 2.0,Beyoncé,"On December 13, 2013, Beyoncé unexpectedly rel...",When did Beyonce get her fifth consecutive num...,56bfa5b3a10cfb14005511f7,False,"December 13, 2013",ade her the first woman in the chart's history...,Date Beyonce got her fifth consecutive number ...,['Beyonce get her fifth consecutive number one...,True,<|system|>\nYou are a chatbot that assists in ...


In [6]:
training_ds = Dataset.from_pandas(training_df)

In [7]:
len(training_ds)

1000

In [8]:
len(training_df)

1000

In [9]:
zephyr.config.use_cache = False
zephyr.config.pretraining_tp = 1
zephyr.gradient_checkpointing_enable()

In [10]:
zephyr

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm(

In [11]:
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
zephyr = prepare_model_for_kbit_training(zephyr)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)

In [12]:
zephyr = get_peft_model(zephyr, peft_config)

In [13]:
training_args = TrainingArguments(
    output_dir = const.MODELS_FOLDER + "/keyword-extractor",
    per_device_train_batch_size = 4,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm = 0.3,
    max_steps=-1,
    group_by_length=True,
    lr_scheduler_type="constant",
    num_train_epochs=3,
)

trainer = SFTTrainer(
    model=zephyr,
    train_dataset=training_ds, 
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_args,
    packing=False
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()



Step,Training Loss
25,0.7819
50,0.4227
75,0.5793
100,0.3937
125,0.5557
150,0.4003
175,0.5095
200,0.383
225,0.5588
250,0.3865




TrainOutput(global_step=750, training_loss=0.35775692431132, metrics={'train_runtime': 1579.3484, 'train_samples_per_second': 1.9, 'train_steps_per_second': 0.475, 'total_flos': 1.0220445847191552e+16, 'train_loss': 0.35775692431132, 'epoch': 3.0})

In [15]:
trainer.model.save_pretrained(MODEL_OUTPUT_NAME)
tokenizer.save_pretrained(MODEL_OUTPUT_NAME + "tokenizer")
zephyr.config.use_cache = True