In [1]:
import os

import pandas as pd
import torch

from datasets import Dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, Trainer, TrainingArguments

import const

In [2]:
# Directory the fine-tuned model (and, in a sub-folder, its tokenizer) is saved to
# at the end of the notebook. os.path.join yields exactly one separator whether or
# not const.MODELS_FOLDER already ends with one.
MODEL_OUTPUT_NAME = os.path.join(const.MODELS_FOLDER, "aletheianomous_ai-keyword_extractor-v0.3.2")

In [3]:
# Quantisation settings passed as `quantization_config` when the base model is
# loaded: 4-bit weights, "nf4" quant type, bfloat16 compute dtype, and double
# quantisation switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [4]:
# Load the keyword-extraction training CSV, keep its first 1000 rows, and shuffle
# them. The shuffle is seeded so that Restart & Run All reproduces the same row
# order (and therefore the same training batches) every time.
SHUFFLE_SEED = 42
training_df = (
    pd.read_csv(os.path.join(const.DATASETS_FOLDER, "squad_ds_keyword_train.csv"))
    .iloc[:1000]  # positional slice: the subset is always the head of the file
    .sample(frac=1, random_state=SHUFFLE_SEED)  # frac=1 -> full permutation
)
# Load the base checkpoint with the quantisation config defined in `bnb_config`,
# letting `device_map="auto"` decide placement, plus the matching tokenizer.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to
# run locally — confirm it is actually required for this model.
zephyr = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-alpha",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(
    "HuggingFaceH4/zephyr-7b-alpha",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
# Preview the first rows of the shuffled training frame.
training_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,source,topic,paragraph,question,question_id,is_impossible,answers,expanded_answers,keyword,possible_keywords,is_searchable,text
384,396,396,Squad 2.0,Beyoncé,Forbes magazine began reporting on Beyoncé's e...,Beyonce beat out which musical artists for mos...,56beb4023aeaaa14008c9252,False,Madonna and Celine Dion,on. They placed her fourth on the Celebrity 10...,musical artists Beyonce beat for most paid bet...,['Beyonce beat out which musical artists for m...,True,<|system|>\nYou are a chatbot that assists in ...
181,181,181,Squad 2.0,Beyoncé,"On April 4, 2008, Beyoncé married Jay Z. She p...",When did Beyonce begin her second world tour?,56bf97aba10cfb140055119f,False,March 2009,8 in the United States. The album formally int...,Year Beyonce began her second world tour,"['Beyonce begin her second world tour', 'Beyon...",True,<|system|>\nYou are a chatbot that assists in ...
582,594,594,Squad 2.0,Beyoncé,Beyoncé's work has influenced numerous artists...,Which Indie band said Beyoncé was an inspirati...,56d4e9d12ccc5a1400d8333a,False,White Rabbits,erican indie rock band White Rabbits also cite...,Indie band said Beyoncé was an inspiration for...,['Indie band said Beyoncé was an inspiration f...,True,<|system|>\nYou are a chatbot that assists in ...
600,612,612,Squad 2.0,Beyoncé,Beyoncé has received numerous awards. As a sol...,"Totaling worldwide, how many records as Beyonc...",56becb8d3aeaaa14008c9496,False,118 million,rding Industry Association of America (RIAA) l...,records as Beyonce sold,"['records as Beyonce sold', 'Beyonce sold', 'T...",True,<|system|>\nYou are a chatbot that assists in ...
975,987,987,Squad 2.0,Frédéric_Chopin,Chopin arrived in Paris in late September 1831...,What language was Chopin never completely at e...,56d318f359d6e41400146238,False,French,"on a French passport. However, Chopin remaine...",language was Chopin never completely at ease s...,['language was Chopin never completely at ease...,True,<|system|>\nYou are a chatbot that assists in ...


In [6]:
# Wrap the training frame in a Hugging Face Dataset for the trainer.
# preserve_index=False: after shuffling, the frame's index is no longer a plain
# range, and it would otherwise be carried into the dataset as an extra
# `__index_level_0__` column.
training_ds = Dataset.from_pandas(training_df, preserve_index=False)

In [7]:
# Row count of the Dataset built from training_df.
len(training_ds)

1000

In [8]:
# Row count of the source DataFrame, for comparison with the Dataset length.
len(training_df)

1000

In [9]:
# Pre-training adjustments on the loaded model.
# use_cache is switched off for training; the final cell turns it back on after saving.
zephyr.config.use_cache = False
# NOTE(review): pretraining_tp presumably controls tensor-parallel emulation in the
# forward pass — confirm this attribute is read by this model class at all.
zephyr.config.pretraining_tp = 1
# Turn on gradient checkpointing on the model.
zephyr.gradient_checkpointing_enable()

In [10]:
# Display the module tree of the loaded model.
zephyr

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm(

In [11]:
# Tokenizer setup for fine-tuning: reuse the unknown token as the padding token,
# pad on the right, and set the add_eos_token flag.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True

# Prepare the quantised model for k-bit training.
zephyr = prepare_model_for_kbit_training(zephyr)

# LoRA adapter settings: rank 64, alpha 16, 10% dropout, no bias terms, applied
# to the four attention projections and the MLP gate projection.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
    ],
)

In [12]:
# Wrap the prepared model with the LoRA adapters described by `peft_config`.
# The name `zephyr` is rebound to the wrapped model, so run this cell only once
# per kernel session.
zephyr = get_peft_model(model=zephyr, peft_config=peft_config)

In [13]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    # os.path.join yields exactly one separator whether or not
    # const.MODELS_FOLDER already ends with one.
    output_dir=os.path.join(const.MODELS_FOLDER, "keyword-extractor"),
    per_device_train_batch_size=4,
    optim="paged_adamw_32bit",
    # Checkpoint and log every 25 optimisation steps.
    save_steps=25,
    logging_steps=25,
    learning_rate=1e-4,
    # NOTE(review): with lr_scheduler_type="constant" the warmup_ratio below is
    # believed to have no effect (only "constant_with_warmup" applies warmup) —
    # confirm against the transformers scheduler docs and pick one intent.
    warmup_ratio=0.03,
    weight_decay=0.001,
    # Mixed-precision training is disabled in both flavours.
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # -1 leaves the run length to num_train_epochs.
    max_steps=-1,
    group_by_length=True,
    lr_scheduler_type="constant",
    num_train_epochs=3,
)

# Supervised fine-tuning trainer: reads the prompt from the "text" column of the
# training dataset, caps sequences at 2048 tokens, and does not pack examples.
trainer = SFTTrainer(
    model=zephyr,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=training_ds,
    dataset_text_field="text",
    max_seq_length=2048,
    packing=False,
)

#trainer = Trainer(
#    model=zephyr, 
#    train_dataset=training_ds['text'],
#    tokenizer=tokenizer,
#    args=training_args
#)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
# Run the fine-tuning loop configured in `training_args` (checkpoints and loss
# logs are emitted every 25 steps).
trainer.train()



Step,Training Loss
25,0.9114
50,0.4217
75,0.5659




KeyboardInterrupt: 

In [None]:
# Persist the trained model, then the tokenizer in a "tokenizer" sub-folder of
# the same output directory.
trainer.save_model(MODEL_OUTPUT_NAME)
tokenizer.save_pretrained(f"{MODEL_OUTPUT_NAME}/tokenizer")

# Training is finished: turn use_cache back on (it was switched off before training).
zephyr.config.use_cache = True