In [1]:
import const
import pandas as pd
import torch

from datasets import Dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, Trainer, TrainingArguments

In [2]:
MODEL_OUTPUT_NAME = const.MODELS_FOLDER + "aletheianomous_ai-keyword_extractor-v0.3.1"

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

In [4]:
training_df = pd.read_csv(const.DATASETS_FOLDER + "squad_ds_keyword_train.csv")
training_df = training_df[0:1000]
training_df = training_df.sample(frac=1)
zephyr = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-alpha", quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
training_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,source,topic,paragraph,question,question_id,is_impossible,answers,expanded_answers,keyword,possible_keywords,is_searchable,text
947,959,959,Squad 2.0,Frédéric_Chopin,"Back in Warsaw that year, Chopin heard Niccolò...",Who did Chopin hear play violin in 1829 that p...,56d315d159d6e41400146222,False,Niccolò Paganini,"nstrument. On 11 August, three weeks after com...",Chopin hear play violin,"['Chopin hear play violin', 'Chopin hear play'...",True,<|system|>\nYou are a chatbot that assists in ...
60,60,60,Squad 2.0,Beyoncé,Beyoncé attended St. Mary's Elementary School ...,Which song did Beyonce sing to win a competiti...,56bf76ef3aeaaa14008c9667,False,Imagine,interest in music and performing continued af...,song Beyonce sang to win a competition at age,['song did Beyonce sing to win a competition a...,True,<|system|>\nYou are a chatbot that assists in ...
443,455,455,Squad 2.0,Beyoncé,Beyoncé names Michael Jackson as her major mus...,To whom did Beyonce credit as her major influe...,56beba293aeaaa14008c92ef,False,Michael Jackson,"said, ""if it wasn't for Michael Jackson, I wou...",Beyonce major influence on her music,['Beyonce credit as her major influence on her...,True,<|system|>\nYou are a chatbot that assists in ...
276,276,276,Squad 2.0,Beyoncé,"In January 2013, Destiny's Child released Love...",What is the name of Beyoncé's documentary film?,56d4c6b02ccc5a1400d83226,False,Life Is But a Dream,"bama's second inauguration in Washington, D.C....",Beyoncé documentary film,"['Beyoncé documentary film', 'Beyoncé document...",True,<|system|>\nYou are a chatbot that assists in ...
217,217,217,Squad 2.0,Beyoncé,Beyoncé announced a hiatus from her music care...,Which famous landmark did Beyonce see in China?,56be97c73aeaaa14008c912e,False,the Great Wall of China,"isit multiple European cities, the Great Wall ...",famous landmark Beyonce saw in China,"['famous landmark did Beyonce see in China', '...",True,<|system|>\nYou are a chatbot that assists in ...


In [6]:
training_ds = Dataset.from_pandas(training_df)

In [7]:
len(training_ds)

1000

In [8]:
len(training_df)

1000

In [9]:
zephyr.config.use_cache = False
zephyr.config.pretraining_tp = 1
zephyr.gradient_checkpointing_enable()

In [10]:
zephyr

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm(

In [11]:
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.unk_token
tokenizer.add_eos_token = True
zephyr = prepare_model_for_kbit_training(zephyr)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)

In [12]:
zephyr = get_peft_model(zephyr, peft_config)

In [13]:
training_args = TrainingArguments(
    output_dir = const.MODELS_FOLDER + "/keyword-extractor",
    per_device_train_batch_size = 4,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=1e-4,
    warmup_ratio=0.03,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm = 0.3,
    max_steps=-1,
    group_by_length=True,
    lr_scheduler_type="constant",
    num_train_epochs=3,
)

trainer = SFTTrainer(
    model=zephyr,
    train_dataset=training_ds, 
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_args,
    packing=False
)

#trainer = Trainer(
#    model=zephyr, 
#    train_dataset=training_ds['text'],
#    tokenizer=tokenizer,
#    args=training_args
#)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()



Step,Training Loss
25,0.9327
50,0.4566
75,0.6015
100,0.3918
125,0.5307
150,0.3851
175,0.493
200,0.36
225,0.5445
250,0.3797




In [None]:
trainer.model.save_pretrained(MODEL_OUTPUT_NAME)
tokenizer.save_pretrained(MODEL_OUTPUT_NAME + "/tokenizer")
zephyr.config.use_cache = True