In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [2]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 
# pip install bitsandbytes
# pip install bnb
# pip install wandb==0.13.3 --upgrade


In [3]:
import torch
torch.cuda.empty_cache()

In [3]:

## WORKING VERSION OF GEITJE-Chat using AutoModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'Rijgersberg/GEITje-7B-chat-v2'
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16,
                                             low_cpu_mem_usage=True, attn_implementation="eager",
                                             device_map='balanced')
tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate(conversation, temperature=0.2, top_k=50, max_new_tokens=1_000):
    tokenized = tokenizer.apply_chat_template(conversation, add_generation_prompt=True,
                                              return_tensors='pt').to(device)
    outputs = model.generate(tokenized, do_sample=True, temperature=temperature,
                             top_k=top_k, max_new_tokens=max_new_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

conversation = [
    {
        'role': 'user',
        'content': 'Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?'
    }
]
print(generate(conversation))
# <|user|>
# Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"? 
# <|assistant|>
# Het woord dat niet op zijn plaats staat is 'geit'. Een geit zou niet tussen een lijst van vervoersmiddelen moeten staan. Het past beter bij een boerderijthema of dierenlijst.



Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<|user|>
Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"? 
<|assistant|>
Geitje hoort er niet bij. De andere drie zijn voertuigen.


In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

basemodel_name = 'Rijgersberg/GEITje-7B'
# model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
#                                                 low_cpu_mem_usage=True, attn_implementation="eager",
#                                                 device_map='cpu')


model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
                                                low_cpu_mem_usage=True, attn_implementation="sdpa",
                                                device_map='cpu')

# model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
#                                                  low_cpu_mem_usage=True, attn_implementation="flash_attention_2",
#                                                  device_map='cuda')

# model = AutoModelForCausalLM.from_pretrained(basemodel_name,  load_in_8bit=True,
#                                                 low_cpu_mem_usage=True, attn_implementation="sdpa",
#                                                 device_map='cpu')




Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

def train(model, tokenizer, chat_dataset, new_model_name):

    def format(examples):
        return [tokenizer.apply_chat_template(conversation, tokenize=False)
                for conversation in examples['messages_nl']]

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    steps_per_epoch = len(chat_dataset['train_sft'])\
                 // (torch.cuda.device_count() * per_device_train_batch_size * gradient_accumulation_steps)
    eval_steps = steps_per_epoch // 5

    WANDB_NOTEBOOK_NAME='FTgeitje.ipynb'
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

   

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=3,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=False,
        output_dir=f'{cf.output_path}/geitje_finetuning_output',
        # report_to=["tensorboard", 'wandb'],
        report_to="wandb",
        logging_steps=1,
        
        logging_first_step=True,
        hub_model_id=new_model_name,
        push_to_hub=True,
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=4096,
        train_dataset=chat_dataset['train_sft'],
        eval_dataset=chat_dataset['test_sft'],
        formatting_func=format,
        neftune_noise_alpha=5
    )

    print("yeet")
    trainer.train()
    # print("yeet2")
    # trainer.push_to_hub()
    # print("yeet3")

    return trainer
    # trainer.push_to_hub()


# if __name__ == '__main__':
# basemodel_name = 'Rijgersberg/GEITje-7B'
# model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
#                                                 low_cpu_mem_usage=True, attn_implementation="eager",
#                                                 device_map='balanced')

tokenizer = AutoTokenizer.from_pretrained(basemodel_name)

# Mistral 7B is missing a padding token by default, so we need to assign
# another token to the padding job during training.
# Unfortunately we cannot use the </s> token, because we need the model to
# learn to output </s> at the end of its turn, so that we can stop generating
# when it emits it. If we were to also use it as the padding token,
# any loss computed on </s> would then be discarded, nothing would be learned
# and the model would never stop generating.
# Trust me, I learned this the hard way ;).
# Therefore, we take the least bad alternative action and assign
# the rarely used <UNK> token to the padding role.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'
model.config.pad_token_id = tokenizer.unk_token_id

no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
no_robots_nl["train_sft"]=no_robots_nl["train_sft"].select(range(2))
no_robots_nl["test_sft"]=no_robots_nl["test_sft"].select(range(2))

ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')
ultrachat_nl["train_sft"]=ultrachat_nl["train_sft"].select(range(2))
ultrachat_nl["test_sft"]=ultrachat_nl["test_sft"].select(range(2))

chat_dataset = DatasetDict({
    'train_sft': concatenate_datasets([no_robots_nl['train_sft'],
                                        ultrachat_nl['train_sft']]).shuffle(seed=42),
    'test_sft': concatenate_datasets([no_robots_nl['test_sft'],
                                        ultrachat_nl['test_sft']]).shuffle(seed=42),
})

chat_dataset = chat_dataset.filter(lambda row: all(turn['content'] != '<TRANSLATION FAILED>'
                                                    for turn in row['messages_nl']))

trained_model = train(model, tokenizer, chat_dataset,
        new_model_name='FemkeBakker/TryOutFinetuningGeitje')

KeyboardInterrupt: 

### Code works for GEITje example

In [8]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

def train(model, tokenizer, chat_dataset, new_model_name):

    def format(examples):
        return [tokenizer.apply_chat_template(conversation, tokenize=False)
                for conversation in examples['messages_nl']]

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    steps_per_epoch = len(chat_dataset['train_sft'])\
                 // (torch.cuda.device_count() * per_device_train_batch_size * gradient_accumulation_steps)
    eval_steps = steps_per_epoch // 5

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=3,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=True,
        output_dir=f'{cf.output_path}/geitje_finetuning_output',
        report_to=["tensorboard", 'wandb'],
        logging_steps=1,
        logging_first_step=True,
        hub_model_id=new_model_name,
        push_to_hub=True,
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=8192,
        train_dataset=chat_dataset['train_sft'],
        eval_dataset=chat_dataset['test_sft'],
        formatting_func=format,
        neftune_noise_alpha=5,
    )

    trainer.train()
    trainer.push_to_hub()
    return trainer


# if __name__ == '__main__':
# basemodel_name = 'Rijgersberg/GEITje-7B'
# model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
#                                                 low_cpu_mem_usage=True, attn_implementation="eager",
#                                                 device_map='balanced')

tokenizer = AutoTokenizer.from_pretrained(basemodel_name)

# Mistral 7B is missing a padding token by default, so we need to assign
# another token to the padding job during training.
# Unfortunately we cannot use the </s> token, because we need the model to
# learn to output </s> at the end of its turn, so that we can stop generating
# when it emits it. If we were to also use it as the padding token,
# any loss computed on </s> would then be discarded, nothing would be learned
# and the model would never stop generating.
# Trust me, I learned this the hard way ;).
# Therefore, we take the least bad alternative action and assign
# the rarely used <UNK> token to the padding role.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'
model.config.pad_token_id = tokenizer.unk_token_id

no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
no_robots_nl["train_sft"]=no_robots_nl["train_sft"].select(range(2))
no_robots_nl["test_sft"]=no_robots_nl["test_sft"].select(range(2))

ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')
ultrachat_nl["train_sft"]=ultrachat_nl["train_sft"].select(range(2))
ultrachat_nl["test_sft"]=ultrachat_nl["test_sft"].select(range(2))

chat_dataset = DatasetDict({
    'train_sft': concatenate_datasets([no_robots_nl['train_sft'],
                                        ultrachat_nl['train_sft']]).shuffle(seed=42),
    'test_sft': concatenate_datasets([no_robots_nl['test_sft'],
                                        ultrachat_nl['test_sft']]).shuffle(seed=42),
})

chat_dataset = chat_dataset.filter(lambda row: all(turn['content'] != '<TRANSLATION FAILED>'
                                                    for turn in row['messages_nl']))

trained_model = train(model, tokenizer, chat_dataset,
        new_model_name='FemkeBakker/TryOutFinetuningGeitje')

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfemkebakker[0m ([33mthesisamsterdam[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
1,0.373,1.369126
2,0.3711,1.075798
3,0.127,1.051754




model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

events.out.tfevents.1713267084.femke-gpu-24cores-220ram:   0%|          | 0.00/6.70k [00:00<?, ?B/s]