In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [2]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



In [1]:
import torch
torch.cuda.empty_cache()

In [3]:

## WORKING VERSION OF GEITJE-Chat using AutoModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'Rijgersberg/GEITje-7B-chat-v2'
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16,
                                             low_cpu_mem_usage=True, attn_implementation="eager",
                                             device_map='balanced')
tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate(conversation, temperature=0.2, top_k=50, max_new_tokens=1_000):
    tokenized = tokenizer.apply_chat_template(conversation, add_generation_prompt=True,
                                              return_tensors='pt').to(device)
    outputs = model.generate(tokenized, do_sample=True, temperature=temperature,
                             top_k=top_k, max_new_tokens=max_new_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

conversation = [
    {
        'role': 'user',
        'content': 'Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?'
    }
]
print(generate(conversation))
# <|user|>
# Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"? 
# <|assistant|>
# Het woord dat niet op zijn plaats staat is 'geit'. Een geit zou niet tussen een lijst van vervoersmiddelen moeten staan. Het past beter bij een boerderijthema of dierenlijst.



Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<|user|>
Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"? 
<|assistant|>
Geitje hoort er niet bij. De andere drie zijn voertuigen.


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

basemodel_name = 'Rijgersberg/GEITje-7B'
# model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
#                                                 low_cpu_mem_usage=True, attn_implementation="eager",
#                                                 device_map='cpu')


model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.half,
                                                low_cpu_mem_usage=True, attn_implementation="sdpa",
                                                device_map='cpu')



# model = AutoModelForCausalLM.from_pretrained(basemodel_name,  load_in_8bit=True,
#                                                 low_cpu_mem_usage=True, attn_implementation="sdpa",
#                                                 device_map='cpu')






Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

def train(model, tokenizer, chat_dataset, new_model_name):

    def format(examples):
        return [tokenizer.apply_chat_template(conversation, tokenize=False)
                for conversation in examples['messages_nl']]

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    steps_per_epoch = len(chat_dataset['train_sft'])\
                 // (torch.cuda.device_count() * per_device_train_batch_size * gradient_accumulation_steps)
    eval_steps = steps_per_epoch // 5

    WANDB_NOTEBOOK_NAME='FTgeitje.ipynb'
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

   

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=3,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=False,
        output_dir=f'{cf.output_path}/geitje_finetuning_output',
        # report_to=["tensorboard", 'wandb'],
        report_to="wandb",
        logging_steps=1,
        
        logging_first_step=True,
        hub_model_id=new_model_name,
        push_to_hub=True,
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=4096,
        train_dataset=chat_dataset['train_sft'],
        eval_dataset=chat_dataset['test_sft'],
        formatting_func=format,
        neftune_noise_alpha=5
    )

    print("yeet")
    trainer.train()
    # print("yeet2")
    # trainer.push_to_hub()
    # print("yeet3")

    return trainer
    # trainer.push_to_hub()


# if __name__ == '__main__':
# basemodel_name = 'Rijgersberg/GEITje-7B'
# model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
#                                                 low_cpu_mem_usage=True, attn_implementation="eager",
#                                                 device_map='balanced')

tokenizer = AutoTokenizer.from_pretrained(basemodel_name)

# Mistral 7B is missing a padding token by default, so we need to assign
# another token to the padding job during training.
# Unfortunately we cannot use the </s> token, because we need the model to
# learn to output </s> at the end of its turn, so that we can stop generating
# when it emits it. If we were to also use it as the padding token,
# any loss computed on </s> would then be discarded, nothing would be learned
# and the model would never stop generating.
# Trust me, I learned this the hard way ;).
# Therefore, we take the least bad alternative action and assign
# the rarely used <UNK> token to the padding role.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'
model.config.pad_token_id = tokenizer.unk_token_id

no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
no_robots_nl["train_sft"]=no_robots_nl["train_sft"].select(range(2))
no_robots_nl["test_sft"]=no_robots_nl["test_sft"].select(range(2))

ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')
ultrachat_nl["train_sft"]=ultrachat_nl["train_sft"].select(range(2))
ultrachat_nl["test_sft"]=ultrachat_nl["test_sft"].select(range(2))

chat_dataset = DatasetDict({
    'train_sft': concatenate_datasets([no_robots_nl['train_sft'],
                                        ultrachat_nl['train_sft']]).shuffle(seed=42),
    'test_sft': concatenate_datasets([no_robots_nl['test_sft'],
                                        ultrachat_nl['test_sft']]).shuffle(seed=42),
})

chat_dataset = chat_dataset.filter(lambda row: all(turn['content'] != '<TRANSLATION FAILED>'
                                                    for turn in row['messages_nl']))

trained_model = train(model, tokenizer, chat_dataset,
        new_model_name='FemkeBakker/TryOutFinetuningGeitje')

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


yeet


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfemkebakker[0m ([33mthesisamsterdam[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 15.57 GiB of which 3.38 MiB is free. Including non-PyTorch memory, this process has 15.57 GiB memory in use. Of the allocated memory 15.35 GiB is allocated by PyTorch, and 87.06 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

https://colab.research.google.com/drive/1U7SX7jNYsNQG5BY1xEQQHu48Pn6Vgnyt?usp=sharing#scrollTo=OifnljyvgN4u

In [6]:
pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install bnb

Collecting bnb
  Downloading bnb-0.3.0-py3-none-any.whl.metadata (490 bytes)
Collecting attrs<21.0.0,>=20.2.0 (from bnb)
  Downloading attrs-20.3.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting click<8.0.0,>=7.1.2 (from bnb)
  Downloading click-7.1.2-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting markdown<4.0.0,>=3.3.3 (from bnb)
  Downloading Markdown-3.6-py3-none-any.whl.metadata (7.0 kB)
Collecting pyyaml<6.0.0,>=5.3.1 (from bnb)
  Downloading PyYAML-5.4.1-cp39-cp39-manylinux1_x86_64.whl.metadata (2.1 kB)
Collecting questionary<2.0.0,>=1.5.2 (from bnb)
  Downloading questionary-1.10.0-py3-none-any.whl.metadata (5.7 kB)
Collecting smart_getenv<2.0.0,>=1.1.0 (from bnb)
  Downloading smart-getenv-1.1.0.tar.gz (5.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading bnb-0.3.0-py3-none-any.whl (9.7 kB)
Downloading attrs-20.3.0-py2.py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m

In [15]:
pip install wandb==0.13.3 --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting wandb==0.13.3
  Downloading wandb-0.13.3-py2.py3-none-any.whl.metadata (7.4 kB)
Collecting promise<3,>=2.0 (from wandb==0.13.3)
  Downloading promise-2.3.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting shortuuid>=0.5.0 (from wandb==0.13.3)
  Downloading shortuuid-1.0.13-py3-none-any.whl.metadata (5.8 kB)
Collecting protobuf<4.0dev,>=3.12.0 (from wandb==0.13.3)
  Downloading protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (679 bytes)
Collecting pathtools (from wandb==0.13.3)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading wandb-0.13.3-py2.py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 M

In [12]:
pip install wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl.metadata (10 kB)
Collecting Click!=8.0.0,>=7.1 (from wandb)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.45.0-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting appdirs>=1.4.3 (from wandb)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting protobuf!=4.21.0,<5,>=3.15.0 (from wandb)
  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting gitdb<5,>=4.0.1 (from GitPytho

In [16]:
no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
no_robots_nl["train_sft"]=no_robots_nl["train_sft"].select(range(2))
no_robots_nl["test_sft"]=no_robots_nl["test_sft"].select(range(2))

ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')
ultrachat_nl["train_sft"]=ultrachat_nl["train_sft"].select(range(2))
ultrachat_nl["test_sft"]=ultrachat_nl["test_sft"].select(range(2))


chat_dataset = DatasetDict({
    'train_sft': concatenate_datasets([no_robots_nl['train_sft'],
                                        ultrachat_nl['train_sft']]).shuffle(seed=42),
    'test_sft': concatenate_datasets([no_robots_nl['test_sft'],
                                        ultrachat_nl['test_sft']]).shuffle(seed=42),
})

display(chat_dataset)


DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'messages_nl'],
        num_rows: 4
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'messages_nl'],
        num_rows: 4
    })
})