In [2]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [3]:
pip install protobuf  

Collecting protobuf
  Using cached protobuf-5.26.1-cp37-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Using cached protobuf-5.26.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
Installing collected packages: protobuf
Successfully installed protobuf-5.26.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 


In [6]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer


def train(model, tokenizer, chat_dataset, new_model_name):

    def format(examples):
        return [tokenizer.apply_chat_template(conversation, tokenize=False)
                for conversation in examples['messages_nl']]

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    steps_per_epoch = len(chat_dataset['train_sft'])\
                 // (torch.cuda.device_count() * per_device_train_batch_size * gradient_accumulation_steps)
    eval_steps = steps_per_epoch // 5

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=3,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=True,
        output_dir='/tmp/geitje/output',
        report_to=["tensorboard", 'wandb'],
        logging_steps=1,
        logging_first_step=True,
        hub_model_id=new_model_name,
        push_to_hub=True,
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=8192,
        train_dataset=chat_dataset['train_sft'],
        eval_dataset=chat_dataset['test_sft'],
        formatting_func=format,
        neftune_noise_alpha=5,
    )

    trainer.train()
    trainer.push_to_hub()


if __name__ == '__main__':
    basemodel_name = 'Rijgersberg/GEITje-7B'
    model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
                                                 low_cpu_mem_usage=True, attn_implementation="eager",
                                                 device_map='auto')
    tokenizer = AutoTokenizer.from_pretrained(basemodel_name)

    # Mistral 7B is missing a padding token by default, so we need to assign
    # another token to the padding job during training.
    # Unfortunately we cannot use the </s> token, because we need the model to
    # learn to output </s> at the end of its turn, so that we can stop generating
    # when it emits it. If we were to also use it as the padding token,
    # any loss computed on </s> would then be discarded, nothing would be learned
    # and the model would never stop generating.
    # Trust me, I learned this the hard way ;).
    # Therefore, we take the least bad alternative action and assign
    # the rarely used <UNK> token to the padding role.
    # tokenizer.pad_token = tokenizer.unk_token
    # model.config.pad_token_id = tokenizer.unk_token_id

    # no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
    # ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')

    # chat_dataset = DatasetDict({
    #     'train_sft': concatenate_datasets([no_robots_nl['train_sft'],
    #                                        ultrachat_nl['train_sft']]).shuffle(seed=42),
    #     'test_sft': concatenate_datasets([no_robots_nl['test_sft'],
    #                                       ultrachat_nl['test_sft']]).shuffle(seed=42),
    # })

    # chat_dataset = chat_dataset.filter(lambda row: all(turn['content'] != '<TRANSLATION FAILED>'
    #                                                    for turn in row['messages_nl']))

    # train(model, tokenizer, chat_dataset,
    #       new_model_name='Rijgersberg/GEITje-7B-chat')


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [4]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

basemodel_name = 'Rijgersberg/GEITje-7B'
model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
                                                low_cpu_mem_usage=True, attn_implementation="eager",
                                                device_map='cpu',offload_buffers=True)
print("end model")
tokenizer = AutoTokenizer.from_pretrained(basemodel_name)
print('end tokenizer')
tokenizer.pad_token = tokenizer.unk_token
model.config.pad_token_id = tokenizer.unk_token_id


device = 'cuda' if torch.cuda.is_available() else 'cpu'

def generate_response(conversation, temperature=0.2, top_k=50, max_new_tokens=1_000):
    tokenized = tokenizer.apply_chat_template(conversation, add_generation_prompt=True,
                                              return_tensors='pt').to('cpu')
    outputs = model.generate(tokenized, do_sample=True, temperature=temperature,
                             top_k=top_k, max_new_tokens=max_new_tokens)
    print(outputs)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

conversation = [
    {
        'role': 'user',
        'content': 'Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?'
    }
]
print(generate_response(conversation))

config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

end model


tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


end tokenizer


KeyboardInterrupt: 

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'Rijgersberg/GEITje-7B-chat-v2'
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16,
                                             low_cpu_mem_usage=True, attn_implementation="eager",
                                             device_map=device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate(conversation, temperature=0.2, top_k=50, max_new_tokens=1_000):
    tokenized = tokenizer.apply_chat_template(conversation, add_generation_prompt=True,
                                              return_tensors='pt').to(device)
    outputs = model.generate(tokenized, do_sample=True, temperature=temperature,
                             top_k=top_k, max_new_tokens=max_new_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

conversation = [
    {
        'role': 'user',
        'content': 'Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?'
    }
]
print(generate(conversation))
# <|user|>
# Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"? 
# <|assistant|>
# Het woord dat niet op zijn plaats staat is 'geit'. Een geit zou niet tussen een lijst van vervoersmiddelen moeten staan. Het past beter bij een boerderijthema of dierenlijst.

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 15.57 GiB of which 27.44 MiB is free. Process 22277 has 13.81 GiB memory in use. Including non-PyTorch memory, this process has 1.73 GiB memory in use. Of the allocated memory 1.58 GiB is allocated by PyTorch, and 25.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def generate_response(conversation, temperature=0.2, top_k=50, max_new_tokens=250):
    tokenized = tokenizer.apply_chat_template(conversation, add_generation_prompt=True,
                                              return_tensors='pt').to(device)
    print(tokenized)
    outputs = model.generate(tokenized, do_sample=True, temperature=temperature,
                             top_k=top_k, max_new_tokens=max_new_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

conversation = [
    {
        'role': 'user',
        'content': 'Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?'
    }
]
print(generate_response(conversation))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


tensor([[  523, 28766,  1838, 28766, 28767,    13, 28780,   301, 28729, 14277,
           556,  7463,   419,  1234, 14654,   297,  8052,  3191, 28768, 28707,
          2099,   306, 14867, 28747,   345,  6415, 28725,   363,  5715,  3606,
         28718,   326, 28725,  2970,   279,  2099, 28725,  1579, 27257,     2,
         28705,    13, 28789, 28766,   489, 11143, 28766, 28767,    13]],
       device='cuda:0')


KeyboardInterrupt: 

In [None]:
pip install protobuf  

Collecting protobuf
  Downloading protobuf-5.26.1-cp37-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading protobuf-5.26.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.8/302.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
Successfully installed protobuf-5.26.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install flash-attn --no-build-isolation


Collecting flash-attn
  Using cached flash_attn-2.5.7.tar.gz (2.5 MB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[13 lines of output][0m
  [31m   [0m fatal: not a git repository (or any parent up to mount point /)
  [31m   [0m Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/tmp/pip-install-0w7wgfd1/flash-attn_df563f3f7cd745e98d322f82a4b5cc97/setup.py", line 115, in <module>
  [31m   [0m     raise RuntimeError(
  [31m   [0m RuntimeError: FlashAttention is only supported on CUDA 11.6 and above.  Note: make sure nvcc has a supported version by running nvcc -V.
  [31m

In [None]:
pip install cuda

[31mERROR: Could not find a version that satisfies the requirement cuda (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cuda[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


In [6]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer


def train(model, tokenizer, chat_dataset, new_model_name):

    def format(examples):
        return [tokenizer.apply_chat_template(conversation, tokenize=False)
                for conversation in examples['messages_nl']]

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    steps_per_epoch = len(chat_dataset['train_sft'])\
                 // (torch.cuda.device_count() * per_device_train_batch_size * gradient_accumulation_steps)
    eval_steps = steps_per_epoch // 5

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=3,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=True,
        tf32=True,
        output_dir='/tmp/geitje/output',
        report_to=["tensorboard", 'wandb'],
        logging_steps=1,
        logging_first_step=True,
        hub_model_id=new_model_name,
        push_to_hub=True,
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=8192,
        train_dataset=chat_dataset['train_sft'],
        eval_dataset=chat_dataset['test_sft'],
        formatting_func=format,
        neftune_noise_alpha=5,
    )

    trainer.train()
    trainer.push_to_hub()


if __name__ == '__main__':
    basemodel_name = 'Rijgersberg/GEITje-7B'
    model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
                                                 low_cpu_mem_usage=True, attn_implementation="sdpa",
                                                 device_map='auto')
    tokenizer = AutoTokenizer.from_pretrained(basemodel_name)

    # Mistral 7B is missing a padding token by default, so we need to assign
    # another token to the padding job during training.
    # Unfortunately we cannot use the </s> token, because we need the model to
    # learn to output </s> at the end of its turn, so that we can stop generating
    # when it emits it. If we were to also use it as the padding token,
    # any loss computed on </s> would then be discarded, nothing would be learned
    # and the model would never stop generating.
    # Trust me, I learned this the hard way ;).
    # Therefore, we take the least bad alternative action and assign
    # the rarely used <UNK> token to the padding role.
    tokenizer.pad_token = tokenizer.unk_token
    model.config.pad_token_id = tokenizer.unk_token_id

    no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
    no_robots_nl["train_sft"]=no_robots_nl["train_sft"].select(range(2))
    no_robots_nl["test_sft"]=no_robots_nl["test_sft"].select(range(2))

    ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')
    ultrachat_nl["train_sft"]=ultrachat_nl["train_sft"].select(range(2))
    ultrachat_nl["test_sft"]=ultrachat_nl["test_sft"].select(range(2))

    chat_dataset = DatasetDict({
        'train_sft': concatenate_datasets([no_robots_nl['train_sft'],
                                           ultrachat_nl['train_sft']]).shuffle(seed=42),
        'test_sft': concatenate_datasets([no_robots_nl['test_sft'],
                                          ultrachat_nl['test_sft']]).shuffle(seed=42),
    })

    chat_dataset = chat_dataset.filter(lambda row: all(turn['content'] != '<TRANSLATION FAILED>'
                                                       for turn in row['messages_nl']))

    train(model, tokenizer, chat_dataset,
          new_model_name='Rijgersberg/GEITje-7B-chat')

  


RuntimeError: Failed to import trl.trainer.sft_trainer because of the following error (look up to see its traceback):
No module named 'torch.distributed.algorithms'

In [5]:
pip install tokenizers==0.14

Collecting tokenizers==0.14
  Downloading tokenizers-0.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting huggingface_hub<0.17,>=0.16.4 (from tokenizers==0.14)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl.metadata (12 kB)
Downloading tokenizers-0.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0mm
[?25hDownloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub, tokenizers
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.22.2
    Uninstalling huggingface-hub-0.22.2:
      Successfully uninstalled huggingface-hub-0.22.2
  Attempting uninstall: tokenizers
    Found existing 

In [1]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu110

Looking in indexes: https://download.pytorch.org/whl/cu110
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu110/torchvision-0.8.2%2Bcu110-cp39-cp39-linux_x86_64.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/torchaudio-0.9.1-cp39-cp39-linux_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading https://download.pytorch.org/whl/cu110/torch-1.7.1%2Bcu110-cp39-cp39-linux_x86_64.whl (1156.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 GB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hCollecting pillow>=4.1.1 (from torchvision)
  Downloading https://download.pytorch.org/whl/pillow-10.2.0-cp39-cp39-manylinux_2_28_x86_64.whl (4.

In [16]:
no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
no_robots_nl["train_sft"]=no_robots_nl["train_sft"].select(range(2))
no_robots_nl["test_sft"]=no_robots_nl["test_sft"].select(range(2))

ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')
ultrachat_nl["train_sft"]=ultrachat_nl["train_sft"].select(range(2))
ultrachat_nl["test_sft"]=ultrachat_nl["test_sft"].select(range(2))


chat_dataset = DatasetDict({
    'train_sft': concatenate_datasets([no_robots_nl['train_sft'],
                                        ultrachat_nl['train_sft']]).shuffle(seed=42),
    'test_sft': concatenate_datasets([no_robots_nl['test_sft'],
                                        ultrachat_nl['test_sft']]).shuffle(seed=42),
})

display(chat_dataset)


DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'messages_nl'],
        num_rows: 4
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'messages_nl'],
        num_rows: 4
    })
})