In [None]:
# Run the project's blobfuse shell script (presumably mounts the raadsinformatie blob storage; confirm).
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [None]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
import settings as st

# Load the configuration module that matches the run location. Anything other
# than the two supported values is rejected here, instead of surfacing later
# as a NameError on `cf`.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')


import os
if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parent folders and is safe
    # to re-run, unlike the exists()/mkdir() pair.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Set before `transformers` is imported further down, so that the cache
    # location is picked up.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 
# pip install bitsandbytes
# pip install bnb
# pip install wandb==0.13.3 --upgrade
#pip install tensorboardX


In [None]:
import torch
# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; presumably needed because train() pushes to private repos (confirm).
notebook_login()

### Finetuning GEITje

In [None]:
# Load GEITje formatted data
from datasets import load_dataset
# The fine-tuning call further down trains on the 'dev' split and evaluates on 'val'.
chat_dataset = load_dataset('FemkeBakker/AmsterdamFormat200LlamaTokens')


In [None]:
# chat_dataset["train"]=chat_dataset["train"].select(range(2))
# chat_dataset["test"]=chat_dataset["test"].select(range(2))
# chat_dataset["val"]=chat_dataset["val"].select(range(2))
# chat_dataset["dev"]=chat_dataset["dev"].select(range(2))

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base checkpoint to fine-tune; the alternatives are kept for quick switching.
# basemodel_name = 'Rijgersberg/GEITje-7B-chat-v2'
# basemodel_name = "stabilityai/stablelm-2-1_6b"
# basemodel_name = 'meta-llama/Llama-2-7b-chat-hf'
basemodel_name = 'mistralai/Mistral-7B-Instruct-v0.2'

# Tokenizer: pad on the right and reuse the <unk> token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(basemodel_name)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.unk_token

# Model weights are loaded in bfloat16 and placed on the CPU.
model = AutoModelForCausalLM.from_pretrained(
    basemodel_name,
    device_map='cpu',
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)

# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.unk_token_id

In [None]:
# print(tokenizer.apply_chat_template(chat_dataset['train']['message'][0], tokenize=False))

In [None]:
import pandas as pd
def combine_and_save_df(model_df, save_to_path):
    """Append ``model_df`` to the pickled DataFrame at ``save_to_path`` and save it.

    If a pickle already exists at ``save_to_path`` its rows are kept and the
    rows of ``model_df`` are added below them; otherwise ``model_df`` is
    written as-is.

    Args:
        model_df: DataFrame with the rows to add.
        save_to_path: Path of the pickle file that accumulates all rows.
    """
    import os  # local import so the function does not rely on the setup cell's `import os`

    # combine with earlier runs if exists
    if os.path.exists(save_to_path):
        # NOTE: unpickling can execute arbitrary code; only read files this project wrote.
        original = pd.read_pickle(save_to_path)
        # ignore_index renumbers the rows, so every appended run gets its own
        # label instead of all rows ending up with index 0.
        model_df = pd.concat([original, model_df], ignore_index=True)

    model_df.to_pickle(save_to_path)

In [11]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import time

import sys
sys.path.append('../scripts/') 
import prediction_helperfunctions as ph

def train(model, model_name, tokenizer, chat_dataset, chat_dataset_name, new_model_name, train_set, test_set, run_id='No_id', save_to_hub=True, resume=False):
    """Fine-tune ``model`` with TRL's ``SFTTrainer`` and log the run.

    One row describing the run (settings, runtime, error state) is appended to
    ``{cf.output_path}/overview_models.pkl`` whether training finishes, is
    interrupted from the keyboard, or raises.

    Args:
        model: Model instance to fine-tune.
        model_name: Name of the base model; only stored in the run log.
        tokenizer: Tokenizer whose chat template formats the conversations.
        chat_dataset: Mapping of split name to dataset; rows need a 'message' column.
        chat_dataset_name: Name of the dataset; only stored in the run log.
        new_model_name: Hub repository id for the fine-tuned model.
        train_set: Key of the training split in ``chat_dataset``.
        test_set: Key of the evaluation split in ``chat_dataset``.
        run_id: Identifier stored in the run log.
        save_to_hub: Push checkpoints and the final model to the Hub when true.
        resume: Forwarded to ``trainer.train(resume_from_checkpoint=...)``.

    Returns:
        None. Exceptions raised during training are printed and logged, not re-raised.
    """
    start_time = time.time()
    overview_path = f'{cf.output_path}/overview_models.pkl'

    def format_conversations(examples):
        # Render every conversation to one training string via the chat template.
        return [tokenizer.apply_chat_template(conversation, tokenize=False)
                for conversation in examples['message']]

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    # device_count() is 0 without a GPU; clamp it so the division cannot fail.
    n_devices = max(1, torch.cuda.device_count())
    steps_per_epoch = len(chat_dataset[train_set])\
                // (n_devices * per_device_train_batch_size * gradient_accumulation_steps)
    # Evaluate about five times per epoch, and at least every step on tiny datasets.
    eval_steps = max(1, steps_per_epoch // 5)

    output_directory = f'{cf.output_path}/finetuning_output/MistralSmallData200Tokens_finetuning_output'

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=1,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=False, #bf16=True require CUDA 11 -> original code bf16=True
        output_dir=output_directory,
        report_to=["tensorboard", 'wandb'],
        logging_steps=1,
        logging_first_step=True,
        hub_model_id=new_model_name,
        # Honour the caller's choice; a hard-coded True pushed checkpoints
        # even when save_to_hub=False.
        push_to_hub=bool(save_to_hub),
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=8192,
        train_dataset=chat_dataset[train_set],
        eval_dataset=chat_dataset[test_set],
        formatting_func=format_conversations,
        neftune_noise_alpha=5,
    )

    dict_info = {
        'model':new_model_name,
        'base_model':model_name,
        'chat_dataset':chat_dataset_name,
        'train_set':train_set,
        'test_set': test_set,
        'training_args': training_args,
        'resume_from_checkpoint':resume,
        'date':ph.get_datetime(),
        'runtime': False,
        'Error': False,
        'run_id':run_id,
        'save_to_hub':save_to_hub,
        'output_dir': output_directory
        }

    def record_run(error=False):
        # Append exactly one row for this run to the overview file.
        dict_info['Error'] = error
        dict_info['runtime'] = time.time()-start_time
        data = pd.DataFrame(columns=dict_info.keys())
        data.loc[len(data)] = dict_info
        combine_and_save_df(data, overview_path)

    try:
        trainer.train(resume_from_checkpoint=resume)
        if save_to_hub:
            trainer.push_to_hub()

        record_run()
        print("Finished without error!")

    except KeyboardInterrupt:
        # A manual stop is logged and swallowed so the notebook stays usable.
        record_run(error='KeyboardInterrupt')

    except Exception  as e:
        print(e)
        record_run(error=e)


##### Note
`resume_from_checkpoint` raises an error if the last epoch was not fully run: the checkpoint folder exists, but files are missing from it. Removing the last checkpoint folder solves this, but it does mean the epoch needs to restart completely. Whether this is a viable solution depends on how long an epoch takes to train, which might be quite long. However, if all files are in the folder, resuming works fine. How long does it take before all files are saved in the folder? Does this require the epoch to be completed?


MAKE SURE: no previous checkpoints of runs unrelated to the current run are in the output folder!!

MAKE SURE: run_id is unique for each separate run.

In [13]:
# Fine-tune the loaded base model on the 'dev' split, evaluate on 'val',
# and push the result to the Hub.
train(
    model,
    basemodel_name,
    tokenizer,
    chat_dataset,
    chat_dataset_name='FemkeBakker/AmsterdamFormat200LlamaTokens',
    new_model_name='FemkeBakker/MistralSmallData200Tokens',
    train_set='dev',
    test_set='val',
    run_id=9,
    save_to_hub=True,
    resume=False,
)



Map:   0%|          | 0/832 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect 

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
10,1.0069,1.119058
20,0.8022,0.961923
30,0.6905,0.897307
40,0.697,0.872296
50,1.0026,0.869633


HTTP Error 500 thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/37/a7/37a787d996b7af1b6f18fa7d6857bc2f748841b70d9f17b75c5e1bc285c3cdf4/6a750e0269eca2c8c355632186fb1bd74700f08e6f82ff67ccdd0e6af523fc3a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQFN2FTF47%2F20240502%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240502T114439Z&X-Amz-Expires=86400&X-Amz-Signature=2cba5ad20f07e06db4718a485b70a9c9a9b29aefdb2c5c9948573ad583ee8ca8&X-Amz-SignedHeaders=host&partNumber=2&uploadId=8q.sMk9sougyC06hMDB8rU9efvnFqejAv7l6P1_Z2muAt7ofmPoe4esfqlAouMbvmixrVonnY_YC76gviJ7t1DtjalhQ4SpMB1i_Su3XoUnbEFwAh7QzzHINLswMROYo&x-id=UploadPart
Retrying in 1s [Retry 1/5].


model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Finished without error!


In [None]:
# from transformers import pipeline, Conversation

# chatbot = pipeline(task='conversational', model='FemkeBakker/GEITjeSmallData200Tokens',
#                    device_map='auto', model_kwargs={'offload_buffers':True})

# ## EXAMPLE PROMPT
# response = chatbot(
#     Conversation(chat_dataset['train']['message'][0][0]['content'])
# )

# print(response)

In [14]:
# Load and show the log of fine-tuning runs that train() appends to.
# NOTE: unpickling can execute arbitrary code; only read pickles this project wrote itself.
yeet = pd.read_pickle(f'{cf.output_path}/overview_models.pkl')
display(yeet)

Unnamed: 0,model,base_model,chat_dataset,train_set,test_set,training_args,resume_from_checkpoint,date,runtime,Error,run_id,save_to_hub,output_dir
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-04-25 13:21:46.201696+02:00,0.876791,KeyboardInterrupt,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-25 13:24:59.884223+02:00,0.274652,Error(s) in loading state_dict for MistralForC...,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-25 13:25:57.527833+02:00,0.362344,No valid checkpoint found in output directory ...,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-04-25 13:26:24.369492+02:00,0.319212,CUDA out of memory. Tried to allocate 112.00 M...,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-26 08:47:21.945291+02:00,0.392476,KeyboardInterrupt,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-26 08:47:57.151755+02:00,0.600505,string longer than 2147483647 bytes,0,,
0,FemkeBakker/TryoutGeitje2,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-04-26 08:56:52.283214+02:00,0.449522,False,1,,
0,FemkeBakker/TryoutGeitje2,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-26 09:26:12.329799+02:00,3.251497,[Errno 2] No such file or directory: '/home/az...,1,,
0,FemkeBakker/TryoutGeitje2,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-26 09:29:06.636546+02:00,0.543515,False,1,,
0,FemkeBakker/tryoutstablelm,stabilityai/stablelm-2-1_6b,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-04-29 09:46:03.908296+02:00,0.364137,KeyboardInterrupt,2,,


### classification model

In [None]:
# Inspect the chat dataset object loaded earlier in the notebook.
chat_dataset

In [None]:
from datasets import load_dataset

# Dataset for the classification experiment below; its train() reads the "text" column.
dataset = load_dataset("yelp_review_full")


In [None]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import time

import sys
sys.path.append('../scripts/') 
import prediction_helperfunctions as ph

def train(model, model_name, tokenizer, dataset, chat_dataset_name, new_model_name, train_set, test_set, run_id='No_id', save_to_hub=True, resume=False):
    """Run a small ``SFTTrainer`` experiment on at most 100 rows per split.

    NOTE: this definition reuses the name ``train`` and therefore replaces the
    chat fine-tuning ``train`` defined earlier in the notebook once this cell
    has run. Unlike that function, it does not log the run to
    ``overview_models.pkl`` and does not catch exceptions.

    Args:
        model: Model instance to train.
        model_name: Unused; kept so the signature matches the other ``train``.
        tokenizer: Tokenizer applied to the "text" column of the data.
        dataset: Mapping of split name to dataset; rows need a "text" column.
        chat_dataset_name: Unused; kept for signature compatibility.
        new_model_name: Hub repository id for the trained model.
        train_set: Key of the training split in ``dataset``.
        test_set: Key of the evaluation split in ``dataset``.
        run_id: Unused; kept for signature compatibility.
        save_to_hub: Push checkpoints and the final model to the Hub when true.
        resume: Forwarded to ``trainer.train(resume_from_checkpoint=...)``.

    Returns:
        The ``SFTTrainer`` instance after training.
    """
    max_rows = 100  # size of the shuffled sample taken from each split

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    def sample_and_tokenize(split):
        # Shuffle reproducibly, keep at most `max_rows` rows, then tokenize.
        # min() keeps select() from failing on splits shorter than max_rows.
        shuffled = dataset[split].shuffle(seed=42)
        subset = shuffled.select(range(min(max_rows, len(shuffled))))
        return subset.map(tokenize_function, batched=True)

    train_data = sample_and_tokenize(train_set)
    test_data = sample_and_tokenize(test_set)

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    # device_count() is 0 without a GPU; clamp it so the division cannot fail.
    n_devices = max(1, torch.cuda.device_count())
    steps_per_epoch = len(train_data)\
                // (n_devices * per_device_train_batch_size * gradient_accumulation_steps)
    # Evaluate about five times per epoch, and at least every step on tiny datasets.
    eval_steps = max(1, steps_per_epoch // 5)

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=1,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=False, #bf16=True require CUDA 11 -> original code bf16=True
        output_dir=f'{cf.output_path}/finetuning_output/stablelm_finetuning_output',
        report_to=["tensorboard", 'wandb'],
        logging_steps=1,
        logging_first_step=True,
        hub_model_id=new_model_name,
        # Honour the caller's choice; a hard-coded True pushed checkpoints
        # even when save_to_hub=False.
        push_to_hub=bool(save_to_hub),
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=8192,
        train_dataset=train_data,
        eval_dataset=test_data,
        packing=True
        # formatting_func=format,
        # neftune_noise_alpha=5,
    )

    trainer.train(resume_from_checkpoint=resume)
    if save_to_hub:
        trainer.push_to_hub()
    return trainer


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification

# basemodel_name = 'Rijgersberg/GEITje-7B'
basemodel_name = "stabilityai/stablelm-2-1_6b"
model = AutoModelForSequenceClassification.from_pretrained(basemodel_name,num_labels=5, torch_dtype=torch.bfloat16,
                                                low_cpu_mem_usage=True, attn_implementation="sdpa",
                                                device_map='cpu')
tokenizer = AutoTokenizer.from_pretrained(basemodel_name)
# tokenizer.pad_token = tokenizer.unk_token
# tokenizer.padding_side = 'right'

# model.config.pad_token_id = tokenizer.unk_token_id
# Each example gets exactly one of the five labels (num_labels=5), so this is
# single-label classification. "multi_label_classification" selects a
# BCE-with-logits loss that expects multi-hot float targets instead.
model.config.problem_type = "single_label_classification"

In [None]:
# Train the classification model on the 'train' split and evaluate on 'test';
# nothing is pushed to the Hub.
# NOTE(review): the dataset name passed here does not match the Yelp dataset in
# `dataset`; confirm which name should be recorded.
trained_class = train(
    model,
    basemodel_name,
    tokenizer,
    dataset,
    chat_dataset_name='FemkeBakker/AmsterdamGEITjeFormat200Tokens',
    new_model_name='FemkeBakker/tryoutstablelm',
    train_set='train',
    test_set='test',
    run_id=2,
    save_to_hub=False,
    resume=False,
)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load pre-trained model and tokenizer
import torch  # needed for no_grad below; kept local so this cell runs on its own

model_name = "FemkeBakker/TryoutGeitje2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenize input text
input_text = "This is a sample sentence."
inputs = tokenizer(input_text, return_tensors="pt")

# Model inference; no_grad skips building the autograd graph, which is only
# needed for training and costs extra memory.
with torch.no_grad():
    outputs = model(**inputs)
# Index of the highest logit per input row.
predictions = outputs.logits.argmax(dim=1)

# Post-processing (e.g., convert predictions to labels)
predicted_label = predictions.item()
print("Predicted label:", predicted_label)


https://huggingface.co/docs/transformers/training

### Tryout finetuned model

In [None]:

## WORKING VERSION OF GEITJE-Chat using AutoModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# Device that generate() moves the tokenized prompt to.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model_name = 'Rijgersberg/GEITje-7B-chat-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='balanced',
    attn_implementation="eager",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)

def generate(conversation, temperature=0.2, top_k=50, max_new_tokens=1_000):
    """Sample a reply to ``conversation`` from the notebook-level ``model``.

    Args:
        conversation: List of ``{'role': ..., 'content': ...}`` messages.
        temperature: Sampling temperature passed to ``model.generate``.
        top_k: Top-k cutoff passed to ``model.generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first generated sequence decoded to text, with special tokens removed.
    """
    # Apply the chat template, including the prompt for the assistant's turn.
    prompt_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors='pt',
    )
    prompt_ids = prompt_ids.to(device)

    generated = model.generate(
        prompt_ids,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
    )

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Single user turn containing only the raw text of a council document, without any instruction.
conversation = [
    {
        'role': 'user',
        'content': 'Xx Gemeente Amsterdam F l N\n% Raadscommissie voor Financiën, Coördinatie 3d, Coördinatie Aanpak Subsidies,\nAanpak Belastingen, Waterbeheer, Vastgoed, Inkoop en Personeel en Organisatie\n\n% Agenda, woensdag 3 juni 2015\nHierbij wordt u uitgenodigd voor de openbare vergadering van de Raadscommissie\nvoor Financiën, Coördinatie 3d, Coördinatie Aanpak Subsidies, Aanpak Belastingen,\nWaterbeheer, Vastgoed, Inkoop en Personeel en Organisatie\n\nTijd 09.00 tot 12.30 uur en van 13.30 tot 17',
    }
]
print(generate(conversation))
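# NOTE: the example output below belongs to a different prompt (the "auto, vliegtuig, geitje, bus" question), not to the document above.
# <|user|>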
# Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"? 
# <|assistant|>
# Het woord dat niet op zijn plaats staat is 'geit'. Een geit zou niet tussen een lijst van vervoersmiddelen moeten staan. Het past beter bij een boerderijthema of dierenlijst.

In [None]:

# Same document as in the previous cell, now wrapped in a Dutch instruction that asks for exactly one category name.
conversation = [
      {
        'role': 'user',
        'content': f"""
         Classificeer het document in één van de categoriën.
        Houd het kort, geef enkel de naam van de categorie als response.
    
    Categoriën:   ['Voordracht', 'Besluit', 'Schriftelijke Vragen', 'Brief', 'Raadsadres', 'Onderzoeksrapport', 'Termijnagenda', 'Raadsnotulen', 'Agenda', 'Motie', 'Actualiteit', 'Factsheets']
    
    Document: 

        'Xx Gemeente Amsterdam F l N\n% Raadscommissie voor Financiën, Coördinatie 3d, Coördinatie Aanpak Subsidies,\nAanpak Belastingen, Waterbeheer, Vastgoed, Inkoop en Personeel en Organisatie\n\n% Agenda, woensdag 3 juni 2015\nHierbij wordt u uitgenodigd voor de openbare vergadering van de Raadscommissie\nvoor Financiën, Coördinatie 3d, Coördinatie Aanpak Subsidies, Aanpak Belastingen,\nWaterbeheer, Vastgoed, Inkoop en Personeel en Organisatie\n\nTijd 09.00 tot 12.30 uur en van 13.30 tot 17'
        """
    }
]
print(generate(conversation))

In [None]:
# Show the content of the first message of the first 'val' example.
chat_dataset['val'][0]['message'][0]['content']

### Code works for GEITje example

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base model for the reference fine-tuning recipe below; loaded in bfloat16 on the CPU.
basemodel_name = 'Rijgersberg/GEITje-7B'

model = AutoModelForCausalLM.from_pretrained(
    basemodel_name,
    device_map='cpu',
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)

In [None]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

def train(model, tokenizer, chat_dataset, new_model_name):
    """Fine-tune ``model`` on ``chat_dataset`` with TRL's ``SFTTrainer`` and push it to the Hub.

    NOTE: this definition reuses the name ``train`` and therefore replaces any
    ``train`` defined earlier in the notebook once this cell has run.

    Args:
        model: Model instance to fine-tune.
        tokenizer: Tokenizer whose chat template formats the conversations.
        chat_dataset: Mapping with 'train_sft' and 'test_sft' splits; rows need
            a 'messages_nl' column.
        new_model_name: Hub repository id for the fine-tuned model.

    Returns:
        The ``SFTTrainer`` instance after training and pushing.
    """

    def format_conversations(examples):
        # Render every conversation to one training string via the chat template.
        return [tokenizer.apply_chat_template(conversation, tokenize=False)
                for conversation in examples['messages_nl']]

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    # device_count() is 0 without a GPU; clamp it so the division cannot fail.
    n_devices = max(1, torch.cuda.device_count())
    steps_per_epoch = len(chat_dataset['train_sft'])\
                 // (n_devices * per_device_train_batch_size * gradient_accumulation_steps)
    # Evaluate about five times per epoch, and at least every step on tiny
    # datasets, where the integer division would otherwise give 0.
    eval_steps = max(1, steps_per_epoch // 5)

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=3,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=True,
        output_dir=f'{cf.output_path}/geitje_finetuning_output',
        report_to=["tensorboard", 'wandb'],
        logging_steps=1,
        logging_first_step=True,
        hub_model_id=new_model_name,
        push_to_hub=True,
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=8192,
        train_dataset=chat_dataset['train_sft'],
        eval_dataset=chat_dataset['test_sft'],
        formatting_func=format_conversations,
        neftune_noise_alpha=5,
    )

    trainer.train()
    trainer.push_to_hub()
    return trainer


# if __name__ == '__main__':
# basemodel_name = 'Rijgersberg/GEITje-7B'
# model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
#                                                 low_cpu_mem_usage=True, attn_implementation="eager",
#                                                 device_map='balanced')

tokenizer = AutoTokenizer.from_pretrained(basemodel_name)

# Mistral 7B has no padding token by default, so another token has to take
# over that role during training. </s> cannot be used: the model must learn to
# emit </s> at the end of its turn so that generation can stop there, and any
# loss computed on the padding token is discarded. With </s> as padding the
# model would never learn to stop generating. The rarely used <UNK> token is
# the least bad alternative.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.unk_token
model.config.pad_token_id = tokenizer.unk_token_id

# Load both Dutch chat datasets and keep only the first two rows of each split.
no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')

first_two = range(2)
for source in (no_robots_nl, ultrachat_nl):
    for split in ("train_sft", "test_sft"):
        source[split] = source[split].select(first_two)

# Merge the two sources per split and shuffle reproducibly.
chat_dataset = DatasetDict({
    split: concatenate_datasets([no_robots_nl[split], ultrachat_nl[split]]).shuffle(seed=42)
    for split in ('train_sft', 'test_sft')
})


def _translation_succeeded(row):
    # Keep a conversation only when none of its turns is the failure marker.
    return not any(turn['content'] == '<TRANSLATION FAILED>'
                   for turn in row['messages_nl'])


chat_dataset = chat_dataset.filter(_translation_succeeded)

trained_model = train(
    model,
    tokenizer,
    chat_dataset,
    new_model_name='FemkeBakker/TryOutFinetuningGeitje',
)

In [None]:

## WORKING VERSION OF GEITJE-Chat using AutoModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# Device that generate() moves the tokenized prompt to.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model_name = 'Rijgersberg/GEITje-7B-chat-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='balanced',
    attn_implementation="eager",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)

def generate(conversation, temperature=0.2, top_k=50, max_new_tokens=1_000):
    """Sample a reply to ``conversation`` from the notebook-level ``model``.

    Args:
        conversation: List of ``{'role': ..., 'content': ...}`` messages.
        temperature: Sampling temperature passed to ``model.generate``.
        top_k: Top-k cutoff passed to ``model.generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first generated sequence decoded to text, with special tokens removed.
    """
    # Apply the chat template, including the prompt for the assistant's turn.
    prompt_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors='pt',
    )
    prompt_ids = prompt_ids.to(device)

    generated = model.generate(
        prompt_ids,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
    )

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Single-turn Dutch test prompt; an example of the printed result is shown in the comments below.
conversation = [
    {
        'role': 'user',
        'content': 'Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?'
    }
]
print(generate(conversation))
# <|user|>
# Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"? 
# <|assistant|>
# Het woord dat niet op zijn plaats staat is 'geit'. Een geit zou niet tussen een lijst van vervoersmiddelen moeten staan. Het past beter bij een boerderijthema of dierenlijst.