In [1]:
# Run the blobfuse helper script before anything else in the notebook.
# NOTE(review): presumably this mounts the blob storage that the config paths point to — confirm.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [2]:
import os
import sys

import pandas as pd

# Add the parent directory to the import path so the project modules imported
# below can be resolved (presumably settings/config live there — confirm).
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here with a clear message instead of a NameError on `cf` further down.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parent folders and does not
    # fail if the directory already exists.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Set before `transformers` is imported in later cells so the cache
    # location is picked up.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

In [3]:
import torch
# Release cached GPU memory held by PyTorch's allocator before loading/training the model.
torch.cuda.empty_cache()

## Notebook Overview
Goal: Fine-tune models for document classification.

Method: each document is shortened to its first 200 tokens. The shortened document is then formatted with the zero-shot prompt (without a template), and the ideal response is formatted as JSON. The formatted document and response are combined into a conversation using the `apply_chat_template` function.

*Previous notebook: FinetuningDataFormatting*

*Next notebook: GetPredictions*

In [16]:
# Necessary to log in to Hugging Face so that the fine-tuned models can be saved there
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Finetuning GEITje

In [3]:
# Load GEITje formatted data
# The dataset is fetched by its Hugging Face Hub id; the result is indexed by
# split name further down (chat_dataset[train_set] / chat_dataset[test_set]).

from datasets import load_dataset
chat_dataset = load_dataset('FemkeBakker/AmsterdamBalancedFirst200Tokens')


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base model to fine-tune: exactly one of the ids below should be active.
# basemodel_name = 'Rijgersberg/GEITje-7B-chat-v2'
basemodel_name = 'mistralai/Mistral-7B-Instruct-v0.2'
# basemodel_name = "stabilityai/stablelm-2-1_6b"
# basemodel_name = 'meta-llama/Llama-2-7b-chat-hf'
# Load the weights in bfloat16 onto the CPU with SDPA attention.
# NOTE(review): presumably the trainer moves the model to the GPU later — confirm.
model = AutoModelForCausalLM.from_pretrained(basemodel_name, torch_dtype=torch.bfloat16,
                                                low_cpu_mem_usage=True, attn_implementation="sdpa",
                                                device_map='cpu')


tokenizer = AutoTokenizer.from_pretrained(basemodel_name)
# Reuse the unknown token as padding token and pad on the right.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'

# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.unk_token_id



Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [23]:
import pandas as pd

# function to load a previously saved dataframe, append the current run(s), then save again
def combine_and_save_df(model_df, save_to_path):
    """Append ``model_df`` to the pickled DataFrame at ``save_to_path`` and save it.

    If a pickle already exists at ``save_to_path`` its rows are kept and the
    rows of ``model_df`` are appended after them; otherwise ``model_df`` is
    written as a new file. The combined frame gets a fresh 0..n-1 index so
    rows from different runs do not share the same index label.

    Args:
        model_df: DataFrame with the row(s) to add.
        save_to_path: Path of the pickle file to create or extend.

    Returns:
        None. The result is written to ``save_to_path``.
    """
    # combine with earlier runs if exists
    if os.path.exists(save_to_path):
        # NOTE: read_pickle can execute arbitrary code; only use on files written by this project.
        original = pd.read_pickle(save_to_path)
        model_df = pd.concat([original, model_df], ignore_index=True)
    else:
        # First save: make sure the target folder exists.
        parent_dir = os.path.dirname(save_to_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    model_df.to_pickle(save_to_path)

In [27]:
import torch
from datasets import DatasetDict, load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import time

import sys
sys.path.append('../src/') 
import prediction_helperfunctions as ph

def train(model, model_name, tokenizer, chat_dataset, chat_dataset_name, new_model_name, output_directory, train_set, test_set, run_id='No_id', save_to_hub=True, resume=False):
    """Fine-tune ``model`` with ``SFTTrainer`` and log the run in ``overview_models.pkl``.

    The conversations in the ``'message'`` column of the dataset are rendered
    to text with the tokenizer's chat template and used for supervised
    fine-tuning. Whether the run finishes, is interrupted, or raises, one row
    describing it is appended to
    ``{cf.output_path}/finetuning_output/overview_models.pkl``.

    Args:
        model: Model object to fine-tune.
        model_name: Name of the base model (only recorded in the overview).
        tokenizer: Tokenizer providing ``apply_chat_template``.
        chat_dataset: Mapping from split name to dataset.
        chat_dataset_name: Name of the dataset (only recorded in the overview).
        new_model_name: Hub model id for the fine-tuned model.
        output_directory: Directory where checkpoints are written.
        train_set: Key of the training split in ``chat_dataset``.
        test_set: Key of the evaluation split in ``chat_dataset``.
        run_id: Identifier stored with the run in the overview.
        save_to_hub: If true, checkpoints and the final model are pushed to the hub.
        resume: Passed to ``trainer.train(resume_from_checkpoint=...)``.

    Returns:
        None. Errors during training are printed and recorded in the overview
        instead of being re-raised.
    """
    start_time = time.time()
    overview_path = f'{cf.output_path}/finetuning_output/overview_models.pkl'

    # format conversations (named so it does not shadow the builtin `format`)
    def format_conversations(examples):
        return [tokenizer.apply_chat_template(conversation, tokenize=False)
                for conversation in examples['message']]

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 8
    # device_count() is 0 without a visible GPU; guard against division by zero.
    n_devices = max(1, torch.cuda.device_count())
    steps_per_epoch = len(chat_dataset[train_set])\
                // (n_devices * per_device_train_batch_size * gradient_accumulation_steps)
    # Evaluate about five times per epoch, but at least every step for tiny datasets.
    eval_steps = max(1, steps_per_epoch // 5)

    training_args = TrainingArguments(
        optim='adamw_bnb_8bit',
        num_train_epochs=3,
        learning_rate=1e-5,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        # Use the same values that steps_per_epoch was computed from.
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_strategy='epoch',
        bf16=False, #bf16=True require CUDA 11 -> original code bf16=True
        output_dir=output_directory,
        report_to=["tensorboard", 'wandb'],
        logging_steps=1,
        logging_first_step=True,
        hub_model_id=new_model_name,
        # Respect the caller's choice; previously checkpoints were pushed even
        # when save_to_hub=False.
        push_to_hub=save_to_hub,
        hub_private_repo=True,
        hub_strategy='all_checkpoints',
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=8192,
        train_dataset=chat_dataset[train_set],
        eval_dataset=chat_dataset[test_set],
        formatting_func=format_conversations,
        neftune_noise_alpha=5,
    )

    dict_info = {
        'model':new_model_name,
        'base_model':model_name,
        'chat_dataset':chat_dataset_name,
        'train_set':train_set,
        'test_set': test_set,
        'training_args': training_args,
        'resume_from_checkpoint':resume,
        'date':ph.get_datetime(),
        'runtime': False,
        'Error': False,
        'run_id':run_id,
        'save_to_hub':save_to_hub,
        'output_dir': output_directory
        }

    def log_run(error=False):
        """Store the outcome and runtime of this run as one row in the overview file."""
        dict_info['Error'] = error
        dict_info['runtime'] = time.time()-start_time

        data = pd.DataFrame(columns=dict_info.keys())
        data.loc[len(data)] = dict_info
        combine_and_save_df(data, overview_path)

    # if no error during training, save run in overview_models and push to hub
    try:
        trainer.train(resume_from_checkpoint=resume)
        if save_to_hub:
            trainer.push_to_hub()

        log_run()
        print("Finished without error!")

    # if keyboardinterrupted or an error is thrown, save run in overview_models
    except KeyboardInterrupt:
        log_run(error='KeyboardInterrupt')

    except Exception as e:
        print(e)
        # Store a string instead of the exception object so the overview pickle
        # stays loadable without the exception's class. The run is logged
        # exactly once (the old second save built a DataFrame from scalars and
        # raised a ValueError inside this handler).
        log_run(error=repr(e))


**Note**

To use resume_from_checkpoint, the epoch must be complete; otherwise, it will throw an error. If an error occurs even after an epoch is complete, remove the last checkpoint folder to resolve this. This means you can only resume training from a completed checkpoint. Since each epoch took about 30 minutes, this was not an issue.

MAKE SURE: run_id is unique for each separate run. Check overview_models.pkl to find which run ids have already been used.

In [None]:
# Path to the folder where the checkpoints of the model are saved
# NOTE(review): the folder name says "Llama" while the active basemodel_name in the
# model-loading cell is Mistral — confirm the two match before running.
output_directory = f'{cf.output_path}/finetuning_output/AmsterdamDocClassificationLlama200T3Epochs'

# The name of the chat dataset (recorded in the run overview)
chat_dataset_name = 'FemkeBakker/AmsterdamBalancedFirst200Tokens'

# Split names used for training and for evaluation during training
training_set = 'train' 
validation_set = 'val'

In [28]:
# Hub repo id for the fine-tuned model. The original positional call omitted
# this argument, which shifted every following argument by one and made the
# call fail with a TypeError for the missing `test_set`.
# NOTE(review): the id is derived from the checkpoint folder name — confirm it
# is the intended Hugging Face repository.
new_model_name = f'FemkeBakker/{os.path.basename(output_directory)}'

# Keyword arguments keep each value bound to the intended parameter.
train(
    model=model,
    model_name=basemodel_name,
    tokenizer=tokenizer,
    chat_dataset=chat_dataset,
    chat_dataset_name=chat_dataset_name,
    new_model_name=new_model_name,
    output_directory=output_directory,
    train_set=training_set,
    test_set=validation_set,
    run_id=28,
    save_to_hub=True,
    resume=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss
1353,0.976,0.814801
1476,0.6826,0.812697
1599,0.7712,0.811746
1722,0.9744,0.811608
1845,1.0399,0.811607


## TODO: clean up overview file

In [10]:
# Load the log of all fine-tuning runs and render it as a table.
overview_file = f'{cf.output_path}/finetuning_output/overview_models.pkl'
overview = pd.read_pickle(overview_file)
display(overview)

Unnamed: 0,model,base_model,chat_dataset,train_set,test_set,training_args,resume_from_checkpoint,date,runtime,Error,run_id,save_to_hub,output_dir
0,FemkeBakker/AmsterdamDocClassificationGEITje200T,Rijgersberg/GEITje-7B-chat-v2,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-05-29 14:07:04.642821+02:00,2926.216714,False,15,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationMistral200T,mistralai/Mistral-7B-Instruct-v0.2,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-05-29 15:36:45.085202+02:00,2879.482625,False,16,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationLlama200T,meta-llama/Llama-2-7b-chat-hf,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-05-29 16:44:49.778730+02:00,2459.635145,False,17,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationLlama200T,meta-llama/Llama-2-7b-chat-hf,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-05-29 17:32:34.996990+02:00,2435.851116,False,17,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationMistrall...,mistralai/Mistral-7B-Instruct-v0.2,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-05-29 18:17:40.189426+02:00,2909.715319,False,16,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationMistral200T,mistralai/Mistral-7B-Instruct-v0.2,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-05-29 19:20:43.997402+02:00,19.168457,KeyboardInterrupt,16,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationMistral200T,mistralai/Mistral-7B-Instruct-v0.2,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-05-29 19:21:12.049071+02:00,370.747607,False,16,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationGEITje200T,Rijgersberg/GEITje-7B-chat-v2,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-05-29 19:39:27.535325+02:00,2821.834527,False,15,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationGEITje20...,Rijgersberg/GEITje-7B-chat-v2,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-06-03 08:33:45.251941+02:00,419.937012,False,18,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
0,FemkeBakker/AmsterdamDocClassificationLlama200...,meta-llama/Llama-2-7b-chat-hf,FemkeBakker/AmsterdamBalancedFirst200Tokens,train,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-06-03 08:52:32.234614+02:00,378.556517,False,19,True,/home/azureuser/cloudfiles/code/blobfuse/raads...
