In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [3]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 
# pip install bitsandbytes
# pip install bnb
# pip install wandb==0.13.3 --upgrade


## Notebook Overview
Goal: get predictions of the finetuned models

In [2]:
import pandas as pd
import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import prediction_helperfunctions as ph
import truncation as tf


In [3]:
# Load model
from transformers import pipeline, Conversation

chatbot = pipeline(task='conversational', model='FemkeBakker/GEITjeSmallData200Tokens',
                   device_map='auto', model_kwargs={'offload_buffers':True})

2024-04-29 11:27:58.375087: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [14]:
import time
import os
import pandas as pd
from bm25 import BM25


""" Given a dataframe with txt, return a df with predictions """
# docs_df = dataframe with the documents that need to be predicted
# text_column = name of the column that includes the input_text. Can be different based on the text representation method. 
# prompt_function = prompt template 
# train_df = dataframe with docs, which can be used as examples/training data/context data
# num_examples = number of examples in the prompt

def predictions_incontextlearning(chatbot, docs_df, text_column, prompt_function, train_df, num_examples):
    results_df = pd.DataFrame(columns = ['id', 'path', 'text_column', 'prompt_function', 'response', 'prediction', 'label', 'runtime', 'date', 'prompt'])


    if prompt_function == pt.fewshot_prompt_bm25:
        BM25_model = BM25()
        BM25_model.fit(train_df[text_column])

    
    # prompt each document
    for index, row in docs_df.iterrows():
        if (index + 1) % 200 == 0:
            print(f"Iteration {index +1}/{len(docs_df)} completed.")

        start_time = time.time()

        # get the prompt, with the doc filled in
        txt = row[text_column]

        # each prompt function takes different arguments
        # simple function is zeroshot+simple instruction
        if prompt_function == pt.simple_prompt:
            prompt = prompt_function(txt)
      
        # select fewshot examples using bm25
        elif prompt_function == pt.fewshot_prompt_bm25:
            prompt = prompt_function(txt, train_df, num_examples, text_column, BM25_model)

        else:
            raise ValueError("Prompt function not recognised. Check if prompt function is in prompt_template.py and included in the options above.")

        # prompt and get the response
        converse = chatbot(Conversation(prompt))
        response = converse[1]['content']
        print("label: ", row['label'].lower())
        print("response: ", response)

        # extract prediction from response
        prediction = ph.get_prediction_from_response(response)
        print("prediction:", prediction)

        # save results in dataframe
        results_df.loc[len(results_df)] = {
            'id': row['id'],
            'path' : row['path'],
            'text_column' : docs_df.iloc[0]['trunc_col'],
            'prompt_function': ph.get_promptfunction_name(prompt_function),
            'response':response,
            'prediction':prediction,
            'label':row['label'].lower(),
            'runtime':time.time()-start_time,
            'date': ph.get_datetime(),
            'prompt':prompt
        }
    return results_df



In [15]:
import os
import time
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

"""
Function to run GEITje In-Context Learning experiment. 
The function allows to resume experiment, if run_id matches.
"""
# df = dataframe with all docs that need to have a prediction (docs still need to be predict + already predicted)
# run_id = unqiue for each experiment. 
# prompt_function = which prompt from prompt_template.py to use
# text_col = colum in df where the text is. (Needs to be already truncated)
# split_col = column with the dataset split. Either '2split' (train and test)or '4split'(train, test, dev and val)
# subset_train = indicates which subset to use as training. either 'train' or 'dev'
# subset_test = indicates which subset to use for testing. either 'test' or 'val'
# label_col = column with the true label
# prediction_path = path to file where predictions need to be saved.
# overview_path = path to file where results of each run need to be saved.
# model_name = name of the model. string.
# num_exmples = number of exaples given to prompt. zero in case of zeroshot. 

def run_experiment(chatbot, df, run_id, prompt_function, text_col, split_col, subset_train, subset_test, label_col, prediction_path, overview_path, model_name, num_examples=0):
    print(num_examples)
    start_time = time.time()
    test_df = df.loc[df[split_col]==subset_test]
    train_df = df.loc[df[split_col]==subset_train]
    
    # get rows of df that still need to be predicted for the specific run_id
    to_predict, previous_predictions = ph.get_rows_to_predict(test_df, prediction_path, run_id)

    # devide to_predict into subsection of 50 predictions at a time. 
    # Allows to rerun without problem. And save subsections of 50 predictions.
    step_range = list(range(0, len(to_predict), 25))

    for i in range(len(step_range)):
        try:
            sub_to_predict = to_predict.iloc[step_range[i]:step_range[i+1]]
            print(f'Starting...{step_range[i]}:{step_range[i+1]} out of {len(to_predict)}')
        except Exception as e:
            sub_to_predict = to_predict[step_range[i]:]
            print(f'Starting...last {len(sub_to_predict)} docs')

        # prompt geitje
        predictions = predictions_incontextlearning(chatbot, sub_to_predict, text_col, prompt_function, train_df, num_examples)

        # save info
        predictions['run_id'] = run_id
        predictions['train_set'] = subset_train
        predictions['test_set'] = subset_test
        predictions['shots'] = num_examples

        # save new combinations in file
        ph.combine_and_save_df(predictions, prediction_path)

        # if previous predictions, combine previous with new predictions, to get update classification report
        try:
            predictions = pd.concat([predictions, previous_predictions])

            # set previous predictions to all predictions made until now. Necessary for next loop
            previous_predictions = predictions
        except Exception as e:
            # set previous predictions to all predictions made until now. Necessary for next loop
            previous_predictions = predictions

        # save results in overview file
        date = ph.get_datetime()
        y_test = predictions['label']
        y_pred = predictions['prediction']
        report = classification_report(y_test, y_pred)

        overview = pd.DataFrame(
            [{
                'model':model_name,
                'run_id':run_id,
                'date': date,
                'train_set': subset_train,
                'test_set': subset_test,
                'train_set_support':len(df.loc[df[split_col]==subset_train]),
                'test_set_support':len(predictions),
                'split_col':split_col,
                'text_col':df.iloc[0]['trunc_col'],
                'runtime':sum(predictions['runtime']),
                'accuracy': accuracy_score(y_test, y_pred),
                'macro_avg_precision': precision_score(y_test, y_pred, average='macro'),
                'macro_avg_recall': recall_score(y_test, y_pred, average='macro'),
                'macro_avg_f1': f1_score(y_test, y_pred, average='macro'),
                'classification_report':report
            }   ]
        )
        # remove previous results of run_id, replace with new/updated results
        ph.replace_and_save_df(overview, overview_path, run_id)



In [3]:
#set  variables, same for each model
TRAIN_SET = 'dev' # must be dev or train
TEST_SET = 'val' # must be val or test
SPLIT_COLUMN = '4split' #must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val. 
LABEL_COLUMN = 'label'
TEXT_COLUMN = 'trunc_txt'

In [17]:
txt = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

In [28]:



MODEL_NAME = 'GEITjeSmallData200Tokens'
PROMPT = pt.simple_prompt
PROMPT_NAME = ph.get_promptfunction_name(PROMPT)
TOKENS_COL = 'LlamaTokens' # column with text split using tokenizer of either mistral (MistralTokens) or Llama (LlamaTokens). Using Llama, because Llama split into more tokens. 
FRONT_THRESHOLD = 200
BACK_THRESHOLD = 0
NUMBER_EXAMPLES = 0
run_id = f'FT_{MODEL_NAME}{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}'
PREDICTION_PATH = f"{cf.output_path}/predictionsVal/finetuning/GEITje/{PROMPT_NAME}/predictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/predictionsVal/finetuning/GEITje/{PROMPT_NAME}/overview.pkl"

print(run_id)
print(PREDICTION_PATH)

FT_GEITjeSmallData200Tokenssimple_promptLlamaTokens200_0devval_numEx0
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsVal/finetuning/GEITje/simple_prompt/predictions.pkl


In [23]:
# # ----- EXPERIMENT --------

# # add new column with truncated text -> new dataframe with column + new column name
# trunc_df = tf.add_truncation_column(txt,'text', TOKENS_COL, FRONT_THRESHOLD, BACK_THRESHOLD)

# # if new run MAKE SURE RUN_ID IS UNIQUE, if want to resume run, pass in that run_id
# run_experiment(chatbot, trunc_df, f'Finetuning{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}', PROMPT, TEXT_COLUMN, SPLIT_COLUMN, TRAIN_SET, TEST_SET, LABEL_COLUMN, PREDICTION_PATH, OVERVIEW_PATH, MODEL_NAME, NUMBER_EXAMPLES)


In [27]:
pred = pd.read_pickle(PREDICTION_PATH)
display(pred)

Unnamed: 0,id,path,text_column,prompt_function,response,prediction,label,runtime,date,prompt,run_id,train_set,test_set,shots
0,25976,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Agenda},agenda,agenda,18.792697,2024-04-29 13:33:42.405623+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0
1,22516,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Brief},brief,brief,14.238767,2024-04-29 13:33:56.714152+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0
2,15708,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Agenda},agenda,agenda,14.273011,2024-04-29 13:34:10.996600+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0
3,8810,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Onderzoeksrapport},onderzoeksrapport,onderzoeksrapport,20.118762,2024-04-29 13:34:31.117243+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0
4,32980,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Factsheet},factsheet,factsheet,16.799716,2024-04-29 13:34:47.918599+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,849,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Motie},motie,motie,14.255849,2024-04-29 14:31:11.698306+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0
5,29591,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Voordracht},voordracht,voordracht,17.181545,2024-04-29 14:31:28.881780+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0
6,26434,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Raadsnotulen},raadsnotulen,raadsnotulen,20.037838,2024-04-29 14:31:48.921263+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0
7,10269,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront200Back0,simple_prompt,{'categorie': Actualiteit},actualiteit,actualiteit,17.375131,2024-04-29 14:32:06.298052+02:00,\n Classificeer het document in één van de ...,FT_GEITjeSmallData200Tokenssimple_promptLlamaT...,dev,val,0


### check predictions

In [6]:
from sklearn.metrics import classification_report

confusion_matrices = pd.crosstab(pred['label'], pred['prediction'])
display(confusion_matrices)

report = classification_report(pred['label'], pred['prediction']) 
print(report)

report = classification_report(pred.loc[pred['prediction']!='NoPredictionInOutput']['label'], pred.loc[pred['prediction']!='NoPredictionInOutput']['prediction']) 
print(report)

prediction,NoPredictionInOutput,actualiteit,agenda,besluit,brief,factsheet,motie,onderzoeksrapport,raadsadres,raadsnotulen,schriftelijke vraag,voordracht
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
actualiteit,0,4,4,0,1,0,0,0,0,0,1,0
agenda,0,0,22,0,0,0,0,1,0,0,0,0
besluit,0,0,0,4,0,0,0,0,0,0,0,0
brief,0,0,0,0,10,0,2,0,0,0,0,0
factsheet,0,0,0,0,0,2,0,0,0,0,0,0
motie,0,0,0,0,0,0,64,0,1,0,0,0
onderzoeksrapport,2,1,0,0,0,2,0,11,0,0,0,0
raadsadres,1,1,0,0,0,0,0,0,16,0,0,0
raadsnotulen,0,0,0,0,0,0,0,0,0,3,0,0
schriftelijke vraag,1,0,0,0,0,0,1,1,0,0,29,0


                      precision    recall  f1-score   support

NoPredictionInOutput       0.00      0.00      0.00         0
         actualiteit       0.67      0.40      0.50        10
              agenda       0.85      0.96      0.90        23
             besluit       1.00      1.00      1.00         4
               brief       0.91      0.83      0.87        12
           factsheet       0.50      1.00      0.67         2
               motie       0.96      0.98      0.97        65
   onderzoeksrapport       0.85      0.69      0.76        16
          raadsadres       0.94      0.89      0.91        18
        raadsnotulen       1.00      1.00      1.00         3
 schriftelijke vraag       0.97      0.91      0.94        32
          voordracht       1.00      1.00      1.00        24

            accuracy                           0.90       209
           macro avg       0.80      0.80      0.79       209
        weighted avg       0.92      0.90      0.91       209

     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In the table above we can see that many docs get correctly predicted. 
The classes besluit, raadsnotulen and voordracht are completely correct. However, besluit and raadsnotulen do not have many docs in the validation set. 

Additionally, there are no NoPredictionFormat errors, meaning that each time GEITje returned the output in the correct format, that of a json file.
However, there are 4 NoPredictionInOutput errors, meaning that within the output format it could not find a class from the class list

Let's take a closer look at NoPredictionInOutput.

In [7]:
no_prediction = pred.loc[pred['prediction']=='NoPredictionInOutput']
no_prediction = no_prediction[['label', 'response', 'prediction']]
display(no_prediction)

Unnamed: 0,label,response,prediction
4,schriftelijke vraag,{'categorie': Memo},NoPredictionInOutput
12,onderzoeksrapport,{'categorie': Plan},NoPredictionInOutput
2,raadsadres,{'categorie': Aanbiedingsformulier},NoPredictionInOutput
1,onderzoeksrapport,{'categorie': Plan van aanpak},NoPredictionInOutput


We can see that the response predicts clases that are not included in the class list. Thus GEITje creates new classes. One advantage: easy to add classes.

Next, we take a closer look at the other mistakes in the predictions

In [14]:
from collections import Counter

mistakes = pred.loc[pred['prediction']!=pred['label']]
mistakes = pred.loc[pred['prediction']!='NoPredictionInOutput']
print(Counter(mistakes['prediction']))
print(Counter(mistakes['label']))

Counter({'motie': 67, 'schriftelijke vraag': 30, 'agenda': 26, 'voordracht': 24, 'raadsadres': 17, 'onderzoeksrapport': 13, 'brief': 11, 'actualiteit': 6, 'factsheet': 4, 'besluit': 4, 'raadsnotulen': 3})
Counter({'motie': 65, 'schriftelijke vraag': 31, 'voordracht': 24, 'agenda': 23, 'raadsadres': 17, 'onderzoeksrapport': 14, 'brief': 12, 'actualiteit': 10, 'besluit': 4, 'raadsnotulen': 3, 'factsheet': 2})


In [33]:
yeet = pd.read_pickle(f'{cf.output_path}/predictionsVal/in_context/GEITje/simple_prompt/overview.pkl')
display(yeet)

Unnamed: 0,model,run_id,date,train_set,test_set,train_set_support,test_set_support,split_col,text_col,runtime,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,classification_report
0,GEITje-7B-chat-v2,IC_GEITje-7B-chat-v2simple_promptLlamaTokens20...,2024-04-29 19:00:27.021162+02:00,dev,val,832,209,4split,TruncationLlamaTokensFront200Back0,5285.075402,0.674641,0.598501,0.595675,0.503742,precision recall f1-...


In [6]:
import pandas as pd
yeet = pd.read_pickle(f'{cf.output_path}/overview_models.pkl')
display(yeet)

AttributeError: Can't get attribute 'OutOfMemoryError' on <module 'torch.cuda' from '/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/cuda/__init__.py'>

In [5]:
pip install accelerate

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.29.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
yeet = pd.read_pickle(f'{cf.output_path}/overview_models.pkl')
display(yeet)

AttributeError: Can't get attribute 'OutOfMemoryError' on <module 'torch.cuda' from '/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/cuda/__init__.py'>

Unnamed: 0,model,base_model,chat_dataset,train_set,test_set,training_args,resume_from_checkpoint,date,runtime,Error,run_id,save_to_hub,output_dir
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-04-25 13:21:46.201696+02:00,0.876791,KeyboardInterrupt,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-25 13:24:59.884223+02:00,0.274652,Error(s) in loading state_dict for MistralForC...,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-25 13:25:57.527833+02:00,0.362344,No valid checkpoint found in output directory ...,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-04-25 13:26:24.369492+02:00,0.319212,CUDA out of memory. Tried to allocate 112.00 M...,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-26 08:47:21.945291+02:00,0.392476,KeyboardInterrupt,0,,
0,FemkeBakker/TryOutFinetuningGeitje,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-26 08:47:57.151755+02:00,0.600505,string longer than 2147483647 bytes,0,,
0,FemkeBakker/TryoutGeitje2,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-04-26 08:56:52.283214+02:00,0.449522,False,1,,
0,FemkeBakker/TryoutGeitje2,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-26 09:26:12.329799+02:00,3.251497,[Errno 2] No such file or directory: '/home/az...,1,,
0,FemkeBakker/TryoutGeitje2,Rijgersberg/GEITje-7B,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",True,2024-04-26 09:29:06.636546+02:00,0.543515,False,1,,
0,FemkeBakker/tryoutstablelm,stabilityai/stablelm-2-1_6b,FemkeBakker/AmsterdamGEITjeFormat200Tokens,dev,val,"TrainingArguments(\n_n_gpu=1,\naccelerator_con...",False,2024-04-29 09:46:03.908296+02:00,0.364137,KeyboardInterrupt,2,,
