In [None]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [None]:
import os
import sys

# Make the repository root importable so the config modules can be found.
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
# import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here instead of with a NameError on `cf` in a later cell.
    raise ValueError(f'Unknown my_run {my_run!r}; expected "azure" or "local".')


if my_run == "azure":
    # Create the Hugging Face cache folder (including missing parents) and
    # point transformers at it. exist_ok avoids failing when the folder
    # already exists or is created concurrently.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

# set-up environment - GEITje-7b-chat InContextLearning:
# - install blobfuse -> sudo apt-get install blobfuse
# - pip install transformers
# - pip install torch
# - pip install accelerate
# - pip install jupyter
# - pip install ipywidgets

## Notebook overview
- Goal: Run experiment for InContext Learning GEITje
- Trial run model -> prompt GEITje using an example prompt
- Zeroshot prompts
- Fewshot prompts

Load data and functions:
- data is already split
- text is already converted to tokens using model tokenizer 

In [None]:
import pandas as pd
# df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import prediction_helperfunctions as ph
import truncation as tf


In [None]:
import torch
# Release cached GPU memory that PyTorch holds but is not currently using,
# so the models loaded in the following cells have as much room as possible.
torch.cuda.empty_cache()

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login (prompts for an access token).
# NOTE(review): presumably required to download the meta-llama checkpoint used
# later in this notebook — confirm.
notebook_login()

#### Trial run Models 
Code to run the models with a simple prompt.

In [None]:
from transformers import pipeline, Conversation

# Trial run: build one conversational pipeline per model.
# NOTE(review): this loads three 7B checkpoints in the same kernel; the
# model-specific sections further down load `chatbot_geitje` / `chatbot_llama`
# again with device_map='cpu', which rebinds these names.
# NOTE(review): the 'conversational' task and `Conversation` class are only
# available in older transformers releases — confirm the pinned version.
chatbot_geitje = pipeline(task='conversational', model='Rijgersberg/GEITje-7B-chat-v2',
                   device_map='auto', model_kwargs={'offload_buffers':True})


chatbot_llama = pipeline(task='conversational', model='meta-llama/Llama-2-7b-chat-hf',
                   device_map='auto', model_kwargs={'offload_buffers':True})

chatbot_mistral = pipeline(task='conversational', model='mistralai/Mistral-7B-Instruct-v0.2',
                   device_map='auto', model_kwargs={'offload_buffers':True})

## EXAMPLE PROMPT
# The snippet below is kept for reference only; `chatbot` is not defined in this
# cell, so substitute one of the pipelines above (e.g. chatbot_geitje) to run it.
# print(chatbot(
    # Conversation('Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?')
# ))

#### Experiment functions
Prompt GEITje for each document and save the prediction, return response, response time and the prompt version

Code structure:
- 2 functions/cells:
- predictions_incontextlearning -> given a df with docs that need to be predicted, prompt the model
- run the experiment -> built-in failsafes (the df is processed in parts, with saves in between)

In [None]:
import time
import os
import pandas as pd
from bm25 import BM25


""" Given a dataframe with txt, return a df with predictions """
def predictions_incontextlearning(chatbot, docs_df, text_column, prompt_function, train_df, num_examples):
    """Prompt the model once per document and return the predictions as a DataFrame.

    Parameters
    ----------
    chatbot
        Conversational pipeline; called as ``chatbot(Conversation(prompt))``.
    docs_df
        DataFrame with the documents that need to be predicted. The columns
        ``id``, ``path``, ``label``, ``trunc_col`` and ``text_column`` are read.
    text_column
        Name of the column that holds the input text. Can differ depending on
        the text representation method.
    prompt_function
        Prompt template from ``prompt_template.py``. Supported:
        ``zeroshot_prompt_geitje``, ``zeroshot_prompt_mistral_llama``,
        ``fewshot_prompt_no_template`` and ``fewshot_prompt_with_template``.
    train_df
        DataFrame with documents that can be used as examples/context data
        (only used by the few-shot prompts).
    num_examples
        Number of examples to put in a few-shot prompt.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``id``, ``path``, ``text_column``,
        ``prompt_function``, ``response``, ``prediction``, ``label``,
        ``runtime``, ``date`` and ``prompt``.

    Raises
    ------
    ValueError
        If ``prompt_function`` is not one of the supported templates. This is
        checked up front, before any model call is made.
    """
    result_columns = ['id', 'path', 'text_column', 'prompt_function', 'response', 'prediction', 'label', 'runtime', 'date', 'prompt']

    # Each prompt family takes different arguments.
    zeroshot_prompts = (pt.zeroshot_prompt_geitje, pt.zeroshot_prompt_mistral_llama)
    fewshot_prompts = (pt.fewshot_prompt_no_template, pt.fewshot_prompt_with_template)

    is_fewshot = prompt_function in fewshot_prompts
    if not is_fewshot and prompt_function not in zeroshot_prompts:
        raise ValueError("Prompt function not recognised. Check if prompt function is in prompt_template.py and included in the options above.")

    # Few-shot examples are selected with BM25, fitted once on the training texts.
    if is_fewshot:
        BM25_model = BM25()
        BM25_model.fit(train_df[text_column])

    # Values that are identical for every row are looked up once.
    prompt_name = ph.get_promptfunction_name(prompt_function)
    truncation_name = docs_df.iloc[0]['trunc_col'] if len(docs_df) else None

    # Collect plain records and build the DataFrame once at the end; growing a
    # DataFrame row by row copies the data on every insert.
    records = []
    for _, row in docs_df.iterrows():
        start_time = time.time()

        # get the prompt, with the doc filled in
        txt = row[text_column]
        if is_fewshot:
            prompt = prompt_function(txt, train_df, num_examples, text_column, BM25_model)
        else:
            prompt = prompt_function(txt)

        # prompt the model; index 1 of the conversation holds the model's reply
        converse = chatbot(Conversation(prompt))
        response = converse[1]['content']
        label = row['label'].lower()
        print("label: ", label)
        print("response: ", response)

        # extract prediction from response
        prediction = ph.get_prediction_from_response(response)
        print("prediction:", prediction)

        records.append({
            'id': row['id'],
            'path': row['path'],
            'text_column': truncation_name,
            'prompt_function': prompt_name,
            'response': response,
            'prediction': prediction,
            'label': label,
            'runtime': time.time() - start_time,
            'date': ph.get_datetime(),
            'prompt': prompt,
        })

    return pd.DataFrame(records, columns=result_columns)



In [None]:
import os
import time
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

"""
Function to run GEITje In-Context Learning experiment. 
The function allows to resume experiment, if run_id matches.
"""
def run_experiment(chatbot, df, run_id, prompt_function, text_col, split_col, subset_train, subset_test, label_col, prediction_path, overview_path, model_name, num_examples=0, batch_size=10):
    """Run (or resume) an experiment in batches, saving after every batch.

    Parameters
    ----------
    chatbot
        Conversational pipeline passed on to ``predictions_incontextlearning``.
    df
        DataFrame with all docs that need a prediction (both docs still to be
        predicted and docs already predicted). Must contain ``split_col``,
        ``text_col`` and a ``trunc_col`` column.
    run_id
        Unique id per experiment; pass an existing id to resume that run.
    prompt_function
        Which prompt from ``prompt_template.py`` to use.
    text_col
        Column in ``df`` that holds the (already truncated) text.
    split_col
        Column with the dataset split.
    subset_train
        Value of ``split_col`` used as training/example data ('train' or 'dev').
    subset_test
        Value of ``split_col`` used for testing ('test' or 'val').
    label_col
        Column with the true label. NOTE(review): currently unused; the true
        label is read from the 'label' column of the predictions.
    prediction_path
        Path to the file where predictions are saved.
    overview_path
        Path to the file where the results of each run are saved.
    model_name
        Name of the model (string), stored in the overview.
    num_examples
        Number of examples given to the prompt; zero in case of zero-shot.
    batch_size
        Number of documents predicted between two saves (default 10).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    test_df = df.loc[df[split_col]==subset_test]
    train_df = df.loc[df[split_col]==subset_train]

    # get rows of df that still need to be predicted for the specific run_id
    to_predict, previous_predictions = ph.get_rows_to_predict(test_df, prediction_path, run_id)
    total = len(to_predict)

    # Work through to_predict in batches so an interrupted run loses at most
    # one batch and can be resumed with the same run_id.
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        sub_to_predict = to_predict.iloc[start:stop]
        if stop < total:
            print(f'Starting...{start}:{stop} out of {total}')
        else:
            print(f'Starting...last {len(sub_to_predict)} docs')

        # prompt the model
        predictions = predictions_incontextlearning(chatbot, sub_to_predict, text_col, prompt_function, train_df, num_examples)

        # save info
        predictions['run_id'] = run_id
        predictions['train_set'] = subset_train
        predictions['test_set'] = subset_test
        predictions['shots'] = num_examples

        # save new predictions in file
        print("Dont interrupt, saving predictions...")
        ph.combine_and_save_df(predictions, prediction_path)

        # Combine with everything predicted so far (earlier batches and, when
        # resuming, earlier runs) so the metrics cover the whole run.
        if isinstance(previous_predictions, pd.DataFrame) and not previous_predictions.empty:
            predictions = pd.concat([predictions, previous_predictions])
        previous_predictions = predictions

        # save results in overview file
        date = ph.get_datetime()
        y_test = predictions['label']
        y_pred = predictions['prediction']

        # change error predictions to one error
        # error_names = ['NoPredictionInOutput', 'MultiplePredictionErrorInFormatting','NoPredictionFormat', 'MultiplePredictionErrorInOutput']
        # y_pred = ['OutputError' if x in error_names else x for x in y_pred]

        # zero_division=0 yields the same scores as the default ("warn" also
        # scores 0) but without a warning for every class that has no
        # predicted or no true samples in the partial results.
        report = classification_report(y_test, y_pred, zero_division=0)
        accuracy = accuracy_score(y_test, y_pred)

        overview = pd.DataFrame(
            [{
                'model': model_name,
                'run_id': run_id,
                'date': date,
                'train_set': subset_train,
                'test_set': subset_test,
                'train_set_support': len(train_df),
                'test_set_support': len(predictions),
                'split_col': split_col,
                'text_col': df.iloc[0]['trunc_col'],
                'runtime': sum(predictions['runtime']),
                'accuracy': accuracy,
                'macro_avg_precision': precision_score(y_test, y_pred, average='macro', zero_division=0),
                'macro_avg_recall': recall_score(y_test, y_pred, average='macro', zero_division=0),
                'macro_avg_f1': f1_score(y_test, y_pred, average='macro', zero_division=0),
                'weighted_avg_precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
                'weighted_avg_recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
                'weighted_avg_f1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
                'classification_report': report
            }]
        )
        # remove previous results of run_id, replace with new/updated results
        ph.replace_and_save_df(overview, overview_path, run_id)
        print("Saving done! Interrupting is allowed.")
        print("Accuracy: ", accuracy)




Set up variables that are the same for each model

In [None]:
#set  variables, same for each model
TRAIN_SET = 'train' # must be dev or train
TEST_SET = 'test' # must be val or test
SPLIT_COLUMN = 'balanced_split' #must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val. 
LABEL_COLUMN = 'label'
TEXT_COLUMN = 'trunc_txt'


In [8]:
# Load the pickled dataset from the configured output folder; it is the input
# to tf.add_truncation_column in the "Run experiment" cells.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
txt = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

### GEITje

In [None]:
# Experiment settings for GEITje
SHORT_MODEL_NAME = 'GEITje'
PROMPT = pt.fewshot_prompt_no_template
PROMPT_NAME = ph.get_promptfunction_name(PROMPT)
TOKENS_COL = 'LlamaTokens' # column with text split using tokenizer of either mistral (MistralTokens) or Llama (LlamaTokens). Using Llama, because Llama split into more tokens.
FRONT_THRESHOLD = 200
BACK_THRESHOLD = 0

# Number of examples follows from the prompt type. Every case is handled
# explicitly so a NUMBER_EXAMPLES left over from another cell cannot end up in
# the run_id.
if PROMPT == pt.zeroshot_prompt_geitje:
    NUMBER_EXAMPLES = 0
elif PROMPT == pt.fewshot_prompt_no_template or PROMPT == pt.fewshot_prompt_with_template:
    NUMBER_EXAMPLES = 2
else:
    raise ValueError(f"No NUMBER_EXAMPLES defined for prompt {PROMPT_NAME!r} with model {SHORT_MODEL_NAME}.")



#### Load model - In-context learning
Note - ONLY load one model: either in-context or fine-tuning

In [None]:
from transformers import pipeline, Conversation

# Base GEITje chat model for in-context learning, loaded on CPU.
# Run either this cell or the fine-tuning cell: both bind `chatbot_geitje`,
# MODEL_NAME, SUBFOLDER and SHORT_ID, and the last one executed wins.
chatbot_geitje = pipeline(task='conversational', model='Rijgersberg/GEITje-7B-chat-v2',
                    device_map='cpu', model_kwargs={'offload_buffers':True})

MODEL_NAME = 'GEITje-7B-chat-v2'  # used in run_id and stored in the overview
SUBFOLDER = 'in_context'          # selects the output folder in the path set-up cell
SHORT_ID = 'IC'                   # prefix of run_id



#### Load model - Finetuning

In [None]:
from transformers import pipeline, Conversation

# Fine-tuned GEITje model, loaded on CPU. Rebinds `chatbot_geitje`, so run
# either this cell or the in-context cell, not both.
chatbot_geitje = pipeline(task='conversational', model='FemkeBakker/AmsterdamDocClassificationGEITje200T',
                   device_map='cpu', model_kwargs={'offload_buffers':True})

MODEL_NAME = 'AmsterdamDocClassificationGEITje200T'  # used in run_id and stored in the overview
SUBFOLDER = 'finetuning'  # selects the output folder in the path set-up cell
SHORT_ID = 'FT'           # prefix of run_id
EPOCHS = 2                # only used to build the fine-tuning output paths

#### Set-up paths to save predictions

In [None]:
import os

# Build the output paths from the split column and the kind of model.
# Unknown values raise, so paths left over from an earlier cell are never
# reused by accident.
if SPLIT_COLUMN == '4split' or SPLIT_COLUMN == '2split':
    OVERVIEW_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/overview.pkl"
    PREDICTION_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/predictions.pkl"

elif SPLIT_COLUMN == 'balanced_split':
    if SUBFOLDER == 'finetuning':
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/{SHORT_MODEL_NAME}First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    elif SUBFOLDER == 'in_context':
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    else:
        raise ValueError(f"Unknown SUBFOLDER {SUBFOLDER!r}; expected 'finetuning' or 'in_context'.")

else:
    raise ValueError(f"Unknown SPLIT_COLUMN {SPLIT_COLUMN!r}; expected '2split', '4split' or 'balanced_split'.")

print(OVERVIEW_PATH)
print(PREDICTION_PATH)

# The target folders must already exist; refuse to continue otherwise.
if not os.path.isdir(os.path.dirname(os.path.abspath(OVERVIEW_PATH))):
    raise ValueError("Folder to OVERVIEW_PATH does not exist")
if not os.path.isdir(os.path.dirname(os.path.abspath(PREDICTION_PATH))):
    raise ValueError("Folder to PREDICTION_PATH does not exist")

# run_id encodes every setting of the experiment; reusing it resumes that run.
run_id = f'{SHORT_ID}_{MODEL_NAME}{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}'
print ('\n', run_id)


#### Run experiment

In [None]:
# ----- EXPERIMENT --------
# add new column with truncated text -> new dataframe with column + new column name
trunc_df = tf.add_truncation_column(txt,'text', TOKENS_COL, FRONT_THRESHOLD, BACK_THRESHOLD)


# if new run MAKE SURE RUN_ID IS UNIQUE, if want to resume run, pass in that run_id
run_experiment(chatbot_geitje, trunc_df, run_id, PROMPT, TEXT_COLUMN, SPLIT_COLUMN, TRAIN_SET, TEST_SET, LABEL_COLUMN, PREDICTION_PATH, OVERVIEW_PATH, MODEL_NAME, NUMBER_EXAMPLES)


In [None]:
# Load and show the overview file written by run_experiment.
pred = pd.read_pickle(OVERVIEW_PATH)
# pred_run = pred.loc[pred['run_id']==f'{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}']
display(pred)

### Llama


In [9]:
# Experiment settings for Llama
SHORT_MODEL_NAME = 'Llama'
PROMPT = pt.fewshot_prompt_with_template
PROMPT_NAME = ph.get_promptfunction_name(PROMPT)
TOKENS_COL = 'LlamaTokens' # column with text split using tokenizer of either mistral (MistralTokens) or Llama (LlamaTokens). Using Llama, because Llama split into more tokens.
FRONT_THRESHOLD = 200
BACK_THRESHOLD = 0

# Number of examples follows from the prompt type. Every case is handled
# explicitly so a NUMBER_EXAMPLES left over from another cell cannot end up in
# the run_id.
if PROMPT == pt.zeroshot_prompt_mistral_llama:
    NUMBER_EXAMPLES = 0
elif PROMPT == pt.fewshot_prompt_with_template or PROMPT == pt.fewshot_prompt_no_template:
    NUMBER_EXAMPLES = 2
else:
    raise ValueError(f"No NUMBER_EXAMPLES defined for prompt {PROMPT_NAME!r} with model {SHORT_MODEL_NAME}.")



#### Load model - In-context learning
Note - ONLY load one model: either in-context or fine-tuning

In [10]:
from transformers import pipeline, Conversation

# Base Llama-2 chat model for in-context learning.
# Run either this cell or the fine-tuning cell: both bind `chatbot_llama`,
# MODEL_NAME, SUBFOLDER and SHORT_ID, and the last one executed wins.
chatbot_llama = pipeline(task='conversational', model='meta-llama/Llama-2-7b-chat-hf',
                   device_map='cpu', model_kwargs={'offload_buffers':True})
# Llama is loaded on CPU; on GPU the few-shot BM25 prompt gave a CUDA out-of-memory error.

MODEL_NAME = 'Llama-2-7b-chat-hf'  # used in run_id and stored in the overview
SUBFOLDER = 'in_context'           # selects the output folder in the path set-up cell
SHORT_ID = 'IC'                    # prefix of run_id



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Load model - finetuning

In [None]:
from transformers import pipeline, Conversation

# Fine-tuned Llama model, loaded on CPU. Rebinds `chatbot_llama`, so run
# either this cell or the in-context cell, not both.
chatbot_llama = pipeline(task='conversational', model='FemkeBakker/AmsterdamDocClassificationLlama200T2Epochs',
                   device_map='cpu', model_kwargs={'offload_buffers':True})

MODEL_NAME = 'AmsterdamDocClassificationLlama200T2Epochs'  # used in run_id and stored in the overview
SUBFOLDER = 'finetuning'  # selects the output folder in the path set-up cell
SHORT_ID = 'FT'           # prefix of run_id
EPOCHS = 2                # only used to build the fine-tuning output paths

#### Set-up paths to save predictions

In [11]:
import os

# Build the output paths from the split column and the kind of model.
# Unknown values raise, so paths left over from an earlier cell are never
# reused by accident.
if SPLIT_COLUMN == '4split' or SPLIT_COLUMN == '2split':
    OVERVIEW_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/overview.pkl"
    PREDICTION_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/predictions.pkl"

elif SPLIT_COLUMN == 'balanced_split':
    if SUBFOLDER == 'finetuning':
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/{SHORT_MODEL_NAME}First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    elif SUBFOLDER == 'in_context':
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    else:
        raise ValueError(f"Unknown SUBFOLDER {SUBFOLDER!r}; expected 'finetuning' or 'in_context'.")

else:
    raise ValueError(f"Unknown SPLIT_COLUMN {SPLIT_COLUMN!r}; expected '2split', '4split' or 'balanced_split'.")

print(OVERVIEW_PATH)
print(PREDICTION_PATH)

# The target folders must already exist; refuse to continue otherwise.
if not os.path.isdir(os.path.dirname(os.path.abspath(OVERVIEW_PATH))):
    raise ValueError("Folder to OVERVIEW_PATH does not exist")
if not os.path.isdir(os.path.dirname(os.path.abspath(PREDICTION_PATH))):
    raise ValueError("Folder to PREDICTION_PATH does not exist")

# run_id encodes every setting of the experiment; reusing it resumes that run.
run_id = f'{SHORT_ID}_{MODEL_NAME}{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}'
print ('\n', run_id)


/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/in_context/Llama/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/in_context/Llama/fewshot_prompt_with_template/First200Last0Predictions.pkl

 IC_Llama-2-7b-chat-hffewshot_prompt_with_templateLlamaTokens200_0traintest_numEx2


#### Run experiment

In [13]:
# add new column with truncated text -> new dataframe with column + new column name
trunc_df = tf.add_truncation_column(txt,'text', TOKENS_COL, FRONT_THRESHOLD, BACK_THRESHOLD)

# if new run MAKE SURE RUN_ID IS UNIQUE, if want to resume run, pass in that run_id
run_experiment(chatbot_llama, trunc_df, run_id, PROMPT, TEXT_COLUMN, SPLIT_COLUMN, TRAIN_SET, TEST_SET, LABEL_COLUMN, PREDICTION_PATH, OVERVIEW_PATH, MODEL_NAME, NUMBER_EXAMPLES)


Run-id already known, resuming predictions...
Starting...0:10 out of 1070
label:  actualiteit
response:   Sure! Based on the provided examples and the given document, I would categorize it as:

{'categorie': 'Actualiteit'}

The document appears to be an actuality item for the council committee on Spatial Planning, Ground Affairs (and Erfpacht), and it is related to the topic of the overstapregeling (transition regulation) on perpetual ground lease. The document provides information on the possibility for ground leaseholders to switch to perpetual ground lease as of October 1, 2017, and the deadline for submitting a definitive offer is November 1, 2017. Therefore, it can be classified as an actuality item for the council committee.
prediction: actualiteit
label:  voordracht
response:   Sure, I can help you categorize the document! Based on the information provided, I would classify the document as a "Voordracht" (Dutch for "Speech").

Here is the output in JSON format:

{
"categorie": "

Bad pipe message: %s [b'\xf6\xe7\x84/\xb7\x07\xa5\xe3*x\x10\x9b(\xf2\xcc\x84t\xe6 A@e\xa0\xe9\x82T\xd3\x1b\x8fn\xd7\xe8\xae\xca-\x90\x83\xaf\xd6N\x17\xf9\xbc\x91\x19\xf9t\x1c\xa2\xdb\xb4\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04', b'\x03\x06', b'\x07\x08']
Bad pipe message: %s [b'\t\x08\n\x08\x0b\x08\x04']
Bad pipe message: %s [b'\x08\x06\x04\x01\x05\x01\x06', b'']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 +\x03\xc9\xa0\xdaUU\xab4\x92NP"_0\x97\x19\xfc\x14\xae\xa0\xf4']
Bad pipe message: %s [b'\x89\x1c\xd29)vD\x17u\xf2`\xd1\x81W\xfc\x11\x89v\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0', b"\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R

label:  brief
response:   Sure, I can help you categorize the document! Based on the information provided, I would categorize the document as:

{'categorie': 'Brief'}

The document appears to be a brief message from the municipality of Amsterdam to its council members, informing them about the progress of the buurtrechter project in the Venserpolder area. The document is written in a formal tone and contains information about the project's status and next steps. It does not contain any decisions or recommendations, but rather provides updates and information to the council members. Therefore, it fits into the category of a brief.
prediction: brief
Dont interrupt, saving predictions...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.8
Starting...10:20 out of 1070
label:  besluit
response:   Sure! Based on the provided examples and the document that needs to be categorized, I can classify it as a 'Besluit'.

Here is the output in JSON format:

{
"categorie": "Besluit"
}

Explanation:

The document Nummer _BD2016-001839 is a voordracht (a proposal or recommendation) for a decision by the Algemeen Bestuur (General Board) of the bestuurscommissie (administrative commission) of the municipality of Amsterdam. It contains information about the agenda item, the date of the decision, and the topic of the decision. Based on these details, it can be classified as a 'Besluit' (decision) document.
prediction: besluit
label:  raadsadres
response:   Sure, I can help you with that! Based on the provided examples and the document you want to categorize, I would classify it under the category of 'Raadsadres'.

Here's the output in JSON format:

{
"categorie": "Raadsadres"
}

Explan

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.82
Starting...20:30 out of 1070
label:  voordracht
response:   Sure, I can help you categorize the given document. Based on the content and structure of the document, I would classify it under the category of "Voordracht" (Dutch for "Vote").

Here is the output in JSON format:

{
"categorie": "Voordracht"
}

The reason for this classification is that the document contains a formal proposal or recommendation for the attention of the committee, as well as information on the agenda points and decisions made by the committee. These elements are typical of a voordracht document.
prediction: voordracht
label:  onderzoeksrapport
response:   Sure! Based on the provided examples and the document that needs to be categorized, I would categorize the document as:

 {'categorie': 'Onderzoeksrapport'}

Here's why:

The document appears to be a research report, as it contains a detailed analysis and forecast of visitor numbers for Amsterdam's city ce

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.8
Starting...30:40 out of 1070
label:  onderzoeksrapport
response:   Sure, I can help you categorize the given document. Based on the provided examples and the list of categories, I would classify the document as:

{'categorie': 'Onderzoeksrapport'}

The document appears to be an research report, as it contains a detailed analysis of a specific topic, including data collection, analysis, and conclusions. The structure of the document also resembles that of a research report, with sections for introduction, methodology, results, and conclusion. The use of technical terms and the level of detail in the analysis suggest that the document is intended for a specialized audience, such as researchers or professionals in a particular field.
prediction: onderzoeksrapport
label:  schriftelijke vraag
response:   Sure, I can help you classify the given document. Based on the information provided, I would categorize the document as:

{'categorie': 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.8142857142857143
Starting...40:50 out of 1070
label:  actualiteit
response:   Sure, I can help you categorize the document! Based on the information provided, I would classify the document as:

{'categorie': 'Actualiteit'}

This document appears to be an actuality report for the Raadslid van Lammeren (Partij voor de Dieren) in Amsterdam, discussing various topics related to the city's politics and government. The document includes information about a new miljoenenstrop and a non-compliant permit of the AEB, as well as the lack of transparency in the diversity advisor for the police in Amsterdam. Therefore, it falls under the category of 'Actualiteit' (Actuality) in the provided list of categories.
prediction: actualiteit
label:  voordracht
response:   Sure, I can help you classify the document! Based on the examples provided and the categories you provided, I would categorize the document VN2021-015695 as follows:

{'categorie': 'Voord

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.8125
Starting...50:60 out of 1070
label:  raadsnotulen
response:   Sure, I can help you categorize the given document. Based on the content and structure of the document, I would categorize it as:

{'categorie': 'Raadsnotulen'}

The document contains a meeting agenda and minutes of a municipal council meeting in Amsterdam, including the names of attendees and the topics discussed. This information is consistent with the format and content of official municipal council meeting minutes, which are typically classified as 'Raadsnotulen' in Dutch municipal governance.
prediction: raadsnotulen
label:  besluit
response:   Sure, I can help you classify the document! Based on the information provided, the document can be categorized as a 'Besluit' (Dutch for 'Decision').

Here is the output in JSON format:

{
"categorie": "Besluit"
}

The reason for this categorization is that the document appears to be a formal decision made by a governing bod

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.8111111111111111
Starting...60:70 out of 1070
label:  brief
response:   Sure, I can help you classify the given document. Based on the content and structure of the document, I would categorize it as a "Brief".

Here is the output in JSON format:

{
"categorie": "Brief"
}

The reason for this categorization is that the document appears to be a formal communication from the city council to its members, providing them with information on a specific topic related to sustainability and circular economy. The document is written in a formal tone and includes details on the agenda items discussed during a meeting, as well as updates on the progress of a pilot project. These elements suggest that the document is a brief, rather than a more formal document such as a resolution or a report.
prediction: brief
label:  agenda
response:   Sure, I can help you with that! Based on the information provided, I would categorize the document as:

{'categor

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.8
Starting...70:80 out of 1070
label:  raadsnotulen
response:   Sure, I can help you categorize the given document. Based on the information provided, the document can be categorized under the following category:

{'categorie': 'Raadsnotulen'}

This is because the document contains information related to a municipal council meeting, including the agenda, minutes, and decisions made during the meeting. The document is likely to be a copy of the minutes or a summary of the meeting, which is a common type of document found in the category of 'Raadsnotulen' (Municipal Council Agendas and Minutes).
prediction: raadsnotulen
label:  onderzoeksrapport
response:   Sure, I can help you categorize the given document based on the provided categories. After analyzing the content of the document, I would categorize it as an "Onderzoeksrapport".

Here's the output in JSON format:

{
"categorie": "Onderzoeksrapport"
}

The reason for this categorizati

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7909090909090909
Starting...80:90 out of 1070
label:  raadsnotulen
response:   Sure, I can help you categorize the given document. Based on the content provided, I would classify the document as a "Raadsnotulen".

Here is the output in JSON format:

{
"categorie": "Raadsnotulen"
}

The reason for this classification is that the document contains information about a municipal council meeting, including the date, time, and agenda items, which are all typical elements of a Raadsnotulen. Additionally, the document mentions the presence of certain council members, which is also common in Raadsnotulen.
prediction: raadsnotulen
label:  raadsadres
response:   Sure, I can help you classify the given document. Based on the provided examples and the list of categories, I would categorize the given document as:

{'categorie': 'Raadsadres'}

Here's why:

The document contains the following elements that are typical of a Raadsadres:

* The address o

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7916666666666666
Starting...90:100 out of 1070
label:  besluit
response:   Sure! Based on the provided examples and the document you want to categorize, I would classify it as a 'Voordracht' (Voiceover in English).

Here's the output in JSON format:

{
"categorie": "Voordracht"
}

The reason for this classification is that the document appears to be a preparatory note or a draft of a speech, as it mentions a "voordracht" and a "besluit" (decision). The document also includes information about a specific topic or issue, which is typical of a voordracht.

I hope this helps! Let me know if you have any further questions.
prediction: voordracht
label:  brief
response:   Sure, I can help you categorize the document! Based on the information provided, I would classify the document as a "Brief".

Here is the output in the form of a JSON file:

{
"categorie": "Brief"
}

Here's why I classified it as a Brief:

* The document is addressed to the

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7769230769230769
Starting...100:110 out of 1070
label:  schriftelijke vraag
response:   Sure, I can help you categorize the given document. Based on the content and structure, I would classify it as a 'Schriftelijke Vraag' (Written Question).

Here is the output in JSON format:

{
"categorie": "Schriftelijke Vraag"
}

The document is a written question addressed to the Gemeente Amsterdam (City of Amsterdam) council, with a specific topic and date. It contains a detailed description of the question and the context in which it was asked, as well as the response from the council. Based on this information, it falls under the category of 'Schriftelijke Vraag' (Written Question).
prediction: schriftelijke vraag
label:  voordracht
response:   Sure, I can help you with that! Based on the provided examples and the content of the document you want to categorize, I would suggest the following category for the document:

{'categorie': 'Voordracht

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7785714285714286
Starting...110:120 out of 1070
label:  brief
response:   Sure! Based on the provided examples and the text of the document that needs to be categorized, I can determine the category of the document as follows:

{ 'categorie': 'Brief' }

The document appears to be a brief written by a city official to the members of the city council, providing information on a specific topic and requesting action or approval. The format and content of the document are consistent with this category.
prediction: brief
label:  factsheet
response:   Sure, I can help you classify the document! Based on the provided examples and the list of categories, I will try to categorize the given document.

The document is:

GROEN: MEER DAN MOOI EN GEZOND
De meerwaarde van groen in de stedelijke omgeving

After analyzing the document, I believe it belongs to the category of 'Onderzoeksrapport'. The document presents a research report on the benefits of

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.78
Starting...120:130 out of 1070
label:  schriftelijke vraag
response:   Sure, I can help you with that! Based on the examples you provided and the information given in the prompt, I would categorize the third document as follows:

{'categorie': 'Schriftelijke Vraag'}

The document appears to be a response to a written question from a council member regarding the promotion of intravenous drug use, specifically crystal meth, by a foundation that receives subsidy from the municipality. The document provides information and explanations to the council member's question, which suggests that it is a written answer to a question of a formal nature, hence the category of 'Schriftelijke Vraag'.
prediction: schriftelijke vraag
label:  onderzoeksrapport
response:   Sure, I can help you categorize the given document. After analyzing the content of the document, I suggest the following category:

{'categorie': 'Onderzoeksrapport'}

The document a

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.76875
Starting...130:140 out of 1070
label:  voordracht
response:   Sure, I can help you with that! Based on the provided documents and the list of categories, I would categorize the third document, VN2021-006398, as a 'Voordracht'.

Here's the output in JSON format:

{
"categorie": "Voordracht"
}

Explanation:

The document VN2021-006398 is a presentation or a speech given to the Raadsccommissie voor Bouwen en Wonen on March 24, 2021. The document contains information about the agenda points, including the topic of the presentation, which is related to the category of Voordracht. Therefore, I have categorized this document as a Voordracht.
prediction: voordracht
label:  voordracht
response:   Sure, I can help you classify the document! Based on the examples provided and the list of categories you provided, I would categorize the document VN2021-020059 as follows:

{'categorie': 'Voordracht'}

The document appears to be a speech or pre

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7764705882352941
Starting...140:150 out of 1070
label:  factsheet
response:   Sure, I can help you classify the document! Based on the information provided, I would categorize the document as an 'Onderzoeksrapport'.

Here's the output in JSON format:

{
"categorie": "Onderzoeksrapport"
}

The document appears to be a research report, as it contains data and analysis related to the performance of a social welfare program. The document provides an overview of the program's progress, including the number of participants, the amount of assistance provided, and the program's budget. The document also includes a dashboard with visualizations of the data, which is a common feature of research reports. Overall, the document has a formal and informative tone, which is typical of research reports.
prediction: onderzoeksrapport
label:  agenda
response:   Sure, I can help you categorize the document. Based on the examples provided, the document ca

KeyboardInterrupt: 

In [12]:
# Load the pickled overview stored at OVERVIEW_PATH and render it as a rich table.
# NOTE(review): read_pickle executes arbitrary code from the file; only load overview files this project wrote itself.
pred = pd.read_pickle(OVERVIEW_PATH)
display(pred)

Unnamed: 0,model,run_id,date,train_set,test_set,train_set_support,test_set_support,split_col,text_col,runtime,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,classification_report,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
0,Llama-2-7b-chat-hf,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,2024-05-22 21:29:48.627017+02:00,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back0,140223.734147,0.422727,0.491578,0.31,0.287423,precision...,0.670333,0.422727,0.39194
0,Llama-2-7b-chat-hf,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,2024-05-27 17:53:30.065278+02:00,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,164152.727557,0.425455,0.543238,0.334286,0.321521,precision ...,0.691394,0.425455,0.409208
0,Llama-2-7b-chat-hf,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,2024-05-26 21:20:27.966620+02:00,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront200Back0,132040.014386,0.474545,0.537874,0.372857,0.34995,precision ...,0.684567,0.474545,0.44539
0,Llama-2-7b-chat-hf,IC_Llama-2-7b-chat-hffewshot_prompt_with_templ...,2024-06-03 14:08:21.776925+02:00,train,test,9900,30,balanced_split,TruncationLlamaTokensFront200Back0,7092.938951,0.8,0.634615,0.641026,0.615385,precision...,0.841667,0.8,0.806667


### Mistral

In [None]:
# Experiment configuration for the Mistral runs.
SHORT_MODEL_NAME = 'Mistral'  # path component used when building the output paths
PROMPT = pt.zeroshot_prompt_mistral_llama
PROMPT_NAME = ph.get_promptfunction_name(PROMPT)

# Column with the text split by the tokenizer of either Mistral (MistralTokens)
# or Llama (LlamaTokens). Llama is used because it splits the text into more tokens.
TOKENS_COL = 'LlamaTokens'

# Truncation thresholds handed to tf.add_truncation_column in the run cell.
FRONT_THRESHOLD = 200
BACK_THRESHOLD = 0

# Number of in-context examples that belongs to the selected prompt function.
if PROMPT == pt.zeroshot_prompt_mistral_llama:
    NUMBER_EXAMPLES = 0
elif PROMPT == pt.fewshot_prompt_bm25:
    NUMBER_EXAMPLES = 2
else:
    # Without this branch NUMBER_EXAMPLES would be undefined, or silently keep
    # the value left behind by an earlier section, and end up in the run_id.
    raise ValueError(
        f"No NUMBER_EXAMPLES configured for prompt '{PROMPT_NAME}'; "
        "add a branch for this prompt function."
    )



#### Load model - In-context learning
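Note: load ONLY one model. Run either this in-context cell or the fine-tuning cell below, not both. Both cells assign `chatbot_mistral`, `MODEL_NAME`, `SUBFOLDER` and `SHORT_ID`, so whichever cell runs last determines the experiment.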

In [None]:
from transformers import pipeline, Conversation

# In-context learning: conversational pipeline around the instruction-tuned
# Mistral checkpoint, with every module mapped to the CPU.
chatbot_mistral = pipeline(
    task='conversational',
    model='mistralai/Mistral-7B-Instruct-v0.2',
    device_map='cpu',
    model_kwargs=dict(offload_buffers=True),
)

# Identifiers that are combined into the output paths and the run_id further down.
MODEL_NAME = 'Mistral-7B-Instruct-v0.2'
SUBFOLDER = 'in_context'
SHORT_ID = 'IC'


#### Load model - finetuning

In [None]:
from transformers import pipeline, Conversation

# Fine-tuning: conversational pipeline around the fine-tuned Amsterdam
# document-classification checkpoint, with every module mapped to the CPU.
chatbot_mistral = pipeline(
    task='conversational',
    model='FemkeBakker/AmsterdamDocClassificationMistral200T1Epochs',
    device_map='cpu',
    model_kwargs=dict(offload_buffers=True),
)

# Identifiers that are combined into the output paths and the run_id further down.
MODEL_NAME = 'AmsterdamDocClassificationMistral200T1Epochs'
SUBFOLDER = 'finetuning'
SHORT_ID = 'FT'
EPOCHS = 1  # only read by the fine-tuning branch of the path set-up

#### Set up paths to save predictions

In [None]:
import os

# Derive where the run overview and the per-document predictions are stored.
# Every combination that is not handled raises, so a path left over from a
# previous model section can never be reused by accident.
if SPLIT_COLUMN in ('4split', '2split'):
    # Validation splits: results go under predictionsVal, grouped per prompt.
    OVERVIEW_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/overview.pkl"
    PREDICTION_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/predictions.pkl"

elif SPLIT_COLUMN == 'balanced_split':
    # Final split: results go under predictionsFinal.
    if SUBFOLDER == 'finetuning':
        # EPOCHS is only defined by the fine-tuning model cell.
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/{SHORT_MODEL_NAME}First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    elif SUBFOLDER == 'in_context':
        # NOTE(review): the overview is written to overview2.pkl here, not overview.pkl - confirm this is intended.
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/overview2.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    else:
        raise ValueError(f"Unknown SUBFOLDER {SUBFOLDER!r}: expected 'finetuning' or 'in_context'")

else:
    raise ValueError(f"Unknown SPLIT_COLUMN {SPLIT_COLUMN!r}: expected '4split', '2split' or 'balanced_split'")

print(OVERVIEW_PATH)
print(PREDICTION_PATH)

# Fail before the long-running experiment starts if a target folder is missing.
if not os.path.isdir(os.path.dirname(os.path.abspath(OVERVIEW_PATH))):
    raise ValueError("Folder to OVERVIEW_PATH does not exist")
if not os.path.isdir(os.path.dirname(os.path.abspath(PREDICTION_PATH))):
    raise ValueError("Folder to PREDICTION_PATH does not exist")

# Identifier of this run, assembled from the configuration; the run cell notes that
# a new run needs a unique run_id and that an existing run_id is passed in to resume a run.
run_id = f'{SHORT_ID}_{MODEL_NAME}{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}'
print('\n', run_id)


#### Run experiment

In [None]:
# Run the experiment for the configuration set up in the cells above.

# Truncate the 'text' column of `txt` using the tokens in TOKENS_COL and the
# front/back thresholds; the result carries the truncated text in a new column.
# NOTE(review): TEXT_COLUMN is defined outside this section - confirm it names the column created here.
trunc_df = tf.add_truncation_column(
    txt,
    'text',
    TOKENS_COL,
    FRONT_THRESHOLD,
    BACK_THRESHOLD,
)

# For a new run MAKE SURE run_id IS UNIQUE; to resume a run, pass in that run's run_id.
run_experiment(
    chatbot_mistral,
    trunc_df,
    run_id,
    PROMPT,
    TEXT_COLUMN,
    SPLIT_COLUMN,
    TRAIN_SET,
    TEST_SET,
    LABEL_COLUMN,
    PREDICTION_PATH,
    OVERVIEW_PATH,
    MODEL_NAME,
    NUMBER_EXAMPLES,
)


In [None]:
# Reload the pickled overview from OVERVIEW_PATH and render it as a rich table.
# NOTE(review): read_pickle executes arbitrary code from the file; only load overview files this project wrote itself.
pred = pd.read_pickle(OVERVIEW_PATH)
display(pred)
