In [None]:
# Run the project's blobfuse shell script before anything else in the notebook.
# NOTE(review): presumably this mounts the blob storage container that holds the data — confirm against the script.
# The path is absolute (/home/azureuser/...), so this cell only works on a machine with that layout.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [None]:
import os
import sys

# Make the modules in the parent directory importable (config, config_azure).
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
# import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here instead of with a NameError on `cf` in a later cell.
    raise ValueError(f"Unknown my_run {my_run!r}; expected 'azure' or 'local'.")

if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parent folders and does not
    # fail when the folder already exists.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Set before transformers is imported in the cells below.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

# set-up environment - GEITje-7b-chat InContextLearning:
# - install blobfuse -> sudo apt-get install blobfuse
# - pip install transformers
# - pip install torch
# - pip install accelerate
# - pip install jupyter
# - pip install ipywidgets

## Notebook overview
- Goal: Run experiment for InContext Learning GEITje
- Trial run: load the models and prompt GEITje with an example prompt
- Zeroshot prompts
- Fewshot prompts

Load data and functions:
- data is already split
- text is already converted to tokens using model tokenizer 

In [None]:
import pandas as pd
# df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import prediction_helperfunctions as ph
import truncation as tf


In [None]:
import torch

# Release the unused GPU memory that PyTorch's caching allocator is still holding,
# before the models are loaded in the cells below.
torch.cuda.empty_cache()

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login.
# NOTE(review): presumably needed because 'meta-llama/Llama-2-7b-chat-hf' is a gated model — confirm.
notebook_login()

#### Trial run Models 
Code to run the models with a simple prompt.

In [None]:
from transformers import Conversation, pipeline

# One conversational pipeline per base chat model. All three use the same
# settings: device_map='auto' and offload_buffers=True.
chatbot_geitje = pipeline(
    task='conversational',
    model='Rijgersberg/GEITje-7B-chat-v2',
    device_map='auto',
    model_kwargs={'offload_buffers': True},
)

chatbot_llama = pipeline(
    task='conversational',
    model='meta-llama/Llama-2-7b-chat-hf',
    device_map='auto',
    model_kwargs={'offload_buffers': True},
)

chatbot_mistral = pipeline(
    task='conversational',
    model='mistralai/Mistral-7B-Instruct-v0.2',
    device_map='auto',
    model_kwargs={'offload_buffers': True},
)

## EXAMPLE PROMPT
# print(chatbot(
    # Conversation('Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?')
# ))

#### Experiment functions
Prompt the model for each document and save the response, the prediction extracted from it, the response time and the prompt version.

Code structure:
- 2 functions/cells:
- predictions_incontextlearning -> given a df with docs that need to be predicted, prompt the model
- run the experiment -> built-in failsafes (the dataframe is processed in parts, with a save after each part)

In [None]:
import time
import os
import pandas as pd
from bm25 import BM25


""" Given a dataframe with txt, return a df with predictions """
def predictions_incontextlearning(chatbot, docs_df, text_column, prompt_function, train_df, num_examples):
    """Prompt the chat model once per document and return the results as a dataframe.

    Args:
        chatbot: callable (conversational pipeline) that is called with a
            ``Conversation`` built from the prompt.
        docs_df: dataframe with the documents that need to be predicted. The
            columns ``text_column``, 'id', 'path', 'label' and 'trunc_col' are read.
        text_column: name of the column that holds the input text. Can differ
            based on the text representation method.
        prompt_function: prompt template from prompt_template.py. Supported are
            ``pt.zeroshot_prompt_geitje``, ``pt.zeroshot_prompt_mistral_llama``,
            ``pt.fewshot_prompt_no_template`` and ``pt.fewshot_prompt_with_template``.
        train_df: dataframe with docs that can be used as examples in few-shot
            prompts; a BM25 model is fitted on its ``text_column``.
        num_examples: number of examples in a few-shot prompt (not used for
            zero-shot prompts).

    Returns:
        DataFrame with one row per document and the columns 'id', 'path',
        'text_column', 'prompt_function', 'response', 'prediction', 'label',
        'runtime', 'date' and 'prompt'. Empty (columns only) if ``docs_df`` is empty.

    Raises:
        ValueError: if ``prompt_function`` is not one of the supported functions.
    """
    result_columns = ['id', 'path', 'text_column', 'prompt_function', 'response',
                      'prediction', 'label', 'runtime', 'date', 'prompt']

    # Zero-shot templates only take the text; few-shot templates also take the
    # example pool and the BM25 model used to select examples.
    zeroshot_functions = (pt.zeroshot_prompt_geitje, pt.zeroshot_prompt_mistral_llama)
    fewshot_functions = (pt.fewshot_prompt_with_template, pt.fewshot_prompt_no_template)

    if prompt_function in fewshot_functions:
        is_fewshot = True
    elif prompt_function in zeroshot_functions:
        is_fewshot = False
    else:
        # Validate once, before any model is fitted or prompted.
        raise ValueError("Prompt function not recognised. Check if prompt function is in prompt_template.py and included in the options above.")

    if len(docs_df) == 0:
        return pd.DataFrame(columns=result_columns)

    bm25_model = None
    if is_fewshot:
        bm25_model = BM25()
        bm25_model.fit(train_df[text_column])

    # Loop-invariant values: looked up once instead of once per document.
    trunc_col_name = docs_df.iloc[0]['trunc_col']
    prompt_function_name = ph.get_promptfunction_name(prompt_function)

    # Collect plain dicts and build the dataframe once at the end; growing a
    # dataframe row by row re-allocates it on every insert.
    records = []
    for _, row in docs_df.iterrows():
        start_time = time.perf_counter()

        # get the prompt, with the doc filled in
        txt = row[text_column]
        if is_fewshot:
            prompt = prompt_function(txt, train_df, num_examples, text_column, bm25_model)
        else:
            prompt = prompt_function(txt)

        # prompt and get the response
        converse = chatbot(Conversation(prompt))
        # Index 1 is the second message of the returned conversation.
        # NOTE(review): this is the model's reply only if the prompt is a single user message — confirm for every template.
        response = converse[1]['content']
        label = row['label'].lower()
        print("label: ", label)
        print("response: ", response)

        # extract prediction from response
        prediction = ph.get_prediction_from_response(response)
        print("prediction:", prediction)

        records.append({
            'id': row['id'],
            'path': row['path'],
            'text_column': trunc_col_name,
            'prompt_function': prompt_function_name,
            'response': response,
            'prediction': prediction,
            'label': label,
            'runtime': time.perf_counter() - start_time,
            'date': ph.get_datetime(),
            'prompt': prompt,
        })

    return pd.DataFrame(records, columns=result_columns)



In [None]:
import os
import time
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

"""
Function to run GEITje In-Context Learning experiment. 
The function allows to resume experiment, if run_id matches.
"""
def run_experiment(chatbot, df, run_id, prompt_function, text_col, split_col, subset_train, subset_test, label_col, prediction_path, overview_path, model_name, num_examples=0, batch_size=10):
    """Run (or resume) a prediction experiment in batches, saving after every batch.

    Args:
        chatbot: conversational pipeline, passed on to predictions_incontextlearning.
        df: dataframe with all docs (already predicted + still to predict). The
            columns ``split_col`` and 'trunc_col' are read.
        run_id: unique id of the experiment. Passing an existing run_id resumes
            that run: only the rows returned by ph.get_rows_to_predict are prompted.
        prompt_function: which prompt from prompt_template.py to use.
        text_col: column in df that holds the (already truncated) text.
        split_col: column with the dataset split.
        subset_train: value of ``split_col`` that selects the example/training docs.
        subset_test: value of ``split_col`` that selects the docs to predict.
        label_col: column with the true label. NOTE: currently unused; the labels
            are read from the 'label' column of the predictions dataframe.
        prediction_path: path to the file where predictions are saved.
        overview_path: path to the file where the results of each run are saved.
        model_name: name of the model (string), stored in the overview.
        num_examples: number of examples given to the prompt; zero for zero-shot.
        batch_size: number of documents predicted between two saves.

    Returns:
        None. Results are written through ph.combine_and_save_df and
        ph.replace_and_save_df.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    test_df = df.loc[df[split_col] == subset_test]
    train_df = df.loc[df[split_col] == subset_train]

    # get rows of df that still need to be predicted for the specific run_id
    to_predict, previous_predictions = ph.get_rows_to_predict(test_df, prediction_path, run_id)

    total = len(to_predict)
    if total == 0:
        # Nothing left to predict for this run_id.
        return

    # Values that are identical for every batch.
    train_support = len(train_df)
    trunc_col_name = df.iloc[0]['trunc_col']

    # Predict in batches so an interrupted run loses at most one batch.
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        sub_to_predict = to_predict.iloc[start:stop]
        if stop < total:
            print(f'Starting...{start}:{stop} out of {total}')
        else:
            print(f'Starting...last {len(sub_to_predict)} docs')

        # prompt the model
        predictions = predictions_incontextlearning(chatbot, sub_to_predict, text_col, prompt_function, train_df, num_examples)

        # save info
        predictions['run_id'] = run_id
        predictions['train_set'] = subset_train
        predictions['test_set'] = subset_test
        predictions['shots'] = num_examples

        # save new predictions in file
        print("Dont interrupt, saving predictions...")
        ph.combine_and_save_df(predictions, prediction_path)

        # Combine with everything predicted so far, so the overview covers the
        # whole run and not just this batch.
        if isinstance(previous_predictions, pd.DataFrame) and not previous_predictions.empty:
            predictions = pd.concat([predictions, previous_predictions])
        previous_predictions = predictions

        # save results in overview file
        date = ph.get_datetime()
        y_test = predictions['label']
        y_pred = predictions['prediction']

        # zero_division=0 gives the same scores as the default ("warn") without
        # printing a warning for every class that has no predictions yet.
        report = classification_report(y_test, y_pred, zero_division=0)
        accuracy = accuracy_score(y_test, y_pred)

        overview = pd.DataFrame(
            [{
                'model': model_name,
                'run_id': run_id,
                'date': date,
                'train_set': subset_train,
                'test_set': subset_test,
                'train_set_support': train_support,
                'test_set_support': len(predictions),
                'split_col': split_col,
                'text_col': trunc_col_name,
                'runtime': sum(predictions['runtime']),
                'accuracy': accuracy,
                'macro_avg_precision': precision_score(y_test, y_pred, average='macro', zero_division=0),
                'macro_avg_recall': recall_score(y_test, y_pred, average='macro', zero_division=0),
                'macro_avg_f1': f1_score(y_test, y_pred, average='macro', zero_division=0),
                'weighted_avg_precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
                'weighted_avg_recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
                'weighted_avg_f1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
                'classification_report': report,
            }]
        )
        # remove previous results of run_id, replace with new/updated results
        ph.replace_and_save_df(overview, overview_path, run_id)
        print("Saving done! Interrupting is allowed.")
        print("Accuracy: ", accuracy)




Set up variables that are the same for each model

In [None]:
# Variables that are the same for each model section below.
TRAIN_SET = 'train' # must be dev or train
TEST_SET = 'test' # must be val or test
SPLIT_COLUMN = 'balanced_split' # one of '2split' (train and test), '4split' (train, test, dev and val) or 'balanced_split'; the path set-up cells pick the output folder based on this value.
LABEL_COLUMN = 'label' # passed to run_experiment as label_col
TEXT_COLUMN = 'trunc_txt' # passed to run_experiment as text_col. NOTE(review): presumably the column added by tf.add_truncation_column — confirm.


In [None]:
# Load the documents dataframe; the run cells pass it to tf.add_truncation_column.
# NOTE(review): read_pickle can execute code embedded in the file — only load pickles produced by this project.
txt = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

### GEITje

In [None]:
# Experiment settings for the GEITje runs; they end up in the output paths and the run id.
SHORT_MODEL_NAME = 'GEITje'
PROMPT = pt.zeroshot_prompt_geitje
PROMPT_NAME = ph.get_promptfunction_name(PROMPT)
TOKENS_COL = 'LlamaTokens' # column with text split using tokenizer of either mistral (MistralTokens) or Llama (LlamaTokens). Using Llama, because Llama split into more tokens. 
FRONT_THRESHOLD = 200
BACK_THRESHOLD = 0

if PROMPT == pt.zeroshot_prompt_geitje:
    NUMBER_EXAMPLES = 0
elif PROMPT in (pt.fewshot_prompt_with_template, pt.fewshot_prompt_no_template):
    NUMBER_EXAMPLES = 2
else:
    # Without this branch NUMBER_EXAMPLES would keep the value of an earlier
    # cell run (or be undefined) and end up in the run id.
    raise ValueError(f"No NUMBER_EXAMPLES defined for prompt {PROMPT_NAME!r} in the GEITje section.")



#### Load model - In-context learning
Note - ONLY load one model: either in-context or fine-tuning

In [None]:
from transformers import Conversation, pipeline

# Base GEITje chat model on CPU, used for the in-context learning runs.
chatbot_geitje = pipeline(
    task='conversational',
    model='Rijgersberg/GEITje-7B-chat-v2',
    device_map='cpu',
    model_kwargs={'offload_buffers': True},
)

# Identifiers used by the path set-up cell and the run id.
MODEL_NAME, SUBFOLDER, SHORT_ID = 'GEITje-7B-chat-v2', 'in_context', 'IC'



#### Load model - Finetuning

In [None]:
from transformers import Conversation, pipeline

# Fine-tuned GEITje checkpoint on CPU. This rebinds chatbot_geitje, so it
# replaces the in-context model if that cell was run before.
chatbot_geitje = pipeline(
    task='conversational',
    model='FemkeBakker/AmsterdamDocClassificationGEITje200T3Epochs',
    device_map='cpu',
    model_kwargs={'offload_buffers': True},
)

# Identifiers used by the path set-up cell and the run id.
MODEL_NAME, SUBFOLDER, SHORT_ID = 'AmsterdamDocClassificationGEITje200T3Epochs', 'finetuning', 'FT'
EPOCHS = 3

#### Set-up paths to save predictions

In [None]:
import os

# Derive the output files from the current settings. Every unsupported
# combination raises, so paths from an earlier cell run are never reused silently.
if SPLIT_COLUMN in ('4split', '2split'):
    OVERVIEW_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/overview.pkl"
    PREDICTION_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/predictions.pkl"

elif SPLIT_COLUMN == 'balanced_split':
    if SUBFOLDER == 'finetuning':
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/{SHORT_MODEL_NAME}First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    elif SUBFOLDER == 'in_context':
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    else:
        raise ValueError(f"Unknown SUBFOLDER {SUBFOLDER!r}; expected 'finetuning' or 'in_context'.")

else:
    raise ValueError(f"Unknown SPLIT_COLUMN {SPLIT_COLUMN!r}; expected '2split', '4split' or 'balanced_split'.")

print(OVERVIEW_PATH)
print(PREDICTION_PATH)

# The target folders are deliberately not created here: a missing folder stops the run.
if not os.path.isdir(os.path.dirname(os.path.abspath(OVERVIEW_PATH))):
    raise ValueError("Folder to OVERVIEW_PATH does not exist") 
if not os.path.isdir(os.path.dirname(os.path.abspath(PREDICTION_PATH))):
    raise ValueError("Folder to PREDICTION_PATH does not exist") 

# The run id encodes every setting of the run; reusing one resumes that run.
run_id = f'{SHORT_ID}_{MODEL_NAME}{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}'
print ('\n', run_id)


#### Run experiment

In [None]:
# ----- EXPERIMENT --------
# Build a new dataframe from the loaded documents using the token column and the front/back thresholds.
trunc_df = tf.add_truncation_column(txt, 'text', TOKENS_COL, FRONT_THRESHOLD, BACK_THRESHOLD)

# For a new run MAKE SURE RUN_ID IS UNIQUE; to resume a run, pass in that run_id.
run_experiment(
    chatbot=chatbot_geitje,
    df=trunc_df,
    run_id=run_id,
    prompt_function=PROMPT,
    text_col=TEXT_COLUMN,
    split_col=SPLIT_COLUMN,
    subset_train=TRAIN_SET,
    subset_test=TEST_SET,
    label_col=LABEL_COLUMN,
    prediction_path=PREDICTION_PATH,
    overview_path=OVERVIEW_PATH,
    model_name=MODEL_NAME,
    num_examples=NUMBER_EXAMPLES,
)


In [None]:
# Load and show the overview file that run_experiment writes to OVERVIEW_PATH.
pred = pd.read_pickle(OVERVIEW_PATH)
# Optional filter on one run (kept for reference):
# pred_run = pred.loc[pred['run_id']==f'{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}']
display(pred)

### Llama


In [None]:
# Experiment settings for the Llama runs; they end up in the output paths and the run id.
SHORT_MODEL_NAME = 'Llama'
PROMPT = pt.fewshot_prompt_with_template
PROMPT_NAME = ph.get_promptfunction_name(PROMPT)
TOKENS_COL = 'LlamaTokens' # column with text split using tokenizer of either mistral (MistralTokens) or Llama (LlamaTokens). Using Llama, because Llama split into more tokens. 
FRONT_THRESHOLD = 200
BACK_THRESHOLD = 0

if PROMPT == pt.zeroshot_prompt_mistral_llama:
    NUMBER_EXAMPLES = 0
elif PROMPT in (pt.fewshot_prompt_with_template, pt.fewshot_prompt_no_template):
    NUMBER_EXAMPLES = 2
else:
    # Without this branch NUMBER_EXAMPLES would keep the value of an earlier
    # cell run (or be undefined) and end up in the run id.
    raise ValueError(f"No NUMBER_EXAMPLES defined for prompt {PROMPT_NAME!r} in the Llama section.")



#### Load model - In-context learning
Note - ONLY load one model: either in-context or fine-tuning

In [23]:
from transformers import Conversation, pipeline

# Base Llama-2 chat model. It is loaded on CPU because the few-shot BM25
# prompt otherwise gives a CUDA out-of-memory error.
chatbot_llama = pipeline(
    task='conversational',
    model='meta-llama/Llama-2-7b-chat-hf',
    device_map='cpu',
    model_kwargs={'offload_buffers': True},
)

# Identifiers used by the path set-up cell and the run id.
MODEL_NAME, SUBFOLDER, SHORT_ID = 'Llama-2-7b-chat-hf', 'in_context', 'IC'

#### Load model - finetuning

In [None]:
from transformers import Conversation, pipeline

# Fine-tuned Llama checkpoint on CPU. This rebinds chatbot_llama, so it
# replaces the in-context model if that cell was run before.
chatbot_llama = pipeline(
    task='conversational',
    model='FemkeBakker/AmsterdamDocClassificationLlama200T3Epochs',
    device_map='cpu',
    model_kwargs={'offload_buffers': True},
)

# Identifiers used by the path set-up cell and the run id.
MODEL_NAME, SUBFOLDER, SHORT_ID = 'AmsterdamDocClassificationLlama200T3Epochs', 'finetuning', 'FT'
EPOCHS = 3

#### Set-up paths to save predictions

In [None]:
import os

# Derive the output files from the current settings. Every unsupported
# combination raises, so paths from an earlier cell run are never reused silently.
if SPLIT_COLUMN in ('4split', '2split'):
    OVERVIEW_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/overview.pkl"
    PREDICTION_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/predictions.pkl"

elif SPLIT_COLUMN == 'balanced_split':
    if SUBFOLDER == 'finetuning':
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/{SHORT_MODEL_NAME}First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    elif SUBFOLDER == 'in_context':
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    else:
        raise ValueError(f"Unknown SUBFOLDER {SUBFOLDER!r}; expected 'finetuning' or 'in_context'.")

else:
    raise ValueError(f"Unknown SPLIT_COLUMN {SPLIT_COLUMN!r}; expected '2split', '4split' or 'balanced_split'.")

print(OVERVIEW_PATH)
print(PREDICTION_PATH)

# The target folders are deliberately not created here: a missing folder stops the run.
if not os.path.isdir(os.path.dirname(os.path.abspath(OVERVIEW_PATH))):
    raise ValueError("Folder to OVERVIEW_PATH does not exist") 
if not os.path.isdir(os.path.dirname(os.path.abspath(PREDICTION_PATH))):
    raise ValueError("Folder to PREDICTION_PATH does not exist") 

# The run id encodes every setting of the run; reusing one resumes that run.
run_id = f'{SHORT_ID}_{MODEL_NAME}{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}'
print ('\n', run_id)


#### Run experiment

In [24]:
# Build a new dataframe from the loaded documents using the token column and the front/back thresholds.
trunc_df = tf.add_truncation_column(txt, 'text', TOKENS_COL, FRONT_THRESHOLD, BACK_THRESHOLD)

# For a new run MAKE SURE RUN_ID IS UNIQUE; to resume a run, pass in that run_id.
run_experiment(
    chatbot=chatbot_llama,
    df=trunc_df,
    run_id=run_id,
    prompt_function=PROMPT,
    text_col=TEXT_COLUMN,
    split_col=SPLIT_COLUMN,
    subset_train=TRAIN_SET,
    subset_test=TEST_SET,
    label_col=LABEL_COLUMN,
    prediction_path=PREDICTION_PATH,
    overview_path=OVERVIEW_PATH,
    model_name=MODEL_NAME,
    num_examples=NUMBER_EXAMPLES,
)


Run-id already known, resuming predictions...
Starting...0:10 out of 890
label:  schriftelijke vraag
response:   Sure, I can help you categorize the given document. Based on the information provided, the document can be classified under the category of "Schriftelijke Vraag" (Written Question).

Here is the output in the form of a JSON file:

{
"categorie": "Schriftelijke Vraag"
}

The reason for this classification is that the document appears to be a written question addressed to the Gemeenteraad (City Council) of Amsterdam, with a specific topic and date of indention, and a request for a response from the council. The document contains questions from a member of the council regarding a particular issue, and the response from the council is provided in the form of a written answer. Therefore, it fits into the category of "Schriftelijke Vraag" (Written Question).
prediction: schriftelijke vraag
label:  voordracht
response:   Sure, I can help you with that! Based on the information prov

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7818181818181819
Starting...10:20 out of 890
label:  brief
response:   Sure, I can help you classify the given document. Based on the provided information, I would categorize the document as a "Brief".

Here's the output in JSON format:

{
"categorie": "Brief"
}

The document appears to be a formal communication from the City of Amsterdam to its council members, providing information about an agenda item that was discussed during a previous council meeting. The document includes the date, subject, and portfolio of the council member responsible for handling the matter, as well as a brief summary of the issue and the outcome of the council meeting.

The format and content of the document are consistent with a brief, which is a type of document commonly used in government and business to communicate important information in a concise and formal manner.
prediction: brief
label:  schriftelijke vraag
response:   Here is the output for the g

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7782608695652173
Starting...20:30 out of 890
label:  factsheet
response:   Based on the provided examples and the document that needs to be categorized, I would classify it as a 'Factsheet'.

The document provides information on the results and developments of a social work program in Amsterdam, including the number of Amsterdammers on social work, the goal of growing the number of workers from 4,000 in 2018 to 4,500 by 2022, and the evaluation of a year-long work brigade program. It also mentions the criteria for regular placements of candidates belonging to the SR-doelgroep and the possibility of realizing Social Return through three different methods.

Here is the output in the form of a JSON file:

{
"categorie": "Factsheet"
}
prediction: factsheet
label:  agenda
response:   Sure, I can help you categorize the document. Based on the information provided, the document can be classified under the category of "Agenda".

Here is the ou

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7833333333333333
Starting...30:40 out of 890
label:  voordracht
response:   Sure, I can help you classify the document! Based on the information provided, I would categorize the document as:

{'categorie': 'Voordracht'}

The document is a speech given to the Raadscommissie voor Duurzaamheid, Circulaire Economie, Afval en DC on April 13, 2023, for discussion and knowledge sharing. The topic of the speech is related to sustainability, circular economy, waste management, and food and animal welfare, which are all categories under the 'Voordracht' category.
prediction: voordracht
label:  schriftelijke vraag
response:   Sure, I can help you categorize the document. Based on the information provided, the document can be classified into the category of "Schriftelijke Vraag" (Written Question).

Here is the output in JSON format:
{ "categorie": "Schriftelijke Vraag" }
prediction: schriftelijke vraag
label:  brief
response:   Sure, I can help y

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.788
Starting...40:50 out of 890
label:  brief
response:   Sure, I can help you classify the document. Based on the information provided, I would categorize the document as a 'Brief'.

Here is the output in the form of a JSON file:

{
"categorie": "Brief"
}

The reason for this categorization is that the document appears to be a formal communication or notification to a specific group of people, rather than a more formal decision-making document or a detailed report. The language used is professional and informative, and the document includes information on a specific topic or issue, such as the use of Groen funds in Amsterdam.
prediction: brief
label:  raadsnotulen
response:   Sure, I can help you classify the given document. Based on the provided examples and the categories you provided, I would categorize the given document as:

{'categorie': 'Raadsnotulen'}

The document contains information about a municipal council meeting, includ

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7846153846153846
Starting...50:60 out of 890
label:  agenda
response:   Sure, I can help you classify the document. Based on the examples provided, the document can be categorized as an Agenda.

Here is the output in the form of a JSON file:

{ "categorie": "Agenda" }

The reason for this categorization is that the document contains information about the agenda for a meeting of the Raadscommissie voor Algemene Zaken, Openbare Orde en Veiligheid, including the date, time, and topics to be discussed. This is a common characteristic of agendas, which are documents that provide a list of items to be addressed during a meeting.
prediction: agenda
label:  agenda
response:   Based on the provided examples and the document that needs to be categorized, I would classify it as an 'Agenda'.

The document contains the agenda for a meeting of the Raadscommissie voor Verkeer en Vervoer en Infrastructuur (Raad of Transportation and Infrastructure) of

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7888888888888889
Starting...60:70 out of 890
label:  raadsadres
response:   Sure, I can help you categorize the document! Based on the examples provided and the information given, I would classify the document as a "Bezoekadres".

Here is the output in the form of a JSON file:

{
"categorie": "Bezoekadres"
}

The reason for this classification is that the document appears to be an address for a visit, with the recipient's name and address listed, as well as contact information for the municipality. The document also includes a return address and a subject line, which are typical elements of a bezoekadres.
prediction: NoPredictionInOutput
label:  raadsadres
response:   Sure, I can help you classify the documents!

For the first document, the category is: Raadsadres

Output: {'categorie': 'Raadsadres'}

For the second document, the category is: Schriftelijke Vraag

Output: {'categorie': 'Schriftelijke Vraag'}

For the third document, the

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7857142857142857
Starting...70:80 out of 890
label:  brief
response:   Sure, I can help you classify the document! Based on the provided examples and the information provided in the question, I would categorize the document as follows:

Output: {'categorie': 'Brief'}

The document appears to be a formal letter or communication from the municipality of Amsterdam to its council members, dated September 14, 2021. The content of the document deals with the agenda item "Afdoening motie 121 van het lid Vroege van de D66" and is handled by the Verkeer en Openbare Ruimte department. These details suggest that the document is a brief or letter, and therefore belongs to the category 'Brief'.
prediction: brief
label:  schriftelijke vraag
response:   Sure, I can help you categorize the document. Based on the information provided, the document can be classified under the category of "Schriftelijke Vraag".

Here is the output in the form of a JSON f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7827586206896552
Starting...80:90 out of 890
label:  onderzoeksrapport
response:   Sure, I can help you classify the document! Based on the examples provided and the information given, I would categorize the document as an "Onderzoeksrapport".

Here is the output in the form of a JSON file:

{
"categorie": "Onderzoeksrapport"
}

The document seems to be a research report, as it contains a detailed analysis of a specific topic or issue, and includes information such as tables, figures, and references. The structure and format of the document also suggest that it is an official report, rather than a personal letter or a brief.
prediction: onderzoeksrapport
label:  agenda
response:   Sure, I can help you with that! Based on the examples provided, I would categorize the given document as follows:

{'categorie': 'Agenda'}

The document appears to be an agenda for a meeting of the Public Commission of the City of Amsterdam, held on May 10, 2

Bad pipe message: %s [b'9\x1a8F\xa2\x10\xf1\x05#\x849\r\xbd\xd5\xd29\x1b{ >\xe3\xa7\x04\x14\xe8\xcb\xd2\xcd\x17\xf7 \xb3\xd8dy]\x97\x93\xa6\xf6\x86\x14\x86\x8d\x13$e<\x9a#\x86\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \xeb?\x9c\x8b7?(i\xa3\xfa\x18\xcf6']
Bad pipe message: %s [b'=\xc8\x8c\xe6\x85.\xddy\x84\xd5-\xf8(<"\x94d\x10\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0\'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x

label:  motie
response:   Sure! Based on the provided examples and the document that needs to be categorized, I would classify it as a 'Motie'.

Here is the output in the form of a JSON file:

{
"categorie": "Motie"
}

The reason for this classification is that the document appears to be a proposal or motion put forward by a member of the council, as it contains language such as "Aan de gemeenteraad" (To the council) and "Ondergetekene heeft de eer voor te stellen" (The undersigned has the honor to present). The document also includes a specific proposal or motion related to the council's agenda, which is a key characteristic of a motie.
prediction: motie
label:  actualiteit
response:   Sure! Based on the provided examples and the document that needs to be categorized, I would classify it as:

{'categorie': 'Actualiteit'}

The document is an actuality item for the upcoming council meeting of June 22, 2022, written by Van Kreuger. It provides information about the planned reduction of p

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7866666666666666
Starting...90:100 out of 890
label:  motie
response:   Sure! Based on the provided examples and the document you want to categorize, I would categorize it as a 'Motie'.

Here's the output in JSON format:

{
"categorie": "Motie"
}

Explanation:

The document you provided is a motion presented to the Gemeenteraad of Amsterdam, which is a type of 'Motie' in Dutch. The document outlines the proposals and arguments for a new event policy, including the maximum duration of events and the rejection of location profiles in the council's decision. Therefore, based on the content and structure of the document, it can be classified as a 'Motie'.
prediction: motie
label:  brief
response:   Sure, I can help you classify the document! Based on the information provided, I would categorize the document as a "Brief".

Here's the output in JSON format:

{
"categorie": "Brief"
}

The document appears to be an informative letter addressed

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7774193548387097
Starting...100:110 out of 890
label:  onderzoeksrapport
response:   Sure, I can help you categorize the given document. After analyzing the document, I suggest the following categories:

 {'categorie': 'Onderzoeksrapport'}

The document appears to be an research report on the topic of reducing advertising in the city of Amsterdam. It contains a detailed analysis of the current situation, including the legal framework, organizational structure, and financial aspects of advertising in the city. The report also includes recommendations for reducing advertising in the city, such as implementing a less is more approach and prioritizing quality over quantity.

The categories I identified are:

2. Beleidskader (Policy Framework)
3. Buitenreclame (Outdoor Advertising)
4. Gemeente Amsterdam (City of Amsterdam)
5. Beleidsregels reclame en reclame gerelateerde objecten in de openbare ruimte (Policy Regulations for Advertising and

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.775
Starting...110:120 out of 890
label:  agenda
response:   Sure! Based on the provided examples and the document you want to categorize, I would classify it under the category of "Agenda".

Here is the output in JSON format:

{
"categorie": "Agenda"
}

The reason for this classification is that the document appears to be a agenda for a public meeting of the General Assembly of the Amsterdam-Noord district, including the date, time, location, and items on the agenda. The document also includes procedural information and instructions for the establishment of a committee for handling religious petitions.

The other categories you provided (Voorstand, Besluit, Schriftelijke Vraag, Brief, Raadsadres, Onderzoeksrapport, Raadsnotulen, Agenda, Motie, Actualiteit, Factsheet) do not seem to apply to this document based on its content.
prediction: agenda
label:  factsheet
response:   Sure, I can help you categorize the document. Based on the in

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7757575757575758
Starting...120:130 out of 890
label:  raadsnotulen
response:   Sure, I can help you with that! Based on the provided examples and the text of the document you want to categorize, I would classify it under the category of "Agenda".

Here's the output in JSON format:

{
"categorie": "Agenda"
}

The reason for this classification is that the document appears to be an agenda for a meeting, with items listed under different categories such as "Opening and establishment of the agenda", "Mediations and questions on current affairs", and "Closing of the meeting". These categories are consistent with the format of an agenda, and suggest that the document is intended to guide the discussion and organization of a meeting.
prediction: agenda
label:  schriftelijke vraag
response:   Sure, I can help you categorize the document! Based on the information provided, I would categorize the document as:

{'categorie': 'Schriftelijke Vraag

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7735294117647059
Starting...130:140 out of 890
label:  actualiteit
response:   Output:

{
"categorie": "Gemeente Amsterdam"
}

Explanation:

Based on the provided examples and the content of the document, it can be categorized as a "Gemeente Amsterdam" document. This category includes documents that are related to the city of Amsterdam, its government, or its residents. The document is a briefing for the stadsdeelcommissie (district committee) of Amsterdam, discussing an actuality item related to the website of the GGD (Public Health Service) regarding rat control. The document is written in Dutch and includes information on the legal basis for the discussion, as well as a proposal from the GroenLinks fraction to hold a meeting with the GGD to discuss the issue. Therefore, the document can be classified as a "Gemeente Amsterdam" document.
prediction: NoPredictionInOutput
label:  actualiteit
response:   Sure, I can help you categorize t

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7685714285714286
Starting...140:150 out of 890
label:  raadsadres
response:   Sure, I can help you classify the document! Based on the examples provided and the list of categories you provided, I would categorize the document as:

 {'categorie': 'Raadsadres'}

The document appears to be a formal letter addressed to the Stadsdeelraad Centrum in Amsterdam, with a registration number, a specific department, and a copy to the council members. The content of the letter is related to a complaint against the Stadsdeelraad regarding unacceptable behavior, and it includes the sender's and recipient's contact information. All of these elements suggest that the document is a formal communication between a government agency and its stakeholders, which falls under the category of 'Raadsadres'.
prediction: raadsadres
label:  raadsnotulen
response:   Sure! Based on the provided examples and the document you want to categorize, I would classify it as 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7694444444444445
Starting...150:160 out of 890
label:  actualiteit
response:   Sure, I can help you classify the document! Based on the information provided, I would categorize the document as:

{'categorie': 'Raadsadres'}

Here's why:

The document is addressed to the College of Burgomaster and Aldermen of Amsterdam, and it contains a request for a meeting to discuss a specific issue (rattenplaag in winkelgebieden). The document is also copied to several Gemeenteraadsleden, indicating that it is an official communication from the writer to the city council. These characteristics suggest that the document is a Raadsadres, which is a formal letter or memo addressed to the city council or its members.
prediction: raadsadres
label:  schriftelijke vraag
response:   Sure! Based on the provided examples and the document you want to categorize, I would classify it as a "Schriftelijke Vraag" (categorie: ['Schriftelijke Vraag']).

Here's the JS

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7675675675675676
Starting...160:170 out of 890
label:  motie
response:   Sure, I can help you categorize the document. Based on the information provided, the document can be classified into the category of "Motie".

Here is the output in JSON format:

{
"categorie": "Motie"
}

Explanation:

The document is a motion presented to the council, as indicated by the title "Motie" and the date of the council meeting. The content of the document deals with the issue of representing the city's population in the regional energy strategy and the importance of making the results of the strategy more transparent. Therefore, the category of "Motie" is the most appropriate for this document.
prediction: motie
label:  raadsnotulen
response:   Sure, I can help you categorize the given document. Based on the provided examples and the content of the document you provided, I would categorize the document as:

{'categorie': 'Raadsnotulen'}

The document co

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7657894736842106
Starting...170:180 out of 890
label:  voordracht
response:   Sure, I can help you classify the document! Based on the information provided, I would categorize the document as a "Voordracht" (Dutch for "Motion").

Here is the output in JSON format:

{
"categorie": "Voordracht"
}

The reason for this classification is that the document appears to be a motion or a proposal presented to a municipal council committee, as it includes information such as the date, time, and agenda point, as well as a brief description of the topic being discussed. The use of terms such as "voordracht" and "ter bespreking en ter kennisneming" (for discussion and information) also suggest that the document is related to a motion or proposal.
prediction: voordracht
label:  brief
response:   Sure, I can help you categorize the document! Based on the examples provided and the information provided in the document, I would categorize it as a 'Gemeen

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7615384615384615
Starting...180:190 out of 890
label:  agenda
response:   Sure, I can help you classify the document! Based on the examples provided and the information given, I would categorize the document as:

{'categorie': 'Agenda'}

The document appears to be an agenda for a meeting of the Stadsdeelcommissie (District Commission) for the area of IJburg, Zeeburgereiland. It includes the date, time, location, and opening of the meeting, as well as various items to be discussed, including meddelingen (announcements), the concept report from the previous meeting, and evaluations of participation.

Therefore, the category for this document is "Agenda".
prediction: agenda
label:  schriftelijke vraag
response:   Sure, I can help you classify the document! Based on the information provided, I would categorize the document as:

{'categorie': 'Schriftelijke Vraag'}

The document appears to be a response to a written question submitted to th

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.76
Starting...190:200 out of 890
label:  brief
response:   Sure, I can help you with that! Based on the provided examples and the information provided in the question, I would categorize the given document as:

{'categorie': 'Gemeente Raadsinformatiebrief'}

This is because the document appears to be a standard Gemeente Raadsinformatiebrief, which is a common type of document used by municipalities in the Netherlands to inform the city council members about various matters related to the municipality's policy and operations. The document includes information about the progress of a project, specifically the monitoring of the implementation of the city's traffic safety plan, and is addressed to the members of the city council.
prediction: brief
label:  actualiteit
response:   Sure! Based on the provided examples and the content of the Raadsvergadering van 13 juli document, I would categorize it under the category of 'Actualiteit'.

Here

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7634146341463415
Starting...200:210 out of 890
label:  factsheet
response:   Sure! Based on the examples provided, I would categorize the document as follows:

{ 'categorie': 'Onderzoeksrapport }

The document appears to be a research report, as it contains a detailed analysis of data related to the cultural diversity of the municipality of Amsterdam. The document includes an introduction, a methodology section, and a conclusion, which are all typical elements of a research report. The content of the document also suggests that it is an original research study, rather than a summary or review of existing research, which further supports the categorization of 'Onderzoeksrapport'.
prediction: onderzoeksrapport
label:  actualiteit
response:   Sure, I can help you with that! Based on the provided examples and the document that needs to be categorized, I would classify the document as:

{'categorie': 'Actualiteit'}

The reason for this clas

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7595238095238095
Starting...210:220 out of 890
label:  agenda
response:   Sure, I can help you classify the document! Based on the information provided, I would categorize the document as:

{'categorie': 'Agenda'}

This is because the document appears to be an agenda for a meeting of a municipal council committee, with details of the items to be discussed and the time and location of the meeting.
prediction: agenda
label:  factsheet
response:   Sure, I can help you classify the document! Based on the content and structure of the document you provided, I would categorize it as an "Onderzoeksrapport".

Here's the output in JSON format:

{
"categorie": "Onderzoeksrapport"
}

The document appears to be a research report on the topic of teacher shortages in Amsterdam, specifically looking at the number of teachers who choose to work in Amsterdam after completing their training and how many of them stay in the city long-term. The document in

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7604651162790698
Starting...220:230 out of 890
label:  agenda
response:   Sure, I can help you classify the given document into one of the categories.

Based on the content of the document, I would categorize it as an Agenda. The document lists the items to be discussed during a meeting of the Raadscommissie Welzijn en Onderwijs in 2012, including a term of office for a member, a more years maintenance program, and a concept verordening. These are all items that are typically discussed during a meeting of a council or committee, and therefore the document can be classified as an Agenda.

Therefore, the output of the classification is:

{ 'categorie': 'Agenda' }
prediction: agenda
label:  voordracht
response:   Sure, I can help you with that! Based on the information provided, I would categorize the third document, 3015955 N% Gemeente Raadscommissie voor Ruimtelijke Ordening en Grond en Ontwikkeling RO, as a 'Voordracht'.

Here is the o

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.7613636363636364
Starting...230:240 out of 890
label:  raadsnotulen
response:   Sure, I can help you classify the document. Based on the information provided, I would categorize the document as:

{'categorie': 'Raadsnotulen'}

Here's why:

The document appears to be a meeting minutes of the Gemeente Amsterdam's Raad (Council), specifically the Afdeling 2 (Department 2) meeting held on June 28, 2017. The document includes the agenda, reports, and decisions made during the meeting, as well as the names of the attendees and the raadslid (council member) who presided over the meeting.

The document contains several sections that are typical of raadsnotulen, including:

* The opening and closing of the meeting by the voorzitter (chairperson)
* The agenda and reports of the meeting
* The decisions made during the meeting
* The names of the attendees and the raadslid who presided over the meeting

Therefore, based on these features, I would c

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving done! Interrupting is allowed.
Accuracy:  0.76
Starting...240:250 out of 890
label:  raadsadres
response:   Sure, I can help you categorize the given document. Based on the given information, I would categorize the document as:

{'categorie': 'Raadsadres'}

Here's why:

The document appears to be a formal letter addressed to the City Council, with a clear subject line and a formal tone. It includes the sender's contact information and a return address, as well as a copy to another recipient. The content of the letter is related to a specific issue or topic, which is typical of a Raadsadres document.

In contrast, a Voordracht or a Besluit would typically be more formal and structured, with a clear introduction and conclusion, and a detailed explanation of the issue or proposal. A Schriftelijke Vraag would be a more informal request for information or clarification, while a Brief would be a shorter, more concise document with a specific purpose or goal.

An Agenda or Motie would 

In [None]:
# Load the pickled object stored at OVERVIEW_PATH and show it in the notebook.
# NOTE(review): presumably this is the overview table updated by run_experiment — confirm.
# NOTE(review): unpickling can execute arbitrary code; only read files produced by this project.
pred = pd.read_pickle(OVERVIEW_PATH)
display(pred)

### Mistral

In [None]:
# Experiment configuration for the Mistral runs.
SHORT_MODEL_NAME = 'Mistral'
PROMPT = pt.zeroshot_prompt_mistral_llama
PROMPT_NAME = ph.get_promptfunction_name(PROMPT)

# Column with the text split by a model tokenizer: 'MistralTokens' or 'LlamaTokens'.
# The Llama tokens are used because the Llama tokenizer splits the text into more tokens.
TOKENS_COL = 'LlamaTokens'

# Truncation thresholds handed to tf.add_truncation_column.
# NOTE(review): presumably the number of tokens kept from the front / back of a document — confirm.
FRONT_THRESHOLD = 200
BACK_THRESHOLD = 0

# Number of in-context examples that belongs to the selected prompt.
if PROMPT == pt.zeroshot_prompt_mistral_llama:
    NUMBER_EXAMPLES = 0
elif PROMPT == pt.fewshot_prompt_bm25:
    NUMBER_EXAMPLES = 2
else:
    # Fail loudly instead of silently reusing a NUMBER_EXAMPLES value left in the
    # kernel by an earlier section (it ends up in run_id and would mislabel the run).
    raise ValueError(f"No NUMBER_EXAMPLES configured for prompt '{PROMPT_NAME}'")



#### Load model - In-context learning
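Note: load ONLY one model. Run either this in-context cell or the fine-tuning cell below, not both: each assigns `chatbot_mistral`, `MODEL_NAME`, `SUBFOLDER` and `SHORT_ID`, so whichever cell ran last determines the model and the output paths.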

In [None]:
from transformers import pipeline, Conversation

# Conversational pipeline around the instruction-tuned Mistral checkpoint
# (in-context learning variant), placed on the CPU.
chatbot_mistral = pipeline(
    task='conversational',
    model='mistralai/Mistral-7B-Instruct-v0.2',
    device_map='cpu',
    model_kwargs={'offload_buffers': True},
)

# Identifiers that end up in the output paths and in the run id.
MODEL_NAME = 'Mistral-7B-Instruct-v0.2'
SUBFOLDER = 'in_context'
SHORT_ID = 'IC'


#### Load model - finetuning

In [None]:
from transformers import pipeline, Conversation

# Conversational pipeline around the fine-tuned Mistral checkpoint, placed on the CPU.
chatbot_mistral = pipeline(
    task='conversational',
    model='FemkeBakker/AmsterdamDocClassificationMistral200T3Epochs',
    device_map='cpu',
    model_kwargs={'offload_buffers': True},
)

# Identifiers that end up in the output paths and in the run id.
MODEL_NAME = 'AmsterdamDocClassificationMistral200T3Epochs'
SUBFOLDER = 'finetuning'
SHORT_ID = 'FT'
# Used in the output folder name ("{EPOCHS}epochs") for fine-tuning runs.
EPOCHS = 3

#### Set-up paths to save predictions

In [None]:
import os

# Derive the files this run writes to from the data split and the model type.
# Every unsupported combination raises, so that OVERVIEW_PATH / PREDICTION_PATH can
# never silently keep the values of a previously configured model and cause this
# run to write into another run's files.
if SPLIT_COLUMN in ('4split', '2split'):
    # NOTE(review): presumably the validation splits (results go to predictionsVal) — confirm.
    OVERVIEW_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/overview.pkl"
    PREDICTION_PATH = f"{cf.output_path}/predictionsVal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/predictions.pkl"

elif SPLIT_COLUMN == 'balanced_split':
    if SUBFOLDER == 'finetuning':
        # Fine-tuning runs are grouped per number of epochs; the overview file is shared per epoch folder.
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{EPOCHS}epochs/{SHORT_MODEL_NAME}First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    elif SUBFOLDER == 'in_context':
        # In-context runs are grouped per model; predictions additionally per prompt.
        OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/overview.pkl"
        PREDICTION_PATH = f"{cf.output_path}/predictionsFinal/{SUBFOLDER}/{SHORT_MODEL_NAME}/{PROMPT_NAME}/First{FRONT_THRESHOLD}Last{BACK_THRESHOLD}Predictions.pkl"

    else:
        raise ValueError(f"Unsupported SUBFOLDER: {SUBFOLDER!r}")

else:
    raise ValueError(f"Unsupported SPLIT_COLUMN: {SPLIT_COLUMN!r}")

print(OVERVIEW_PATH)
print(PREDICTION_PATH)

# The target folders are not created here: a missing folder stops the run before any work is done.
if not os.path.isdir(os.path.dirname(os.path.abspath(OVERVIEW_PATH))):
    raise ValueError("Folder to OVERVIEW_PATH does not exist")
if not os.path.isdir(os.path.dirname(os.path.abspath(PREDICTION_PATH))):
    raise ValueError("Folder to PREDICTION_PATH does not exist")

# Identifier of this run, composed of the model, prompt, truncation and data-split settings.
run_id = f'{SHORT_ID}_{MODEL_NAME}{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}'
print('\n', run_id)


#### Run experiment

In [None]:
# Run the experiment with the Mistral pipeline.

# Truncate the 'text' column of `txt` using the tokens in TOKENS_COL and the
# front/back thresholds; the result carries the truncated text in an added column.
trunc_df = tf.add_truncation_column(
    txt,
    'text',
    TOKENS_COL,
    FRONT_THRESHOLD,
    BACK_THRESHOLD,
)

# For a NEW run, make sure run_id is unique; to RESUME an earlier run, pass that run's run_id.
run_experiment(
    chatbot_mistral,
    trunc_df,
    run_id,
    PROMPT,
    TEXT_COLUMN,
    SPLIT_COLUMN,
    TRAIN_SET,
    TEST_SET,
    LABEL_COLUMN,
    PREDICTION_PATH,
    OVERVIEW_PATH,
    MODEL_NAME,
    NUMBER_EXAMPLES,
)


In [None]:
# Load the pickled object stored at OVERVIEW_PATH and show it in the notebook.
# NOTE(review): presumably this is the overview table updated by run_experiment — confirm.
# NOTE(review): unpickling can execute arbitrary code; only read files produced by this project.
pred = pd.read_pickle(OVERVIEW_PATH)
display(pred)
