In [1]:
# Run the blobfuse shell script for the raadsinformatie storage (presumably mounts the blob container — confirm).
# NOTE(review): absolute, user-specific path; this cell only works on the Azure compute instance.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [1]:
import os
import sys

import pandas as pd

# Make the project root importable so the config modules can be found.
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
# import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here with a clear message instead of a NameError on `cf` further down.
    raise ValueError(f"my_run must be 'azure' or 'local', got {my_run!r}")

if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parents and avoids the
    # check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Must be set before `transformers` is imported in a later cell.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

# set-up environment - GEITje-7b-chat InContextLearning:
# - install blobfuse -> sudo apt-get install blobfuse
# - pip install transformers
# - pip install torch
# - pip install accelerate
# - pip install jupyter
# - pip install ipywidgets

### Notebook Overview
Goal: get insight into the predictions made.

In [2]:
# Load the pickled frame `txtfiles_tokenizer.pkl` from the configured output folder.
# NOTE(review): `txt` is not referenced by any cell shown below — confirm it is still needed.
txt = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

In [43]:
from transformers import AutoTokenizer
from collections import Counter
from sklearn.metrics import classification_report
import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import warnings
warnings.simplefilter('ignore')


def get_tokens(model_name, df, text_col, new_col_name):
    """Tokenize a text column and store the tokens and their count on ``df``.

    Args:
        model_name: Name passed to ``AutoTokenizer.from_pretrained``.
        df: DataFrame holding the texts; it is modified in place.
        text_col: Name of the column with the texts to tokenize.
        new_col_name: Name of the column that receives the token lists. The
            token counts are written to ``count_<new_col_name>``.

    Returns:
        The same ``df`` object, with the two columns added.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # One token list per row, in row order.
    tokens_per_text = [tokenizer.tokenize(text) for text in df[text_col].values]

    df[new_col_name] = tokens_per_text
    df[f"count_{new_col_name}"] = [len(tokens) for tokens in tokens_per_text]
    return df

def format_label(label):
    """Return the label wrapped in the ``{'categorie': <label>}`` answer template.

    NOTE(review): the label is inserted without surrounding quotes — confirm
    that this is the intended "ideal" response format.
    """
    return f"{{'categorie': {label}}}"
    

def get_response_length(df, model_name):
    """Tokenize the responses and display per-run response-length statistics.

    The columns ``responseTokens``, ``count_responseTokens`` and
    ``label_formatted`` are added to ``df``. A ``describe()`` summary of the
    response token counts per ``run_id`` is displayed, with an extra ``IDEAL``
    row describing the token counts of the formatted true labels.

    Args:
        df: Predictions with at least the columns ``run_id``, ``response`` and
            ``label``. Must contain at least one row.
        model_name: Tokenizer name forwarded to ``get_tokens`` (GEITje, Mistral
            or Llama).

    Returns:
        ``df`` with the added columns.
    """
    # convert response column into tokens, using the model's tokenizer
    df = get_tokens(model_name, df, 'response', 'responseTokens')

    # convert label into ideal format
    df['label_formatted'] = df['label'].apply(format_label)

    # The ideal length is measured on the rows of the first run only. Take an
    # explicit copy: get_tokens adds columns to the frame it receives, and doing
    # that on a slice of `df` triggers pandas' SettingWithCopy behaviour.
    first_run_rows = df.loc[df['run_id'] == df.iloc[0]['run_id']].copy()
    df_ideal = get_tokens(model_name, first_run_rows, 'label_formatted', 'label_formattedTokens')

    # for each run, describe the response lengths
    description = df.groupby('run_id')['count_responseTokens'].describe()
    # Label the extra row directly. Appending at `len(description)` and renaming
    # afterwards would overwrite a run whose integer id equals the row count.
    description.loc['IDEAL'] = df_ideal['count_label_formattedTokens'].describe()

    print("RESPONSE LENGTH")
    display(description.round(1))
    return df
   

def prediction_errors(df):
    """Display an overview of responses from which no prediction could be extracted.

    Three tables are displayed:
      1. the number of prediction errors of each type per run,
      2. per run, how many error responses name 0, 1, 2, ... known classes and
         how often the true label is among them,
      3. per class and run, the error types ordered from most to least frequent.

    Args:
        df: Predictions with the columns ``run_id``, ``label``, ``prediction``
            and ``response``. Labels are compared against the lower-cased
            class names from ``pt.get_class_list()``.

    Returns:
        None. When ``df`` holds no prediction errors, a message is printed and
        no tables are displayed.
    """
    print('PREDICTION ERRORS')
    error_names = ['NoPredictionInOutput', 'MultiplePredictionErrorInFormatting','NoPredictionFormat', 'MultiplePredictionErrorInOutput']

    # only select rows that have a prediction error -> responses from which a prediction could not be extracted
    errors_df = df.loc[df['prediction'].isin(error_names)]

    if errors_df.empty:
        # Without this guard the per-class table below fails on an empty list.
        print("No prediction errors found.")
        return

    # Fix the run order once (order of first appearance) so that every table
    # uses the same, reproducible row order; iterating a set of strings can
    # differ between kernel sessions.
    run_ids = list(errors_df['run_id'].unique())
    class_names = list(pt.get_class_list())
    errors_by_run = {run_id: errors_df.loc[errors_df['run_id'] == run_id] for run_id in run_ids}

    # --- table 1: count of each error type per run (missing types count as 0)
    error_count_rows = []
    for run_id in run_ids:
        error_counts = Counter(errors_by_run[run_id]['prediction'])
        error_count_rows.append({error: error_counts.get(error, 0) for error in error_names})
    df_errors_count = pd.DataFrame(error_count_rows, index=run_ids, columns=error_names)
    df_errors_count['total'] = df_errors_count.sum(axis=1)

    print("Count of prediction errors for each run:")
    display(df_errors_count)

    # --- table 2: how many known classes are named in each error response
    max_named = 2  # always show at least the 0/1/2 columns
    class_count_rows = []
    for run_id in run_ids:
        subdf = errors_by_run[run_id]
        named_class_counts = Counter()
        correct_label_named = 0
        for _, row in subdf.iterrows():
            response_lower = row['response'].lower()
            # all labels that are named in the response (substring match)
            named = [category.lower() for category in class_names if category.lower() in response_lower]
            named_class_counts[len(named)] += 1
            max_named = max(max_named, len(named))
            # check whether the true label is named in the response
            if row['label'].lower() in named:
                correct_label_named += 1

        record = {f"responses with {k} classes": v for k, v in named_class_counts.items()}
        record['Correct label in response'] = f"{correct_label_named} out of {len(subdf)} prediction errors"
        class_count_rows.append(record)

    # Responses that name more than two classes get their own column instead of
    # being dropped silently.
    count_columns = [f"responses with {n} classes" for n in range(max_named + 1)]
    df_classes_in_response_count = pd.DataFrame(class_count_rows, index=run_ids).reindex(
        columns=count_columns + ['Correct label in response']
    )
    df_classes_in_response_count[count_columns] = (
        df_classes_in_response_count[count_columns].fillna(0).astype(int)
    )

    print('amount of class in responses:')
    display(df_classes_in_response_count)

    # --- table 3: for each run count the error types per class
    count_error_class_df_list = []
    for run_id in run_ids:
        subdf = errors_by_run[run_id]
        errors_count_per_class = dict()
        for category in class_names:
            class_df = subdf.loc[subdf['label'] == category.lower()]
            # most_common() orders the error types from highest to lowest count
            errors_count_per_class[category] = dict(Counter(class_df['prediction']).most_common())
        count_error_class_df_list.append(
            pd.DataFrame(list(errors_count_per_class.items()), columns=['Class', run_id])
        )

    # combine the per-run frames into one, one column per run
    df_errors_per_class = count_error_class_df_list[0]
    for count_df in count_error_class_df_list[1:]:
        df_errors_per_class = df_errors_per_class.merge(count_df, on='Class', how='outer')

    print('Amount of errors type per class per run:')
    # Widen the columns only for this display so the global pandas option is
    # left untouched.
    with pd.option_context('display.max_colwidth', None):
        display(df_errors_per_class)



def evaluation_metrics(df):
    """Print sklearn's classification report for ``label`` versus ``prediction``.

    The report is computed over all rows of ``df``; if ``df`` holds several
    runs they are pooled into one report.

    Args:
        df: Predictions with the columns ``label`` and ``prediction``.
    """
    # Silence UserWarnings only while the report is computed, so the
    # process-wide warning filters are restored afterwards.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        report = classification_report(df['label'], df['prediction'])
    print('EVALUATION METRICS')
    print(report)


def percentage_mistakes(count, total=1100):
    """Format ``count`` as a percentage of ``total``, rounded to one decimal.

    Args:
        count: Number of mistakes.
        total: Number of predictions the count is relative to. Defaults to
            1100, the value that was previously hard-coded.

    Returns:
        A string such as ``"42.4%"``.
    """
    return f"{round(count / total * 100, 1)}%"

def mistakes(df, detailed=False):
    """Display an overview of the wrong predictions, prediction errors included.

    Three tables are displayed:
      1. the number of mistakes per run and their share of that run's predictions,
      2. the number of mistakes per true class for each run,
      3. per true class and run, the most frequent wrong prediction.

    Args:
        df: Predictions with the columns ``run_id``, ``label`` and ``prediction``.
        detailed: Currently unused; kept so existing calls keep working.
    """
    print('MISTAKES. INCLUDES PREDICTIONS ERRORS.')

    # select all responses where the prediction was not correct
    wrong = df.loc[df['label'] != df['prediction']]

    # Mistakes per run, relative to the actual number of predictions of that
    # run rather than a fixed 1100.
    totals_per_run = df.groupby('run_id').size()
    mistakes_per_run = wrong.groupby('run_id').size().reset_index(name='count')
    mistakes_per_run['percentage of total predictions'] = [
        f"{round(count / totals_per_run.loc[run_id] * 100, 1)}%"
        for run_id, count in zip(mistakes_per_run['run_id'], mistakes_per_run['count'])
    ]
    print("Total amount of mistakes per run:")
    display(mistakes_per_run)

    # One fixed run order (order of first appearance) shared by both tables
    # below; iterating a set of strings can differ between kernel sessions.
    run_ids = list(wrong['run_id'].unique())
    class_names = [category.lower() for category in pt.get_class_list()]

    # for each run get the amount of mistakes per class (0 when a class has none)
    class_count_rows = []
    for run_id in run_ids:
        label_counts = Counter(wrong.loc[wrong['run_id'] == run_id, 'label'])
        class_count_rows.append({name: label_counts.get(name, 0) for name in class_names})
    df_count_class = pd.DataFrame(class_count_rows, index=run_ids, columns=class_names)
    print("Amount of mistakes per class for each run:")
    display(df_count_class)

    # for each run get, per class, the wrong prediction that occurs most often
    highest_rows = []
    for run_id in run_ids:
        subdf = wrong.loc[wrong['run_id'] == run_id]
        mistakes_per_label = Counter(subdf['label'])

        pair_counts = subdf.groupby('label')['prediction'].value_counts().reset_index(name='count')
        highest_per_class = dict()
        for label in set(pair_counts['label']):
            label_rows = pair_counts.loc[pair_counts['label'] == label]
            # idxmax returns an index *label*, so look the row up with .loc
            top_row = label_rows.loc[label_rows['count'].idxmax()]
            highest_per_class[label] = f"{top_row['prediction']} ({top_row['count']} out of {mistakes_per_label[label]})"
        highest_rows.append(highest_per_class)

    # classes without mistakes in a run stay empty (NaN) in this table
    df_highest_class = pd.DataFrame(highest_rows, index=run_ids, columns=class_names)
    print("Highest mistakes per class for each run:")
    display(df_highest_class.transpose())
   

def runtime(df):
    """Display runtime statistics per run and per class.

    Three tables are displayed:
      1. ``describe()`` of the runtime per document for each run,
      2. per run and class, the number of documents whose runtime exceeds that
         run's 75th percentile (0 when a class has none),
      3. the average runtime per document for each class and run.

    Args:
        df: Predictions with the columns ``run_id``, ``label`` and ``runtime``.
    """
    print("RUNTIME")

    class_names = [category.lower() for category in pt.get_class_list()]
    # One fixed run order (order of first appearance) for all tables; iterating
    # a set of strings can differ between kernel sessions.
    run_ids = list(df['run_id'].unique())

    long_runtime_rows = []
    average_runtime_rows = []
    for run_id in run_ids:
        subdf = df.loc[df['run_id'] == run_id]

        # documents slower than this run's own 75th percentile, counted per class
        percentile_75 = subdf['runtime'].quantile(0.75)
        above_75th_percentile = subdf.loc[subdf['runtime'] > percentile_75]
        slow_counts = Counter(above_75th_percentile['label'])
        long_runtime_rows.append({name: slow_counts.get(name, 0) for name in class_names})

        # average runtime per class
        average_runtime_rows.append(subdf.groupby('label')['runtime'].mean().to_dict())

    # Build each frame in one go instead of growing it row by row.
    df_count_long_runtimes = pd.DataFrame(long_runtime_rows, index=run_ids, columns=class_names)
    df_average_runtime = pd.DataFrame(average_runtime_rows, index=run_ids, columns=class_names)

    print('Description of runtime per doc:')
    display(df.groupby('run_id')['runtime'].describe().round(1))

    print("Amount of docs that took longer than 75th percentile per class:")
    display(df_count_long_runtimes)

    print('Average runtime per doc for each class')
    display(df_average_runtime.transpose().sort_values(by=run_ids).round(1))


## Truncation Experiment

#### GEITje

In [10]:
# truncation experiment: load the GEITje zero-shot predictions of the three truncation settings
# NOTE(review): read_pickle executes arbitrary code on load — only use with trusted files.
predictions = pd.read_pickle(f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First100Last0Predictions.pkl")
predictions2 =  pd.read_pickle(f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First200Last0Predictions.pkl")
predictions3 =  pd.read_pickle(f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First100Last100Predictions.pkl")
# ignore_index gives the combined frame a unique index; otherwise every index
# label occurs once per run and label-based lookups return several rows.
combined = pd.concat([predictions, predictions2, predictions3], ignore_index=True)


In [16]:
# Tokenize every response with the GEITje tokenizer and show the per-run length statistics;
# the returned frame carries the added token columns.
df_tokenizes_responses = get_response_length(combined, 'Rijgersberg/GEITje-7B-chat-v2')

RESPONSE LENGTH


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,1100.0,16.1,5.8,6.0,14.0,16.0,18.0,136.0
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,1100.0,16.3,7.0,2.0,14.0,16.0,18.0,236.0
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,1100.0,16.2,7.7,4.0,14.0,16.0,18.0,240.0
IDEAL,1100.0,9.3,1.6,7.0,8.0,9.0,11.0,12.0


In [34]:
# Print two sampled responses with their token counts. The seed is fixed so the
# examples discussed in the text below stay the same across re-runs.
randomly_selected_rows = df_tokenizes_responses.sample(n=2, random_state=42)
for index, row in randomly_selected_rows.iterrows():
    print(row['count_responseTokens'])
    print(row['response'])

print('\n')
# Print the longest response (first one if several share the maximum) and its token count.
response_token_counts = df_tokenizes_responses['count_responseTokens']
max_response_length_row = df_tokenizes_responses.loc[response_token_counts == response_token_counts.max()].iloc[0]
print(max_response_length_row['response'])
print(max_response_length_row['count_responseTokens'])

16
{
    "categorie": "Voordracht"
}
14
{
    "categorie": "Brief"
}


Technische vragen — actualiteit preventief fouilleren

Voorafgaand aan onderstaande beantwoording wordt opgemerkt dat de vragen zich richten tot
het college, maar dat het aanwijzen van veiligheidsrisicogebieden om wapencontroles te
houden geen collegebevoegdheid betreft. Het betreft een burgemeestersbevoegdheid, die in
afstemming met politie en OM wordt toegepast.

1. Er waren 5 locaties bepaald binnen elk van de vijf aangewezen
veiligheidsrisicogebieden. Ik neem aan dat elke politiebureau vóór aanvang van de
pilot op de hoogte was gebracht over elk van de locatie in zijn of haar eigen
veiligheidsr

    Vul in met de categorie van het document: {'categorie': 'Actualiteit'}     
    
240


Response length: across the runs the response lengths are fairly consistent, although truncations 100/100 and 200 have a longer right tail. The 25th to 75th percentiles are nearly identical. The runs come quite close to the ideal response length, but they are somewhat longer than the ideal response. Looking at the three examples, we can see that this difference is caused by the extra line breaks and whitespace in the JSON, which is not problematic. However, there is a big difference between the runs and the ideal in the right tail. The example of the longest response shows that such responses are indeed far too long: the model first continues the document text before giving its answer. Overall, the response length is quite good.

In [39]:
# Show runtime statistics per run and per class for the three truncation settings.
runtime(combined)

RUNTIME
Description of runtime per doc:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,1100.0,29.1,8.4,15.5,26.3,29.2,31.5,204.0
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,1100.0,30.8,10.4,12.3,28.4,30.0,32.5,358.6
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,1100.0,32.8,11.7,14.1,30.5,32.5,34.3,375.2


Amount of docs that took longer than 75th percentile per class:


Unnamed: 0,voordracht,besluit,schriftelijke vraag,brief,raadsadres,onderzoeksrapport,raadsnotulen,agenda,motie,actualiteit,factsheet
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,9,11,31,56,10,40,54,11,13,19,21
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,6,12,52,49,12,40,60,7,3,13,21
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,16,14,59,53,6,32,45,8,7,17,18


Average runtime per doc for each class


Unnamed: 0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0
motie,26.4,30.8,29.4
agenda,26.9,30.4,28.9
schriftelijke vraag,27.7,34.5,32.5
factsheet,28.1,31.3,29.4
besluit,28.8,32.4,29.8
raadsadres,28.9,30.2,29.0
voordracht,29.2,32.3,30.8
onderzoeksrapport,30.1,34.3,30.9
brief,31.0,34.1,31.4
actualiteit,31.2,35.7,33.6


Average runtime per doc: on average the runs have quite similar runtimes, although truncation methods 100/100 and 200 have a heavier right tail.

Documents per class that take longer than the 75th percentile: the biggest difference between the runs is for the class schriftelijke vraag, where truncation 100 has far fewer documents above the 75th percentile than the other truncations. Noticeably, some classes have many documents above the 75th percentile, for example raadsnotulen, brief and schriftelijke vraag.

Average runtime per class: although some classes have significantly more documents above the 75th percentile, the averages across classes are quite similar, with a maximum difference of about 5 seconds per doc. Between runs there are also some differences in the average runtime per doc of a class, in most cases no more than about 5 seconds (schriftelijke vraag is the exception, at almost 7 seconds). We should note, though, that 5 seconds per document adds up quickly over many documents.

In [44]:
# Break down the responses from which no prediction could be extracted, per run and per class.
prediction_errors(combined)

PREDICTION ERRORS
Count of prediction errors for each run:


Unnamed: 0,NoPredictionInOutput,MultiplePredictionErrorInFormatting,NoPredictionFormat,MultiplePredictionErrorInOutput,total
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,7,0,107,4,118
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,18,0,62,3,83
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,8,0,22,3,33


amount of class in responses:


Unnamed: 0,responses with 0 classes,responses with 1 classes,responses with 2 classes,Correct label in response
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,9,105,4,84 out of 118 prediction errors
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,20,60,3,38 out of 83 prediction errors
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,8,19,6,13 out of 33 prediction errors


Amount of errors type per class per run:


Unnamed: 0,Class,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0
0,Actualiteit,"{'NoPredictionFormat': 12, 'NoPredictionInOutput': 2}","{'NoPredictionFormat': 5, 'NoPredictionInOutput': 1}","{'NoPredictionFormat': 3, 'NoPredictionInOutput': 2}"
1,Agenda,{'NoPredictionFormat': 3},"{'NoPredictionFormat': 7, 'NoPredictionInOutput': 1}",{'NoPredictionFormat': 2}
2,Besluit,"{'NoPredictionFormat': 6, 'MultiplePredictionErrorInOutput': 4}","{'NoPredictionFormat': 15, 'MultiplePredictionErrorInOutput': 3, 'NoPredictionInOutput': 2}","{'NoPredictionFormat': 8, 'MultiplePredictionErrorInOutput': 3}"
3,Brief,{'NoPredictionFormat': 1},{'NoPredictionFormat': 1},{}
4,Factsheet,{'NoPredictionFormat': 3},"{'NoPredictionFormat': 5, 'NoPredictionInOutput': 1}","{'NoPredictionFormat': 2, 'NoPredictionInOutput': 2}"
5,Motie,{'NoPredictionFormat': 22},"{'NoPredictionInOutput': 5, 'NoPredictionFormat': 3}",{}
6,Onderzoeksrapport,"{'NoPredictionFormat': 5, 'NoPredictionInOutput': 3}","{'NoPredictionInOutput': 6, 'NoPredictionFormat': 4}","{'NoPredictionFormat': 3, 'NoPredictionInOutput': 3}"
7,Raadsadres,"{'NoPredictionFormat': 13, 'NoPredictionInOutput': 1}","{'NoPredictionFormat': 8, 'NoPredictionInOutput': 1}",{'NoPredictionFormat': 3}
8,Raadsnotulen,{},{'NoPredictionFormat': 1},{}
9,Schriftelijke Vraag,"{'NoPredictionFormat': 41, 'NoPredictionInOutput': 1}","{'NoPredictionFormat': 4, 'NoPredictionInOutput': 1}","{'NoPredictionInOutput': 1, 'NoPredictionFormat': 1}"


Total errors per run: there is a very big difference in errors between runs. Truncation 100 has by far the most, almost four times as many as 100/100. Truncation 200 is also quite bad. It is noticeable that there are no MultiplePredictionErrorInFormatting errors. Most errors occur because no JSON format is given in the response.

Classes per response: most responses do contain exactly one class, which we were not able to extract, probably because it was not given in JSON format. Additionally, quite a few responses contain the right label, although we did not check whether those responses also contain a second class. More than half of the error responses for truncation 100 contain the right class but were not extracted, meaning that much improvement can be made.


In [8]:
# Show the wrong predictions (prediction errors included) per run and per class.
mistakes(combined, False)


MISTAKES. INCLUDES PREDICTIONS ERRORS.
Total amount of mistakes per run:


Unnamed: 0,run_id,count,percentage of total predictions
0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,466,42.4%
1,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,373,33.9%
2,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,380,34.5%


Amount of mistakes per class for each run:


Unnamed: 0,voordracht,besluit,schriftelijke vraag,brief,raadsadres,onderzoeksrapport,raadsnotulen,agenda,motie,actualiteit,factsheet
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,15,49,48,12,71,90,0,26,55,54,46
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,9,44,16,6,61,90,2,21,32,43,56
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,11,39,9,5,65,89,10,31,29,44,41


Highest mistakes per class for each run:


Unnamed: 0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0
voordracht,raadsnotulen (14 out of 15),NoPredictionFormat (9 out of 9),raadsnotulen (10 out of 11)
besluit,voordracht (22 out of 49),voordracht (23 out of 44),voordracht (17 out of 39)
schriftelijke vraag,NoPredictionFormat (41 out of 48),besluit (6 out of 16),brief (5 out of 9)
brief,raadsnotulen (10 out of 12),raadsnotulen (5 out of 6),raadsnotulen (3 out of 5)
raadsadres,brief (36 out of 71),brief (44 out of 61),brief (47 out of 65)
onderzoeksrapport,raadsnotulen (50 out of 90),raadsnotulen (53 out of 90),raadsnotulen (48 out of 89)
raadsnotulen,,NoPredictionFormat (1 out of 2),raadsadres (10 out of 10)
agenda,raadsnotulen (15 out of 26),NoPredictionFormat (7 out of 21),raadsnotulen (15 out of 31)
motie,NoPredictionFormat (22 out of 55),besluit (12 out of 32),besluit (14 out of 29)
actualiteit,raadsnotulen (18 out of 54),besluit (9 out of 43),raadsnotulen (12 out of 44)


In [53]:
# Parked inspection code: print ten randomly sampled 'raadsadres' documents that were predicted wrongly.
# NOTE(review): the whole inequality must be inside the parentheses; `!=` binds
# more loosely than `&`, so the earlier form compared the combined mask to the prediction.
# onderzoek = combined.loc[(combined['label']=='raadsadres') & (combined['label']!=combined['prediction'])]
# onderzoek = onderzoek.sample(n=10)
# for index, row in onderzoek.iterrows():
#     print(row['prediction'])
#     print(row['prompt'])
# print(len(onderzoek))
# # display(onderzoek)

Mistakes per class: the onderzoeksrapport class does horribly: about 90% of its documents are predicted wrongly in all three runs, either as mistakes or as prediction errors. Onderzoeksrapporten mostly get mistaken for raadsnotulen (about 50 out of 90 mistakes). Looking at the documents, we can see that the beginnings of those docs are messy and cover very different topics, which explains the bad classification. 
Raadsadressen also do quite badly and mostly get mistaken for Brief. Looking at those docs, that is not very surprising.