In [1]:
# Run the blobfuse shell script for the raadsinformatie storage
# (presumably mounts the blob container used below — TODO confirm).
# NOTE(review): absolute path under /home/azureuser, so this cell likely only works on that machine.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [1]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Load the configuration module that belongs to the selected environment.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail fast: otherwise `cf` stays undefined and a later cell breaks with a confusing NameError.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')


import os
if my_run == "azure":
    # Create the Hugging Face cache directory (including missing parents) and
    # point the transformers cache at it. `exist_ok=True` makes re-running this cell safe.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd
# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



### Notebook Overview
Goal: analyse the baselines to understand why they work so well.

In [2]:
# Load the stored LogisticRegression baseline predictions and the overview of baseline runs.
# NOTE(review): unpickling can execute arbitrary code; only read pickles from a trusted location.
predictions = pd.read_pickle(f"{cf.output_path}/predictionsFinal/baselines/LogisticRegressionpredictions.pkl")
overview = pd.read_pickle(f"{cf.output_path}/predictionsFinal/baselines/overview.pkl")

In [3]:
from transformers import AutoTokenizer
from collections import Counter
from sklearn.metrics import classification_report
import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import warnings
warnings.simplefilter('ignore')

def percentage_mistakes(count, total_predictions=1100):
    """Format a mistake count as a percentage of the total number of predictions.

    Args:
        count: Number of mistakes.
        total_predictions: Number of predictions the count is relative to.
            Defaults to 1100, the value that was previously hard-coded.

    Returns:
        The percentage rounded to one decimal, as a string ending in '%'.
    """
    return f"{round(count/total_predictions*100,1)}%"

def mistakes(df, detailed=False):
    """Print and display an overview of the wrong predictions per run.

    Three tables are shown:
      1. the number of mistakes per run, plus that count formatted by
         ``percentage_mistakes``;
      2. per run, the number of mistakes for every true class (0 when a class
         has no mistakes);
      3. per run and true class, the wrong prediction that occurred most often.

    The rows of all three tables are ordered by ``run_id``.

    Args:
        df: DataFrame with at least the columns 'run_id', 'label' and 'prediction'.
        detailed: Currently unused; kept so existing calls keep working.

    Returns:
        None. All output goes through ``print`` and ``display``.
    """
    print('MISTAKES. INCLUDES PREDICTIONS ERRORS.')

    class_list = pt.get_class_list()

    # select all responses where the prediction was not correct
    wrong = df.loc[df['label'] != df['prediction']]

    # groupby run_id so we get the amount of mistakes per run
    mistakes_per_run = wrong.groupby('run_id').size().reset_index(name='count')
    mistakes_per_run['percentage of total predictions'] = mistakes_per_run['count'].apply(percentage_mistakes)
    print("Total amount of mistakes per run:")
    display(mistakes_per_run)

    # Walk over the runs once, in sorted run_id order, and collect one record per
    # run for each table. The run ids are stored alongside the records so the row
    # labels can never get out of sync with the rows.
    run_ids = []
    count_records = []
    highest_records = []
    for run_id, run_mistakes in wrong.groupby('run_id'):
        run_ids.append(run_id)

        # count mistakes per true class; classes without mistakes get 0
        label_counts = Counter(run_mistakes['label'])
        count_records.append({category: label_counts.get(category, 0) for category in class_list})

        # per true class, count how often each (wrong) class was predicted
        confusions = run_mistakes.groupby('label')['prediction'].value_counts().reset_index(name='count')
        highest_per_class = {}
        for label in confusions['label'].unique():
            label_rows = confusions.loc[confusions['label'] == label]
            # idxmax returns an index label, so look the row up with .loc
            top = label_rows.loc[label_rows['count'].idxmax()]
            highest_per_class[label] = f"{top['prediction']} ({top['count']} out of {label_counts[label]})"
        highest_records.append(highest_per_class)

    df_count_class = pd.DataFrame(count_records, index=run_ids, columns=class_list)
    print("Amount of mistakes per class for each run:")
    display(df_count_class)

    # classes without mistakes in a run stay empty (NaN) in this table
    df_highest_class = pd.DataFrame(highest_records, index=run_ids, columns=class_list)
    print("Highest mistakes per class for each run:")
    display(df_highest_class.transpose())


def predictions_per_class(df):
    """Display, per run, how often each class was predicted.

    The displayed table has one row per class from ``pt.get_class_list()`` and
    one column per run, ordered by ``run_id``. A class that was never predicted
    in a run is shown as 0. Predictions that are not in the class list are not
    part of the table.

    Args:
        df: DataFrame with at least the columns 'run_id' and 'prediction'.

    Returns:
        None. The table is shown with ``display``.
    """
    class_list = pt.get_class_list()

    # Collect the run ids together with their records so the row labels always
    # match the rows.
    run_ids = []
    records = []
    for run_id, run_df in df.groupby('run_id'):
        predicted = Counter(run_df['prediction'])
        run_ids.append(run_id)
        records.append({category: predicted.get(category, 0) for category in class_list})

    df_predictions_per_class = pd.DataFrame(records, index=run_ids, columns=class_list)
    display(df_predictions_per_class.transpose())




In [4]:
# Show only the overview rows of the runs that also occur in the loaded predictions.
run_has_predictions = overview['run_id'].isin(predictions['run_id'])
display(overview[run_has_predictions])

Unnamed: 0,model,date,run_id,train_set,test_set,train_set_support,test_set_support,split_col,text_col,runtime,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,classification_report
0,LogisticRegression,2024-05-15 16:06:08.417730+02:00,LogisticRegression_fulltext,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,52.680659,0.874545,0.903397,0.874545,0.864052,precision recall f1-s...
0,LogisticRegression,2024-05-15 16:06:24.172787+02:00,LogisticRegression_first100_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back0,25.670804,0.854545,0.903185,0.854545,0.835388,precision recall f1-s...
0,LogisticRegression,2024-05-15 16:06:52.366648+02:00,LogisticRegression_first200_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront200Back0,38.030517,0.862727,0.907136,0.862727,0.846879,precision recall f1-s...
0,LogisticRegression,2024-05-15 16:07:30.577905+02:00,LogisticRegression_first100_last100,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,47.476278,0.852727,0.905185,0.852727,0.833257,precision recall f1-s...


In [5]:
# Error analysis of the loaded baseline runs: first the mistake tables,
# then the number of predictions per class for every run.
mistakes(predictions)
predictions_per_class(predictions)

MISTAKES. INCLUDES PREDICTIONS ERRORS.
Total amount of mistakes per run:


Unnamed: 0,run_id,count,percentage of total predictions
0,LogisticRegression_first100_last0,160,14.5%
1,LogisticRegression_first100_last100,162,14.7%
2,LogisticRegression_first200_last0,151,13.7%
3,LogisticRegression_fulltext,138,12.5%


Amount of mistakes per class for each run:


Unnamed: 0,Voordracht,Besluit,Schriftelijke Vraag,Brief,Raadsadres,Onderzoeksrapport,Raadsnotulen,Agenda,Motie,Actualiteit,Factsheet
LogisticRegression_first200_last0,1,8,8,5,3,6,3,1,6,24,86
LogisticRegression_first100_last0,2,8,8,5,5,4,2,3,6,27,90
LogisticRegression_fulltext,1,6,7,3,1,8,4,1,6,30,71
LogisticRegression_first100_last100,2,8,9,7,3,3,6,1,6,25,92


Highest mistakes per class for each run:


Unnamed: 0,LogisticRegression_first200_last0,LogisticRegression_first100_last0,LogisticRegression_fulltext,LogisticRegression_first100_last100
Voordracht,Besluit (1 out of 1),Besluit (1 out of 2),Besluit (1 out of 1),Besluit (1 out of 2)
Besluit,Raadsadres (6 out of 8),Raadsadres (8 out of 8),Raadsadres (4 out of 6),Raadsadres (5 out of 8)
Schriftelijke Vraag,Raadsadres (4 out of 8),Raadsadres (4 out of 8),Raadsadres (5 out of 7),Raadsadres (4 out of 9)
Brief,Motie (2 out of 5),Motie (2 out of 5),Agenda (1 out of 3),Onderzoeksrapport (3 out of 7)
Raadsadres,Onderzoeksrapport (3 out of 3),Onderzoeksrapport (4 out of 5),Schriftelijke Vraag (1 out of 1),Onderzoeksrapport (2 out of 3)
Onderzoeksrapport,Raadsadres (5 out of 6),Raadsadres (4 out of 4),Raadsadres (4 out of 8),Raadsadres (3 out of 3)
Raadsnotulen,Agenda (3 out of 3),Agenda (2 out of 2),Actualiteit (2 out of 4),Agenda (3 out of 6)
Agenda,Onderzoeksrapport (1 out of 1),Voordracht (2 out of 3),Schriftelijke Vraag (1 out of 1),Onderzoeksrapport (1 out of 1)
Motie,Brief (3 out of 6),Brief (3 out of 6),Brief (3 out of 6),Raadsadres (4 out of 6)
Actualiteit,Raadsadres (11 out of 24),Raadsadres (15 out of 27),Raadsadres (11 out of 30),Raadsadres (9 out of 25)


Unnamed: 0,LogisticRegression_first200_last0,LogisticRegression_first100_last0,LogisticRegression_fulltext,LogisticRegression_first100_last100
Voordracht,102,103,103,102
Besluit,95,96,96,96
Schriftelijke Vraag,92,92,98,91
Brief,99,99,104,96
Raadsadres,134,140,143,129
Onderzoeksrapport,183,185,149,200
Raadsnotulen,97,98,96,94
Agenda,105,102,105,105
Motie,99,101,104,100
Actualiteit,80,74,73,79


Bad pipe message: %s [b'\xa2\xdb\x88f\x96{:\xd3\xcdU\xc1Jx\xafy\x97\x8c\xb1 \x82o\xbbv\xdbx#7\xb6x\xae\xc4\xe9\x93f>{\xd9K~\xec.w\x00\xa1\x98\xb7\x99\xa5\xde\xee\\\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 l\x88\x7fG\x1a<\xd7u2\rY\xa1\\\x85\x91\xa4\xe7^\x9b\xf5\x86\xb1\xba\x06p,^{\x88\xc0\x1f\x1f']
Bad pipe message: %s [b'9I\x01\x9d\x9e\x80\xf4[\xd3\xe7\xfe\xc3\xecu~)\xac\xe0\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0']
Bad pipe message: %s [b"\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\

Mistakes: the models have the most trouble with Factsheet, which gets mistaken for Onderzoeksrapport. Factsheet is predicted very rarely, while Onderzoeksrapport is predicted a lot more often. Factsheets tend to contain a lot of image gibberish, just like Onderzoeksrapporten, which may explain the confusion.