In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [1]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
# import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


import pandas as pd

# set-up environment - GEITje-7b-chat InContextLearning:
# - install blobfuse -> sudo apt-get install blobfuse
# - pip install transformers
# - pip install torch
# - pip install accelerate
# - pip install jupyter
# - pip install ipywidgets

### Notebook Overview
Goal: get insight into the predictions made.

In [2]:
txt = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

In [30]:
from transformers import AutoTokenizer
from collections import Counter
from sklearn.metrics import classification_report
import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import warnings
warnings.simplefilter('ignore')


def get_tokens(model_name, df, text_col, new_col_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    all_texts = list(df[text_col].values)

    all_tokens = []
    all_tokens_len = []
    for txt in all_texts:
        tokens = tokenizer.tokenize(txt)
        all_tokens.append(tokens)
        all_tokens_len.append(len(tokens))

    df[new_col_name] = all_tokens
    df[f"count_{new_col_name}"] = all_tokens_len
    return df

def format_label(label):
    format = f"""{{'categorie': {label}}}"""
    return format
    

def get_response_length(df, model_name):
    # convert response column into tokens, using models tokenizer (so either geitje, mistral or Llama)
    df = get_tokens(model_name, df, 'response', 'responseTokens')

    # convert label into ideal format
    df['label_formatted'] = df['label'].apply(format_label)
    # convert ideal format into tokens, so that we know the ideal length of the responses
    df_ideal = get_tokens(model_name, df.loc[df['run_id']==df.iloc[0]['run_id']], 'label_formatted', 'label_formattedTokens')

    # for each run, describe the response lengths
    description = df.groupby('run_id')['count_responseTokens'].describe()
    description.loc[len(description)] = df_ideal['count_label_formattedTokens'].describe()
    
    print("RESPONSE LENGTH")
    display(description.rename(index={description.index[-1]: 'IDEAL'}))
   

def prediction_errors(df):
    error_names = ['NoPredictionInOutput', 'MultiplePredictionErrorInFormatting','NoPredictionFormat', 'MultiplePredictionErrorInOutput']

    # only select row that have prediction error -> response of which a prediction could not be extracted. 
    errors_df = df.loc[df['prediction'].isin(error_names)]

    # count for each error the instances
    count = dict(Counter(errors_df['prediction']))

    # check if all errors are included, else set that error to 0
    for error in error_names:
        if error not in count.keys():
            count[error]=0

    # format into df to display
    class_count = errors_df.groupby('prediction')['label'].value_counts().reset_index(name='count')

    classes_in_responses = []
    correct_class_in_response = []
    for index, row in errors_df.iterrows():
        # for each response, return list with all labels that are named in response
        classes_in_response = [category.lower() for category in pt.get_class_list() if category.lower() in row['response'].lower()]
        classes_in_responses.append(classes_in_response)

        # for each response, check if true label is named in response.
        if row['label'].lower() in classes_in_response:
            correct_class_in_response.append(True)
        else:
            correct_class_in_response.append(False)

    # count how many classes are named in a response
    amount_of_classes = dict(Counter([len(response) for response in classes_in_responses]))

    # fomat print statement
    print_amount_of_classes = ''
    for amount in amount_of_classes.keys():
        print_amount_of_classes += f'There are {amount_of_classes[amount]} response that contain {amount} classes.'

        
    print('PREDICTION ERRORS')
    print(count)
    print(print_amount_of_classes)
    print(f"{correct_class_in_response.count(True)} responses out of {len(errors_df)} ({round(correct_class_in_response.count(True)/len(errors_df)*100,1)}%) prediction errors contain the correct label.")
    display(class_count)

def evaluation_metrics(df):
    warnings.filterwarnings("ignore", category=UserWarning)
    report = classification_report(df['label'], df['prediction'])
    print('EVALUATION METRICS')
    print(report)


def mistakes(df, detailed=False):
    mistakes = df.loc[df['label'] != df['prediction']]
    count_mistakes_per_class = dict(Counter(mistakes['label']))

    class_count = mistakes.groupby('label')['prediction'].value_counts().reset_index(name='count')
    print_class_count = 'Out of the mistakes, for each class:\n'
    for label in set(class_count['label']):
        highest_count_row = class_count.iloc[class_count[class_count['label'] == label]['count'].idxmax()]
        print_class_count += f"{label} gets most predicted as {highest_count_row['prediction']} ({highest_count_row['count']} out of {count_mistakes_per_class[label]})\n"

    print('MISTAKES. INCLUDES PREDICTIONS ERRORS.')
    print(f"{len(mistakes)} out of {len(df)} ({round(len(mistakes)/len(df)*100,1)}%) predictions are wrong.")
    print(f"Number of mistakes per class: {dict(sorted(count_mistakes_per_class.items(), key=lambda item: item[1], reverse=True))}")
    print(print_class_count)


    if detailed==True:
        display(class_count)

def runtime(df):
    print("RUNTIME")

    # select all docs that have runtime longer than 75th percentile.
    percentile_75 = df['runtime'].quantile(0.75)

    # count how many docs for each class take longer than 75th percentile
    df_count_long_runtimes = pd.DataFrame(columns=[category.lower() for category in pt.get_class_list()])

    # calculate average runtime per doc for each class
    df_average_runtime = pd.DataFrame(columns=[category.lower() for category in pt.get_class_list()])

    # for each run_id, calculate average runtime and count how many docs exceed 75th percentile
    for run_id in set(df['run_id']):
        subdf = df.loc[df['run_id']==run_id]

        # get very long runtimes
        percentile_75 = subdf['runtime'].quantile(0.75)
        above_75th_percentile = subdf[subdf['runtime'] > percentile_75]
        count_long_runtimes_per_class = dict(Counter(above_75th_percentile['label']))
        df_count_long_runtimes.loc[len(df_count_long_runtimes)] = count_long_runtimes_per_class

        # average runtime per class
        average_runtime_per_class = subdf.groupby('label')['runtime'].mean()
        df_average_runtime.loc[len(df_average_runtime)] = average_runtime_per_class

    df_count_long_runtimes.index = list(set(df['run_id']))
    df_average_runtime.index = list(set(df['run_id']))

    print('Description of runtime per doc:')
    display(df.groupby('run_id')['runtime'].describe())

    print(f"Amount of docs that took longer than 75th percentile per class:")
    display(df_count_long_runtimes)

    print('Average runtime per doc for each class')
    display(df_average_runtime.transpose().sort_values(by=list(set(df['run_id']))).round(1))


#### GEITje

In [31]:
# in-context learning: zero-
predictions = pd.read_pickle(f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First100Last0Predictions.pkl")
predictions2 =  pd.read_pickle(f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First200Last0Predictions.pkl")
predictions3 =  pd.read_pickle(f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First100Last100Predictions.pkl")
combined = pd.concat([predictions, predictions2, predictions3])
# evaluation_metrics(predictions)


In [32]:
get_response_length(combined, 'Rijgersberg/GEITje-7B-chat-v2')

RESPONSE LENGTH


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,1100.0,16.062727,5.847522,6.0,14.0,16.0,18.0,136.0
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,1100.0,16.266364,7.003506,2.0,14.0,16.0,18.0,236.0
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,1100.0,16.2,7.677887,4.0,14.0,16.0,18.0,240.0
IDEAL,1100.0,9.272727,1.601348,7.0,8.0,9.0,11.0,12.0


In [33]:
runtime(combined)

RUNTIME
Description of runtime per doc:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,1100.0,29.072228,8.35857,15.46493,26.34445,29.156197,31.54701,204.018789
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,1100.0,30.80486,10.431518,12.26292,28.439818,30.02864,32.51871,358.624361
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,1100.0,32.834855,11.719529,14.116654,30.475324,32.522379,34.299403,375.201662


Amount of docs that took longer than 75th percentile per class:


Unnamed: 0,voordracht,besluit,schriftelijke vraag,brief,raadsadres,onderzoeksrapport,raadsnotulen,agenda,motie,actualiteit,factsheet
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,6,12,52,49,12,40,60,7,3,13,21
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,9,11,31,56,10,40,54,11,13,19,21
IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0,16,14,59,53,6,32,45,8,7,17,18


Average runtime per doc for each class


Unnamed: 0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens200_0traintest_numEx0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_0traintest_numEx0,IC_GEITje-7B-chat-v2zeroshot_prompt_geitjeLlamaTokens100_100traintest_numEx0
raadsadres,30.2,28.9,29.0
agenda,30.4,26.9,28.9
motie,30.8,26.4,29.4
factsheet,31.3,28.1,29.4
voordracht,32.3,29.2,30.8
besluit,32.4,28.8,29.8
brief,34.1,31.0,31.4
onderzoeksrapport,34.3,30.1,30.9
schriftelijke vraag,34.5,27.7,32.5
raadsnotulen,35.2,31.6,33.0


In [149]:
prediction_errors(predictions)

PREDICTION ERRORS
{'NoPredictionInOutput': 109, 'NoPredictionFormat': 14, 'MultiplePredictionErrorInOutput': 9, 'MultiplePredictionErrorInFormatting': 3}
There are 70 response that contain 0 classes.There are 12 response that contain 2 classes.There are 44 response that contain 1 classes.There are 1 response that contain 5 classes.There are 2 response that contain 11 classes.There are 2 response that contain 3 classes.There are 2 response that contain 7 classes.There are 1 response that contain 4 classes.There are 1 response that contain 10 classes.
22 responses out of 135 (16.3%) prediction errors contain the correct label.


Unnamed: 0,prediction,label,count
0,MultiplePredictionErrorInFormatting,actualiteit,1
1,MultiplePredictionErrorInFormatting,factsheet,1
2,MultiplePredictionErrorInFormatting,motie,1
3,MultiplePredictionErrorInOutput,besluit,9
4,NoPredictionFormat,factsheet,8
5,NoPredictionFormat,onderzoeksrapport,5
6,NoPredictionFormat,raadsnotulen,1
7,NoPredictionInOutput,schriftelijke vraag,31
8,NoPredictionInOutput,onderzoeksrapport,15
9,NoPredictionInOutput,actualiteit,12


In [150]:
mistakes(predictions, False)


MISTAKES. INCLUDES PREDICTIONS ERRORS.
635 out of 1100 (57.7%) predictions are wrong.
Number of mistakes per class: {'brief': 99, 'onderzoeksrapport': 97, 'actualiteit': 92, 'factsheet': 91, 'raadsadres': 78, 'schriftelijke vraag': 48, 'motie': 45, 'besluit': 37, 'agenda': 23, 'voordracht': 17, 'raadsnotulen': 8}
Out of the mistakes, for each class:
besluit gets most predicted as voordracht (19 out of 37)
brief gets most predicted as raadsnotulen (44 out of 99)
factsheet gets most predicted as besluit (35 out of 91)
raadsnotulen gets most predicted as NoPredictionInOutput (5 out of 8)
schriftelijke vraag gets most predicted as NoPredictionInOutput (31 out of 48)
voordracht gets most predicted as besluit (7 out of 17)
onderzoeksrapport gets most predicted as besluit (38 out of 97)
actualiteit gets most predicted as agenda (29 out of 92)
motie gets most predicted as besluit (15 out of 45)
agenda gets most predicted as NoPredictionInOutput (10 out of 23)
raadsadres gets most predicted as 