In [1]:
# Run the project's blobfuse shell script before any other cell.
# NOTE(review): presumably this mounts the blob storage that holds the documents — confirm.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [1]:
import os
import sys

import pandas as pd

# Extend the import path with the parent directory
# (presumably where config_azure / config live — confirm).
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
# import settings as st

# Load the configuration module that matches the selected environment.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here with a clear message instead of a NameError on `cf` in a later cell.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # Create the Hugging Face cache directory if needed. makedirs(exist_ok=True)
    # also creates missing parent directories and does not fail when the
    # directory already exists (no check-then-create race).
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Must be set before `transformers` is imported in a later cell.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

# set-up environment - GEITje-7b-chat InContextLearning:
# - install blobfuse -> sudo apt-get install blobfuse
# - pip install transformers
# - pip install torch
# - pip install accelerate
# - pip install jupyter
# - pip install ipywidgets

### Notebook Overview
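Goal: gain insight into the model's predictions: response length, prediction (extraction) errors, evaluation metrics, misclassifications and runtime.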

In [2]:
# Load the pickled "txtfiles_tokenizer" object from the configured output directory.
# NOTE(review): pd.read_pickle can execute arbitrary code — only load pickle files from a trusted source.
txt = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

In [125]:
from transformers import AutoTokenizer
from collections import Counter
from sklearn.metrics import classification_report
import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import warnings


def get_tokens(model_name, df, text_col, new_col_name):
    """Tokenize a text column and store the tokens and their counts on ``df``.

    Args:
        model_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        df: DataFrame holding the texts; it is modified in place.
        text_col: Name of the column with the texts to tokenize.
        new_col_name: Name of the column that receives the token lists. The
            token counts are written to the column named
            ``"count_" + new_col_name``.

    Returns:
        The same ``df`` object, with the two new columns added.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize every text once; the counts are derived from the token lists.
    tokens_per_text = [tokenizer.tokenize(text) for text in df[text_col].values]
    token_counts = [len(tokens) for tokens in tokens_per_text]

    df[new_col_name] = tokens_per_text
    df[f"count_{new_col_name}"] = token_counts
    return df

def format_label(label):
    """Wrap a label in the dict-like ``{'categorie': LABEL}`` response format.

    Args:
        label: The class label to embed; it is inserted as-is, without quotes.

    Returns:
        The formatted string, e.g. ``{'categorie': brief}`` for ``"brief"``.
    """
    return f"{{'categorie': {label}}}"
    

def get_response_length(df, model_name):
    """Print the mean/std token length of the ideal and the predicted responses.

    The ideal response is the true label wrapped by ``format_label``; the
    predicted response is the ``response`` column. Both are tokenized with
    ``get_tokens``, which adds the token and count columns to ``df`` in place.

    Args:
        df: DataFrame with the columns ``response`` and ``label``.
        model_name: Tokenizer name or path forwarded to ``get_tokens``.

    Returns:
        None. The statistics are printed.
    """
    df = get_tokens(model_name, df, 'response', 'responseTokens')
    df['label_formatted'] = df['label'].apply(format_label)
    df = get_tokens(model_name, df, 'label_formatted', 'label_formattedTokens')

    print("RESPONSE LENGTH")

    # Summarize each count column once and read mean/std from the summary.
    ideal_stats = df['count_label_formattedTokens'].describe()
    print(f"Average tokens of IDEAL response: {round(ideal_stats['mean'],1)} tokens (std = {round(ideal_stats['std'], 1)}) ")

    predicted_stats = df['count_responseTokens'].describe()
    print(f"Average tokens of PREDICTION response: {round(predicted_stats['mean'],1)} tokens (std = {round(predicted_stats['std'], 1)}) \n")
   

def prediction_errors(df):
    """Summarize the responses from which no prediction could be extracted.

    Prints how often each prediction-error type occurs, how many class names
    are mentioned in the erroneous responses, and how many of those responses
    mention the true label. Finally displays the number of errors per error
    type and true label.

    When ``df`` contains no prediction errors, only the (all-zero) error
    counts are printed; the percentage and the table are skipped so the
    function does not divide by zero.

    Args:
        df: DataFrame with the columns ``prediction``, ``label`` and
            ``response``. Rows whose ``prediction`` equals one of the error
            names are treated as prediction errors.

    Returns:
        None. The summary is printed and displayed.
    """
    error_names = ['NoPredictionInOutput', 'MultiplePredictionErrorInFormatting','NoPredictionFormat', 'MultiplePredictionErrorInOutput']

    # Only keep rows with a prediction error, i.e. responses from which no
    # prediction could be extracted.
    errors_df = df.loc[df['prediction'].isin(error_names)]

    # Count the instances of each error; errors that never occur are reported as 0.
    count = dict(Counter(errors_df['prediction']))
    for error in error_names:
        count.setdefault(error, 0)

    if errors_df.empty:
        # Nothing to analyse; avoid the division by len(errors_df) below.
        print('PREDICTION ERRORS')
        print(count)
        print('No prediction errors found.')
        return

    # Error counts per error type and true label, as a frame for display.
    class_count = errors_df.groupby('prediction')['label'].value_counts().reset_index(name='count')

    # The class list is the same for every row, so fetch and lowercase it once.
    known_classes = [category.lower() for category in pt.get_class_list()]

    classes_in_responses = []
    correct_class_in_response = []
    for response, label in zip(errors_df['response'], errors_df['label']):
        # All known classes that are named somewhere in this response.
        response_lower = response.lower()
        classes_in_response = [category for category in known_classes if category in response_lower]
        classes_in_responses.append(classes_in_response)

        # Is the true label among the classes named in the response?
        correct_class_in_response.append(label.lower() in classes_in_response)

    # How many responses name 0, 1, 2, ... classes.
    amount_of_classes = Counter(len(classes) for classes in classes_in_responses)

    # One sentence per line (previously the sentences were glued together).
    print_amount_of_classes = '\n'.join(
        f'There are {n_responses} response that contain {n_classes} classes.'
        for n_classes, n_responses in amount_of_classes.items()
    )

    n_correct = correct_class_in_response.count(True)

    print('PREDICTION ERRORS')
    print(count)
    print(print_amount_of_classes)
    print(f"{n_correct} responses out of {len(errors_df)} ({round(n_correct/len(errors_df)*100,1)}%) prediction errors contain the correct label.")
    display(class_count)

def evaluation_metrics(df):
    """Print the classification report for the predictions in ``df``.

    ``UserWarning``s raised while the report is computed are suppressed, but
    only for the duration of that call: the warning filters are restored
    afterwards so unrelated warnings elsewhere in the notebook stay visible.

    Args:
        df: DataFrame with the true classes in ``label`` and the predicted
            classes in ``prediction``.

    Returns:
        None. The report is printed.
    """
    # catch_warnings() restores the previous filter state on exit, so the
    # "ignore" filter does not leak into the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        report = classification_report(df['label'], df['prediction'])
    print('EVALUATION METRICS')
    print(report)


def mistakes(df, detailed=False):
    """Print a summary of the wrong predictions (prediction errors included).

    Reports the share of wrong predictions, the number of mistakes per true
    class and, for each true class, the prediction it is most often confused
    with.

    Args:
        df: DataFrame with the true classes in ``label`` and the predicted
            classes in ``prediction``.
        detailed: When truthy, additionally display the full table of
            (label, prediction, count) combinations.

    Returns:
        None. The summary is printed (and optionally displayed).
    """
    # Named `wrong` so the local does not shadow this function's own name.
    wrong = df.loc[df['label'] != df['prediction']]
    count_mistakes_per_class = Counter(wrong['label'])

    class_count = wrong.groupby('label')['prediction'].value_counts().reset_index(name='count')

    print_class_count = 'Out of the mistakes, for each class:\n'
    # unique() follows the row order of class_count, so the lines come out in
    # the same order on every run (iterating over a set did not guarantee that).
    for label in class_count['label'].unique():
        label_rows = class_count.loc[class_count['label'] == label]
        # idxmax() returns an index label, so look the row up with .loc.
        top_row = label_rows.loc[label_rows['count'].idxmax()]
        print_class_count += f"{label} gets most predicted as {top_row['prediction']} ({top_row['count']} out of {count_mistakes_per_class[label]})\n"

    # An empty frame has no predictions at all; report 0.0% instead of dividing by zero.
    percentage_wrong = round(len(wrong)/len(df)*100,1) if len(df) else 0.0

    print('MISTAKES. INCLUDES PREDICTIONS ERRORS.')
    print(f"{len(wrong)} out of {len(df)} ({percentage_wrong}%) predictions are wrong.")
    print(f"Number of mistakes per class: {dict(sorted(count_mistakes_per_class.items(), key=lambda item: item[1], reverse=True))}")
    print(print_class_count)

    if detailed:
        display(class_count)

def runtime(df):
    """Print and display runtime statistics, overall and per class.

    Shows the descriptive statistics of the ``runtime`` column, the number of
    documents per class whose runtime exceeds the 75th percentile, and the
    mean and total runtime per class (rounded to one decimal, sorted
    ascending by mean and then total runtime).

    Args:
        df: DataFrame with the columns ``runtime`` and ``label``.

    Returns:
        None. The statistics are printed and displayed.
    """
    print("RUNTIME")

    runtimes = df['runtime']

    # Documents that took longer than the 75th percentile of all runtimes.
    threshold = runtimes.quantile(0.75)
    slow_docs = df[runtimes > threshold]

    # Number of slow documents per class, most frequent class first.
    slow_docs_per_class = dict(Counter(slow_docs['label']).most_common())

    # Mean and total runtime per class.
    runtime_per_class = (
        df.groupby('label')
        .agg(mean_runtime=('runtime', 'mean'), total_runtime=('runtime', 'sum'))
        .reset_index()
        .sort_values(by=['mean_runtime', 'total_runtime'])
    )

    display(runtimes.describe())
    print(f"Amount of docs that took longer than 75th percentile per class: {slow_docs_per_class}")
    display(runtime_per_class.round(1))


# Analysis calls: uncomment the ones to run.
# NOTE: `predictions` is not defined in this cell; it is loaded with
# pd.read_pickle in a separate cell, which has to be executed first.
# get_response_length(predictions, 'Rijgersberg/GEITje-7B-chat-v2')
# prediction_errors(predictions)
# evaluation_metrics(predictions)
# mistakes(predictions, False)
runtime(predictions)

RUNTIME


count    1100.000000
mean      127.476122
std        64.374614
min        27.456430
25%        70.539185
50%       111.897850
75%       171.998433
max       353.908433
Name: runtime, dtype: float64

Amount of docs that took longer than 75th percentile per class: {'brief': 33, 'factsheet': 32, 'voordracht': 32, 'motie': 31, 'besluit': 28, 'onderzoeksrapport': 25, 'raadsnotulen': 25, 'agenda': 21, 'actualiteit': 19, 'raadsadres': 15, 'schriftelijke vraag': 14}


Unnamed: 0,label,mean_runtime,total_runtime
7,raadsadres,105.2,10524.4
9,schriftelijke vraag,122.2,12220.7
6,onderzoeksrapport,124.5,12453.2
0,actualiteit,124.6,12456.0
1,agenda,124.7,12473.8
3,brief,129.6,12958.4
2,besluit,130.9,13090.6
8,raadsnotulen,132.2,13218.2
4,factsheet,134.4,13443.4
5,motie,134.9,13486.6


#### GEITje

In [108]:
# In-context learning: zero-shot predictions.
# NOTE(review): the path points to the Llama "zeroshot_prompt_mistral_llama" run,
# while this cell sits under the "GEITje" heading — confirm the intended file.
# NOTE(review): pd.read_pickle can execute arbitrary code — only load pickle files from a trusted source.
predictions = pd.read_pickle(f"{cf.output_path}/predictionsFinal/in_context/Llama/zeroshot_prompt_mistral_llama/First100Last0Predictions.pkl")
display(predictions)

Unnamed: 0,id,path,text_column,prompt_function,response,prediction,label,runtime,date,prompt,run_id,train_set,test_set,shots
0,26304,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,"Based on the provided text, the document can ...",raadsnotulen,raadsnotulen,83.301298,2024-05-19 13:32:58.557631+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
1,32939,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,"Based on the text you provided, I would class...",besluit,factsheet,164.686586,2024-05-19 13:35:43.314140+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
2,33085,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,"Based on the information provided, I would cl...",NoPredictionInOutput,factsheet,211.769842,2024-05-19 13:39:15.086478+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
3,22985,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,"Based on the content of the document, I would...",raadsnotulen,brief,193.673920,2024-05-19 13:42:28.762329+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
4,32991,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,Based on the content of the document you prov...,NoPredictionInOutput,factsheet,284.011896,2024-05-19 13:47:12.776150+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,13888,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,"Based on the provided text, I would classify ...",agenda,agenda,139.480164,2024-05-22 21:21:32.170387+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
6,28826,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,"Based on the content of the document ""VN2021-...",raadsnotulen,voordracht,92.682888,2024-05-22 21:23:04.854819+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
7,25722,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,"Sure, I can classify the document for you. Ba...",agenda,agenda,179.203237,2024-05-22 21:26:04.059592+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
8,23998,/home/azureuser/cloudfiles/code/blobfuse/raads...,TruncationLlamaTokensFront100Back0,zeroshot_prompt_mistral_llama,Based on the content of the document provided...,besluit,raadsadres,67.000105,2024-05-22 21:27:11.061521+02:00,Classificeer het document in één van de catego...,IC_Llama-2-7b-chat-hfzeroshot_prompt_mistral_l...,train,test,0
