In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [1]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


## Notebook overview
- Get insight into tokenizer, tokens and doc lengths.
- Test different text truncation thresholds on the baseline.

#### Text truncation -- overview in tokenizer/doc lengths
- check how many docs exceed max

- first, tokenize text using tokenizer of mistral, geitje and Llama.
- Check if mistral and geitje indeed have the same tokenizer.
- After getting the tokens, check distribution and how many exceed max_threshold.

In [3]:
import pandas as pd
df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")

In [13]:
from transformers import AutoTokenizer

def get_tokens(model_name, df, save_to_path, text_col, new_col_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    all_texts = list(df[text_col].values)

    all_tokens = []
    all_tokens_len = []
    for txt in all_texts:
        tokens = tokenizer.tokenize(txt)
        all_tokens.append(tokens)
        all_tokens_len.append(len(tokens))

    df[new_col_name] = all_tokens
    df[f"count_{new_col_name}"] = all_tokens_len
    df.to_pickle(save_to_path)
    return df

# subdf = df.iloc[0:2]
# # display(subdf)
# get_token_length('Rijgersberg/GEITje-7B-chat-v2', subdf, f"{cf.output_path}/try_out_token_count.pkl", 'text', 'token_count_geitje')

def fraction_token(df, max_token, token_len_col):
    for col in token_len_col:
        print(f"{len(df.loc[df[col]>max_token])} out of {len(df)} ({round(len(df.loc[df[col]>max_token])/len(df)*100, 2)}%) docs exceed a token length of {max_token}")

    for col in token_len_col:
        print(df[col].describe())

    


    



##### Get token lengths of model tokenizers

In [16]:
"""GEITje""" ## not necesarry -> since tokenizer is the same as mistral
# df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
# get_tokens('Rijgersberg/GEITje-7B-chat-v2', df, f"{cf.output_path}/txtfiles_tokenizer.pkl", 'text', 'GEITjeTokens')

"""Mistral"""
df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
get_tokens('mistralai/Mistral-7B-v0.1', df, f"{cf.output_path}/txtfiles_tokenizer.pkl", 'text', 'MistralTokens')

"""Llama"""
df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
get_tokens('meta-llama/Llama-2-7b-hf', df, f"{cf.output_path}/txtfiles_tokenizer.pkl", 'text', 'LlamaTokens')

##### Analyse token length of model tokenizers

In [None]:
df_token_len = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
fraction_token(df_token_len, 4096, ['count_MistralTokens', 'count_LlamaTokens'])

4005 out of 26704 (15.0%) docs exceed a token length of 4096


KeyError: 'count_LlamaTokens'

#### Test text truncation on baseline

In [None]:
import pandas as pd
tok = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
# display(tok)

In [None]:
def add_truncation_column(df,text_col, token_col_name, front_token_threshold, back_token_threshold=0):
    input = []
    for index, row in df.iterrows():
        # select text according to the token threshold -> first FRONT
        # select first n (= token_theshold) tokens using the model tokenizer
        tokens = row[token_col_name][0:front_token_threshold]

        # combine tokens into txt
        tokens_txt = ''.join(tokens)

        # \n is converted by tokenizer to <0x0A>, we reverse this to get original length
        len_char = len(tokens_txt.replace("<0x0A>", "\n")) # get character length

        # select the same amount of characters as the tokens
        front_txt = row[text_col][0:len_char]

        # Check if back of document also given as input
        if back_token_threshold != 0:
            # select LAST n (= token_theshold) tokens using the model tokenizer
            tokens = row[token_col_name][-back_token_threshold:]

            # combine tokens into txt
            tokens_txt = ''.join(tokens)

            # \n is converted by tokenizer to <0x0A>, we reverse this to get original length
            len_char = len(tokens_txt.replace("<0x0A>", "\n")) # get character length

            # select the same amount of characters as the tokens
            back_txt = row[text_col][-len_char:]

            # combine front and back text
            input_txt = front_txt + ' ' + back_txt

        else:
            input_txt = front_txt

        input.append(input_txt)

    df[f"Truncation{token_col_name}Front{front_token_threshold}Back{back_token_threshold}"] = input
    trunc_col = f"Truncation{token_col_name}Front{front_token_threshold}Back{back_token_threshold}"
    return df, trunc_col




# trunc = add_truncation_column(tok, 'text', 'GEITjeTokens', 50,50)
# display(trunc)

In [None]:
import itertools
front_thresholds = [0,100, 200, 500, 1000, 2000]
back_thresholds = [0,100,200,500,1000,2000]
all_combinations = list(itertools.product(front_thresholds, back_thresholds))

# remove combinations which have more than 2000 tokens.
all_combinations = [comb for comb in all_combinations if sum(comb) <= 2000]

# load file with baseline function
import sys
sys.path.append('../scripts/') 
import baselineTruncationFunctions as bf


from sklearn.svm import LinearSVC
# variables for text truncation
DATAFRAME = tok
TEXT_COL = 'text'
TOKENS_COL = 'LlamaTokens'

# variables for baseline
BASELINE_FUNCTION = LinearSVC()
MODEL_NAME = 'LinearSVC'
TRAIN_SET = 'dev' # must be dev or train
TEST_SET = 'val' # must be val or test
SPLIT_COLUMN = '4split' #must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val. 
LABEL_COLUMN = 'label'
PREDICTION_PATH = f"{cf.output_path}/predictions/baselineTruncationPredictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/overview/baselineTruncationOverview.pkl"

In [None]:
for thresholds in all_combinations:
    front_threshold = thresholds[0]
    back_threshold = thresholds[1]
    trunc, trunc_col = add_truncation_column(DATAFRAME, TEXT_COL, TOKENS_COL, front_threshold,back_threshold)
    bf.run_baseline(BASELINE_FUNCTION,MODEL_NAME, trunc, SPLIT_COLUMN, TRAIN_SET, TEST_SET, trunc_col, LABEL_COLUMN, PREDICTION_PATH, OVERVIEW_PATH)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

         Actualiteit       1.00      0.78      0.88         9
              Agenda       0.88      0.94      0.91        31
             Besluit       0.91      0.91      0.91        11
               Brief       0.78      0.88      0.82        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.94      0.92      0.93        86
   Onderzoeksrapport       0.69      0.79      0.73        14
          Raadsadres       0.78      0.96      0.86        26
        Raadsnotulen       0.00      0.00      0.00         2
Schriftelijke Vragen       1.00      0.90      0.95        29
       Termijnagenda       0.33      0.20      0.25         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.89       268
           macro avg       0.69      0.69      0.69       268
        weighted avg       0.88      0.89      0.88       268



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                      precision    recall  f1-score   support

         Actualiteit       0.86      0.67      0.75         9
              Agenda       0.87      0.87      0.87        31
             Besluit       0.83      0.91      0.87        11
               Brief       0.93      0.81      0.87        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.88      0.97      0.92        86
   Onderzoeksrapport       0.65      0.79      0.71        14
          Raadsadres       0.88      0.88      0.88        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       1.00      0.90      0.95        29
       Termijnagenda       0.33      0.20      0.25         5
          Voordracht       1.00      1.00      1.00        36

            accuracy                           0.88       268
           macro avg       0.77      0.71      0.73       268
        weighted avg       0.88      0.88      0.88       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.56      0.71         9
              Agenda       0.88      0.90      0.89        31
             Besluit       1.00      0.91      0.95        11
               Brief       0.78      0.88      0.82        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.88      0.95      0.92        86
   Onderzoeksrapport       0.64      0.64      0.64        14
          Raadsadres       0.86      0.96      0.91        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       1.00      0.90      0.95        29
       Termijnagenda       0.50      0.20      0.29         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.88       268
           macro avg       0.79      0.70      0.73       268
        weighted avg       0.88      0.88      0.88       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.67      0.80         9
              Agenda       0.85      0.90      0.88        31
             Besluit       1.00      1.00      1.00        11
               Brief       0.61      0.88      0.72        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.93      0.97      0.95        86
   Onderzoeksrapport       0.62      0.57      0.59        14
          Raadsadres       0.96      0.88      0.92        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.93      0.90      0.91        29
       Termijnagenda       0.50      0.20      0.29         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.88       268
           macro avg       0.78      0.71      0.73       268
        weighted avg       0.88      0.88      0.88       268





                      precision    recall  f1-score   support

         Actualiteit       0.88      0.78      0.82         9
              Agenda       0.88      0.94      0.91        31
             Besluit       0.91      0.91      0.91        11
               Brief       0.93      0.88      0.90        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.97      0.99      0.98        86
   Onderzoeksrapport       0.72      0.93      0.81        14
          Raadsadres       0.86      0.92      0.89        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       1.00      0.90      0.95        29
       Termijnagenda       0.50      0.20      0.29         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.92       268
           macro avg       0.80      0.74      0.76       268
        weighted avg       0.91      0.92      0.91       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.67      0.80         9
              Agenda       0.84      0.87      0.86        31
             Besluit       0.91      0.91      0.91        11
               Brief       0.93      0.81      0.87        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.93      1.00      0.97        86
   Onderzoeksrapport       0.68      0.93      0.79        14
          Raadsadres       0.89      0.92      0.91        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       1.00      0.90      0.95        29
       Termijnagenda       0.00      0.00      0.00         5
          Voordracht       1.00      1.00      1.00        36

            accuracy                           0.90       268
           macro avg       0.77      0.71      0.73       268
        weighted avg       0.90      0.90      0.90       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.67      0.80         9
              Agenda       0.85      0.90      0.88        31
             Besluit       1.00      0.91      0.95        11
               Brief       0.82      0.88      0.85        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.92      0.95      0.94        86
   Onderzoeksrapport       0.69      0.79      0.73        14
          Raadsadres       0.83      0.92      0.87        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.96      0.90      0.93        29
       Termijnagenda       0.00      0.00      0.00         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.89       268
           macro avg       0.75      0.70      0.72       268
        weighted avg       0.88      0.89      0.88       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.67      0.80         9
              Agenda       0.84      0.87      0.86        31
             Besluit       1.00      1.00      1.00        11
               Brief       0.67      0.88      0.76        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.93      0.99      0.96        86
   Onderzoeksrapport       0.62      0.57      0.59        14
          Raadsadres       0.96      0.88      0.92        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.93      0.90      0.91        29
       Termijnagenda       0.33      0.20      0.25         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.89       268
           macro avg       0.77      0.70      0.73       268
        weighted avg       0.88      0.89      0.88       268





                      precision    recall  f1-score   support

         Actualiteit       0.80      0.44      0.57         9
              Agenda       0.88      0.94      0.91        31
             Besluit       0.90      0.82      0.86        11
               Brief       0.76      0.81      0.79        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.93      1.00      0.97        86
   Onderzoeksrapport       0.81      0.93      0.87        14
          Raadsadres       0.89      0.92      0.91        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.96      0.90      0.93        29
       Termijnagenda       0.50      0.20      0.29         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.90       268
           macro avg       0.78      0.70      0.73       268
        weighted avg       0.89      0.90      0.89       268





                      precision    recall  f1-score   support

         Actualiteit       0.83      0.56      0.67         9
              Agenda       0.85      0.90      0.88        31
             Besluit       0.90      0.82      0.86        11
               Brief       0.81      0.81      0.81        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.93      1.00      0.97        86
   Onderzoeksrapport       0.81      0.93      0.87        14
          Raadsadres       0.92      0.92      0.92        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.93      0.90      0.91        29
       Termijnagenda       0.00      0.00      0.00         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.90       268
           macro avg       0.75      0.69      0.71       268
        weighted avg       0.88      0.90      0.89       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.44      0.62         9
              Agenda       0.90      0.90      0.90        31
             Besluit       0.91      0.91      0.91        11
               Brief       0.81      0.81      0.81        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.89      0.99      0.94        86
   Onderzoeksrapport       0.75      0.86      0.80        14
          Raadsadres       0.88      0.88      0.88        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.93      0.90      0.91        29
       Termijnagenda       0.50      0.40      0.44         5
          Voordracht       1.00      0.97      0.99        36

            accuracy                           0.89       268
           macro avg       0.80      0.71      0.74       268
        weighted avg       0.89      0.89      0.88       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.33      0.50         9
              Agenda       0.85      0.90      0.88        31
             Besluit       1.00      0.91      0.95        11
               Brief       0.81      0.81      0.81        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.90      1.00      0.95        86
   Onderzoeksrapport       0.73      0.79      0.76        14
          Raadsadres       0.96      0.88      0.92        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.87      0.90      0.88        29
       Termijnagenda       0.50      0.20      0.29         5
          Voordracht       0.95      0.97      0.96        36

            accuracy                           0.88       268
           macro avg       0.80      0.68      0.71       268
        weighted avg       0.88      0.88      0.87       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.33      0.50         9
              Agenda       0.88      0.94      0.91        31
             Besluit       0.90      0.82      0.86        11
               Brief       0.88      0.94      0.91        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.92      1.00      0.96        86
   Onderzoeksrapport       0.81      0.93      0.87        14
          Raadsadres       0.89      0.96      0.93        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.96      0.90      0.93        29
       Termijnagenda       0.50      0.20      0.29         5
          Voordracht       0.95      0.97      0.96        36

            accuracy                           0.91       268
           macro avg       0.81      0.71      0.73       268
        weighted avg       0.90      0.91      0.89       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.33      0.50         9
              Agenda       0.85      0.94      0.89        31
             Besluit       0.90      0.82      0.86        11
               Brief       0.88      0.88      0.88        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.91      0.99      0.95        86
   Onderzoeksrapport       0.81      0.93      0.87        14
          Raadsadres       0.93      0.96      0.94        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.93      0.90      0.91        29
       Termijnagenda       0.50      0.20      0.29         5
          Voordracht       0.97      1.00      0.99        36

            accuracy                           0.90       268
           macro avg       0.81      0.70      0.73       268
        weighted avg       0.89      0.90      0.89       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.33      0.50         9
              Agenda       0.85      0.90      0.88        31
             Besluit       0.91      0.91      0.91        11
               Brief       0.82      0.88      0.85        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.91      1.00      0.95        86
   Onderzoeksrapport       0.81      0.93      0.87        14
          Raadsadres       0.96      0.96      0.96        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.93      0.90      0.91        29
       Termijnagenda       0.50      0.20      0.29         5
          Voordracht       1.00      0.97      0.99        36

            accuracy                           0.90       268
           macro avg       0.81      0.71      0.73       268
        weighted avg       0.90      0.90      0.89       268





                      precision    recall  f1-score   support

         Actualiteit       1.00      0.44      0.62         9
              Agenda       0.90      0.90      0.90        31
             Besluit       1.00      0.91      0.95        11
               Brief       0.82      0.88      0.85        16
          Factsheets       0.00      0.00      0.00         3
               Motie       0.91      1.00      0.96        86
   Onderzoeksrapport       0.75      0.86      0.80        14
          Raadsadres       0.96      0.92      0.94        26
        Raadsnotulen       1.00      0.50      0.67         2
Schriftelijke Vragen       0.93      0.90      0.91        29
       Termijnagenda       0.67      0.40      0.50         5
          Voordracht       0.95      1.00      0.97        36

            accuracy                           0.91       268
           macro avg       0.82      0.73      0.76       268
        weighted avg       0.90      0.91      0.90       268



In [None]:
yeet = pd.read_pickle(OVERVIEW_PATH)
yeet = yeet.sort_values(by=['accuracy','macro_avg_f1'], ascending=False)
display(yeet)

Unnamed: 0,model,date,train_set,test_set,train_set_support,test_set_support,split_col,text_col,runtime,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,classification_report
0,LinearSVC,2024-04-22 11:53:32.325423+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront200Back100,4.768539,0.91791,0.801205,0.744494,0.758719,precision recall f1-...
0,LinearSVC,2024-04-22 11:55:41.107555+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront1000Back1000,15.012036,0.906716,0.824521,0.725711,0.755677,precision recall f1-...
0,LinearSVC,2024-04-22 11:54:52.427752+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront1000Back100,11.645491,0.906716,0.808345,0.706949,0.730486,precision recall f1-...
0,LinearSVC,2024-04-22 11:55:23.319916+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront1000Back500,14.683352,0.902985,0.807415,0.706628,0.730136,precision recall f1-...
0,LinearSVC,2024-04-22 11:55:06.543479+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront1000Back200,14.01658,0.902985,0.806824,0.703086,0.727933,precision recall f1-...
0,LinearSVC,2024-04-22 11:54:05.271379+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront500Back100,8.040571,0.902985,0.784633,0.704901,0.727381,precision recall f1-...
0,LinearSVC,2024-04-22 11:53:38.959993+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront200Back200,5.487458,0.902985,0.765775,0.708952,0.725404,precision recall f1-...
0,LinearSVC,2024-04-22 11:54:15.483299+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront500Back200,8.803601,0.899254,0.747185,0.694805,0.711049,precision recall f1-...
0,LinearSVC,2024-04-22 11:54:26.777715+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront500Back500,9.658875,0.891791,0.798562,0.714014,0.739446,precision recall f1-...
0,LinearSVC,2024-04-22 11:53:55.948157+02:00,dev,val,1068,268,4split,TruncationGEITjeTokensFront200Back1000,7.458469,0.88806,0.77109,0.704467,0.725183,precision recall f1-...
