In [1]:
# Runs the blobfuse shell script for the raadsinformatie storage
# (presumably mounts the blob container used below -- TODO confirm).
# NOTE(review): absolute path under /home/azureuser, so this cell is tied to that environment.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [2]:
import os
import sys

# Make the parent directory importable (presumably where the project-level
# modules imported below live -- confirm against the repository layout).
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here instead of with a NameError on `cf` in a later cell.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # Create the cache directory (including missing parents; no error when it
    # already exists) and expose it through TRANSFORMERS_CACHE.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


## Notebook overview
- Get insight into tokenizer, tokens and doc lengths.
- Test different text truncation thresholds on the baseline.

#### Text truncation -- overview in tokenizer/doc lengths
- Tokenize the text using the tokenizers of Mistral, GEITje and Llama.
- Check whether Mistral and GEITje indeed use the same tokenizer.
- After tokenizing, inspect the distribution of the number of tokens per document.
- Truncate the text and test multiple truncation thresholds on the baseline.

Results are saved to txtfiles_tokenizer.pkl, so that txtfiles.pkl stays untouched and can serve as a back-up in case anything goes wrong.

In [2]:
import pandas as pd

# Read the pickled frame of text files from the configured output folder.
txtfiles_path = f"{cf.output_path}/txtfiles.pkl"
df = pd.read_pickle(txtfiles_path)

In [3]:
from transformers import AutoTokenizer

def get_tokens(model_name, df, save_to_path, text_col, new_col_name):
    """Tokenize a text column and store the tokens plus their counts on ``df``.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        df: DataFrame to tokenize; it is modified in place.
        save_to_path: Destination to which the updated frame is pickled.
        text_col: Name of the column holding the texts.
        new_col_name: Name of the column that receives the token lists; the
            token counts go to ``count_<new_col_name>``.

    Returns:
        The same ``df`` object, now carrying the two new columns.
    """
    tok = AutoTokenizer.from_pretrained(model_name)

    # One token list per document, in row order.
    tokens_per_doc = [tok.tokenize(doc) for doc in list(df[text_col].values)]
    tokens_per_doc_len = [len(doc_tokens) for doc_tokens in tokens_per_doc]

    df[new_col_name] = tokens_per_doc
    df[f"count_{new_col_name}"] = tokens_per_doc_len

    # Persist right away so the tokenization does not have to be repeated.
    df.to_pickle(save_to_path)
    return df

# subdf = df.iloc[0:2]
# # display(subdf)
# get_token_length('Rijgersberg/GEITje-7B-chat-v2', subdf, f"{cf.output_path}/try_out_token_count.pkl", 'text', 'token_count_geitje')

def fraction_token(df, max_token, token_len_col):
    """Print, per token-count column, how many documents exceed ``max_token``.

    Args:
        df: DataFrame holding the token-count column(s).
        max_token: Threshold; rows whose count is strictly greater are counted.
        token_len_col: A column name, or an iterable of column names.

    Returns:
        None. One summary line per column is printed first, followed by the
        ``describe()`` output of each column.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(token_len_col, str):
        token_len_col = [token_len_col]
    else:
        # Materialize so one-shot iterables survive the two passes below.
        token_len_col = list(token_len_col)

    total = len(df)
    for col in token_len_col:
        n_over = int((df[col] > max_token).sum())
        # Guard against an empty frame, which would divide by zero.
        pct = round(n_over / total * 100, 2) if total else 0.0
        print(f"{n_over} out of {total} ({pct}%) docs exceed a token length of {max_token}")

    for col in token_len_col:
        print(df[col].describe())

    


    



##### Tokenize text

In [4]:
"""GEITje""" ## not necesarry -> since tokenizer is the same as mistral
# df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
# get_tokens('Rijgersberg/GEITje-7B-chat-v2', df, f"{cf.output_path}/txtfiles_tokenizer.pkl", 'text', 'GEITjeTokens')

"""Mistral"""
# df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
# get_tokens('mistralai/Mistral-7B-v0.1', df, f"{cf.output_path}/txtfiles_tokenizer.pkl", 'text', 'MistralTokens')

"""Llama"""
# df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
# get_tokens('meta-llama/Llama-2-7b-hf', df, f"{cf.output_path}/txtfiles_tokenizer.pkl", 'text', 'LlamaTokens')

'Llama'

##### Analyse token length of model tokenizers

In [6]:
import pandas as pd

# Read the frame that is expected to hold the tokenizer output columns.
tokenizer_pickle_path = f"{cf.output_path}/txtfiles_tokenizer.pkl"
tok = pd.read_pickle(tokenizer_pickle_path)
# fraction_token(tok, 4096, ['count_MistralTokens', 'count_LlamaTokens'])

#### Test text truncation on baseline

In [12]:
import itertools

# load file with baseline function
import sys
# Extend sys.path only once, so re-running this cell does not append duplicates.
if '../scripts/' not in sys.path:
    sys.path.append('../scripts/')
import baseline as bf

# load file with truncation function
from truncation import add_truncation_column

from sklearn.svm import LinearSVC

# variables for text truncation
DATAFRAME = tok
TEXT_COL = 'text'
TOKENS_COL = 'LlamaTokens'

# variables for baseline
BASELINE_FUNCTION = LinearSVC()
MODEL_NAME = 'LinearSVC'
TRAIN_SET = 'train' # must be dev or train
TEST_SET = 'test' # must be val or test
SPLIT_COLUMN = '4split' # must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val.
LABEL_COLUMN = 'label'
# Paths of the full run (disabled); the try-out paths below are the ones in use.
# PREDICTION_PATH = f"{cf.output_path}/predictions/baselineTruncationPredictions.pkl"
# OVERVIEW_PATH = f"{cf.output_path}/overview/baselineTruncationOverview.pkl"
PREDICTION_PATH = f"{cf.output_path}/predictions/tryoutBaselineTruncationPredictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/overview/tryoutBaselineTruncationOverview.pkl"
# Text column handed to the baseline (presumably the column that
# add_truncation_column creates -- TODO confirm in scripts/truncation.py).
TRUNC_COLUMN = 'trunc_txt'

# Each pair is (front_threshold, back_threshold), in that order.
ALL_THRESHOLD_COMBINATIONS = [
    (100, 0), (200, 0), (500, 0), (1000, 0), (2000, 0),
    (100, 100), (200, 200), (500, 500), (1000, 1000),
    (0, 100), (0, 200), (0, 500), (0, 1000), (0, 2000),
]
# Try-out run: only this single pair is evaluated. Assign
# ALL_THRESHOLD_COMBINATIONS instead to run the complete grid.
threshold_combinations = [(100, 0)]

In [13]:
# Run the baseline once per (front, back) threshold pair; the prediction and
# overview paths are handed to run_baseline, which presumably writes the results there.
for front_threshold, back_threshold in threshold_combinations:
    trunc = add_truncation_column(
        DATAFRAME, TEXT_COL, TOKENS_COL, front_threshold, back_threshold
    )
    bf.run_baseline(
        BASELINE_FUNCTION,
        MODEL_NAME,
        trunc,
        SPLIT_COLUMN,
        TRAIN_SET,
        TEST_SET,
        TRUNC_COLUMN,
        LABEL_COLUMN,
        PREDICTION_PATH,
        OVERVIEW_PATH,
    )

                      precision    recall  f1-score   support

         Actualiteit       0.97      0.78      0.86       183
              Agenda       0.99      0.99      0.99       935
             Besluit       0.99      0.99      0.99       145
               Brief       0.93      0.87      0.90       396
          Factsheets       0.61      0.36      0.45        47
               Motie       0.95      0.98      0.96      1644
   Onderzoeksrapport       0.82      0.95      0.88       263
          Raadsadres       0.90      0.94      0.91       385
        Raadsnotulen       1.00      0.98      0.99        55
Schriftelijke Vragen       1.00      0.95      0.97       591
          Voordracht       0.99      1.00      1.00       696

            accuracy                           0.96      5340
           macro avg       0.92      0.89      0.90      5340
        weighted avg       0.96      0.96      0.95      5340



In [None]:
import pandas as pd
# Load the overview table stored at OVERVIEW_PATH (presumably one row per baseline run -- confirm).
# NOTE(review): `yeet` is not a descriptive name; rename once it is clear no later cell relies on it.
yeet = pd.read_pickle(OVERVIEW_PATH)
# Optional ranking of the runs by accuracy, then macro-averaged F1 (currently disabled).
# yeet = yeet.sort_values(by=['accuracy','macro_avg_f1'], ascending=False)
display(yeet)

Unnamed: 0,model,date,train_set,test_set,train_set_support,test_set_support,split_col,text_col,runtime,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,classification_report
0,LinearSVC,2024-04-23 11:22:12.183407+02:00,train,test,20028,5340,4split,trunc_txt,13.129241,0.930712,0.905293,0.847095,0.86421,precision recall f1-...
0,LinearSVC,2024-04-23 11:22:27.937809+02:00,train,test,20028,5340,4split,trunc_txt,34.073219,0.935206,0.907365,0.853826,0.868709,precision recall f1-...
0,LinearSVC,2024-04-23 11:23:07.017441+02:00,train,test,20028,5340,4split,trunc_txt,58.535419,0.935206,0.908623,0.860225,0.873773,precision recall f1-...
0,LinearSVC,2024-04-23 11:24:11.680326+02:00,train,test,20028,5340,4split,trunc_txt,79.908275,0.935955,0.911425,0.867819,0.880778,precision recall f1-...
0,LinearSVC,2024-04-23 11:25:38.335958+02:00,train,test,20028,5340,4split,trunc_txt,111.088073,0.933895,0.90538,0.865393,0.878232,precision recall f1-...
0,LinearSVC,2024-04-23 11:27:21.722641+02:00,train,test,20028,5340,4split,trunc_txt,123.940536,0.929588,0.897872,0.85456,0.868741,precision recall f1-...
0,LinearSVC,2024-04-23 11:29:30.009609+02:00,train,test,20028,5340,4split,trunc_txt,137.084519,0.934644,0.902038,0.860988,0.873781,precision recall f1-...
0,LinearSVC,2024-04-23 11:31:54.946633+02:00,train,test,20028,5340,4split,trunc_txt,164.22276,0.932772,0.898562,0.859814,0.871942,precision recall f1-...
0,LinearSVC,2024-04-23 11:34:48.167669+02:00,train,test,20028,5340,4split,trunc_txt,191.184826,0.934082,0.904531,0.863928,0.876408,precision recall f1-...
0,LinearSVC,2024-04-23 11:37:48.367258+02:00,train,test,20028,5340,4split,trunc_txt,217.061265,0.885768,0.850296,0.783976,0.808938,precision recall f1-...
