In [1]:
# Runs the project's blobfuse shell script before any other cell.
# NOTE(review): presumably this mounts the blob storage that later cells read from — confirm.
# NOTE(review): the path is absolute and user-specific, and the script runs regardless of `my_run`.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [2]:
import os
import sys

# Make the parent directory importable so the project modules below resolve.
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Pick the configuration module that matches the run target. Fail fast on an
# unknown value: otherwise `cf` stays undefined and a later cell dies with a
# confusing NameError.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail when the directory already exists (no check-then-create race).
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Point the Hugging Face transformers cache at the configured directory.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


## Notebook overview
- Get insight into tokenizer, tokens and doc lengths.
- Test different text truncation thresholds on the baseline.

*Previous notebook: clean_data*

*Next notebook: FinetuningDataFormatting*

#### Text truncation -- overview in tokenizer/doc lengths
- Tokenize the text with the tokenizers of Mistral, GEITje and Llama.
- Check whether Mistral and GEITje indeed share the same tokenizer.
- After tokenizing, inspect the distribution of token counts per document.
- Truncate the text and test multiple truncation thresholds on the baseline.

Results are saved in `txtfiles_tokenizer.pkl`, so that `txtfiles.pkl` remains untouched as a back-up file in case anything goes wrong.

In [15]:
import pandas as pd

# Read the dataframe from the pickle stored under the configured output path.
txtfiles_path = f"{cf.output_path}/txtfiles.pkl"
df = pd.read_pickle(txtfiles_path)

In [11]:
from transformers import AutoTokenizer

""" Add column with tokenizes tokens using the the models tokenizer """
def add_tokenized_tokens_column(model_name, df, text_col, new_col_name):
    """Add columns with each text's tokens and token count for a model's tokenizer.

    The tokenizer is loaded with ``AutoTokenizer.from_pretrained(model_name)``
    and its ``tokenize`` method is applied to every value of ``df[text_col]``.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        df: DataFrame holding the texts. It is modified in place.
        text_col: Name of the column containing the texts to tokenize.
        new_col_name: Name of the column that receives the token lists; the
            token counts are stored in ``count_<new_col_name>``.

    Returns:
        The same ``df`` object, with the two columns added (or overwritten).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # One token list per row, in row order.
    all_tokens = [tokenizer.tokenize(txt) for txt in df[text_col]]

    df[new_col_name] = all_tokens
    df[f"count_{new_col_name}"] = [len(tokens) for tokens in all_tokens]
    return df

""" Calculate fraction of tokens that exceeds a certain amount of tokens """
def fraction_token(df, max_token, token_len_col):
    """Print how many documents exceed ``max_token`` tokens, per token-count column.

    For every column a one-line summary (count, total and percentage of rows
    whose value is greater than ``max_token``) is printed first; afterwards the
    ``describe()`` statistics of each column are printed.

    Args:
        df: DataFrame containing the token-count columns.
        max_token: Threshold; rows with a strictly greater count are reported.
        token_len_col: Column name, or iterable of column names, to analyse.

    Returns:
        None. All results are written to stdout.
    """
    # A bare string would otherwise be iterated character by character.
    columns = [token_len_col] if isinstance(token_len_col, str) else list(token_len_col)
    total = len(df)

    for col in columns:
        exceeding = int((df[col] > max_token).sum())
        # Guard against ZeroDivisionError on an empty frame.
        percentage = round(exceeding / total * 100, 2) if total else 0.0
        print(f"{exceeding} out of {total} ({percentage}%) docs exceed a token length of {max_token}")

    for col in columns:
        print(df[col].describe())

##### Tokenize text

In [13]:
print(f"Columns before: {list(df.columns)}")

# (model checkpoint, name of the new token column) pairs.
# GEITje and Mistral: same tokenizer for both models, so one run covers both.
tokenizer_runs = (
    ('mistralai/Mistral-7B-v0.1', 'MistralTokens'),
    ('meta-llama/Llama-2-7b-hf', 'LlamaTokens'),
)
for checkpoint, token_column in tokenizer_runs:
    df = add_tokenized_tokens_column(checkpoint, df, 'text', token_column)

print(f"Columns after: {list(df.columns)}")

# df.to_pickle(f"{cf.output_path}/txtfiles.pkl")


Columns before: ['label', 'path', 'id', 'text', 'tokens', 'token_count', 'clean_tokens', 'clean_tokens_count', 'num_pages', 'clean_text', '4split', '2split', 'MistralTokens', 'count_MistralTokens', 'LlamaTokens', 'count_LlamaTokens', 'old_label', 'md5_hash', 'balanced_split', 'GEITjeTokens', 'count_GEITjeTokens']




Columns after: ['label', 'path', 'id', 'text', 'tokens', 'token_count', 'clean_tokens', 'clean_tokens_count', 'num_pages', 'clean_text', '4split', '2split', 'MistralTokens', 'count_MistralTokens', 'LlamaTokens', 'count_LlamaTokens', 'old_label', 'md5_hash', 'balanced_split', 'GEITjeTokens', 'count_GEITjeTokens']


##### Analyse token length of model tokenizers

In [16]:
import pandas as pd

# Report, per tokenizer, how many documents exceed 4096 tokens.
token_count_columns = ['count_MistralTokens', 'count_LlamaTokens']
fraction_token(df, 4096, token_count_columns)

2903 out of 20818 (13.94%) docs exceed a token length of 4096
2797 out of 20818 (13.44%) docs exceed a token length of 4096
count     20818.000000
mean       4491.198434
std       15975.578413
min          75.000000
25%         630.000000
50%        1058.000000
75%        2455.000000
max      621995.000000
Name: count_MistralTokens, dtype: float64
count     20818.000000
mean       4340.207897
std       15456.312555
min          74.000000
25%         612.000000
50%        1031.000000
75%        2378.000000
max      618067.000000
Name: count_LlamaTokens, dtype: float64


## END NOTEBOOK

#### Test text truncation on baseline

In [None]:
import itertools
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


# load file with baseline function
import sys
sys.path.append('../scripts/')
import baseline as bf

# load file with truncation function
from truncation import add_truncation_column

# variables for text truncation
# NOTE(review): `tok` is not defined in any visible cell of this notebook; it must
# already exist in the kernel before this cell runs — confirm where it comes from.
DATAFRAME = tok
TEXT_COL = 'text'
TOKENS_COL = 'LlamaTokens'

# variables for baseline
BASELINE_FUNCTION = MultinomialNB()
MODEL_NAME = 'MultinomialNB'
TRAIN_SET = 'train' # must be dev or train
TEST_SET = 'test' # must be val or test
SPLIT_COLUMN = '4split' #must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val. 
LABEL_COLUMN = 'label'
PREDICTION_PATH = f"{cf.output_path}/predictions/baselineTruncationPredictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/overview/baselineTruncationOverview.pkl"
# PREDICTION_PATH = f"{cf.output_path}/predictions/tryoutBaselineTruncationPredictions.pkl"
# OVERVIEW_PATH = f"{cf.output_path}/overview/tryoutBaselineTruncationOverview.pkl"
TRUNC_COLUMN = 'trunc_txt'

# Full grid of (front_threshold, back_threshold) pairs. Kept under its own name
# for reference: previously it was assigned to `threshold_combinations` and then
# immediately overwritten, which made it dead code.
ALL_THRESHOLD_COMBINATIONS = [(100,0), (200,0), (500,0), (1000,0), (2000,0), (100,100),(200,200), (500,500), (1000,1000), (0,100), (0,200), (0,500), (0,1000), (0,2000)]

# Pairs that are actually evaluated (the value the original cell ended up with).
threshold_combinations =[(200,200), (500,500), (1000,1000), (0,100), (0,200), (0,500), (0,1000), (0,2000)]

# threshold_combinations = [(100,0)]

In [None]:
# For every (front, back) threshold pair: build the truncated dataframe and run
# the baseline on it, passing the prediction and overview paths along.
for front_threshold, back_threshold in threshold_combinations:
    trunc = add_truncation_column(
        DATAFRAME, TEXT_COL, TOKENS_COL, front_threshold, back_threshold
    )
    bf.run_baseline(
        BASELINE_FUNCTION, MODEL_NAME, trunc, SPLIT_COLUMN, TRAIN_SET, TEST_SET,
        TRUNC_COLUMN, LABEL_COLUMN, PREDICTION_PATH, OVERVIEW_PATH,
    )

In [None]:
import pandas as pd

PREDICTION_PATH = f"{cf.output_path}/predictions/baselineTruncationPredictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/overview/baselineTruncationOverview.pkl"

# Truncation overview, highest macro-averaged F1 first (accuracy as second key).
ranking_columns = ['macro_avg_f1', 'accuracy']
yeet = pd.read_pickle(OVERVIEW_PATH).sort_values(by=ranking_columns, ascending=False)
display(yeet)

# Overview stored in baselineOverview.pkl, loaded for comparison in later cells.
baseline_overview_path = f"{cf.output_path}/overview/baselineOverview.pkl"
bl = pd.read_pickle(baseline_overview_path)


In [None]:

# ls = yeet.loc[yeet['model']=='LinearSVC']
# Stack the LinearSVC rows of both overviews and rank them, best first.
linear_svc_rows = [
    bl.loc[bl['model']=='LinearSVC'],
    yeet.loc[yeet['model']=='LinearSVC'],
]
ls = pd.concat(linear_svc_rows).sort_values(by=['macro_avg_f1', 'accuracy'], ascending=False)

# Drop every row whose text_col is one of the values listed here.
excluded_text_cols = [
    'TruncationLlamaTokensFront2000Back0',
    'TruncationLlamaTokensFront1000Back0',
    'TruncationLlamaTokensFront1000Back1000',
    'TruncationLlamaTokensFront0Back2000',
    '',
]
ls = ls.loc[~ls['text_col'].isin(excluded_text_cols)]

In [None]:
# Render the `ls` dataframe as rich notebook output.
display(ls)

In [None]:
# Keep only the MultinomialNB rows of the truncation overview and show them.
is_multinomial_nb = yeet['model']=='MultinomialNB'
nb = yeet.loc[is_multinomial_nb]
# nb = nb.loc[~nb['text_col'].isin(['TruncationLlamaTokensFront2000Back0', 'TruncationLlamaTokensFront1000Back0', 'TruncationLlamaTokensFront1000Back1000', 'TruncationLlamaTokensFront0Back2000',''])]
display(nb)



### Finding best cut-off point