In [1]:
# Run the blobfuse_raadsinformatie.sh shell script located in the Azure cloudfiles directory.
# Presumably this mounts the blob storage that holds the data (blobfuse) — TODO confirm.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [2]:
import sys
sys.path.append("..")

# Select where to run the notebook: "azure" or "local".
my_run = "azure"

import my_secrets as sc
import settings as st

# Load the environment-specific configuration module under the common alias `cf`.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here instead of with a NameError on `cf` in a later cell.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')


import os
if my_run == "azure":
    # makedirs with exist_ok=True also creates missing parent directories and
    # does not fail when the directory already exists.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Set before `transformers` is imported further down in the notebook.
    # NOTE(review): newer transformers releases appear to deprecate
    # TRANSFORMERS_CACHE in favour of HF_HOME — confirm for the installed version.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


## Notebook overview
- Get insight into tokenizer, tokens and doc lengths.
- Test different text truncation thresholds on the baseline.

*Previous notebook: clean_data*

*Next notebook: FinetuningDataFormatting*

#### Text truncation -- overview in tokenizer/doc lengths
- Tokenize the text with the tokenizers of Mistral, GEITje and Llama.
- Check whether Mistral and GEITje indeed use the same tokenizer.
- After tokenizing, inspect the distribution of token counts per document.
- Truncate the text and test multiple truncation thresholds on the baseline.

Results are saved in `txtfiles_tokenizer.pkl`, so that `txtfiles.pkl` remains available as a back-up file in case anything gets messed up.

In [16]:
# Load the pickled DataFrame txtfiles.pkl from the configured output directory.
# NOTE(review): this cell's execution count (16) is higher than that of the cells
# below it, i.e. it was run out of order; on a top-to-bottom run the next cell
# immediately replaces `df` with txtfiles_notcleaned.pkl.
import pandas as pd
df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")

In [3]:
# Load txtfiles_notcleaned.pkl (per its name, the texts before cleaning — confirm)
# into `df`; the tokenization cells below operate on this frame.
import pandas as pd
df = pd.read_pickle(f"{cf.output_path}/txtfiles_notcleaned.pkl")

In [4]:
from transformers import AutoTokenizer

def add_tokenized_tokens_column(model_name, df, text_col, new_col_name):
    """Add the model tokenizer's tokens and token counts as columns of ``df``.

    Parameters
    ----------
    model_name : str
        Name or path passed to ``AutoTokenizer.from_pretrained``.
    df : pandas.DataFrame
        Frame holding the documents; it is modified in place.
    text_col : str
        Column of ``df`` that contains the text to tokenize. Entries that are
        not strings (e.g. ``None``/``NaN`` for documents without text) are
        tokenized as the empty string and therefore get zero tokens.
    new_col_name : str
        Name of the column that receives the token lists. The number of
        tokens per document is stored in ``count_<new_col_name>``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added (or overwritten).

    Raises
    ------
    KeyError
        If ``text_col`` is not a column of ``df``.
    """
    # Read the texts first so a wrong column name fails before the tokenizer is loaded.
    texts = [txt if isinstance(txt, str) else "" for txt in df[text_col].tolist()]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    all_tokens = [tokenizer.tokenize(txt) for txt in texts]

    df[new_col_name] = all_tokens
    df[f"count_{new_col_name}"] = [len(tokens) for tokens in all_tokens]
    return df

def fraction_token(df, max_token, token_len_col):
    """Print how many documents exceed ``max_token`` tokens, per count column.

    For every column a one-line summary (absolute number and percentage of
    rows whose value is greater than ``max_token``) is printed first; after
    that the ``describe()`` statistics of every column are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the token-count columns.
    max_token : int
        Threshold; rows with a count strictly greater than this are reported.
    token_len_col : str or iterable of str
        Name(s) of the token-count column(s). A single string is treated as
        one column name rather than being iterated character by character.

    Returns
    -------
    None
        The results are only printed.
    """
    if isinstance(token_len_col, str):
        columns = [token_len_col]
    else:
        columns = list(token_len_col)

    total = len(df)
    for col in columns:
        n_over = int((df[col] > max_token).sum())
        # An empty frame has no documents over the threshold; avoid dividing by zero.
        pct = round(n_over / total * 100, 2) if total else 0.0
        print(f"{n_over} out of {total} ({pct}%) docs exceed a token length of {max_token}")

    for col in columns:
        print(df[col].describe())



##### Tokenize text

In [5]:
print(f"Columns before: {list(df.columns)}")

# GEITje and Mistral: same tokenizer for both models, so one call covers both.
# The call is currently disabled; enabling it adds the columns
# 'MistralTokens' and 'count_MistralTokens'.
# df = add_tokenized_tokens_column('mistralai/Mistral-7B-v0.1', df, 'text', 'MistralTokens')

# Llama
df = add_tokenized_tokens_column(
    model_name='meta-llama/Llama-2-7b-hf',
    df=df,
    text_col='text',
    new_col_name='LlamaTokens',
)

print(f"Columns after: {list(df.columns)}")

# df.to_pickle(f"{cf.output_path}/txtfiles.pkl")


Columns before: ['label', 'path', 'id', 'text', 'tokens', 'token_count', 'clean_tokens', 'clean_tokens_count', 'pdf_path', 'num_pages']




Columns after: ['label', 'path', 'id', 'text', 'tokens', 'token_count', 'clean_tokens', 'clean_tokens_count', 'pdf_path', 'num_pages', 'LlamaTokens', 'count_LlamaTokens']


In [6]:
# Persist the frame, including the Llama token columns added above, to a separate pickle.
# NOTE(review): the overview text names txtfiles_tokenizer.pkl as the result file,
# while this cell writes txtfiles_notcleaned_llama.pkl — confirm which is intended.
df.to_pickle(f"{cf.output_path}/txtfiles_notcleaned_llama.pkl")

##### Analyse token length of model tokenizers

In [14]:
import pandas as pd

# Only analyse count columns that exist in `df`: with the Mistral tokenization
# call commented out above, 'count_MistralTokens' is absent after a fresh
# top-to-bottom run and indexing it would raise a KeyError.
token_count_cols = ['count_MistralTokens', 'count_LlamaTokens']
missing_cols = [col for col in token_count_cols if col not in df.columns]
if missing_cols:
    print(f"Skipping token-count columns not present in df: {missing_cols}")

# 4096 is the token-length threshold under test (presumably the model context size — confirm).
fraction_token(df, 4096, [col for col in token_count_cols if col in df.columns])

2903 out of 20818 (13.94%) docs exceed a token length of 4096
2797 out of 20818 (13.44%) docs exceed a token length of 4096
count     20818.000000
mean       4491.198434
std       15975.578413
min          75.000000
25%         630.000000
50%        1058.000000
75%        2455.000000
max      621995.000000
Name: count_MistralTokens, dtype: float64
count     20818.000000
mean       4340.207897
std       15456.312555
min          74.000000
25%         612.000000
50%        1031.000000
75%        2378.000000
max      618067.000000
Name: count_LlamaTokens, dtype: float64
