In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [2]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Pick the configuration module that matches the selected run environment.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here with a clear message instead of a NameError on `cf` further down.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')


import os
if my_run == "azure":
    # Point the transformers cache at the configured folder.
    # makedirs(exist_ok=True) also creates missing parent directories and is a
    # no-op when the directory is already there, so no separate exists() check
    # (and no check-then-create race) is needed.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd


## Notebook overview
Goal: turn a dataframe of txt files into the dataset format needed to finetune GEITje.
The dataset is saved to Hugging Face.

The function can truncate the text. txtfiles_tokenizer should already include the columns MistralTokens and LlamaTokens, containing the text split into tokens by the respective model's tokenizer.

*Previous notebook: text_truncation*

*Next notebook: finetuning*

**Load example data**

These are the datasets used in the Finetuning.py example in the GEITje GitHub repository.

In [3]:

from datasets import DatasetDict, load_dataset, concatenate_datasets,Dataset	


def _load_preview(repo_id, n_rows=2):
    """Load a dataset from the Hub, print its layout and keep the first rows of each SFT split."""
    dataset_dict = load_dataset(repo_id)
    print(dataset_dict)
    # Only a handful of rows are kept: the example data is just used to look at the format.
    for split_name in ('train_sft', 'test_sft'):
        dataset_dict[split_name] = dataset_dict[split_name].select(range(n_rows))
    return dataset_dict


no_robots_nl = _load_preview('Rijgersberg/no_robots_nl')
ultrachat_nl = _load_preview('Rijgersberg/ultrachat_10k_nl')

# Merge both sources per split and shuffle with a fixed seed.
chat_dataset = DatasetDict({
    split_name: concatenate_datasets(
        [no_robots_nl[split_name], ultrachat_nl[split_name]]
    ).shuffle(seed=42)
    for split_name in ('train_sft', 'test_sft')
})

Downloading readme:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/956k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating test_sft split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/9500 [00:00<?, ? examples/s]

DatasetDict({
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'messages_nl'],
        num_rows: 500
    })
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'messages_nl'],
        num_rows: 9500
    })
})


Downloading readme:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/62.2M [00:00<?, ?B/s]

Generating test_sft split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/9500 [00:00<?, ? examples/s]

DatasetDict({
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'messages_nl'],
        num_rows: 500
    })
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'messages_nl'],
        num_rows: 9500
    })
})


**Inspection**: we only need the columns prompt_id (equal to doc id) and messages_nl. 

#### Tokenize text using GEITje/Mistral tokenizer


In [4]:
# Load the Amsterdam data from the configured output folder.
# The format_data call further down reads the columns 'id', 'text', 'LlamaTokens',
# 'label' and 'balanced_split' from this dataframe.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")

In [None]:
import pandas as pd
from datasets import DatasetDict, load_dataset, concatenate_datasets,Dataset	
import sys
sys.path.append('../src/') 
import prompt_template as pt

def format_message(input_txt, label):
    """Build the two-turn chat example (user prompt + assistant answer) for one document.

    Parameters
    ----------
    input_txt : text passed to pt.zeroshot_prompt_mistral_llama (called with
        remove_template=True); its return value becomes the user message content.
    label : value inserted, unquoted, into the assistant answer "{'categorie': <label>}".

    Returns
    -------
    list of two dicts, each with the keys 'content' and 'role'
    (first role 'user', then role 'assistant').
    """
    prompt = pt.zeroshot_prompt_mistral_llama(input_txt, remove_template=True)
    answer = "{{'categorie': {}}}".format(label)

    return [
        dict(content=prompt, role='user'),
        dict(content=answer, role='assistant'),
    ]



# Splits exported for each supported split column, in the order they appear in the result.
_SPLITS_BY_COLUMN = {
    '4split': ('train', 'test', 'dev', 'val'),
    '2split': ('train', 'test'),
    'balanced_split': ('train', 'test', 'val', 'discard'),
}


def _token_char_length(tokens):
    """Return the number of characters covered by the joined token strings."""
    # The tokenizer writes a newline as the byte token <0x0A>; map it back to
    # "\n" so the count lines up with the original text.
    return len(''.join(tokens).replace("<0x0A>", "\n"))


def _truncate_text(text, tokens, first_n, last_n):
    """Keep the characters covered by the first `first_n` and the last `last_n` tokens."""
    front_txt = text[:_token_char_length(tokens[:first_n])]
    if last_n == 0:
        return front_txt

    back_len = _token_char_length(tokens[-last_n:])
    if back_len == 0:
        # text[-0:] would return the WHOLE text instead of an empty tail.
        return front_txt

    # NOTE(review): for documents shorter than first_n + last_n tokens the front
    # and back parts overlap, so some text appears twice -- confirm this is intended.
    return front_txt + ' ' + text[-back_len:]


def format_data(df, text_col, model_token_col, label_col, split_col, token_threshold='full_text', id_col='id'):
    """Convert a dataframe of documents into a DatasetDict of chat examples.

    Each row becomes one record with a 'prompt_id' (taken from `id_col`) and a
    'message' (the output of format_message for the selected text and the label).
    The records are then divided over the splits named in the `split_col` column.

    Parameters
    ----------
    df : dataframe with one document per row.
    text_col : column holding the document text.
    model_token_col : column holding the document's token strings; only read
        when `token_threshold` is not 'full_text'.
        Assumes the joined token strings have the same character length as the
        text they cover -- TODO confirm for the tokenizer used.
    label_col : column holding the label.
    split_col : column holding the split name of each row. Must be one of
        '4split' (train/test/dev/val), '2split' (train/test) or
        'balanced_split' (train/test/val/discard).
    token_threshold : 'full_text' to use the whole text, otherwise a pair
        (first_n, last_n): the text is cut to the characters covered by the
        first `first_n` tokens, followed (if last_n != 0) by a space and the
        characters covered by the last `last_n` tokens.
    id_col : column used for 'prompt_id' (default 'id').

    Returns
    -------
    DatasetDict with one Dataset (columns 'prompt_id' and 'message') per split.

    Raises
    ------
    ValueError : if `split_col` is not one of the supported split columns.
    """
    if split_col not in _SPLITS_BY_COLUMN:
        raise ValueError(
            f"Unsupported split_col {split_col!r}; expected one of {sorted(_SPLITS_BY_COLUMN)}"
        )

    use_full_text = isinstance(token_threshold, str) and token_threshold == 'full_text'
    if not use_full_text:
        first_tokens_threshold = token_threshold[0]
        last_tokens_threshold = token_threshold[1]

    # Collect plain records and build the dataframe once; appending to a
    # dataframe row by row copies it over and over.
    records = []
    for _, row in df.iterrows():
        if use_full_text:
            input_txt = row[text_col]
        else:
            input_txt = _truncate_text(
                row[text_col],
                row[model_token_col],
                first_tokens_threshold,
                last_tokens_threshold,
            )

        records.append({
            'prompt_id': row[id_col],
            'message': format_message(input_txt, row[label_col]),
            split_col: row[split_col],
        })

    format_df = pd.DataFrame(records, columns=['prompt_id', 'message', split_col])

    # preserve_index=False keeps the pandas index out of the Dataset, so there is
    # no '__index_level_0__' column to remove afterwards.
    return DatasetDict({
        split_name: Dataset.from_pandas(
            format_df.loc[format_df[split_col] == split_name, ['prompt_id', 'message']],
            preserve_index=False,
        )
        for split_name in _SPLITS_BY_COLUMN[split_col]
    })


# Build the dataset on the balanced split, cutting each text to its first 200
# LlamaTokens (nothing is taken from the end of the document).
data = format_data(
    df,
    text_col='text',
    model_token_col='LlamaTokens',
    label_col='label',
    split_col='balanced_split',
    token_threshold=[200, 0],
)
# data.push_to_hub("FemkeBakker/AmsterdamBalancedFirst200Tokens")

In [None]:
# Load the dataset from the Hugging Face Hub; as the last expression of the cell
# the returned object is shown as cell output.
# NOTE(review): the push_to_hub call in the previous cell is commented out, so this
# presumably loads a previously pushed version -- confirm.
load_dataset('FemkeBakker/AmsterdamBalancedFirst200Tokens')