In [6]:
# Run the blobfuse helper script (presumably mounts the Azure blob storage — TODO confirm).
# NOTE(review): absolute, user-specific path; this cell only works on that Azure compute instance.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [43]:
import os
import sys

# Make modules one directory up importable (presumably my_secrets, settings
# and the config modules live there — TODO confirm).
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf

    # makedirs(..., exist_ok=True) also creates missing parent directories and
    # does not fail when the cache directory already exists.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Set before `transformers` is imported further down so the cache location
    # is picked up.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE
elif my_run == "local":
    import config as cf
else:
    # Fail here instead of with a NameError on the first use of `cf`.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')


### Load example data
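These are the datasets used in the Finetuning.py example in the GEITje GitHub repository. Only the first two rows of each split are kept, because they are loaded purely to inspect the expected data format.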

In [44]:

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset


# Dutch "no robots" example set: print the full overview, then keep only the
# first two rows of each split (enough to inspect the format).
no_robots_nl = load_dataset('Rijgersberg/no_robots_nl')
print(no_robots_nl)
no_robots_nl.update({
    split: no_robots_nl[split].select(range(2))
    for split in ('train_sft', 'test_sft')
})

# Dutch UltraChat example set, handled the same way.
ultrachat_nl = load_dataset('Rijgersberg/ultrachat_10k_nl')
print(ultrachat_nl)
ultrachat_nl.update({
    split: ultrachat_nl[split].select(range(2))
    for split in ('train_sft', 'test_sft')
})

# Merge both sources per split and shuffle with a fixed seed.
chat_dataset = DatasetDict({
    split: concatenate_datasets([no_robots_nl[split], ultrachat_nl[split]]).shuffle(seed=42)
    for split in ('train_sft', 'test_sft')
})

DatasetDict({
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'messages_nl'],
        num_rows: 500
    })
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'messages_nl'],
        num_rows: 9500
    })
})
DatasetDict({
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'messages_nl'],
        num_rows: 500
    })
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'messages_nl'],
        num_rows: 9500
    })
})


Inspection: from these example datasets we only need the columns `prompt_id` (equal to the document id) and `messages_nl`.

### Load Amsterdam data

In [45]:
import pandas as pd

# Load the Amsterdam documents from the pickle in the configured output directory
# (presumably written by an earlier pipeline step — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files this project created.
df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
# display(df)

### Tokenize text using the GEITje/Mistral tokenizer

In [41]:
from transformers import AutoTokenizer

def get_tokens(model_name, df, text_col, new_col_name):
    """Tokenize every text in ``df[text_col]`` and store the tokens in a new column.

    The tokenizer is loaded with ``AutoTokenizer.from_pretrained(model_name)``.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        df: DataFrame containing the texts. It is modified in place.
        text_col: Name of the column holding the texts to tokenize.
        new_col_name: Name of the column that receives the token list of each row.

    Returns:
        The same ``df`` object, with ``new_col_name`` added (or overwritten).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # One token list per row, in row order.
    df[new_col_name] = [tokenizer.tokenize(text) for text in df[text_col].values]
    return df

# NOTE: get_tokens adds the column to `df` in place and returns it, so
# `tokens_df` and `df` refer to the same DataFrame.
tokens_df = get_tokens(
    model_name='Rijgersberg/GEITje-7B-chat-v2',
    df=df,
    text_col='text',
    new_col_name='GEITjeTokens',
)
# display(tokens_df)

In [42]:
def format_message(input_txt, label):
    """Build a two-turn chat exchange from an input text and its label.

    Args:
        input_txt: Content of the user turn.
        label: Content of the assistant turn.

    Returns:
        A list ``[user_message, assistant_message]``; each message is a dict
        with the keys ``"content"`` and ``"role"``.
    """
    turns = (('user', input_txt), ('assistant', label))
    return [{"content": content, 'role': role} for role, content in turns]



def format_data(df, text_col, model_token_col, label_col, split_col, token_threshold='full_text'):
    """Convert a labelled DataFrame into a chat-formatted ``DatasetDict``.

    Every row becomes one example with a ``prompt_id`` (the row's ``id`` value)
    and a ``message`` (user turn = the possibly truncated text, assistant turn =
    the label). Rows are assigned to splits by their value in ``split_col``.

    Args:
        df: DataFrame with the columns ``id``, ``text_col``, ``label_col`` and
            ``split_col``; ``model_token_col`` is also read when
            ``token_threshold`` is not ``'full_text'``.
        text_col: Column holding the document text.
        model_token_col: Column holding the list of tokenizer tokens per row.
        label_col: Column holding the label used as the assistant turn.
        split_col: Column holding each row's split name. Must be ``'4split'``
            (train, test, dev, val) or ``'2split'`` (train, test).
        token_threshold: ``'full_text'`` to use the complete text, otherwise the
            number of leading tokens that determines how much text is kept.

    Returns:
        A ``DatasetDict`` with one ``Dataset`` per split, each with the
        features ``prompt_id`` and ``message``.

    Raises:
        ValueError: If ``split_col`` is neither ``'4split'`` nor ``'2split'``.
    """
    split_names_by_col = {
        '4split': ('train', 'test', 'dev', 'val'),
        '2split': ('train', 'test'),
    }
    # Validate up front; previously an unsupported value only surfaced as an
    # UnboundLocalError after all rows had been processed.
    if split_col not in split_names_by_col:
        raise ValueError(
            f"split_col must be one of {sorted(split_names_by_col)}, got {split_col!r}"
        )

    # Collect plain records and build the frame once; appending row by row with
    # .loc re-allocates the frame on every insert.
    records = []
    for _, row in df.iterrows():
        if token_threshold == 'full_text':
            # select whole text
            input_txt = row[text_col]
        else:
            # select first n (= token_threshold) tokens using the model tokenizer
            tokens = row[model_token_col][0:token_threshold]

            # The joined tokens approximate the original text span. The tokenizer
            # writes "\n" as "<0x0A>", so map it back to get the character count.
            # NOTE(review): any other "<0x..>" token would be counted at its
            # literal length, so the cut-off is approximate.
            len_char = len(''.join(tokens).replace("<0x0A>", "\n"))

            # select the same amount of characters as the tokens
            input_txt = row[text_col][0:len_char]

        records.append({
            'prompt_id': row['id'],
            'message': format_message(input_txt, row[label_col]),
            split_col: row[split_col],
        })

    format_df = pd.DataFrame(records, columns=['prompt_id', 'message', split_col])

    # preserve_index=False keeps the pandas index out of the Arrow table, so no
    # '__index_level_0__' column has to be removed afterwards.
    return DatasetDict({
        split_name: Dataset.from_pandas(
            format_df.loc[format_df[split_col] == split_name].drop(columns=[split_col]),
            preserve_index=False,
        )
        for split_name in split_names_by_col[split_col]
    })
     

            


# Build the four-way split dataset, cutting each text to roughly its first 200 tokens.
data = format_data(
    df=tokens_df,
    text_col='text',
    model_token_col='GEITjeTokens',
    label_col='label',
    split_col='4split',
    token_threshold=200,
)
# Upload the formatted dataset to the Hugging Face Hub.
data.push_to_hub("FemkeBakker/AmsterdamGEITjeFormat200Tokens")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/666 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/FemkeBakker/trialdataet/commit/343b8b8f01dbdb7e18d63ee38d602aa954021a47', commit_message='Upload dataset', commit_description='', oid='343b8b8f01dbdb7e18d63ee38d602aa954021a47', pr_url=None, pr_revision=None, pr_num=None)

In [36]:
# Show the splits, features and row counts of the formatted dataset.
data

DatasetDict({
    train: Dataset({
        features: ['prompt_id', 'message'],
        num_rows: 38
    })
    test: Dataset({
        features: ['prompt_id', 'message'],
        num_rows: 5
    })
    dev: Dataset({
        features: ['prompt_id', 'message'],
        num_rows: 4
    })
    val: Dataset({
        features: ['prompt_id', 'message'],
        num_rows: 3
    })
})

In [37]:
# Write a local copy of the dataset to the "trialdataset" folder under the configured output path.
data.save_to_disk(f"{cf.output_path}/trialdataset")

Saving the dataset (0/1 shards):   0%|          | 0/38 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3 [00:00<?, ? examples/s]

In [38]:
# Upload the dataset to the Hugging Face Hub.
# NOTE(review): the repo id is spelled "trialdataet" (no "s"), unlike the local folder
# "trialdataset"; it matches the load_dataset call in the next cell — confirm it is intended.
data.push_to_hub("FemkeBakker/trialdataet")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/FemkeBakker/trialdataet/commit/5e1cc8684a76d55561867203620fb87d73098818', commit_message='Upload dataset', commit_description='', oid='5e1cc8684a76d55561867203620fb87d73098818', pr_url=None, pr_revision=None, pr_num=None)

In [39]:
# Round-trip check: download the dataset from the Hub again and display its splits.
load_dataset('FemkeBakker/trialdataet')

Downloading readme:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 15.3k/15.3k [00:00<00:00, 47.0kB/s]
Downloading data: 100%|██████████| 6.65k/6.65k [00:00<00:00, 22.7kB/s]
Downloading data: 100%|██████████| 4.72k/4.72k [00:00<00:00, 18.0kB/s]
Downloading data: 100%|██████████| 4.82k/4.82k [00:00<00:00, 19.3kB/s]


Generating train split:   0%|          | 0/38 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/3 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt_id', 'message'],
        num_rows: 38
    })
    test: Dataset({
        features: ['prompt_id', 'message'],
        num_rows: 5
    })
    dev: Dataset({
        features: ['prompt_id', 'message'],
        num_rows: 4
    })
    val: Dataset({
        features: ['prompt_id', 'message'],
        num_rows: 3
    })
})