In [1]:
run_local = True

In [2]:
import wandb
import huggingface_hub


if run_local:
    import os
    from dotenv import load_dotenv

    load_dotenv()

    # TODO - write your own token here
    hf_token = os.getenv("HF_TOKEN")
    wandb_token = os.getenv("wandb_api_key")
    huggingface_hub.login(token=hf_token)

    # login into the clients
    wandb.login(key=wandb_token)
    huggingface_hub.login(token=hf_token)

else:
    if run_on_kaggle:
        # access the secrets
        from kaggle_secrets import UserSecretsClient

        user_secrets = UserSecretsClient()

        # fetch the tokens from secrets
        wandb_token = user_secrets.get_secret("wandb_api_key")
        hf_token = user_secrets.get_secret("HF_TOKEN")

        # login into the clients
        wandb.login(key=wandb_token)
        huggingface_hub.login(token=hf_token)
    else:
        from google.colab import userdata

        # get the token from the userdata
        hf_token = userdata.get("HF_TOKEN")
        wandb_token = userdata.get("wandb_api_key")

        # login into the clients
        wandb.login(key=wandb_token)
        huggingface_hub.login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbode-karl-erik[0m ([33merikbodedev[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/erik/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset

# Load the datasets
common_voice = load_dataset("mozilla-foundation/common_voice_13_0", "en", trust_remote_code=True)


In [4]:
import copy

In [5]:
common_voice

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 1013968
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 16372
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 16372
    })
    other: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 278333
    })
    invalidated: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        n

In [6]:
common_voice_modified = copy.deepcopy(common_voice)

In [7]:
common_voice_modified = common_voice_modified.remove_columns(["client_id","path", "audio","up_votes", "down_votes","age", "gender", "accent", "locale", "segment", "variant"])

In [8]:
common_voice_modified

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 1013968
    })
    validation: Dataset({
        features: ['sentence'],
        num_rows: 16372
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 16372
    })
    other: Dataset({
        features: ['sentence'],
        num_rows: 278333
    })
    invalidated: Dataset({
        features: ['sentence'],
        num_rows: 264713
    })
})

In [9]:
# only keep train, validation and test subsets
import datasets
temp = datasets.DatasetDict({"train": common_voice_modified["train"], "validation": common_voice_modified["validation"], "test": common_voice_modified["test"]})

In [10]:
temp = temp.rename_column("sentence", "text")

In [11]:
# check for empty entries in train and validation and remove them
temp = temp.filter(function=lambda x: len(x["text"]) > 30)

In [12]:
# cast to lowercase
temp = temp.map(function=lambda x: {"text": x["text"].lower()})

In [13]:
temp

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 961433
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 14452
    })
    test: Dataset({
        features: ['text'],
        num_rows: 14017
    })
})

In [14]:
max_len = 0
min_len = 1000
for line in temp["train"]["text"]:
    temp_len = len(line)
    if temp_len > max_len:
        max_len = temp_len
    if temp_len < min_len:
        min_len = temp_len



In [15]:
max_len, min_len

(229, 31)

In [16]:
temp.save_to_disk("kaggle/input/train-datasets/cv_mod_30.hf")

Saving the dataset (0/1 shards):   0%|          | 0/961433 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14452 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14017 [00:00<?, ? examples/s]

In [17]:
# tokenize the data to see min length of the text
import os
from dotenv import load_dotenv

load_dotenv()

# TODO - write your own token here
hf_token = os.getenv("HF_TOKEN")
wandb_token = os.getenv("wandb_api_key")
huggingface_hub.login(token=hf_token)

# login into the clients
wandb.login(key=wandb_token)
huggingface_hub.login(token=hf_token)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "google/gemma-2b"
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
tokenizer.padding_side = 'right'
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0}, token=hf_token)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/erik/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


In [18]:
temp

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 961433
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 14452
    })
    test: Dataset({
        features: ['text'],
        num_rows: 14017
    })
})

In [19]:
data_train = temp["train"]
data_validation = temp["validation"]
data_test = temp["test"]

In [20]:
!rm -rf "kaggle/working/temp_data_sets"
!mkdir "kaggle/working/temp_data_sets"

working_dir = "kaggle/working/temp_data_sets/"


In [21]:


max_length = 16
# Map function to apply tokenization and caching - TODO use formatting function to avoid code redudancy
train_data = data_train.map(
    lambda examples: tokenizer(
        examples["text"],
        padding="max_length",    # Pad to the maximum sequence length
        truncation=True,         # Truncate sequences longer than max_length
        max_length=max_length,          # Maximum sequence length
        return_attention_mask=True,  # Return attention masks
        return_tensors="pt"      # Return PyTorch tensors
    ),
    batched=True,
    cache_file_name=working_dir + "vp_train.cache"
)


val_data = data_validation.map(
    lambda examples: tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors="pt"
    ),
    batched=True,
    cache_file_name=working_dir + "vp_valid.cache"
)

test_data = data_test.map(
    lambda examples: tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors="pt"
    ),
    batched=True,
    cache_file_name=working_dir + "vp_test.cache"
)

Map:   0%|          | 0/961433 [00:00<?, ? examples/s]

Map:   0%|          | 0/14452 [00:00<?, ? examples/s]

Map:   0%|          | 0/14017 [00:00<?, ? examples/s]

In [22]:
import numpy as np

In [23]:
min_enc = 1000
max_enc = 0
for enc_data in [np.array(train_data["input_ids"]), np.array(val_data["input_ids"]), np.array(test_data["input_ids"])]:
    # enc data is 2D array
    # find the min and max length of the encoded data excluding zeros
    for enc in enc_data:
        temp_len = len(enc[enc != 0])
        if temp_len < min_enc:
            min_enc = temp_len
        if temp_len > max_enc:
            max_enc = temp_len

In [24]:
min_enc, max_enc

(4, 16)