In [130]:
# Runtime selection flags read by the login cell.
# run_local: when True, tokens are loaded from a .env file via python-dotenv.
run_local = True
# run_on_kaggle: only consulted when run_local is False. True selects Kaggle
# secrets, False selects Google Colab userdata. Defined here so that switching
# run_local off does not raise a NameError.
run_on_kaggle = False

In [131]:
import wandb
import huggingface_hub


import os

# Fetch the Hugging Face and Weights & Biases tokens from the secret store
# that matches the current runtime, then log in to each client exactly once.
if run_local:
    # Local run: tokens come from the environment, populated by load_dotenv().
    from dotenv import load_dotenv

    load_dotenv()
    hf_token = os.getenv("HF_TOKEN")
    wandb_token = os.getenv("wandb_api_key")
elif globals().get("run_on_kaggle", False):
    # Kaggle: tokens are stored as notebook secrets.
    # globals().get(...) avoids a NameError when the flag was never defined;
    # in that case the Colab branch below is used.
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    wandb_token = user_secrets.get_secret("wandb_api_key")
    hf_token = user_secrets.get_secret("HF_TOKEN")
else:
    # Google Colab: tokens are stored in the notebook's userdata.
    from google.colab import userdata

    hf_token = userdata.get("HF_TOKEN")
    wandb_token = userdata.get("wandb_api_key")

# Single login per client, shared by all three branches.
wandb.login(key=wandb_token)
huggingface_hub.login(token=hf_token)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/erik/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


In [132]:
from datasets import load_dataset

# Download the "voxpopuli" configuration of the "esb/datasets" repository.
# trust_remote_code=True permits running the repository's own loading script.
voxpopuli = load_dataset(path="esb/datasets", name="voxpopuli", trust_remote_code=True)

In [133]:
import copy

In [134]:
# Show the splits, column names and row counts of the loaded dataset.
voxpopuli

DatasetDict({
    train: Dataset({
        features: ['audio', 'dataset', 'text', 'id'],
        num_rows: 182482
    })
    validation: Dataset({
        features: ['audio', 'dataset', 'text', 'id'],
        num_rows: 1753
    })
    test: Dataset({
        features: ['audio', 'dataset', 'text', 'id'],
        num_rows: 1842
    })
})

In [135]:
# Work on a deep copy so that `voxpopuli` itself stays as it was loaded.
voxpopuli_modified = copy.deepcopy(voxpopuli)

In [136]:
# Keep only the "text" column. The result is derived from the untouched
# `voxpopuli` (remove_columns returns a new DatasetDict), so this cell can be
# re-run; chaining from `voxpopuli_modified` fails on a second run because the
# columns have already been removed.
voxpopuli_modified = voxpopuli.remove_columns(["audio", "dataset", "id"])

In [137]:
# Keep only transcripts longer than MIN_TEXT_CHARS characters. The filter runs
# on every split (train, validation and test); empty entries are dropped too,
# since their length is 0.
MIN_TEXT_CHARS = 90
voxpopuli_modified = voxpopuli_modified.filter(
    function=lambda example: len(example["text"]) > MIN_TEXT_CHARS
)

Filter:   0%|          | 0/182482 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1753 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1842 [00:00<?, ? examples/s]

In [138]:
# Carve a hold-out "test" split out of train with exactly as many rows as the
# filtered validation split.
SPLIT_SEED = 42  # fixed seed so the shuffle, and therefore the split, is reproducible
n_holdout = len(voxpopuli_modified["validation"])
# Fraction of train that the hold-out represents (kept for reference).
frac = n_holdout / len(voxpopuli_modified["train"])
# An integer test_size is an absolute row count, which avoids the rounding a
# float fraction is subject to.
temp = voxpopuli_modified["train"].train_test_split(test_size=n_holdout, seed=SPLIT_SEED)

In [139]:
# Add the filtered validation split so `temp` holds train / test / validation.
# Note: "test" in `temp` is the hold-out carved from train by train_test_split,
# not the "test" split of `voxpopuli_modified`.
temp["validation"] = voxpopuli_modified["validation"]

In [140]:
# Show the resulting splits and their row counts.
temp

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 133647
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1245
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1245
    })
})

In [141]:
# Character-length range of the training transcripts.
# min()/max() replace the sentinel-initialised loop, whose starting value of
# 1000 would be reported as the minimum if every transcript were longer than
# 1000 characters. Both default to 0 when the split is empty.
text_lengths = [len(line) for line in temp["train"]["text"]]
max_len = max(text_lengths, default=0)
min_len = min(text_lengths, default=0)



In [142]:
# Longest and shortest training transcript, in characters.
max_len, min_len

(2905, 91)

In [143]:
# Persist every split of `temp` to disk; the path is relative to the current
# working directory.
temp.save_to_disk("kaggle/input/train-datasets/vp_mod_90.hf")

Saving the dataset (0/1 shards):   0%|          | 0/133647 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1245 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1245 [00:00<?, ? examples/s]

In [144]:
# Load the tokenizer so the token-level length of the text can be measured.
import os
from dotenv import load_dotenv

load_dotenv()

# Tokens are read from the environment (populated by load_dotenv), never hard-coded.
hf_token = os.getenv("HF_TOKEN")
wandb_token = os.getenv("wandb_api_key")

# Log in to each client once.
wandb.login(key=wandb_token)
huggingface_hub.login(token=hf_token)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "google/gemma-2b"
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
# Pad on the right so padding tokens follow the real tokens.
tokenizer.padding_side = 'right'
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0}, token=hf_token)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/erik/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


In [145]:
# Re-display the splits before they are tokenized.
temp

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 133647
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1245
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1245
    })
})

In [146]:
# Bind each split of `temp` to its own name for the tokenization step.
data_train, data_validation, data_test = (
    temp[split_name] for split_name in ("train", "validation", "test")
)

In [147]:
import shutil
from pathlib import Path

# Scratch directory for the tokenization cache files.
working_dir = "kaggle/working/temp_data_sets/"

# Start from an empty directory. Done in pure Python rather than `!rm` / `!mkdir`
# so that no shell process is spawned from the kernel (the shell escapes were
# followed by tokenizers "process just got forked" warnings) and so that missing
# parent directories are created instead of making mkdir fail.
shutil.rmtree(working_dir, ignore_errors=True)
Path(working_dir).mkdir(parents=True, exist_ok=True)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [148]:


max_length = 16


def _tokenize_batch(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is padded or truncated to exactly `max_length` tokens and
    the attention mask is returned alongside the input ids.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",        # pad up to max_length
        truncation=True,             # cut sequences longer than max_length
        max_length=max_length,
        return_attention_mask=True,
        return_tensors="pt",         # PyTorch tensors
    )


def _tokenize_split(split, cache_name):
    """Apply `_tokenize_batch` to one split, caching the result under `working_dir`."""
    return split.map(
        _tokenize_batch,
        batched=True,
        cache_file_name=working_dir + cache_name,
    )


# One call per split; the shared helpers replace three copy-pasted map() calls.
train_data = _tokenize_split(data_train, "vp_train.cache")
val_data = _tokenize_split(data_validation, "vp_valid.cache")
test_data = _tokenize_split(data_test, "vp_test.cache")

Map:   0%|          | 0/133647 [00:00<?, ? examples/s]

Map:   0%|          | 0/1245 [00:00<?, ? examples/s]

Map:   0%|          | 0/1245 [00:00<?, ? examples/s]

In [149]:
import numpy as np

In [150]:
# Smallest and largest number of real (non-padding) tokens per example across
# all three tokenized splits.
# The attention mask is 1 for real tokens and 0 for padding, so its row sum is
# the token count; this avoids assuming the pad token id is 0. The counts are
# computed per split in one vectorized step instead of a Python loop per row.
per_split_counts = [
    np.asarray(split["attention_mask"]).sum(axis=1)
    for split in (train_data, val_data, test_data)
    if len(split) > 0
]
if per_split_counts:
    token_counts = np.concatenate(per_split_counts)
    min_enc = int(token_counts.min())
    max_enc = int(token_counts.max())
else:
    # No examples at all: report an empty range instead of sentinel values.
    min_enc = max_enc = 0

In [151]:
# Fewest and most non-padding tokens found in any example.
min_enc, max_enc

(12, 16)