In [1]:
# Decompress the zstd-compressed corpus into bigais.txt (relative to the current working directory).
!unzstd shuff2fullencoded.zst -o bigais.txt

shuff2fullencoded.zst: 49643184475 bytes                                       


In [32]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import copy
import json
import os
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
import collections
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM , AutoConfig
from datasets import load_dataset
from datasets import DatasetDict
import datasets
from transformers import DataCollatorForLanguageModeling 
from huggingface_hub.hf_api import HfFolder



In [46]:
""" AISTokenizer for Hugging Face Transformers.

"""


class AISTokenizer(PreTrainedTokenizer):
    """Whitespace tokenizer over a fixed vocabulary loaded from a text file.

    Input text is split on whitespace and every piece is looked up in the
    vocabulary. Id 0 is reserved for ``<|endoftext|>`` (which serves as the
    BOS, EOS and UNK token) and id 1 for ``<|pad|>``; the tokens listed in the
    vocabulary file receive ids 2, 3, ... in file order.
    """

    # Reserved special tokens; they always map to ids 0 and 1 respectively.
    _ENDOFTEXT_TOKEN = "<|endoftext|>"
    _PAD_TOKEN = "<|pad|>"

    def __init__(self, vocab: str, model_max_length: int, **kwargs):
        """Build the tokenizer from a vocabulary file.

        Args:
            vocab (str): Path to a UTF-8 text file with one token per line.
                Lines holding one of the reserved special tokens are skipped,
                so a file produced by :meth:`save_vocabulary` can be loaded
                back without shifting any id.
            model_max_length (int): Model maximum sequence length.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        reserved = (self._ENDOFTEXT_TOKEN, self._PAD_TOKEN)

        self.vocab = []
        with open(vocab, "r", encoding="utf-8") as file:
            for line in file:
                token = line.strip()
                # The reserved tokens already own ids 0 and 1; re-reading them
                # as regular entries would move them and leave holes in the
                # id space.
                if token in reserved:
                    continue
                self.vocab.append(token)
        self.model_max_length = model_max_length

        bos_token = AddedToken(self._ENDOFTEXT_TOKEN, lstrip=False, rstrip=False)
        eos_token = AddedToken(self._ENDOFTEXT_TOKEN, lstrip=False, rstrip=False)
        pad_token = AddedToken(self._PAD_TOKEN, lstrip=False, rstrip=False)
        unk_token = AddedToken(self._ENDOFTEXT_TOKEN, lstrip=False, rstrip=False)

        # The lookup tables must exist before super().__init__() runs, because
        # the base class may query the vocabulary while registering the
        # special tokens.
        self._vocab_str_to_int = {
            self._ENDOFTEXT_TOKEN: 0,
            self._PAD_TOKEN: 1,
            **{tok: idx + 2 for idx, tok in enumerate(self.vocab)},
        }
        # Decoded tokens carry a trailing space so that joining them with ""
        # in convert_tokens_to_string() yields space-separated text.
        self._vocab_int_to_str = {idx: tok + " " for tok, idx in self._vocab_str_to_int.items()}
        self.ids_to_tokens = collections.OrderedDict(
            (idx, tok) for tok, idx in self._vocab_str_to_int.items()
        )

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=False,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary, reserved tokens included."""
        return len(self._vocab_str_to_int)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` on runs of whitespace."""
        return text.split()

    def _convert_token_to_id(self, token: str) -> int:
        """Return the id of ``token``, falling back to the UNK token's id."""
        return self._vocab_str_to_int.get(token, self._vocab_str_to_int[self.unk_token])

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index`` with a trailing space appended.

        Raises:
            KeyError: If ``index`` is not a known id.
        """
        return self._vocab_int_to_str[index]

    def convert_tokens_to_string(self, tokens):
        """Concatenate tokens (each already ends with a space when decoded)."""
        return "".join(tokens)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return segment ids aligned with ``build_inputs_with_special_tokens``.

        Every sequence is preceded by one BOS token, so each segment is one
        element longer than the raw id list it describes.
        """
        output = [0] * (1 + len(token_ids_0))

        if token_ids_1 is not None:
            output += [1] * (1 + len(token_ids_1))

        return output

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Prepend the BOS token to each sequence; no EOS token is appended."""
        bos_token_ids = [self.bos_token_id]

        output = bos_token_ids + token_ids_0

        if token_ids_1 is None:
            return output

        return output + bos_token_ids + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a mask with 1 for special tokens and 0 for sequence tokens.

        When ``already_has_special_tokens`` is False the mask mirrors the
        layout produced by ``build_inputs_with_special_tokens`` (one BOS in
        front of each sequence), so it has the same length as those inputs.

        Raises:
            ValueError: If a second sequence is given together with
                ``already_has_special_tokens=True``.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            special_ids = (self.bos_token_id, self.eos_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        mask = [1] + [0] * len(token_ids_0)
        if token_ids_1 is not None:
            mask += [1] + [0] * len(token_ids_1)
        return mask

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token -> id mapping."""
        # A copy keeps callers from mutating the tokenizer's internal table.
        return dict(self._vocab_str_to_int)

    def save_vocabulary(self, vocab_path, filename_prefix: Optional[str] = None):
        """Write the vocabulary to disk, one token per line in id order.

        Args:
            vocab_path (:obj:`str`):
                Directory in which to save the vocabulary, or the full path of
                the file to write.
            filename_prefix (:obj:`str`, optional):
                Prefix prepended (followed by ``-``) to the file name when
                ``vocab_path`` is a directory.
        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(vocab_path):
            prefix = filename_prefix + "-" if filename_prefix else ""
            vocab_file = os.path.join(vocab_path, prefix + "vocab_file.txt")
        else:
            vocab_file = vocab_path

        index = 0
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self._vocab_str_to_int.items(), key=lambda kv: kv[1]):
                # A gap in the ids means the line number no longer equals the id.
                if index != token_index:
                    print(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)

In [47]:
# Path of the newline-delimited vocabulary file passed to AISTokenizer.
vocabfile = "/workspace/vocab.txt" 

In [48]:
# Sequence length in tokens; reused below as the tokenizer limit, the packing
# chunk size and the model's maximum position count.
context_length = 72
tokenizer = AISTokenizer(vocab=vocabfile, model_max_length=context_length)

In [49]:
# Load the text corpora and expose them as sharded, buffer-shuffled iterable
# datasets so the training file never has to be held in memory at once.
SHUFFLE_SEED = 42  # fixed so the sample order is reproducible across kernel restarts


def load_shuffled_text_stream(path, num_shards=32, buffer_size=10_000, seed=SHUFFLE_SEED):
    """Load a line-per-sample text file as a shuffled iterable dataset.

    Args:
        path: Text file to read; every line becomes one sample with a "text" column.
        num_shards: Number of shards the iterable dataset is split into.
        buffer_size: Size of the shuffle buffer.
        seed: Seed for the shuffle, making the order deterministic.

    Returns:
        The shuffled iterable dataset.
    """
    dataset = load_dataset('text', data_files=path, split='train')
    return dataset.to_iterable_dataset(num_shards=num_shards).shuffle(seed=seed, buffer_size=buffer_size)


train_dataset = load_shuffled_text_stream('/workspace/bigais.txt')
test_dataset = load_shuffled_text_stream('/workspace/AIStest.txt')


Loading dataset shards:   0%|          | 0/99 [00:00<?, ?it/s]

In [50]:
# README:
# `tokenize` is applied to the datasets in batched mode. For each mapping batch it tokenizes every sample,
# joins all resulting id lists into one long sequence and cuts that sequence into chunks of context_length.
# Packing the samples (most of which are shorter than context_length) this way avoids large amounts of padding
# and makes training more efficient. Each sample starts with the <|endoftext|> token added by the tokenizer,
# so the model also sees where one sample ends and the next one begins.
def tokenize(element):
    """Tokenize a batch of text samples and repack them into fixed-size chunks.

    Args:
        element: Batch mapping whose "text" entry is a list of strings.

    Returns:
        A dict with a single "input_ids" key holding lists of at most
        ``context_length`` token ids. Only the last chunk of a batch can be
        shorter. The tokenizer's other outputs (token_type_ids,
        attention_mask) are intentionally dropped.
    """
    encoded = tokenizer(element["text"])

    # Flatten the per-sample id lists into one continuous sequence.
    flat_ids = []
    for sample_ids in encoded['input_ids']:
        flat_ids.extend(sample_ids)

    # Cut the sequence into consecutive windows of context_length ids.
    chunks = []
    for start in range(0, len(flat_ids), context_length):
        chunks.append(flat_ids[start:start + context_length])

    return {'input_ids': chunks}


In [51]:
# Apply the packing tokenizer to both streams. The raw "text" column is removed
# because tokenize() returns a different number of rows than it receives.
train_dataset = train_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)
test_dataset = test_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

In [52]:
# Set up our model parameters: start from the Mistral-7B-v0.1 configuration and
# override the sizes to get a much smaller model over our own vocabulary.
config = AutoConfig.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    vocab_size=len(tokenizer),
    intermediate_size=1600,
    hidden_size=432,
    num_attention_heads=8,
    num_hidden_layers=12,
    max_position_embeddings=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # The tokenizer has a dedicated <|pad|> token; record it in the config so
    # nothing downstream has to fall back to using EOS as padding.
    pad_token_id=tokenizer.pad_token_id,
    sliding_window=context_length,
    use_cache=False,
)


In [53]:
# Instantiate a randomly initialised model from the config.
# Note: flash attention 2 and bfloat16 cannot be used on T4 or P100 GPUs.
model = AutoModelForCausalLM.from_config(
    config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# Element count over model.parameters().
model_size = 0
for t in model.parameters():
    model_size += t.numel()
print(f"Model size: {model_size/1000**2:.1f}M parameters")

# Element count over every tensor stored in the state dict.
params = model.state_dict()
total_params = sum(map(torch.numel, params.values()))
print("Total Parameters:", total_params)

Model size: 34.9M parameters
Total Parameters: 34871472


In [54]:

# This makes our attention masks and labels for us for each batch (mlm=False selects causal-LM labels).
# Note: do NOT set pad_token to eos_token like many guides do. That would make the data collator set the
# label of every eos_token to -100, so it is ignored in the loss calculation, giving us a model that only
# knows how to yap and never learns to end a sequence.
data_collator = DataCollatorForLanguageModeling (tokenizer,mlm=False)

In [55]:
# Sanity check: collate the first two packed samples of the test stream.
# The datasets are iterable, so they cannot be indexed with []; pull items through an iterator instead.
it = iter(test_dataset)
first_two = [next(it), next(it)]
test = data_collator(first_two)
test

{'input_ids': tensor([[   0,  288,  505,  823,    7,  270,  271,  431,    3,  822,    8,  270,
          270,  270,  687,    8,  394,    3,  287,    4,  138,    3,   15,  511,
            4,  474,  244,  348,    8,  270,  270,    8,    4,  271,    7,    0,
          511,   15,  109,    3,  505,  245, 1078,    7, 1130, 1130, 1078,    3,
          244,  431,    8,  271,  272,  521,  272,  271,    8,    4, 1130, 1130,
            7,    4, 1078,    7, 1131, 1147, 1134, 1147, 1082,    7,  348,    7],
        [ 270,  270,    7,    0,  511,   15,  103,    3,  474,  687,    7,  270,
          272,  521, 1083,    8, 1080,    3,  387,    4, 1130, 1130, 1130, 1078,
            8,    7,    4,  751,    7,    8,  270,  270,  270,  818,    7,  271,
          495,  271,    8,    0,  286,  794,    3,  474,  138,    3,  467,    4,
           15,  511,    4,  103,    3,   15,  511,    4,  474,  244,  240,   90,
            3,  387,    4,    3,  387,    4,  387,    0,  286,  247, 1032,    3]]), 'attention

In [56]:
# Pack the train and test streams into one container. Both are iterable
# datasets, so use IterableDatasetDict rather than DatasetDict.
AISDataset = datasets.IterableDatasetDict({'train': train_dataset, 'test': test_dataset})

In [57]:
from transformers import Trainer, TrainingArguments

# Total optimisation steps; the evaluation, logging, warmup and checkpoint
# intervals below are all derived from it.
max_steps = 120000
args = TrainingArguments(
    bf16=True,
    output_dir="CHANGE_ME",  # placeholder: set a real output directory / hub repo name before running
    max_steps=max_steps,
    per_device_train_batch_size=712,
    per_device_eval_batch_size=712,
    evaluation_strategy="steps",
    eval_steps=max_steps // 20,
    logging_strategy="steps",
    logging_steps=max_steps // 100,
    # A positive max_steps overrides num_train_epochs (see the TrainingArguments docs).
    num_train_epochs=1,
    weight_decay=0.1,
    # Integer division keeps this an int, consistent with the other step counts.
    # NOTE(review): the reduce_lr_on_plateau scheduler presumably ignores warmup - confirm.
    warmup_steps=max_steps // 40,
    learning_rate=5e-4,
    lr_scheduler_type="reduce_lr_on_plateau",
    save_steps=max_steps // 8,
    push_to_hub=True,
    logging_first_step=True,
    dataloader_num_workers=32,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=AISDataset["train"].with_format("torch"),
    eval_dataset=AISDataset["test"].with_format("torch"),
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
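# Training is long-running, so the call is left commented out; uncomment to launch it.
# The loss table below is the output of an earlier run.
#trainer.train()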

Step,Training Loss,Validation Loss
6000,0.6667,0.663473
12000,0.6553,0.654182
18000,0.6507,0.650171
24000,0.6483,0.647097
30000,0.6465,0.645241
