In [3]:
import logging
import os
import pickle
import sys
from contextlib import nullcontext

import numpy as np
import pandas as pd
from tqdm import tqdm

import huggingface_hub
import torch
import transformers
from transformers import AutoConfig, AutoModel
import dataclasses

from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import HfArgumentParser

from tevatron.arguments import ModelArguments, DataArguments, \
    TevatronTrainingArguments as TrainingArguments
from trainer import TevatronTrainer
from data import HFQueryDataset, HFCorpusDataset, HFTrainDataset, TrainDataset, TrainCollator
from transformers.utils import is_flash_attn_2_available

from repllama import RepLLaMA
from data import EncodeDataset, EncodeCollator
from utils import replace_with_xformers_attention

logger = logging.getLogger(__name__)
%reload_ext autoreload
%autoreload 2

In [7]:
huggingface_hub.__path__

_NamespacePath(['/home/azureuser/data/miniconda3/envs/train_emb3/lib/python3.11/site-packages/huggingface_hub'])

In [None]:
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 120)

In [None]:
torch.cuda.is_available(), is_flash_attn_2_available()

# 1) Load parameters:

In [4]:
parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_json_file(json_file='./train_params.json')

In [5]:
len(sys.argv), sys.argv

(2,
 ['/home/azureuser/data/miniconda3/envs/train_emb3/lib/python3.11/site-packages/ipykernel_launcher.py',
  '--f=/home/azureuser/.local/share/jupyter/runtime/kernel-v2-1751JPgbsTT5Wre2.json'])

In [6]:
parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
#if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
    #model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
model_args, data_args, training_args = parser.parse_json_file(json_file='./train_params.json')
#else:
#    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
#    model_args: ModelArguments
#    data_args: DataArguments
#    training_args: TrainingArguments

if training_args.local_rank > 0 or training_args.n_gpu > 1:
    raise NotImplementedError('Multi-GPU encoding is not supported.')

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)

tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir, ) #token='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')
tokenizer.pad_token_id = tokenizer.unk_token_id
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"
tokenizer

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
dataclasses.asdict(data_args), model_args.cache_dir, data_args.dataset_split

({'train_dir': None,
  'dataset_name': 'Tevatron/msmarco-passage',
  'passage_field_separator': ' ',
  'dataset_proc_num': 12,
  'train_n_passages': 16,
  'positive_passage_no_shuffle': False,
  'negative_passage_no_shuffle': False,
  'encode_in_path': None,
  'encoded_save_path': None,
  'encode_is_qry': False,
  'encode_num_shard': 1,
  'encode_shard_index': 0,
  'q_max_len': 32,
  'p_max_len': 196,
  'data_cache_dir': None},
 None,
 'train')

# 2) Load Data:

In [8]:
train_dataset = HFTrainDataset(tokenizer=tokenizer, data_args=data_args,
                                   cache_dir=data_args.data_cache_dir or model_args.cache_dir)
train_dataset.dataset

Dataset({
    features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
    num_rows: 10000
})

In [9]:
train_dataset.dataset[999]

{'query_id': '503854',
 'query': 'stock option definition',
 'positive_passages': [{'docid': '5912909',
   'title': 'Definitions &Translations',
   'text': 'stock option(noun) the right to buy or sell a stock at a specified price within a stated period. stock option(noun) a benefit given by a company to an employee in the form of an option to buy stock in the company at a discount or at a fixed price. stock options are not much use as an incentive if the price at which they can be exercised is out of reach.'}],
 'negative_passages': [{'docid': '2718054',
   'title': '-',
   'text': 'Capital Management Outline Definition of bank capital Role of bank capital Capital adequacy Shareholdersâ\x80\x99 viewpoint Trends in bank capital Definition of bank capital Equity Common stock, preferred stock, surplus, and undivided profits equals the book value of equity.'},
  {'docid': '1885414',
   'title': 'Blocks',
   'text': 'Brick shaped blocks provide stabilizing support, while oval shaped blocks 

In [10]:
#train_dataset.dataset[0:10000]['query']
train_dataset.dataset.num_rows

10000

In [11]:
#train_dataset.dataset.size_in_bytes
#dir(train_dataset.dataset)

In [12]:
train_dataset = TrainDataset(data_args, train_dataset.process(), tokenizer)

In [13]:
train_dataset.total_len

10000

# 3) Train:

In [14]:
model = RepLLaMA.build(
        model_args,
        training_args,
        cache_dir=model_args.cache_dir,
        attn_implementation = "flash_attention_2")
        #token='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')

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s]


In [15]:
next(model.parameters()).is_cuda

False

In [17]:
model.lm_q.to_bettertransformer()

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


LlamaModel(
  (embed_tokens): Embedding(32000, 4096)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaFlashAttention2(
        (q_proj): lora.Linear(
          (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict(
            (default): Linear(in_features=4096, out_features=32, bias=False)
          )
          (lora_B): ModuleDict(
            (default): Linear(in_features=32, out_features=4096, bias=False)
          )
          (lora_embedding_A): ParameterDict()
          (lora_embedding_B): ParameterDict()
        )
        (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (v_proj): lora.Linear(
          (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )


In [16]:
model = model.to_bettertransformer()

AttributeError: 'RepLLaMA' object has no attribute 'to_bettertransformer'

In [None]:
effective_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
max_steps = int(np.ceil(train_dataset.total_len / effective_batch_size))
training_args.max_steps = max_steps

dataclasses.asdict(training_args)

In [None]:
trainer_cls = TevatronTrainer
trainer = trainer_cls(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=TrainCollator(
            tokenizer,
            max_p_len=data_args.p_max_len,
            max_q_len=data_args.q_max_len
        ),
    )
train_dataset.trainer = trainer

In [None]:
trainer.train()