In [1]:
import logging
import os
import pickle
import sys
from contextlib import nullcontext

import numpy as np
import pandas as pd
from tqdm import tqdm

import pynvml
import huggingface_hub
import torch
import transformers
from transformers import AutoConfig, AutoModel
import dataclasses

from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import HfArgumentParser

from tevatron.arguments import ModelArguments, DataArguments, \
    TevatronTrainingArguments as TrainingArguments
from trainer import TevatronTrainer
from data import HFQueryDataset, HFCorpusDataset, HFTrainDataset, TrainDataset, TrainCollator
from transformers.utils import is_flash_attn_2_available

from repllama import RepLLaMA
from data import EncodeDataset, EncodeCollator
from utils import replace_with_xformers_attention

# Raise pandas' display limits so wide / long frames are shown with up to
# 70 columns and 120 rows in cell output.
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', 120)

# Logger named after the current module.
logger = logging.getLogger(__name__)
# (Re)load the autoreload extension and use mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def gpu_util_nv(device=0):
    """Print the memory currently in use on one GPU, as reported by NVML.

    The figure comes from the ``used`` field of NVML's memory info for the
    device and is floor-divided from bytes to MB (1024**2 bytes).

    Args:
        device: NVML device index to query (default 0).

    Raises:
        Whatever ``pynvml`` raises when NVML cannot be initialised or the
        device index is invalid; NVML is still shut down in that case.
    """
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Pair every nvmlInit() with an nvmlShutdown(): this helper is called
        # many times in the notebook and would otherwise leave one NVML
        # initialisation open per call.
        pynvml.nvmlShutdown()
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def gpu_util_pt(device=0):
    """Print PyTorch's CUDA memory counters for one device, in MB.

    Four values are printed, one per line: memory_allocated,
    max_memory_allocated, memory_reserved and max_memory_reserved. Each is
    floor-divided from bytes to MB (1024**2 bytes).

    Args:
        device: Device passed to the ``torch.cuda`` memory queries (default 0).
    """
    bytes_per_mb = 1024**2

    allocated_mb = torch.cuda.memory_allocated(device) // bytes_per_mb
    print(f"Pytorch GPU memory_allocated: {allocated_mb} MB.")

    peak_allocated_mb = torch.cuda.max_memory_allocated(device) // bytes_per_mb
    print(f"Pytorch GPU max_memory_allocated: {peak_allocated_mb} MB.")

    reserved_mb = torch.cuda.memory_reserved(device) // bytes_per_mb
    print(f"Pytorch GPU memory_reserved: {reserved_mb} MB.")

    peak_reserved_mb = torch.cuda.max_memory_reserved(device) // bytes_per_mb
    print(f"Pytorch GPU max_memory_reserved: {peak_reserved_mb} MB.")

# Baseline snapshot, taken before any model is built in this notebook.
gpu_util_nv()
gpu_util_pt()

GPU memory occupied: 887 MB.
Pytorch GPU memory_allocated: 0 MB.
Pytorch GPU max_memory_allocated: 0 MB.
Pytorch GPU memory_reserved: 0 MB.
Pytorch GPU max_memory_reserved: 0 MB.


In [3]:
# Sanity check: does PyTorch see CUDA, and does transformers report
# FlashAttention-2 as available?
torch.cuda.is_available(), is_flash_attn_2_available()

(True, True)

# 1) Load parameters:

In [4]:
# Parse the model, data and training argument dataclasses from
# ./train_params.json.
parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_json_file(json_file='./train_params.json')

In [5]:
# model_args, data_args and training_args are the objects parsed from
# ./train_params.json in the cell directly above; they are reused here rather
# than being parsed a second time.

# Refuse to run with a non-zero local rank or more than one visible GPU.
if training_args.local_rank > 0 or training_args.n_gpu > 1:
    raise NotImplementedError('Multi-GPU training is not supported in this notebook.')

# Setup logging: INFO when local_rank is -1 or 0, WARN otherwise.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)

# Load the tokenizer, preferring an explicit tokenizer name over the model path.
# NOTE(review): pass `token=...` here if the checkpoint needs authentication.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)
# Reuse the unknown token for padding, and pad on the right.
tokenizer.pad_token_id = tokenizer.unk_token_id
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"
tokenizer

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
# Show the parsed data arguments as a dict, together with the model cache
# directory and the dataset split.
dataclasses.asdict(data_args), model_args.cache_dir, data_args.dataset_split

({'train_dir': None,
  'dataset_name': 'Tevatron/msmarco-passage',
  'passage_field_separator': ' ',
  'dataset_proc_num': 12,
  'train_n_passages': 16,
  'positive_passage_no_shuffle': False,
  'negative_passage_no_shuffle': False,
  'encode_in_path': None,
  'encoded_save_path': None,
  'encode_is_qry': False,
  'encode_num_shard': 1,
  'encode_shard_index': 0,
  'q_max_len': 32,
  'p_max_len': 196,
  'data_cache_dir': None},
 None,
 'train')

# 2) Load Data:

In [7]:
# Build the HFTrainDataset wrapper. Its cache directory falls back to the model
# cache directory when data_args.data_cache_dir is unset.
train_dataset = HFTrainDataset(tokenizer=tokenizer, data_args=data_args,
                                   cache_dir=data_args.data_cache_dir or model_args.cache_dir)
# Display the underlying dataset object held by the wrapper.
train_dataset.dataset

Dataset({
    features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
    num_rows: 10000
})

In [8]:
# Peek at one raw record (index 999) to see the field layout.
train_dataset.dataset[999]

{'query_id': '503854',
 'query': 'stock option definition',
 'positive_passages': [{'docid': '5912909',
   'title': 'Definitions &Translations',
   'text': 'stock option(noun) the right to buy or sell a stock at a specified price within a stated period. stock option(noun) a benefit given by a company to an employee in the form of an option to buy stock in the company at a discount or at a fixed price. stock options are not much use as an incentive if the price at which they can be exercised is out of reach.'}],
 'negative_passages': [{'docid': '2718054',
   'title': '-',
   'text': 'Capital Management Outline Definition of bank capital Role of bank capital Capital adequacy Shareholdersâ\x80\x99 viewpoint Trends in bank capital Definition of bank capital Equity Common stock, preferred stock, surplus, and undivided profits equals the book value of equity.'},
  {'docid': '1885414',
   'title': 'Blocks',
   'text': 'Brick shaped blocks provide stabilizing support, while oval shaped blocks 

In [9]:
#train_dataset.dataset[0:10000]['query']
# Number of rows in the underlying dataset.
train_dataset.dataset.num_rows

10000

In [10]:
#train_dataset.dataset.size_in_bytes
#dir(train_dataset.dataset)

In [11]:
# Wrap the processed dataset in a TrainDataset for the trainer.
# NOTE(review): this rebinds `train_dataset` from an HFTrainDataset to a
# TrainDataset, so re-running this cell alone would call `.process()` on the
# TrainDataset instead — confirm, and consider using a distinct name.
train_dataset = TrainDataset(data_args, train_dataset.process(), tokenizer)

In [12]:
# Example count as reported by the TrainDataset's `total_len` attribute.
train_dataset.total_len

10000

# 3) Train:

In [13]:
# Build the RepLLaMA model from the parsed arguments, requesting the
# FlashAttention-2 attention implementation.
# NOTE(review): the recorded output below warns that the model is not
# initialised on the GPU at this point — presumably the trainer moves it
# later; confirm.
model = RepLLaMA.build(
        model_args,
        training_args,
        cache_dir=model_args.cache_dir,
        attn_implementation = "flash_attention_2")
        #token='')
#"optim":"adamw_bnb_8bit"

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s]


In [14]:
# Check whether the model's first parameter is on a CUDA device.
next(model.parameters()).is_cuda

False

In [15]:
#model.cuda(device=0)

In [16]:
# Snapshot after the model has been built and before the trainer is created.
gpu_util_nv()
gpu_util_pt()

GPU memory occupied: 887 MB.
Pytorch GPU memory_allocated: 0 MB.
Pytorch GPU max_memory_allocated: 0 MB.
Pytorch GPU memory_reserved: 0 MB.
Pytorch GPU max_memory_reserved: 0 MB.


In [17]:
# model.lm_q.to_bettertransformer()

# The BetterTransformer implementation does not support padding during training,
# as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result
# in unexpected outputs.
# Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.

In [18]:
#model = model.to_bettertransformer()

In [19]:
# Examples consumed per optimizer step: per-device batch size times the number
# of gradient-accumulation steps.
effective_batch_size = (
    training_args.gradient_accumulation_steps
    * training_args.per_device_train_batch_size
)

# Steps needed to see every example once, rounded up, stored on the training
# arguments as max_steps.
training_args.max_steps = max_steps = int(
    np.ceil(train_dataset.total_len / effective_batch_size)
)

# Display the full, final set of training arguments.
dataclasses.asdict(training_args)

{'output_dir': 'repllama_test',
 'overwrite_output_dir': True,
 'do_train': False,
 'do_eval': False,
 'do_predict': False,
 'evaluation_strategy': <IntervalStrategy.NO: 'no'>,
 'prediction_loss_only': False,
 'per_device_train_batch_size': 1,
 'per_device_eval_batch_size': 32,
 'per_gpu_train_batch_size': None,
 'per_gpu_eval_batch_size': None,
 'gradient_accumulation_steps': 1,
 'eval_accumulation_steps': None,
 'eval_delay': 0,
 'learning_rate': 0.0001,
 'weight_decay': 0.0,
 'adam_beta1': 0.9,
 'adam_beta2': 0.999,
 'adam_epsilon': 1e-08,
 'max_grad_norm': 1.0,
 'num_train_epochs': 1,
 'max_steps': 10000,
 'lr_scheduler_type': <SchedulerType.LINEAR: 'linear'>,
 'lr_scheduler_kwargs': {},
 'warmup_ratio': 0.1,
 'warmup_steps': 10,
 'log_level': 'passive',
 'log_on_each_node': True,
 'logging_dir': 'repllama_test/runs/Mar20_17-40-13_alex-gpu-1',
 'logging_strategy': <IntervalStrategy.STEPS: 'steps'>,
 'logging_first_step': False,
 'logging_steps': 10,
 'logging_nan_inf_filter': True,

In [20]:
# Imported here because DataLoaderConfiguration is not among the notebook's
# top-level imports; without it the last line of this cell raises NameError on
# a fresh kernel.
from accelerate.utils import DataLoaderConfiguration

# Assemble the trainer from the model, the training arguments, the dataset and
# a collator using the query / passage length limits from data_args.
trainer_cls = TevatronTrainer
trainer = trainer_cls(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=TrainCollator(
        tokenizer,
        max_p_len=data_args.p_max_len,
        max_q_len=data_args.q_max_len,
    ),
)
# Give the dataset a back-reference to the trainer.
train_dataset.trainer = trainer

# NOTE(review): this configuration object is only created here; nothing in this
# cell passes it to the trainer — confirm whether it is still needed.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [21]:
#trainer.is_in_train

# Snapshot after the trainer has been constructed and before training starts.
gpu_util_nv()
gpu_util_pt()

GPU memory occupied: 14120 MB.
Pytorch GPU memory_allocated: 12803 MB.
Pytorch GPU max_memory_allocated: 12803 MB.
Pytorch GPU memory_reserved: 12812 MB.
Pytorch GPU max_memory_reserved: 12812 MB.


In [22]:
# Start training with the trainer configured above.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using t

Step,Training Loss
10,3.5336
20,3.8414
30,2.85
40,2.3
50,2.4008
60,1.9531


KeyboardInterrupt: 

In [None]:
# Snapshot after the training cell.
gpu_util_nv()
gpu_util_pt()

In [28]:
# Inspect the inner `.optimizer` held by `trainer.optimizer`.
trainer.optimizer.optimizer

AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 0.0001
    lr: 9.948948948948949e-05
    weight_decay: 0.0

Parameter Group 1
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 0.0001
    lr: 9.948948948948949e-05
    weight_decay: 0.0
)

In [None]:
def flush():
    """Run garbage collection, empty PyTorch's CUDA cache and reset peak stats.

    The steps run in this order: ``gc.collect()``,
    ``torch.cuda.empty_cache()``, ``torch.cuda.reset_peak_memory_stats()``.

    Returns:
        None.
    """
    # Imported locally: `gc` is not among the notebook's top-level imports, so
    # without this the function raises NameError when called.
    import gc

    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

from accelerate.utils import release_memory

# release_memory() clears only its own references and returns them as a list of
# None values, so the result must be assigned back for the notebook's `model`
# name to stop referencing the model.
# NOTE(review): `trainer` and `train_dataset.trainer` were also given references
# to this model in an earlier cell, so GPU memory is presumably not returned
# until those are released too — confirm.
(model,) = release_memory(model)