In [1]:
# !pip install torch==2.0.1
# !pip install bitsandbytes
# !pip install transformers==4.31.0
# !pip install peft
# !pip install azureml-evaluate-mlflow

!nvidia-smi

In [2]:
import transformers
import torch
print(f'transformer version: {transformers.__version__}, torch version: {torch.__version__}')

  from .autonotebook import tqdm as notebook_tqdm


transformer version: 4.31.0, torch version: 2.0.1


In [3]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, AutoModelForSequenceClassification
from transformers import LlamaTokenizerFast, LlamaForCausalLM, LlamaTokenizer, LlamaForSequenceClassification
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from azureml.metrics import compute_metrics, constants
from datasets import Dataset
from datasets import load_dataset
from tqdm import tqdm


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [4]:
# Load original tokenizer
tokenizer_path = '../70b_hf_cnverted'
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)
tokenizer.pad_token_id = 0

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [5]:
# Load original model
model_path = '../70b_hf_cnverted'
model = LlamaForSequenceClassification.from_pretrained(model_path, device_map='auto', load_in_8bit=True, torch_dtype=torch.float16, num_labels=4)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 15/15 [33:53<00:00, 135.56s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../70b_hf_cnverted and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Dataset 20-News group

In [6]:
# data_dir = "text-dnn-data"  # Local directory to store data
# blobstore_datadir = data_dir  # Blob store directory to store data in
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
target_column_name = "label"
feature_column_name = "sentence"


def get_20newsgroups_data():
    """Fetches 20 Newsgroups data from scikit-learn
    Returns them in form of pandas dataframes
    """
    remove = ("headers", "footers", "quotes")
    categories = [
        "rec.sport.baseball",
        "rec.sport.hockey",
        "comp.graphics",
        "sci.space",
    ]

    data = fetch_20newsgroups(
        subset="train",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )
    data = pd.DataFrame(
        {feature_column_name: data.data, target_column_name: data.target}
    )

    data_train = data[:200]
    data_test = data[200:300]

    data_train = remove_blanks_20news(
        data_train, feature_column_name, target_column_name
    )
    data_test = remove_blanks_20news(data_test, feature_column_name, target_column_name)
    return Dataset.from_pandas(data_train), Dataset.from_pandas(data_test)


def remove_blanks_20news(data, feature_column_name, target_column_name):

    for index, row in data.iterrows():
        data.at[index, feature_column_name] = (
            row[feature_column_name].replace("\n", " ").strip()
        )

    data = data[data[feature_column_name] != ""]

    return data

In [7]:
data_train, data_test = get_20newsgroups_data()

In [8]:
data_train

Dataset({
    features: ['sentence', 'label', '__index_level_0__'],
    num_rows: 193
})

In [9]:
data_test

Dataset({
    features: ['sentence', 'label', '__index_level_0__'],
    num_rows: 98
})

In [10]:
def tokenize(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=256)
    return outputs

In [11]:
train_dataset = data_train.map(
    lambda samples: tokenize(samples), remove_columns=["__index_level_0__", "sentence"], load_from_cache_file=False)

validation_dataset = data_test.map(
    lambda samples: tokenize(samples), remove_columns=["__index_level_0__", "sentence"], load_from_cache_file=False)

Map: 100%|██████████| 193/193 [00:00<00:00, 865.48 examples/s]
Map: 100%|██████████| 98/98 [00:00<00:00, 858.78 examples/s]


In [12]:
len(train_dataset), len(validation_dataset)

(193, 98)

In [13]:
print(validation_dataset[0])

{'label': 3, 'input_ids': [1, 360, 29943, 29956, 471, 8688, 411, 278, 6850, 29903, 297, 3458, 313, 4716, 2289, 2099, 1407, 2217, 467, 29871, 18927, 310, 1009, 4688, 12089, 5518, 750, 20407, 411, 263, 528, 4774, 280, 25325, 322, 1023, 470, 2211, 4045, 20043, 701, 304, 29341, 29889, 29871, 306, 4140, 896, 892, 1811, 304, 22884, 920, 12862, 278, 4799, 637, 471, 29889, 29871, 1152, 27043, 4072, 29901, 29871, 1954, 22094, 278, 6216, 4989, 412, 29894, 457, 322, 6600, 1747, 723, 367, 2534, 565, 278, 528, 4774, 280, 399, 3289, 25325, 472, 360, 29943, 29956, 29889, 313, 2831, 278, 1791, 29892, 896, 526, 5279, 2534, 777, 3081, 10205, 793, 1546, 278, 4799, 637, 322, 18830, 14368, 467, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
# Evaluate model on test dataset
model.eval()

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 8192, padding_idx=0)
    (layers): ModuleList(
      (0-79): 80 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=8192, out_features=8192, bias=False)
          (k_proj): Linear8bitLt(in_features=8192, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=8192, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=8192, out_features=8192, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=8192, out_features=28620, bias=False)
          (up_proj): Linear8bitLt(in_features=8192, out_features=28620, bias=False)
          (down_proj): Linear8bitLt(in_features=28620, out_features=8192, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layerno

In [15]:
# Metrics Computation
device = "cuda"
l = len(validation_dataset)
batch_size = 1

predictions = []
references = []

for i in range(0, 4, batch_size):
    print('Processing: ', i)
    data_batch = validation_dataset[i:i + batch_size]
    # NOTE: Before passing data_batch['input_ids] to the model, cast them using torch.LongTensor()
    # Same for data_batch['attention_mask']. So that .to(device) call can work.
    #print(data_batch)
    with torch.no_grad():
        outputs = model(input_ids=torch.LongTensor(data_batch['input_ids']).to(device), 
                        attention_mask=torch.LongTensor(data_batch['attention_mask']).to(device))
    batch_predictions = outputs.logits.argmax(dim=-1)
    batch_predictions, batch_references = batch_predictions.detach().cpu().numpy().tolist(), data_batch["label"]
    predictions.extend(batch_predictions)
    references.extend(batch_references)

print(predictions)
print(references)

#Compute metrics
metrics = compute_metrics(task_type=constants.Tasks.CLASSIFICATION,
                          y_test=predictions,
                          y_pred=references)["metrics"]

print(metrics)

Processing:  0
Processing:  1
Processing:  2
Processing:  3


Metrics skipped due to missing y_pred_proba:
 ['AUC_weighted', 'accuracy_table', 'average_precision_score_weighted', 'norm_macro_recall', 'average_precision_score_micro', 'average_precision_score_macro', 'average_precision_score_binary', 'log_loss', 'AUC_macro', 'AUC_binary', 'AUC_micro']


[2, 3, 3, 3]
[3, 1, 1, 3]
{'f1_score_micro': 0.25, 'f1_score_macro': 0.13333333333333333, 'precision_score_micro': 0.25, 'recall_score_binary': nan, 'weighted_accuracy': 0.3, 'accuracy': 0.25, 'precision_score_binary': nan, 'recall_score_micro': 0.25, 'recall_score_weighted': 0.25, 'f1_score_binary': nan, 'precision_score_weighted': 0.375, 'balanced_accuracy': 0.1111111111111111, 'precision_score_macro': 0.16666666666666666, 'matthews_correlation': -0.2886751345948129, 'f1_score_weighted': 0.30000000000000004, 'recall_score_macro': 0.1111111111111111}


In [16]:
model.hf_device_map

{'model.embed_tokens': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 1,
 'model.layers.10': 1,
 'model.layers.11': 1,
 'model.layers.12': 1,
 'model.layers.13': 1,
 'model.layers.14': 1,
 'model.layers.15': 1,
 'model.layers.16': 1,
 'model.layers.17': 1,
 'model.layers.18': 1,
 'model.layers.19': 1,
 'model.layers.20': 2,
 'model.layers.21': 2,
 'model.layers.22': 2,
 'model.layers.23': 2,
 'model.layers.24': 2,
 'model.layers.25': 2,
 'model.layers.26': 2,
 'model.layers.27': 2,
 'model.layers.28': 2,
 'model.layers.29': 2,
 'model.layers.30': 2,
 'model.layers.31': 3,
 'model.layers.32': 3,
 'model.layers.33': 3,
 'model.layers.34': 3,
 'model.layers.35': 3,
 'model.layers.36': 3,
 'model.layers.37': 3,
 'model.layers.38': 3,
 'model.layers.39': 3,
 'model.layers.40': 3,
 'model.layers.41': 3,
 'model.layers.42'

In [17]:
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 8192, padding_idx=0)
    (layers): ModuleList(
      (0-79): 80 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=8192, out_features=8192, bias=False)
          (k_proj): Linear8bitLt(in_features=8192, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=8192, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=8192, out_features=8192, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=8192, out_features=28620, bias=False)
          (up_proj): Linear8bitLt(in_features=8192, out_features=28620, bias=False)
          (down_proj): Linear8bitLt(in_features=28620, out_features=8192, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layerno

In [18]:
model.train()

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 8192, padding_idx=0)
    (layers): ModuleList(
      (0-79): 80 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=8192, out_features=8192, bias=False)
          (k_proj): Linear8bitLt(in_features=8192, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=8192, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=8192, out_features=8192, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=8192, out_features=28620, bias=False)
          (up_proj): Linear8bitLt(in_features=8192, out_features=28620, bias=False)
          (down_proj): Linear8bitLt(in_features=28620, out_features=8192, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layerno

In [30]:
#peft_model = prepare_model_for_kbit_training(model)
model = prepare_model_for_int8_training(model)
peft_model = model

config = LoraConfig(
   r=4,
   lora_alpha=16,
   target_modules= [
       "q_proj",
       "v_proj",
   ],
   lora_dropout=.05,
   bias="none",
   task_type="SEQ_CLS", # use this to get the task type: https://github.com/huggingface/peft/blob/96c0277a1b9a381b10ab34dbf84917f9b3b992e6/src/peft/utils/config.py#L38
)
#config.save_pretrained(OUTPUT_DIR)

peft_model = get_peft_model(peft_model, config)
peft_model.print_trainable_parameters()

trainable params: 8,192,000 || all params: 68,722,761,728 || trainable%: 0.011920359127043492




In [31]:
print(peft_model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 8192, padding_idx=0)
        (layers): ModuleList(
          (0-79): 80 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=8192, out_features=8192, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=8192, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=8192, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=8192, out_features=10

In [32]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

In [33]:
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    warmup_steps=0,
    num_train_epochs=1,
#     max_steps=16,
    learning_rate=3e-4,
    fp16=False,
    #bf16=True, #False, # Setting mixed precision to true also works without quantization
    #optim="adamw_torch_fused",
    evaluation_strategy="steps",
    save_strategy="no",
    output_dir='.',
    ddp_find_unused_parameters=None,
    remove_unused_columns=False,
    logging_steps=8)

trainer = Trainer(
                  model=peft_model,
                  train_dataset=train_dataset,
                  eval_dataset=validation_dataset,
                  args=training_args,
                 )
#peft_model.config.use_cache = False

In [34]:
trainer.train()



Step,Training Loss,Validation Loss
8,0.9036,1.400316
16,1.0066,1.345799
24,1.2785,1.329462


TrainOutput(global_step=25, training_loss=1.044743628501892, metrics={'train_runtime': 607.9105, 'train_samples_per_second': 0.317, 'train_steps_per_second': 0.041, 'total_flos': 2.0295013204230144e+16, 'train_loss': 1.044743628501892, 'epoch': 1.0})

In [35]:
predictions = trainer.predict(validation_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

(98, 4) (98,)


In [36]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

In [37]:
preds

array([3, 3, 3, 3, 3, 2, 0, 3, 0, 3, 3, 2, 2, 0, 2, 2, 2, 0, 3, 1, 2, 2,
       1, 3, 2, 3, 3, 2, 0, 3, 2, 1, 1, 1, 0, 3, 1, 0, 2, 2, 0, 2, 2, 2,
       3, 1, 1, 3, 2, 0, 0, 0, 0, 2, 0, 1, 3, 1, 0, 0, 1, 3, 2, 2, 2, 2,
       3, 2, 2, 3, 3, 2, 3, 1, 3, 0, 3, 0, 3, 1, 1, 3, 2, 3, 3, 0, 3, 1,
       1, 1, 0, 2, 3, 2, 0, 3, 0, 3])

In [38]:
# import evaluate
# metric = evaluate.load("glue", "mrpc")
# metric.compute(predictions=preds, references=predictions.label_ids)
from sklearn.metrics import accuracy_score
accuracy_score(predictions.label_ids, preds)

0.5306122448979592

In [39]:
# Evaluate model on test dataset
peft_model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 8192, padding_idx=0)
        (layers): ModuleList(
          (0-79): 80 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=8192, out_features=8192, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=8192, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=8192, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=8192, out_features=10

In [40]:
# Metrics Computation
device = "cuda"
l = len(validation_dataset)
batch_size = 1

predictions = []
references = []

for i in range(0, 4, batch_size):
    print('Processing: ', i)
    data_batch = validation_dataset[i:i + batch_size]
    # NOTE: Before passing data_batch['input_ids] to the model, cast them using torch.LongTensor()
    # Same for data_batch['attention_mask']. So that .to(device) call can work.
    #print(data_batch)
    with torch.no_grad():
        outputs = peft_model(input_ids=torch.LongTensor(data_batch['input_ids']).to(device), 
                             attention_mask=torch.LongTensor(data_batch['attention_mask']).to(device))
    batch_predictions = outputs.logits.argmax(dim=-1)
    batch_predictions, batch_references = batch_predictions.detach().cpu().numpy().tolist(), data_batch["label"]
    predictions.extend(batch_predictions)
    references.extend(batch_references)

print(predictions)
print(references)

#Compute metrics
metrics = compute_metrics(task_type=constants.Tasks.CLASSIFICATION,
                          y_test=predictions,
                          y_pred=references)["metrics"]

print(metrics)

Processing:  0




Processing:  1
Processing:  2
Processing:  3


Metrics skipped due to missing y_pred_proba:
 ['AUC_weighted', 'accuracy_table', 'average_precision_score_weighted', 'norm_macro_recall', 'average_precision_score_micro', 'average_precision_score_macro', 'average_precision_score_binary', 'log_loss', 'AUC_macro', 'AUC_binary', 'AUC_micro']


[3, 3, 3, 3]
[3, 1, 1, 3]
{'f1_score_micro': 0.5, 'f1_score_macro': 0.3333333333333333, 'precision_score_micro': 0.5, 'recall_score_binary': 0.5, 'weighted_accuracy': 0.5, 'accuracy': 0.5, 'precision_score_binary': 1.0, 'recall_score_micro': 0.5, 'recall_score_weighted': 0.5, 'f1_score_binary': 0.6666666666666666, 'precision_score_weighted': 1.0, 'balanced_accuracy': 0.25, 'precision_score_macro': 0.5, 'matthews_correlation': 0.0, 'f1_score_weighted': 0.6666666666666666, 'recall_score_macro': 0.25}
