### Installing and Importing all the necessary packages

*Hey, we are Siddhant & Pavana, and we are participating in this hackathon :3. To begin our OCR-to-JSON journey, we need to install and import the necessary libraries. This setup ensures we have all the tools required for data manipulation, visualization, and interaction with the Phi-3-Mini model.*

In [None]:
!pip install -U -q transformers peft==0.4.0 accelerate bitsandbytes trl

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.[0m[31m
[0m

In [None]:
import json
import os
from random import randrange

import torch
import torch.nn as nn
from accelerate import PartialState
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from rich import (inspect, print, pretty)
from rich.console import Console
from rich.syntax import Syntax
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from trl import SFTConfig, SFTTrainer
from trl.trainer import ConstantLengthDataset
# Install rich's pretty-printer so that bare cell expressions are rendered through rich.
pretty.install()
# Turn on PyTorch's cuDNN benchmark flag for the rest of the session.
torch.backends.cudnn.benchmark = True

2024-07-09 15:22:39.971291: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-09 15:22:39.971392: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-09 15:22:40.144294: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Loading and Preparing Data

*We load and prepare the dataset for fine-tuning our model. This involves splitting the dataset into training and evaluation subsets, ensuring a representative distribution for effective model training and validation.*

In [None]:
# Name under which the fine-tuned weights are saved later in the notebook.
new_model = "elucidator8918/phi_miniocr_to_json"

# Load every split of the invoices/receipts OCR dataset.
dataset = load_dataset(path="mychen76/invoices-and-receipts_ocr_v1")

# Trailing expression: display the DatasetDict summary as the cell output.
dataset

Downloading readme:   0%|          | 0.00/782 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/125 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/70 [00:00<?, ? examples/s]


[1;35mDatasetDict[0m[1m([0m[1m{[0m
    train: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'image'[0m, [32m'id'[0m, [32m'parsed_data'[0m, [32m'raw_data'[0m[1m][0m,
        num_rows: [1;36m2043[0m
    [1m}[0m[1m)[0m
    test: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'image'[0m, [32m'id'[0m, [32m'parsed_data'[0m, [32m'raw_data'[0m[1m][0m,
        num_rows: [1;36m125[0m
    [1m}[0m[1m)[0m
    valid: [1;35mDataset[0m[1m([0m[1m{[0m
        features: [1m[[0m[32m'image'[0m, [32m'id'[0m, [32m'parsed_data'[0m, [32m'raw_data'[0m[1m][0m,
        num_rows: [1;36m70[0m
    [1m}[0m[1m)[0m
[1m}[0m[1m)[0m

In [None]:
# Pick one random training example and unpack its OCR fields for inspection.
train_split = dataset['train']
print(f"dataset size: {len(train_split)}")

# Index over the train split itself: `len(dataset)` is the number of splits in the
# DatasetDict (3 here), which restricted the "random" sample to the first three rows.
sample = train_split[randrange(len(train_split))]

# NOTE(review): `eval` executes whatever text the dataset fields contain. It is kept to
# preserve the existing parsing behaviour; presumably the fields are Python-literal
# strings, in which case `ast.literal_eval` would be the safer choice — confirm.
raw_data = eval(sample['raw_data'])  # parsed once and reused for both OCR fields
ocr_words = raw_data['ocr_words']
ocr_boxes = raw_data['ocr_boxes']
ocr_json = eval(sample['parsed_data'])['json']

### Alpaca Prompt Template Definition

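*An Alpaca-style prompt template (`### Instruction:` / `### Input:`) is defined by `format_train_instruction` to structure the input for Phi model fine-tuning. It sets the instructional context and pairs each receipt's OCR boxes with the JSON output the model should learn to generate.*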

In [None]:
def format_train_instruction(sample):
    """Render one dataset record as a single training prompt string.

    Args:
        sample: Mapping with the string fields ``'raw_data'`` and ``'parsed_data'``.
            Each string is evaluated into a dict; ``'ocr_boxes'`` is read from the
            first and ``'json'`` from the second.

    Returns:
        The prompt text: a user turn holding the instruction and the OCR boxes,
        followed by a turn holding the target JSON, each closed by ``<|end|>``.
    """
    # The strings are evaluated with `eval`, in the same order as they appear in the prompt.
    ocr_boxes = eval(sample['raw_data'])['ocr_boxes']
    receipt_json = eval(sample['parsed_data'])['json']

    prompt_lines = (
        " <|user|>",
        "### Instruction:",
        "You are an expert in POS receipts and a receipt data engineer with many years of experience working with complex receipt structures.",
        "Your task is to parse, detect, recognize, and convert the following receipt OCR image result into a structured receipt format.",
        "The output must be a well-formed JSON object.",
        "",
        "### Input:",
        f"{ocr_boxes}",
        "<|end|>",
        "",
        "<|system|>",
        f"{receipt_json}",
        "<|end|>",
        "",  # keeps the trailing newline after the final <|end|>
    )
    return "\n".join(prompt_lines)

In [None]:
# Render one random training example through the prompt template.
# `len(dataset)` counts the splits of the DatasetDict, not its rows, so the random index
# is drawn over the train split instead. `randrange` comes from the import cell at the top.
train_split = dataset["train"]
print(format_train_instruction(train_split[randrange(len(train_split))]))

In [None]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    At most ``nb_examples`` examples are read from ``dataset``; each one is rendered
    with ``format_train_instruction`` and tokenized with ``tokenizer``.

    Returns:
        Total character count divided by total token count over the examples read.
    """
    def count_tokens(text):
        # Fast tokenizers expose the tokens on the encoding; others tokenize directly.
        if tokenizer.is_fast:
            return len(tokenizer(text).tokens())
        return len(tokenizer.tokenize(text))

    char_count = 0
    token_count = 0
    # zip with range() stops after nb_examples items or when the dataset is exhausted.
    limited_examples = zip(range(nb_examples), iter(dataset))
    for _, example in tqdm(limited_examples, total=nb_examples):
        prompt = format_train_instruction(example)
        char_count += len(prompt)
        token_count += count_tokens(prompt)

    return char_count / token_count

In [None]:
# Dataset identifier handed to create_datasets.
dataset_id="mychen76/invoices-and-receipts_ocr_v1"
# NOTE(review): the create_datasets call later in this notebook does not pass data_dir,
# so this value appears to be unused — confirm before removing.
data_dir="data/finetune"

def create_datasets(
    tokenizer,
    dataset_id,
    data_dir=None,
    seq_length=2048,
    num_workers=6,
    streaming=False,
    size_valid_set=10,
    shuffle_buffer=1000,
):
    """Build the constant-length training and validation datasets.

    Loads the ``train`` split of ``dataset_id``, carves a validation subset out of it,
    estimates the characters-per-token ratio on the training part and wraps both parts
    in ``ConstantLengthDataset`` using ``format_train_instruction`` as formatter.

    Args:
        tokenizer: Tokenizer used for the ratio estimate and for packing.
        dataset_id: Dataset name/path passed to ``load_dataset``.
        data_dir: Forwarded to ``load_dataset``.
        seq_length: Sequence length of the packed examples.
        num_workers: ``num_proc`` for ``load_dataset`` (ignored when streaming).
        streaming: Load the dataset in streaming mode.
        size_valid_set: Number of leading examples used for validation (streaming only).
        shuffle_buffer: Shuffle buffer size for the training stream (streaming only).

    Returns:
        ``(train_dataset, valid_dataset)``; the training dataset is infinite, the
        validation dataset is not.
    """
    raw_train = load_dataset(
        dataset_id,
        data_dir=data_dir,
        split="train",
        num_proc=None if streaming else num_workers,
        streaming=streaming,
    )

    if not streaming:
        # Non-streaming: hold out a fixed 0.3% fraction for validation.
        splits = raw_train.train_test_split(test_size=0.003, seed=None)
        train_data, valid_data = splits["train"], splits["test"]
        print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
    else:
        # Streaming: the first examples become validation data, the rest is shuffled.
        print("Loading the dataset in streaming mode")
        valid_data = raw_train.take(size_valid_set)
        train_data = raw_train.skip(size_valid_set).shuffle(buffer_size=shuffle_buffer, seed=None)

    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    def pack(split, infinite):
        # Both datasets share every setting except whether they restart when exhausted.
        return ConstantLengthDataset(
            tokenizer,
            split,
            formatting_func=format_train_instruction,
            infinite=infinite,
            seq_length=seq_length,
            chars_per_token=chars_per_token,
        )

    return pack(train_data, infinite=True), pack(valid_data, infinite=False)

### Configuring Parameters



In [None]:
################################################################################
# bitsandbytes parameters
# (consumed when the BitsAndBytesConfig object is built later in the notebook)
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
# (given as an attribute name of `torch`; resolved later with getattr(torch, ...))
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

In [None]:
# Build the quantization settings (QLoRA) that the base model is loaded with later.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# When float16 compute was selected, hint that bf16 is available if the GPU's major
# compute capability is 8 or higher.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        banner = "=" * 80
        print(banner)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print(banner)

In [None]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
# Only one of the two is enabled here: fp16.
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 2

# Batch size per GPU for evaluation
per_device_eval_batch_size = 2

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
# -1 leaves the epoch count above in charge.
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably (disabled in this run)
group_by_length = False

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

In [None]:
################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
# (also passed as seq_length to create_datasets when the datasets are built)
max_seq_length = 2048

# Pack multiple short examples in the same input sequence to increase efficiency
packing = True

In [None]:
# Load base model
model_id = "microsoft/Phi-3-mini-4k-instruct"
# Index of the current process (from accelerate's PartialState); despite the name it is
# used below as the device that the whole model is mapped to.
device_string = PartialState().process_index

# The base model is loaded with the quantization settings from bnb_config.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={'':device_string},
)

# Load the tokenizer that belongs to the base model (Phi-3, not LLaMA)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Object exposing ``named_parameters()``; each parameter must provide
            ``numel()`` and a ``requires_grad`` flag.

    Prints the trainable count, the total count and the trainable percentage on a
    single line. Raises ``ZeroDivisionError`` when the model has no parameters.
    """
    # Collect (size, trainable?) once, then aggregate.
    counts = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    total = sum(size for size, _ in counts)
    trainable = sum(size for size, needs_grad in counts if needs_grad)
    share = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {share}")

# Parameter counts of the freshly loaded base model, before the LoRA adapters are attached.
print_trainable_parameters(model)

In [None]:
# Switch the model to evaluation mode; the returned module is displayed as the cell output.
model.eval()


[1;35mPhi3ForCausalLM[0m[1m([0m
  [1m([0mmodel[1m)[0m: [1;35mPhi3Model[0m[1m([0m
    [1m([0membed_tokens[1m)[0m: [1;35mEmbedding[0m[1m([0m[1;36m32064[0m, [1;36m3072[0m, [33mpadding_idx[0m=[1;36m32000[0m[1m)[0m
    [1m([0membed_dropout[1m)[0m: [1;35mDropout[0m[1m([0m[33mp[0m=[1;36m0[0m[1;36m.0[0m, [33minplace[0m=[3;91mFalse[0m[1m)[0m
    [1m([0mlayers[1m)[0m: [1;35mModuleList[0m[1m([0m
      [1m([0m[1;36m0[0m-[1;36m31[0m[1m)[0m: [1;36m32[0m x [1;35mPhi3DecoderLayer[0m[1m([0m
        [1m([0mself_attn[1m)[0m: [1;35mPhi3Attention[0m[1m([0m
          [1m([0mo_proj[1m)[0m: [1;35mLinear4bit[0m[1m([0m[33min_features[0m=[1;36m3072[0m, [33mout_features[0m=[1;36m3072[0m, [33mbias[0m=[3;91mFalse[0m[1m)[0m
          [1m([0mqkv_proj[1m)[0m: [1;35mLinear4bit[0m[1m([0m[33min_features[0m=[1;36m3072[0m, [33mout_features[0m=[1;36m9216[0m, [33mbias[0m=[3;91mFalse[0m[1m)[0m
         

In [None]:
# Report the loaded model's memory footprint (converted to GB) and its maximum
# number of position embeddings.
footprint_gb = model.get_memory_footprint() / 1024 / 1024 / 1024
print(footprint_gb, "GB")
context_length = model.config.max_position_embeddings
print(context_length)

# Trailing expression: display the device placement of the model.
model.hf_device_map

[1m{[0m[32m''[0m: [1;36m0[0m[1m}[0m

In [None]:
# Load LoRA configuration
peft_config = LoraConfig(
    r=32,
    lora_alpha=8,
    # Phi-3 fuses the attention input projections into a single `qkv_proj` layer (see the
    # module tree displayed by the model.eval() cell); it has no `q_proj`/`v_proj` modules.
    # The previous ["q_proj", "v_proj", "lm_head"] list only reached the attention layers
    # because the pinned peft 0.4.0 matches target names by string suffix and "qkv_proj"
    # happens to end with "v_proj". Naming the layer explicitly selects the same modules
    # (the logged 13,707,264 trainable parameters = 32 x qkv_proj adapters + lm_head) and
    # no longer depends on that matching quirk.
    target_modules=[
        "qkv_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

# Prepare the quantized model for training, then wrap it with the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
# The generation cache is not needed while training.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.print_trainable_parameters()

trainable params: 13,707,264 || all params: 2,022,847,488 || trainable%: 0.6776222172612946


In [None]:
# Build the training and evaluation datasets; data_dir is not passed, so create_datasets
# uses its default (None).
train_dataset, eval_dataset = create_datasets(tokenizer, dataset_id, seq_length=max_seq_length)

  0%|          | 0/400 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (10082 > 4096). Running this sequence through the model will result in indexing errors
100%|██████████| 400/400 [00:06<00:00, 63.42it/s]


In [None]:
# Collect every hyperparameter from the configuration cells into the SFT config.
sft_config = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Previously declared in the configuration cell but never forwarded, so the
    # library default was silently used instead of the configured value.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Also declared but never forwarded. NOTE(review): prepare_model_for_kbit_training
    # presumably already enabled checkpointing on the model, so this should only make the
    # trainer's view consistent with the configuration — confirm on the pinned versions.
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    packing=packing,  # Set packing here
    max_seq_length=max_seq_length  # Set max_seq_length here
)

# The trainer receives the LoRA-wrapped model together with the pre-built datasets.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    args=sft_config  # Pass the SFTConfig object
)

### QLoRA-Based SFT Training

In [None]:
# Run the fine-tuning; the returned TrainOutput is displayed as the cell output.
trainer.train()

You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
25,0.9487
50,0.7971
75,0.7522
100,0.7186
125,0.6989
150,0.6804
175,0.671
200,0.6663
225,0.6534
250,0.6295



[1;35mTrainOutput[0m[1m([0m
    [33mglobal_step[0m=[1;36m509[0m,
    [33mtraining_loss[0m=[1;36m0[0m[1;36m.6860241861849262[0m,
    [33mmetrics[0m=[1m{[0m
        [32m'train_runtime'[0m: [1;36m34766.3633[0m,
        [32m'train_samples_per_second'[0m: [1;36m0.059[0m,
        [32m'train_steps_per_second'[0m: [1;36m0.015[0m,
        [32m'total_flos'[0m: [1;36m9.347578330506854e+16[0m,
        [32m'train_loss'[0m: [1;36m0.6860241861849262[0m,
        [32m'epoch'[0m: [1;36m1.0[0m
    [1m}[0m
[1m)[0m

In [None]:
import gc

# Persist the trained model held by the trainer under the `new_model` path.
trainer.model.save_pretrained(new_model)

# Drop the training objects, then collect garbage and release cached GPU memory.
del model, trainer
gc.collect()
torch.cuda.empty_cache()

### Merging and Pushing to Hugging Face

In [None]:
# Reload model in FP16 and merge it with LoRA weights
model_id = "microsoft/Phi-3-mini-4k-instruct"
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Load the adapter from the location it was saved to (`new_model`, relative to the working
# directory) instead of a hard-coded Kaggle path that omitted the "elucidator8918/"
# directory. `PeftModel` is provided by the import cell at the top of the notebook.
model = PeftModel.from_pretrained(base_model, new_model)
# Fold the adapter weights into the base weights and drop the PEFT wrapper.
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from huggingface_hub import login

# Never store an access token in the notebook: read it from the HF_TOKEN environment
# variable instead. When the variable is unset, token=None lets login() prompt for it.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Upload the merged model, then the tokenizer, to the "phi_miniocr_to_json" repository.
# The tokenizer upload is the last expression of the cell, so its result is displayed.
model.push_to_hub("phi_miniocr_to_json", use_temp_dir=False)
tokenizer.push_to_hub("phi_miniocr_to_json", use_temp_dir=False)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]


[1;35mCommitInfo[0m[1m([0m
    [33mcommit_url[0m=[32m'https://huggingface.co/elucidator8918/phi_miniocr_to_json/commit/d3603b85beb184b0448c5736f54dc55cf9c53cef'[0m,
    [33mcommit_message[0m=[32m'Upload tokenizer'[0m,
    [33mcommit_description[0m=[32m''[0m,
    [33moid[0m=[32m'd3603b85beb184b0448c5736f54dc55cf9c53cef'[0m,
    [33mpr_url[0m=[3;35mNone[0m,
    [33mpr_revision[0m=[3;35mNone[0m,
    [33mpr_num[0m=[3;35mNone[0m
[1m)[0m