In [3]:
# !pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# !pip install -q datasets bitsandbytes einops
# !pip install openpyxl
# !pip install htmllaundry
# !pip install lxml_html_clean
# !pip install bs4
# ! pip install matplotlib
# !pip install tensorflow
# !pip install --upgrade transformers
# !pip install --upgrade datasets
# !pip install pip install tf-keras
# !pip install sentencepiece

In [4]:
from transformers import file_utils
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig
import tensorflow as tf
import pprint
from datasets import Dataset  
print(file_utils.default_cache_path)

/root/.cache/huggingface/hub


In [None]:
df = pd.read_csv("./train.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,question,answer,text
0,0,What significant challenges does the rapid exp...,"Balancing scalability and security, computatio...",###Human:\nAnswer this question in the context...
1,1,How does the proposed framework address the in...,By employing edge aggregating servers and Ethe...,###Human:\nAnswer this question in the context...
2,2,What are the primary benefits of using blockch...,"Data integrity, device authentication, and pro...",###Human:\nAnswer this question in the context...
3,3,Why are traditional blockchain-based solutions...,"Due to scalability, cost issues, and computati...",###Human:\nAnswer this question in the context...
4,4,How does the proposed framework ensure data pr...,Through the use of Zero-Knowledge Proofs (ZKPs...,###Human:\nAnswer this question in the context...


In [6]:
train_df = pd.DataFrame(df.loc[:,"text"])
train_df

Unnamed: 0,text
0,###Human:\nAnswer this question in the context...
1,###Human:\nAnswer this question in the context...
2,###Human:\nAnswer this question in the context...
3,###Human:\nAnswer this question in the context...
4,###Human:\nAnswer this question in the context...
...,...
799,###Human:\nAnswer this question in the context...
800,###Human:\nAnswer this question in the context...
801,###Human:\nAnswer this question in the context...
802,###Human:\nAnswer this question in the context...


In [7]:
model_name = "microsoft/Phi-3-mini-4k-instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=False
)

model.config.use_cache = False

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (norm): Phi3RMSNorm((3072,), eps=1e-05)
    (rotary_emb): Phi3RotaryEmbedding()
  )
  (lm_head): Linear(in_features=3072, out_

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
tokenizer.pad_token = tokenizer.eos_token

In [10]:
text = "what are blockchain's impact on accounting and auditing practices ?"
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs,  max_new_tokens=5000)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

what are blockchain's impact on accounting and auditing practices ?

### Answer:
Blockchain technology has the potential to significantly impact accounting and auditing practices in several ways:

1. **Transparency and Traceability**: Blockchain's inherent characteristics of being a distributed ledger that is immutable and transparent can enhance the transparency and traceability of financial transactions. Every transaction on a blockchain is recorded in a way that it cannot be altered or deleted, which can help in ensuring the integrity of financial records.

2. **Real-time Auditing**: With blockchain, auditors can have access to real-time financial data, which can streamline the auditing process. Instead of relying on periodic financial statements, auditors can verify transactions as they occur, reducing the time and resources required for the audit.

3. **Smart Contracts**: Blockchain enables the use of smart contracts, which are self-executing contracts with the terms of the agreem

In [11]:
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    target_modules=["qkv_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)

In [12]:
output_dir = "./results"
per_device_train_batch_size = 2
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 10
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    num_train_epochs=1,
    lr_scheduler_type=lr_scheduler_type,
)

In [13]:
dataset = Dataset.from_pandas(train_df)
dataset

Dataset({
    features: ['text'],
    num_rows: 804
})

In [14]:
for x in dataset.take(1):
  pprint.pprint(x)

{'text': '###Human:\n'
         'Answer this question in the context of cryptocurrency, stock markets '
         'and blockchain.\n'
         'What significant challenges does the rapid expansion of IoT '
         'introduce in data authentication?\n'
         '\n'
         '###Assistant:\n'
         'Balancing scalability and security, computational and storage '
         'bottlenecks.'}


In [15]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args = SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        warmup_steps = 5,
        num_train_epochs = 2,
        learning_rate = 2e-4,
        fp16 = True,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "model_traning_outputs",
        report_to = "none",
        max_seq_length = 512,
        dataset_num_proc = 4,
        packing = False, 
    ),
)


  trainer = SFTTrainer(
  self.pid = os.fork()


Converting train dataset to ChatML (num_proc=4):   0%|          | 0/804 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=4):   0%|          | 0/804 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=4):   0%|          | 0/804 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=4):   0%|          | 0/804 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [17]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=402, training_loss=0.9470151075676306, metrics={'train_runtime': 461.7713, 'train_samples_per_second': 3.482, 'train_steps_per_second': 0.871, 'total_flos': 2703143364304896.0, 'train_loss': 0.9470151075676306})

In [18]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained("outputs")

In [19]:
lora_config = LoraConfig.from_pretrained('outputs')
model = get_peft_model(model, lora_config)
model



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
              (qkv_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=9216, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=9216, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict(

In [20]:
text = "what are blockchain's impact on accounting and auditing practices ?"
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=5000)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

what are blockchain's impact on accounting and auditing practices ?

### Answer:
Blockchain technology has the potential to significantly impact accounting and auditing practices in several ways:

1. **Transparency and Traceability**: Blockchain's inherent characteristics of being a distributed ledger that is immutable and transparent can enhance the transparency and traceability of financial transactions. Every transaction on a blockchain is recorded in a way that it cannot be altered or deleted, which can help in ensuring the integrity of financial records.

2. **Real-time Auditing**: With blockchain, auditors can have access to real-time financial data, which can streamline the auditing process. Instead of relying on periodic financial statements, auditors can verify transactions as they occur, reducing the time and resources required for the audit.

3. **Smart Contracts**: Blockchain enables the use of smart contracts, which are self-executing contracts with the terms of the agreem