In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"


# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.1.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
FinQA_prompt = """Below contains texts before table (pre-text), text after the table (post-text) and the table itself with a question that you must answer.

### Pre-text:
{}

### Table:
{}

### Post-text:
{}

### Question:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    
    pre_texts      = examples["pre_text"]
    tables         = examples["table"]
    post_texts     = examples["post_text"]
    programs = examples["expanded_program_re"]
    questions      = examples["question"]
    answers        = examples["final_result"]

    
    texts = []
    for pre_text, table, post_text, program, question, answer in zip(pre_texts, tables, post_texts, programs, questions, answers):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = FinQA_prompt.format(pre_text, table, post_text, question, program + ' = ' + answer) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
train = load_dataset("n3Er/FinQA-Infix", split = "train")
test = load_dataset("n3Er/FinQA-Infix", split = "test")
validation = load_dataset("n3Er/FinQA-Infix", split = "validation")
train = train.map(formatting_prompts_func, batched = True)
#test = test.map(formatting_prompts_func, batched = True)
#validation = validation.map(formatting_prompts_func, batched = True)

README.md:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/801k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6251 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/883 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1147 [00:00<?, ? examples/s]

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

In [None]:
train['final_result']

In [None]:
train['text'][194]

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 3,
        #remove_unused_columns=False,
        gradient_accumulation_steps = 6,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/6251 [00:00<?, ? examples/s]

In [6]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.418 GB of memory reserved.


In [7]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,251 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 3 | Gradient Accumulation steps = 6
\        /    Total batch size = 18 | Total steps = 60
 "-____-"     Number of trainable parameters = 20,971,520


Step,Training Loss
1,1.6857
2,1.7426
3,1.8265
4,1.6897
5,1.6886
6,1.6787
7,1.5487
8,1.5621
9,1.5957
10,1.4571


In [8]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

4196.7241 seconds used for training.
69.95 minutes used for training.
Peak reserved memory = 8.123 GB.
Peak reserved memory for training = 2.705 GB.
Peak reserved memory % of max memory = 55.105 %.
Peak reserved memory for training % of max memory = 18.35 %.


In [9]:
# pre_text, table, post_text, program, question, answer

# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    FinQA_prompt.format(
        "24 2017 annual report performance graph the following chart presents a comparison for the five-year period ended june 30 , 2017 , of the market performance of the company 2019s common stock with the s&p 500 index and an index of peer companies selected by the company : comparison of 5 year cumulative total return among jack henry & associates , inc. , the s&p 500 index , and a peer group the following information depicts a line graph with the following values: .", # Pre-text
        """	2012	2013	2014	2015	2016	2017
jkhy	100.00	138.34	177.10	195.72	267.64	322.60
peer group	100.00	117.87	161.90	203.87	233.39	271.10
s&p 500	100.00	120.60	150.27	161.43	167.87	197.92""",
        """this comparison assumes $ 100 was invested on june 30 , 2012 , and assumes reinvestments of dividends . total returns are calculated according to market capitalization of peer group members at the beginning of each period . peer companies selected are in the business of providing specialized computer software , hardware and related services to financial institutions and other businesses . companies in the peer group are aci worldwide , inc. ; bottomline technology , inc. ; broadridge financial solutions ; cardtronics , inc. ; convergys corp. ; corelogic , inc. ; dst systems , inc. ; euronet worldwide , inc. ; fair isaac corp. ; fidelity national information services , inc. ; fiserv , inc. ; global payments , inc. ; moneygram international , inc. ; ss&c technologies holdings , inc. ; total systems services , inc. ; tyler technologies , inc. ; verifone systems , inc. ; and wex , inc.. .""", #post-text
        "jkhy's total 5 year return was what percent of the peer group?",
        "", #responce
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

["<|begin_of_text|>Below contains texts before table (pre-text), text after the table (post-text) and the table itself with a question that you must answer.\n\n### Pre-text:\n24 2017 annual report performance graph the following chart presents a comparison for the five-year period ended june 30, 2017, of the market performance of the company 2019s common stock with the s&p 500 index and an index of peer companies selected by the company : comparison of 5 year cumulative total return among jack henry & associates, inc., the s&p 500 index, and a peer group the following information depicts a line graph with the following values:.\n\n### Table:\n\t2012\t2013\t2014\t2015\t2016\t2017\njkhy\t100.00\t138.34\t177.10\t195.72\t267.64\t322.60\npeer group\t100.00\t117.87\t161.90\t203.87\t233.39\t271.10\ns&p 500\t100.00\t120.60\t150.27\t161.43\t167.87\t197.92\n\n### Post-text:\nthis comparison assumes $ 100 was invested on june 30, 2012, and assumes reinvestments of dividends. total returns are cal

In [None]:
test['text'][0]

In [11]:
test = load_dataset("n3Er/FinQA-Infix", split = "test")

def formatting_prompts_test(examples):
    
    pre_texts      = examples["pre_text"]
    tables         = examples["table"]
    post_texts     = examples["post_text"]
    programs       = examples["expanded_program_re"]
    questions      = examples["question"]
    answers        = examples["final_result"]

    
    texts = []
    responses = []
    for pre_text, table, post_text, program, question, answer in zip(pre_texts, tables, post_texts, programs, questions, answers):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = FinQA_prompt.format(pre_text, table, post_text, question, "")
        True_response = program + ' = ' + answer
        texts.append(text)
        responses.append(True_response)
    return { "text" : texts, "true_responses" : responses,}

test = test.map(formatting_prompts_test, batched = True)

Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

In [None]:
print(test["text"][0])
#print(test["true_responses"][0])

In [12]:
FastLanguageModel.for_inference(model)

test_input = tokenizer(test["text"][1], return_tensors = "pt").to("cuda")
outputs = model.generate(**test_input, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

["<|begin_of_text|>Below contains texts before table (pre-text), text after the table (post-text) and the table itself with a question that you must answer.\n\n### Pre-text:\n['item 1b.', 'unresolved staff comments not applicable.', 'item 2.', 'properties as of december 26, 2015, our major facilities consisted of : ( square feet in millions ) united states countries total owned facilities1.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '30.7 17.2 47.9 leased facilities2.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '

In [13]:
import re

test_output = []



def extract_response(text):
    pattern = r"Response:\n(.*?)<\|eot_id\|>"
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return None


for i in range(len(test["text"])):
    FastLanguageModel.for_inference(model)
    test_input = tokenizer(test["text"][i], return_tensors = "pt").to("cuda")
    outputs = model.generate(**test_input, max_new_tokens = 64, use_cache = True)
    decoded_output = tokenizer.batch_decode(outputs)
    test_output.append(extract_response(decoded_output[0]))

In [16]:
test_output[6]

'463 / 4612 = 10.0%'

In [17]:
true_test_output = test["true_responses"]

In [18]:
true_test_output[6]

'463 / 4612 = 10%'

In [14]:
import pickle
with open("test_output.pkl", 'wb') as file:
    pickle.dump(test_output, file)

In [19]:
import pickle
with open("true_test_output.pkl", 'wb') as file:
    pickle.dump(true_test_output, file)

In [20]:
!pip install rouge-score


  pid, fd = os.forkpty()


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e0306b6f3724149647ff0c0953c0bcfc63717645e647d839cd59dcf34e69a555
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [21]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

scores = []


scores = []
attribute_error_count = 0

try:
    for ref, hyp in zip(true_test_output, test_output):
        try:
            score = scorer.score(ref, hyp)
            scores.append(score)
        except AttributeError as e:
            print(f"An AttributeError occurred: {e}")
            attribute_error_count += 1
except Exception as e:
    print(f"An unexpected error occurred: {e}")

# Print the total number of AttributeError exceptions
print(f"Total number of AttributeError exceptions: {attribute_error_count}")

    

An AttributeError occurred: 'NoneType' object has no attribute 'lower'
An AttributeError occurred: 'NoneType' object has no attribute 'lower'
Total number of AttributeError exceptions: 2


In [30]:
suum1 = 0
suum2 = 0
suum3 = 0

for i in range(len(scores)):
    suum1 += float(scores[i]['rougeL'][0])
    suum2 += float(scores[i]['rougeL'][1])
    suum3 += float(scores[i]['rougeL'][2])

precision = suum1 / len(scores)
recall = suum2 / len(scores)
fmeasure = suum3 / len(scores)
print("The Precision (Rouge-L): {0:.2f}".format(precision))
print("The Recall (Rouge-L):    {0:.2f}".format(recall))
print("The F-Measure (Rouge-L): {0:.2f}".format(fmeasure))


The Precision (Rouge-L): 0.64
The Recall (Rouge-L):    0.63
The F-Measure (Rouge-L): 0.62


In [23]:
indexes = []
for i in range(len(scores)):
    f_val = float(scores[i]['rougeL'][2])
    if f_val < 0.4:
        indexes.append(i)

print(len(indexes))

295


In [None]:
rougeL_fmeasure = []

for i in range(len(scores)):
    f_val = float(scores[i]['rougeL'][2])
    rougeL_fmeasure.append(f_val)

print(rougeL_fmeasure.index(min(rougeL_fmeasure)))

In [None]:
import pickle

with open("/kaggle/input/the-finetuned-llama-output/test_output.pkl", 'rb') as file:
    test_output = pickle.load(file)
with open("/kaggle/input/the-finetuned-llama-output/true_test_output.pkl", 'rb') as file:
    true_test_output = pickle.load(file)


In [None]:
type(test_output)

In [24]:
n = 675
print("The F-measure score for this responce is: ", scores[n]['rougeL'])
print("The true responce is:                     ", true_test_output[n])
print("The model output is:                      ", test_output[n])

The F-measure score for this responce is:  Score(precision=1.0, recall=1.0, fmeasure=1.0)
The true responce is:                      47162 / 83659 = 56%
The model output is:                       47162 / 83659 = 56.3%


In [25]:
n = indexes[3]
print("The F-measure score for this responce is: ", scores[n]['rougeL'])
print("The true responce is:                     ", true_test_output[n])
print("The model output is:                      ", test_output[n])

The F-measure score for this responce is:  Score(precision=0.0, recall=0.0, fmeasure=0.0)
The true responce is:                      table_average(net change for the year) = 3298
The model output is:                       13928 / 3 = 4660


In [26]:
n = 963
print("The F-measure score for this responce is: ", scores[n]['rougeL'])
print("The true responce is:                     ", true_test_output[n])
print("The model output is:                      ", test_output[n])

The F-measure score for this responce is:  Score(precision=0.25, recall=0.25, fmeasure=0.25)
The true responce is:                      (6569200 * 4.55) / const_1000000 = 29.9
The model output is:                       6569.2 / 4.55 = 1443.9
