In [None]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
!pip index versions unsloth

In [None]:
# Load the fine-tuned checkpoint and its tokenizer through Unsloth.
# NOTE(review): unsloth is imported ahead of torch here; presumably Unsloth
# needs to be imported first so it can patch other libraries — confirm before
# reordering these two imports.
from unsloth import FastLanguageModel
import torch
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# The three settings above are passed through unchanged; the call returns the
# model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "BijanProjects/Llama_3.1_8B_IBM_FinQA_FineTuned", 
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
from datasets import load_dataset

# Fetch the "test" split of the "zhoujun/hitab" dataset. Later cells load the
# same split again and rebind `test`.
test = load_dataset("zhoujun/hitab", split = "test")

In [None]:
# Prompt template with three positional `{}` slots, filled in order with the
# table, the question and the response. The Pre-text and Post-text sections
# carry a literal "" instead of a placeholder.
# NOTE(review): presumably this mirrors the template the model was fine-tuned
# with — confirm before rewording any part of the string.
FinQA_prompt = """Below contains texts before table (pre-text), text after the table (post-text) and the table itself with a question that you must answer.

### Pre-text:
""

### Table:
{}

### Post-text:
""

### Question:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer; formatting_prompts_func
# appends it to every formatted example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Build full prompt-plus-answer strings for a batch of examples.

    Reads the "table", "question" and "answer" columns of `examples`, fills
    `FinQA_prompt` with each table, question and the first element of the
    matching answer, and appends `EOS_TOKEN` to the result.

    Returns:
        dict with a single "text" key holding the list of formatted strings.
    """
    rows = zip(examples["table"], examples["question"], examples["answer"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    formatted = [
        FinQA_prompt.format(tbl, qst, ans[0]) + EOS_TOKEN
        for tbl, qst, ans in rows
    ]
    return { "text" : formatted, }
pass

from datasets import load_dataset

# Load the "zhoujun/hitab" test split again and add the "text" column produced
# by formatting_prompts_func (applied in batched mode).
test = load_dataset("zhoujun/hitab", split = "test").map(
    formatting_prompts_func,
    batched = True,
)

In [None]:
test['text'][0]

In [None]:
len(test)

# Show current memory stats

In [None]:
#@title Show current memory stats
# Report the name and total memory of GPU 0 plus the peak memory reserved so
# far, both converted from bytes to GB (1024**3 bytes) and rounded to 3 places.
BYTES_PER_GB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Deployment

In [None]:
test = load_dataset("zhoujun/hitab", split = "test")

def formatting_prompts_test(examples):
    """Build inference prompts and collect the reference answers for a batch.

    Each prompt is `FinQA_prompt` filled with the table and the question, with
    an empty string in the response slot; no EOS token is appended here. The
    reference answer is the first element of each "answer" entry.

    Returns:
        dict with "text" (the prompts) and "true_responses" (the references).
    """
    tables = examples["table"]
    questions = examples["question"]
    answers = examples["answer"]

    rows = list(zip(tables, questions, answers))
    # Response slot left blank so the model has to generate it.
    prompts = [FinQA_prompt.format(tbl, qst, "") for tbl, qst, _ in rows]
    references = [ans[0] for _, _, ans in rows]
    return { "text" : prompts, "true_responses" : references,}

test = test.map(formatting_prompts_test, batched = True)

In [None]:
print(test["text"][0])
print(test["true_responses"][0])

In [None]:
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Hand-written prompt (not taken from the dataset) used as a quick sanity check.
prompt = ["""### pretext:
Can you explain about what are you able to do?
### Response:
"""]

# Tokenize, move the tensors to the GPU, and generate at most 64 new tokens.
test_input = tokenizer(prompt, return_tensors = "pt")
test_input = test_input.to("cuda")
outputs = model.generate(
    **test_input,
    max_new_tokens = 64,
    use_cache = True,
)
# Last expression of the cell: the decoded text is shown as the cell output.
tokenizer.batch_decode(outputs)

In [None]:
import re

pred_test_output = []
pred_final_answer = []


def extract_response(text):
    """Return the text between "Response:\\n" and the first "<|eot_id|>".

    The match is made with re.DOTALL so that a response spanning several
    lines is captured in full; without it `.` stops at the first newline and
    every multi-line response would be reported as missing.

    Args:
        text: decoded model output (prompt followed by the generation).

    Returns:
        The captured response string, or None when either the "Response:\\n"
        header or the "<|eot_id|>" terminator is absent.
    """
    pattern = r"Response:\n(.*?)<\|eot_id\|>"
    match = re.search(pattern, text, flags=re.DOTALL)
    if match:
        return match.group(1)
    return None

def extract_answer(text):
    """Return whatever sits between an "=" and "<|eot_id|>" on the same line.

    The search runs over the whole of `text` and `.` does not cross newlines,
    so only an "=" on the line that carries "<|eot_id|>" can match; the
    capture starts right after the leftmost such "=" (leading spaces are kept).

    Returns:
        The captured string, or None when there is no such match.
    """
    found = re.compile(r"=(.*?)<\|eot_id\|>").search(text)
    return None if found is None else found.group(1)


# Inference mode only has to be enabled once, not on every iteration.
FastLanguageModel.for_inference(model)

# Read the prompt column a single time; indexing test["text"] inside the loop
# may re-materialise the whole column on every iteration.
prompts = test["text"]

for prompt_text in prompts:
    test_input = tokenizer(prompt_text, return_tensors = "pt").to("cuda")
    # Greedy decoding is requested explicitly with do_sample = False. The
    # previous temperature = 1e-10 was a stand-in for this: it is either
    # ignored (sampling disabled) or divides the logits by a near-zero value
    # (sampling enabled), which is numerically fragile.
    outputs = model.generate(**test_input, max_new_tokens = 64, use_cache = True, do_sample = False)
    decoded_output = tokenizer.batch_decode(outputs)
    # Both extractors return None when their pattern is not found, so the two
    # lists stay the same length as the prompt list.
    pred_test_output.append(extract_response(decoded_output[0]))
    pred_final_answer.append(extract_answer(decoded_output[0]))

In [None]:
true_test_output = test["true_responses"]

In [None]:
import pickle

# Write the list of extracted model responses to pred_test_output.pkl
# (relative to the current working directory).
with open("pred_test_output.pkl", 'wb') as pkl_out:
    pickle.dump(pred_test_output, pkl_out)

In [None]:
import pickle

# Write the list of reference answers to true_test_output.pkl
# (relative to the current working directory).
with open("true_test_output.pkl", 'wb') as pkl_out:
    pickle.dump(true_test_output, pkl_out)

In [None]:
# with open("true_test_output.pkl", 'rb') as file:
    # true_test_output = pickle.load(file)

In [None]:
import pickle

# Write the "text" column of `test` (the prompts) to test_prompts.pkl
# (relative to the current working directory).
with open("test_prompts.pkl", 'wb') as pkl_out:
    pickle.dump(test["text"], pkl_out)

In [None]:
!pip install rouge-score

In [None]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# ROUGE-L results for every (reference, extracted final answer) pair that
# could be scored.
# NOTE: pairs that raise AttributeError are skipped, so once a skip has
# happened scores[i] no longer lines up with true_test_output[i].
scores = []
attribute_error_count = 0

try:
    for ref, hyp in zip(true_test_output, pred_final_answer):
        try:
            scores.append(scorer.score(ref, hyp))
        except AttributeError as e:
            # Expected for entries that are not strings, e.g. a prediction of
            # None left by extract_answer when nothing matched.
            print(f"An AttributeError occurred: {e}")
            attribute_error_count += 1
except Exception as e:
    # Anything else is a real problem: report it, then re-raise so the cells
    # below do not average over a silently truncated `scores` list.
    print(f"An unexpected error occurred: {e}")
    raise

# Print the total number of AttributeError exceptions
print(f"Total number of AttributeError exceptions: {attribute_error_count}")

    

In [None]:
# Average ROUGE-L precision / recall / F-measure over every entry in `scores`.
if scores:
    n_scored = len(scores)
    precision = sum(float(entry['rougeL'].precision) for entry in scores) / n_scored
    recall = sum(float(entry['rougeL'].recall) for entry in scores) / n_scored
    fmeasure = sum(float(entry['rougeL'].fmeasure) for entry in scores) / n_scored
else:
    # No pair was scored: report that instead of failing with ZeroDivisionError.
    print("No ROUGE-L scores available; averages are undefined.")
    precision = recall = fmeasure = float("nan")

print("The Precision (Rouge-L): {0:.2f}".format(precision))
print("The Recall (Rouge-L):    {0:.2f}".format(recall))
print("The F-Measure (Rouge-L): {0:.2f}".format(fmeasure))


In [None]:
# Positions within `scores` whose ROUGE-L F-measure (third field of the score
# entry) is below 0.4; the count is printed.
indexes = [
    position
    for position, entry in enumerate(scores)
    if float(entry['rougeL'][2]) < 0.4
]

print(len(indexes))

In [None]:
# ROUGE-L F-measure of every entry in `scores`, in order.
rougeL_fmeasure = [float(entry['rougeL'][2]) for entry in scores]

# Position (within `scores`) of the first occurrence of the lowest F-measure.
lowest_fmeasure = min(rougeL_fmeasure)
print(rougeL_fmeasure.index(lowest_fmeasure))

# Loading The Outputs and Metric Reports:

In [None]:
!pip install rouge-score

In [None]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# ROUGE-L results for every (reference, full model response) pair that could
# be scored.
# NOTE: pairs that raise AttributeError are skipped, so once a skip has
# happened scores[i] no longer lines up with true_test_output[i].
# NOTE(review): in this section the pickled lists are loaded by the cell that
# follows this one; on a fresh kernel that cell has to run first.
scores = []
attribute_error_count = 0

try:
    for ref, hyp in zip(true_test_output, pred_test_output):
        try:
            scores.append(scorer.score(ref, hyp))
        except AttributeError as e:
            # Expected for entries that are not strings, e.g. a prediction of
            # None left by extract_response when nothing matched.
            print(f"An AttributeError occurred: {e}")
            attribute_error_count += 1
except Exception as e:
    # Anything else is a real problem: report it, then re-raise so later cells
    # do not work with a silently truncated `scores` list.
    print(f"An unexpected error occurred: {e}")
    raise

# Print the total number of AttributeError exceptions
print(f"Total number of AttributeError exceptions: {attribute_error_count}")

    

In [None]:
import pickle

# Reload the saved predictions and references from the Kaggle input directory.
# NOTE: unpickling can execute arbitrary code; only load files from a source
# you trust.
with open("/kaggle/input/the-finetuned-llama-output/pred_test_output.pkl", 'rb') as pred_fh:
    pred_test_output = pickle.load(pred_fh)

with open("/kaggle/input/the-finetuned-llama-output/true_test_output.pkl", 'rb') as true_fh:
    true_test_output = pickle.load(true_fh)


In [None]:
type(pred_test_output)

In [None]:
# Inspect one sample: its ROUGE-L score entry, the reference and the model output.
# The second line prints the whole 'rougeL' entry, not only the F-measure.
# NOTE(review): the scoring cell skips pairs that fail, so position n in
# `scores` may not belong to position n in the other two lists — verify.
n = 675
print("The sample index:                         ", n)
rouge_l_entry = scores[n]['rougeL']
print("The F-measure score for this responce is: ", rouge_l_entry)
reference_text = true_test_output[n]
print("The true responce is:                     ", reference_text)
predicted_text = pred_test_output[n]
print("The model output is:                      ", predicted_text)

In [None]:
# Inspect one sample: its ROUGE-L score entry, the reference and the model output.
# The second line prints the whole 'rougeL' entry, not only the F-measure.
# NOTE(review): the scoring cell skips pairs that fail, so position n in
# `scores` may not belong to position n in the other two lists — verify.
n = 3
print("The sample index:                         ", n)
rouge_l_entry = scores[n]['rougeL']
print("The F-measure score for this responce is: ", rouge_l_entry)
reference_text = true_test_output[n]
print("The true responce is:                     ", reference_text)
predicted_text = pred_test_output[n]
print("The model output is:                      ", predicted_text)

In [None]:
# Inspect one sample: its ROUGE-L score entry, the reference and the model output.
# The second line prints the whole 'rougeL' entry, not only the F-measure.
# NOTE(review): the scoring cell skips pairs that fail, so position n in
# `scores` may not belong to position n in the other two lists — verify.
n = 963
print("The sample index:                         ", n)
rouge_l_entry = scores[n]['rougeL']
print("The F-measure score for this responce is: ", rouge_l_entry)
reference_text = true_test_output[n]
print("The true responce is:                     ", reference_text)
predicted_text = pred_test_output[n]
print("The model output is:                      ", predicted_text)

In [None]:
# Inspect one sample: its ROUGE-L score entry, the reference and the model output.
# The second line prints the whole 'rougeL' entry, not only the F-measure.
# NOTE(review): the scoring cell skips pairs that fail, so position n in
# `scores` may not belong to position n in the other two lists — verify.
n = 594
print("The sample index:                         ", n)
rouge_l_entry = scores[n]['rougeL']
print("The F-measure score for this responce is: ", rouge_l_entry)
reference_text = true_test_output[n]
print("The true responce is:                     ", reference_text)
predicted_text = pred_test_output[n]
print("The model output is:                      ", predicted_text)

In [None]:
test["text"][22]

In [None]:
# Inspect one sample: its ROUGE-L score entry, the reference and the model output.
# The second line prints the whole 'rougeL' entry, not only the F-measure.
# NOTE(review): the scoring cell skips pairs that fail, so position n in
# `scores` may not belong to position n in the other two lists — verify.
n = 22
print("The sample index:                         ", n)
rouge_l_entry = scores[n]['rougeL']
print("The F-measure score for this responce is: ", rouge_l_entry)
reference_text = true_test_output[n]
print("The true responce is:                     ", reference_text)
predicted_text = pred_test_output[n]
print("The model output is:                      ", predicted_text)

In [None]:
type(test["text"])

In [None]:
true_test_output[1000]

In [None]:
import re

# Sample lists
# true_test_output 
list_1 = pred_test_output
list_2 = pred_final_answer

# Function to check if a string contains a numerical value
def contains_number(s):
    """Return True when `s` contains at least one digit character.

    Args:
        s: normally a string. None is treated as containing no number; any
           other non-string value (for example an int or float answer) is
           converted with str() first instead of raising TypeError.

    Returns:
        bool: True if a digit is found, False otherwise.
    """
    if s is None:
        return False
    if not isinstance(s, str):
        s = str(s)
    return bool(re.search(r'\d', s))

# One flag per reference answer: True when the reference contains a digit.
is_numeric = list(map(contains_number, true_test_output))

def _select_numeric(values):
    """Keep the entries of `values` whose paired flag in is_numeric is true."""
    return [entry for entry, keep in zip(values, is_numeric) if keep]

# Apply the same mask to both prediction lists and to the references.
filtered_list_1_numeric = _select_numeric(list_1)
filtered_list_2_numeric = _select_numeric(list_2)
filtered_true_test_output_numeric = _select_numeric(true_test_output)

# Display results
print("Filtered Numeric True Test Output:", filtered_true_test_output_numeric)
print("Filtered Numeric List 1:", filtered_list_1_numeric)
print("Filtered Numeric List 2:", filtered_list_2_numeric)


In [None]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# ROUGE-L results for the numeric subset: every (reference, extracted final
# answer) pair that could be scored.
# NOTE: pairs that raise AttributeError are skipped, so once a skip has
# happened scores[i] no longer lines up with the filtered lists.
scores = []
attribute_error_count = 0

try:
    for ref, hyp in zip(filtered_true_test_output_numeric, filtered_list_2_numeric):
        try:
            scores.append(scorer.score(ref, hyp))
        except AttributeError as e:
            # Expected for entries that are not strings, e.g. a prediction of
            # None left by extract_answer when nothing matched.
            print(f"An AttributeError occurred: {e}")
            attribute_error_count += 1
except Exception as e:
    # Anything else is a real problem: report it, then re-raise so the cell
    # below does not average over a silently truncated `scores` list.
    print(f"An unexpected error occurred: {e}")
    raise

# Print the total number of AttributeError exceptions
print(f"Total number of AttributeError exceptions: {attribute_error_count}")

    

In [None]:
# Average ROUGE-L precision / recall / F-measure over every entry in `scores`.
if scores:
    n_scored = len(scores)
    precision = sum(float(entry['rougeL'].precision) for entry in scores) / n_scored
    recall = sum(float(entry['rougeL'].recall) for entry in scores) / n_scored
    fmeasure = sum(float(entry['rougeL'].fmeasure) for entry in scores) / n_scored
else:
    # No pair was scored: report that instead of failing with ZeroDivisionError.
    print("No ROUGE-L scores available; averages are undefined.")
    precision = recall = fmeasure = float("nan")

print("The Precision (Rouge-L): {0:.2f}".format(precision))
print("The Recall (Rouge-L):    {0:.2f}".format(recall))
print("The F-Measure (Rouge-L): {0:.2f}".format(fmeasure))
