In [2]:
from llama_cpp import Llama
import pandas as pd
import re

llm = Llama(model_path="Llama-3.2-3B-Instruct-uncensored-f16.gguf")
eval_df = pd.read_csv("training_data/addition_operations_eval.csv")

llama_model_loader: loaded meta data with 29 key-value pairs and 256 tensors from Llama-3.2-3B-Instruct-uncensored-f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
llama_model_loader: - kv   3:                       general.organization str              = Meta Llama
llama_model_loader: - kv   4:                           general.finetune str              = Instruct
llama_model_loader: - kv   5:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   6:                         general.size_label str              = 3B
llama_model_loader: - kv   7:           

In [3]:
num_problems = 50


def solve_math_problem(math_problem):
    response = llm(
        f"Solve this math problem, only print the final answer, do not provide steps: {math_problem}",
        max_tokens=50
    )
    solution = response['choices'][0]['text'].strip()
    return solution


def extract_number(text):
    match = re.search(r'\d+', text)
    return int(match.group()) if match else None


if num_problems <= 0:
    num_problems = len(eval_df)
elif num_problems > len(eval_df):
    num_problems = len(eval_df)

eval_subset = eval_df.head(num_problems)

eval_subset['Raw_Result'] = eval_subset['Operation'].apply(
    solve_math_problem)
eval_subset['Model_Result'] = eval_subset['Raw_Result'].apply(extract_number)
eval_subset['Correct'] = eval_subset.apply(
    lambda row: row['Result'] == row['Model_Result'], axis=1)

llama_perf_context_print:        load time =     716.93 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    23 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    49 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    6541.95 ms /    72 tokens
Llama.generate: 19 prefix-match hit, remaining 4 prompt tokens to eval
llama_perf_context_print:        load time =     716.93 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     4 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    49 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    5947.37 ms /    53 tokens
Llama.generate: 19 prefix-match hit, remaining 4 prompt tokens to eval
llama_perf_context_print:        load time =     716.93 ms
llama_perf_context_print: pro

In [4]:
eval_subset

Unnamed: 0,Operation,Result,Raw_Result,Model_Result,Correct
0,945 + 859,1804,.\n\n\n## Step 1: We are given the task of ad...,1,False
1,324 + 558,882,"+ 215 + 987 + 106 + 23\n\n\n## Step 1: First,...",215,False
2,373 + 449,822,= \n\n## Step 1: We need to add the numbers 3...,1,False
3,339 + 51,390,= \nThe final answer is: $398$,398,False
4,437 + 568,1005,+ 213 + 137.\n\n\n## Step 1: Add the numbers 4...,213,False
5,747 + 322,1069,"=\n\n\n## Step 1: First, we need to add the nu...",1,False
6,296 + 726,1022,+ 423 + 138 + 46 = ? . . . . . . . ...,423,False
7,920 + 397,1317,Answer: 1317 920 + 397 = 1317.,1317,True
8,465 + 277,742,8 = 4243. (Note: Do not include the equals s...,8,False
9,779 + 905,1684,= \n\n\n= 1684,1684,True


In [5]:
correct_percentage = (eval_subset['Correct'].sum() / len(eval_subset)) * 100
print(f"Correct results: {correct_percentage:.2f}%")

Correct results: 20.00%


## Base model


In [6]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from concurrent.futures import ThreadPoolExecutor

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Load the evaluation data from the CSV file
eval_df = pd.read_csv("training_data/addition_operations_eval.csv")

# Function to solve a math problem using the model


def solve_math_problem(math_problem):
    response = generator(
        f"Solve this math problem, only print the final answer: {math_problem}",
        max_length=150
    )
    print(response)
    solution = response[0]['generated_text'].strip()
    return solution

# Function to extract only numbers from the model's result


def extract_number(text):
    match = re.search(r'\d+', text)
    return int(match.group()) if match else None


# Set the number of problems to process
num_problems = 10

# Ensure num_problems is within the valid range
if num_problems <= 0:
    num_problems = len(eval_df)
elif num_problems > len(eval_df):
    num_problems = len(eval_df)

# Process only first num_problems rows
eval_subset = eval_df.head(num_problems)

# Function to process each row


def process_row(operation):
    raw_response = solve_math_problem(operation)
    model_result = extract_number(raw_response)
    return raw_response, model_result


# Use ThreadPoolExecutor to process rows in parallel
with ThreadPoolExecutor() as executor:
    results = list(executor.map(process_row, eval_subset['Operation']))

# Unpack the results and add them to the DataFrame
eval_subset['Raw_Response'], eval_subset['Model_Result'] = zip(*results)
eval_subset['Correct'] = eval_subset.apply(
    lambda row: row['Result'] == row['Model_Result'], axis=1)

# Calculate the percentage of correct results
correct_percentage = (eval_subset['Correct'].sum() / len(eval_subset)) * 100

# Print the comparison results and the percentage of correct results
print(eval_subset[['Operation', 'Result',
      'Raw_Response', 'Model_Result', 'Correct']])
print(f"Correct results: {correct_percentage:.2f}%")

  from .autonotebook import tqdm as notebook_tqdm
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Solve this math problem, only print the final answer: 373 + 449 ='}]
[{'generated_text': 'Solve this math problem, only print the final answer: 747 + 322 ='}]
[{'generated_text': 'Solve this math problem, only print the final answer: 324 + 558 ='}]
[{'generated_text': 'Solve this math problem, only print the final answer: 437 + 568 ='}]
[{'generated_text': 'Solve this math problem, only print the final answer: 339 + 518 ='}]
[{'generated_text': 'Solve this math problem, only print the final answer: 920 + 397 + 1234 + 5678 ='}]
[{'generated_text': 'Solve this math problem, only print the final answer: 296 + 726 + 1234 + 5678 ='}]
[{'generated_text': 'Solve this math problem, only print the final answer: 945 + 859 + 763 + 642 + 531 + 420 + 311 + 200 + 100 ='}]
[{'generated_text': 'Solve this math problem, only print the final answer: 779 + 905 + 863 + 741 + 629 + 517 + 405 + 393 + 381 + 379 + 377 + 375 + 373 + 371 + 369 + 367 + 365 + 363 + 361 + 359 + 357 + 355 + 353

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_subset['Raw_Response'], eval_subset['Model_Result'] = zip(*results)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_subset['Raw_Response'], eval_subset['Model_Result'] = zip(*results)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_subset['Correct'] = eval_subset.apply(
