In [1]:
!pip install unsloth
!pip install transformers
!pip install trl
!pip install rouge
!pip install scikit-learn

Collecting unsloth
  Downloading unsloth-2025.2.15-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.2.7 (from unsloth)
  Downloading unsloth_zoo-2025.2.7-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.16-py3-none-any.whl.metadata (9.4 kB)
Collecting transformers!=4.47.0,>=4.46.1 (from unsloth)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import AutoTokenizer
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from tqdm import tqdm
import json


# Initialize the same prompt template
finqa_prompt = """Below contains texts before table (pre-text), text after the table (post-text) and the table itself with a question that you must answer by providing only the mathematical or logical operations needed to solve it along with the result.
### Pre-text:
{}
### Table:
{}
### Post-text:
{}
### Question:
{}
### Response:
{}"""

def format_test_input(example):
    """Format a test example for inference"""
    pre_text = ' '.join(example['pre_text']) if example['pre_text'] else ""
    post_text = ' '.join(example['post_text']) if example['post_text'] else ""
    table = "\n".join(" | ".join(str(cell) for cell in row) for row in example['table']) if example['table'] else ""
    
    return finqa_prompt.format(
        pre_text,
        table,
        post_text,
        example['question'],
        ""  # Empty response for generation
    )

def extract_operations_result(response):
    """Extract operations and result from model response"""
    if "### Response:" in response:
        response = response.split("### Response:")[-1].strip()
    
    if "=" in response:
        parts = response.split("=", 1)
        operations = parts[0].strip()
        result = parts[1].strip() if len(parts) > 1 else None
        return operations, result
    
    return None, None

def evaluate_sample(model, tokenizer, example):
    """Evaluate model on a single example"""
    input_text = format_test_input(example)
    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
    
    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            temperature=1e-10,
            # top_p=0.9,
            use_cache = True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            # repetition_penalty=1.2
        )
        
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        generated_ops, generated_result = extract_operations_result(response)
        expected_ops = example['program_re']
        expected_result = str(example['answer'])
        
        # Calculate ROUGE scores
        rouge = Rouge()
        try:
            rouge_scores = rouge.get_scores(
                generated_ops or "",
                expected_ops
            )[0]
        except:
            rouge_scores = {
                'rouge-1': {'f': 0.0},
                'rouge-2': {'f': 0.0},
                'rouge-l': {'f': 0.0}
            }
        
        # Calculate Cosine similarity
        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform([
                generated_ops or "",
                expected_ops
            ])
            cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        except:
            cosine_sim = 0.0
        
        # Check if the generated result matches the expected result
        result_match = False
        if generated_result and expected_result:
            try:
                # Try to convert both to float for numerical comparison
                gen_float = float(generated_result.replace(',', ''))
                exp_float = float(expected_result.replace(',', ''))
                result_match = abs(gen_float - exp_float) < 1e-5  # Allow small floating-point differences
            except:
                # If conversion fails, do string comparison
                result_match = generated_result.strip() == expected_result.strip()
        
        return {
            'question': example['question'],
            'expected_equation': f"{expected_ops} = {expected_result}",
            'generated_equation': f"{generated_ops} = {generated_result}" if generated_ops else None,
            'rouge1_f': rouge_scores['rouge-1']['f'],
            'rouge2_f': rouge_scores['rouge-2']['f'],
            'rougeL_f': rouge_scores['rouge-l']['f'],
            'cosine_similarity': cosine_sim,
            'result_match': result_match,
            'raw_response': response
        }
    
    except Exception as e:
        print(f"Error processing example: {str(e)}")
        return None

def evaluate_full_dataset():
    """Evaluate model on the entire test set"""
    print("Loading model and tokenizer from HuggingFace...")
    model_name = "n3Er/qwen2.5-7b-instruct-finqa-ht"
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Load model with unsloth optimizations
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    
    # Set model to inference mode
    FastLanguageModel.for_inference(model)
    
    print("Loading test dataset...")
    dataset = load_dataset("ibm/finqa", split="test", trust_remote_code=True)
    
    # Filter valid examples
    valid_examples = [
        ex for ex in dataset 
        if ex['program_re'] and ex['answer'] is not None
    ]
    
    results = []
    total_samples = len(valid_examples)
    print(f"\nEvaluating {total_samples} samples...")
    
    # Use tqdm for progress bar
    for sample in tqdm(valid_examples, desc="Evaluating"):
        result = evaluate_sample(model, tokenizer, sample)
        if result:
            results.append(result)
    
    # Calculate metrics
    avg_metrics = {
        'rouge1_f': np.mean([r['rouge1_f'] for r in results]),
        'rouge2_f': np.mean([r['rouge2_f'] for r in results]),
        'rougeL_f': np.mean([r['rougeL_f'] for r in results]),
        'cosine_similarity': np.mean([r['cosine_similarity'] for r in results]),
        'result_accuracy': np.mean([1 if r['result_match'] else 0 for r in results]),
        'total_samples': len(results),
        'successful_evaluations': len(results),
        'failed_evaluations': total_samples - len(results)
    }
    
    # Print results
    print("\nOverall Results:")
    print("=" * 50)
    print(f"Total samples processed: {avg_metrics['total_samples']}")
    print(f"Successful evaluations: {avg_metrics['successful_evaluations']}")
    print(f"Failed evaluations: {avg_metrics['failed_evaluations']}")
    print(f"Average ROUGE-1 F1: {avg_metrics['rouge1_f']:.3f}")
    print(f"Average ROUGE-2 F1: {avg_metrics['rouge2_f']:.3f}")
    print(f"Average ROUGE-L F1: {avg_metrics['rougeL_f']:.3f}")
    print(f"Average Cosine Similarity: {avg_metrics['cosine_similarity']:.3f}")
    print(f"Result Accuracy: {avg_metrics['result_accuracy']:.3f}")
    
    # Save results
    output_file = "evaluation_results.json"
    
    with open(output_file, 'w') as f:
        json.dump({
            'metrics': avg_metrics,
            'detailed_results': results
        }, f, indent=2)
    
    print(f"\nDetailed results saved to {output_file}")
    return results, avg_metrics

if __name__ == "__main__":
    print("Starting full dataset evaluation...")
    results, metrics = evaluate_full_dataset()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Starting full dataset evaluation...
Loading model and tokenizer from HuggingFace...


tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/112k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/162M [00:00<?, ?B/s]

Unsloth 2025.2.15 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Loading test dataset...


README.md:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

finqa.py:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]


Evaluating 1147 samples...


Evaluating:   3%|▎         | 33/1147 [01:32<49:49,  2.68s/it]  Unsloth: Input IDs of length 2532 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2532) at non-singleton dimension 2


Evaluating:  12%|█▏        | 141/1147 [07:02<56:57,  3.40s/it]  Unsloth: Input IDs of length 2545 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2545) at non-singleton dimension 2


Evaluating:  16%|█▋        | 188/1147 [09:10<45:20,  2.84s/it]Unsloth: Input IDs of length 2290 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2290) at non-singleton dimension 2


Evaluating:  24%|██▍       | 275/1147 [13:36<48:55,  3.37s/it]  Unsloth: Input IDs of length 2300 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2300) at non-singleton dimension 2


Evaluating:  26%|██▋       | 302/1147 [14:53<39:25,  2.80s/it]Unsloth: Input IDs of length 2307 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2307) at non-singleton dimension 2


Evaluating:  27%|██▋       | 315/1147 [15:29<38:48,  2.80s/it]Unsloth: Input IDs of length 2093 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2093) at non-singleton dimension 2


Evaluating:  29%|██▉       | 330/1147 [16:14<45:03,  3.31s/it]Unsloth: Input IDs of length 2283 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2283) at non-singleton dimension 2


Evaluating:  30%|███       | 347/1147 [17:00<40:20,  3.03s/it]Unsloth: Input IDs of length 2272 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2272) at non-singleton dimension 2


Evaluating:  35%|███▌      | 406/1147 [19:51<37:38,  3.05s/it]Unsloth: Input IDs of length 2298 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2298) at non-singleton dimension 2


Evaluating:  40%|████      | 459/1147 [22:12<29:57,  2.61s/it]

Error processing example: The size of tensor a (2048) must match the size of tensor b (2532) at non-singleton dimension 2


Evaluating:  46%|████▌     | 522/1147 [25:09<25:51,  2.48s/it]Unsloth: Input IDs of length 2102 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2102) at non-singleton dimension 2


Evaluating:  59%|█████▊    | 672/1147 [32:32<22:23,  2.83s/it]

Error processing example: The size of tensor a (2048) must match the size of tensor b (2545) at non-singleton dimension 2


Evaluating:  68%|██████▊   | 778/1147 [37:42<19:20,  3.14s/it]Unsloth: Input IDs of length 2536 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2536) at non-singleton dimension 2


Evaluating:  68%|██████▊   | 784/1147 [37:57<17:38,  2.92s/it]

Error processing example: The size of tensor a (2048) must match the size of tensor b (2283) at non-singleton dimension 2


Evaluating:  69%|██████▉   | 790/1147 [38:10<14:02,  2.36s/it]Unsloth: Input IDs of length 2280 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2280) at non-singleton dimension 2


Evaluating:  77%|███████▋  | 878/1147 [42:22<15:19,  3.42s/it]Unsloth: Input IDs of length 2278 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2278) at non-singleton dimension 2


Evaluating:  81%|████████  | 924/1147 [44:31<11:16,  3.03s/it]Unsloth: Input IDs of length 2542 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2542) at non-singleton dimension 2


Evaluating:  86%|████████▌ | 986/1147 [47:30<07:12,  2.69s/it]Unsloth: Input IDs of length 2276 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Error processing example: The size of tensor a (2048) must match the size of tensor b (2276) at non-singleton dimension 2


Evaluating: 100%|██████████| 1147/1147 [55:48<00:00,  2.92s/it]


Overall Results:
Total samples processed: 1129
Successful evaluations: 1129
Failed evaluations: 18
Average ROUGE-1 F1: 0.743
Average ROUGE-2 F1: 0.673
Average ROUGE-L F1: 0.743
Average Cosine Similarity: 0.828
Result Accuracy: 0.377

Detailed results saved to evaluation_results.json



