In [2]:
import json

def get_last_words(text, n=40):
    """Return the trailing ``n`` whitespace-separated words of ``text``.

    Args:
        text: String to take the tail of.
        n: Maximum number of trailing words to keep (default 40).

    Returns:
        The last ``n`` words joined by single spaces when ``text`` contains at
        least ``n`` words; otherwise ``text`` itself, unchanged. An empty
        string when ``n`` is zero or negative.
    """
    if n <= 0:
        # words[-0:] is the same slice as words[0:], so without this guard a
        # request for zero words would return every word instead of none.
        return ''
    words = text.split()
    return ' '.join(words[-n:]) if len(words) >= n else text

# Load the saved comparison results. The encoding is pinned to UTF-8 so the
# file decodes the same way regardless of the platform's locale default.
with open('hitab_comparative_evaluation_results.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Print the headline format-adherence numbers for both models
base = data["base_model"]["metrics"]
ft = data["fine_tuned_model"]["metrics"]
print(f"Base model: {base['equations_found']}/{base['total_samples']} ({base['format_adherence']*100:.1f}%)")
print(f"Fine-tuned: {ft['equations_found']}/{ft['total_samples']} ({ft['format_adherence']*100:.1f}%)")

# Print per-sample information, pairing the two models' results by position.
# zip() stops at the shorter of the two result lists.
for i, (base_sample, ft_sample) in enumerate(zip(data["base_model"]["results"], data["fine_tuned_model"]["results"])):
    print(f"\n--- SAMPLE {i+1} ---")
    print(f"Q: {base_sample['question']}")
    print(f"Expected: {base_sample['expected_answer']}")
    # The aggregation type is optional metadata; show it only when present
    if "table_metadata" in base_sample and "aggregation" in base_sample["table_metadata"]:
        print(f"Aggregation: {base_sample['table_metadata']['aggregation']}")

    # Only the tail of each response is shown (see get_last_words)
    print(f"Base model: ...{get_last_words(base_sample['generated_response'])}")
    print(f"Fine-tuned: ...{get_last_words(ft_sample['generated_response'])}")

Base model: 27/50 (54.0%)
Fine-tuned: 50/50 (100.0%)

--- SAMPLE 1 ---
Q: how many percent of university graduates among second-generation black women who originated from jamaica was higher than that of men in 2016?
Expected: [15.5]
Aggregation: ['diff']
Base model: ...detailed answer where appropriate.Human: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result. ### Table: | Year | Sales (in thousands) | |------
Fine-tuned: ...'last_row': 0, 'first_column': 1, 'last_column': 2}], 'top_header_rows_num': 3, 'left_header_columns_num': 1} ### Question: how many percent of university graduates among second-generation black women who originated from jamaica was higher than that of men in 2016? ### Response: subtract(15.8, 31.3) = -15.5

--- SAMPLE 2 ---
Q: what is the difference between black women with university degree only who originated from haitian and that of men?
Expected: [18.8]
Aggregat

In [3]:
import json
import re
import math
from collections import Counter

def extract_answer(response):
    """Pull the answer text that follows an equals sign out of a response.

    Two patterns are tried in order and the first one that matches wins; the
    captured group is returned with surrounding whitespace stripped. When
    neither pattern matches, an empty string is returned.
    """
    answer_patterns = (
        # Preferred shape: "### Response: operation(...) = answer"
        r'### Response:.*?=\s*([-\d\.\w\s]+)',
        # Fallback: whatever trails an equals sign at the end of the text
        r'=\s*([-\d\.\w\s]+)$',
    )

    for pattern in answer_patterns:
        found = re.search(pattern, response, re.DOTALL)
        if found is not None:
            return found.group(1).strip()

    return ""

def clean_expected_answer(answer):
    """Normalise an expected answer into a plain comparison string.

    Args:
        answer: The expected answer. May be a string, a scalar such as a
            number, or a (possibly nested) list/tuple of those.

    Returns:
        The answer as text with square brackets and single/double quote
        characters removed and surrounding whitespace stripped. Sequence
        elements are joined with ", ".
    """
    def _to_text(value):
        # Join sequence elements directly rather than going through
        # str(list): the list repr escapes backslashes/newlines and switches
        # quote style, which would leak repr artefacts into the answer.
        if isinstance(value, (list, tuple)):
            return ', '.join(_to_text(item) for item in value)
        # Scalars (e.g. a bare float) are stringified so re.sub below does
        # not raise TypeError on non-string input.
        return value if isinstance(value, str) else str(value)

    # Remove brackets and quote characters
    cleaned = re.sub(r'[\[\]\'"]', '', _to_text(answer))
    return cleaned.strip()

def calculate_rouge_l(pred, ref):
    """Calculate a character-level, case-insensitive RougeL F1 score.

    The score is based on the longest common subsequence (LCS) of the
    characters of the two lower-cased strings.

    Args:
        pred: Predicted answer string.
        ref: Reference answer string.

    Returns:
        The F1 of LCS precision (LCS length / prediction length) and LCS
        recall (LCS length / reference length), or 0 when either string is
        empty or they share no characters.
    """
    if not pred or not ref:
        return 0

    # Normalise case once and measure lengths on the normalised text.
    # str.lower() can change a string's length (e.g. 'İ' becomes two code
    # points), so dividing the LCS of the lowered strings by the un-lowered
    # lengths could push precision/recall above 1.
    pred_norm = pred.lower()
    ref_norm = ref.lower()

    # Calculate LCS
    lcs_len = lcs_length(pred_norm, ref_norm)

    # Calculate precision and recall (both strings are non-empty here)
    precision = lcs_len / len(pred_norm)
    recall = lcs_len / len(ref_norm)

    # Calculate F1 score
    if precision + recall == 0:
        return 0
    return (2 * precision * recall) / (precision + recall)

def lcs_length(s1, s2):
    """Return the length of the longest common subsequence of s1 and s2.

    Uses the standard dynamic-programming recurrence, keeping only the
    previous and the current row of the table.
    """
    # Row for the empty prefix of s1: no characters matched against any
    # prefix of s2.
    previous = [0] * (len(s2) + 1)

    for ch1 in s1:
        current = [0]
        for col, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                # Extend the best subsequence of both shorter prefixes
                current.append(previous[col - 1] + 1)
            else:
                # Carry over the better of dropping a character from either side
                current.append(max(previous[col], current[col - 1]))
        previous = current

    return previous[-1]

def cosine_similarity(s1, s2):
    """Cosine similarity between the character-frequency vectors of two strings.

    Both strings are lower-cased before counting. Returns 0 when either
    string is empty.
    """
    if not (s1 and s2):
        return 0

    # Character histograms of the lower-cased inputs
    counts_a = Counter(s1.lower())
    counts_b = Counter(s2.lower())

    # Only characters present in both strings contribute to the dot product
    dot_product = 0
    for char, freq in counts_a.items():
        if char in counts_b:
            dot_product += freq * counts_b[char]

    # Euclidean norms of the two histograms
    norm_a = math.sqrt(sum(freq ** 2 for freq in counts_a.values()))
    norm_b = math.sqrt(sum(freq ** 2 for freq in counts_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0

    return dot_product / (norm_a * norm_b)

def analyze_model_answers(data):
    """Analyze the answers from the fine-tuned model.

    For every entry in ``data["fine_tuned_model"]["results"]`` the expected
    answer is cleaned, the model's answer is extracted from its generated
    response, and RougeL, cosine similarity and case-insensitive exact match
    are computed. Per-sample details and a summary are printed.

    Samples from which no answer can be extracted are skipped: they get a
    warning, are counted in ``extraction_failures``, and are NOT part of the
    averages or the exact-match rate.

    Args:
        data: Mapping with a ``"fine_tuned_model"`` entry whose ``"results"``
            is a sequence of dicts with ``"question"``, ``"expected_answer"``
            and ``"generated_response"`` keys.

    Returns:
        Dict with ``"results"`` (per-sample dicts for scored samples),
        ``"avg_rouge_l"``, ``"avg_cosine_sim"``, ``"exact_match_rate"``
        (all computed over scored samples only), plus ``"total_samples"``
        and ``"extraction_failures"``.
    """
    results = []
    ft_results = data["fine_tuned_model"]["results"]

    print("\n===== ANSWER ANALYSIS =====")
    print(f"Total samples: {len(ft_results)}\n")

    # Track metrics
    total_rouge_l = 0
    total_cosine_sim = 0
    exact_matches = 0
    extraction_failures = 0

    for i, result in enumerate(ft_results):
        # Extract and clean the answers
        expected = clean_expected_answer(result["expected_answer"])
        extracted = extract_answer(result["generated_response"])

        # Skip if extraction failed, but keep count so the summary can show
        # how many samples the averages leave out.
        if not extracted:
            extraction_failures += 1
            print(f"WARNING: Could not extract answer from sample {i+1}")
            continue

        # Calculate metrics
        rouge_l = calculate_rouge_l(extracted, expected)
        cosine_sim = cosine_similarity(extracted, expected)
        is_exact_match = extracted.lower() == expected.lower()

        # Update totals
        total_rouge_l += rouge_l
        total_cosine_sim += cosine_sim
        if is_exact_match:
            exact_matches += 1

        # Store results
        results.append({
            "question": result["question"],
            "expected": expected,
            "extracted": extracted,
            "rouge_l": rouge_l,
            "cosine_similarity": cosine_sim,
            "exact_match": is_exact_match
        })

        # Print sample details (long questions are cut to 80 characters)
        question = result["question"]
        print(f"SAMPLE {i+1}:")
        if len(question) > 80:
            print(f"Question: {question[:80]}...")
        else:
            print(f"Question: {question}")
        print(f"Expected: {expected}")
        print(f"Extracted: {extracted}")
        print(f"RougeL: {rouge_l:.4f}")
        print(f"Cosine Similarity: {cosine_sim:.4f}")
        print(f"Exact Match: {'Yes' if is_exact_match else 'No'}")
        print("-" * 60)

    # Calculate averages over the samples that were actually scored
    sample_count = len(results)
    avg_rouge_l = total_rouge_l / sample_count if sample_count > 0 else 0
    avg_cosine_sim = total_cosine_sim / sample_count if sample_count > 0 else 0
    exact_match_rate = exact_matches / sample_count if sample_count > 0 else 0

    # Print summary
    print("\n===== SUMMARY =====")
    print(f"Average RougeL: {avg_rouge_l:.4f}")
    print(f"Average Cosine Similarity: {avg_cosine_sim:.4f}")
    print(f"Exact Match Rate: {exact_match_rate:.4f} ({exact_matches}/{sample_count})")
    print(f"Extraction failures: {extraction_failures}/{len(ft_results)} (excluded from the figures above)")

    return {
        "results": results,
        "avg_rouge_l": avg_rouge_l,
        "avg_cosine_sim": avg_cosine_sim,
        "exact_match_rate": exact_match_rate,
        "total_samples": len(ft_results),
        "extraction_failures": extraction_failures
    }

def load_json_data(file_path):
    """Load JSON data from a file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Pin the encoding: without it open() falls back to the platform's locale
    # encoding, which can mis-decode non-ASCII text in a UTF-8 JSON file.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Main execution
if __name__ == "__main__":
    try:
        # Try to load from file - you can change this filename
        data = load_json_data("hitab_comparative_evaluation_results.json")
        analyze_model_answers(data)
    except FileNotFoundError:
        print("File not found. Please save your JSON data to 'hitab_comparative_evaluation_results.json' or modify the script.")
    except json.JSONDecodeError:
        print("Error parsing JSON. Make sure your JSON data is correctly formatted.")
    except Exception as e:
        # Last-resort handler: report the message instead of a traceback
        print(f"An error occurred: {e}")

# Alternative approach: paste your JSON directly into the script.
# This template is kept as comments rather than a bare triple-quoted string:
# a string expression on the last line of a notebook cell is echoed as the
# cell's result.
#
# # Replace this with your JSON data
# json_data = {
#     "base_model": {
#         "results": [
#             # Your results here
#         ]
#     },
#     "fine_tuned_model": {
#         "results": [
#             # Your results here
#         ]
#     }
# }
#
# analyze_model_answers(json_data)


===== ANSWER ANALYSIS =====
Total samples: 50

SAMPLE 1:
Question: how many percent of university graduates among second-generation black women who...
Expected: 15.5
Extracted: -15.5
RougeL: 0.8889
Cosine Similarity: 0.9258
Exact Match: No
------------------------------------------------------------
SAMPLE 2:
Question: what is the difference between black women with university degree only who origi...
Expected: 18.8
Extracted: -0.1
RougeL: 0.2500
Cosine Similarity: 0.4082
Exact Match: No
------------------------------------------------------------
SAMPLE 3:
Question: which type of workers in manitoba's agricultural region 7 were more likely to ha...
Expected: french-language workers
Extracted: yes
RougeL: 0.1538
Cosine Similarity: 0.3607
Exact Match: No
------------------------------------------------------------
SAMPLE 4:
Question: which type of workers in manitoba's agricultural region 7 were less likely to ha...
Expected: french-language workers
Extracted: yes
RougeL: 0.1538
Cosine

'\n# Replace this with your JSON data\njson_data = {\n    "base_model": {\n        "results": [\n            # Your results here\n        ]\n    },\n    "fine_tuned_model": {\n        "results": [\n            # Your results here\n        ]\n    }\n}\n\nanalyze_model_answers(json_data)\n'