In [1]:
!pip install unsloth
!pip install transformers
!pip install trl
!pip install rouge
!pip install scikit-learn



In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import re
import numpy as np
from tqdm import tqdm

def load_model(model_name, is_fine_tuned=False):
    """Load either base or fine-tuned model from HuggingFace"""
    print(f"Loading {'fine-tuned' if is_fine_tuned else 'base'} model...")
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    
    return model, tokenizer

def is_numerical_reasoning(example):
    """
    Check if an example involves numerical reasoning based on its aggregation field
    Returns True if any aggregation operation involves numerical operations
    """
    # Numerical aggregation operations
    numerical_operations = {
        'sum', 'diff', 'div', 'average', 'min', 'max',
        'greater_than', 'less_than', 'counta', 'range'
    }
    
    # Get the aggregation string and convert it to a list
    agg_str = example.get('aggregation', "['none']")
    # Remove brackets and quotes, split by comma
    agg_list = [op.strip().strip("'\"") for op in agg_str.strip('[]').split(',')]
    
    # Check if any of the aggregations are numerical
    return any(op.strip() in numerical_operations for op in agg_list)

def format_hitab_prompt(example):
    """Format HiTab example for numerical reasoning"""
    # Parse the table content from the string
    table_content = example['table_content']
    
    # Split into rows
    rows = [row.strip() for row in table_content.strip().split('\n')]
    
    # Format the table string with pipe separators
    table_str = '\n'.join(' | '.join(cell.strip() for cell in row.split('\t')) for row in rows)
    
    # Format with just the table and question
    prompt = f"""Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{table_str}
### Question:
{example['question']}
### Response:
"""
    return prompt

def filter_hitab_dataset():
    """Load and filter HiTab dataset for numerical reasoning questions"""
    print("Loading HiTab dataset...")
    dataset = load_dataset("kasnerz/hitab")['test']
    
    numerical_samples = []
    for example in tqdm(dataset):
        if is_numerical_reasoning(example):
            numerical_samples.append(example)
    
    print(f"Found {len(numerical_samples)} numerical reasoning samples out of {len(dataset)} total samples")
    return numerical_samples

def debug_numerical_samples(samples, num_to_show=5):
    """Print details of numerical samples to debug filtering"""
    print(f"\nExamining {num_to_show} samples from {len(samples)} total numerical samples:")
    for i, sample in enumerate(samples[:num_to_show]):
        print(f"\nSample {i+1}:")
        print(f"Question: {sample['question']}")
        print(f"Aggregation: {sample['aggregation']}")
        print(f"Answer: {sample['answer']}")
        print("-" * 50)

def evaluate_model_on_hitab(model, tokenizer, samples, num_samples=None):
    """Evaluate the model on HiTab numerical samples"""
    if num_samples is not None:
        samples = samples[:num_samples]
    
    results = []
    for idx, sample in enumerate(tqdm(samples)):
        # Format input
        input_text = format_hitab_prompt(sample)
        inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
        
        # Generate response
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            temperature=1e-10,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Store results
        result = {
            'question': sample['question'],
            'expected_answer': sample['answer'],
            'generated_response': generated_text,
            'table_metadata': {
                'sentence': sample.get('sub_sentence', ''),
                'table_source': sample.get('table_source', ''),
                'aggregation': sample.get('aggregation', '')
            }
        }
        results.append(result)
        
        # Print progress
        if (idx + 1) % 5 == 0:
            print(f"\nProcessed {idx + 1}/{len(samples)} samples")
            print(f"Last question: {result['question']}")
            print(f"Expected answer: {result['expected_answer']}")
            print(f"Generated response: {result['generated_response']}")
    
    return results

def analyze_results(results, model_type=""):
    """Analyze and summarize the evaluation results"""
    # Basic statistics
    total_samples = len(results)
    
    # Extract generated equations and answers
    equation_pattern = r'([^=]+)=\s*([^\n]+)'
    equations_found = 0
    correct_format = 0
    
    for result in results:
        generated = result['generated_response']
        if '=' in generated:
            equations_found += 1
            if re.search(equation_pattern, generated):
                correct_format += 1
    
    # Calculate metrics
    if total_samples > 0:
        format_adherence = equations_found / total_samples
        format_accuracy = correct_format / total_samples
    else:
        format_adherence = 0
        format_accuracy = 0
    
    print(f"\nEvaluation Results for {model_type}:")
    print("=" * 50)
    print(f"Total samples evaluated: {total_samples}")
    print(f"Responses containing equations: {equations_found} ({format_adherence:.2%})")
    print(f"Responses in correct format: {correct_format} ({format_accuracy:.2%})")
    
    # Analysis of aggregation types
    aggregation_types = {}
    
    for result in results:
        agg_str = result['table_metadata']['aggregation']
        agg_list = [op.strip().strip("'\"") for op in agg_str.strip('[]').split(',')]
        for agg_type in agg_list:
            agg_type = agg_type.strip()
            if agg_type:
                aggregation_types[agg_type] = aggregation_types.get(agg_type, 0) + 1
            
    print("\nAggregation Type Distribution:")
    for agg_type, count in sorted(aggregation_types.items()):
        percentage = count / total_samples * 100 if total_samples > 0 else 0
        print(f"{agg_type}: {count} ({percentage:.1f}%)")
    
    return {
        'total_samples': total_samples,
        'equations_found': equations_found,
        'correct_format': correct_format,
        'format_adherence': format_adherence,
        'format_accuracy': format_accuracy,
        'aggregation_types': aggregation_types
    }

def main(num_samples=None):
    # Model configurations
    base_model_name = "Qwen/Qwen2.5-7B-Instruct"
    fine_tuned_model_name = "n3Er/qwen2.5-7b-instruct-finqa-ht"
    
    # Load both models
    base_model, base_tokenizer = load_model(base_model_name)
    fine_tuned_model, fine_tuned_tokenizer = load_model(fine_tuned_model_name, is_fine_tuned=True)
    
    # Filter HiTab dataset for numerical reasoning samples
    numerical_samples = filter_hitab_dataset()
    
    # Debug sample selection
    print("\nDebugging sample selection:")
    debug_numerical_samples(numerical_samples)
    
    # Evaluate base model
    print("\nEvaluating base model on HiTab numerical reasoning samples...")
    base_results = evaluate_model_on_hitab(base_model, base_tokenizer, numerical_samples, num_samples=num_samples)
    
    # Evaluate fine-tuned model
    print("\nEvaluating fine-tuned model on HiTab numerical reasoning samples...")
    fine_tuned_results = evaluate_model_on_hitab(fine_tuned_model, fine_tuned_tokenizer, numerical_samples, num_samples=num_samples)
    
    # Analyze results
    base_metrics = analyze_results(base_results, "Base Model")
    fine_tuned_metrics = analyze_results(fine_tuned_results, "Fine-tuned Model")
    
    # Save results
    import json
    with open('hitab_comparative_evaluation_results.json', 'w') as f:
        json.dump({
            'base_model': {
                'results': base_results,
                'metrics': base_metrics
            },
            'fine_tuned_model': {
                'results': fine_tuned_results,
                'metrics': fine_tuned_metrics
            }
        }, f, indent=2)
    
    print("\nResults have been saved to 'hitab_comparative_evaluation_results.json'")

if __name__ == "__main__":
    main(num_samples=50)  # Use all samples by default

Loading base model...
Loading tokenizer...
Loading model...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading fine-tuned model...
Loading tokenizer...
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading HiTab dataset...


100%|██████████| 1584/1584 [00:00<00:00, 7719.16it/s]


Found 198 numerical reasoning samples out of 1584 total samples

Debugging sample selection:

Examining 5 samples from 198 total numerical samples:

Sample 1:
Question: how many percent of university graduates among second-generation black women who originated from jamaica was higher than that of men in 2016?
Aggregation: ['diff']
Answer: [15.5]
--------------------------------------------------

Sample 2:
Question: what is the difference between black women with university degree only who originated from haitian and that of men?
Aggregation: ['diff']
Answer: [18.8]
--------------------------------------------------

Sample 3:
Question: which type of workers in manitoba's agricultural region 7 were more likely to have no certificate, diploma or degree, french-language workers or other workers?
Aggregation: ['greater_than']
Answer: ['french-language workers']
--------------------------------------------------

Sample 4:
Question: which type of workers in manitoba's agricultural region 7

 10%|█         | 5/50 [00:50<07:28,  9.96s/it]


Processed 5/50 samples
Last question: which type of workers in alberta's agricultural region 7 were less likely to have no certificate, diploma or degree, french-language workers or other workers?
Expected answer: ['french-language workers']
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'education by fols of workers in the agricultural sector aged 15 years and over, five agricultural regions of western canada, 2011', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}]}, {'row_index': 0, 'column_index': 3, 'children': [{'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}]}, {'row_index': 0, 'column_index': 5, 'children': [{'row

 20%|██        | 10/50 [01:40<06:47, 10.18s/it]


Processed 10/50 samples
Last question: what is the percentage of new mothers took parental leave, according to the roe-based definition in 2012?
Expected answer: [43.4]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'job separation types among parents who received a record of employment for parental leave or received employment insurance benefits, selected years', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': []}, {'row_index': 0, 'column_index': 2, 'children': []}, {'row_index': 0, 'column_index': 3, 'children': []}, {'row_index': 0, 'column_index': 4, 'children': []}, {'row_index': 0, 'column_index': 5, 'children': []}, {'row_index': 0, 'column_index': 6, 'children': []}, {'row_index': 0, 'column_index': 7, 'children': []}, {'row_index': 0, 'column_index': 8, 'children': []}]

 30%|███       | 15/50 [02:35<06:29, 11.14s/it]


Processed 15/50 samples
Last question: from the 1999 cohort to the 2010 cohort,how many percentage points of rc-prs who received social assistance income six years after making their initial claim have declined?
Expected answer: [0.7]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'percentage receiving social assistance income, by cohort', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}, {'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}, {'row_index': 1, 'column_index': 5, 'children': []}, {'row_index': 1, 'column_index': 6, 'children': []}]}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index':

 40%|████      | 20/50 [03:33<05:59, 11.97s/it]


Processed 20/50 samples
Last question: what was the number of violent offences specific to firearms increased by in 2019?
Expected answer: [642]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'police-reported crime for selected offences, canada, 2018 and 2019', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}]}, {'row_index': 0, 'column_index': 3, 'children': [{'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}]}, {'row_index': 0, 'column_index': 5, 'children': [{'row_index': 1, 'column_index': 5, 'children': []}]}, {'row_index': 0, 'column_index': 6, 'children': []}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'chi

 50%|█████     | 25/50 [04:32<04:40, 11.24s/it]


Processed 25/50 samples
Last question: in fy 2016, how many percent of total s&e support to total s&e support to hbcus did fttg account for?
Expected answer: [0.15156]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'federal obligations for science and engineering to historically black colleges and universities, ranked by total amount received, by type of activity: fy 2016', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': []}, {'row_index': 0, 'column_index': 2, 'children': []}, {'row_index': 0, 'column_index': 3, 'children': []}, {'row_index': 0, 'column_index': 4, 'children': []}, {'row_index': 0, 'column_index': 5, 'children': []}, {'row_index': 0, 'column_index': 6, 'children': []}, {'row_index': 0, 'column_index': 7, 'children': []}]}, 'left_root': {'row_index': -1, 'column_i

 60%|██████    | 30/50 [05:29<03:51, 11.57s/it]


Processed 30/50 samples
Last question: what was the proportion of female ecpas were skilled workers?
Expected answer: [0.687765]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'selected characteristics of economic class principal applicants in linked immigrant landing file (1980 through 2006)-discharge abstract database (2006/2007 through 2008/2009) and canadian-born in linked 2006 census-discharge abstract database, by sex, population aged 25 to 74, canada excluding quebec and territories', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': [{'row_index': 2, 'column_index': 1, 'children': []}, {'row_index': 2, 'column_index': 2, 'children': []}, {'row_index': 2, 'column_index': 3, 'children': []}, {'row_index': 2, 'column_index': 4, 

 70%|███████   | 35/50 [06:20<02:29,  9.98s/it]


Processed 35/50 samples
Last question: how many times aboriginal women aged 25 to 54 were more likely than non-aboriginal women to report disabilities?
Expected answer: [2.340659]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'prevalence of disability among women and men aged 15 or older, by aboriginal identity and age group, canada, 2011', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}]}, {'row_index': 0, 'column_index': 3, 'children': [{'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}]}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 2, 'column_index': 1, 'children': [{'row_index': 3, 

 80%|████████  | 40/50 [07:05<01:32,  9.21s/it]


Processed 40/50 samples
Last question: how many times aboriginal women aged 15 or older were more likely than non-aboriginal women to report disabilities?
Expected answer: [1.496599]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'prevalence of disability among women and men aged 15 or older, by aboriginal identity and age group, canada, 2011', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}]}, {'row_index': 0, 'column_index': 3, 'children': [{'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}]}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 2, 'column_index': 1, 'children': [{'row_index': 

 90%|█████████ | 45/50 [07:56<00:50, 10.15s/it]


Processed 45/50 samples
Last question: in 2017, there were 266 firearm-related homicides reported in canada, what was the difference in 2016?
Expected answer: [43]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'homicides, by firearm-related status and province or territory, 2016 and 2017', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': [{'row_index': 2, 'column_index': 1, 'children': []}, {'row_index': 2, 'column_index': 2, 'children': []}, {'row_index': 2, 'column_index': 3, 'children': []}, {'row_index': 2, 'column_index': 4, 'children': []}]}, {'row_index': 1, 'column_index': 5, 'children': [{'row_index': 2, 'column_index': 5, 'children': []}, {'row_index': 2, 'column_index': 6, 'children': []}, {'row_index': 2, 'column_index'

100%|██████████| 50/50 [08:49<00:00, 10.59s/it]



Processed 50/50 samples
Last question: how many times were aboriginal people more likely to report fair or poor mental health than non-aboriginal people?
Expected answer: [2]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'presence of selected social or demographic characteristics, by aboriginal people and non-aboriginal people, provinces and territories, 2014', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': []}, {'row_index': 0, 'column_index': 2, 'children': []}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 1, 'column_index': 1, 'children': [{'row_index': 2, 'column_index': 0, 'children': [{'row_index': 3, 'column_index': 0, 'children': []}, {'row_index': 4, 'column_index': 0, 'children': []}, {'row_index': 5, 'column_index': 0, 'children': [

 10%|█         | 5/50 [00:46<06:53,  9.19s/it]


Processed 5/50 samples
Last question: which type of workers in alberta's agricultural region 7 were less likely to have no certificate, diploma or degree, french-language workers or other workers?
Expected answer: ['french-language workers']
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'education by fols of workers in the agricultural sector aged 15 years and over, five agricultural regions of western canada, 2011', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}]}, {'row_index': 0, 'column_index': 3, 'children': [{'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}]}, {'row_index': 0, 'column_index': 5, 'children': [{'row

 20%|██        | 10/50 [01:40<07:35, 11.39s/it]


Processed 10/50 samples
Last question: what is the percentage of new mothers took parental leave, according to the roe-based definition in 2012?
Expected answer: [43.4]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'job separation types among parents who received a record of employment for parental leave or received employment insurance benefits, selected years', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': []}, {'row_index': 0, 'column_index': 2, 'children': []}, {'row_index': 0, 'column_index': 3, 'children': []}, {'row_index': 0, 'column_index': 4, 'children': []}, {'row_index': 0, 'column_index': 5, 'children': []}, {'row_index': 0, 'column_index': 6, 'children': []}, {'row_index': 0, 'column_index': 7, 'children': []}, {'row_index': 0, 'column_index': 8, 'children': []}]

 30%|███       | 15/50 [02:49<08:48, 15.09s/it]


Processed 15/50 samples
Last question: from the 1999 cohort to the 2010 cohort,how many percentage points of rc-prs who received social assistance income six years after making their initial claim have declined?
Expected answer: [0.7]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'percentage receiving social assistance income, by cohort', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}, {'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}, {'row_index': 1, 'column_index': 5, 'children': []}, {'row_index': 1, 'column_index': 6, 'children': []}]}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index':

 40%|████      | 20/50 [04:26<09:51, 19.72s/it]


Processed 20/50 samples
Last question: what was the number of violent offences specific to firearms increased by in 2019?
Expected answer: [642]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'police-reported crime for selected offences, canada, 2018 and 2019', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}]}, {'row_index': 0, 'column_index': 3, 'children': [{'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}]}, {'row_index': 0, 'column_index': 5, 'children': [{'row_index': 1, 'column_index': 5, 'children': []}]}, {'row_index': 0, 'column_index': 6, 'children': []}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'chi

 50%|█████     | 25/50 [05:55<06:34, 15.80s/it]


Processed 25/50 samples
Last question: in fy 2016, how many percent of total s&e support to total s&e support to hbcus did fttg account for?
Expected answer: [0.15156]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'federal obligations for science and engineering to historically black colleges and universities, ranked by total amount received, by type of activity: fy 2016', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': []}, {'row_index': 0, 'column_index': 2, 'children': []}, {'row_index': 0, 'column_index': 3, 'children': []}, {'row_index': 0, 'column_index': 4, 'children': []}, {'row_index': 0, 'column_index': 5, 'children': []}, {'row_index': 0, 'column_index': 6, 'children': []}, {'row_index': 0, 'column_index': 7, 'children': []}]}, 'left_root': {'row_index': -1, 'column_i

 60%|██████    | 30/50 [07:19<05:48, 17.41s/it]


Processed 30/50 samples
Last question: what was the proportion of female ecpas were skilled workers?
Expected answer: [0.687765]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'selected characteristics of economic class principal applicants in linked immigrant landing file (1980 through 2006)-discharge abstract database (2006/2007 through 2008/2009) and canadian-born in linked 2006 census-discharge abstract database, by sex, population aged 25 to 74, canada excluding quebec and territories', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': [{'row_index': 2, 'column_index': 1, 'children': []}, {'row_index': 2, 'column_index': 2, 'children': []}, {'row_index': 2, 'column_index': 3, 'children': []}, {'row_index': 2, 'column_index': 4, 

 70%|███████   | 35/50 [08:15<02:34, 10.31s/it]


Processed 35/50 samples
Last question: how many times aboriginal women aged 25 to 54 were more likely than non-aboriginal women to report disabilities?
Expected answer: [2.340659]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'prevalence of disability among women and men aged 15 or older, by aboriginal identity and age group, canada, 2011', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}]}, {'row_index': 0, 'column_index': 3, 'children': [{'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}]}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 2, 'column_index': 1, 'children': [{'row_index': 3, 

 80%|████████  | 40/50 [08:47<01:09,  6.97s/it]


Processed 40/50 samples
Last question: how many times aboriginal women aged 15 or older were more likely than non-aboriginal women to report disabilities?
Expected answer: [1.496599]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'prevalence of disability among women and men aged 15 or older, by aboriginal identity and age group, canada, 2011', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': []}, {'row_index': 1, 'column_index': 2, 'children': []}]}, {'row_index': 0, 'column_index': 3, 'children': [{'row_index': 1, 'column_index': 3, 'children': []}, {'row_index': 1, 'column_index': 4, 'children': []}]}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 2, 'column_index': 1, 'children': [{'row_index': 

 90%|█████████ | 45/50 [09:40<00:53, 10.62s/it]


Processed 45/50 samples
Last question: in 2017, there were 266 firearm-related homicides reported in canada, what was the difference in 2016?
Expected answer: [43]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'homicides, by firearm-related status and province or territory, 2016 and 2017', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': [{'row_index': 1, 'column_index': 1, 'children': [{'row_index': 2, 'column_index': 1, 'children': []}, {'row_index': 2, 'column_index': 2, 'children': []}, {'row_index': 2, 'column_index': 3, 'children': []}, {'row_index': 2, 'column_index': 4, 'children': []}]}, {'row_index': 1, 'column_index': 5, 'children': [{'row_index': 2, 'column_index': 5, 'children': []}, {'row_index': 2, 'column_index': 6, 'children': []}, {'row_index': 2, 'column_index'

100%|██████████| 50/50 [10:44<00:00, 12.88s/it]


Processed 50/50 samples
Last question: how many times were aboriginal people more likely to report fair or poor mental health than non-aboriginal people?
Expected answer: [2]
Generated response: Below is a table and a question. Answer the question by providing only the mathematical or logical operations needed to solve it along with the result.
### Table:
{'title': 'presence of selected social or demographic characteristics, by aboriginal people and non-aboriginal people, provinces and territories, 2014', 'top_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 0, 'column_index': 1, 'children': []}, {'row_index': 0, 'column_index': 2, 'children': []}]}, 'left_root': {'row_index': -1, 'column_index': -1, 'children': [{'row_index': 1, 'column_index': 1, 'children': [{'row_index': 2, 'column_index': 0, 'children': [{'row_index': 3, 'column_index': 0, 'children': []}, {'row_index': 4, 'column_index': 0, 'children': []}, {'row_index': 5, 'column_index': 0, 'children': [




In [3]:
from datasets import load_dataset

# Load dataset
print("Loading dataset...")
dataset = load_dataset("kasnerz/hitab")
print(f"\nDataset splits available: {dataset.keys()}")

# Print structure of the first example
print("\nExample structure:")
for key in dataset['test'][0].keys():
    print(f"{key}: {type(dataset['test'][0][key])}")

# Print first few examples to see their structure
print("\nFirst few examples aggregation field:")
for i, example in enumerate(dataset['test'].select(range(5))):
    print(f"\nExample {i+1}:")
    print(f"Question: {example['question']}")
    print(f"Aggregation: {example.get('aggregation', [])}")
    print(f"Keys in example: {example.keys()}")

# Count examples with aggregation
print("\nCounting examples with aggregation...")
has_agg = sum(1 for ex in dataset['test'] if ex.get('aggregation'))
print(f"Examples with aggregation field: {has_agg} out of {len(dataset['test'])}")

# Look at unique aggregation types
agg_types = set()
for ex in dataset['test']:
    if ex.get('aggregation'):
        if isinstance(ex['aggregation'], list):
            agg_types.update(ex['aggregation'])
        else:
            agg_types.add(ex['aggregation'])
print("\nUnique aggregation types found:", sorted(list(agg_types)))

Loading dataset...

Dataset splits available: dict_keys(['train', 'validation', 'test'])

Example structure:
id: <class 'str'>
table_id: <class 'str'>
table_source: <class 'str'>
sentence_id: <class 'str'>
sub_sentence_id: <class 'str'>
sub_sentence: <class 'str'>
question: <class 'str'>
answer: <class 'str'>
aggregation: <class 'str'>
linked_cells: <class 'str'>
answer_formulas: <class 'str'>
reference_cells_map: <class 'str'>
table_content: <class 'str'>

First few examples aggregation field:

Example 1:
Question: in eastern ontario, what percent of french-language workers have worked in the restaurant and food services sector?
Aggregation: ['none']
Keys in example: dict_keys(['id', 'table_id', 'table_source', 'sentence_id', 'sub_sentence_id', 'sub_sentence', 'question', 'answer', 'aggregation', 'linked_cells', 'answer_formulas', 'reference_cells_map', 'table_content'])

Example 2:
Question: in eastern ontario, what percent of other agri-food workers have worked in the restaurant and