# Question Type Classification
This notebook processes questions from GSM and web-dev datasets to classify their types using LLM.

In [1]:
import pandas as pd
import json
from llm_call import call_llm
import random
import asyncio
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from mcp_call import call_DuckDuckGo

## Phase 1: Data Reading and Tool Selection

In [2]:
def read_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data

def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_code_examples():
    with open('prompt/gsm8k.md', 'r') as f:
        return f.read()
    
def load_search_examples():
    with open('prompt/trivia_qa.md', 'r') as f:
        return f.read()

In [3]:
read_json('dataset/verified-web-dev.json')["Data"]

[{'Answer': {'Aliases': ['Kamal kahn',
    'List of Bond girls in Octopussy',
    'Magda (James Bond)',
    'List of James Bond allies in Octopussy',
    'Vijay (James Bond)',
    'Bond 13',
    'Octopussy (character)',
    'Penelope Smallbone',
    'Octopussy',
    'General Orlov',
    'Kamal Khan',
    'Octopussy (film)',
    'List of James Bond villains in Octopussy',
    'Jim Fanning (James Bond)'],
   'HumanAnswers': ['OCTOPUSSY'],
   'MatchedWikiEntityName': 'Octopussy',
   'NormalizedAliases': ['list of bond girls in octopussy',
    'bond 13',
    'list of james bond allies in octopussy',
    'magda james bond',
    'penelope smallbone',
    'kamal kahn',
    'octopussy',
    'list of james bond villains in octopussy',
    'vijay james bond',
    'jim fanning james bond',
    'general orlov',
    'kamal khan',
    'octopussy character',
    'octopussy film'],
   'NormalizedMatchedWikiEntityName': 'octopussy',
   'NormalizedValue': 'octopussy',
   'Type': 'WikipediaEntity',
   'V

In [4]:
def combine_datasets():
    # Read datasets
    gsm_data = read_jsonl('dataset/gsm.jsonl')
    web_data = read_json('dataset/verified-web-dev.json')["Data"]
    
    # Process web_data to get only required fields
    processed_web_data = []
    for entry in web_data:
        processed_web_data.append({
            'question': entry['Question'],
            'question-id': entry['QuestionId'],
            'target': entry['Answer'],
        })
    
    # Convert to DataFrames and standardize column names
    gsm_df = pd.DataFrame(gsm_data)
    gsm_df = gsm_df.rename(columns={'input': 'question'})
    gsm_df['question_type'] = 'Code Execution'  # Add question type
    
    web_df = pd.DataFrame(processed_web_data)
    web_df['question_type'] = 'Search Engine'  # Add question type
    
    # Combine datasets with matching column names
    combined_df = pd.concat([gsm_df, web_df]).reset_index(drop=True)
    
    # Initialize prediction and code columns
    combined_df['question_type_prediction'] = None
    combined_df['generated_code'] = None
    combined_df['feedback'] = None
    
    # Shuffle the combined dataset
    combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Get first 300 of each type after shuffling
    code_execution_df = combined_df[combined_df['question_type'] == 'Code Execution'].head(100)
    search_engine_df = combined_df[combined_df['question_type'] == 'Search Engine'].head(100)
    
    # Final combination and shuffle
    combined_df = pd.concat([code_execution_df, search_engine_df])
    combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

    return combined_df

In [5]:
# Load datasets
combined_df = combine_datasets()
print(f"Total number of questions: {len(combined_df)}")
combined_df

Total number of questions: 200


Unnamed: 0,question,target,question_type,question-id,question_type_prediction,generated_code,feedback
0,A car in the fast lane is traveling at 60 mile...,16,Code Execution,,,,
1,Heather's razors come 4 to a pack and cost $4....,25,Code Execution,,,,
2,John decides to take up an odd hobby of speed ...,12,Code Execution,,,,
3,"What nationality was Christian Barnard, the su...","{'Aliases': ['South africa', 'South Africa's',...",Search Engine,qf_2913,,,
4,What is the flavouring of the liqueur Amaretto?,"{'Aliases': ['Almond tree', 'Luz (nut)', 'Amyg...",Search Engine,sfq_15907,,,
...,...,...,...,...,...,...,...
195,Which vitamin assists in blood clotting?,"{'Aliases': ['Vitamin k 2', 'Vitamin-K', 'Vita...",Search Engine,dpql_2636,,,
196,My mom went to a gardening shop. She bought a ...,38,Code Execution,,,,
197,A farmer extracts 5 liters of milk a day from ...,2,Code Execution,,,,
198,"An American aided coup in Chile in 1973, depos...","{'Aliases': ['Allende Gossens', 'Salvador Goss...",Search Engine,odql_4592,,,


In [6]:
async def process_questions():
    # Load tool examples
    tool_examples = load_tool_examples()
    
    # Process each question
    for idx, row in combined_df.iterrows():
        # Add sleep every 10 iterations
        if idx > 0 and idx % 15 == 0:
            print("Taking a 30-second break...")
            await asyncio.sleep(60)
    
        # Combine tool examples with current question
        prompt = f"{tool_examples}\n\nQuestion: {row['question']}\nAnswer:"
        
        # Call LLM with the prompt
        response = await call_llm(prompt)
        
        # Store the response in the dataframe
        combined_df.at[idx, 'question_type_prediction'] = response.strip()
        
        # Print progress
        print(f"Question {idx + 1}/{len(combined_df)}:")
        print(f"True Type: {row['question_type']}")
        print(f"Predicted Type: {response}\n")

In [50]:
# Run the processing
await process_questions()

Question 1/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 2/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 3/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 4/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 5/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 6/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 7/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 8/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 9/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 10/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 11/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 12/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 13/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 14/200:
True Type: Code Ex

In [51]:
combined_df.to_csv('dataset/combined_dataset_result_task1.csv', index=False)

In [52]:
# Calculate accuracy
accuracy = (combined_df['question_type'] == combined_df['question_type_prediction']).mean()
print(f"Classification accuracy: {accuracy:.2%}")

Classification accuracy: 99.50%


In [57]:
# Calculate detailed metrics
y_true = combined_df['question_type']
y_pred = combined_df['question_type_prediction']

# Calculate individual metrics with weighted averaging
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")

# Display detailed classification report
print("\nDetailed Classification Report:")
print(classification_report(y_true, y_pred))

Accuracy: 99.50%
Precision: 100.00%
Recall: 99.50%

Detailed Classification Report:
                       precision    recall  f1-score   support

Answer: Search Engine       0.00      0.00      0.00         0
       Code Execution       1.00      1.00      1.00       100
        Search Engine       1.00      0.99      0.99       100

             accuracy                           0.99       200
            macro avg       0.67      0.66      0.66       200
         weighted avg       1.00      0.99      1.00       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [59]:
# Find misclassified questions
misclassified = combined_df[combined_df['question_type'].str.strip() != combined_df['question_type_prediction'].str.strip()]

print(f"Number of misclassified questions: {len(misclassified)}\n")

for idx, row in misclassified.iterrows():
    print(f"Index: {idx}")
    print(f"Question: {row['question']}")
    print(f"True Type: '{row['question_type']}'")
    print(f"Predicted Type: '{row['question_type_prediction']}'")
    print(f"String length - True: {len(row['question_type'])}, Predicted: {len(row['question_type_prediction'])}")
    print("Character by character comparison:")
    true_type = row['question_type']
    pred_type = row['question_type_prediction']
    for i, (c1, c2) in enumerate(zip(true_type, pred_type)):
        if c1 != c2:
            print(f"Position {i}: '{c1}' vs '{c2}'")
    if len(true_type) != len(pred_type):
        print(f"Length mismatch: Extra characters in prediction: '{pred_type[len(true_type):]}'")
    print("-" * 80, "\n")

Number of misclassified questions: 1

Index: 53
Question: Martin, Mandy and Mathew were flatmates in which comedy series
True Type: 'Search Engine'
Predicted Type: 'Answer: Search Engine'
String length - True: 13, Predicted: 21
Character by character comparison:
Position 0: 'S' vs 'A'
Position 1: 'e' vs 'n'
Position 2: 'a' vs 's'
Position 3: 'r' vs 'w'
Position 4: 'c' vs 'e'
Position 5: 'h' vs 'r'
Position 6: ' ' vs ':'
Position 7: 'E' vs ' '
Position 8: 'n' vs 'S'
Position 9: 'g' vs 'e'
Position 10: 'i' vs 'a'
Position 11: 'n' vs 'r'
Position 12: 'e' vs 'c'
Length mismatch: Extra characters in prediction: 'h Engine'
-------------------------------------------------------------------------------- 



## Phase 2: Tool Usage and Problem Solving

In [7]:
async def llm_search_question(question, past_feedback = []):
    need_improve = True
    result = None
    i = 0
    feedback_list = []
    while need_improve and i < 3:
        search_answer = await call_DuckDuckGo(question)
        print(f"Search answer: {search_answer}")
        if len(feedback_list) > 0:
            prompt = f"Context:{search_answer}\nExample: {load_search_examples()}\nFeedback: {feedback_list}\nQuestion: {question}\n Write no more than 5 sentences for each key. Answer:"
        else:
            prompt = f"Context:{search_answer}\nExample: {load_search_examples()}\nQuestion: {question}\n Write no more than 5 sentences for each key. Answer:"
        if len(past_feedback) > 0:
            prompt = f"Past Feedback: {past_feedback}\n{prompt}"
        response = await call_llm(prompt)
        if response is None:
            print("Error: LLM returned None")
            return "error in llm", feedback_list
        try:
            reflection_json = json.loads(response)
        except:
            if response is None:
                print("Error: LLM returned None")
                return "error in llm", feedback_list
            print(f"Error parsing JSON response: {e}")
            print(f"Raw response: {response}")
            return "error in json parsing", feedback_list
        need_improve = reflection_json.get('need_improve', False)
        feedback = reflection_json.get('feedback', '')
        print(f"Feedback: {feedback}")
        if need_improve:
            feedback_list.append(feedback)
        result = reflection_json.get('answer', None)
        print(f"Result: {result}")
        i += 1
    return result.lower().strip() if result else "no valid answer", feedback_list

In [15]:
await llm_search_question(combined_df.iloc[11]['question'], [])

Feedback: The provided documents state that Boris Becker won three Wimbledon Championships in Men's Singles. There is no mention of him being a runner-up at Wimbledon. Therefore, based on the given information, he was not a runner-up at Wimbledon.
Result: 0


('0', [])

In [8]:
async def llm_math_question(question, past_feedback = []):
    need_improve = True
    prompt = f"Generate code to answer following question, make sure final output of the code is the answer to the question and nothing else \nQuestion: {question}:"
    i = 0
    output = None
    feedback_list = []
    final_code = None
    while need_improve and i < 3:
        i += 1
        if len(feedback_list) > 0:
            prompt = f"Feedback: {feedback_list}\n{prompt}"
        if len(past_feedback) > 0:
            prompt = f"Past Feedback: {past_feedback}\n{prompt}"
        response = await call_llm(prompt, True)
        if response is None:
            print("Error: LLM returned None")
            return "error in llm", feedback_list
        code = None
        for part in response.candidates[0].content.parts:
            if part.executable_code is not None:
                code = part.executable_code.code
            if part.code_execution_result is not None:
                output = part.code_execution_result.output
        reflect_prompt = load_code_examples() + f"\n\nQuestion: {question} Code: {code}\nOutput: {output}\n\nReflect on the code and output, is it correct? If not, please improve the code. Write no more than 5 sentences for problem and feedback.\nAnswer:"
        reflection_response = await call_llm(reflect_prompt, False, True)
        try:
            reflection_json = json.loads(reflection_response)
            need_improve = reflection_json.get('need_improve', False)
            feedback = reflection_json.get('feedback', '')
            if need_improve:
                feedback_list.append(feedback)
            if output is None:
                output = reflection_json.get('answer', None)
            if i == 2 or not need_improve:  # Store code from final iteration
                final_code = code
        except:
            if reflection_response is None:
                print("Error: LLM returned None")
                return "error in llm", feedback_list
            print(f"Error parsing JSON response: {e}")
            print(f"Raw response: {reflection_response}")
            return "error in json parsing", feedback_list

    return output if output else "no valid answer", feedback_list, final_code

In [31]:
await llm_math_question(combined_df[combined_df['question_type'] == 'Code Execution'].iloc[1]['question'], [])

code: cost_per_pack = 4.00
packs_bought = 2
coupon = 2.00
razors_per_pack = 4

total_cost_before_discount = cost_per_pack * packs_bought
cost_after_bogo = cost_per_pack
cost_after_coupon = cost_after_bogo - coupon
total_razors = packs_bought * razors_per_pack
cost_per_razor = cost_after_coupon / total_razors
cost_per_razor_cents = cost_per_razor * 100

print(cost_per_razor_cents)

code: cost_per_pack = 4.00
coupon_value = 2.00
razors_per_pack = 4
num_packs = 2

# Calculate the cost after the BOGO deal
packs_to_pay_for = num_packs // 2 + num_packs % 2
total_cost_before_coupon = packs_to_pay_for * cost_per_pack

# Apply the coupon
total_cost_after_coupon = total_cost_before_coupon - coupon_value

# Calculate the total number of razors
total_razors = num_packs * razors_per_pack

# Calculate the cost per razor
cost_per_razor = total_cost_after_coupon / total_razors

# Convert to cents
cost_per_razor_in_cents = cost_per_razor * 100

print(round(cost_per_razor_in_cents))



('25\n',
 ['The original code incorrectly calculates the cost per razor. It only considers the cost of one pack after the buy-one-get-one-free deal and then subtracts the coupon. The coupon should be subtracted from the total cost of the packs. The correct calculation involves considering the total cost after the BOGO deal and coupon, then dividing by the total number of razors.'],
 'cost_per_pack = 4.00\ncoupon_value = 2.00\nrazors_per_pack = 4\nnum_packs = 2\n\n# Calculate the cost after the BOGO deal\npacks_to_pay_for = num_packs // 2 + num_packs % 2\ntotal_cost_before_coupon = packs_to_pay_for * cost_per_pack\n\n# Apply the coupon\ntotal_cost_after_coupon = total_cost_before_coupon - coupon_value\n\n# Calculate the total number of razors\ntotal_razors = num_packs * razors_per_pack\n\n# Calculate the cost per razor\ncost_per_razor = total_cost_after_coupon / total_razors\n\n# Convert to cents\ncost_per_razor_in_cents = cost_per_razor * 100\n\nprint(round(cost_per_razor_in_cents))\n'

In [9]:
async def answer_question():
    code_feedback = []
    search_feedback = []

    for idx, row in combined_df.iterrows():
        print(f"Processing question {idx}/{len(combined_df)}")
        
        #Add sleep every 10 iterations
        if idx > 0 and idx % 10 == 0:
            print("Taking a 60-second break...")
            await asyncio.sleep(60)

        if row['question_type'] == 'Code Execution':
            result, feedback_list, final_code = await llm_math_question(row['question'], code_feedback)
            combined_df.at[idx, 'question_type_prediction'] = result
            combined_df.at[idx, 'generated_code'] = final_code
            combined_df.at[idx, 'feedback'] = feedback_list
            
            if len(feedback_list) > 0:
                if len(code_feedback) > 5:
                    code_feedback.pop(0)
                feedback_summary = await call_llm(feedback_list, summerize=True)
                code_feedback.append(feedback_summary)
            print(f"Code result vs target: {result} / {row['target']}")
            print(f"feedback: {feedback_list}")
            print("feedback summary: ", code_feedback)   

        elif row['question_type'] == 'Search Engine':
            result, feedback_list = await llm_search_question(row['question'], search_feedback)
            row['question_type_prediction'] = result
            combined_df.at[idx, 'feedback'] = feedback_list
            
            if len(feedback_list) > 0:
                if len(search_feedback) > 5:
                    search_feedback.pop(0)
                feedback_summary = await call_llm(feedback_list, summerize=True)
                search_feedback.append(feedback_summary)
            print(f"Search result vs target: {result} / {row['target']}")

In [10]:
await answer_question()

Processing question 0/200
code: fast_lane_speed = 60
slow_lane_speed = fast_lane_speed / 2
distance = 480
time_slow_lane = distance / slow_lane_speed
print(time_slow_lane)

Code result vs target: 16.0
 / 16
feedback: []
feedback summary:  []
Processing question 1/200
code: print(25)

code: cost_per_pack = 4.00
num_packs = 2
coupon_value = 2.00
razors_per_pack = 4

total_cost_before_discount = cost_per_pack * num_packs
cost_after_bogo = cost_per_pack  # Buy one get one free
cost_after_coupon = cost_after_bogo - coupon_value
total_razors = razors_per_pack * num_packs
cost_per_razor = cost_after_coupon / total_razors
cost_per_razor_in_cents = cost_per_razor * 100

print(int(cost_per_razor_in_cents))

code: packs = 2
razors_per_pack = 4
cost_per_pack = 4.00
coupon = 2.00

total_razors = packs * razors_per_pack
total_cost = (packs / 2) * cost_per_pack - coupon
cost_per_razor = total_cost / total_razors
cost_per_razor_cents = cost_per_razor * 100

print(int(cost_per_razor_cents))

Code resul

In [11]:
combined_df

Unnamed: 0,question,target,question_type,question-id,question_type_prediction,generated_code,feedback
0,A car in the fast lane is traveling at 60 mile...,16,Code Execution,,16.0\n,fast_lane_speed = 60\nslow_lane_speed = fast_l...,[]
1,Heather's razors come 4 to a pack and cost $4....,25,Code Execution,,25\n,packs = 2\nrazors_per_pack = 4\ncost_per_pack ...,"[The original code simply prints 25, which is ..."
2,John decides to take up an odd hobby of speed ...,12,Code Execution,,12.0\n,original_wpm = 150\nspeed_multiplier = 2.5\nnu...,[]
3,"What nationality was Christian Barnard, the su...","{'Aliases': ['South africa', 'South Africa's',...",Search Engine,qf_2913,south african,,[]
4,What is the flavouring of the liqueur Amaretto?,"{'Aliases': ['Almond tree', 'Luz (nut)', 'Amyg...",Search Engine,sfq_15907,almond-like flavour,,[]
...,...,...,...,...,...,...,...
195,Which vitamin assists in blood clotting?,"{'Aliases': ['Vitamin k 2', 'Vitamin-K', 'Vita...",Search Engine,dpql_2636,vitamin k,,[]
196,My mom went to a gardening shop. She bought a ...,38,Code Execution,,38\n,total_cost = 19 + 26 - 7\nprint(total_cost)\n,[]
197,A farmer extracts 5 liters of milk a day from ...,2,Code Execution,,2\n,current_cows = 3\nmilk_per_cow = 5\ndesired_mi...,[]
198,"An American aided coup in Chile in 1973, depos...","{'Aliases': ['Allende Gossens', 'Salvador Goss...",Search Engine,odql_4592,allende,,[]


In [12]:
combined_df.to_csv('dataset/combined_dataset_result_combine.csv', index=True)

## Load and Evaluate Saved Results

In [26]:
# Read the saved results
combined_df = pd.read_csv('dataset/combined_dataset_result_code1.csv', index_col=0)

# Convert target column back to dictionary format for search questions
def convert_target_to_dict(target):
    if isinstance(target, str) and target.startswith('{'):
        return eval(target)  # Safe for our known data format
    return target

# combined_df['target'] = combined_df['target'].apply(convert_target_to_dict)
print("Dataset loaded with shape:", combined_df.shape)
combined_df.head()

Dataset loaded with shape: (200, 7)


Unnamed: 0,question,target,question_type,question-id,question_type_prediction,generated_code,feedback
0,A car in the fast lane is traveling at 60 mile...,16,Code Execution,,16.0\n,fast_lane_speed = 60\nslow_lane_speed = fast_l...,[]
1,Heather's razors come 4 to a pack and cost $4....,25,Code Execution,,25\n,packs = 2\nrazors_per_pack = 4\nprice_per_pack...,['The code incorrectly calculates the cost. Th...
2,John decides to take up an odd hobby of speed ...,12,Code Execution,,12.0\n,normal_speed = 150\nspeed_multiplier = 2.5\npa...,[]
3,"What nationality was Christian Barnard, the su...","{'Aliases': ['South africa', ""South Africa's"",...",Search Engine,qf_2913,,,
4,What is the flavouring of the liqueur Amaretto?,"{'Aliases': ['Almond tree', 'Luz (nut)', 'Amyg...",Search Engine,sfq_15907,,,


In [7]:
from sklearn.metrics import f1_score as sklearn_f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

def preprocess_search_answers(predictions, targets):
    # Convert predictions and targets to sets of words
    processed_preds = []
    processed_targets = []
    
    for pred, target in zip(predictions, targets):
        if pred is None:
            pred_words = set()
        else:
            pred_words = set(pred.lower().strip().split())
        
        if isinstance(target, dict):
            target_words = set()
            # Add words from Aliases
            if 'Aliases' in target:
                for alias in target['Aliases']:
                    target_words.update(alias.lower().strip().split())
            # Add words from HumanAnswers
            if 'HumanAnswers' in target:
                for answer in target['HumanAnswers']:
                    target_words.update(answer.lower().strip().split())
        else:
            target_words = set(str(target).lower().strip().split())
        
        processed_preds.append(pred_words)
        processed_targets.append(target_words)
    
    # Convert sets to binary format using MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    # Fit on both predictions and targets to get all possible words
    all_words = set()
    for words in processed_preds + processed_targets:
        all_words.update(words)
    mlb.fit([all_words])
    
    return mlb.transform(processed_preds), mlb.transform(processed_targets)

def exact_match_search(prediction, target):
    if prediction is None:
        return False
    prediction = prediction.lower().strip()
    if isinstance(target, dict):
        # Check Aliases
        if 'Aliases' in target:
            if prediction in [alias.lower().strip() for alias in target['Aliases']]:
                return True
        # Check HumanAnswers
        if 'HumanAnswers' in target:
            if prediction in [answer.lower().strip() for answer in target['HumanAnswers']]:
                return True
        return False
    return prediction == str(target).lower().strip()

def semantic_match_search(prediction, target):
    if prediction is None:
        return False
    prediction = prediction.lower().strip()
    
    if isinstance(target, dict):
        if 'Aliases' in target:
            aliases = [alias.lower().strip() for alias in target['Aliases']]
            # Check if any alias is contained in prediction or vice versa
            for alias in aliases:
                if alias in prediction or prediction in alias:
                    return True
        if 'HumanAnswers' in target:
            answers = [answer.lower().strip() for answer in target['HumanAnswers']]
            # Check if any human answer is contained in prediction or vice versa
            for answer in answers:
                if answer in prediction or prediction in answer:
                    return True
        return False
    return False

# Filter for Search Engine questions only
search_df = combined_df[combined_df['question_type'] == 'Search Engine']

# Calculate word-based metrics
y_pred = search_df['question_type_prediction'].values
y_true = search_df['target'].values

# Calculate exact matches
exact_matches = [exact_match_search(pred, true) for pred, true in zip(y_pred, y_true)]
em_score = np.mean(exact_matches)

# Calculate semantic matches
semantic_matches = [semantic_match_search(pred, true) for pred, true in zip(y_pred, y_true)]
semantic_score = np.mean(semantic_matches)

# Calculate F1 score
y_pred_binary, y_true_binary = preprocess_search_answers(y_pred, y_true)
f1 = sklearn_f1_score(y_true_binary, y_pred_binary, average='micro')

print(f"Search Question Metrics:")
print(f"Strict Exact Match Score: {em_score:.2%}")
print(f"Semantic Match Score: {semantic_score:.2%}")
print(f"F1 Score: {f1:.2%}")

# Display example predictions with both matching criteria
print("\nPredictions:")
for idx, row in search_df[search_df['question_type_prediction'].notna()].head(10).iterrows():
    print(f"Question: {row['question']}")
    print(f"Prediction: {row['question_type_prediction']}")
    print(f"Target: {row['target']}")
    i = search_df.index.get_loc(idx)
    strict_match = exact_match_search(row['question_type_prediction'], row['target'])
    semantic_match = semantic_match_search(row['question_type_prediction'], row['target'])
    print(f"Strict EM: {strict_match}")
    print(f"Semantic Match: {semantic_match}")
    print(f"F1: {sklearn_f1_score(y_true_binary[i:i+1], y_pred_binary[i:i+1], average='micro'):.2f}")
    print("-" * 80)

Search Question Metrics:
Strict Exact Match Score: 79.00%
Semantic Match Score: 94.00%
F1 Score: 16.26%

Predictions:
Question: What nationality was Christian Barnard, the surgeon who performed the first human heart transplant?
Prediction: south african
Target: {'Aliases': ['South africa', "South Africa's", 'Southafrica', 'Third Republic (South Africa)', 'Republiek van Suid-Afrika', "Sou'frica", 'Zuid Afrika', 'Zuid-Afrika', 'ISO 3166-1:ZA', 'South-African', 'S Africa', 'Zuid Africa', 'Mzansi', 'Afrique du sud', 'Zuidafrika', 'Ningizimu Afrika', 'Capital of South Africa', 'Suid-Afrika', 'South-Africa', 'Rep. of SOUTH AFRICA', 'The Republic of South Africa', 'Suid Africa', 'Azania/South Africa', 'S Afr', 'Saffa', 'South African', 'Seth efrika', 'South Africa', 'Soufrica', 'Republic of south africa', 'South Africaà', 'The Beloved Country', 'S. Africa', 'Rep. of South Africa', 'South Africans', 'Republic of South Africa'], 'HumanAnswers': ['South African'], 'MatchedWikiEntityName': 'South

In [21]:
import numpy as np
# Helper functions for code execution metrics
def normalize_number(value):
    try:
        # Convert to float and round to 1 decimal place
        return round(float(value), 1)
    except (ValueError, TypeError):
        # If conversion fails, return the original value
        return value

def exact_match_code(prediction, target):
    if prediction is None:
        return False
    pred_num = normalize_number(prediction)
    target_num = normalize_number(target)
    
    if isinstance(pred_num, (int, float)) and isinstance(target_num, (int, float)):
        return pred_num == target_num
    return False

def semantic_match_code(prediction, target):
    if prediction is None:
        return False
    # Convert prediction to string and target to float
    pred_str = str(prediction).lower().strip()
    try:
        target_num = normalize_number(target)
        # Find all numbers in the prediction string
        import re
        numbers = re.findall(r'\d*\.?\d+', pred_str)
        numbers = [normalize_number(num) for num in numbers]
        return target_num in numbers
    except (ValueError, TypeError):
        return False

# Filter for Code Execution questions only
code_df = combined_df[combined_df['question_type'] == 'Code Execution']

# Calculate matches
exact_matches = [
    exact_match_code(pred, target) 
    for pred, target in zip(code_df['question_type_prediction'], code_df['target'])
]
semantic_matches = [
    semantic_match_code(pred, target)
    for pred, target in zip(code_df['question_type_prediction'], code_df['target'])
]

em_score = np.mean(exact_matches)
semantic_score = np.mean(semantic_matches)

print(f"Code Execution Metrics:")
print(f"Strict Exact Match Score: {em_score:.2%}")
print(f"Semantic Match Score: {semantic_score:.2%}")

# Display example predictions
print("\nExample Predictions:")
for idx, row in code_df[code_df['question_type_prediction'].notna()].iterrows():
    print(f"Question: {row['question']}")
    print(f"Prediction: {row['question_type_prediction']}")
    print(f"Target: {row['target']}")
    strict_match = exact_match_code(row['question_type_prediction'], row['target'])
    semantic_match = semantic_match_code(row['question_type_prediction'], row['target'])
    print(f"Strict Match: {strict_match}")
    print(f"Semantic Match: {semantic_match}")
    if not strict_match and semantic_match:
        print("Note: Found target number within prediction string")
    print("-" * 80)

Code Execution Metrics:
Strict Exact Match Score: 50.00%
Semantic Match Score: 92.00%

Example Predictions:
Question: A car in the fast lane is traveling at 60 miles/hour. A car in the slow lane is traveling at half that speed. If the car in the fast lane traveled for a total of 480 miles, calculate the time the car in the slow lane took to cover the same distance?
Prediction: 16.0

Target: 16
Strict Match: True
Semantic Match: True
--------------------------------------------------------------------------------
Question: Heather's razors come 4 to a pack and cost $4.00 a pack.  They are currently on sale for buy one get one free.  She also has a $2.00 coupon.  How much will each individual razor cost, in cents, after the discount if she buys 2 packs of razors?
Prediction: 25

Target: 25
Strict Match: True
Semantic Match: True
--------------------------------------------------------------------------------
Question: John decides to take up an odd hobby of speed talking.  His normally s