# Question Type Classification
This notebook processes questions from GSM and web-dev datasets to classify their types using LLM.

In [1]:
import pandas as pd
import json
from llm_call import call_llm
import random
import asyncio
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from mcp_call import call_DuckDuckGo

## Phase 1: Data Reading and Tool Selection

In [2]:
def read_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data

def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_code_examples():
    with open('prompt/gsm8k.md', 'r') as f:
        return f.read()
    
def load_search_examples():
    with open('prompt/trivia_qa.md', 'r') as f:
        return f.read()

In [3]:
read_json('dataset/verified-web-dev.json')["Data"]

[{'Answer': {'Aliases': ['Kamal kahn',
    'List of Bond girls in Octopussy',
    'Magda (James Bond)',
    'List of James Bond allies in Octopussy',
    'Vijay (James Bond)',
    'Bond 13',
    'Octopussy (character)',
    'Penelope Smallbone',
    'Octopussy',
    'General Orlov',
    'Kamal Khan',
    'Octopussy (film)',
    'List of James Bond villains in Octopussy',
    'Jim Fanning (James Bond)'],
   'HumanAnswers': ['OCTOPUSSY'],
   'MatchedWikiEntityName': 'Octopussy',
   'NormalizedAliases': ['list of bond girls in octopussy',
    'bond 13',
    'list of james bond allies in octopussy',
    'magda james bond',
    'penelope smallbone',
    'kamal kahn',
    'octopussy',
    'list of james bond villains in octopussy',
    'vijay james bond',
    'jim fanning james bond',
    'general orlov',
    'kamal khan',
    'octopussy character',
    'octopussy film'],
   'NormalizedMatchedWikiEntityName': 'octopussy',
   'NormalizedValue': 'octopussy',
   'Type': 'WikipediaEntity',
   'V

In [4]:
def combine_datasets():
    # Read datasets
    gsm_data = read_jsonl('dataset/gsm.jsonl')
    web_data = read_json('dataset/verified-web-dev.json')["Data"]
    
    # Process web_data to get only required fields
    processed_web_data = []
    for entry in web_data:
        processed_web_data.append({
            'question': entry['Question'],
            'question-id': entry['QuestionId'],
            'target': entry['Answer'],
        })
    
    # Convert to DataFrames and standardize column names
    gsm_df = pd.DataFrame(gsm_data)
    gsm_df = gsm_df.rename(columns={'input': 'question'})
    gsm_df['question_type'] = 'Code Execution'  # Add question type
    
    web_df = pd.DataFrame(processed_web_data)
    web_df['question_type'] = 'Search Engine'  # Add question type
    
    # Combine datasets with matching column names
    combined_df = pd.concat([gsm_df, web_df]).reset_index(drop=True)
    
    # Initialize prediction column
    combined_df['question_type_prediction'] = None
    
    # Shuffle the combined dataset
    combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Get first 300 of each type after shuffling
    code_execution_df = combined_df[combined_df['question_type'] == 'Code Execution'].head(100)
    search_engine_df = combined_df[combined_df['question_type'] == 'Search Engine'].head(100)
    
    # Final combination and shuffle
    combined_df = pd.concat([code_execution_df, search_engine_df])
    combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

    return combined_df

In [5]:
# Load datasets
combined_df = combine_datasets()
print(f"Total number of questions: {len(combined_df)}")
combined_df

Total number of questions: 200


Unnamed: 0,question,target,question_type,question-id,question_type_prediction
0,A car in the fast lane is traveling at 60 mile...,16,Code Execution,,
1,Heather's razors come 4 to a pack and cost $4....,25,Code Execution,,
2,John decides to take up an odd hobby of speed ...,12,Code Execution,,
3,"What nationality was Christian Barnard, the su...","{'Aliases': ['South africa', 'South Africa's',...",Search Engine,qf_2913,
4,What is the flavouring of the liqueur Amaretto?,"{'Aliases': ['Almond tree', 'Luz (nut)', 'Amyg...",Search Engine,sfq_15907,
...,...,...,...,...,...
195,Which vitamin assists in blood clotting?,"{'Aliases': ['Vitamin k 2', 'Vitamin-K', 'Vita...",Search Engine,dpql_2636,
196,My mom went to a gardening shop. She bought a ...,38,Code Execution,,
197,A farmer extracts 5 liters of milk a day from ...,2,Code Execution,,
198,"An American aided coup in Chile in 1973, depos...","{'Aliases': ['Allende Gossens', 'Salvador Goss...",Search Engine,odql_4592,


In [49]:
async def process_questions():
    # Load tool examples
    tool_examples = load_tool_examples()
    
    # Process each question
    for idx, row in combined_df.iterrows():
        # Add sleep every 10 iterations
        if idx > 0 and idx % 15 == 0:
            print("Taking a 30-second break...")
            await asyncio.sleep(60)
    
        # Combine tool examples with current question
        prompt = f"{tool_examples}\n\nQuestion: {row['question']}\nAnswer:"
        
        # Call LLM with the prompt
        response = await call_llm(prompt)
        
        # Store the response in the dataframe
        combined_df.at[idx, 'question_type_prediction'] = response.strip()
        
        # Print progress
        print(f"Question {idx + 1}/{len(combined_df)}:")
        print(f"True Type: {row['question_type']}")
        print(f"Predicted Type: {response}\n")

In [50]:
# Run the processing
await process_questions()

Question 1/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 2/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 3/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 4/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 5/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 6/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 7/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 8/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 9/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 10/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 11/200:
True Type: Code Execution
Predicted Type: Code Execution


Question 12/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 13/200:
True Type: Search Engine
Predicted Type: Search Engine


Question 14/200:
True Type: Code Ex

In [51]:
combined_df.to_csv('dataset/combined_dataset_result_task1.csv', index=False)

In [52]:
# Calculate accuracy
accuracy = (combined_df['question_type'] == combined_df['question_type_prediction']).mean()
print(f"Classification accuracy: {accuracy:.2%}")

Classification accuracy: 99.50%


In [57]:
# Calculate detailed metrics
y_true = combined_df['question_type']
y_pred = combined_df['question_type_prediction']

# Calculate individual metrics with weighted averaging
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")

# Display detailed classification report
print("\nDetailed Classification Report:")
print(classification_report(y_true, y_pred))

Accuracy: 99.50%
Precision: 100.00%
Recall: 99.50%

Detailed Classification Report:
                       precision    recall  f1-score   support

Answer: Search Engine       0.00      0.00      0.00         0
       Code Execution       1.00      1.00      1.00       100
        Search Engine       1.00      0.99      0.99       100

             accuracy                           0.99       200
            macro avg       0.67      0.66      0.66       200
         weighted avg       1.00      0.99      1.00       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [59]:
# Find misclassified questions
misclassified = combined_df[combined_df['question_type'].str.strip() != combined_df['question_type_prediction'].str.strip()]

print(f"Number of misclassified questions: {len(misclassified)}\n")

for idx, row in misclassified.iterrows():
    print(f"Index: {idx}")
    print(f"Question: {row['question']}")
    print(f"True Type: '{row['question_type']}'")
    print(f"Predicted Type: '{row['question_type_prediction']}'")
    print(f"String length - True: {len(row['question_type'])}, Predicted: {len(row['question_type_prediction'])}")
    print("Character by character comparison:")
    true_type = row['question_type']
    pred_type = row['question_type_prediction']
    for i, (c1, c2) in enumerate(zip(true_type, pred_type)):
        if c1 != c2:
            print(f"Position {i}: '{c1}' vs '{c2}'")
    if len(true_type) != len(pred_type):
        print(f"Length mismatch: Extra characters in prediction: '{pred_type[len(true_type):]}'")
    print("-" * 80, "\n")

Number of misclassified questions: 1

Index: 53
Question: Martin, Mandy and Mathew were flatmates in which comedy series
True Type: 'Search Engine'
Predicted Type: 'Answer: Search Engine'
String length - True: 13, Predicted: 21
Character by character comparison:
Position 0: 'S' vs 'A'
Position 1: 'e' vs 'n'
Position 2: 'a' vs 's'
Position 3: 'r' vs 'w'
Position 4: 'c' vs 'e'
Position 5: 'h' vs 'r'
Position 6: ' ' vs ':'
Position 7: 'E' vs ' '
Position 8: 'n' vs 'S'
Position 9: 'g' vs 'e'
Position 10: 'i' vs 'a'
Position 11: 'n' vs 'r'
Position 12: 'e' vs 'c'
Length mismatch: Extra characters in prediction: 'h Engine'
-------------------------------------------------------------------------------- 



## Phase Two: Tool Usage and Problem Solving

In [13]:
async def llm_search_question(question, past_feedback = []):
    need_improve = True
    result = None
    i = 0
    feedback_list = []
    while need_improve and i < 3:
        search_answer = await call_DuckDuckGo(question)
        if len(feedback_list) > 0:
            prompt = f"Context:{search_answer}\nExample: {load_search_examples()}\nFeedback: {feedback_list}\nQuestion: {question}\n Write no more than 5 sentences for each key. Answer:"
        else:
            prompt = f"Context:{search_answer}\nExample: {load_search_examples()}\nQuestion: {question}\n Write no more than 5 sentences for each key. Answer:"
        if len(past_feedback) > 0:
            prompt = f"Past Feedback: {past_feedback}\n{prompt}"
        response = await call_llm(prompt)
        if response is None:
            print("Error: LLM returned None")
            return "error in llm", feedback_list
        try:
            reflection_json = json.loads(response)
        except:
            if response is None:
                print("Error: LLM returned None")
                return "error in llm", feedback_list
            print(f"Error parsing JSON response: {e}")
            print(f"Raw response: {response}")
            return "error in json parsing", feedback_list
        need_improve = reflection_json.get('need_improve', False)
        feedback = reflection_json.get('feedback', '')
        if need_improve:
            feedback_list.append(feedback)
        result = reflection_json.get('answer', None)
        i += 1
    return result.lower().strip() if result else "no valid answer", feedback_list

In [9]:
await llm_search_question("What is the capital of France?")

['Title: Paris facts: the capital of France in history\nDescription: <strong>Paris</strong> is the capital of France, the largest country of Europe with 550 000 km2 (65 millions inhabitants).\nURL: https://home.adelphi.edu/~ca19535/page%204.html\n\nTitle: List of capitals of France - Wikipedia\nDescription: This is a chronological list of capitals of France. The capital of France has been <strong>Paris</strong> since its liberation in 1944.\nURL: https://en.wikipedia.org/wiki/List_of_capitals_of_France\n\nTitle: Protests against Islamophobia held across France\nDescription: People gather at the Place de la Bastille during a demonstration against Islamophobia, racism and in support of popular unity in <strong>Paris, France</strong> on May 11, 2025.\nURL: https://www.aa.com.tr/en/europe/protests-against-islamophobia-held-across-france/3564043\n\nTitle: SNCF Connect : Book your train tickets to France and Europe\nDescription: SNCF Voyageurs - SA with a share capital of €157,789,960, regis

('paris', [])

In [12]:
async def llm_math_question(question, past_feedback = []):
    need_improve = True
    prompt = f"Generate code to answer following question, make sure final output of the code is the answer to the question and nothing else \nQuestion: {question}:"
    i = 0
    output = None
    feedback_list = []
    while need_improve and i < 3:
        i += 1
        if len(feedback_list) > 0:
            prompt = f"Feedback: {feedback_list}\n{prompt}"
        if len(past_feedback) > 0:
            prompt = f"Past Feedback: {past_feedback}\n{prompt}"
        response = await call_llm(prompt, True)
        if response is None:
            print("Error: LLM returned None")
            return "error in llm", feedback_list
        code = None
        for part in response.candidates[0].content.parts:
            if part.executable_code is not None:
                code = part.executable_code.code
            if part.code_execution_result is not None:
                output = part.code_execution_result.output
        reflect_prompt = load_code_examples() + f"\n\nQuestion: {question} Code: {code}\nOutput: {output}\n\nReflect on the code and output, is it correct? If not, please improve the code. Write no more than 5 sentences for problem and feedback.\nAnswer:"
        reflection_response = await call_llm(reflect_prompt, False, True)
        try:
            reflection_json = json.loads(reflection_response)
            need_improve = reflection_json.get('need_improve', False)
            feedback = reflection_json.get('feedback', '')
            if need_improve:
                feedback_list.append(feedback)
            if output is None:
                output = reflection_json.get('answer', None)
        except:
            if reflection_response is None:
                print("Error: LLM returned None")
                return "error in llm", feedback_list
            print(f"Error parsing JSON response: {e}")
            print(f"Raw response: {reflection_response}")
            return "error in json parsing", feedback_list

    return output.strip() if output else "no valid answer", feedback_list

In [19]:
await llm_math_question(combined_df[combined_df['question_type'] == 'Code Execution'].iloc[1]['question'])

{
"answer": "25",
"feedback": "The provided code directly prints '25' without performing any calculations based on the problem description. This is incorrect because it doesn't account for the buy-one-get-one-free sale, the coupon, and the number of razors purchased. The cost per razor needs to be calculated based on these factors.",
"problem": "The code bypasses the problem's requirements by directly printing a value without any calculations. It fails to incorporate the buy-one-get-one-free discount, the coupon, and the total number of razors to determine the cost per razor.",
"need_improve": true
}
{
"answer": "25",
"feedback": "The code is correct and produces the expected output. It accurately calculates the cost per razor after applying the buy-one-get-one-free discount and the coupon. The steps are clear and easy to follow, and the final result is rounded as requested.",
"problem": "The code correctly calculates the cost per razor. The logic for applying the discount and coupon i

('25',
 ["The provided code directly prints '25' without performing any calculations based on the problem description. This is incorrect because it doesn't account for the buy-one-get-one-free sale, the coupon, and the number of razors purchased. The cost per razor needs to be calculated based on these factors."])

In [10]:
async def answer_question():
    code_feedback = []
    search_feedback = []
    # Process each question
    for idx, row in combined_df.iterrows():
        print(f"Processing question {idx + 1}/{len(combined_df)}: {row['question']}")
        # Add sleep every 10 iterations
        if idx > 0 and idx % 8 == 0:
            print("Taking a 60-second break...")
            await asyncio.sleep(60)

        if row['question_type'] == 'Code Execution':
            print(f"Code question: {row['question']}")
            result, feedback_list = await llm_math_question(row['question'], code_feedback)
            row['question_type_prediction'] = result
            if len(feedback_list) > 5:
                feedback_list.pop(0)
            if len(feedback_list) > 0:
                feedback_summary = await call_llm(feedback_list, summerize=True)
                print(f"feedback summary single: {feedback_summary}")
                code_feedback.append(feedback_summary)
            print(f"Code feedback: {feedback_list}")
            print(f"feedbakc summary: {code_feedback}")
            print(f"Code result: {result}")
            print(f"Code target: {row['target']}")
        elif row['question_type'] == 'Search Engine':
            print(f"Search question: {row['question']}")
            result, feedback_list = await llm_search_question(row['question'], search_feedback)
            row['question_type_prediction'] = result
            if len(feedback_list) > 5:
                feedback_list.pop(0)
            if len(feedback_list) > 0:
                feedback_summary = await call_llm(feedback_list, summerize=True)

                search_feedback.append(feedback_summary)
            print(f"search feedback: {feedback_list}")
            print(f"feedbakc summary: {search_feedback}")
            print(f"search result: {result}")
            print(f"search target: {row['target']}")
        # Store the response in the dataframe
        # combined_df.at[idx, 'question_type_prediction'] = response.strip()
        
        # Print progress
        # print(f"Question {idx + 1}/{len(combined_df)}:")
        # print(f"True Type: {row['question_type']}")
        # print(f"Predicted Type: {response}\n")

In [11]:
await answer_question()

Processing question 1/200: A car in the fast lane is traveling at 60 miles/hour. A car in the slow lane is traveling at half that speed. If the car in the fast lane traveled for a total of 480 miles, calculate the time the car in the slow lane took to cover the same distance?
Code question: A car in the fast lane is traveling at 60 miles/hour. A car in the slow lane is traveling at half that speed. If the car in the fast lane traveled for a total of 480 miles, calculate the time the car in the slow lane took to cover the same distance?
Code feedback: []
feedbakc summary: []
Code result: 16.0
Code target: 16
Processing question 2/200: Heather's razors come 4 to a pack and cost $4.00 a pack.  They are currently on sale for buy one get one free.  She also has a $2.00 coupon.  How much will each individual razor cost, in cents, after the discount if she buys 2 packs of razors?
Code question: Heather's razors come 4 to a pack and cost $4.00 a pack.  They are currently on sale for buy one ge

Traceback (most recent call last):
  File "d:\UOA\cs767\cs767_project\llm_call.py", line 67, in call_llm
    response = client.models.generate_content(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\python_conda\env_base\Lib\site-packages\google\genai\models.py", line 4959, in generate_content
    response = self._generate_content(
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\python_conda\env_base\Lib\site-packages\google\genai\models.py", line 3935, in _generate_content
    response_dict = self._api_client.request(
                    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\python_conda\env_base\Lib\site-packages\google\genai\_api_client.py", line 674, in request
    response = self._request(http_request, stream=False)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\python_conda\env_base\Lib\site-packages\google\genai\_api_client.py", line 603, in _request
    errors.APIError.raise_for_response(response)
  File "d:\python_conda\env_base\Lib\site-packag

['Title: Sheer Heart Attack - Wikipedia\nDescription: May did an interview the next <strong>day</strong> (<strong>which</strong> was published on 26 October) that explained what finishing <strong>the</strong> <strong>album</strong> was like. In total, <strong>the</strong> <strong>band</strong> used four different studios in the making of <strong>Sheer</strong> <strong>Heart</strong> <strong>Attack</strong>: most of the backing tracks were recorded at Rockfield, two backing tracks and some guitar ...\nURL: https://en.wikipedia.org/wiki/Sheer_Heart_Attack\n\nTitle: Sheer Heart Attack (song) - Wikipedia\nDescription: &quot;Sheer Heart Attack&quot; is a song by the British rock band <strong>Queen</strong>, released on their sixth studio album News of the World in 1977. It is one of two songs on the album entirely written by Roger Taylor, the other being &quot;Fight from the Inside&quot;. &quot;Sheer Heart Attack&quot; was originally written as the title ...\nURL: https://en.wikipedia.org/w

CancelledError: 

In [23]:
combined_df.to_csv('dataset/combined_dataset_result.csv', index=True)

In [16]:
# Filter for Code Execution questions only
code_exec_df = combined_df[combined_df['question_type'] == 'Code Execution']

# Convert predictions and targets to float for comparison
code_exec_df['target'] = code_exec_df['target'].astype(float)
code_exec_df['question_type_prediction'] = code_exec_df['question_type_prediction'].astype(float)

# Calculate accuracy (allowing for small floating point differences)
correct_predictions = abs(code_exec_df['target'] - code_exec_df['question_type_prediction']) < 0.01
accuracy = correct_predictions.mean()

print(f"Accuracy on Code Execution questions: {accuracy:.2%}\n")

# Find and display incorrect predictions
incorrect_predictions = code_exec_df[~correct_predictions]
print(f"Number of incorrect predictions: {len(incorrect_predictions)}\n")

for idx, row in incorrect_predictions.iterrows():
    print(f"Question: {row['question']}")
    print(f"Expected: {row['target']}")
    print(f"Predicted: {row['question_type_prediction']}")
    print("-" * 80, "\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  code_exec_df['target'] = code_exec_df['target'].astype(float)


ValueError: could not convert string to float: 'driving_speed=50\nhiking_speed=20.0\ndriving_time=4.5\ndriving_distance=225.0\nhiking_distance=30.0\ntotal_distance=255.0'