### Downloading and Preparing the GSM8K Dataset

In [1]:
import pandas as pd
import json
import urllib.request
import os

# Remote location of each GSM8K split on GitHub, keyed by split name
urls = dict(
    train="https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl",
    test="https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl",
)

# Local folder that receives the downloaded files (created if missing)
os.makedirs("gsm8k_data", exist_ok=True)

# Fetch every split and store it as gsm8k_data/<split>.jsonl
for split in urls:
    url = urls[split]
    urllib.request.urlretrieve(url, f"gsm8k_data/{split}.jsonl")

# Load JSONL files into a DataFrame
def load_gsm8k_jsonl(file_path):
    """Load a JSON Lines file into a DataFrame, one row per JSON object.

    Args:
        file_path: Path to a file containing one JSON object per line.

    Returns:
        pandas.DataFrame built from the parsed objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting json.loads fail on an empty string.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

# Parse each downloaded split into its own DataFrame
train_df = load_gsm8k_jsonl("gsm8k_data/train.jsonl")
test_df = load_gsm8k_jsonl("gsm8k_data/test.jsonl")

# Stack train on top of test, then keep the first 25 problems for EduVLM-Bench
df = pd.concat([train_df, test_df])
df = df.head(25)

# Persist the subset (without the index column) and report where it went
df.to_csv("gsm8k_subset.csv", index=False)
print("GSM8K subset saved as gsm8k_subset.csv")

GSM8K subset saved as gsm8k_subset.csv


### Generating Synthetic Diagrams

In [2]:
!pip install matplotlib pandas numpy

import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

# Keywords (matched against the lower-cased question) that select a diagram family.
_COUNTABLE_KEYWORDS = ("apples", "cats", "dogs", "items", "books")
_SHAPE_KEYWORDS = ("triangle", "rectangle", "circle")


def _render_and_save(diagram_path, draw):
    """Create one 6x4 figure, let draw(ax) populate it, save it, and close it."""
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        draw(ax)
        fig.savefig(diagram_path)
    finally:
        # Close this exact figure even when drawing/saving fails, so figures
        # never accumulate while looping over problems.
        plt.close(fig)


def _draw_bar_chart(ax, idx, quantities):
    """Draw one bar per extracted quantity, labelled Item 1..N."""
    items = [f"Item {i+1}" for i in range(len(quantities))]
    ax.bar(items, quantities, color='skyblue')
    ax.set_title(f"Bar Chart for Problem {idx}")
    ax.set_xlabel("Items")
    ax.set_ylabel("Quantity")


def _draw_number_line(ax, idx):
    """Draw the generic placeholder: a horizontal line from 0 to 10 with end markers."""
    ax.plot([0, 10], [0, 0], 'k-')
    ax.scatter([0, 10], [0, 0], color='black')
    ax.set_title(f"Number Line for Problem {idx}")
    ax.set_xlabel("Value")
    ax.set_ylabel("")
    ax.grid(True)


def _draw_shape(ax, idx, text):
    """Draw a fixed-size triangle, rectangle, or circle depending on which word is in text."""
    if "triangle" in text:
        outline = np.array([[0, 0], [4, 0], [2, 3], [0, 0]])
        ax.plot(outline[:, 0], outline[:, 1], 'b-')
        ax.fill(outline[:, 0], outline[:, 1], 'lightblue', alpha=0.5)
        ax.set_title(f"Triangle for Problem {idx}")
    elif "rectangle" in text:
        outline = np.array([[0, 0], [5, 0], [5, 2], [0, 2], [0, 0]])
        ax.plot(outline[:, 0], outline[:, 1], 'b-')
        ax.fill(outline[:, 0], outline[:, 1], 'lightgreen', alpha=0.5)
        ax.set_title(f"Rectangle for Problem {idx}")
    else:
        ax.add_patch(plt.Circle((0, 0), 2, color='lightcoral', alpha=0.5))
        ax.axis('equal')
        ax.set_title(f"Circle for Problem {idx}")
    ax.set_xlabel("X (cm)")
    ax.set_ylabel("Y (cm)")
    ax.grid(True)


def generate_synthetic_diagrams(csv_file, output_dir, num_problems=25):
    """
    Generate one synthetic diagram per problem and save a multimodal dataset.

    The diagram type is picked from keywords in the question text:
    a bar chart of the whitespace-separated integers when a countable-object
    keyword is present (falling back to a number line if the question has no
    such integers), a fixed shape when a shape keyword is present, and a
    number line otherwise.

    Args:
        csv_file: Path to a CSV with at least a 'question' column
            (e.g. gsm8k_subset.csv)
        output_dir: Directory to save diagrams and the output CSV
            (created if it does not exist)
        num_problems: Number of leading rows to process (default: 25)
    Returns:
        DataFrame holding the processed rows plus a 'diagram_path' column.
        The same frame is written to <output_dir>/gsm8k_multimodal.csv.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Copy so that adding a column below works on an independent frame
    # instead of a slice of the frame that was just read.
    df = pd.read_csv(csv_file).head(num_problems).copy()
    diagram_paths = []

    for idx, row in df.iterrows():
        problem = row['question']
        text = problem.lower()
        diagram_path = os.path.join(output_dir, f"diagram_{idx}.png")

        if any(keyword in text for keyword in _COUNTABLE_KEYWORDS):
            # isdecimal() only accepts characters int() can parse; isdigit()
            # also accepts e.g. superscript digits, which make int() raise.
            quantities = [int(tok) for tok in problem.split() if tok.isdecimal()]
            if quantities:
                _render_and_save(diagram_path, lambda ax: _draw_bar_chart(ax, idx, quantities))
            else:
                _render_and_save(diagram_path, lambda ax: _draw_number_line(ax, idx))
        elif any(keyword in text for keyword in _SHAPE_KEYWORDS):
            _render_and_save(diagram_path, lambda ax: _draw_shape(ax, idx, text))
        else:
            _render_and_save(diagram_path, lambda ax: _draw_number_line(ax, idx))

        diagram_paths.append(diagram_path)

    df['diagram_path'] = diagram_paths
    output_csv = os.path.join(output_dir, 'gsm8k_multimodal.csv')
    df.to_csv(output_csv, index=False)
    print(f"Multimodal dataset saved as {output_csv}")
    return df

# Read the subset from the same working-directory-relative path the download
# cell wrote it to ("gsm8k_subset.csv"), instead of assuming the kernel's
# working directory is /content.
csv_file = "gsm8k_subset.csv"
output_dir = "/content/multimodal_dataset"
df_multimodal = generate_synthetic_diagrams(csv_file, output_dir)

Multimodal dataset saved as /content/multimodal_dataset/gsm8k_multimodal.csv


### Computing Evaluation Metrics

In [8]:
!pip install networkx pandas numpy

import pandas as pd
import numpy as np
import random

import networkx as nx

# Load multimodal dataset
df_multimodal = pd.read_csv("/content/multimodal_dataset/gsm8k_multimodal.csv")

# Size every mock column from the data so the tables stay aligned even when
# the dataset does not contain exactly 25 rows.
n_problems = len(df_multimodal)

# Dedicated, seeded generator: the mock errors are reproducible across runs
# without touching the global `random` state.
mock_rng = random.Random(42)

# Offsets applied to the correct answer; 0 is excluded so a "wrong" answer can
# never coincide with the correct one.
_WRONG_ANSWER_OFFSETS = [delta for delta in range(-5, 6) if delta != 0]


def _final_answer(answer_text):
    """Return the integer after the last '####' marker, tolerating thousands separators."""
    return int(answer_text.split("####")[-1].strip().replace(",", ""))


# Mock taxonomy and prerequisites
mock_taxonomy = pd.DataFrame({
    'problem_id': range(n_problems),
    'question': df_multimodal['question'],
    'concepts': ["['addition', 'multiplication']" if i % 2 == 0 else "['area', 'multiplication']" for i in range(n_problems)]
})
mock_errors = pd.DataFrame({
    'problem_id': range(n_problems),
    'question': df_multimodal['question'],
    'correct_answer': df_multimodal['answer'],
    'wrong_answer': [str(_final_answer(ans) + mock_rng.choice(_WRONG_ANSWER_OFFSETS)) for ans in df_multimodal['answer']],
    'error_type': [mock_rng.choice(['off-by-one', 'wrong-operation', 'random']) for _ in range(n_problems)],
    'prerequisites': ["['addition', 'multiplication']" for _ in range(n_problems)]
})
mock_taxonomy.to_csv("/content/taxonomy_validated.csv", index=False)
mock_errors.to_csv("/content/error_prerequisites.csv", index=False)

# Evaluation metrics functions
def compute_concept_extraction_metrics(vlm_concepts, gold_concepts):
    """Score predicted concepts against gold concepts as sets.

    Args:
        vlm_concepts: Iterable of predicted concept labels (None is treated as empty).
        gold_concepts: Iterable of reference concept labels (None is treated as empty).

    Returns:
        Tuple (precision, recall, f1) of floats in [0, 1]. A ratio whose
        denominator would be zero is reported as 0.0.
    """
    # Deduplicate both sides: true positives are counted on sets, so the
    # denominators must be set sizes too or repeated labels would deflate
    # precision/recall.
    predicted = set(vlm_concepts or ())
    expected = set(gold_concepts or ())

    true_positives = len(predicted & expected)
    precision = true_positives / len(predicted) if predicted else 0.0
    recall = true_positives / len(expected) if expected else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

def compute_prerequisite_metrics(vlm_prereqs, gold_prereqs, k=5):
    """Compute precision@k and recall@k for a ranked prerequisite list.

    Args:
        vlm_prereqs: Ranked sequence of predicted prerequisites, best first
            (None is treated as empty). Repeated entries keep only their
            first, highest-ranked occurrence.
        gold_prereqs: Iterable of reference prerequisites (None is treated as empty).
        k: Number of top-ranked predictions to score (default: 5). Values
            below 1 score nothing.

    Returns:
        Tuple (precision, recall) of floats in [0, 1]. A ratio whose
        denominator would be zero is reported as 0.0.
    """
    # dict.fromkeys drops duplicates while preserving rank order, so a repeated
    # prediction cannot occupy several of the k slots.
    ranked = list(dict.fromkeys(vlm_prereqs or ()))
    # Clamp k: a negative value would otherwise slice from the end of the list.
    vlm_top_k = ranked[:max(k, 0)]
    expected = set(gold_prereqs or ())

    true_positives = len(expected.intersection(vlm_top_k))
    precision = true_positives / len(vlm_top_k) if vlm_top_k else 0.0
    recall = true_positives / len(expected) if expected else 0.0
    return precision, recall

def compute_path_metrics(vlm_path, gold_path, graph):
    """Score a predicted learning path for node overlap and for length efficiency.

    Args:
        vlm_path: Ordered sequence of predicted nodes (None is treated as empty).
        gold_path: Iterable of reference nodes (None is treated as empty).
        graph: networkx graph used to find the shortest route between the
            first and last node of vlm_path.

    Returns:
        Tuple (accuracy, efficiency):
        - accuracy: share of distinct gold nodes that appear in vlm_path
          (0.0 when gold_path is empty).
        - efficiency: edges on the shortest start-to-end route divided by the
          edges in vlm_path, capped at 1.0. It is 0.0 when vlm_path is empty,
          when either endpoint is missing from the graph, or when no route
          exists.
    """
    predicted_nodes = set(vlm_path or ())
    gold_nodes = set(gold_path or ())
    accuracy = len(predicted_nodes & gold_nodes) / len(gold_nodes) if gold_nodes else 0.0

    if not vlm_path:
        return accuracy, 0.0

    start_node, end_node = vlm_path[0], vlm_path[-1]
    # nx.has_path raises NodeNotFound for nodes absent from the graph, so rule
    # that case out before asking for a route.
    if start_node not in graph or end_node not in graph:
        return accuracy, 0.0
    if not nx.has_path(graph, start_node, end_node):
        return accuracy, 0.0

    # shortest_path_length counts edges, so compare it with the number of edges
    # in the predicted path (nodes - 1), not with its number of nodes.
    shortest_edges = nx.shortest_path_length(graph, start_node, end_node)
    path_edges = len(vlm_path) - 1
    if path_edges == 0:
        # Single-node path: start and end coincide, nothing shorter exists.
        return accuracy, 1.0
    # Cap at 1.0: a predicted path that skips nodes can have fewer steps than
    # the shortest valid route.
    efficiency = min(1.0, shortest_edges / path_edges)
    return accuracy, efficiency

# Create knowledge graph
# Directed graph over the concept labels, built directly from its edge list
# (each pair is one source -> target edge).
G = nx.DiGraph([
    ("addition", "multiplication"),
    ("multiplication", "division"),
    ("addition", "subtraction"),
    ("area", "multiplication"),
])

# Compute metrics
import ast  # literal_eval parses the stringified lists without executing code

metrics_data = []
for idx in range(len(df_multimodal)):
    # Mock VLM outputs: the concept prediction alternates with problem parity,
    # the prerequisite ranking and learning path are the same for every problem.
    vlm_concepts = ["addition", "multiplication", "algebra"] if idx % 2 == 0 else ["area", "division"]
    vlm_prereqs = ["addition", "subtraction", "multiplication", "division", "fractions"]
    vlm_path = ["addition", "multiplication", "division"]

    # Gold labels are stored as string representations of Python lists.
    # ast.literal_eval only accepts literals, unlike eval(), which would run
    # arbitrary expressions found in those cells.
    gold_concepts = ast.literal_eval(mock_taxonomy.loc[mock_taxonomy['problem_id'] == idx, 'concepts'].values[0])
    gold_prereqs = ast.literal_eval(mock_errors.loc[mock_errors['problem_id'] == idx, 'prerequisites'].values[0])
    # The first two gold concepts serve as the reference learning path.
    gold_path = gold_concepts[:2]

    concept_precision, concept_recall, concept_f1 = compute_concept_extraction_metrics(vlm_concepts, gold_concepts)
    prereq_precision, prereq_recall = compute_prerequisite_metrics(vlm_prereqs, gold_prereqs, k=5)
    path_accuracy, path_efficiency = compute_path_metrics(vlm_path, gold_path, G)

    metrics_data.append({
        "problem_id": idx,
        "concept_precision": concept_precision,
        "concept_recall": concept_recall,
        "concept_f1": concept_f1,
        "prereq_precision": prereq_precision,
        "prereq_recall": prereq_recall,
        "path_accuracy": path_accuracy,
        "path_efficiency": path_efficiency
    })

# Generate report
def generate_report(metrics_data, output_file):
    """Write a Markdown evaluation report for the given per-problem metrics.

    Args:
        metrics_data: List of dicts, one per problem, each holding the keys
            'problem_id', 'concept_precision', 'concept_recall', 'concept_f1',
            'prereq_precision', 'prereq_recall', 'path_accuracy' and
            'path_efficiency'.
        output_file: Path of the Markdown file to write (overwritten if present).

    Returns:
        None. The report is written to output_file and a confirmation line is
        printed.
    """
    def _mean(key):
        # np.mean([]) warns and yields nan; produce nan directly for an empty
        # metrics list so the report still renders without a warning.
        values = [m[key] for m in metrics_data]
        return float(np.mean(values)) if values else float("nan")

    # (row label, metrics key) in the order the summary table lists them.
    summary_rows = [
        ("Concept Extraction Precision", "concept_precision"),
        ("Concept Extraction Recall", "concept_recall"),
        ("Concept Extraction F1-Score", "concept_f1"),
        ("Prerequisite Top-5 Precision", "prereq_precision"),
        ("Prerequisite Top-5 Recall", "prereq_recall"),
        ("Learning Path Accuracy", "path_accuracy"),
        ("Learning Path Efficiency", "path_efficiency"),
    ]

    parts = [
        "# EduVLM-Bench Evaluation Report\n\n",
        "## Summary\n",
        # Report the number of problems actually evaluated rather than a fixed count.
        f"Evaluation of VLM performance on {len(metrics_data)} GSM8K problems with synthetic diagrams.\n\n",
        "## Aggregated Metrics\n",
        "| Metric | Value |\n|--------|-------|\n",
    ]
    parts.extend(f"| {label} | {_mean(key):.3f} |\n" for label, key in summary_rows)
    parts.append("\n")

    parts.append("## Detailed Results\n")
    parts.append("| Problem ID | Concept P | Concept R | Concept F1 | Prereq P | Prereq R | Path Acc | Path Eff |\n")
    parts.append("|-----------|-----------|-----------|------------|----------|----------|----------|----------|\n")
    for m in metrics_data:
        parts.append(
            f"| {m['problem_id']} | {m['concept_precision']:.3f} | {m['concept_recall']:.3f} | {m['concept_f1']:.3f} | "
            f"{m['prereq_precision']:.3f} | {m['prereq_recall']:.3f} | {m['path_accuracy']:.3f} | {m['path_efficiency']:.3f} |\n"
        )

    # NOTE: the findings below are fixed text; they are not derived from metrics_data.
    parts.append("\n## Key Findings\n")
    parts.append("- Concept Extraction: VLMs need improvement for image-based concepts.\n")
    parts.append("- Prerequisite Identification: Moderate precision, refine ranking of prerequisites.\n")
    parts.append("- Learning Path Generation: Paths are accurate but less efficient than optimal.\n")

    # Explicit encoding keeps the written file independent of the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"Report generated at {output_file}")

# Run report generation: render the per-problem metrics in metrics_data as a
# Markdown report and write it to a fixed path under /content.
generate_report(metrics_data, "/content/eduvlm_bench_report.md")

Report generated at /content/eduvlm_bench_report.md
