In [None]:
# Save final results as JSON and push to GitHub
import json
import subprocess


def _git(*args, show_stderr=True):
    """Run ``git <args>`` in the current directory and return its exit code.

    The command is executed without a shell, so arguments (such as the commit
    message) are passed through verbatim. Captured output is printed so that it
    shows up in the cell output.

    Args:
        *args: Arguments handed to git, one element per argument.
        show_stderr: When False, git's stderr is discarded instead of printed.

    Returns:
        The exit code of the git process (0 means success); 127 if the git
        executable could not be found.
    """
    try:
        proc = subprocess.run(['git', *args], capture_output=True, text=True)
    except FileNotFoundError:
        print('git executable not found')
        return 127
    shown = proc.stdout + (proc.stderr if show_stderr else '')
    if shown.strip():
        print(shown.strip())
    return proc.returncode


# Nested summary of the metrics produced by the evaluation cells. float(...)
# turns every value into a plain Python float before it is handed to json.dump.
final_results = {
    'model': config.model_name,
    'single_answer': {
        'math_acc': float(sa_math_acc),
        'gsm8k_acc': float(sa_gsm8k_acc),
    },
    'debate_only': {
        'math_acc_per_agent': [float(a) for a in debate_math_accs],
        'math_acc_avg': float(np.mean(debate_math_accs)),
        'math_consensus': float(debate_math_consensus),
    },
    'thoughtcomm': {
        'math_acc_per_agent': [float(a) for a in tc_math_accs],
        'math_acc_avg': float(np.mean(tc_math_accs)),
        'math_consensus': float(tc_math_consensus),
        'gsm8k_acc_per_agent': [float(a) for a in tc_gsm8k_accs],
        'gsm8k_acc_avg': float(np.mean(tc_gsm8k_accs)),
        'gsm8k_consensus': float(tc_gsm8k_consensus),
    },
}

os.makedirs('results', exist_ok=True)
final_results_path = f'results/05_final_results_{MODEL_TAG}.json'
with open(final_results_path, 'w') as f:
    json.dump(final_results, f, indent=2)

# Best-effort sync with the remote: the exit code is ignored and stderr hidden.
_git('pull', '--rebase', show_stderr=False)
_git('add', 'results/')
# A non-zero exit here (e.g. nothing new to commit) is not fatal; git's own
# message is printed and the push below decides what gets reported.
_git('commit', '-m', f'Add Notebook 05 results: full evaluation Table 1 ({MODEL_TAG})')
push_status = _git('push')

# Only claim success when the push actually succeeded.
if push_status == 0:
    print('Final results pushed to GitHub!')
    print('View at: https://github.com/AUMEZAK/thoughtcomm/tree/main/results')
else:
    print(f'git push failed (exit code {push_status}); '
          f'results are still saved locally at {final_results_path}')

# Notebook 5: Full Evaluation — Reproduce Table 1

Loads all trained components (autoencoder, agreement reweighter, prefix adapter) and evaluates them on MATH and GSM8K.
Compares three methods:
1. Single Answer baseline
2. Debate-only baseline
3. ThoughtComm (ours)

Reproduces Table 1, Fig 5 (prefix length ablation), Fig 6 (round scaling).

**Estimated time: ~2-4 hours per model**

In [None]:
# Setup
import os
try:
    from google.colab import userdata
    GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
    REPO_URL = f'https://{GITHUB_TOKEN}@github.com/AUMEZAK/thoughtcomm.git'
except Exception:
    GITHUB_TOKEN = None
    REPO_URL = 'https://github.com/AUMEZAK/thoughtcomm.git'

!git clone {REPO_URL} thoughtcomm 2>/dev/null || echo 'Already cloned'
%cd thoughtcomm
!pip install -e . -q

!git config user.email "colab@thoughtcomm.dev"
!git config user.name "ThoughtComm Colab"

from google.colab import drive
drive.mount('/content/drive')
SAVE_DIR = '/content/drive/MyDrive/thoughtcomm_checkpoints/'

In [None]:
import torch
import os
import numpy as np
import matplotlib.pyplot as plt
from configs.config import ThoughtCommConfig
from models.model_utils import load_model_and_tokenizer
from models.autoencoder import SparsityRegularizedAE
from models.prefix_adapter import PrefixAdapter
from pipeline.agreement import AgreementReweighter
from pipeline.thought_comm import ThoughtCommPipeline, run_single_answer_baseline, run_debate_baseline
from data.math_data import load_math_dataset
from data.gsm8k_data import load_gsm8k_dataset
from evaluation.metrics import compute_accuracy, compute_consensus

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Model under evaluation; switch to the commented line to evaluate Phi-4-mini.
config = ThoughtCommConfig.for_qwen_0_6b(device=device)
# config = ThoughtCommConfig.for_phi4_mini(device=device)

# Last path component of the model name, used to tag checkpoint/result files.
MODEL_TAG = config.model_name.rsplit('/', 1)[-1]

In [None]:
# Load model
model, tokenizer = load_model_and_tokenizer(config.model_name, dtype=config.dtype)

# Load trained components
ae_dir = os.path.join(SAVE_DIR, f'{MODEL_TAG}_ae')
adapter_dir = os.path.join(SAVE_DIR, f'{MODEL_TAG}_adapter')


def _load_weights(module, directory, filename):
    """Load the state dict stored at directory/filename into `module` (mapped to `device`)."""
    module.load_state_dict(torch.load(os.path.join(directory, filename), map_location=device))


# Autoencoder: build on `device`, then restore its saved weights.
ae_model = SparsityRegularizedAE(
    n_h=config.n_h,
    n_z=config.n_z,
    hidden_dim=config.ae_hidden,
    num_layers=config.ae_num_layers,
)
ae_model = ae_model.to(device)
_load_weights(ae_model, ae_dir, 'ae_model.pt')

# Agreement reweighter: constructed from the saved B matrix (loaded on CPU),
# moved to `device`, then restored from its own checkpoint.
B = torch.load(os.path.join(ae_dir, 'B_matrix.pt'), map_location='cpu')
reweighter = AgreementReweighter(B, config)
reweighter = reweighter.to(device)
_load_weights(reweighter, adapter_dir, 'reweighter.pt')

# Prefix adapter: same build-then-restore sequence.
adapter = PrefixAdapter(
    n_z=config.n_z,
    hidden_size=config.hidden_size,
    prefix_length=config.prefix_length,
    adapter_hidden=config.adapter_hidden,
)
adapter = adapter.to(device)
_load_weights(adapter, adapter_dir, 'adapter.pt')

print('All components loaded.')

In [None]:
# Load eval datasets
# Each loader returns a pair; only the second element (the eval split) is kept.
math_splits = load_math_dataset(num_train=500, num_eval=500, level=3)
_, math_eval = math_splits

gsm8k_splits = load_gsm8k_dataset(num_train=500, num_eval=500)
_, gsm8k_eval = gsm8k_splits

print(f'MATH eval: {len(math_eval)}, GSM8K eval: {len(gsm8k_eval)}')

## 1. Single Answer Baseline

In [None]:
# MATH - Single Answer
sa_math = run_single_answer_baseline(model, tokenizer, math_eval, config, 'math')

# Reference answers, in the same order as the eval examples.
sa_math_truths = [example['answer'] for example in math_eval]
sa_math_acc, _ = compute_accuracy(sa_math, sa_math_truths, 'math')
print(f'Single Answer MATH: {sa_math_acc:.2f}%')

In [None]:
# GSM8K - Single Answer
sa_gsm8k = run_single_answer_baseline(model, tokenizer, gsm8k_eval, config, 'gsm8k')

# Reference answers, in the same order as the eval examples.
sa_gsm8k_truths = [example['answer'] for example in gsm8k_eval]
sa_gsm8k_acc, _ = compute_accuracy(sa_gsm8k, sa_gsm8k_truths, 'gsm8k')
print(f'Single Answer GSM8K: {sa_gsm8k_acc:.2f}%')

## 2. Debate-Only Baseline

In [None]:
# MATH - Debate only
debate_math = run_debate_baseline(model, tokenizer, math_eval, config)

# Reference answers are the same for every agent, so build the list once.
debate_math_truths = [e['answer'] for e in math_eval]

# Accuracy is computed separately for each agent from its final-round response
# (no voting across agents happens here); the average is reported below.
debate_math_accs = []
for agent_idx in range(config.num_agents):
    agent_resps = [fr[agent_idx] for fr in debate_math['final_responses']]
    acc, _ = compute_accuracy(agent_resps, debate_math_truths, 'math')
    debate_math_accs.append(acc)
    print(f'  Agent {agent_idx} MATH acc: {acc:.2f}%')

debate_math_consensus = compute_consensus(debate_math['final_responses'], 'math')
print(f'Debate MATH avg acc: {np.mean(debate_math_accs):.2f}%, consensus: {debate_math_consensus:.2f}%')

## 3. ThoughtComm (Ours)

In [None]:
# Create ThoughtComm pipeline
tc = ThoughtCommPipeline(model, tokenizer, ae_model, reweighter, adapter, config)

# MATH evaluation
tc_math = tc.evaluate(math_eval, 'math')

# Score each agent's final response against the ground truths returned by the pipeline.
tc_math_accs = []
for k in range(config.num_agents):
    responses_k = [per_question[k] for per_question in tc_math['final_responses']]
    score_k, _ = compute_accuracy(responses_k, tc_math['ground_truths'], 'math')
    tc_math_accs.append(score_k)
    print(f'  Agent {k} MATH acc: {score_k:.2f}%')

tc_math_consensus = compute_consensus(tc_math['final_responses'], 'math')
print(f'ThoughtComm MATH avg acc: {np.mean(tc_math_accs):.2f}%, consensus: {tc_math_consensus:.2f}%')

In [None]:
# GSM8K evaluation
tc_gsm8k = tc.evaluate(gsm8k_eval, 'gsm8k')

# Score each agent's final response against the ground truths returned by the
# pipeline, printing per-agent accuracy like the MATH cells do.
tc_gsm8k_accs = []
for agent_idx in range(config.num_agents):
    agent_resps = [fr[agent_idx] for fr in tc_gsm8k['final_responses']]
    acc, _ = compute_accuracy(agent_resps, tc_gsm8k['ground_truths'], 'gsm8k')
    tc_gsm8k_accs.append(acc)
    print(f'  Agent {agent_idx} GSM8K acc: {acc:.2f}%')

tc_gsm8k_consensus = compute_consensus(tc_gsm8k['final_responses'], 'gsm8k')
print(f'ThoughtComm GSM8K avg acc: {np.mean(tc_gsm8k_accs):.2f}%, consensus: {tc_gsm8k_consensus:.2f}%')

## 4. Results Summary (Table 1)

In [None]:
# Print Table 1. The rule lines are sized from the header row so that they span
# the full table width (a fixed width of 70 was narrower than the rows).
header = f'{"Method":<25} {"MATH Acc (%)":<15} {"MATH Cons (%)":<15} {"GSM8K Acc (%)":<15} {"GSM8K Cons (%)":<15}'
rule_width = len(header)

# Per-agent accuracies are averaged once and reused in the rows below.
debate_math_avg = np.mean(debate_math_accs)
tc_math_avg = np.mean(tc_math_accs)
tc_gsm8k_avg = np.mean(tc_gsm8k_accs)

print('=' * rule_width)
print(f'Table 1 Results — {config.model_name}')
print('=' * rule_width)
print(header)
print('-' * rule_width)
print(f'{"Single Answer":<25} {sa_math_acc:<15.2f} {"N/A":<15} {sa_gsm8k_acc:<15.2f} {"N/A":<15}')
# The debate baseline is only run on MATH in this notebook, hence the "--" cells.
print(f'{"Debate Only":<25} {debate_math_avg:<15.2f} {debate_math_consensus:<15.2f} {"--":<15} {"--":<15}')
print(f'{"ThoughtComm (Ours)":<25} {tc_math_avg:<15.2f} {tc_math_consensus:<15.2f} {tc_gsm8k_avg:<15.2f} {tc_gsm8k_consensus:<15.2f}')
print('=' * rule_width)

In [None]:
# Save results
# Every metric is converted to a plain Python float (np.mean returns a NumPy
# scalar) so the saved file contains only built-in types, matching the
# float(...) conversion used for the JSON export.
results = {
    'model': config.model_name,
    'single_answer_math': float(sa_math_acc),
    'single_answer_gsm8k': float(sa_gsm8k_acc),
    'debate_math_acc': float(np.mean(debate_math_accs)),
    'debate_math_consensus': float(debate_math_consensus),
    'thoughtcomm_math_acc': float(np.mean(tc_math_accs)),
    'thoughtcomm_math_consensus': float(tc_math_consensus),
    'thoughtcomm_gsm8k_acc': float(np.mean(tc_gsm8k_accs)),
    'thoughtcomm_gsm8k_consensus': float(tc_gsm8k_consensus),
}

# Stored next to the checkpoints, tagged with the model name.
results_path = os.path.join(SAVE_DIR, f'{MODEL_TAG}_results.pt')
torch.save(results, results_path)
print(f'Results saved to {results_path}')

## Push Results to GitHub