# Demo: Comparing Reasoning Approaches

**NLP Final Lecture - Live Demo**

This notebook demonstrates the difference between:
1. Direct answering (no reasoning)
2. Chain-of-thought prompting
3. Structured reasoning traces

In [None]:
import os
from openai import OpenAI

# Set your API key before creating the client, e.g. by uncommenting the line
# below (avoid committing a real key to a shared notebook):
# os.environ["OPENAI_API_KEY"] = "your-key-here"

# Shared client used by every cell below. It is built with no arguments, so
# the key is presumably picked up from the environment (OPENAI_API_KEY, as
# the line above suggests) -- confirm against the openai library docs.
client = OpenAI()

## Test Problem

A classic multi-step word problem: the second fraction applies to what remains after the first sale, so the answer requires two dependent calculations.

In [None]:
# The test prompt, spelled out as two adjacent literals so the embedded line
# break (and the space before it) is explicit rather than hidden in a
# triple-quoted block.
problem = (
    "A store has 312 apples. They sell 1/4 of them in the morning and 1/3 of the \n"
    "remaining apples in the afternoon. How many apples are left at the end of the day?"
)

# Show the prompt under a heading so the audience can read it.
print("Problem:", problem, sep="\n")

## Approach 1: Direct Answering

In [None]:
def direct_answer(problem):
    """Query the model for a bare numeric answer, with no visible reasoning.

    The system message tells the model to reply with only the number, the
    reply is capped at 10 tokens, and temperature is 0.

    Args:
        problem: The problem text, sent as the user message.

    Returns:
        The message content of the first returned choice.
    """
    conversation = [
        {"role": "system", "content": "Answer with just the number, nothing else."},
        {"role": "user", "content": problem},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        temperature=0,
        max_tokens=10,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Run the no-reasoning baseline on the test problem and show the reply.
direct_result = direct_answer(problem)
print("Direct answer:", direct_result)

## Approach 2: Chain-of-Thought (Zero-Shot)

In [None]:
def cot_zero_shot(problem):
    """Ask the model to reason aloud via the zero-shot chain-of-thought cue.

    The problem is sent as a single user message with the phrase
    "Let's think step by step." appended after a blank line. No system
    message and no token cap are used; temperature is 0.

    Args:
        problem: The problem text to solve.

    Returns:
        The message content of the first returned choice.
    """
    prompt = f"{problem}\n\nLet's think step by step."
    request = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
    }
    completion = client.chat.completions.create(**request)
    return completion.choices[0].message.content

# Run zero-shot chain-of-thought on the same problem and print the full reply
# under a heading.
cot_result = cot_zero_shot(problem)
print("Chain-of-Thought (Zero-Shot):", cot_result, sep="\n")

## Approach 3: Structured Reasoning

In [None]:
def structured_reasoning(problem):
    """Solve the problem with an explicit, tagged reasoning format.

    A system message lays out a five-step procedure and asks for the
    reasoning inside <thinking> tags and the result inside <answer> tags.
    Temperature is 0.

    Args:
        problem: The problem text, sent as the user message.

    Returns:
        The message content of the first returned choice.
    """
    instructions = """You are a careful problem solver. For each problem:
1. First, identify the given information
2. Then, break down the problem into steps
3. Solve each step showing your calculations
4. Verify your answer makes sense
5. State the final answer clearly

Use <thinking>...</thinking> tags for your reasoning and <answer>...</answer> for the final answer."""

    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": problem},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        temperature=0,
    )
    reply = completion.choices[0].message
    return reply.content

# Run the tagged-reasoning variant on the same problem and print the reply
# under a heading.
structured_result = structured_reasoning(problem)
print("Structured Reasoning:", structured_result, sep="\n")

## Verification: What's the Correct Answer?

In [None]:
# Work the arithmetic out by hand so the model's answers can be checked.
initial = 312
after_morning = initial - initial // 4  # a quarter of the stock goes in the morning
after_afternoon = after_morning - after_morning // 3  # then a third of what is left

# One print call, one item per output line.
print(
    "Manual verification:",
    f"Initial apples: {initial}",
    f"Sold in morning (1/4): {initial // 4}",
    f"After morning: {after_morning}",
    f"Sold in afternoon (1/3 of remaining): {after_morning // 3}",
    f"Final answer: {after_afternoon}",
    sep="\n",
)

## Harder Problem: Self-Consistency

In [None]:
# The birthday-problem prompt, written as two adjacent literals so the
# embedded line break (and the space before it) is explicit.
hard_problem = (
    "In a room of 23 people, what is the probability that at least two people \n"
    "share the same birthday? Assume 365 days in a year and uniform birthday distribution."
)

# Show the prompt under a heading.
print("Harder problem:", hard_problem, sep="\n")

In [None]:
def self_consistency(problem, n_samples=5):
    """Sample several reasoning chains and aggregate their final answers.

    Each sample is an independent chat completion at temperature 0.7 that is
    asked to think step by step and finish with a percentage. The last
    percentage mentioned in each reply is taken as that sample's answer.
    The per-sample answers, their average, and the majority-vote answer are
    printed.

    Args:
        problem: The problem text to solve.
        n_samples: How many independent completions to request.

    Returns:
        A list of floats, one per sample from which a percentage could be
        extracted (so it may be shorter than ``n_samples``).
    """
    import re
    from collections import Counter

    # Compiled once, outside the loop. Besides "50.7%" this also accepts
    # "50.7 %" and the LaTeX-escaped "50.7\%" that models often emit.
    percent_pattern = re.compile(r'(\d+\.?\d*)\s*\\?%')
    answers = []

    for i in range(n_samples):
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "user", "content": f"{problem}\n\nThink step by step and give your final answer as a percentage."}
            ],
            temperature=0.7  # Higher temperature for diversity
        )

        # The content field can be None; treat that as an empty reply
        # instead of letting the regex search raise TypeError.
        result = response.choices[0].message.content or ""
        percentages = percent_pattern.findall(result)
        if percentages:
            # The final answer is normally the last percentage mentioned.
            answers.append(float(percentages[-1]))
            print(f"Sample {i+1}: {percentages[-1]}%")
        else:
            print(f"Sample {i+1}: N/A")

    if answers:
        avg = sum(answers) / len(answers)
        # Vote on answers rounded to one decimal so that e.g. 50.7 and 50.73
        # count as the same answer. Ties go to the value seen first.
        votes = Counter(round(a, 1) for a in answers)
        majority, count = votes.most_common(1)[0]
        print(f"\nAverage across samples: {avg:.1f}%")
        print(f"Majority vote: {majority}% ({count}/{len(answers)} samples)")
        print(f"(True answer is approximately 50.7%)")

    return answers

# Print the heading (followed by a blank line), then draw five samples for
# the birthday problem and keep the extracted answers.
print("Self-consistency with 5 samples:\n")
results = self_consistency(problem=hard_problem, n_samples=5)

## Key Takeaways

1. **Chain-of-thought improves accuracy**: Just adding "Let's think step by step" helps
2. **Structure helps verification**: Explicit reasoning traces can be checked
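3. **Self-consistency reduces errors**: Sample several reasoning chains and aggregate their final answers (e.g., by majority vote), so a single faulty chain is outvoted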
4. **Test-time compute**: More reasoning tokens = better results (up to a point)

## Bonus: Simulating o1-style Reasoning

In [None]:
def o1_style_reasoning(problem):
    """Prompt for long, self-correcting reasoning in the spirit of o1 models.

    The system message asks the model to weigh several approaches, check its
    approach, verify the result, and backtrack on errors. Up to 2000 tokens
    are allowed for the reply; temperature is 0.

    Args:
        problem: The problem text, sent as the user message.

    Returns:
        The message content of the first returned choice.
    """
    instructions = """You are a careful mathematical reasoner. Before answering:

1. Consider multiple approaches to the problem
2. Check if your approach is correct before calculating
3. After getting an answer, verify it makes sense
4. If you notice an error, backtrack and try again

Show your complete reasoning process, including any corrections.
Use extensive step-by-step reasoning."""

    request = {
        "model": "gpt-4o-mini",
        "messages": [
            {"role": "system", "content": instructions},
            {"role": "user", "content": problem},
        ],
        "temperature": 0,
        "max_tokens": 2000,  # leave room for extended reasoning
    }
    completion = client.chat.completions.create(**request)
    return completion.choices[0].message.content

# A short number-theory question to exercise the extended-reasoning prompt.
olympiad_problem = "Find all positive integers n such that n^2 + 1 is divisible by n + 1."

# Show the problem, then a divider, and only then request and print the
# model's worked solution.
print("Olympiad-style problem:", olympiad_problem, sep="\n")
divider = "\n" + "=" * 50 + "\n"
print(divider)
solution = o1_style_reasoning(olympiad_problem)
print(solution)