In [2]:
# ============================================================
# NOTEBOOK 1: DATA PREPARATION & KNOWLEDGE BASE SETUP
# ============================================================
# Purpose: Create a math dataset and prepare it for RAG system
# What YOU need to do:
#   1. Add your OpenAI API key
#   2. Optionally add more math problems to the dataset
#   3. Run all cells
# ============================================================

# ============================================================
# STEP 1: Install Required Libraries
# ============================================================
"""
Run this cell first to install all dependencies
"""
!pip install pandas sentence-transformers

# ============================================================
# STEP 2: Import Libraries
# ============================================================
import json
import pandas as pd
from datetime import datetime
import os
from pathlib import Path


# ============================================================
# STEP 3: Configure API Keys
# ============================================================

# ============================================================
# STEP 4: Create Sample Math Dataset
# ============================================================
"""
This is a sample knowledge base with 20 math problems.
YOU CAN: Add more problems following the same format
Categories: algebra, calculus, geometry, trigonometry, statistics, probability, number theory
"""

# Knowledge base records. Every record is a dict of strings with the fields:
#   id         - unique identifier, prefixed by category (e.g. "alg_001")
#   question   - the problem statement
#   solution   - step-by-step worked solution (steps separated by newlines)
#   answer     - the final answer only
#   topic      - short topic label
#   difficulty - difficulty label (the records below use "easy" and "medium")
math_dataset = [
    # ALGEBRA
    {
        "id": "alg_001",
        "question": "Solve for x: 2x + 5 = 13",
        "solution": "Step 1: Subtract 5 from both sides\n2x + 5 - 5 = 13 - 5\n2x = 8\n\nStep 2: Divide both sides by 2\nx = 8/2\nx = 4",
        "answer": "x = 4",
        "topic": "Linear Equations",
        "difficulty": "easy"
    },
    {
        "id": "alg_002",
        "question": "Factor the expression: x² + 5x + 6",
        "solution": "Step 1: Find two numbers that multiply to 6 and add to 5\nNumbers: 2 and 3 (2×3=6, 2+3=5)\n\nStep 2: Write as factors\nx² + 5x + 6 = (x + 2)(x + 3)",
        "answer": "(x + 2)(x + 3)",
        "topic": "Factoring",
        "difficulty": "medium"
    },
    {
        "id": "alg_003",
        "question": "Solve the quadratic equation: x² - 7x + 12 = 0",
        "solution": "Step 1: Factor the quadratic\nx² - 7x + 12 = (x - 3)(x - 4) = 0\n\nStep 2: Set each factor to zero\nx - 3 = 0  or  x - 4 = 0\nx = 3  or  x = 4",
        "answer": "x = 3 or x = 4",
        "topic": "Quadratic Equations",
        "difficulty": "medium"
    },

    # CALCULUS
    {
        "id": "calc_001",
        "question": "Find the derivative of f(x) = 3x² + 2x - 1",
        "solution": "Step 1: Apply power rule to each term\nd/dx(3x²) = 3 × 2x = 6x\nd/dx(2x) = 2\nd/dx(-1) = 0\n\nStep 2: Combine results\nf'(x) = 6x + 2",
        "answer": "f'(x) = 6x + 2",
        "topic": "Derivatives",
        "difficulty": "easy"
    },
    {
        "id": "calc_002",
        "question": "Integrate: ∫(2x + 3)dx",
        "solution": "Step 1: Apply power rule for integration\n∫2x dx = 2 × (x²/2) = x²\n∫3 dx = 3x\n\nStep 2: Add constant of integration\n∫(2x + 3)dx = x² + 3x + C",
        "answer": "x² + 3x + C",
        "topic": "Integration",
        "difficulty": "easy"
    },

    # GEOMETRY
    {
        "id": "geom_001",
        "question": "Find the area of a circle with radius 5 cm",
        "solution": "Step 1: Use the formula A = πr²\nA = π × (5)²\nA = π × 25\nA = 25π cm²\n\nStep 2: Approximate value\nA ≈ 78.54 cm²",
        "answer": "25π cm² or approximately 78.54 cm²",
        "topic": "Circle Area",
        "difficulty": "easy"
    },
    {
        "id": "geom_002",
        "question": "Find the volume of a cylinder with radius 3 cm and height 10 cm",
        "solution": "Step 1: Use formula V = πr²h\nV = π × (3)² × 10\nV = π × 9 × 10\nV = 90π cm³\n\nStep 2: Approximate value\nV ≈ 282.74 cm³",
        "answer": "90π cm³ or approximately 282.74 cm³",
        "topic": "Volume",
        "difficulty": "medium"
    },

    # TRIGONOMETRY
    {
        "id": "trig_001",
        "question": "If sin(θ) = 3/5 and θ is in the first quadrant, find cos(θ)",
        "solution": "Step 1: Use Pythagorean identity: sin²(θ) + cos²(θ) = 1\n(3/5)² + cos²(θ) = 1\n9/25 + cos²(θ) = 1\n\nStep 2: Solve for cos(θ)\ncos²(θ) = 1 - 9/25 = 16/25\ncos(θ) = ±4/5\n\nStep 3: Choose positive (first quadrant)\ncos(θ) = 4/5",
        "answer": "cos(θ) = 4/5",
        "topic": "Trigonometric Identities",
        "difficulty": "medium"
    },
    {
        "id": "trig_002",
        "question": "Simplify: sin(2x) in terms of sin(x) and cos(x)",
        "solution": "Step 1: Apply double angle formula\nsin(2x) = 2sin(x)cos(x)\n\nThis is the simplified form.",
        "answer": "sin(2x) = 2sin(x)cos(x)",
        "topic": "Double Angle Formulas",
        "difficulty": "easy"
    },

    # STATISTICS
    {
        "id": "stat_001",
        "question": "Find the mean of the dataset: 4, 7, 2, 9, 5, 3",
        "solution": "Step 1: Add all numbers\nSum = 4 + 7 + 2 + 9 + 5 + 3 = 30\n\nStep 2: Divide by count\nMean = 30 ÷ 6 = 5",
        "answer": "Mean = 5",
        "topic": "Mean",
        "difficulty": "easy"
    },
    {
        "id": "stat_002",
        "question": "Find the median of: 12, 5, 8, 15, 3, 10",
        "solution": "Step 1: Sort the data\n3, 5, 8, 10, 12, 15\n\nStep 2: Find middle values (even count)\nMiddle two values: 8 and 10\n\nStep 3: Average them\nMedian = (8 + 10) / 2 = 9",
        "answer": "Median = 9",
        "topic": "Median",
        "difficulty": "easy"
    },

    # MORE ALGEBRA
    {
        "id": "alg_004",
        "question": "Solve the system: 2x + y = 7 and x - y = 2",
        "solution": "Step 1: Add equations to eliminate y\n(2x + y) + (x - y) = 7 + 2\n3x = 9\nx = 3\n\nStep 2: Substitute x = 3 into second equation\n3 - y = 2\ny = 1",
        "answer": "x = 3, y = 1",
        "topic": "Systems of Equations",
        "difficulty": "medium"
    },
    {
        "id": "alg_005",
        "question": "Simplify: (x² - 4) / (x - 2)",
        "solution": "Step 1: Factor numerator (difference of squares)\nx² - 4 = (x + 2)(x - 2)\n\nStep 2: Cancel common factor\n(x + 2)(x - 2) / (x - 2) = x + 2\n\nNote: x ≠ 2",
        "answer": "x + 2 (where x ≠ 2)",
        "topic": "Rational Expressions",
        "difficulty": "medium"
    },

    # MORE CALCULUS
    {
        "id": "calc_003",
        "question": "Find the derivative of f(x) = sin(x) + cos(x)",
        "solution": "Step 1: Apply derivative rules\nd/dx[sin(x)] = cos(x)\nd/dx[cos(x)] = -sin(x)\n\nStep 2: Combine\nf'(x) = cos(x) - sin(x)",
        "answer": "f'(x) = cos(x) - sin(x)",
        "topic": "Derivatives of Trig Functions",
        "difficulty": "easy"
    },
    {
        "id": "calc_004",
        "question": "Find critical points of f(x) = x² - 4x + 3",
        "solution": "Step 1: Find derivative\nf'(x) = 2x - 4\n\nStep 2: Set derivative equal to zero\n2x - 4 = 0\n2x = 4\nx = 2\n\nCritical point at x = 2",
        "answer": "Critical point at x = 2",
        "topic": "Critical Points",
        "difficulty": "medium"
    },

    # MORE GEOMETRY
    {
        "id": "geom_003",
        "question": "Find the perimeter of a rectangle with length 8 cm and width 5 cm",
        "solution": "Step 1: Use formula P = 2(l + w)\nP = 2(8 + 5)\nP = 2(13)\nP = 26 cm",
        "answer": "26 cm",
        "topic": "Perimeter",
        "difficulty": "easy"
    },
    {
        "id": "geom_004",
        "question": "Find the distance between points A(2, 3) and B(5, 7)",
        "solution": "Step 1: Use distance formula d = √[(x₂-x₁)² + (y₂-y₁)²]\nd = √[(5-2)² + (7-3)²]\nd = √[3² + 4²]\nd = √[9 + 16]\nd = √25\nd = 5",
        "answer": "5 units",
        "topic": "Distance Formula",
        "difficulty": "medium"
    },

    # PROBABILITY
    {
        "id": "prob_001",
        "question": "What is the probability of rolling a 4 on a standard six-sided die?",
        "solution": "Step 1: Count favorable outcomes\nFavorable outcome: {4}\nCount = 1\n\nStep 2: Count total outcomes\nTotal outcomes: {1, 2, 3, 4, 5, 6}\nCount = 6\n\nStep 3: Calculate probability\nP(4) = 1/6 ≈ 0.167 or 16.7%",
        "answer": "1/6 or approximately 16.7%",
        "topic": "Basic Probability",
        "difficulty": "easy"
    },
    {
        "id": "prob_002",
        "question": "Two coins are flipped. What's the probability of getting at least one head?",
        "solution": "Step 1: List all outcomes\n{HH, HT, TH, TT}\nTotal = 4\n\nStep 2: Count favorable outcomes (at least 1 head)\n{HH, HT, TH}\nCount = 3\n\nStep 3: Calculate probability\nP(at least 1 head) = 3/4 = 0.75 or 75%",
        "answer": "3/4 or 75%",
        "topic": "Probability",
        "difficulty": "medium"
    },

    # NUMBER THEORY
    {
        "id": "num_001",
        "question": "Is 17 a prime number? Explain.",
        "solution": "Step 1: Check divisibility by primes up to √17 ≈ 4.12\nPrimes to check: 2, 3\n\nStep 2: Test divisibility\n17 ÷ 2 = 8.5 (not divisible)\n17 ÷ 3 = 5.67 (not divisible)\n\nStep 3: Conclusion\n17 is only divisible by 1 and itself\nTherefore, 17 is prime",
        "answer": "Yes, 17 is a prime number",
        "topic": "Prime Numbers",
        "difficulty": "easy"
    }
]


def validate_math_dataset(problems):
    """Check that every problem record is complete and uniquely identified.

    Args:
        problems: Iterable of dicts in the ``math_dataset`` record format.

    Raises:
        ValueError: If a record lacks one of the required fields, or if two
            records share the same ``id``.
    """
    required_fields = {"id", "question", "solution", "answer", "topic", "difficulty"}
    seen_ids = set()
    for position, problem in enumerate(problems):
        missing = required_fields - problem.keys()
        if missing:
            raise ValueError(
                f"Problem at index {position} is missing fields: {sorted(missing)}"
            )
        if problem["id"] in seen_ids:
            raise ValueError(f"Duplicate problem id: {problem['id']!r}")
        seen_ids.add(problem["id"])


# Records are edited by hand, so fail fast on a forgotten field or a
# copy-pasted id before the dataset is reported as created.
validate_math_dataset(math_dataset)

print(f"✅ Created dataset with {len(math_dataset)} math problems")

# ============================================================
# STEP 5: Save Dataset to JSON
# ============================================================
"""
This saves your knowledge base to a JSON file
"""
# json and Path are already imported in STEP 2, so they are not re-imported here.

# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is assumed to run from a sub-folder such as 'notebooks/'.
project_root = Path.cwd().parent

# Create the data folder at the project root if it does not exist yet.
data_dir = project_root / "data"
data_dir.mkdir(exist_ok=True)

# Build the target path once so the write and the log message cannot diverge.
knowledge_base_path = data_dir / "math_knowledge_base.json"

# json.dump escapes non-ASCII characters by default, so the file content is
# plain ASCII; the encoding is still given explicitly so the write does not
# depend on the platform's locale default.
with open(knowledge_base_path, "w", encoding="utf-8") as kb_file:
    json.dump(math_dataset, kb_file, indent=2)

print(f"✅ Saved to '{knowledge_base_path}'")






# import os
# os.makedirs('data', exist_ok=True)
# with open('data/math_knowledge_base.json', 'w') as f:
#     json.dump(math_dataset, f, indent=2)

# print("✅ Saved to 'math_knowledge_base.json'")

# ============================================================
# STEP 6: Create DataFrame for Easy Viewing
# ============================================================
# Load the records into a DataFrame, one row per problem.
df = pd.DataFrame(math_dataset)

# The overview shows only the short identifying columns; the longer
# 'solution' and 'answer' columns are left out.
overview_columns = ['id', 'question', 'topic', 'difficulty']
print("\n📊 Dataset Overview:")
print(df[overview_columns])

# ============================================================
# STEP 7: Dataset Statistics
# ============================================================
print("\n📈 Dataset Statistics:")
print(f"Total problems: {len(df)}")

# One frequency table per categorical column, each under its own heading.
for heading, column in (("\nBy Topic:", 'topic'), ("\nBy Difficulty:", 'difficulty')):
    print(heading)
    print(df[column].value_counts())

# ============================================================
# STEP 8: Sample Questions for Testing
# ============================================================
"""
These are good test questions from your knowledge base:
"""
# Banner line reused above and below the section title.
divider = "=" * 60
print(f"\n{divider}")
print("🧪 SAMPLE QUESTIONS FOR TESTING")
print(divider)

# Questions copied verbatim from knowledge-base entries alg_001, calc_001
# and geom_001.
sample_questions = [
    "Solve for x: 2x + 5 = 13",
    "Find the derivative of f(x) = 3x² + 2x - 1",
    "Find the area of a circle with radius 5 cm",
]

# Print the questions as a 1-based numbered list, each preceded by a blank line.
for number, question in enumerate(sample_questions, start=1):
    print(f"\n{number}. {question}")

# Closing summary. The problem count is read from the dataset rather than
# hard-coded, so the message stays correct when problems are added or removed.
print("\n" + "="*60)
print("✅ NOTEBOOK 1 COMPLETE!")
print("="*60)
print("\n📝 WHAT YOU DID:")
print(f"   - Created a knowledge base with {len(math_dataset)} math problems")
print("   - Saved it to JSON format")
print("   - Prepared sample test questions")
print("\n🔜 NEXT: Move to Notebook 2 (RAG System with Qdrant)")
print("="*60)

Defaulting to user installation because normal site-packages is not writeable
✅ Created dataset with 20 math problems
✅ Saved to 'd:\3. SELF STUDY\2. AI-ML\5. PROJECT\13. MathAI (AI Planets)\data\math_knowledge_base.json'

📊 Dataset Overview:
          id                                           question  \
0    alg_001                           Solve for x: 2x + 5 = 13   
1    alg_002                 Factor the expression: x² + 5x + 6   
2    alg_003     Solve the quadratic equation: x² - 7x + 12 = 0   
3   calc_001         Find the derivative of f(x) = 3x² + 2x - 1   
4   calc_002                             Integrate: ∫(2x + 3)dx   
5   geom_001         Find the area of a circle with radius 5 cm   
6   geom_002  Find the volume of a cylinder with radius 3 cm...   
7   trig_001  If sin(θ) = 3/5 and θ is in the first quadrant...   
8   trig_002    Simplify: sin(2x) in terms of sin(x) and cos(x)   
9   stat_001     Find the mean of the dataset: 4, 7, 2, 9, 5, 3   
10  stat_002        


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\brije\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
