In [43]:
import json
import time
import re
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from tqdm import tqdm

In [44]:
# Read the whole knowledge base file into one in-memory string and report
# its size in characters.
with open("knowledge_base.md", mode="r", encoding="utf-8") as kb_file:
    full_kb = kb_file.read()

kb_char_count = len(full_kb)
print(f"Loaded knowledge base ({kb_char_count} characters)")

Loaded knowledge base (619144 characters)


In [45]:
# Load the generated questions. The file holds a list of context groups,
# each with a "questions" list. It is read explicitly as UTF-8 (matching
# the knowledge-base cell) so decoding does not depend on the platform's
# locale encoding.
with open("generated_questions_smaller_set.json", "r", encoding="utf-8") as f:
    all_questions = json.load(f)

print(f"Loaded {sum(len(ctx['questions']) for ctx in all_questions)} total questions")

Loaded 1345 total questions


In [46]:
# Load the merged contexts. Read explicitly as UTF-8 so decoding does not
# depend on the platform's locale encoding.
with open("merged_contexts.json", "r", encoding="utf-8") as f:
    contexts = json.load(f)

In [47]:
# Report how many merged contexts were loaded.
context_total = len(contexts)
print(f"Loaded {context_total} total contexts")

Loaded 89 total contexts


In [37]:
import getpass
import os

# Never hardcode the key in the notebook. If it is not already provided via
# the environment (or is empty), prompt for it without echoing, instead of
# silently installing a placeholder that only fails later at request time.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

In [38]:
# Gemini chat model used to generate the answers.
# ("gemini-2.0-flash" is the alternative model that was tried here.)
gemini_settings = dict(
    model="gemini-2.0-pro-exp-02-05",
    temperature=0.1,
    max_retries=3,
    top_p=0.95,
    timeout=360,
)
llm = ChatGoogleGenerativeAI(**gemini_settings)

In [39]:
# System message: persona, the full documentation ({full_kb} placeholder) and
# the response rules the model must follow.
system_template = """# You are TurboML Expert - a senior ML engineer specializing in real-time machine learning with TurboML. You answer technical questions using ONLY the documentation provided below.

Documentation:
{full_kb}

## Response Requirements

### 1. Content Accuracy & Documentation Fidelity
- Use EXACT class/method names from docs (e.g., `FeatureEngineering.create_sql_features()`)
- Include verbatim code snippets when available
- Reference specific sections like `Feature Engineering/UDAF` or `BYOM/ONNX`
- Cite documentation with section headers in brackets (e.g., [Feature Engineering - Python UDFs])
- For code references, include source: [File: feature_engineering.py]

### 2. Real-Time ML Focus
- Emphasize TurboML's streaming capabilities: `OnlineDataset`, windowed aggregates, continuous training
- Highlight key differentiators: Ibis integration, ONNX deployment, Python UDF support
- Use official syntax and parameter names exactly as documented

### 3. Response Structure
- **Problem Analysis** (2 to 4 sentences)
- **Step-by-Step Solution** with implementation steps
- **Code Implementation** with properly formatted examples
- **Common Pitfalls & Debugging Tips**
- For conceptual questions, compare 2-3 approaches with pros/cons as bullet points
- For troubleshooting, identify error scenario, root cause, and provide before/after code

### 4. Knowledge Boundaries & Anti-Hallucination
- If the answer is not in the provided documentation context, clearly state: "I cannot answer this question based on the provided context."
- DO NOT attempt to answer questions beyond what's explicitly in the documentation
- NO assumptions or extrapolations beyond the documentation
- NO generic advice unless specifically mentioned in Context
- If uncertain about any aspect, acknowledge limitations instead of guessing

### 5. Documentation Navigation
- Guide users to relevant documentation sections
- Explain parameter meanings and default values from documentation
- When referencing implementation steps, follow TurboML's workflow: Data ingestion → Feature engineering → Model training → Deployment → Monitoring
"""

# Human message: carries the single question to answer ({question} placeholder).
human_template = "Question: {question}\n\nGenerate comprehensive answer:"

answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_template),
        ("human", human_template),
    ]
)

In [42]:
# Generate an answer for every question, checkpointing progress to disk so an
# interrupted run can resume exactly where it stopped (same group, same
# question) without re-asking or duplicating already answered questions.

# Create checkpoints directory if it doesn't exist
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Single checkpoint filename that will be overwritten
checkpoint_filename = os.path.join(checkpoint_dir, "current_checkpoint.json")

checkpoint_frequency = 10   # write a checkpoint after every N successful answers
RATE_LIMIT_SECONDS = 11     # pause after every API attempt, successful or not
CODE_KEYWORDS = ("code", "implement", "write")  # questions expected to contain code


def _save_checkpoint(path, answers, failed_questions, next_group_idx,
                     next_question_idx, answer_count):
    """Atomically persist the current run state to ``path``.

    The payload is written to a sibling temporary file first and then moved
    over ``path`` with ``os.replace``, so an interruption during the write
    cannot leave a truncated checkpoint behind.

    Args:
        path: Destination checkpoint file.
        answers: List of per-context result entries collected so far.
        failed_questions: List of failure records collected so far.
        next_group_idx: Index into ``all_questions`` of the group to resume in.
        next_question_idx: Index of the first unprocessed question in that group.
        answer_count: Number of successfully answered questions so far.
    """
    payload = {
        "answers": answers,
        "failed_questions": failed_questions,
        "next_group_idx": next_group_idx,
        "next_question_idx": next_question_idx,
        "answer_count": answer_count,
        "timestamp": time.time_ns(),
    }
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
    os.replace(tmp_path, path)


# Check if there's a previous checkpoint to resume from
if os.path.exists(checkpoint_filename):
    with open(checkpoint_filename, "r", encoding="utf-8") as f:
        checkpoint_data = json.load(f)
    answers = checkpoint_data["answers"]
    failed_questions = checkpoint_data["failed_questions"]
    start_group_idx = checkpoint_data.get("next_group_idx", 0)
    # Position inside the resumed group; without it a mid-group checkpoint
    # would cause the group's first questions to be answered twice.
    start_question_idx = checkpoint_data.get("next_question_idx", 0)
    answer_count = checkpoint_data.get("answer_count", 0)
    print(f"Resuming from checkpoint: {checkpoint_filename}")
    print(f"Completed {answer_count} answers so far")
else:
    answers = []
    failed_questions = []
    start_group_idx = 0
    start_question_idx = 0
    answer_count = 0

# Process all questions
total_answer_count = answer_count

for ctx_idx, ctx_group in enumerate(tqdm(all_questions[start_group_idx:], desc="Processing context groups", initial=start_group_idx, total=len(all_questions))):
    global_ctx_idx = ctx_idx + start_group_idx

    # Only the first group after a resume can be partially processed.
    first_q_idx = start_question_idx if ctx_idx == 0 else 0

    # Result entry for this group. When resuming mid-group, keep appending to
    # the partial entry the checkpoint already holds for the same context.
    group_entry = None
    if first_q_idx > 0 and answers and answers[-1]["context_id"] == ctx_group["context_id"]:
        group_entry = answers[-1]

    for q_idx, question in enumerate(ctx_group["questions"]):
        if q_idx < first_q_idx:
            continue  # already handled before the checkpoint was written

        try:
            # Generate answer. format_messages() keeps the system/human roles;
            # format() would flatten the template into one plain string and
            # the system prompt would be sent as part of a human message.
            response = llm.invoke(
                answer_prompt.format_messages(
                    full_kb=full_kb,
                    question=question
                )
            )
            answer_text = str(response.content)

            # Extract referenced sections (any text inside square brackets)
            sections_found = list(set(re.findall(r"\[(.*?)\]", answer_text)))

            # Validate code presence if question requires it
            if any(q_word in question.lower() for q_word in CODE_KEYWORDS):
                if "```python" not in answer_text:
                    raise ValueError("Code question missing code block")

            record = {
                "question": question,
                "answer": answer_text.strip(),
                "referenced_sections": sections_found,
                # NOTE(review): confirm the Gemini integration actually fills
                # "token_usage" in response_metadata; otherwise this stays {}.
                "token_usage": response.response_metadata.get("token_usage", {}),
                "safety_ratings": response.response_metadata.get("safety_ratings", []),
                # Wall-clock timestamp (ns) of when the answer was recorded
                "generation_time": time.time_ns()
            }
        except Exception as e:
            # Best effort: record the failure and move on to the next question
            failed_questions.append({
                "question": question,
                "error": str(e),
                "context_group": ctx_group["context_id"]
            })
            print(f"\nError processing question: {str(e)}")
        else:
            # Create the group's entry lazily so groups without any successful
            # answer do not leave an empty entry in the results.
            if group_entry is None:
                group_entry = {
                    "context_id": ctx_group["context_id"],
                    "base_sections": ctx_group["base_sections"],
                    "answers": []
                }
                answers.append(group_entry)
            group_entry["answers"].append(record)
            total_answer_count += 1

            # Create checkpoint at every Nth answer (overwriting previous one)
            if total_answer_count % checkpoint_frequency == 0:
                _save_checkpoint(
                    checkpoint_filename,
                    answers,
                    failed_questions,
                    next_group_idx=global_ctx_idx,
                    next_question_idx=q_idx + 1,
                    answer_count=total_answer_count,
                )
                print(f"\nCheckpoint updated at {checkpoint_filename} ({total_answer_count} answers processed)")

        # Rate limiting applies to failed attempts too, so errors such as
        # quota responses do not trigger an immediate burst of further calls.
        time.sleep(RATE_LIMIT_SECONDS)

    # Always advance the checkpoint at the end of a group, even if no answer
    # is pending, so a resume never reprocesses a finished group.
    _save_checkpoint(
        checkpoint_filename,
        answers,
        failed_questions,
        next_group_idx=global_ctx_idx + 1,  # Move to next group
        next_question_idx=0,
        answer_count=total_answer_count,
    )

# Save final results
with open("full_answers.json", "w", encoding="utf-8") as f:
    json.dump(answers, f, indent=2)

# Save failed questions separately
if failed_questions:
    with open("failed_questions.json", "w", encoding="utf-8") as f:
        json.dump(failed_questions, f, indent=2)

print(f"""
Answer Generation Complete!
Successfully answered: {total_answer_count} questions
Failed questions: {len(failed_questions)}
Full results saved to: full_answers.json
""")

Resuming from checkpoint: checkpoints\current_checkpoint.json
Completed 1060 answers so far


Processing context groups:  79%|███████▊  | 70/89 [00:00<?, ?it/s]


Checkpoint updated at checkpoints\current_checkpoint.json (1070 answers processed)


Processing context groups:  80%|███████▉  | 71/89 [07:01<2:06:34, 421.92s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1080 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1090 answers processed)


Processing context groups:  81%|████████  | 72/89 [13:35<1:54:54, 405.54s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1100 answers processed)


Processing context groups:  82%|████████▏ | 73/89 [20:34<1:49:45, 411.56s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1110 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1120 answers processed)


Processing context groups:  83%|████████▎ | 74/89 [27:52<1:45:30, 422.00s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1130 answers processed)


Processing context groups:  84%|████████▍ | 75/89 [35:25<1:41:00, 432.93s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1140 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1150 answers processed)


Processing context groups:  85%|████████▌ | 76/89 [44:32<1:42:15, 471.96s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1160 answers processed)


Processing context groups:  87%|████████▋ | 77/89 [52:11<1:33:30, 467.56s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1170 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1180 answers processed)


Processing context groups:  88%|████████▊ | 78/89 [1:01:07<1:29:43, 489.43s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1190 answers processed)


Processing context groups:  89%|████████▉ | 79/89 [1:09:28<1:22:10, 493.03s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1200 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1210 answers processed)


Processing context groups:  90%|████████▉ | 80/89 [1:16:13<1:09:53, 465.98s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1220 answers processed)


Processing context groups:  91%|█████████ | 81/89 [1:23:06<59:57, 449.69s/it]  


Checkpoint updated at checkpoints\current_checkpoint.json (1230 answers processed)


Processing context groups:  92%|█████████▏| 82/89 [1:31:06<53:32, 459.00s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1240 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1250 answers processed)


Processing context groups:  93%|█████████▎| 83/89 [1:38:59<46:18, 463.10s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1260 answers processed)

Error processing question: Code question missing code block


Processing context groups:  94%|█████████▍| 84/89 [1:45:56<37:25, 449.15s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1270 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1280 answers processed)


Processing context groups:  96%|█████████▌| 85/89 [1:52:32<28:53, 433.32s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1290 answers processed)


Processing context groups:  97%|█████████▋| 86/89 [2:00:27<22:17, 445.83s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1300 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1310 answers processed)


Processing context groups:  98%|█████████▊| 87/89 [2:07:35<14:40, 440.38s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1320 answers processed)


Processing context groups:  99%|█████████▉| 88/89 [2:15:20<07:27, 447.79s/it]


Checkpoint updated at checkpoints\current_checkpoint.json (1330 answers processed)

Checkpoint updated at checkpoints\current_checkpoint.json (1340 answers processed)


Processing context groups: 100%|██████████| 89/89 [2:21:39<00:00, 447.36s/it]


Answer Generation Complete!
Successfully answered: 1343 questions
Failed questions: 10
Full results saved to: full_answers.json




