# SFSU Q&A Generation with Ollama (FREE on Google Colab)

This notebook generates question-and-answer (Q&A) training pairs from your scraped SFSU data.

**Features:**
- ✅ FREE (uses Google's free GPU)
- ✅ No PC load/overheating
- ✅ Process ALL 3,867 pages
- ✅ ~60 minutes total time
- ✅ Generate ~10,000 Q&A pairs

**Steps:**
1. Run Cell 1: Install Ollama
2. Run Cell 2: Upload your data file
3. Run Cell 3: Generate Q&A pairs (run both code cells — the first writes `generate_qa.py`, the second executes it)
4. Run Cell 4: Download results

## Cell 1: Install and Start Ollama

In [None]:
# Install Ollama
print("📦 Installing Ollama...")
!curl -fsSL https://ollama.com/install.sh | sh

# Start Ollama server in background
print("\n🚀 Starting Ollama server...")
import subprocess
import time

ollama_process = subprocess.Popen(
    ['ollama', 'serve'],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL
)
time.sleep(10)  # Wait for server to start

# Pull the model
print("⬇️ Downloading llama3.2 model (this may take a few minutes)...")
!ollama pull llama3.2

print("\n✅ Ollama is ready!")
print("GPU Available:", "YES" if subprocess.run(['nvidia-smi'], capture_output=True).returncode == 0 else "NO")

## Cell 2: Upload Your Data File

In [None]:
from google.colab import files
import json
import os
import shutil

print("📤 Please upload your comprehensive_sfsu_crawl.json file")
print("   (Click 'Choose Files' and select the file from your D:\\sfsu-cs-chatbot\\data folder)")
print()

# The returned mapping is keyed by uploaded filename; the files themselves
# are expected in the current working directory (the move below relies on it).
uploaded = files.upload()

# Create data directory
os.makedirs('data', exist_ok=True)

if not uploaded:
    print("⚠️ No file was uploaded. Re-run this cell and choose a file.")

# Move uploaded file to the fixed path the generation script reads from.
# If several files are uploaded, each one overwrites the previous, so the
# last one wins.
for filename in uploaded:
    # Use shutil.move instead of a shell `mv` so filenames containing quotes,
    # `$`, or backticks are treated literally rather than parsed by the shell.
    shutil.move(filename, 'data/comprehensive_sfsu_crawl.json')
    print(f"\n✅ Uploaded: {filename}")

    # Parse the file right away so a malformed upload fails here rather than
    # in the middle of the long generation run.
    with open('data/comprehensive_sfsu_crawl.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"   Pages in file: {len(data)}")

## Cell 3: Generate Q&A Pairs

In [None]:
%%writefile generate_qa.py
import json
import requests
import time

def call_ollama(prompt):
    """Send *prompt* to the local Ollama server and return the generated text.

    Performs a single non-streaming POST to the ``/api/generate`` endpoint on
    ``localhost:11434`` using the ``llama3.2`` model, waiting at most 60
    seconds for the reply.

    Args:
        prompt: The full prompt text to send to the model.

    Returns:
        The ``response`` field of the JSON reply, stripped of surrounding
        whitespace.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError: If the JSON reply has no ``response`` field.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": "llama3.2",
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.7, "num_predict": 1500}
    }
    response = requests.post(url, json=payload, timeout=60)
    # Fail with a descriptive HTTPError on 4xx/5xx replies instead of an
    # opaque KeyError when the error body lacks a 'response' field.
    response.raise_for_status()
    return response.json()['response'].strip()

def extract_qa(page):
    """Ask the model for Q&A pairs about one crawled page.

    Args:
        page: A dict describing a crawled page. The keys read are ``url``,
            ``title`` and ``full_text``; all are optional.

    Returns:
        A list of dicts, each with non-empty ``question`` and ``answer``
        entries and a ``source_url`` equal to the page's URL. Returns an
        empty list when the page text is missing or shorter than 200
        characters, when the model call fails, or when the reply cannot be
        parsed into usable Q&A pairs. This function is best-effort and does
        not raise for those failures.
    """
    url = page.get('url', '')
    title = page.get('title', 'No Title')
    text = page.get('full_text', '')

    if not text or len(text) < 200:
        return []

    # Only the first 3000 characters of the page are sent to the model.
    text = text[:3000]

    prompt = f'''Extract 2-3 natural Q&A pairs from this SFSU content.

Title: {title}
Content: {text}

Return ONLY valid JSON array:
[{{"question": "...", "answer": "...", "source_url": "{url}"}}]

JSON:'''

    try:
        result = call_ollama(prompt)
        # The reply may be wrapped in a Markdown code fence; keep only the
        # fenced payload when one is present.
        if '```json' in result:
            result = result.split('```json')[1].split('```')[0]
        elif '```' in result:
            result = result.split('```')[1].split('```')[0]
        parsed = json.loads(result.strip())
    except Exception:
        # Best-effort: a failed call or unparseable reply just yields no
        # pairs for this page. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit still stop a long run.
        return []

    # Tolerate a single object where an array was requested.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []

    qa_pairs = []
    for item in parsed:
        # Drop anything that is not a usable question/answer record so the
        # caller can safely extend its result list with what we return.
        if not isinstance(item, dict):
            continue
        if not item.get('question') or not item.get('answer'):
            continue
        # The page URL is known here, so set it rather than trusting the
        # model to echo it back correctly (or at all).
        item['source_url'] = url
        qa_pairs.append(item)
    return qa_pairs

def save_pairs(pairs):
    """Write the collected Q&A pairs to the output JSON file."""
    with open('data/qa_training_data.json', 'w', encoding='utf-8') as f:
        json.dump(pairs, f, indent=2)


# Load data
print("Loading data...")
with open('data/comprehensive_sfsu_crawl.json', 'r', encoding='utf-8') as f:
    pages = json.load(f)

# Keep only successfully crawled pages with more than 200 characters of text.
valid = [
    p for p in pages
    if p.get('status') == 'success' and len(p.get('full_text') or '') > 200
]
print(f"Processing {len(valid)} valid pages...\n")

all_qa = []
start = time.time()

try:
    for i, page in enumerate(valid, 1):
        # `or ''` guards against a title that is present but null, which
        # would otherwise raise TypeError on slicing and abort the whole run.
        title = (page.get('title') or '')[:50]
        # Flush so the page being processed is visible while the model is
        # still working; the line has no newline yet and would otherwise sit
        # in the output buffer until the result is printed.
        print(f"[{i}/{len(valid)}] {title}...", end=' ', flush=True)

        qa_pairs = extract_qa(page)
        if qa_pairs:
            all_qa.extend(qa_pairs)
            print(f"✓ +{len(qa_pairs)} (Total: {len(all_qa)})")
        else:
            print("✗")

        # Every 50 pages: report an ETA and checkpoint the results so far.
        if i % 50 == 0:
            elapsed = time.time() - start
            rate = i / elapsed
            remaining = (len(valid) - i) / rate / 60
            print(f"\n[CHECKPOINT] {len(all_qa)} pairs | ETA: {remaining:.0f} min\n")
            save_pairs(all_qa)
finally:
    # Save final: also runs if the loop is interrupted or fails, so pairs
    # gathered since the last checkpoint are not lost.
    save_pairs(all_qa)

print(f"\n✅ Complete! Generated {len(all_qa)} Q&A pairs in {(time.time()-start)/60:.1f} minutes")

In [None]:
# Run the generation
print("🚀 Starting Q&A generation...")
print("This will take ~60 minutes for all pages.\n")

!python generate_qa.py

## Cell 4: Download Results

In [None]:
from google.colab import files
import json

# Show statistics
with open('data/qa_training_data.json', 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

# Collect the distinct source URLs once. Entries that are not dicts or that
# lack a string 'source_url' are skipped instead of raising KeyError.
unique_urls = {
    qa['source_url']
    for qa in qa_data
    if isinstance(qa, dict) and isinstance(qa.get('source_url'), str)
}
# Guard the average against division by zero when nothing was generated.
avg_per_page = len(qa_data) / len(unique_urls) if unique_urls else 0.0

print("📊 Final Statistics:")
print(f"   Total Q&A pairs: {len(qa_data)}")
print(f"   Unique pages: {len(unique_urls)}")
print(f"   Avg per page: {avg_per_page:.2f}")

print("\n📥 Downloading results...")
files.download('data/qa_training_data.json')

print("\n✅ Done! Upload this file to your Supabase database.")