In [None]:
%pip install -q sentence-transformers fastapi uvicorn nest-asyncio pyngrok torch pinecone PyGithub python-frontmatter requests

In [None]:
from google.colab import userdata

In [None]:
# Read the ngrok auth token from Colab's userdata store so it is not hardcoded in the notebook.
NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')
# Hand the token to the ngrok CLI; IPython substitutes $NGROK_AUTH_TOKEN with the Python variable above.
!ngrok config add-authtoken $NGROK_AUTH_TOKEN

In [None]:
import nest_asyncio
from pyngrok import ngrok
from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
import uvicorn
import threading

# Patch asyncio via nest_asyncio before the server pieces are created
# (presumably so event loops can nest inside the notebook's own loop — TODO confirm this
# is still required, since uvicorn is started in a separate thread further down).
nest_asyncio.apply()

# Load embedding model
# The 768 printed here has to agree with the `dimension=768` used when the Pinecone index is created.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
print("‚úÖ Model loaded (768 dimensions)")

# Create FastAPI app
# Module-level so the route decorators below can register their handlers on it.
app = FastAPI()

# Request body for POST /embed: a JSON object with a "texts" array of strings.
class EmbedRequest(BaseModel):
    texts: list[str]  # handed to model.encode() as one batch by embed_texts

@app.get("/")
async def root():
    # Status endpoint: reports readiness plus the model name and vector size
    # so clients can sanity-check what they are talking to.
    service_info = dict(
        status="ready",
        model="all-mpnet-base-v2",
        dimensions=768,
    )
    return service_info
@app.post("/embed")
def embed_texts(request: EmbedRequest):
    """Embed a batch of texts and return one vector per input text.

    Returns a JSON object with ``embeddings`` (a list of float lists produced by
    ``model.encode(..., normalize_embeddings=True)``) and ``count`` (how many
    vectors are in the response).
    """
    # Plain `def`, not `async def`: model.encode() is a blocking call, and FastAPI
    # runs non-async path functions in a worker thread, so the event loop stays free
    # to answer other requests (e.g. the "/" status check) while a batch is encoded.
    if not request.texts:
        # Nothing to encode: reply in the usual shape without invoking the model.
        return {"embeddings": [], "count": 0}
    embeddings = model.encode(request.texts, normalize_embeddings=True).tolist()
    return {"embeddings": embeddings, "count": len(embeddings)}

# Kill old tunnels
# (presumably so that re-running this cell does not stack tunnels from earlier runs — confirm)
ngrok.kill()

# Create tunnel
# Targets local port 8000, the same port uvicorn is told to bind in run_server below.
# Note the tunnel is opened before the server thread is started.
tunnel = ngrok.connect(8000)

# Keep only the public URL string from the tunnel object; later cells build
# request URLs from it with f-strings.
EMBEDDING_API_URL = tunnel.public_url

print(f"\nüöÄ Embedding API Ready!")
print(f"üì° Public URL: {EMBEDDING_API_URL}")
print(f"üìö Docs: {EMBEDDING_API_URL}/docs")
print(f"\n‚ö†Ô∏è  IMPORTANT: Copy the URL above and use it in Cell 4 if needed\n")

# Start server in background
def run_server():
    """Run the FastAPI app with uvicorn on 0.0.0.0:8000, logging errors only."""
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="error")

# Run uvicorn on a daemon thread so this cell returns instead of being occupied by the server.
# NOTE(review): re-running this cell starts another thread aimed at port 8000 while the
# earlier one is presumably still serving — confirm how that failure surfaces, given
# log_level="error" keeps the server quiet.
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()

print("‚úÖ Server running in background")
print(f"‚úÖ EMBEDDING_API_URL = {EMBEDDING_API_URL}")


In [None]:
# ============ CONFIGURATION ============
# GitHub repo settings
# Secrets are read from Colab's userdata store rather than written into the notebook.
GITHUB_TOKEN = userdata.get('GITHUB_PAT')
GITHUB_REPO = "FairArena/FairArena-Docs"
DOCS_PATH = "content/docs"  # repo-relative root passed to fetch_all_docs
# NOTE(review): the secret is named 'PINECODE_…' (not 'PINECONE_…'). The string must match
# the name stored in Colab secrets, so rename both together if the spelling is corrected.
PINECODE_DB_API_KEY = userdata.get('PINECODE_DB_API_KEY')

# Pinecone settings (Get from https://app.pinecone.io/)
PINECONE_API_KEY = PINECODE_DB_API_KEY  # alias of the secret above under the name the pipeline cells use
PINECONE_INDEX_NAME = "fairarena-docs-768"

# Use embedding URL from previous cell
# (EMBEDDING_API_URL is assigned in the server cell; this print raises NameError if that cell has not run)
print(f"Using Embedding API: {EMBEDDING_API_URL}")
print(f"Pinecone Index: {PINECONE_INDEX_NAME}")

In [None]:
import requests
import frontmatter
from github import Github
from pinecone import Pinecone, ServerlessSpec
from typing import List, Dict
import time

# ============ STEP 1: Fetch All Docs from GitHub (WITH AUTH) =============
def fetch_all_docs(repo_name: str, docs_path: str, github_token: str) -> List[Dict]:
    """Recursively fetch all .mdx and .md files below ``docs_path``.

    Args:
        repo_name: Repository in ``owner/name`` form.
        docs_path: Repo-relative directory (or single file) to start from.
        github_token: Personal access token; a falsy value falls back to an
            unauthenticated client.

    Returns:
        One dict per Markdown file with keys ``path``, ``name``, ``content``
        (file text decoded as UTF-8), ``sha`` and ``url`` (the file's html_url).
    """
    # Report the mode actually in use; the output is unchanged when a token is supplied.
    auth_mode = "authenticated" if github_token else "unauthenticated"
    print(f"üì• Fetching files from GitHub ({auth_mode})...")

    # Use token for authentication (5000 requests/hour vs 60 without)
    g = Github(github_token) if github_token else Github()
    repo = g.get_repo(repo_name)

    all_files = []

    def traverse(path: str):
        contents = repo.get_contents(path)
        # get_contents() returns a single ContentFile (not a list) when `path`
        # names a file, so normalize before iterating.
        if not isinstance(contents, list):
            contents = [contents]
        for item in contents:
            if item.type == "dir":
                traverse(item.path)
            elif item.path.endswith(('.mdx', '.md')):
                all_files.append({
                    'path': item.path,
                    'name': item.name,
                    'content': item.decoded_content.decode('utf-8'),
                    'sha': item.sha,
                    'url': item.html_url
                })
                print(f"  ‚úì {item.path}")

    traverse(docs_path)
    return all_files

# ============ STEP 2: Parse MDX Files =============
def _strip_markdown_suffix(name: str) -> str:
    """Drop a trailing ``.mdx``/``.md`` extension; other occurrences are left alone."""
    for suffix in ('.mdx', '.md'):
        if name.endswith(suffix):
            return name[:-len(suffix)]
    return name


def parse_mdx_file(file_data: Dict) -> Dict:
    """Extract frontmatter and clean content.

    Args:
        file_data: Dict with keys ``path``, ``name``, ``content``, ``sha`` and
            ``url`` (the shape produced by fetch_all_docs).

    Returns:
        Dict with ``id`` (first 12 chars of the sha), ``title``, ``description``,
        ``content``, ``file_path``, ``url`` (docs-site URL), ``github_url`` and the
        raw frontmatter ``metadata``.
    """
    try:
        doc = frontmatter.loads(file_data['content'])
        metadata = doc.metadata
        content = doc.content.strip()
    except Exception:
        # Best effort: a file whose frontmatter cannot be parsed is still indexed,
        # using its raw text and no metadata. (Not a bare `except`, so
        # KeyboardInterrupt/SystemExit still propagate.)
        metadata = {}
        content = file_data['content']

    # Clean path for URL: drop the docs-root prefix and the Markdown extension only
    # where they actually are a prefix/suffix, never in the middle of the path.
    docs_prefix = 'content/docs/'
    clean_path = file_data['path']
    if clean_path.startswith(docs_prefix):
        clean_path = clean_path[len(docs_prefix):]
    clean_path = _strip_markdown_suffix(clean_path)

    return {
        'id': file_data['sha'][:12],  # Unique ID
        # `or` (not a .get default) so an empty/None frontmatter value also falls back.
        'title': metadata.get('title') or _strip_markdown_suffix(file_data['name']),
        'description': metadata.get('description') or '',
        'content': content,
        'file_path': file_data['path'],
        'url': f"https://docs.fairarena.in/{clean_path}",
        'github_url': file_data['url'],
        'metadata': metadata
    }

# ============ STEP 3: Generate Embeddings =============
def _request_embeddings(batch: List[str], timeout: float = 60) -> List[List[float]]:
    """POST one batch to the embedding API and return its vectors, one per text."""
    response = requests.post(
        f"{EMBEDDING_API_URL}/embed",
        json={"texts": batch},
        timeout=timeout
    )
    response.raise_for_status()
    embeddings = response.json()['embeddings']
    # Callers pair vectors with documents by position, so a short or long
    # response must not be accepted silently.
    if len(embeddings) != len(batch):
        raise ValueError(
            f"Embedding API returned {len(embeddings)} vectors for {len(batch)} texts"
        )
    return embeddings


def generate_embeddings_batch(texts: List[str], batch_size: int = 10,
                              max_retries: int = 1, retry_delay: float = 2.0) -> List[List[float]]:
    """Generate embeddings in batches via API.

    Args:
        texts: Texts to embed; the result has one vector per text, in order.
        batch_size: Number of texts sent per request (must be >= 1).
        max_retries: Extra attempts per batch after the first failure
            (the default of 1 matches the previous "retry once" behavior).
        retry_delay: Seconds to wait before each retry.

    Returns:
        List of embedding vectors aligned with ``texts``; ``[]`` for empty input.

    Raises:
        ValueError: If ``batch_size`` is not positive, or (after retries) the API
            keeps returning a different number of vectors than texts sent.
        Exception: The last request error once all attempts for a batch fail.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    attempts = max(int(max_retries), 0) + 1
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        for attempt in range(attempts):
            try:
                embeddings = _request_embeddings(batch)
                break
            except Exception as e:
                print(f"  ‚úó Error on batch {i}: {e}")
                if attempt == attempts - 1:
                    raise  # out of attempts: surface the last error to the caller
                time.sleep(retry_delay)

        all_embeddings.extend(embeddings)
        print(f"  ‚úì Embedded {i + len(batch)}/{len(texts)} files")
        time.sleep(0.5)  # Rate limit

    return all_embeddings

# ============ STEP 4: Upload to Pinecone =============
def upload_to_pinecone(docs: List[Dict], embeddings: List[List[float]]):
    """Create the index if needed and upsert one vector per document.

    Args:
        docs: Parsed documents with keys ``id``, ``title``, ``description``,
            ``content``, ``file_path``, ``url`` and ``github_url``.
        embeddings: Vectors aligned positionally with ``docs``.

    Raises:
        ValueError: If ``docs`` and ``embeddings`` differ in length; pairing them
            anyway would silently drop the tail of the longer list.
    """
    if len(docs) != len(embeddings):
        raise ValueError(
            f"Got {len(docs)} documents but {len(embeddings)} embeddings; "
            "refusing to upload misaligned vectors"
        )

    print("\n‚òÅÔ∏è  Connecting to Pinecone...")

    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Create index if doesn't exist
    if PINECONE_INDEX_NAME not in pc.list_indexes().names():
        print(f"  Creating index '{PINECONE_INDEX_NAME}'...")
        pc.create_index(
            name=PINECONE_INDEX_NAME,
            dimension=768,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
        print("  ‚è≥ Waiting for index to initialize...")
        # Poll the index status rather than sleeping a fixed interval; stop waiting
        # after about a minute and carry on, as the fixed sleep used to.
        deadline = time.monotonic() + 60
        while time.monotonic() < deadline:
            if pc.describe_index(PINECONE_INDEX_NAME).status['ready']:
                break
            time.sleep(1)

    index = pc.Index(PINECONE_INDEX_NAME)

    # Prepare vectors; metadata text is truncated to keep each record small.
    vectors = [
        {
            'id': doc['id'],
            'values': embedding,
            'metadata': {
                'title': doc['title'],
                'description': doc['description'][:500] if doc['description'] else '',
                'content': doc['content'][:1000],  # First 1000 chars
                'file_path': doc['file_path'],
                'url': doc['url'],
                'github_url': doc['github_url']
            }
        }
        for doc, embedding in zip(docs, embeddings)
    ]

    # Upload in modest batches so each upsert request stays small
    batch_size = 50
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        index.upsert(vectors=batch)
        print(f"  ‚úì Uploaded {min(i + batch_size, len(vectors))}/{len(vectors)} vectors")
        time.sleep(1)  # Small delay between batches

    print(f"\n‚úÖ Index stats: {index.describe_index_stats()}")

# ============ RUN PIPELINE =============
print("üöÄ Starting FairArena Docs Pipeline\n")
print("=" * 60)

# Verify config
# The keys are read via userdata.get() in the configuration cell, so the template
# placeholders below are not what a missing value looks like; an empty or None
# value is treated as "not configured" as well.
if not EMBEDDING_API_URL:
    raise ValueError("‚ùå EMBEDDING_API_URL not set! Rerun Cell 3 first.")
if not PINECONE_API_KEY or PINECONE_API_KEY == "YOUR_PINECONE_API_KEY":
    raise ValueError("‚ùå Update PINECONE_API_KEY in Cell 4!")
if not GITHUB_TOKEN or GITHUB_TOKEN == "ghp_YOUR_GITHUB_TOKEN_HERE":
    print("‚ö†Ô∏è  WARNING: No GitHub token provided. Using unauthenticated (rate limited).")
    # fetch_all_docs builds an unauthenticated client when the token is falsy.
    GITHUB_TOKEN = None

# Step 1: Fetch (WITH TOKEN)
# GITHUB_TOKEN may have been reset to None by the config check above, in which case
# fetch_all_docs builds an unauthenticated client.
docs_raw = fetch_all_docs(GITHUB_REPO, DOCS_PATH, GITHUB_TOKEN)
print(f"‚úÖ Found {len(docs_raw)} files\n")

# Step 2: Parse
print("üìù Parsing MDX files...")
docs_parsed = [parse_mdx_file(f) for f in docs_raw]
print(f"‚úÖ Parsed {len(docs_parsed)} documents\n")

# Step 3: Generate embeddings
# Each document is embedded as "title. description. content"; the content is not truncated here.
print("üß† Generating embeddings...")
texts_to_embed = [f"{doc['title']}. {doc['description']}. {doc['content']}" for doc in docs_parsed]
embeddings = generate_embeddings_batch(texts_to_embed)
print(f"‚úÖ Generated {len(embeddings)} embeddings\n")

# Step 4: Upload to Pinecone
# docs_parsed and embeddings are paired by position inside upload_to_pinecone.
upload_to_pinecone(docs_parsed, embeddings)

print("\n" + "=" * 60)
print("üéâ PIPELINE COMPLETE!")
print(f"üìä Total files indexed: {len(docs_parsed)}")
print(f"üìÅ Index name: {PINECONE_INDEX_NAME}")
print(f"üîç Ready for semantic search!")

In [None]:
# ============ FETCH SPECIALIZED FILES AND FOLDERS ============
def fetch_multiple_file_types(repo_name: str, github_token: str, rate_limit_delay: float = 0.5) -> Dict[str, List[Dict]]:
    """
    Fetch multiple file types and entire folders from repo
    Implements rate limiting to avoid hitting GitHub API limits

    Args:
        repo_name: Repository in ``owner/name`` form.
        github_token: Personal access token; a falsy value falls back to an
            unauthenticated client.
        rate_limit_delay: Seconds slept before handling each directory entry.

    Returns:
        Dict mapping a category name (``postman``, ``prisma``, ``config_files``,
        ``docker_files``, ``shell_scripts``, ``yaml_files``, ``husky``,
        ``github_workflows``, ``vscode``) to a list of file records. Each record
        has keys ``path``, ``name``, ``content``, ``sha``, ``url`` and ``category``.
    """
    print(f"üì• Fetching specialized files and folders from {repo_name}...")
    print(f"   (Rate limit delay: {rate_limit_delay}s between requests)\n")

    g = Github(github_token) if github_token else Github()
    repo = g.get_repo(repo_name)

    results = {
        'postman': [],
        'prisma': [],
        'config_files': [],
        'docker_files': [],
        'shell_scripts': [],
        'yaml_files': [],
        'husky': [],
        'github_workflows': [],
        'vscode': []
    }

    # Define folders to fetch entirely
    entire_folders = {
        'postman': 'Backend/postman',
        'prisma': 'Backend/prisma',
        'husky': '.husky',
        'github_workflows': '.github',
        'vscode': '.vscode',
    }

    # File patterns to search for (excluding pnpm-lock.yaml)
    patterns = {
        'package.json': 'config_files',
        'package-lock.json': 'config_files',
        'dockerfile': 'docker_files',
        'docker-compose': 'docker_files',
        '.dockerignore': 'docker_files',
        '.sh': 'shell_scripts',
        '.yaml': 'yaml_files',
        '.yml': 'yaml_files',
    }

    # Dependency/build/cache directories that are never descended into.
    skip_dirs = {
        'node_modules', '.git', 'dist', 'build', '.next',
        '__pycache__', 'coverage', '.pytest_cache', '.turbo'
    }
    # Folders already collected wholesale in step 1. The pattern scan skips them
    # both by name (as before) and by full path, so nested folders such as
    # Backend/prisma are not collected a second time under another category.
    already_fetched_names = {'.husky', '.github', '.vscode'}
    already_fetched_paths = set(entire_folders.values())
    # Upper bound on "wait 60s and retry" rounds per directory listing.
    max_rate_limit_retries = 3

    def list_dir(path: str):
        """Return the entries at `path` as a list (a file path yields a one-item list)."""
        contents = repo.get_contents(path)
        return contents if isinstance(contents, list) else [contents]

    def build_record(item, category: str) -> Dict:
        """Read one file's text and package it with its identifying fields."""
        return {
            'path': item.path,
            'name': item.name,
            'content': item.decoded_content.decode('utf-8'),
            'sha': item.sha,
            'url': item.html_url,
            'category': category
        }

    def match_category(item):
        """Return the category of the first pattern matching the file, else None."""
        file_lower = item.name.lower()
        file_path_lower = item.path.lower()
        for pattern, category in patterns.items():
            if pattern.lower() in file_path_lower or file_lower.endswith(pattern.lower()):
                return category
        return None

    def traverse_and_fetch(path: str, results_key: str, folder_name: str = None):
        """Traverse a directory and fetch all files with rate limiting"""
        try:
            for item in list_dir(path):
                time.sleep(rate_limit_delay)  # RATE LIMITING

                if item.type == "dir":
                    if item.name not in skip_dirs:
                        traverse_and_fetch(item.path, results_key, folder_name)
                    continue

                try:
                    results[results_key].append(build_record(item, folder_name or results_key))
                    print(f"    ‚úì {item.path}")
                except Exception:
                    # Best effort: unreadable files (binary, too large, ...) are reported and skipped.
                    print(f"    ‚ö†Ô∏è  Could not read {item.path}")
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Error accessing {path}: {str(e)[:50]}")

    # STEP 1: Fetch entire folders
    print("üìÅ Fetching entire folders:\n")
    for folder_key, folder_path in entire_folders.items():
        try:
            print(f"  {folder_key.upper()} ({folder_path}):")
            traverse_and_fetch(folder_path, folder_key, folder_key)
            time.sleep(1)  # Delay between folder fetches
        except Exception:
            print(f"  ‚ö†Ô∏è  Folder not found: {folder_path}\n")

    # STEP 2: Fetch specific file types across entire repo
    print("\nüìÑ Fetching specific file types:\n")

    def traverse_for_patterns(path: str, depth: int = 0):
        """Traverse repo and match file patterns"""
        if depth > 20:  # Prevent infinite recursion
            return

        # List the directory, waiting and retrying a bounded number of times when
        # rate limited. Nothing has been collected for `path` at this point, so a
        # retry cannot produce duplicates.
        contents = None
        for attempt in range(max_rate_limit_retries + 1):
            try:
                contents = list_dir(path)
                break
            except Exception as e:
                if "API rate limit exceeded" in str(e) and attempt < max_rate_limit_retries:
                    print("\n‚ö†Ô∏è  RATE LIMIT WARNING!")
                    print("   Waiting 60 seconds before retrying...")
                    time.sleep(60)
                else:
                    print(f"  Error at {path}: {str(e)[:50]}")
                    return

        try:
            for item in contents:
                time.sleep(rate_limit_delay)  # RATE LIMITING

                if item.type == "dir":
                    if (item.name not in skip_dirs
                            and item.name not in already_fetched_names
                            and item.path not in already_fetched_paths):
                        traverse_for_patterns(item.path, depth + 1)
                    continue

                # Skip pnpm-lock.yaml (too large)
                if 'pnpm-lock.yaml' in item.path.lower():
                    print(f"    ‚äò {item.path} (skipped - too large)")
                    continue

                # Check patterns
                category = match_category(item)
                if category is None:
                    continue
                try:
                    results[category].append(build_record(item, category))
                    print(f"    ‚úì {item.path}")
                except Exception:
                    print(f"    ‚ö†Ô∏è  Could not read {item.path}")
        except Exception as e:
            print(f"  Error at {path}: {str(e)[:50]}")

    print("  Scanning for config, build, and script files...")
    traverse_for_patterns("")

    # Summary
    print("\n" + "="*70)
    print("üìä FILES COLLECTED:")
    print("="*70)
    for key, items in results.items():
        if items:
            print(f"  ‚Ä¢ {key.replace('_', ' ').title()}: {len(items)} files")

    total = sum(len(items) for items in results.values())
    print(f"\n  TOTAL: {total} files")
    print("="*70 + "\n")

    return results

# Execute fetch (with 0.5s delay between requests to be safe)
# NOTE(review): GITHUB_REPO_MAIN and GITHUB_TOKEN_MAIN are not assigned in any cell visible
# here (the configuration cell defines GITHUB_REPO / GITHUB_TOKEN). On a fresh kernel this
# call raises NameError unless another cell defines them — confirm where they come from.
print("üöÄ Starting comprehensive repository scan...\n")
all_specialized_files = fetch_multiple_file_types(
    GITHUB_REPO_MAIN,
    GITHUB_TOKEN_MAIN,
    rate_limit_delay=0.5  # Adjust if needed: 0.3-1.0 is safe
)

# ============ COMBINE ALL FILES ============
# Collapse the per-category lists into a single flat list, keeping category order
all_repo_files = [
    file_record
    for category_files in all_specialized_files.values()
    for file_record in category_files
]

print(f"‚úÖ Total files to process: {len(all_repo_files)}")


In [None]:
# ============ PARSE AND EMBED SPECIALIZED FILES ============
def parse_specialized_file(file_data: Dict) -> Dict:
    """Turn a fetched config/script file record into an indexable document.

    Reads ``sha``, ``path``, ``category``, ``content``, ``url`` and ``name`` from
    ``file_data`` and returns a dict with ``id`` (first 12 sha chars), ``title``
    (last path segment), ``description``, ``content`` (capped at 5000 chars),
    ``file_path``, ``url``, ``metadata`` (category and file type) and ``repo_type``.
    """
    short_sha = file_data['sha'][:12]
    path = file_data['path']
    category = file_data['category']
    body = file_data['content'][:5000]  # Limit content
    link = file_data['url']

    # File type is whatever follows the last dot in the file name, or 'unknown'
    # when the name has no dot at all.
    _, dot, extension = file_data['name'].rpartition('.')
    file_type = extension if dot else 'unknown'

    parsed = {
        'id': short_sha,
        'title': path.rsplit('/', 1)[-1],
        'description': f"File: {path} | Type: {category}",
        'content': body,
        'file_path': path,
        'url': link,
        'metadata': {'category': category, 'file_type': file_type},
        'repo_type': 'developer',
    }
    return parsed

# Parse every fetched file record into the document shape used for embedding/upload.
print("üìù Parsing specialized files...")
docs_specialized = [parse_specialized_file(f) for f in all_repo_files]
print(f"‚úÖ Parsed {len(docs_specialized)} files\n")

# Text sent to the embedding API: title, description and only the first 2000
# characters of the (already 5000-char-capped) content.
print("üß† Generating embeddings for specialized files...")
texts_to_embed_specialized = [
    f"{doc['title']}. {doc['description']}. {doc['content'][:2000]}"
    for doc in docs_specialized
]

embeddings_specialized = generate_embeddings_batch(texts_to_embed_specialized)
print(f"‚úÖ Generated {len(embeddings_specialized)} embeddings\n")

# Upload to same main repo index
# NOTE(review): PINECONE_INDEX_NAME_MAIN and index_main (used below) are not assigned in any
# cell visible here — on a fresh kernel this section raises NameError unless another cell
# defines them. Confirm where they come from.
print(f"üì§ Uploading to Pinecone index '{PINECONE_INDEX_NAME_MAIN}'...")
vectors_specialized = []
# zip() pairs documents and vectors by position and stops at the shorter list.
for doc, embedding in zip(docs_specialized, embeddings_specialized):
    vectors_specialized.append({
        'id': f"spec_{doc['id']}",  # Prefix with 'spec_'
        'values': embedding,
        # Only identifying fields are stored as metadata; the file content itself is not.
        'metadata': {
            'title': doc['title'][:100],
            'description': doc['description'][:200],
            'file_path': doc['file_path'],
            'url': doc['url'],
            'category': doc['metadata']['category'],
            'file_type': doc['metadata']['file_type'],
            'source': 'FairArena-Specialized'
        }
    })

# Upload in batches
batch_size = 50
for i in range(0, len(vectors_specialized), batch_size):
    batch = vectors_specialized[i:i + batch_size]
    index_main.upsert(vectors=batch)
    print(f"  ‚úì Uploaded {min(i + batch_size, len(vectors_specialized))}/{len(vectors_specialized)}")
    time.sleep(1)  # brief pause between upsert requests

print(f"\n‚úÖ All specialized files indexed!")
