In [None]:
# Install required packages if not already installed
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip unless its distribution is already present.

    Args:
        package: A pip requirement string such as ``"groq"`` or
            ``"pandas==2.2.0"``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    # Function-scope imports keep this helper self-contained.
    import re
    from importlib import metadata

    # Drop any version specifier, extras or environment marker so that only
    # the distribution name is looked up.
    dist_name = re.split(r"[=<>!~\[;\s]", package.strip(), maxsplit=1)[0]

    try:
        # Ask the installed-distribution metadata instead of importing a
        # module: the pip name frequently differs from the import name
        # (e.g. "python-dotenv" is imported as "dotenv"), so an import probe
        # would report such packages as missing and reinstall them each run.
        metadata.version(dist_name)
        print(f"✅ {package} is already installed")
    except metadata.PackageNotFoundError:
        print(f"Installing {package}...")
        # sys.executable targets the interpreter that is running this kernel.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✅ {package} installed successfully")

# Dependencies of this notebook, listed by their pip distribution names
packages = [
    "google-cloud-vision", "PyMuPDF",
    "groq",  # Switched from openai to groq
    "datasets", "pandas", "python-dotenv", "pdfplumber",
]

# Ensure every dependency is available before the later cells import it
for package in packages:
    install_package(package)

# Environment Setup for Groq API

Before running the cells below, make sure you have set up your environment variables:

## 1. **Groq API Key** (Primary extraction method)
- Get your API key from [Groq Console](https://console.groq.com/keys)
- Add `GROQ_API_KEY=your_api_key_here` to your `.env` file

## 2. **Google Cloud Vision API** (Secondary extraction method)
Your project ID is: `105410006241`

**Steps to enable:**
1. Go to [Google Cloud Console](https://console.cloud.google.com/apis/api/vision.googleapis.com/overview?project=105410006241)
2. Click "ENABLE" button to enable the Vision API
3. Create a service account and download the JSON credentials
4. Add `GOOGLE_APPLICATION_CREDENTIALS=path_to_your_credentials.json` to your `.env` file

**Alternative: Skip Google Vision**
If you don't want to set up Google Vision, you can modify the code to only use Groq + fallback heuristics (see cell below).

## 3. **HuggingFace Token** (for uploading datasets)
- Add `HF_TOKEN=your_hf_token_here` to your `.env` file

## **Benefits of using Groq:**
- Much faster inference speed compared to OpenAI
- Cost-effective pricing  
- High-quality text generation with Llama models
- Generous rate limits

In [None]:
# Option: Test with Groq only (no Google Vision required)
import asyncio
from dataset_generator import PDFOutlineExtractor, DatasetCreator
import os
import shutil
from dotenv import load_dotenv
load_dotenv()

async def generate_groq_only_test(
    original_pdf_dir="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\Pdf",
    test_pdf_dir="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\test_pdf_subset",
    test_output_dir="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\groq_only_test",
    test_files=("0.pdf", "1.pdf"),
):
    """Test with only Groq API - no Google Vision needed.

    Copies a few PDFs into a scratch directory and hands that directory to
    ``DatasetCreator.create_dataset``. Any exception raised by the run is
    caught and printed rather than propagated.

    Args:
        original_pdf_dir: Directory holding the full set of source PDFs.
        test_pdf_dir: Scratch directory the selected PDFs are copied into;
            created if missing.
        test_output_dir: Directory passed to ``create_dataset`` as the output
            location; created if missing.
        test_files: File names (relative to ``original_pdf_dir``) to copy.

    The defaults reproduce the paths this notebook was originally written
    with; pass your own values when running on another machine.
    """
    test_files = list(test_files)

    # Create directories if they don't exist
    os.makedirs(test_pdf_dir, exist_ok=True)
    os.makedirs(test_output_dir, exist_ok=True)

    # Copy only the requested PDFs for quick testing
    for pdf_file in test_files:
        src = os.path.join(original_pdf_dir, pdf_file)
        dst = os.path.join(test_pdf_dir, pdf_file)
        if not os.path.exists(src):
            # Report the gap instead of skipping silently, otherwise a wrong
            # source directory only shows up later as an empty dataset.
            print(f"Source PDF not found, skipping: {src}")
            continue
        if not os.path.exists(dst):
            shutil.copy2(src, dst)
            print(f"Copied {pdf_file} to test directory")

    # Initialize extractor with a placeholder Google credentials path.
    # NOTE(review): the original comment says this path "won't be used";
    # confirm PDFOutlineExtractor tolerates a non-existent path.
    extractor = PDFOutlineExtractor(
        groq_api_key=os.getenv("GROQ_API_KEY"),
        google_credentials_path="dummy_path"
    )

    # Create dataset
    creator = DatasetCreator(extractor)

    print("Processing test dataset with Groq only (no Google Vision)...")

    # Deliberately best-effort: a failure is reported, not raised, so the
    # notebook keeps running when Google Vision is not configured.
    try:
        dataset_path = await creator.create_dataset(
            pdf_directory=test_pdf_dir,
            output_directory=test_output_dir
        )
        print(f"Test dataset created at: {dataset_path}")
    except Exception as e:
        print("Note: Some extraction methods failed (expected without Google Vision setup)")
        print(f"Error details: {e}")
        print("This is normal if Google Vision API is not set up.")

# Uncomment the line below to run this test
# await generate_groq_only_test()

In [None]:
# Helper: Open Google Cloud Console to enable Vision API
import webbrowser
import os

def setup_google_vision(project_id="105410006241"):
    """Helper function to set up Google Cloud Vision API.

    Opens two Google Cloud Console pages in the default web browser (the
    Vision API overview page and the service-account page) and prints the
    manual steps to follow.

    Args:
        project_id: Google Cloud project the console links should point at.
            The default is the project this notebook was originally written
            for; pass your own project ID otherwise.
    """
    print("🔧 Setting up Google Cloud Vision API")
    print("=" * 50)

    # Step 1: Enable the API
    enable_url = f"https://console.cloud.google.com/apis/api/vision.googleapis.com/overview?project={project_id}"
    print("1. Enable Vision API:")
    print(f"   Opening: {enable_url}")
    webbrowser.open(enable_url)

    # Step 2: Service-account page, where the JSON key is created
    print("\n2. After enabling the API, create service account credentials:")
    credentials_url = f"https://console.cloud.google.com/iam-admin/serviceaccounts?project={project_id}"
    print(f"   Opening: {credentials_url}")
    webbrowser.open(credentials_url)

    print("\n📝 Next steps:")
    print("1. Click 'ENABLE' on the Vision API page")
    print("2. Create a new service account")
    print("3. Download the JSON key file")
    print("4. Add to your .env file: GOOGLE_APPLICATION_CREDENTIALS=path/to/your/key.json")
    print("\n✅ Once done, you can run all extraction methods!")
    print("❌ Or run the 'Groq only' cell above to skip Google Vision")

# Uncomment to run:
# setup_google_vision()

In [None]:
import asyncio
from dataset_generator import PDFOutlineExtractor, DatasetCreator
import os
from dotenv import load_dotenv
load_dotenv()

async def generate_dataset(
    pdf_directory="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\Pdf",
    output_directory="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\new_json_files",
):
    """Run the extraction pipeline over a PDF directory and print the result path.

    Builds a ``PDFOutlineExtractor`` from the ``GROQ_API_KEY`` and
    ``GOOGLE_APPLICATION_CREDENTIALS`` environment variables, wraps it in a
    ``DatasetCreator`` and awaits ``create_dataset``.

    Args:
        pdf_directory: Directory passed to ``create_dataset`` as the PDF source.
        output_directory: Directory passed to ``create_dataset`` as the output
            location.

    The defaults reproduce the paths this notebook was originally written
    with; pass your own values when running on another machine.
    """
    # Initialize extractor with Groq API; either variable is None when unset.
    extractor = PDFOutlineExtractor(
        groq_api_key=os.getenv("GROQ_API_KEY"),  # Changed from OPENAI_API_KEY
        google_credentials_path=os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    )

    # Create dataset
    creator = DatasetCreator(extractor)

    # Process PDFs and generate JSON files
    dataset_path = await creator.create_dataset(
        pdf_directory=pdf_directory,
        output_directory=output_directory
    )

    print(f"Dataset created at: {dataset_path}")

# Run the full generation. The coroutine is awaited directly at cell level
# (top-level await, as supported in Jupyter) instead of being passed to
# asyncio.run().
await generate_dataset()

In [None]:
# Test with a small subset of PDFs while resolving API issues
import asyncio
from dataset_generator import PDFOutlineExtractor, DatasetCreator
import os
import shutil
from dotenv import load_dotenv
load_dotenv()

async def generate_test_dataset(
    original_pdf_dir="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\Pdf",
    test_pdf_dir="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\test_pdf_subset",
    test_output_dir="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\test_json_files",
    test_files=("0.pdf", "1.pdf", "2.pdf"),
):
    """Run the extraction pipeline on a small subset of PDFs.

    Copies the selected PDFs into a scratch directory, builds a
    ``PDFOutlineExtractor`` from the ``GROQ_API_KEY`` and
    ``GOOGLE_APPLICATION_CREDENTIALS`` environment variables, and awaits
    ``DatasetCreator.create_dataset`` on that directory. Exceptions from the
    pipeline propagate to the caller.

    Args:
        original_pdf_dir: Directory holding the full set of source PDFs.
        test_pdf_dir: Scratch directory the selected PDFs are copied into;
            created if missing.
        test_output_dir: Directory passed to ``create_dataset`` as the output
            location; created if missing.
        test_files: File names (relative to ``original_pdf_dir``) to copy.

    The defaults reproduce the paths this notebook was originally written
    with; pass your own values when running on another machine.
    """
    test_files = list(test_files)

    # Create directories if they don't exist
    os.makedirs(test_pdf_dir, exist_ok=True)
    os.makedirs(test_output_dir, exist_ok=True)

    # Copy only the requested PDFs for testing
    for pdf_file in test_files:
        src = os.path.join(original_pdf_dir, pdf_file)
        dst = os.path.join(test_pdf_dir, pdf_file)
        if not os.path.exists(src):
            # Report the gap instead of skipping silently, otherwise a wrong
            # source directory only shows up later as an empty dataset.
            print(f"Source PDF not found, skipping: {src}")
            continue
        if not os.path.exists(dst):
            shutil.copy2(src, dst)
            print(f"Copied {pdf_file} to test directory")

    # Initialize extractor with Groq API; either variable is None when unset.
    extractor = PDFOutlineExtractor(
        groq_api_key=os.getenv("GROQ_API_KEY"),  # Changed from OPENAI_API_KEY
        google_credentials_path=os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    )

    # Create dataset
    creator = DatasetCreator(extractor)

    # Process PDFs and generate JSON files. The count reflects the requested
    # file list, so the default still prints "3 PDFs".
    print(f"Processing test dataset with {len(test_files)} PDFs...")
    dataset_path = await creator.create_dataset(
        pdf_directory=test_pdf_dir,
        output_directory=test_output_dir
    )

    print(f"Test dataset created at: {dataset_path}")

# Run the test generation on the small PDF subset (3 files by default). The
# coroutine is awaited directly at cell level, which Jupyter supports.
await generate_test_dataset()

In [None]:
# Quick test to verify Groq API setup and check available models
import os
from dotenv import load_dotenv
load_dotenv()

try:
    # Imported inside the try so a missing package is reported, not raised
    from groq import Groq

    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        # Nothing to test without a key
        print("❌ GROQ_API_KEY not found. Please add it to your .env file")
    else:
        print("✅ GROQ_API_KEY found in environment")

        # Build the API client from the key
        client = Groq(api_key=api_key)

        # Listing models is optional: a failure here is reported and the
        # connection test below still runs
        try:
            models = client.models.list()
            print("\n📋 Available Groq Models:")
            for model in models.data:
                print(f"  - {model.id}")
        except Exception as list_error:
            print(f"Could not list models: {list_error}")

        # Minimal chat completion to prove the key and model both work
        print("\n🧪 Testing API connection...")
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",  # Updated to current model
            messages=[
                {"role": "user", "content": "Hello! Just testing the connection."}
            ],
            max_tokens=50,
        )
        print("✅ Groq API connection successful!")
        print(f"Response: {response.choices[0].message.content}")

except ImportError:
    print("❌ Groq package not installed. Run the package installation cell first.")
except Exception as api_error:
    print(f"❌ Error testing Groq API: {api_error}")

# 🔍 Understanding the Error Output

If you ran the dataset generation and got errors, here's what they mean:

## ❌ **Common Errors and Solutions:**

### 1. **"model has been decommissioned"**
- **Problem**: The Groq model `llama-3.1-70b-versatile` was removed
- **Solution**: ✅ **FIXED** - Updated to use `llama-3.1-8b-instant`

### 2. **"Google Vision extraction failed"**  
- **Problem**: Google Cloud Vision API not enabled or network issues
- **Solution**: Either enable Google Vision API or use "Groq only" mode

### 3. **"MuPDF error: No default Layer config"**
- **Problem**: Some PDF files have formatting issues
- **Solution**: This is only a warning; processing continues

### 4. **"CancelledError"**
- **Problem**: The operation was stopped due to previous errors
- **Solution**: Fix API issues first, then retry

## ✅ **Next Steps:**
1. Run the Groq API test cell above to verify your API key works
2. Try the "Groq only" test with 2 PDFs (faster and simpler)
3. Optionally set up Google Vision API later for better results

In [None]:
from huggingface_upload import HuggingFaceDatasetUploader
import os
from dotenv import load_dotenv
load_dotenv()

# Initialize uploader with the HuggingFace token from the environment
# (os.getenv returns None when HF_TOKEN is not set).
uploader = HuggingFaceDatasetUploader(token=os.getenv('HF_TOKEN'))

# Upload dataset.
# Backslashes are escaped: in a plain string literal "\t" (as in
# "\training data") is a TAB character, which silently corrupts the path.
# NOTE(review): this points at the source "Pdf" folder, while the generation
# cells write their JSON output to "new_json_files" - confirm which directory
# the uploader expects.
repo_url = uploader.upload_dataset(
    dataset_directory="D:\\VS_CODE\\Adobe\\LLM_SE_LLM_Adobe\\training data\\Pdf",
    repo_name="arendra/pdf-outline-extraction-dataset",
    private=False
)

print(f"Dataset uploaded: {repo_url}")

In [None]:
from datasets import load_dataset

# Load the uploaded dataset by its repository name
dataset = load_dataset("arendra/pdf-outline-extraction-dataset")

# Access the "train" and "test" splits by key
# (test_data is not used further in this cell)
train_data = dataset["train"]
test_data = dataset["test"]

# Example: iterate through every training example and print its file name,
# title and number of outline entries
for example in train_data:
    print(f"PDF: {example['pdf_filename']}")
    print(f"Title: {example['title']}")
    print(f"Headings: {len(example['outline'])}")
    
    # One line per outline entry: its level, text and page
    for heading in example['outline']:
        print(f"  {heading['level']}: {heading['text']} (Page {heading['page']})")

In [None]:
# Filter results based on quality metrics
def filter_high_quality_extractions(dataset_directory):
    """Return the JSON files in ``dataset_directory`` that pass the quality checks.

    A file qualifies when its top-level object has an ``"outline"`` with at
    least three entries and a string ``"title"`` that is longer than ten
    characters and is not ``"Unknown Document"``. Files lacking either key
    (or whose top level is not an object) are treated as low quality and
    skipped rather than raising.

    Args:
        dataset_directory: Directory whose ``*.json`` files are inspected
            (non-recursive).

    Returns:
        A list of ``pathlib.Path`` objects, sorted by path.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    import json
    from pathlib import Path

    high_quality = []

    # Sorted so the result order is reproducible; glob order depends on the OS.
    for json_file in sorted(Path(dataset_directory).glob("*.json")):
        # Read explicitly as UTF-8: the platform default encoding (e.g. a
        # legacy code page on Windows) can fail on non-ASCII titles.
        with open(json_file, encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, dict):
            continue

        # Missing or null fields count as "not high quality"
        outline = data.get("outline") or []
        title = data.get("title")

        # Quality criteria
        if (len(outline) >= 3 and  # At least 3 headings
            isinstance(title, str) and
            title != "Unknown Document" and  # Valid title
            len(title) > 10):  # Substantial title
            high_quality.append(json_file)

    return high_quality