In [None]:

# Install required packages
!pip install -q transformers torch accelerate bitsandbytes
!pip install -q datasets evaluate
!pip install -q ctransformers
!pip install -q gputil psutil
!pip install -q huggingface_hub
!pip install -q pandas matplotlib seaborn

# Import libraries
import torch
import transformers
import time
import psutil
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
import gc
import os

# Report the library versions and the accelerator visible to this session.
print("Environment setup complete!")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_label = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_label}")
    # total_memory is in bytes; convert to GiB for display.
    gpu_total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU Memory: {gpu_total_gib:.2f} GB")

In [None]:
def get_model_size_mb(model):
    """Calculate the in-memory size of a model's parameters.

    The size is computed from each parameter's real storage width
    (``element_size()``) instead of assuming 2 bytes per parameter, so
    models whose weights are not float16 (float32, int8, packed 4-bit
    storage, ...) are no longer reported with a float16-sized footprint.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors
            supporting ``numel()`` and ``element_size()``.

    Returns:
        tuple: ``(size_mb, param_count)`` where ``size_mb`` is the total
        parameter storage in MB (1 MB = 1024**2 bytes) and ``param_count``
        is the sum of ``numel()`` over all parameters.
    """
    param_count = 0
    total_bytes = 0
    for param in model.parameters():
        elements = param.numel()
        param_count += elements
        total_bytes += elements * param.element_size()

    # NOTE(review): for packed quantized weights `numel()` presumably counts
    # storage elements rather than logical weights, so `param_count` can be
    # lower than the architecture's nominal parameter count; confirm.
    size_mb = total_bytes / (1024**2)
    return size_mb, param_count

def get_memory_usage():
    """Take a snapshot of current GPU and host memory usage, in MB.

    Returns:
        dict: ``gpu_allocated_mb`` (``torch.cuda.memory_allocated``),
        ``gpu_cached_mb`` (``torch.cuda.memory_reserved``) -- both 0 when
        CUDA is unavailable -- and ``cpu_used_mb``
        (``psutil.virtual_memory().used``).
    """
    bytes_per_mb = 1024**2

    allocated_mb = cached_mb = 0
    if torch.cuda.is_available():
        allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
        cached_mb = torch.cuda.memory_reserved() / bytes_per_mb

    host_used_mb = psutil.virtual_memory().used / bytes_per_mb

    return {
        'gpu_allocated_mb': allocated_mb,
        'gpu_cached_mb': cached_mb,
        'cpu_used_mb': host_used_mb
    }

def _resolve_input_device(model, device="auto"):
    """Pick the device that tokenized inputs should be moved to."""
    if device != "auto":
        return device

    model_device = getattr(model, "device", None)
    # A "meta" device holds no data, so inputs cannot be placed on it.
    if model_device is not None and getattr(model_device, "type", str(model_device)) != "meta":
        return model_device

    return "cuda" if torch.cuda.is_available() else "cpu"


def benchmark_model(model, tokenizer, test_prompts, model_name="Unknown", device="auto"):
    """Benchmark text generation for one model over a list of prompts.

    Args:
        model: Causal LM exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; called on each prompt and
            used to decode the generated ids.
        test_prompts: Iterable of prompt strings (must support ``len()``).
        model_name: Label used in log output and in the result dict.
        device: Where to place the tokenized inputs. ``"auto"`` (default)
            uses ``model.device`` when the model exposes one, otherwise CUDA
            if available, otherwise CPU. Any other value is passed to
            ``Tensor.to`` as-is.

    Returns:
        dict: ``model_name``, ``model_size_mb``, ``param_count``,
        ``avg_latency_ms``, ``tokens_per_second``, ``memory_before_mb``,
        ``memory_after_mb``, ``responses`` (one entry per prompt, including
        failed ones) and ``individual_times`` (ms, successful prompts only).

    Per-prompt failures are caught, logged and recorded as ``ERROR: ...``
    responses; they do not abort the benchmark and are excluded from the
    average latency.
    """
    print(f"\n🔍 Benchmarking {model_name}...")

    # Initialize results
    results = {
        'model_name': model_name,
        'model_size_mb': 0,
        'param_count': 0,
        'avg_latency_ms': 0,
        'tokens_per_second': 0,
        'memory_before_mb': 0,
        'memory_after_mb': 0,
        'responses': [],
        'individual_times': []
    }

    # Get model size
    size_mb, param_count = get_model_size_mb(model)
    results['model_size_mb'] = size_mb
    results['param_count'] = param_count

    # Record memory before inference
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.empty_cache()
    results['memory_before_mb'] = get_memory_usage()['gpu_allocated_mb']

    input_device = _resolve_input_device(model, device)

    # Compare against None: a token id of 0 is valid but falsy.
    eos_id = tokenizer.eos_token_id
    pad_token_id = eos_id if eos_id is not None else tokenizer.pad_token_id

    # Test inference
    total_time = 0
    total_tokens = 0

    print(f"Testing with {len(test_prompts)} prompts...")

    for i, prompt in enumerate(test_prompts):
        print(f"  Processing prompt {i+1}/{len(test_prompts)}")

        try:
            # Tokenize input and move it next to the model
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(input_device) for k, v in inputs.items()}
            prompt_len = len(inputs['input_ids'][0])

            # Generate response with timing. Synchronize around the timed
            # region so queued GPU work is neither leaked into nor dropped
            # from the measurement.
            if cuda_available:
                torch.cuda.synchronize()
            start_time = time.perf_counter()

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=pad_token_id,
                    repetition_penalty=1.1
                )

            if cuda_available:
                torch.cuda.synchronize()
            end_time = time.perf_counter()

            # Decode only the newly generated ids. Slicing the decoded text by
            # len(prompt) is unreliable because decoding does not always
            # reproduce the prompt character-for-character.
            generated_ids = outputs[0][prompt_len:]
            response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            # Calculate metrics
            inference_time = end_time - start_time
            new_tokens = len(generated_ids)

            total_time += inference_time
            total_tokens += new_tokens

            results['responses'].append({
                'prompt': prompt,
                'response': response[:200] + "..." if len(response) > 200 else response,
                'time_ms': inference_time * 1000,
                'tokens': new_tokens,
                'tokens_per_sec': new_tokens / inference_time if inference_time > 0 else 0
            })

            results['individual_times'].append(inference_time * 1000)

        except Exception as e:
            print(f"Error processing prompt {i+1}: {str(e)}")
            results['responses'].append({
                'prompt': prompt,
                'response': f"ERROR: {str(e)}",
                'time_ms': 0,
                'tokens': 0,
                'tokens_per_sec': 0
            })

    # Calculate averages over successful prompts only; failed prompts add no
    # time, so counting them would understate the latency.
    successful = len(results['individual_times'])
    if total_time > 0 and successful > 0:
        results['avg_latency_ms'] = (total_time / successful) * 1000
        results['tokens_per_second'] = total_tokens / total_time

    # Record memory after inference
    results['memory_after_mb'] = get_memory_usage()['gpu_allocated_mb']

    print(f"✅ {model_name} benchmark complete!")
    print(f"   Model size: {results['model_size_mb']:.1f} MB")
    print(f"   Avg latency: {results['avg_latency_ms']:.1f} ms")
    print(f"   Throughput: {results['tokens_per_second']:.1f} tok/s")
    print(f"   GPU memory: {results['memory_after_mb']:.1f} MB")

    return results


In [None]:
# Evaluation prompts: Hindi and English alternate so both languages are
# exercised equally.
test_prompts = [
    "भारत की राजधानी क्या है?",
    "What is machine learning? Explain in simple terms.",
    "एक छोटी कहानी लिखें जो प्रेरणादायक हो।",
    "Write a Python function to calculate factorial.",
    "दीपावली का महत्व बताइए।",
    "Explain the difference between AI and ML.",
    "हिंदी भाषा का महत्व क्या है?",
    "How does a neural network work?",
    "भारतीय संस्कृति की विशेषताएं बताएं।",
    "What are the benefits of renewable energy?",
]

print(f"Prepared {len(test_prompts)} test prompts for evaluation")
print("\nSample prompts:")
# Preview the first five prompts, numbered from 1.
for number, sample in enumerate(test_prompts[:5], start=1):
    print(f"{number}. {sample}")

In [None]:

print("🚀 Loading original Airavata model...")

model_name = "ai4bharat/Airavata"

# Pre-bind both names so the cleanup below works even when loading fails
# part-way through.
original_model = None
original_results = None

try:
    # Load tokenizer (later cells reuse this same `tokenizer`)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Set pad token if not exists
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("✅ Tokenizer loaded successfully")

    # Load model in float16 for memory efficiency
    print("Loading original model (FP16)...")
    original_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

    print("✅ Original model loaded successfully")

    # Run benchmark on original model
    original_results = benchmark_model(
        original_model,
        tokenizer,
        test_prompts,
        "Original (FP16)"
    )

except Exception as e:
    print(f"❌ Error loading original model: {str(e)}")
    original_results = None

finally:
    # Release the model even if the benchmark raised, otherwise it keeps
    # occupying GPU memory while the next variant is loaded. Collect garbage
    # BEFORE emptying the CUDA cache: tensors kept alive only by reference
    # cycles are returned to the caching allocator by gc.collect(), and only
    # then can empty_cache() hand that memory back to the driver.
    del original_model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

if original_results is not None:
    print("✅ Original model benchmark complete and memory cleared")

In [None]:
!nvidia-smi

In [None]:

print("\n🔧 Testing 8-bit quantization...")

# Pre-bind both names so the cleanup below works even when loading fails.
model_8bit = None
results_8bit = None

try:
    from transformers import BitsAndBytesConfig

    # Load 8-bit quantized model. The quantization settings are passed via
    # `quantization_config` (as the 4-bit cell does) rather than the bare
    # `load_in_8bit=True` keyword, which is deprecated in transformers.
    model_8bit = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
        trust_remote_code=True
    )

    print("✅ 8-bit model loaded successfully")

    # Run benchmark
    results_8bit = benchmark_model(
        model_8bit,
        tokenizer,
        test_prompts,
        "8-bit Quantized"
    )

except Exception as e:
    print(f"❌ Error with 8-bit quantization: {str(e)}")
    results_8bit = None

finally:
    # Release the model even if the benchmark raised. Collect garbage before
    # emptying the CUDA cache so memory held by reference cycles is actually
    # returned to the driver.
    del model_8bit
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

if results_8bit is not None:
    print("✅ 8-bit model benchmark complete and memory cleared")

In [None]:
!nvidia-smi

In [None]:
print("\n🔧 Testing 4-bit quantization...")

# Pre-bind both names so the cleanup below works even when loading fails.
model_4bit = None
results_4bit = None

try:
    from transformers import BitsAndBytesConfig

    # Configure 4-bit quantization: NF4 weights, float16 compute, and double
    # quantization enabled.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    # Load 4-bit quantized model
    model_4bit = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    print("✅ 4-bit model loaded successfully")

    # Run benchmark
    results_4bit = benchmark_model(
        model_4bit,
        tokenizer,
        test_prompts,
        "4-bit Quantized"
    )

except Exception as e:
    print(f"❌ Error with 4-bit quantization: {str(e)}")
    results_4bit = None

finally:
    # Release the model even if the benchmark raised. Collect garbage before
    # emptying the CUDA cache so memory held by reference cycles is actually
    # returned to the driver.
    del model_4bit
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

if results_4bit is not None:
    print("✅ 4-bit model benchmark complete and memory cleared")

In [None]:
!nvidia-smi

In [None]:
print("\n🔧 Testing GGUF quantized model...")


def _count_gguf_tokens(model, text):
    """Count tokens in `text` with the model's own tokenizer when it has one."""
    tokenize = getattr(model, "tokenize", None)
    if callable(tokenize):
        try:
            return len(tokenize(text))
        except Exception:
            # Deliberate best-effort: fall through to the word-based estimate.
            pass
    # Rough estimate: ~1.3 tokens per whitespace-separated word.
    return len(text.split()) * 1.3


def _find_gguf_size_mb(search_root):
    """Return the size in MB of the first .gguf/.bin file under `search_root`, or None."""
    for root, _, files in os.walk(search_root):
        for file_name in sorted(files):
            if file_name.endswith((".gguf", ".bin")):
                return os.path.getsize(os.path.join(root, file_name)) / (1024 ** 2)
    return None


def benchmark_gguf_model(model, prompts, model_name):
    """Benchmark a callable GGUF model over a list of prompts.

    Args:
        model: Callable as ``model(prompt, max_new_tokens=..., temperature=...)``
            returning the generated text. If it also exposes ``tokenize(text)``
            that method is used to count generated tokens; otherwise the count
            is estimated from the word count.
        prompts: Sequence of prompt strings.
        model_name: Label used in log output and in the result dict.

    Returns:
        dict: ``model_name``, ``model_size_mb`` and ``param_count``
        (placeholders, 0), ``responses`` (one entry per prompt, including
        failed ones), ``individual_times`` (ms, successful prompts only),
        ``total_time`` (s), ``avg_latency_ms`` and ``tokens_per_second``.

    Per-prompt failures are caught, logged and recorded as ``ERROR: ...``
    responses; they are excluded from the average latency.
    """
    print(f"\n🔍 Benchmarking {model_name}...")

    results = {
        'model_name': model_name,
        'model_size_mb': 0,  # Placeholder
        'param_count': 0,    # Placeholder
        'responses': [],
        'individual_times': [],
        'total_time': 0
    }

    total_time = 0
    total_tokens = 0

    for i, prompt in enumerate(prompts):
        print(f"  Processing prompt {i+1}/{len(prompts)}")

        try:
            start_time = time.perf_counter()
            response = model(prompt, max_new_tokens=100, temperature=0.7)
            end_time = time.perf_counter()

            inference_time = end_time - start_time
            total_time += inference_time

            # Counted outside the timed region so tokenization cost is not
            # charged to generation.
            token_count = _count_gguf_tokens(model, response)
            total_tokens += token_count

            results['responses'].append({
                'prompt': prompt,
                'response': response[:200] + "..." if len(response) > 200 else response,
                'time_ms': inference_time * 1000,
                'tokens': token_count,
                'tokens_per_sec': token_count / inference_time if inference_time > 0 else 0
            })

            results['individual_times'].append(inference_time * 1000)

        except Exception as e:
            print(f"Error processing prompt {i+1}: {str(e)}")
            results['responses'].append({
                'prompt': prompt,
                'response': f"ERROR: {str(e)}",
                'time_ms': 0,
                'tokens': 0,
                'tokens_per_sec': 0
            })

    # Calculate averages over successful prompts only; failed prompts add no
    # time, so counting them would understate the latency.
    results['total_time'] = total_time
    successful = len(results['individual_times'])
    if total_time > 0 and successful > 0:
        results['avg_latency_ms'] = (total_time / successful) * 1000
        results['tokens_per_second'] = total_tokens / total_time
    else:
        results['avg_latency_ms'] = 0
        results['tokens_per_second'] = 0

    print(f"✅ {model_name} benchmark complete!")
    print(f"   Avg latency: {results['avg_latency_ms']:.1f} ms")
    print(f"   Throughput: {results['tokens_per_second']:.1f} tok/s")

    return results


# Pre-bind both names so the cleanup below works even when loading fails.
gguf_model = None
results_gguf = None

try:
    from ctransformers import AutoModelForCausalLM as CTAutoModel

    # Try to load GGUF model
    gguf_model = CTAutoModel.from_pretrained(
        "sam749/Airavata-GGUF",
        model_type="llama",
        gpu_layers=40  # Use GPU acceleration if available
    )

    print("✅ GGUF model loaded successfully")

    # Run GGUF benchmark
    results_gguf = benchmark_gguf_model(gguf_model, test_prompts, "GGUF Quantized")

    # Inject the model size from the downloaded file. os.walk() yields nothing
    # (rather than raising) for a missing directory, so the "not found" case
    # must be handled explicitly for the fallback to ever apply.
    gguf_cache_dir = os.path.expanduser("~/.cache/huggingface/hub/models--sam749--Airavata-GGUF/snapshots")  # Adjust if needed
    try:
        gguf_size_mb = _find_gguf_size_mb(gguf_cache_dir)
        if gguf_size_mb is None:
            raise FileNotFoundError(f"no .gguf/.bin file under {gguf_cache_dir}")
        results_gguf['model_size_mb'] = round(gguf_size_mb, 1)
    except OSError as e:
        print(f"⚠️ Could not determine GGUF model size: {e}")
        results_gguf['model_size_mb'] = 3700  # fallback estimate in MB

    # Set the parameter count. Prefer the count measured on the FP16 model
    # (presumably the same architecture -- TODO confirm); otherwise fall back
    # to the previous hard-coded proxy taken from the 4-bit variant.
    fp16_results = globals().get('original_results') or {}
    results_gguf['param_count'] = fp16_results.get('param_count') or 3632017408

except Exception as e:
    print(f"❌ Error with GGUF model: {str(e)}")
    results_gguf = None

finally:
    # Release the model even if the benchmark raised.
    del gguf_model
    gc.collect()

if results_gguf is not None:
    print("✅ GGUF model benchmark complete and memory cleared")


In [None]:

print("\n📊 Analyzing and comparing results...")


def _percent_change(value, baseline):
    """Return the % change of `value` relative to `baseline`, or None if `baseline` is 0/missing."""
    if not baseline:
        return None
    return ((value - baseline) / baseline) * 100


def _format_percent(change, signed=False):
    """Format a percentage for display; None (not computable) becomes 'n/a'."""
    if change is None:
        return "n/a"
    return f"{change:+.1f}%" if signed else f"{change:.1f}%"


# Compile all results; variants whose benchmark failed are None and skipped
all_results = {}
if original_results:
    all_results['Original (FP16)'] = original_results
if results_8bit:
    all_results['8-bit Quantized'] = results_8bit
if results_4bit:
    all_results['4-bit Quantized'] = results_4bit
if results_gguf:
    all_results['GGUF Quantized'] = results_gguf

if not all_results:
    print("❌ No successful benchmarks to analyze")
else:
    print(f"✅ Successfully benchmarked {len(all_results)} model variants")

    # Create comparison DataFrame
    comparison_data = []
    for model_name, results in all_results.items():
        comparison_data.append({
            'Model': model_name,
            'Size (MB)': results.get('model_size_mb', 0),
            'Parameters': results.get('param_count', 0),
            'Avg Latency (ms)': results.get('avg_latency_ms', 0),
            'Throughput (tok/s)': results.get('tokens_per_second', 0),
            'GPU Memory (MB)': results.get('memory_after_mb', 0)
        })

    df_comparison = pd.DataFrame(comparison_data)

    # Display comparison table
    print("\n📋 Performance Comparison Table:")
    print("=" * 80)
    print(df_comparison.to_string(index=False, float_format='%.1f'))

    # Calculate compression ratios and performance improvements
    if original_results:
        # These keys always exist in a result dict and are 0 when nothing was
        # measured (e.g. every prompt failed), so a zero baseline has to be
        # handled instead of divided by.
        original_size = original_results.get('model_size_mb', 0)
        original_latency = original_results.get('avg_latency_ms', 0)
        original_throughput = original_results.get('tokens_per_second', 0)

        print("\n📈 Compression and Performance Analysis:")
        print("=" * 60)

        for model_name, results in all_results.items():
            if model_name != 'Original (FP16)':
                variant_size = results.get('model_size_mb', 0)
                # A size of 0 means "unknown", not "100% smaller".
                if original_size and variant_size:
                    size_reduction = ((original_size - variant_size) / original_size) * 100
                else:
                    size_reduction = None
                latency_change = _percent_change(results.get('avg_latency_ms', 0), original_latency)
                throughput_change = _percent_change(results.get('tokens_per_second', 0), original_throughput)

                print(f"\n{model_name}:")
                print(f"  Size reduction: {_format_percent(size_reduction)}")
                print(f"  Latency change: {_format_percent(latency_change, signed=True)}")
                print(f"  Throughput change: {_format_percent(throughput_change, signed=True)}")

    # Save results to JSON
    results_json = {
        'timestamp': datetime.now().isoformat(),
        'test_environment': {
            'gpu_available': torch.cuda.is_available(),
            'gpu_name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None',
            'pytorch_version': torch.__version__,
            'transformers_version': transformers.__version__
        },
        'test_prompts': test_prompts,
        'results': all_results
    }

    # Save to file; ensure_ascii=False keeps the Hindi text readable
    with open('benchmark_results.json', 'w', encoding='utf-8') as f:
        json.dump(results_json, f, indent=2, ensure_ascii=False)

    print("\n💾 Results saved to 'benchmark_results.json'")

In [None]:
print("\n📊 Creating visualizations...")

if len(all_results) <= 1:
    print("⚠️ Not enough models to create meaningful comparisons")

else:
    # Plot styling shared by every chart below
    plt.style.use('default')
    sns.set_palette("husl")
    palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

    # 2x2 grid of bar charts, one metric per panel
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Airavata Model Quantization Performance Analysis', fontsize=16, fontweight='bold')

    # One value per model for each metric, in the same order as `models`
    models = list(all_results)
    sizes = [all_results[name].get('model_size_mb', 0) for name in models]
    latencies = [all_results[name].get('avg_latency_ms', 0) for name in models]
    throughputs = [all_results[name].get('tokens_per_second', 0) for name in models]
    memory_usage = [all_results[name].get('memory_after_mb', 0) for name in models]

    # (axis, values, title, y-label, bar-label template, hide labels on empty bars)
    panels = [
        (ax1, sizes, 'Model Size Comparison', 'Size (MB)', '{:.0f}MB', False),
        (ax2, latencies, 'Average Latency Comparison', 'Latency (ms)', '{:.0f}ms', False),
        (ax3, throughputs, 'Throughput Comparison', 'Tokens per Second', '{:.1f}', False),
        (ax4, memory_usage, 'GPU Memory Usage', 'Memory (MB)', '{:.0f}MB', True),
    ]

    for axis, values, title, y_label, label_template, hide_empty in panels:
        bars = axis.bar(models, values, color=palette)
        axis.set_title(title, fontweight='bold')
        axis.set_ylabel(y_label)
        axis.tick_params(axis='x', rotation=45)

        # Value label just above each bar
        for bar, value in zip(bars, values):
            top = bar.get_height()
            if hide_empty and not top > 0:
                continue
            axis.text(bar.get_x() + bar.get_width()/2., top + top*0.01,
                      label_template.format(value), ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.savefig('quantization_performance_charts.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Efficiency plot: model size against throughput, one point per model
    plt.figure(figsize=(10, 6))

    for idx, model in enumerate(models):
        point = (sizes[idx], throughputs[idx])
        plt.scatter(point[0], point[1], s=200, c=palette[idx], alpha=0.7,
                    edgecolors='black', linewidth=2, label=model)
        plt.annotate(model, point,
                     xytext=(5, 5), textcoords='offset points',
                     fontweight='bold', fontsize=9)

    plt.title('Model Efficiency: Size vs Throughput', fontsize=14, fontweight='bold')
    plt.xlabel('Model Size (MB)', fontweight='bold')
    plt.ylabel('Throughput (tokens/sec)', fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.savefig('efficiency_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("✅ Visualizations created and saved")

In [None]:
print("\n🔍 Analyzing Response Quality...")

def analyze_response_quality(all_results, sample_size=3):
    """Print the first few responses of every model side by side.

    The prompts are taken from the responses recorded in ``all_results``
    rather than from a module-level prompt list, so the function only depends
    on its arguments and always shows the prompt each response was actually
    generated for.

    Args:
        all_results: Mapping of model label to its benchmark result dict;
            each result's ``responses`` is a list of dicts with ``prompt``,
            ``response`` and ``time_ms``.
        sample_size: Maximum number of prompts to display.

    Returns:
        None. Output is printed.
    """

    if not all_results:
        print("No results to analyze")
        return

    print(f"Comparing first {sample_size} responses across all models:\n")
    print("=" * 100)

    recorded = {name: results.get('responses', []) for name, results in all_results.items()}
    prompt_count = max(len(responses) for responses in recorded.values())

    # Get sample responses for comparison
    for i in range(min(sample_size, prompt_count)):
        # Use the prompt from the first model that recorded a response at this index.
        prompt = next(responses[i]['prompt'] for responses in recorded.values() if i < len(responses))
        print(f"\n📝 PROMPT {i+1}: {prompt}")
        print("-" * 80)

        for model_name, responses in recorded.items():
            if i < len(responses):
                response = responses[i]['response']
                time_ms = responses[i]['time_ms']
                print(f"\n{model_name} ({time_ms:.0f}ms):")
                print(f"  {response}")

        print("\n" + "=" * 100)

# Run quality analysis
analyze_response_quality(all_results, sample_size=3)

In [None]:

def generate_comprehensive_report(all_results, test_prompts):
    """Generate a comprehensive markdown report.

    Everything in the report is derived from the arguments; the function does
    not read module-level benchmark variables.

    Args:
        all_results: Mapping of model label to its benchmark result dict.
            The labels ``'Original (FP16)'``, ``'4-bit Quantized'`` and
            ``'GGUF Quantized'`` drive the recommendations section.
        test_prompts: The prompts that were benchmarked (only their count is
            reported).

    Returns:
        str: The report as markdown text.
    """

    report = f"""# Airavata Model Quantization Performance Report

## Executive Summary

**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Base Model**: ai4bharat/Airavata (7B parameters)
**Test Environment**: Google Colab
**GPU**: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}
**Test Prompts**: {len(test_prompts)} (Hindi and English mix)

## Key Findings

"""

    if len(all_results) > 1:
        # Find best performing models
        best_speed = max(all_results.items(), key=lambda x: x[1].get('tokens_per_second', 0))

        # A size of 0 means "unknown", so such models must not win "smallest".
        sized = [item for item in all_results.items() if item[1].get('model_size_mb', 0) > 0]
        smallest_size = min(sized or all_results.items(), key=lambda x: x[1].get('model_size_mb', float('inf')))

        report += f"""
### 🏆 Best Performers
- **Fastest Model**: {best_speed[0]} ({best_speed[1].get('tokens_per_second', 0):.1f} tok/s)
- **Smallest Model**: {smallest_size[0]} ({smallest_size[1].get('model_size_mb', 0):.1f} MB)

### 📊 Performance Summary
"""

        # Create performance table
        report += "\n| Model | Size (MB) | Latency (ms) | Throughput (tok/s) | GPU Memory (MB) |\n"
        report += "|-------|-----------|--------------|-------------------|------------------|\n"

        for model_name, results in all_results.items():
            report += f"| {model_name} | {results.get('model_size_mb', 0):.1f} | {results.get('avg_latency_ms', 0):.1f} | {results.get('tokens_per_second', 0):.1f} | {results.get('memory_after_mb', 0):.1f} |\n"

    # Add detailed analysis
    report += f"""

## Detailed Analysis

### Test Configuration
- **Number of test prompts**: {len(test_prompts)}
- **Max tokens per response**: 100
- **Temperature**: 0.7
- **Repetition penalty**: 1.1

### Model Variants Tested
"""

    for model_name, results in all_results.items():
        report += f"""
#### {model_name}
- **Model Size**: {results.get('model_size_mb', 0):.1f} MB
- **Parameters**: {results.get('param_count', 'N/A')}
- **Average Latency**: {results.get('avg_latency_ms', 0):.1f} ms
- **Throughput**: {results.get('tokens_per_second', 0):.1f} tokens/sec
- **GPU Memory**: {results.get('memory_after_mb', 0):.1f} MB
"""

    # Recommendations
    report += """

## Recommendations

### For Production Deployment
"""

    four_bit = all_results.get('4-bit Quantized')
    if four_bit:
        # Report the measured size reduction when both sizes are known,
        # instead of a fixed figure that may not match this run.
        baseline_size = (all_results.get('Original (FP16)') or {}).get('model_size_mb', 0)
        four_bit_size = four_bit.get('model_size_mb', 0)
        if baseline_size > 0 and four_bit_size > 0:
            reduction = (baseline_size - four_bit_size) / baseline_size * 100
            size_claim = f"Size reduction of ~{reduction:.0f}% versus FP16"
        else:
            size_claim = "Significant size reduction"

        report += f"""
**4-bit Quantization** appears to offer the best balance of:
- {size_claim}
- Good performance maintenance
- Reasonable memory usage
"""

    if all_results.get('GGUF Quantized'):
        report += """
**GGUF Format** is recommended for:
- CPU-only inference
- Edge deployment scenarios
- Maximum compatibility
"""

    report += """

### Next Steps
1. **Phase 2**: Implement FastAPI backend with selected quantization method
2. **Phase 3**: Test on target hardware (CPU deployment)
3. **Phase 4**: Performance optimization and final benchmarking

## Technical Notes

### Quantization Methods Tested
- **8-bit**: Uses BitsAndBytes library for 8-bit integer quantization
- **4-bit**: NF4 quantization with double quantization enabled
- **GGUF**: CPU-optimized format with various quantization levels

### Limitations
- Tests performed in Google Colab environment
- Results may vary on different hardware configurations
- Quality assessment is preliminary (manual evaluation recommended)

---

*Report generated automatically by the Airavata Model Analysis pipeline*
"""

    return report

# Generate and save the final report
# Generate and save the final report
print("\n📋 Generating comprehensive report...")

final_report = generate_comprehensive_report(all_results, test_prompts)

# Save the report
with open('airavata_quantization_report.md', 'w', encoding='utf-8') as f:
    f.write(final_report)

print("✅ Comprehensive report saved to 'airavata_quantization_report.md'")

# Display summary
print("\n" + "="*80)
print("🎉 PHASE 1 COMPLETE!")
print("="*80)
print(f"✅ Tested {len(all_results)} model variants")
print(f"✅ Processed {len(test_prompts)} test prompts")
if all_results:
    print("✅ Generated performance comparisons")
# The charts are only produced when more than one variant was benchmarked,
# so only claim them when the file is actually there.
if os.path.exists('quantization_performance_charts.png'):
    print("✅ Created visualizations")
print("✅ Saved detailed report")

if all_results:
    print(f"\n🏆 RECOMMENDED FOR PHASE 2:")
    # Simple recommendation logic: first variant, in priority order, that
    # produced any throughput.
    recommendations = (
        ('GGUF Quantized', "   GGUF Format (best for CPU deployment)"),
        ('4-bit Quantized', "   4-bit Quantization (best balance of size and performance)"),
        ('8-bit Quantized', "   8-bit Quantization (good performance with moderate compression)"),
    )
    for variant, message in recommendations:
        if (all_results.get(variant) or {}).get('tokens_per_second', 0) > 0:
            print(message)
            break
    else:
        print("   Review results to choose optimal quantization method")

# List only the output files that exist, since some are skipped when
# benchmarks fail or too few variants ran.
print("\n📁 FILES GENERATED:")
for output_file in (
    'benchmark_results.json',
    'airavata_quantization_report.md',
    'quantization_performance_charts.png',
    'efficiency_analysis.png',
):
    if os.path.exists(output_file):
        print(f"   - {output_file}")
