In [None]:
!pip install huggingface_hub ctransformers

import os
from huggingface_hub import hf_hub_download
import shutil

In [None]:
# Folder that receives the GGUF files; created on first run, reused afterwards.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

# Quantization variants of Airavata fetched for the benchmark.
selected_models = [
    "Airavata.Q2_K.gguf",
    "Airavata.Q4_K_M.gguf",
    "Airavata.Q6_K.gguf"
]

# Arguments that are identical for every download; only the filename varies.
download_options = {
    "repo_id": "mradermacher/Airavata-GGUF",
    "local_dir": model_dir,
    "local_dir_use_symlinks": False,
}

for model_file in selected_models:
    print(f"Downloading {model_file} ...")
    model_path = hf_hub_download(filename=model_file, **download_options)
    print(f"Downloaded: {model_file}")


In [None]:
!pip install ctransformers pandas matplotlib seaborn huggingface_hub
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ctransformers import AutoModelForCausalLM
import gc

In [None]:
def benchmark_model(model, prompt, max_tokens=100):
    """Stream one completion from ``model`` and measure how long it takes.

    Args:
        model: Callable invoked as
            ``model(prompt, max_new_tokens=max_tokens, stream=True)`` that
            yields the generated text piece by piece.
        prompt: Prompt text handed to the model.
        max_tokens: Generation limit forwarded as ``max_new_tokens``.

    Returns:
        dict with the keys
            ``time_to_first_token``: seconds until the first piece arrived
                (equal to ``total_latency`` when nothing was generated),
            ``total_latency``: seconds for the whole generation,
            ``throughput``: ``token_count / total_latency`` (0 if latency is 0),
            ``response``: first 200 characters of the generated text,
            ``token_count``: number of pieces yielded by the stream.
        If anything raises during generation, every numeric field is 0 and
        ``response`` is ``"Error: <message>"``; the exception is not re-raised.
    """
    # perf_counter is the clock intended for measuring short intervals; unlike
    # time.time() it cannot jump when the system clock is adjusted.
    start_time = time.perf_counter()
    first_token_time = None
    pieces = []

    try:
        for token in model(prompt, max_new_tokens=max_tokens, stream=True):
            if first_token_time is None:
                first_token_time = time.perf_counter() - start_time
            pieces.append(token)

        total_time = time.perf_counter() - start_time

        # Nothing was generated: report the full wait as the first-token time.
        if first_token_time is None:
            first_token_time = total_time

        response = "".join(pieces)

        # Count what the model actually emitted rather than whitespace-separated
        # words: word counts under-report tokens and do so differently per
        # language, which skews a tokens/second comparison.
        # NOTE(review): assumes the stream yields roughly one piece per
        # generated token - confirm against the ctransformers streaming docs.
        token_count = len(pieces)
        throughput = token_count / total_time if total_time > 0 else 0

        return {
            'time_to_first_token': first_token_time,
            'total_latency': total_time,
            'throughput': throughput,
            'response': response[:200],
            'token_count': token_count
        }
    except Exception as e:
        # Best-effort: a failing prompt yields a zeroed record instead of
        # aborting the whole benchmark run.
        return {
            'time_to_first_token': 0,
            'total_latency': 0,
            'throughput': 0,
            'response': f"Error: {str(e)}",
            'token_count': 0
        }

def get_file_size_mb(filepath):
    """Return the size of the file at ``filepath`` in megabytes (bytes / 1024**2)."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(filepath)
    return size_in_bytes / bytes_per_mb

def clear_memory():
    """Trigger a garbage-collection pass; returns nothing."""
    gc.collect()
    return None

In [None]:
# English prompts come first; the per-language analysis later in the notebook
# relies on prompts 1-5 being English and 6-10 being Hindi.
english_prompts = [
    "What is Artificial Intelligence?",
    "Explain photosynthesis.",
    "Explain the Benefits of solar energy?",
    "How to make bread?",
    "Define about the India?",
]

# Hindi prompts, listed in the same topic order as the English ones.
hindi_prompts = [
    "कृत्रिम बुद्धिमत्ता क्या है?",
    "प्रकाश संश्लेषण की व्याख्या करें।",
    "सौर ऊर्जा के लाभों की व्याख्या करें।",
    "ब्रेड कैसे बनाते हैं?",
    "भारत के बारे में परिभाषित करें।",
]

test_prompts = english_prompts + hindi_prompts

print(f"Setup complete. {len(test_prompts)} test prompts ready.")

In [None]:
# Benchmark the Q2_K quantization: load it, run every test prompt through
# benchmark_model, tag each record with model metadata, then release the model.
model_path_q2k = "models/Airavata.Q2_K.gguf"
results_q2k = []
# Bound up front so the cleanup in `finally` is valid even if loading fails.
model_q2k = None

print("Loading Q2_K model...")
try:
    model_q2k = AutoModelForCausalLM.from_pretrained(
        model_path_q2k,
        model_type="llama",
        gpu_layers=10
    )

    file_size_q2k = get_file_size_mb(model_path_q2k)
    print(f"Q2_K model loaded. File size: {file_size_q2k:.2f} MB")

    # Derive the total from the prompt list so the progress message stays
    # correct when prompts are added or removed.
    total_prompts = len(test_prompts)
    for prompt_id, prompt in enumerate(test_prompts, start=1):
        print(f"Testing Q2_K - Prompt {prompt_id}/{total_prompts}")
        result = benchmark_model(model_q2k, prompt)
        result['model'] = 'Q2_K'
        result['prompt_id'] = prompt_id
        result['prompt'] = prompt
        result['file_size_mb'] = file_size_q2k
        results_q2k.append(result)

    print("Q2_K benchmarking completed.")

except Exception as e:
    print(f"Error with Q2_K model: {str(e)}")
    # Drop partial results so downstream cells never compare an incomplete run.
    results_q2k = []

finally:
    # Release the model on every path so it is not still resident when the
    # next quantization is loaded.
    del model_q2k
    clear_memory()

In [None]:
# Benchmark the Q4_K_M quantization: load it, run every test prompt through
# benchmark_model, tag each record with model metadata, then release the model.
model_path_q4km = "models/Airavata.Q4_K_M.gguf"
results_q4km = []
# Bound up front so the cleanup in `finally` is valid even if loading fails.
model_q4km = None

print("Loading Q4_K_M model...")
try:
    model_q4km = AutoModelForCausalLM.from_pretrained(
        model_path_q4km,
        model_type="llama",
        gpu_layers=10
    )

    file_size_q4km = get_file_size_mb(model_path_q4km)
    print(f"Q4_K_M model loaded. File size: {file_size_q4km:.2f} MB")

    # Derive the total from the prompt list so the progress message stays
    # correct when prompts are added or removed.
    total_prompts = len(test_prompts)
    for prompt_id, prompt in enumerate(test_prompts, start=1):
        print(f"Testing Q4_K_M - Prompt {prompt_id}/{total_prompts}")
        result = benchmark_model(model_q4km, prompt)
        result['model'] = 'Q4_K_M'
        result['prompt_id'] = prompt_id
        result['prompt'] = prompt
        result['file_size_mb'] = file_size_q4km
        results_q4km.append(result)

    print("Q4_K_M benchmarking completed.")

except Exception as e:
    print(f"Error with Q4_K_M model: {str(e)}")
    # Drop partial results so downstream cells never compare an incomplete run.
    results_q4km = []

finally:
    # Release the model on every path so it is not still resident when the
    # next quantization is loaded.
    del model_q4km
    clear_memory()

In [None]:
# Benchmark the Q6_K quantization: load it, run every test prompt through
# benchmark_model, tag each record with model metadata, then release the model.
model_path_q6k = "models/Airavata.Q6_K.gguf"
results_q6k = []
# Bound up front so the cleanup in `finally` is valid even if loading fails.
model_q6k = None

print("Loading Q6_K model...")
try:
    model_q6k = AutoModelForCausalLM.from_pretrained(
        model_path_q6k,
        model_type="llama",
        gpu_layers=10
    )

    file_size_q6k = get_file_size_mb(model_path_q6k)
    print(f"Q6_K model loaded. File size: {file_size_q6k:.2f} MB")

    # Derive the total from the prompt list so the progress message stays
    # correct when prompts are added or removed.
    total_prompts = len(test_prompts)
    for prompt_id, prompt in enumerate(test_prompts, start=1):
        print(f"Testing Q6_K - Prompt {prompt_id}/{total_prompts}")
        result = benchmark_model(model_q6k, prompt)
        result['model'] = 'Q6_K'
        result['prompt_id'] = prompt_id
        result['prompt'] = prompt
        result['file_size_mb'] = file_size_q6k
        results_q6k.append(result)

    print("Q6_K benchmarking completed.")

except Exception as e:
    print(f"Error with Q6_K model: {str(e)}")
    # Drop partial results so downstream cells never compare an incomplete run.
    results_q6k = []

finally:
    # Release the model on every path so it is not still resident after this
    # cell finishes.
    del model_q6k
    clear_memory()

In [None]:
# Flatten the three per-model result lists into one list of records.
all_results = [*results_q2k, *results_q4km, *results_q6k]

if not all_results:
    print("No results to compile. Check model loading errors.")
else:
    df_results = pd.DataFrame(all_results)

    # Columns written to disk; the truncated response text is left out.
    csv_columns = ['model', 'time_to_first_token', 'total_latency', 'throughput', 'file_size_mb', 'prompt']
    df_csv = df_results[csv_columns].copy()

    # Per-model averages of the numeric metrics.
    metric_columns = ['time_to_first_token', 'total_latency', 'throughput', 'file_size_mb']
    print("Benchmark Results Summary:")
    per_model_means = df_results.groupby('model')[metric_columns].mean()
    print(per_model_means)

    # utf-8-sig writes a BOM so spreadsheet tools detect the Hindi prompts as UTF-8.
    df_csv.to_csv('benchmark_results.csv', index=False, encoding='utf-8-sig')
    print("Results saved to benchmark_results.csv")

In [None]:
# 2x2 comparison figure: three bar charts of per-model averages plus a
# size-vs-throughput scatter. Saved to professional_benchmark_analysis.png.
if all_results:
    plt.style.use('seaborn-v0_8')
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

    # Average metrics by model
    model_summary = df_results.groupby('model').agg({
        'time_to_first_token': 'mean',
        'total_latency': 'mean',
        'throughput': 'mean',
        'file_size_mb': 'first'
    }).reset_index()

    # One colour per model that actually produced results. A model whose cell
    # failed is absent from the summary, and scatter() rejects a colour list
    # whose length differs from the number of points, so trim the palette.
    palette = ['#3498db', '#e74c3c', '#2ecc71']
    model_colors = palette[:len(model_summary)]

    # (axis, summary column, title, y-label, number format, unit suffix)
    bar_panels = [
        (axes[0, 0], 'time_to_first_token', 'Average Time to First Token', 'Time (seconds)', '.3f', 's'),
        (axes[0, 1], 'total_latency', 'Average Total Latency', 'Time (seconds)', '.2f', 's'),
        (axes[1, 0], 'throughput', 'Average Throughput', 'Tokens/second', '.1f', ''),
    ]
    for ax, column, title, ylabel, number_format, unit in bar_panels:
        bars = ax.bar(model_summary['model'], model_summary[column],
                      color=model_colors, alpha=0.8)
        ax.set_title(title, fontweight='bold')
        ax.set_ylabel(ylabel)
        ax.grid(True, alpha=0.3)
        # Print each bar's value just above it.
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:{number_format}}{unit}',
                    ha='center', va='bottom', fontweight='bold')

    # Size vs Performance
    size_ax = axes[1, 1]
    size_ax.scatter(model_summary['file_size_mb'], model_summary['throughput'],
                    s=200, c=model_colors, alpha=0.8)
    for row in model_summary.itertuples(index=False):
        size_ax.annotate(row.model,
                         (row.file_size_mb, row.throughput),
                         xytext=(5, 5), textcoords='offset points',
                         fontweight='bold')
    size_ax.set_xlabel('Model Size (MB)')
    size_ax.set_ylabel('Throughput (tokens/sec)')
    size_ax.set_title('Model Size vs Performance Trade-off', fontweight='bold')
    size_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('professional_benchmark_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Side-by-side bar charts of throughput and latency, split by prompt language.
# Saved to language_performance_analysis.png.
if all_results:
    # Prompt ids 1-5 are labelled English, everything after that Hindi.
    df_results['language'] = df_results['prompt_id'].apply(lambda x: 'English' if x <= 5 else 'Hindi')

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    fig.suptitle('Performance by Language', fontsize=14, fontweight='bold')

    # Mean of each metric per (model, language), pivoted so languages are columns.
    by_model_and_language = df_results.groupby(['model', 'language'])
    lang_throughput = by_model_and_language['throughput'].mean().unstack()
    lang_latency = by_model_and_language['total_latency'].mean().unstack()

    # (axis, pivoted table, title, y-label) for the two panels.
    language_panels = (
        (axes[0], lang_throughput, 'Average Throughput by Language', 'Tokens/second'),
        (axes[1], lang_latency, 'Average Latency by Language', 'Latency (seconds)'),
    )
    for ax, table, title, ylabel in language_panels:
        table.plot(kind='bar', ax=ax, color=['#3498db', '#e74c3c'], alpha=0.8)
        ax.set_title(title, fontweight='bold')
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Model')
        ax.legend(title='Language', frameon=True)
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig('language_performance_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Text report: per-model mean/std table, a ranking per metric, and a
# recommendation based on throughput and throughput-per-MB.
if not all_results:
    print("No results available for final report.")
else:
    divider = "=" * 80

    # Mean and standard deviation of each timing metric per model.
    model_summary = df_results.groupby('model').agg({
        'time_to_first_token': ['mean', 'std'],
        'total_latency': ['mean', 'std'],
        'throughput': ['mean', 'std'],
        'file_size_mb': 'first'
    }).round(4)

    # Collapse the two-level column index into names like "throughput_mean".
    model_summary.columns = ['_'.join(col).strip() for col in model_summary.columns]

    print(divider)
    print("FINAL BENCHMARK REPORT")
    print(divider)
    print("\nDETAILED PERFORMANCE METRICS:")
    print(model_summary)

    print(f"\n{divider}")
    print("PERFORMANCE RANKING:")
    print(divider)

    # Per-model averages used for the rankings below.
    ranking_columns = ['time_to_first_token', 'total_latency', 'throughput', 'file_size_mb']
    avg_metrics = df_results.groupby('model')[ranking_columns].mean()

    mean_first_token = avg_metrics['time_to_first_token']
    mean_latency = avg_metrics['total_latency']
    mean_throughput = avg_metrics['throughput']
    mean_size_mb = avg_metrics['file_size_mb']

    print(f"\n1. FASTEST FIRST TOKEN: {mean_first_token.idxmin()} ({mean_first_token.min():.3f}s)")
    print(f"2. LOWEST LATENCY: {mean_latency.idxmin()} ({mean_latency.min():.2f}s)")
    print(f"3. HIGHEST THROUGHPUT: {mean_throughput.idxmax()} ({mean_throughput.max():.1f} tokens/s)")
    print(f"4. SMALLEST SIZE: {mean_size_mb.idxmin()} ({mean_size_mb.min():.0f} MB)")

    # Efficiency score: throughput per MB of model file.
    avg_metrics['efficiency'] = mean_throughput / mean_size_mb
    best_efficiency = avg_metrics['efficiency'].idxmax()

    print(f"\n{divider}")
    print("RECOMMENDATION:")
    print(divider)

    # "Best overall" is defined here as the model with the highest mean throughput.
    best_overall = avg_metrics['throughput'].idxmax()
    best_row = avg_metrics.loc[best_overall]

    print(f"\nBEST OVERALL MODEL: {best_overall}")
    print(f"- Throughput: {best_row['throughput']:.1f} tokens/s")
    print(f"- Latency: {best_row['total_latency']:.2f}s")
    print(f"- First Token: {best_row['time_to_first_token']:.3f}s")
    print(f"- Size: {best_row['file_size_mb']:.0f} MB")

    print(f"\nMOST EFFICIENT MODEL: {best_efficiency}")
    print(f"- Efficiency Score: {avg_metrics.loc[best_efficiency, 'efficiency']:.3f} tokens/s/MB")

    smallest_model = avg_metrics['file_size_mb'].idxmin()
    print(f"\nSMALLEST MODEL: {smallest_model}")
    print("- Good for resource-constrained environments")

    print(f"\n{divider}")