In [1]:
# Run configuration.
# MODEL_NAME is used as the lookup key into the MODELS table when
# process_prompts() is called; the commented-out lines are alternative choices.
# MODEL_NAME = "qwen/qwen3-next-80b-a3b-instruct-maas"
# MODEL_NAME = "gemini-2.0-flash-001"
MODEL_NAME = "openai/gpt-oss-120b-maas"
# GPU count used for vLLM tensor parallelism when a local model is loaded
# (also the default `tensor_parallel_size` of run_hf_inference()).
NUM_GPUS = 2
# Dataset sub-directories (under the prompt root) scanned by process_prompts().
RUN_DATASET = ['fir','fir_hash','cfpb']

# Vertex AI inference (Gemini and other Vertex-hosted models)

In [2]:
import logging
import os

# Configure root logging at WARNING so INFO/DEBUG chatter is not shown.
logging.basicConfig(level=logging.WARNING)

# Cap the noisy Google / Vertex / gRPC loggers at WARNING; urllib3 is handled
# last so its HTTP request logs are quieted the same way.
for logger_name in (
    "google",
    "google.genai",
    "google.api_core",
    "google.auth",
    "google.cloud",
    "grpc",
    "absl",
    "urllib3",
):
    noisy_logger = logging.getLogger(logger_name)
    noisy_logger.setLevel(logging.WARNING)


In [3]:
from google import genai
from google.genai import types
import os
from typing import List
import time
import random

# Point the Google client libraries at a service-account key file.
# NOTE(review): absolute, user-specific path that unconditionally overrides any
# value already present in the environment — consider making it configurable.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/scratch/asing725/CSE336/privacy_qa/prompt-qwen-3-9adf83a178fb.json'
# NOTE(review): PROJECT_ID is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
PROJECT_ID = "prompt-qwen-3"
# Vertex AI location passed to genai.Client in get_batch_results().
LOCATION = "global"
def generate_single(
    prompt: str,
    client: genai.Client,
    model: str,
    config: types.GenerateContentConfig,
    max_retries: int = 6,
    base_delay: float = 3.0,
    max_delay: float = 90.0,
):
    """Generate a response for one prompt, retrying failures with backoff.

    The prompt is wrapped in a single user-role message and sent through
    ``client.models.generate_content``. Any exception triggers a retry after a
    truncated exponential delay plus a small random jitter.

    Args:
        prompt: Text of the user message.
        client: Client whose ``models.generate_content`` method is called.
        model: Model identifier forwarded to the API.
        config: Generation config forwarded to the API.
        max_retries: Total number of attempts; values below 1 are treated as 1
            so the function never returns without having tried.
        base_delay: Delay in seconds before the first retry; doubles each retry.
        max_delay: Upper bound in seconds on the backoff delay (before jitter).

    Returns:
        The response object returned by ``client.models.generate_content``.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    contents = [
        types.Content(
            role="user",
            parts=[types.Part(text=prompt)]
        )
    ]

    # Guarantee at least one attempt so a non-positive max_retries cannot make
    # the function fall through and return None.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            return client.models.generate_content(
                model=model,
                contents=contents,
                config=config,
            )
        except Exception:
            if attempt == attempts - 1:
                raise  # re-raise on final failure

            # Truncated exponential backoff ...
            delay = min(max_delay, base_delay * (2 ** attempt))
            # ... plus up to 10% jitter so that callers failing at the same
            # moment do not all retry in lockstep.
            time.sleep(delay + random.uniform(0.0, 0.1 * delay))

def get_batch_results(prompts: List, max_tokens: int, temperature: float, model_name: str) -> List["types.GenerateContentResponse"]:
    """Run a list of prompts sequentially through the Vertex AI API.

    Args:
        prompts: Prompt strings; each one is sent as its own request.
        max_tokens: Accepted for interface compatibility. It is NOT applied:
            the ``max_output_tokens`` setting is left disabled below.
        temperature: Sampling temperature applied to every request.
        model_name: Model identifier forwarded to ``generate_single``.

    Returns:
        One response object per prompt, in input order (the raw objects
        returned by ``generate_single``, not plain strings).
    """
    client = genai.Client(
        vertexai=True,
        location=LOCATION
    )

    config = types.GenerateContentConfig(
        # Forward the caller's temperature; without it the service default is
        # used and the argument is silently ignored.
        temperature=temperature,
        # max_output_tokens=max_tokens,
    )

    return [
        generate_single(prompt, client, model_name, config)
        for prompt in prompts
    ]

# Inference driver (local vLLM or Vertex AI API)

In [None]:
# Maximum number of tokens to generate; used as the default `max_tokens`
# of run_hf_inference().
MAX_TOKENS = 250
# Number of prompts processed per inference batch in run_hf_inference().
batch_size = 10

import torch
from datetime import datetime
import gc
from vllm import LLM, SamplingParams
from typing import List
from itertools import islice
import json
import time
from pathlib import Path
from tqdm.notebook import tqdm

def _write_results_json(output_file, results):
    """Write one file's accumulated results as indented UTF-8 JSON, creating parent directories."""
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)


def run_hf_inference(
    prompts_dict,
    model_name: str,
    model_path,
    temperature: float = 0.01,
    max_tokens: int = MAX_TOKENS,
    tensor_parallel_size: int = NUM_GPUS,
    **sampling_kwargs):
    """
    Run inference on a dictionary of prompts and save the results as JSON.

    Models whose name contains "llama" (case-insensitive) are loaded locally
    with vLLM; every other model is queried through get_batch_results().
    Prompts are processed in batches of `batch_size`, and after each batch the
    result files that received new entries are rewritten, so partial results
    are already on disk if the run is interrupted.

    Args:
        prompts_dict: Dictionary mapping output_file -> list of prompts
        model_name: Name of the model; also stored in every result record
        model_path: Path to the model weights (used only for local vLLM models)
        temperature: Sampling temperature
        max_tokens: Maximum tokens to generate
        tensor_parallel_size: Number of GPUs to use for the local vLLM engine
        **sampling_kwargs: Additional keyword arguments for vLLM SamplingParams
            (local models only)

    Returns:
        None (saves results to files)
    """
    use_local_llm = 'llama' in model_name.lower()

    print(f"Loading model: {model_name}")
    print(f"Model path: {model_path}")
    print(f"Using {tensor_parallel_size} GPUs")
    if use_local_llm:
        llm = LLM(
            model=model_path,
            dtype="auto",
            # Honour the caller's value (previously the global NUM_GPUS was
            # always used, whatever was passed or printed above).
            tensor_parallel_size=tensor_parallel_size,
            # pipeline_parallel_size=NUM_GPUS,
            trust_remote_code=True,
            gpu_memory_utilization=0.95,
            # max_model_len=50000,
        )

        sampling_params = SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
            **sampling_kwargs,
        )

    # Flatten all prompts and track which output file each one belongs to
    flat = [
        (output_file, prompt)
        for output_file, prompts in prompts_dict.items()
        for prompt in prompts
    ]
    all_prompts = [prompt for _, prompt in flat]
    prompt_to_file = [output_file for output_file, _ in flat]

    print(f"\nRunning inference on {len(all_prompts)} prompts")

    # Results accumulated per output file; the current list length doubles as
    # the per-file index of the next record.
    file_results = {output_file: [] for output_file in prompts_dict.keys()}

    # Process prompts in batches
    for i in tqdm(range(0, len(all_prompts), batch_size), desc="Inference Batches"):
        batch_prompts = all_prompts[i:i + batch_size]
        batch_files = prompt_to_file[i:i + batch_size]

        # Generate texts
        if use_local_llm:
            generated_texts = llm.generate(batch_prompts, sampling_params)
        else:
            generated_texts = get_batch_results(batch_prompts, max_tokens, temperature, model_name)

        # Collect results for this batch
        for prompt, generated_text, output_file in zip(batch_prompts, generated_texts, batch_files):
            records = file_results[output_file]
            if use_local_llm:
                result = {
                    "prompt_question_index": len(records),
                    "prompt": prompt,
                    "response": generated_text.outputs[0].text,
                    "prompt_length": len(generated_text.prompt_token_ids),
                    "response_length": len(generated_text.outputs[0].token_ids),
                    "model": model_name,
                }
            else:
                # Usage metadata may be absent on a response; record None for
                # the token counts instead of aborting the whole run.
                usage = generated_text.usage_metadata
                result = {
                    "prompt_question_index": len(records),
                    "prompt": prompt,
                    "response": generated_text.text,
                    "prompt_length": usage.prompt_token_count if usage is not None else None,
                    "response_length": usage.candidates_token_count if usage is not None else None,
                    "model": model_name,
                    "raw_response":str(generated_text),
                }
            records.append(result)

        # Checkpoint only the files that changed in this batch; rewriting every
        # file after every batch makes total I/O quadratic in the prompt count.
        for output_file in dict.fromkeys(batch_files):
            _write_results_json(output_file, file_results[output_file])

    # Files whose prompt list was empty are never touched by a batch; still
    # emit them (as an empty list) whenever any inference actually ran.
    if all_prompts:
        for output_file, results in file_results.items():
            if not results:
                _write_results_json(output_file, results)

# Supported models. Each value is either the sentinel "vertex_api" or a
# filesystem path to local model weights; process_prompts() passes it to
# run_hf_inference() as `model_path`.
# NOTE: run_hf_inference() chooses local vs. API inference from the model
# *name* (whether it contains "llama"), not from the value stored here.
MODELS = {
    "openai/gpt-oss-120b-maas": "vertex_api",
    "gemini-2.0-flash-001":"vertex_api",
    "meta-llama/Llama-3.3-70B-Instruct": "/scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b",
    "qwen/qwen3-next-80b-a3b-instruct-maas": "vertex_api",
}

def process_prompts(base_path: str, model_name: str):
    """Collect all prompt files under `base_path` and run inference on them.

    For every dataset in RUN_DATASET, each ``prompts*.json`` file in
    ``base_path/<dataset>`` is loaded (a single JSON object is treated as a
    one-element list) and the ``'prompt'`` field of every entry is collected.
    The prompts of each file are keyed by an output path derived from the file
    name by replacing its leading "prompts" with ``results_<model_name>``, and
    everything is handed to run_hf_inference() in one call.

    Note: a `model_name` containing "/" yields an output path with an extra
    directory level (e.g. ``results_openai/...``).

    Args:
        base_path: Root directory containing one sub-directory per dataset.
        model_name: Key into MODELS; also embedded in the output file names.

    Returns:
        None (results are written by run_hf_inference()).

    Raises:
        KeyError: If `model_name` is not in MODELS, or an entry lacks 'prompt'.
    """
    # Fail fast with a readable message instead of a bare KeyError after every
    # prompt file has already been loaded.
    if model_name not in MODELS:
        raise KeyError(f"Unknown model {model_name!r}; expected one of {sorted(MODELS)}")

    base_dir = Path(base_path)

    print(f"Starting to process prompts from: {base_path}")
    print("=" * 80)

    # Dictionary to store all prompts: output_file -> list of prompts
    prompts_dict = {}

    for dataset in RUN_DATASET:
        dataset_dir = base_dir / dataset

        if not dataset_dir.exists():
            print(f"Skipping {dataset} - directory not found")
            continue

        print(f"\nProcessing dataset: {dataset}")
        print("-" * 80)

        for json_file in sorted(dataset_dir.glob("prompts*.json")):
            print(f"\nFile: {json_file.name}")

            # Results are written as UTF-8, so read the prompts the same way
            # rather than relying on the platform's default encoding.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle both single object and array
            if isinstance(data, dict):
                data = [data]

            print(f"  Found {len(data)} entries")

            # Replace only the leading "prompts" (the glob guarantees the name
            # starts with it); later occurrences in the name are kept.
            result_name = json_file.name.replace("prompts", f"results_{model_name}", 1)
            output_file = str(dataset_dir / result_name)
            prompts_dict[output_file] = [entry['prompt'] for entry in data]

    # Single run_hf_inference call with all prompts
    if prompts_dict:
        run_hf_inference(prompts_dict, model_path=MODELS[model_name], model_name=model_name)

    print("\n" + "=" * 80)
    print("Processing complete!")
    print("=" * 80)



INFO 01-02 06:58:34 [importing.py:44] Triton is installed but 0 active driver(s) found (expected 1). Disabling Triton to prevent runtime errors.
INFO 01-02 06:58:34 [importing.py:68] Triton not installed or not compatible; certain GPU-related functions will not be available.




In [None]:
# Root directory that contains one sub-directory of prompts*.json files per dataset.
# NOTE(review): absolute, user-specific path — consider making it configurable.
BASE_PATH = "/scratch/asing725/CSE336/privacy_qa/all_prompts"
# Load every prompt file for the configured datasets and run inference with MODEL_NAME.
process_prompts(BASE_PATH, model_name=MODEL_NAME)

Starting to process prompts from: /scratch/asing725/CSE336/privacy_qa/all_prompts

Processing dataset: fir
--------------------------------------------------------------------------------

File: prompts_setup1_k10.json
  Found 1001 entries

File: prompts_setup1_k5.json
  Found 1001 entries

File: prompts_setup2_k10.json
  Found 1001 entries

File: prompts_setup2_k5.json
  Found 1001 entries

Processing dataset: fir_hash
--------------------------------------------------------------------------------

File: prompts_setup1_k10.json
  Found 570 entries

File: prompts_setup1_k5.json
  Found 570 entries

File: prompts_setup2_k10.json
  Found 570 entries

File: prompts_setup2_k5.json
  Found 570 entries

Processing dataset: cfpb
--------------------------------------------------------------------------------

File: prompts_setup1_k10.json
  Found 1356 entries

File: prompts_setup1_k5.json
  Found 1356 entries

File: prompts_setup2_k10.json
  Found 1356 entries

File: prompts_setup2_k5.json
 

Inference Batches:   0%|          | 0/1171 [00:00<?, ?it/s]