In [1]:
# /scratch/asing725/Huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/

In [2]:
# Number of GPUs the model is sharded across (tensor parallelism) in run_hf_inference.
NUM_GPUS = 2
# Default cap on the number of tokens generated per prompt.
MAX_TOKENS = 512
# Number of prompts handed to the engine per generate() call in run_hf_inference.
batch_size = 100
import os
# Allocator setting for PyTorch, supplied through the environment. It is assigned
# here, ahead of the `import torch` that follows, so it is already in place when
# torch is loaded.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import torch
from datetime import datetime
import gc
from vllm import LLM, SamplingParams
from typing import List
from itertools import islice
import json
import time
from pathlib import Path
from tqdm.notebook import tqdm

def run_hf_inference(
    prompts_dict,
    model_name: str,
    model_path,
    temperature: float = 0.01,
    max_tokens: int = MAX_TOKENS,
    tensor_parallel_size: int = NUM_GPUS,
    **sampling_kwargs):
    """
    Run batched vLLM inference over a dictionary of prompts and save the results as JSON.

    All prompts are flattened into one list, generated in slices of the
    module-level ``batch_size``, and routed back to the output file they came
    from. After every batch the files that received new results are rewritten,
    so a crash mid-run loses at most one batch.

    Args:
        prompts_dict: Dictionary mapping output_file -> list of prompts
        model_name: Name of the model; logged and stored in every result record
        model_path: Path to the model, passed to ``LLM(model=...)``
        temperature: Sampling temperature
        max_tokens: Maximum tokens to generate per prompt
        tensor_parallel_size: Number of GPUs to use for tensor parallelism
        **sampling_kwargs: Additional keyword arguments forwarded to ``SamplingParams``

    Returns:
        None (saves results to files)
    """
    print(f"Loading model: {model_name}")
    print(f"Model path: {model_path}")
    print(f"Using {tensor_parallel_size} GPUs")

    # NOTE(review): the run recorded in this notebook aborted with a CUDA error
    # raised from custom_all_reduce.cuh right after CUDA-graph capture. If that
    # reproduces, passing disable_custom_all_reduce=True to LLM is worth trying
    # -- TODO confirm on the target machine.
    llm = LLM(
        model=model_path,
        dtype="auto",
        # Honour the argument so the log line above matches the real setup.
        tensor_parallel_size=tensor_parallel_size,
        trust_remote_code=True,
        gpu_memory_utilization=0.97,
        max_model_len=30000,
    )

    # Build the sampling configuration from the caller's arguments instead of
    # hard-coded values, so the function's parameters actually take effect.
    sampling_params = SamplingParams(
        temperature=temperature,
        max_tokens=max_tokens,
        **sampling_kwargs,
    )

    # Flatten all prompts and remember which output file each one belongs to
    all_prompts = []
    prompt_to_file = []
    for output_file, prompts in prompts_dict.items():
        for prompt in prompts:
            all_prompts.append(prompt)
            prompt_to_file.append(output_file)

    print(f"\nRunning inference on {len(all_prompts)} prompts")

    # One result list per output file, in prompt order
    file_results = {output_file: [] for output_file in prompts_dict}

    # Process prompts in batches
    for start in tqdm(range(0, len(all_prompts), batch_size), desc="Inference Batches"):
        batch_prompts = all_prompts[start:start + batch_size]
        batch_files = prompt_to_file[start:start + batch_size]

        request_outputs = llm.generate(batch_prompts, sampling_params)

        for prompt, request_output, output_file in zip(batch_prompts, request_outputs, batch_files):
            completion = request_output.outputs[0]
            records = file_results[output_file]
            records.append({
                # Position of this prompt within its own file (0-based); the
                # length is read before the append, so it equals the new index.
                "prompt_question_index": len(records),
                "prompt": prompt,
                "response": completion.text,
                "prompt_length": len(request_output.prompt_token_ids),
                "response_length": len(completion.token_ids),
                "model": model_name,
            })

        # Checkpoint: only files that gained results in this batch need rewriting.
        # dict.fromkeys de-duplicates while keeping first-seen order.
        for output_file in dict.fromkeys(batch_files):
            _write_results_json(output_file, file_results[output_file])

    # Files whose prompt list was empty never appear in a batch; still emit
    # their (empty) result list so every requested output file exists.
    for output_file, records in file_results.items():
        if not records:
            _write_results_json(output_file, records)


def _write_results_json(output_file, results):
    """Write ``results`` to ``output_file`` as indented UTF-8 JSON, creating parent directories."""
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)


# def run_only_hf_inference(
#         prompts_dict,
#     model_name: str,
#     model_path,
#     temperature: float = 0.01,
#     max_tokens: int = MAX_TOKENS,
#     tensor_parallel_size: int = NUM_GPUS,
#     **sampling_kwargs):
#     """
#     Run inference on a dictionary of prompts using HuggingFace Transformers.
    
#     Args:
#         prompts_dict: Dictionary mapping output_file -> list of prompts
#         model_name: Name of the model (key in model_configs)
#         model_path: Path to the model
#         temperature: Sampling temperature
#         max_tokens: Maximum tokens to generate
#         tensor_parallel_size: Number of GPUs to use
#         **sampling_kwargs: Additional sampling parameters
    
#     Returns:
#         None (saves results to files)
#     """    
#     import torch
#     from transformers import AutoModelForCausalLM, AutoTokenizer
    
#     print(f"Loading model: {model_name}")
#     # print(f"Model path: {model_path}")
#     print(f"Using {tensor_parallel_size} GPUs")
    
#     # Load model and tokenizer
#     tokenizer = AutoTokenizer.from_pretrained(
#         model_name,
#         trust_remote_code=True
#     )
    
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         torch_dtype="auto",
#         device_map="auto",
#         trust_remote_code=True,
#         cache_dir="/scratch/asing725/Huggingface/hub",
#         max_memory={
#             0: "75GB",  # Adjust per your GPU VRAM
#             1: "75GB",
#             2: "75GB",
#         },
#     )
    
#     # Set padding token if not set
#     if tokenizer.pad_token is None:
#         tokenizer.pad_token = tokenizer.eos_token
    
#     # Flatten all prompts and track their sources
#     all_prompts = []
#     prompt_to_file = []
    
#     for output_file, prompts in prompts_dict.items():
#         for prompt in prompts:
#             all_prompts.append(prompt)
#             prompt_to_file.append(output_file)
    
#     print(f"\nRunning inference on {len(all_prompts)} prompts")

#     # Initialize results dictionary
#     file_results = {output_file: [] for output_file in prompts_dict.keys()}
#     file_counters = {output_file: 0 for output_file in prompts_dict.keys()}
    
#     # Process prompts in batches
#     for i in tqdm(range(0, len(all_prompts), batch_size), desc="Inference Batches"):
#         batch_prompts = all_prompts[i:i + batch_size]
#         batch_files = prompt_to_file[i:i + batch_size]
        
#         # Tokenize batch
#         inputs = tokenizer(
#             batch_prompts,
#             return_tensors="pt",
#             padding=True,
#             padding_side='left',
#             truncation=True,
#             max_length=50000 - max_tokens,
#         ).to(model.device)
        
#         # Generate texts
#         with torch.no_grad():
#             outputs = model.generate(
#                 **inputs,
#                 max_new_tokens=max_tokens,
#                 temperature=temperature,
#                 do_sample=temperature > 0,
#                 pad_token_id=tokenizer.pad_token_id
#             )
        
#         # Decode generated texts
#         generated_texts = tokenizer.batch_decode(
#             outputs[:, inputs['input_ids'].shape[1]:],
#             skip_special_tokens=True
#         )
        
#         # Collect results for this batch
#         for prompt, generated_text, output, input_ids, output_file in zip(
#             batch_prompts, generated_texts, outputs, inputs['input_ids'], batch_files
#         ):
#             result = {
#                 "prompt_question_index": file_counters[output_file],
#                 "prompt": prompt,
#                 "response": generated_text,
#                 "prompt_length": len(input_ids),
#                 "response_length": len(output) - len(input_ids),
#                 "model": model_name,
#             }
#             file_results[output_file].append(result)
#             file_counters[output_file] += 1
    
#         # Save results to respective files
#         for output_file, results in file_results.items():
#             output_path = Path(output_file)
#             output_path.parent.mkdir(parents=True, exist_ok=True)
#             with open(output_path, 'w', encoding='utf-8') as f:
#                 json.dump(results, f, indent=2, ensure_ascii=False)
    
#     return


import json
from pathlib import Path
# Locally cached checkpoints: model id -> snapshot directory on disk.
# process_prompts looks the selected model up here and hands the path to the
# inference function. Only uncommented pairs are available; enable an
# alternative by uncommenting its line.
MODELS = dict([
    # ("openai/gpt-oss-120b", "/scratch/asing725/Huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"),
    # ("meta-llama/Llama-3.1-70B-Instruct", "/scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.1-70B-Instruct/snapshots/1605565b47bb9346c5515c34102e054115b4f98b"),
    ("meta-llama/Llama-3.3-70B-Instruct", "/scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b"),
    # ("Qwen/Qwen2.5-72B-Instruct", "/scratch/asing725/Huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/495f39366efef23836d0cfae4fbe635880d2be31"),
])

def process_prompts(base_path: str, model_name: str) -> None:
    """
    Collect every prompt under ``base_path`` and run one inference job over all of them.

    For each dataset sub-directory ('cfpb', 'fir', 'fir_hash') that exists,
    every ``*.json`` file whose name contains "prompts" is loaded. Each file
    holds either a single entry (JSON object) or a list of entries; the
    'prompt' field of every entry is collected. The output path for a file is
    its own name with "prompts" replaced by ``results_{model_name}``. A
    ``model_name`` containing '/' therefore places the results in a
    sub-directory of the dataset folder.

    Args:
        base_path: Directory containing the dataset sub-directories
        model_name: Key into ``MODELS``; also embedded in the output file names

    Returns:
        None (results are written by ``run_hf_inference``)

    Raises:
        KeyError: if an entry lacks 'prompt' or 'setup', or if prompts were
            found and ``model_name`` is not a key of ``MODELS``
    """
    base_dir = Path(base_path)

    # Datasets to process
    datasets = ['cfpb', 'fir', 'fir_hash']

    print(f"Starting to process prompts from: {base_path}")
    print("=" * 80)

    # Dictionary to store all prompts: output_file -> list of prompts
    prompts_dict = {}

    for dataset in datasets:
        dataset_dir = base_dir / dataset

        if not dataset_dir.exists():
            print(f"Skipping {dataset} - directory not found")
            continue

        print(f"\nProcessing dataset: {dataset}")
        print("-" * 80)

        for json_file in sorted(dataset_dir.glob("*.json")):
            # The output name is derived by replacing "prompts" in the input
            # name. Without that substring the output path would equal the
            # input path and the results would overwrite the source file.
            if "prompts" not in json_file.name:
                print(f"\nSkipping {json_file.name} - not a prompts file")
                continue

            print(f"\nFile: {json_file.name}")

            # Read as UTF-8 explicitly, matching how the results are written,
            # rather than depending on the locale's default encoding.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle both single object and array
            if isinstance(data, dict):
                data = [data]

            print(f"  Found {len(data)} entries")

            all_prompts = []
            for entry in data:
                prompt = entry['prompt']
                # 'setup' is not consumed here; the lookup is kept so an entry
                # without it still fails loudly with KeyError.
                _ = entry['setup']
                all_prompts.append(prompt)

            # Store prompts with their output file path
            output_file = str(dataset_dir / json_file.name.replace("prompts", f"results_{model_name}"))
            prompts_dict[output_file] = all_prompts
            print()

    # Single run_hf_inference call with all prompts, so the model loads once
    if prompts_dict:
        run_hf_inference(prompts_dict, model_path=MODELS[model_name], model_name=model_name)

    print("\n" + "=" * 80)
    print("Processing complete!")
    print("=" * 80)

INFO 12-31 06:53:19 __init__.py:190] Automatically detected platform cuda.


In [None]:
# NUM_GPUS = 3
# MAX_TOKENS = 512
# batch_size = 200

# import torch
# from datetime import datetime
# import gc
# from vllm import LLM, SamplingParams
# from typing import List
# from itertools import islice
# import json
# import time
# from pathlib import Path
# from tqdm.notebook import tqdm

# def run_hf_inference(
#     output_file,
#     prompts: List[str],
#     model_name: str,
#     model_path,
#     temperature: float = 0.01,
#     max_tokens: int = MAX_TOKENS,
#     tensor_parallel_size: int = NUM_GPUS,
#     **sampling_kwargs):
#     """
#     Run inference on a list of prompts using vLLM.
    
#     Args:
#         prompts: List of input prompts
#         model_name: Name of the model (key in model_configs)
#         model_path: Path to the model (only printed; the engine is loaded from model_name)
#         output_file: Path to save results as JSON; rewritten after every batch (skipped when falsy)
#         temperature: Sampling temperature
#         max_tokens: Maximum tokens to generate
#         tensor_parallel_size: Number of GPUs to use
#         batch_size: Batch size for inference (read from the module-level variable, not passed as an argument)
#         **sampling_kwargs: Additional sampling parameters
    
#     Returns:
#         None (results are written to output_file)
#     """    
#     print(f"Loading model: {model_name}")
#     print(f"Model path: {model_path}")
#     print(f"Using {tensor_parallel_size} GPUs")
    
#     llm = LLM(
#         model=model_name,
#         dtype="auto",
#         # tensor_parallel_size=tensor_parallel_size,
#         trust_remote_code=True,
#         gpu_memory_utilization=0.95,
#         max_model_len=50000,
#     )

#     sampling_params = SamplingParams(
#         temperature=0.01,
#         max_tokens=MAX_TOKENS,
#     )

#     print(f"\nRunning inference on {len(prompts)} prompts")

#     results = []
    
#     # Process prompts in batches
#     for i in tqdm(range(0, len(prompts), batch_size),desc="Inference Batches"):
#         batch_prompts = prompts[i:i + batch_size]
        
#         # Tokenize batch
#         generated_texts = llm.generate(batch_prompts, sampling_params)
#         # Collect results for this batch
#         for j, (prompt, generated_text) in enumerate(zip(batch_prompts, generated_texts)):
#             result = {
#                 "prompt_question_index": i + j,
#                 "prompt": prompt,
#                 "response": generated_text.outputs[0].text,
#                 "prompt_length": len(generated_text.prompt_token_ids),
#                 "response_length": len(generated_text.outputs[0].token_ids),
#                 "model": model_name,
#             }
#             results.append(result)
#         if output_file:
#             output_path = Path(output_file)
#             output_path.parent.mkdir(parents=True, exist_ok=True)
#             with open(output_path, 'w', encoding='utf-8') as f:
#                 json.dump(results, f, indent=2, ensure_ascii=False)
#     return 

# import json
# from pathlib import Path
# MODELS = {
#     # "openai/gpt-oss-120b": "/scratch/asing725/Huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a",
#     # "meta-llama/Llama-3.1-70B-Instruct": "/scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.1-70B-Instruct/snapshots/1605565b47bb9346c5515c34102e054115b4f98b",
#     "meta-llama/Llama-3.3-70B-Instruct": "/scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b",
#     # "Qwen/Qwen2.5-72B-Instruct": "/scratch/asing725/Huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/495f39366efef23836d0cfae4fbe635880d2be31",
# }

# def process_prompts(base_path: str, model_name: str):
#     base_dir = Path(base_path)
#     results = []

#     # Datasets to process
#     datasets = ['cfpb', 'fir', 'fir_hash']
    
#     print(f"Starting to process prompts from: {base_path}")
#     print("=" * 80)
    
#     for dataset in datasets:
#         dataset_dir = base_dir / dataset
        
#         if not dataset_dir.exists():
#             print(f"Skipping {dataset} - directory not found")
#             continue
        
#         print(f"\nProcessing dataset: {dataset}")
#         print("-" * 80)
        
#         # Get all JSON files
#         json_files = sorted(dataset_dir.glob("*.json"))
#         for json_file in json_files:
#             all_prompts = []
#             print(f"\nFile: {json_file.name}")
            
#             # Load JSON file
#             with open(json_file, 'r') as f:
#                 data = json.load(f)
            
#             # Handle both single object and array
#             if isinstance(data, dict):
#                 data = [data]
            
#             print(f"  Found {len(data)} entries")
            
#             # Process each entry
#             for idx, entry in enumerate(data, 1):
#                 # Extract required fields                           
#                 prompt = entry['prompt']
#                 setup = entry['setup']
#                 question_id = entry.get('base_question_id')
#                 all_prompts.append(prompt)
                
#             print(f"  Entry {idx}/{len(data)}: ID={question_id}, setup={setup}")
#             run_hf_inference((dataset_dir / (json_file.name).replace("prompts", f"results_{model_name}")), all_prompts, model_path=MODELS[model_name], model_name=model_name)
                    
#     print("\n" + "=" * 80)
#     print(f"Processing complete!")
#     print("=" * 80)
#     return

: 

In [None]:
# Root folder holding one sub-directory of prompt JSON files per dataset.
BASE_PATH = "/scratch/asing725/CSE336/privacy_qa/all_prompts"

# Launch the full run for the selected model. The commented calls are the
# alternative checkpoints; their MODELS entries must be enabled as well.
process_prompts(base_path=BASE_PATH, model_name="meta-llama/Llama-3.3-70B-Instruct")
# process_prompts(BASE_PATH, model_name="openai/gpt-oss-120b")
# process_prompts(BASE_PATH, model_name="Qwen/Qwen2.5-72B-Instruct")

Starting to process prompts from: /scratch/asing725/CSE336/privacy_qa/all_prompts

Processing dataset: cfpb
--------------------------------------------------------------------------------

File: prompts_setup1_k10.json
  Found 1356 entries


File: prompts_setup1_k5.json
  Found 1356 entries


File: prompts_setup2_k10.json
  Found 1356 entries


File: prompts_setup2_k5.json
  Found 1356 entries


Processing dataset: fir
--------------------------------------------------------------------------------

File: prompts_setup1_k10.json
  Found 1001 entries


File: prompts_setup1_k5.json
  Found 1001 entries


File: prompts_setup2_k10.json
  Found 1001 entries


File: prompts_setup2_k5.json
  Found 1001 entries


Processing dataset: fir_hash
--------------------------------------------------------------------------------

File: prompts_setup1_k10.json
  Found 570 entries


File: prompts_setup1_k5.json
  Found 570 entries


File: prompts_setup2_k10.json
  Found 570 entries


File: prompts_setu

`torch_dtype` is deprecated! Use `dtype` instead!


  Found 570 entries

Loading model: meta-llama/Llama-3.3-70B-Instruct
Model path: /scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b
Using 2 GPUs
INFO 12-31 06:53:38 config.py:542] This model supports multiple tasks: {'embed', 'score', 'generate', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 12-31 06:53:38 config.py:1401] Defaulting to use mp for distributed inference
INFO 12-31 06:53:38 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b', speculative_config=None, tokenizer='/scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=tor



INFO 12-31 06:53:43 custom_all_reduce_utils.py:244] reading GPU P2P access cache from /home/asing725/.cache/vllm/gpu_p2p_access_cache_for_0,1,2.json
[1;36m(VllmWorkerProcess pid=2889447)[0;0m INFO 12-31 06:53:43 custom_all_reduce_utils.py:244] reading GPU P2P access cache from /home/asing725/.cache/vllm/gpu_p2p_access_cache_for_0,1,2.json
INFO 12-31 06:53:43 shm_broadcast.py:258] vLLM message queue communication handle: Handle(connect_ip='127.0.0.1', local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_29dd9b9d'), local_subscribe_port=37625, remote_subscribe_port=None)
INFO 12-31 06:53:43 model_runner.py:1110] Starting to load model /scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b...
[1;36m(VllmWorkerProcess pid=2889447)[0;0m INFO 12-31 06:53:43 model_runner.py:1110] Starting to load model /scratch/asing725/Huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f3

Loading safetensors checkpoint shards:   0% Completed | 0/30 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=2889447)[0;0m INFO 12-31 07:07:54 model_runner.py:1115] Loading model weights took 65.7409 GB
INFO 12-31 07:07:54 model_runner.py:1115] Loading model weights took 65.7409 GB


[1;36m(VllmWorkerProcess pid=2889447)[0;0m   torch._dynamo.utils.warn_once(msg)
  torch._dynamo.utils.warn_once(msg)


INFO 12-31 07:08:08 worker.py:267] Memory profiling takes 13.48 seconds
INFO 12-31 07:08:08 worker.py:267] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.97) = 76.87GiB
INFO 12-31 07:08:08 worker.py:267] model weights take 65.74GiB; non_torch_memory takes 0.65GiB; PyTorch activation peak memory takes 3.79GiB; the rest of the memory reserved for KV Cache is 6.70GiB.
[1;36m(VllmWorkerProcess pid=2889447)[0;0m INFO 12-31 07:08:08 worker.py:267] Memory profiling takes 13.46 seconds
[1;36m(VllmWorkerProcess pid=2889447)[0;0m INFO 12-31 07:08:08 worker.py:267] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.97) = 76.87GiB
[1;36m(VllmWorkerProcess pid=2889447)[0;0m INFO 12-31 07:08:08 worker.py:267] model weights take 65.74GiB; non_torch_memory takes 0.65GiB; PyTorch activation peak memory takes 3.79GiB; the rest of the memory reserved for KV Cache is 6.70GiB.
INFO 12-31 07:08:08 executor_base.py:110] #

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:25<00:00,  1.38it/s]


Failed: Cuda error /workspace/csrc/custom_all_reduce.cuh:368 'invalid argument'
