In [6]:
import asyncio
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import aiohttp
import nest_asyncio
import requests

# NOTE(review): presumably needed so asyncio code can run inside the notebook's
# already-running event loop — confirm against the async cells of this notebook.
nest_asyncio.apply()

# Base URL every request helper prefixes to its endpoint. It defaults to the
# local deployment and can be overridden with the API_BASE_URL environment
# variable. A trailing "/" is stripped because endpoints are passed with a
# leading "/" (e.g. "/predict"), which would otherwise produce "//".
API_BASE_URL = os.environ.get("API_BASE_URL", "http://localhost:8080/api").rstrip("/")

print(f"API Base URL: {API_BASE_URL}\n")

# --- Helper function to make requests ---
def make_request(method, endpoint, payload=None, expected_status=200):
    """Send a GET or POST request to the API, print a short report, return the result.

    Args:
        method: HTTP verb, case-insensitive. Only "GET" and "POST" are supported.
        endpoint: Path appended verbatim to ``API_BASE_URL`` (e.g. ``"/healthz"``).
        payload: Body sent as JSON with POST requests; not used for GET.
        expected_status: Status code that is treated as success.

    Returns:
        The decoded JSON body when the status matches ``expected_status``, the
        raw response text when that body is not valid JSON, or a dict
        ``{"error": ..., "status_code": ...}`` when the status differs or the
        request itself fails (``status_code`` is ``"N/A"`` in the latter case).

    Raises:
        ValueError: If ``method`` is neither GET nor POST.
    """
    # Single source of truth for the payload echo, so the printed label and the
    # slice length cannot drift apart.
    payload_preview_chars = 150
    url = f"{API_BASE_URL}{endpoint}"
    verb = method.upper()
    try:
        if verb == "GET":
            response = requests.get(url, timeout=60)
        elif verb == "POST":
            response = requests.post(url, json=payload, timeout=120)  # Longer timeout for predict
        else:
            # Not a RequestException, so it propagates to the caller.
            raise ValueError(f"Unsupported method: {method}")

        print(f"--- Request to {verb} {endpoint} ---")
        if payload:
            print(f"Payload (first {payload_preview_chars} chars if long): {str(payload)[:payload_preview_chars]}")

        if response.status_code != expected_status:
            print(f"Status: {response.status_code} - Error: {response.text[:300]}...")
            return {"error": response.text, "status_code": response.status_code}

        print(f"Status: {response.status_code} OK")
        try:
            res_json = response.json()
        except ValueError:
            # ValueError covers the JSON decode errors raised by both old and
            # new `requests` releases; naming requests.exceptions.JSONDecodeError
            # here fails with AttributeError where that class does not exist.
            print(f"Response (not JSON): {response.text[:300]}...")
            return response.text
        print(f"Response (sample): {str(res_json)[:300]}...")
        return res_json
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {endpoint}: {e}")
        return {"error": str(e), "status_code": "N/A"}

API Base URL: http://localhost:8080/api



## 1. Checking Basic API Endpoints

In [13]:
print("\n--- Checking Basic API Endpoints ---")
# Probe the endpoints in order; make_request prints a report for each call.
#   "/readiness" might fail if default model preload fails
#   "/metrics"   is the Prometheus metrics endpoint
for probe_endpoint in ("/", "/healthz", "/readiness", "/metrics"):
    make_request("GET", probe_endpoint)
# Kept as the cell's final bare expression so the notebook also displays the
# returned value (cache info of whichever worker answers).
make_request("GET", "/cache_info")


--- Checking Basic API Endpoints ---
--- Request to GET / ---
Status: 200 OK
Response (sample): {'message': 'HuggingFace Dynamic Inference API', 'active_device': 'GPU', 'default_model_status': 'Preload attempted, cache populated', 'current_cache_size': 2}...
--- Request to GET /healthz ---
Status: 200 OK
Response (sample): {'status': 'ok', 'pid': 26}...
--- Request to GET /readiness ---
Status: 200 OK
Response (sample): {'status': 'ready', 'message': "Default model 'distilbert-base-uncased-finetuned-sst-2-english' accessible/loadable.", 'pid': 26}...
--- Request to GET /metrics ---
Status: 200 OK
Response (not JSON): # HELP hf_requests_total Total number of inference requests
# TYPE hf_requests_total counter
hf_requests_total{http_status="200",model_name="gpt2",task="text-generation",worker_pid="25"} 2.0
hf_requests_total{http_status="200",model_name="distilbert-base-uncased-finetuned-sst-2-english",task="sentim...
--- Request to GET /cache_info ---
Status: 200 OK
Response (sample): {

{'pid': 26,
 'lru_cache_stats': 'CacheInfo(hits=9, misses=2, maxsize=10, currsize=2)'}

## 2. Single Sentiment Prediction Inference

In [23]:
print("\n--- Test Single Prediction (Default Model) ---")
# A single input string for the sentiment-analysis task.
single_payload_sentiment = dict(
    model_name="distilbert-base-uncased-finetuned-sst-2-english",
    task="sentiment-analysis",
    inputs="This is a fantastic product, I highly recommend it!",
)
result_single = make_request("POST", "/predict", payload=single_payload_sentiment)

# Dump every field of the returned dict, one per line.
print("\nResponse:")
for key, val in result_single.items():
    print(f"{key}: {val}")


--- Test Single Prediction (Default Model) ---
--- Request to POST /predict ---
Payload (first 50 chars if long): {'model_name': 'distilbert-base-uncased-finetuned-sst-2-english', 'task': 'sentiment-analysis', 'inputs': 'This is a fantastic product, I highly recom
Status: 200 OK
Response (sample): {'model_name': 'distilbert-base-uncased-finetuned-sst-2-english', 'task': 'sentiment-analysis', 'predictions': [{'label': 'POSITIVE', 'score': 0.9998854398727417}], 'worker_pid': 27, 'total_request_time_ms': 7.02, 'inference_execution_time_ms': 6.88, 'pipeline_from_lru_cache': True}...

Response:
model_name: distilbert-base-uncased-finetuned-sst-2-english
task: sentiment-analysis
predictions: [{'label': 'POSITIVE', 'score': 0.9998854398727417}]
worker_pid: 27
total_request_time_ms: 7.02
inference_execution_time_ms: 6.88
pipeline_from_lru_cache: True


## 3. Batch Sentiment Prediction Inference

In [22]:
print("\n--- Test Batch Prediction (Default Model) ---")
# Same model and task as the single-input request, but "inputs" is a list,
# so several sentences are sent in one request.
batch_payload_sentiment = dict(
    model_name="distilbert-base-uncased-finetuned-sst-2-english",
    task="sentiment-analysis",
    inputs=[
        "I am incredibly happy with the service.",
        "This is the worst thing I have ever bought.",
        "It's an okay movie, neither good nor bad.",
    ],
)
result_batch = make_request("POST", "/predict", payload=batch_payload_sentiment)

# Dump every field of the returned dict, one per line.
print("\nResponse:")
for key, val in result_batch.items():
    print(f"{key}: {val}")


--- Test Batch Prediction (Default Model) ---
--- Request to POST /predict ---
Payload (first 50 chars if long): {'model_name': 'distilbert-base-uncased-finetuned-sst-2-english', 'task': 'sentiment-analysis', 'inputs': ['I am incredibly happy with the service.', 
Status: 200 OK
Response (sample): {'model_name': 'distilbert-base-uncased-finetuned-sst-2-english', 'task': 'sentiment-analysis', 'predictions': [{'label': 'POSITIVE', 'score': 0.9998799562454224}, {'label': 'NEGATIVE', 'score': 0.9997836947441101}, {'label': 'POSITIVE', 'score': 0.9969683289527893}], 'worker_pid': 27, 'total_reques...

Response:
model_name: distilbert-base-uncased-finetuned-sst-2-english
task: sentiment-analysis
predictions: [{'label': 'POSITIVE', 'score': 0.9998799562454224}, {'label': 'NEGATIVE', 'score': 0.9997836947441101}, {'label': 'POSITIVE', 'score': 0.9969683289527893}]
worker_pid: 27
total_request_time_ms: 20.76
inference_execution_time_ms: 20.63
pipeline_from_lru_cache: True


## 4. Test a Different Model and Task (Text Generation)

In [26]:
# Requesting a model that has not been used before makes the server download
# and cache it, so the first call can take noticeably longer.
print("\n--- Test Different Model/Task (Text Generation with GPT-2) ---")
# For actual generation, you might want 'gpt2-medium' or larger.
generation_payload = dict(
    model_name="gpt2",
    task="text-generation",
    inputs="Once upon a time, in a land far away",
    # Arguments for the pipeline
    pipeline_kwargs=dict(max_new_tokens=20, num_return_sequences=1),
)
result_generation = make_request("POST", "/predict", payload=generation_payload)

# Dump every field of the returned dict, one per line.
print("\nResponse:")
for key, val in result_generation.items():
    print(f"{key}: {val}")



--- Test Different Model/Task (Text Generation with GPT-2) ---
--- Request to POST /predict ---
Payload (first 50 chars if long): {'model_name': 'gpt2', 'task': 'text-generation', 'inputs': 'Once upon a time, in a land far away', 'pipeline_kwargs': {'max_new_tokens': 20, 'num_ret
Status: 200 OK
Response (sample): {'model_name': 'gpt2', 'task': 'text-generation', 'predictions': [{'generated_text': 'Once upon a time, in a land far away, a man, the son of a rich merchant of a wealthy family, was caught in the midst'}], 'worker_pid': 28, 'total_request_time_ms': 153.7, 'inference_execution_time_ms': 153.56, 'pip...

Response:
model_name: gpt2
task: text-generation
predictions: [{'generated_text': 'Once upon a time, in a land far away, a man, the son of a rich merchant of a wealthy family, was caught in the midst'}]
worker_pid: 28
total_request_time_ms: 153.7
inference_execution_time_ms: 153.56
pipeline_from_lru_cache: True


## 5. Parallel Requests using `ThreadPoolExecutor`

In [34]:
print("\n--- Demonstrate Parallel Requests (ThreadPoolExecutor) ---")
# Eight sample sentences, repeated twice -> 16 requests in total.
parallel_texts_sentiment = 2 * [
    "The weather today is beautiful and sunny.",
    "I'm feeling a bit down after hearing the news.",
    "This new software update is incredibly buggy.",
    "The concert was an unforgettable experience!",
    "Customer support was surprisingly helpful and efficient.",
    "I am neutral about this new policy change.",
    "This book is a masterpiece of modern literature.",
    "The food at that restaurant was utterly disappointing.",
]

def send_predict_request(text_input, model_name, task):
    """POST one input to ``/predict`` and return the decoded JSON response.

    Args:
        text_input: Value sent as the payload's ``"inputs"`` field.
        model_name: Value sent as the payload's ``"model_name"`` field.
        task: Value sent as the payload's ``"task"`` field.

    Returns:
        The decoded JSON body on success. On failure, a dict with the keys
        ``"error"`` (exception text), ``"input_text"`` and ``"status_code"``;
        the status code is ``"N/A"`` when no HTTP response was received.
    """
    payload = {"model_name": model_name, "task": task, "inputs": text_input}
    url = f"{API_BASE_URL}/predict"
    # Explicit sentinel instead of probing locals(): stays None when the
    # request fails before any response object exists.
    response = None
    try:
        response = requests.post(url, json=payload, timeout=120)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError also covers a non-JSON body on `requests` releases whose
        # decode error is not a RequestException, so a worker thread returns an
        # error dict instead of raising.
        status_code = response.status_code if response is not None else "N/A"
        return {"error": str(e), "input_text": text_input, "status_code": status_code}

start_time_parallel = time.time()
parallel_results_sentiment = []
# Aggressively use workers (one client thread per request) to demonstrate
# server parallelism. max(1, ...) keeps ThreadPoolExecutor from rejecting
# max_workers=0 when the input list is empty.
with ThreadPoolExecutor(max_workers=max(1, len(parallel_texts_sentiment))) as executor:
    futures = [
        executor.submit(send_predict_request, text, "distilbert-base-uncased-finetuned-sst-2-english", "sentiment-analysis")
        for text in parallel_texts_sentiment
    ]
    # as_completed yields futures as they finish, so the request numbers below
    # reflect completion order, not submission order.
    for i, future in enumerate(as_completed(futures)):
        try:
            data = future.result()
        except Exception as exc:
            print(f"Parallel Req {i+1}/{len(parallel_texts_sentiment)} generated an exception: {exc}")
            parallel_results_sentiment.append({"error": str(exc)})
            continue

        # Exactly one entry is recorded per request; reporting happens after the
        # append and can no longer add a second "error" entry for the same request.
        parallel_results_sentiment.append(data)
        if not isinstance(data, dict):
            print(f"Parallel Req {i+1}/{len(parallel_texts_sentiment)}:\tUnexpected response: {str(data)[:100]}")
        elif "error" in data:
            # Error dicts returned by send_predict_request are shown as errors
            # instead of being rendered as a row of 'N/A' values.
            print(f"Parallel Req {i+1}/{len(parallel_texts_sentiment)}:\tERROR: {data['error']} (Status: {data.get('status_code', 'N/A')})")
        else:
            print(f"Parallel Req {i+1}/{len(parallel_texts_sentiment)}:\tPID: {data.get('worker_pid', 'N/A')} | Predictions:'{str(data.get('predictions', 'N/A'))}' | Total Request Time (ms): '{str(data.get('total_request_time_ms', 'N/A'))}'")

end_time_parallel = time.time()
print(f"\nThreadPoolExecutor: Completed {len(parallel_results_sentiment)} sentiment requests in {end_time_parallel - start_time_parallel:.2f} seconds.")

# A response counts as successful when it reports the worker PID that served it.
worker_pids_sentiment = set()
successful_sentiment_requests = 0
for res in parallel_results_sentiment:
    if isinstance(res, dict) and "worker_pid" in res:
        worker_pids_sentiment.add(res['worker_pid'])
        successful_sentiment_requests += 1
print(f"Sentiment requests handled by PIDs: {worker_pids_sentiment} ({successful_sentiment_requests} successful)")


--- Demonstrate Parallel Requests (ThreadPoolExecutor) ---
Parallel Req 1/16:	PID: 27 | Predictions:'[{'label': 'POSITIVE', 'score': 0.9998791217803955}]' | Total Request Time (ms): '19.96'
Parallel Req 2/16:	PID: 26 | Predictions:'[{'label': 'NEGATIVE', 'score': 0.9992559552192688}]' | Total Request Time (ms): '21.83'
Parallel Req 3/16:	PID: 25 | Predictions:'[{'label': 'NEGATIVE', 'score': 0.9997283816337585}]' | Total Request Time (ms): '23.3'
Parallel Req 4/16:	PID: 28 | Predictions:'[{'label': 'POSITIVE', 'score': 0.9998494386672974}]' | Total Request Time (ms): '21.97'
Parallel Req 5/16:	PID: 27 | Predictions:'[{'label': 'POSITIVE', 'score': 0.9997918009757996}]' | Total Request Time (ms): '8.01'
Parallel Req 6/16:	PID: 25 | Predictions:'[{'label': 'NEGATIVE', 'score': 0.9997954964637756}]' | Total Request Time (ms): '6.49'
Parallel Req 7/16:	PID: 26 | Predictions:'[{'label': 'POSITIVE', 'score': 0.9998459815979004}]' | Total Request Time (ms): '7.31'
Parallel Req 8/16:	PID: 28 

In [49]:
print("\n--- Demonstrate Parallel Text Generation (ThreadPoolExecutor) ---")

# Eight text-generation prompts, repeated twice -> 16 requests in total.
parallel_prompts_generation = 2 * [
    "Once upon a time, in a land far away,",
    "The secret to a happy life is",
    "Artificial intelligence will eventually",
    "My favorite a_model_prompt is",
    "To build a successful startup, one must",
    "The future of space exploration looks",
    "A recipe for a perfect day includes",
    "If I could travel anywhere in time, I would go to",
]

# Updated function to include pipeline_kwargs
def send_predict_request_with_kwargs(prompt_input, model_name, task, pipeline_kwargs=None):
    """POST one prompt, plus optional pipeline arguments, to ``/predict``.

    Args:
        prompt_input: Value sent as the payload's ``"inputs"`` field.
        model_name: Value sent as the payload's ``"model_name"`` field.
        task: Value sent as the payload's ``"task"`` field.
        pipeline_kwargs: Dict sent as ``"pipeline_kwargs"``; ``None`` is
            replaced by an empty dict.

    Returns:
        The decoded JSON body on success. Otherwise a dict with ``"error"``,
        ``"input_text"`` and ``"status_code"``; the status code is taken from
        the exception's response when one is attached and is ``"N/A"`` in all
        other cases.
    """
    request_body = {
        "model_name": model_name,
        "task": task,
        "inputs": prompt_input,
        "pipeline_kwargs": {} if pipeline_kwargs is None else pipeline_kwargs,
    }
    predict_url = f"{API_BASE_URL}/predict"
    try:
        # Text generation gets a longer timeout than the other request helpers.
        reply = requests.post(predict_url, json=request_body, timeout=180)
        reply.raise_for_status()
        return reply.json()
    except requests.exceptions.RequestException as e:
        # The exception carries the HTTP response only when one was received.
        failed_reply = getattr(e, "response", None)
        code = "N/A" if failed_reply is None else failed_reply.status_code
        return {"error": str(e), "input_text": prompt_input, "status_code": code}
    except Exception as ex:
        # Anything else that goes wrong while building or handling the request.
        return {"error": f"Unexpected error: {str(ex)}", "input_text": prompt_input, "status_code": "N/A"}


start_time_parallel_gen = time.time()
parallel_results_generation = []

# Sent unchanged as the "pipeline_kwargs" field of every generation request.
generation_kwargs = dict(
    max_new_tokens=100,      # number of new tokens requested per prompt
    num_return_sequences=1,  # one generated sequence per prompt
)

# At most four client threads, fewer when there are fewer prompts.
num_client_threads = min(4, len(parallel_prompts_generation))
print(f"Sending {len(parallel_prompts_generation)} text generation requests with {num_client_threads} parallel client threads...")

with ThreadPoolExecutor(max_workers=num_client_threads) as executor:
    futures = [
        executor.submit(
            send_predict_request_with_kwargs,
            prompt,
            "gpt2",
            "text-generation",
            generation_kwargs,
        )
        for prompt in parallel_prompts_generation
    ]
    # Futures are handled in completion order; the printed index is that order.
    for i, future in enumerate(as_completed(futures)):
        try:
            data = future.result()
            if "error" not in data:
                generated_texts = data.get('predictions', [])
                display_text = "N/A"
                if isinstance(generated_texts, list) and len(generated_texts) > 0:
                    # Only the first prediction is shown, for brevity.
                    first_prediction = generated_texts[0]
                    if isinstance(first_prediction, dict) and "generated_text" in first_prediction:
                        # First 100 chars, newlines replaced
                        display_text = first_prediction["generated_text"].replace('\n', ' ')[:100]
                    else:
                        # Unknown structure: fall back to the raw representation.
                        display_text = str(first_prediction)

                print(f"Parallel Gen Req {i+1}/{len(parallel_prompts_generation)}:\tPID: {data.get('worker_pid', 'N/A')} | Output: '{display_text}'\t| Time(ms): {data.get('total_request_time_ms', 'N/A')}")
            else:
                # The helper reports failures as a dict carrying an "error" key.
                print(f"Parallel Gen Req {i+1}/{len(parallel_prompts_generation)} -> ERROR: {data.get('error')} (Status: {data.get('status_code', 'N/A')}) | Input: '{data.get('input_text', 'N/A')[:50]}...'")
        except Exception as exc:
            # Covers anything raised by future.result() or by the reporting above.
            print(f"Parallel Gen Req {i+1}/{len(parallel_prompts_generation)} generated an exception in future processing: {exc}")
            parallel_results_generation.append({"error": str(exc)})
        else:
            # Record the result only when nothing above raised.
            parallel_results_generation.append(data)


end_time_parallel_gen = time.time()
print(f"\nThreadPoolExecutor: Completed {len(parallel_results_generation)} text generation requests in {end_time_parallel_gen - start_time_parallel_gen:.2f} seconds.")

# Successful = a dict that names its worker PID and carries no "error" key.
worker_pids_generation = set()
successful_generation_requests = 0
for res in parallel_results_generation:
    if not isinstance(res, dict):
        continue
    if "worker_pid" not in res or "error" in res:
        continue
    worker_pids_generation.add(res['worker_pid'])
    successful_generation_requests += 1
print(f"Text generation requests handled successfully by PIDs: {worker_pids_generation} ({successful_generation_requests} successful out of {len(parallel_prompts_generation)} attempts)")



--- Demonstrate Parallel Text Generation (ThreadPoolExecutor) ---
Sending 16 text generation requests with 4 parallel client threads...
Parallel Gen Req 1/16:	PID: 28 | Output: 'The secret to a happy life is the desire to make it in the world. And that's what we do here in Cana'	| Time(ms): 1137.43
Parallel Gen Req 2/16:	PID: 27 | Output: 'Once upon a time, in a land far away, were not the birds of the sky. The world was never in a sense '	| Time(ms): 1138.69
Parallel Gen Req 3/16:	PID: 26 | Output: 'My favorite a_model_prompt is always "You're not playing at this time."  If you're getting a call fr'	| Time(ms): 1156.39
Parallel Gen Req 4/16:	PID: 25 | Output: 'Artificial intelligence will eventually be the next big thing.  At the same time, AI will only be ab'	| Time(ms): 1158.08
Parallel Gen Req 5/16:	PID: 27 | Output: 'The future of space exploration looks bright, but it's not ready yet."  NASA has been working with t'	| Time(ms): 1111.38
Parallel Gen Req 6/16:	PID: 28 | Output: 'T