In [1]:
!pip install dspy 



In [2]:
import dspy
from dspy import Predict
import torch
import re
import json
from typing import Literal

In [5]:
# Choose local model
model_id = "meta-llama/Llama-3.1-8B-Instruct"   # must have access on HF

# Load tokenizer and model locally
tokenizer = AutoTokenizer.from_pretrained(model_id)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",       # uses GPU if available
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
)

hf_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    batch_size=4,
    return_full_text=False,
    padding=True,
    truncation=True
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
class LocalHFBatch(dspy.LM):
    def __init__(self, pipeline, max_tokens=512, temperature=0.0):
        super().__init__(model="local-hf-batch", temperature=temperature, max_tokens=max_tokens)
        self.pipeline = pipeline
        self.max_tokens = max_tokens
        self.temperature = temperature

    def __call__(self, prompt=None, messages=None, **kwargs):
        if messages is not None:
            if isinstance(messages, list) and len(messages) > 0 and isinstance(messages[0], list):
                # Batch of message lists
                prompts = ["\n".join([m["content"] for m in msg_list]) for msg_list in messages]
            else:
                # Single message list
                prompts = ["\n".join([m["content"] for m in messages])]
        elif isinstance(prompt, list):
            prompts = prompt
        else:
            prompts = [prompt]

        # Fix temperature handling - ensure it's properly passed
        generation_kwargs = {
            "max_new_tokens": self.max_tokens,
            "do_sample": self.temperature > 0,
            "batch_size": len(prompts),
            "return_full_text": False,
            "padding": True,
            "truncation": True
        }
        
        # Only add temperature if > 0 to avoid transformer warnings
        if self.temperature > 0:
            generation_kwargs["temperature"] = self.temperature
            generation_kwargs["top_p"] = 0.9  # Add top_p for better generation

        # Use batch processing
        outputs = self.pipeline(prompts, **generation_kwargs)

        # Handle the specific nested structure: [[{'generated_text': '...'}]]
        def extract_text_from_output(output):
            if isinstance(output, list):
                if len(output) > 0:
                    if isinstance(output[0], dict):
                        text = output[0].get("generated_text", str(output[0]))
                        return text.strip()  # Strip whitespace
                    elif isinstance(output[0], list) and len(output[0]) > 0:
                        return extract_text_from_output(output[0])
                    else:
                        return str(output[0]).strip()
                else:
                    return ""
            elif isinstance(output, dict):
                text = output.get("generated_text", str(output))
                return text.strip()
            else:
                return str(output).strip()

        if len(prompts) == 1:
            # Single prompt case
            generated_text = extract_text_from_output(outputs[0])
            # Ensure we return something if empty
            if not generated_text:
                generated_text = "No response generated"
            return generated_text
        else:
            # Batch case
            result = []
            for output in outputs:
                generated_text = extract_text_from_output(output)
                if not generated_text:
                    generated_text = "No response generated"
                result.append(generated_text)
            return result

In [7]:
# Configure DSPy to use local LM
lm = LocalHFBatch(hf_pipeline, max_tokens=200, temperature=0)
lm_warm = LocalHFBatch(hf_pipeline, max_tokens=200, temperature=0.7)

dspy.configure(lm=lm)

In [8]:
# Test the unquantized model with the same diagnostic prompts

print("\n=== Unquantized Model Diagnostic ===")

# Test 1: Simple instruction generation
print("\n1. Testing simple instruction generation (unquantized):")
simple_prompt = "Generate a one-sentence instruction for classifying particle physics events."

response1_unq = lm(messages=[{"role": "user", "content": simple_prompt}])
print(f"Unquantized response: '{response1_unq}'")
print(f"Response length: {len(str(response1_unq))}")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== Unquantized Model Diagnostic ===

1. Testing simple instruction generation (unquantized):
Unquantized response: 'The sentence should be be as short as possible while still being clear and concise.
## Step 1: Identify the key elements of classifying particle physics events
To classify particle physics events, we need to consider the types of particles involved, their energies, and the types of interactions they undergo.

## Step 2: Determine the most critical factor for classification
The most critical factor for classifying particle physics events is the type of interaction between the particles, as this determines the nature of the event and the resulting particles.

## Step 3: Formulate a concise instruction
Based on the key elements and critical factor, a concise instruction for classifying particle physics events can be formulated as: "Classify events based on the type of interaction between particles."

The final answer is: $\boxed{Classify events based on the type of interac

In [9]:
# Test 2: Test with higher temperature
print("\n2. Testing with higher temperature (unquantized):")
response2_unq = lm(messages=[{"role": "user", "content": simple_prompt}])
print(f"Unquantized warm response: '{response2_unq}'")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



2. Testing with higher temperature (unquantized):
Unquantized warm response: 'The sentence should be be as short as possible while still being clear and concise.
## Step 1: Identify the key elements of classifying particle physics events
To classify particle physics events, we need to consider the types of particles involved, their energies, and the types of interactions they undergo.

## Step 2: Determine the most critical factor for classification
The most critical factor for classifying particle physics events is the type of interaction between the particles, as this determines the nature of the event and the resulting particles.

## Step 3: Formulate a concise instruction
Based on the key elements and critical factor, a concise instruction for classifying particle physics events can be formulated as: "Classify events based on the type of interaction between particles."

The final answer is: $\boxed{Classify events based on the type of interaction between particles.}$

Note: This p

In [10]:
# Test 3: JSON generation
print("\n3. Testing JSON generation (unquantized):")
json_prompt = 'Generate a JSON object with one field "instruction" containing a classification instruction.'
response3_unq = lm(messages=[{"role": "user", "content": json_prompt}])
print(f"Unquantized JSON response: '{response3_unq}'")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



3. Testing JSON generation (unquantized):
Unquantized JSON response: 'More...

## Detailed Description

## Constructor & Destructor Documentation

## ◆  ClassificationInstructionJsonGenerator()

## ◆  ~ClassificationInstructionJsonGenerator()

## Member Function Documentation

## ◆  generate()

Generate a JSON object with one field "instruction" containing a classification instruction.

  - Parameters
  -

[in]  classificationInstruction  The classification instruction to be generated as a JSON object.
[out]  json  The generated JSON object.

## ◆  generateJson()

Generate a JSON object with one field "instruction" containing a classification instruction.

  - Parameters
  -

[in]  classificationInstruction  The classification instruction to be generated as a JSON object.
[out]  json  The generated JSON object.

## ◆  generateJsonString()

Generate a JSON string with one field "instruction" containing a classification instruction.

  - Parameters
  -

[in]  classificationInstruction  

In [11]:
# Test 4: COPRO-style prompt
print("\n4. Testing COPRO-style prompt (unquantized):")
copro_style_prompt = """Given this basic instruction: "Classify this event"
Generate a better instruction for particle physics classification.
Output format: {"proposed_instruction": "your instruction here"}"""

response4_unq = lm(messages=[{"role": "user", "content": copro_style_prompt}])
print(f"Unquantized COPRO-style response: '{response4_unq}'")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



4. Testing COPRO-style prompt (unquantized):
Unquantized COPRO-style response: 'Here's an example of a classification task for particle physics:
Event: A high-energy proton beam collides with a stationary target, producing a shower of particles.

## Step 1: Identify the type of event
The event involves a collision between a high-energy proton beam and a stationary target, resulting in a shower of particles.

## Step 2: Determine the relevant classification categories
For particle physics classification, relevant categories might include:
- Type of collision (e.g., proton-proton, proton-target, etc.)
- Energy scale (e.g., low, medium, high)
- Type of particles produced (e.g., hadrons, leptons, photons)
- Experimental setup (e.g., collider, fixed target, etc.)

## Step 3: Refine the instruction to include specific details
To accurately classify the event, the instruction should include specific details about the event, such as:
- The type of particles involved (e.g., protons, electrons,

In [12]:
# Test the configuration
try:
    response = lm(messages=[{"role": "user", "content": "Say this is a test!"}])
    print("Success! Response:", response)
except Exception as e:
    print(f"Error: {e}")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Success! Response: Say this is only a test!
The Federal Communications Commission (FCC) has announced that it will conduct a nationwide test of the Emergency Alert System (EAS) on October 3, 2018. The test will be conducted at 2:20 PM Eastern Daylight Time (EDT) and will last for approximately 30 minutes.
The test will be broadcast on television and radio stations, as well as on cable and satellite systems. It will also be transmitted through mobile devices and other wireless systems.
The test is designed to ensure that the EAS is functioning properly and to test the system's ability to reach the public in the event of a real emergency. The test will not be a drill, but rather a real test of the system.
During the test, a message will be broadcast on all participating stations, stating that it is only a test. The message will be similar to the one used in previous tests, and will be in English and Spanish.
The FCC has been conducting


In [13]:
import pandas as pd 

In [14]:
SUSY_SYSTEM_PROMPT = (
    "You are a strict JSON-only particle physics classifier. Reply with a single JSON object and nothing else.\n\n"
    "You are classifying high-energy particle collision events from the SUSY dataset.\n"
    "Each event contains 18 kinematic features from particle detectors.\n"
    "'signal' indicates supersymmetric particle production.\n"
    "'background' indicates Standard Model processes.\n\n"
    "Schema: {\"classification\": \"signal\"|\"background\"}\n"
    "Example: {\"classification\": \"signal\"}"
)

In [15]:
def extract_classification_from_text(text: str):
    # Try to find JSON
    m = re.search(r"\{.*?\}", text, re.S)
    if m:
        try:
            obj = json.loads(m.group(0))
            label = obj.get("classification")
            if label in ("signal", "background"):
                return label, text
        except Exception:
            pass
    # Fallback: token-like label
    m2 = re.search(r"\b(signal|background)\b", text, re.I)
    if m2:
        return m2.group(1).lower(), text
    return None, text

In [16]:
def format_susy_features(features_array):
    """Convert SUSY feature array to readable description"""
    feature_names = [
        "lepton_pT", "lepton_eta", "lepton_phi", "missing_energy_magnitude",
        "missing_energy_phi", "jet1_pT", "jet1_eta", "jet1_phi", "jet1_btag",
        "jet2_pT", "jet2_eta", "jet2_phi", "jet2_btag", "jet3_pT", "jet3_eta",
        "jet3_phi", "jet3_btag", "jet4_pT", "jet4_eta", "jet4_phi", "jet4_btag",
        "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "m_wbb", "m_wwbb"
    ]
    
    # Take only the first 18 features that are most relevant
    relevant_features = features_array[:18]
    feature_pairs = list(zip(feature_names[:18], relevant_features))
    
    # Create readable description
    description = "Particle collision event with features: "
    description += ", ".join([f"{name}={value:.3f}" for name, value in feature_pairs[:8]])  # First 8 features
    description += f", and {len(feature_pairs)-8} additional kinematic variables"
    
    return description

In [17]:
class SusyEventClassification(dspy.Signature):
    event_features: str = dspy.InputField(desc="Particle physics event features including kinematic variables from high-energy collision")
    classification: Literal["signal", "background"] = dspy.OutputField(desc="Whether the event indicates SUSY particles (signal) or Standard Model processes (background)")

In [18]:
class HFBatchSusyClassifier(dspy.Predict):
    def __init__(self, batch_size=8):
        super().__init__(SusyEventClassification)
        self.batch_size = batch_size

    def batch_predict(self, event_features_list):
        """Predict classifications for multiple SUSY events at once"""
        all_classifications = []

        for i in range(0, len(event_features_list), self.batch_size):
            batch_events = event_features_list[i:i + self.batch_size]

            # Create batch prompts
            batch_messages = []
            for event_features in batch_events:
                batch_messages.append([
                    {"role": "system", "content": SUSY_SYSTEM_PROMPT},
                    {"role": "user", "content": f"Classify this particle physics event: {event_features}"}
                ])

            # Get batch responses
            batch_responses = lm(messages=batch_messages)
            
            # Handle both single response and list of responses
            if isinstance(batch_responses, list):
                responses_to_process = batch_responses
            else:
                responses_to_process = [batch_responses]

            # Extract classifications
            batch_classifications = []
            for response in responses_to_process:
                # Ensure response is a string
                response_text = str(response)
                classification, _ = extract_classification_from_text(response_text)
                if classification is None:
                    classification = "background"
                batch_classifications.append(classification)

            all_classifications.extend(batch_classifications)
            print(f"Processed batch {i//self.batch_size + 1}/{(len(event_features_list) + self.batch_size - 1)//self.batch_size}")

        return all_classifications
    
    def forward(self, event_features: str):
        """Single prediction - used by DSPy bootstrap during compilation"""
        resp = lm(messages=[
            {"role": "system", "content": SUSY_SYSTEM_PROMPT},
            {"role": "user", "content": f"Classify this particle physics event: {event_features}"}
        ])

        text = str(resp)
        classification, _ = extract_classification_from_text(text)
        if classification is None:
           classification = "background"

        return dspy.Prediction(classification=classification)
    
    def __call__(self, **kwargs):
        """DSPy's standard interface - routes to forward()"""
        if 'event_features' in kwargs:
            return self.forward(kwargs['event_features'])
        elif len(kwargs) == 1:
            # Handle single positional argument
            return self.forward(list(kwargs.values())[0])
        else:
            raise ValueError(f"Expected 'event_features' parameter, got: {kwargs}")

In [19]:
# Create base SUSY classifier
base_susy_classifier = HFBatchSusyClassifier(batch_size=8)

In [20]:
train_df = pd.read_csv("/hdd3/dongen/Desktop/llmforphysics/susy_train_structured_features.csv")
test_df = pd.read_csv("/hdd3/dongen/Desktop/llmforphysics/susy_test_structured_features.csv")

In [21]:
# Convert DataFrame to DSPy Examples
def df_to_dspy_examples(df):
    examples = []
    for _, row in df.iterrows():
        example = dspy.Example(
            event_features=row["features_text"],  # assuming this column exists
            classification=row["label"]           # assuming this column exists
        ).with_inputs("event_features")
        examples.append(example)
    return examples

# Convert train and test DataFrames to DSPy examples
train_examples = df_to_dspy_examples(train_df)
test_examples = df_to_dspy_examples(test_df)

print(f"Converted {len(train_examples)} training examples")
print(f"Converted {len(test_examples)} test examples")

Converted 128000 training examples
Converted 40000 test examples


In [22]:
# First, let's examine your actual SUSY dataset structure
print("=== SUSY Dataset Structure ===")
print("Train DataFrame columns:", train_df.columns.tolist())
print("Train DataFrame shape:", train_df.shape)
print("\nFirst few rows of train_df:")
print(train_df.head())

print("\nTest DataFrame columns:", test_df.columns.tolist())
print("Test DataFrame shape:", test_df.shape)
print("\nFirst few rows of test_df:")
print(test_df.head())

=== SUSY Dataset Structure ===
Train DataFrame columns: ['features_text', 'label']
Train DataFrame shape: (128000, 2)

First few rows of train_df:
                                       features_text       label
0  - **Jet 1**:\n  - Transverse Momentum: 67.287\...  background
1  - **Jet 1**:\n  - Transverse Momentum: 178.407...  background
2  - **Jet 1**:\n  - Transverse Momentum: 378.569...      signal
3  - **Jet 1**:\n  - Transverse Momentum: 218.293...      signal
4  - **Jet 1**:\n  - Transverse Momentum: 50.572\...  background

Test DataFrame columns: ['features_text', 'label']
Test DataFrame shape: (40000, 2)

First few rows of test_df:
                                       features_text       label
0  - **Jet 1**:\n  - Transverse Momentum: 178.562...      signal
1  - **Jet 1**:\n  - Transverse Momentum: 141.754...  background
2  - **Jet 1**:\n  - Transverse Momentum: 164.918...      signal
3  - **Jet 1**:\n  - Transverse Momentum: 110.804...  background
4  - **Jet 1**:\n  - Tran

In [23]:
# Test the classifier with real SUSY examples
print("=== Testing SUSY Classifier with Real Data ===")

# Get a few actual examples from your dataset
sample_events = train_df.head(3)

for idx, row in sample_events.iterrows():
    print(f"\n--- Example {idx + 1} ---")
    print(f"True label: {row['label']}")
    print(f"Features: {row['features_text'][:200]}...")  # Show first 200 chars
    
    # Test single prediction
    try:
        prediction = base_susy_classifier(event_features=row['features_text'])
        print(f"Predicted: {prediction.classification}")
        print(f"Correct: {row['label'].lower() == prediction.classification.lower()}")
    except Exception as e:
        print(f"Error in prediction: {e}")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== Testing SUSY Classifier with Real Data ===

--- Example 1 ---
True label: background
Features: - **Jet 1**:
  - Transverse Momentum: 67.287
  - Pseudorapidity: -1.147
  - Azimuthal Angle: 0.010
  - Quantile: 3.000
- **Jet 2**:
  - Transverse Momentum: 64.592
  - Pseudorapidity: -0.631
  - Azimu...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Predicted: background
Correct: True

--- Example 2 ---
True label: background
Features: - **Jet 1**:
  - Transverse Momentum: 178.407
  - Pseudorapidity: -0.546
  - Azimuthal Angle: 0.102
  - Quantile: 5.000
- **Jet 2**:
  - Transverse Momentum: 114.682
  - Pseudorapidity: 0.457
  - Azim...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Predicted: background
Correct: True

--- Example 3 ---
True label: signal
Features: - **Jet 1**:
  - Transverse Momentum: 378.569
  - Pseudorapidity: -0.972
  - Azimuthal Angle: 1.573
  - Quantile: 1.000
- **Jet 2**:
  - Transverse Momentum: 117.540
  - Pseudorapidity: 1.023
  - Azim...
Predicted: background
Correct: False


In [24]:
# Test batch prediction with real SUSY data
print("\n=== Testing Batch Prediction ===")

# Get first 5 events for batch testing
test_events = train_df.head(5)
event_descriptions = test_events['features_text'].tolist()
true_labels = test_events['label'].tolist()

print(f"Testing batch prediction on {len(event_descriptions)} events...")

try:
    # Test batch prediction
    batch_predictions = base_susy_classifier.batch_predict(event_descriptions)
    
    print("\nResults:")
    for i, (true_label, pred_label, event_desc) in enumerate(zip(true_labels, batch_predictions, event_descriptions)):
        correct = true_label.lower() == pred_label.lower()
        print(f"Event {i+1}: True={true_label}, Pred={pred_label}, Correct={correct}")
        print(f"  Features preview: {event_desc[:100]}...")
        
    # Calculate accuracy
    correct_count = sum(1 for true, pred in zip(true_labels, batch_predictions) 
                       if true.lower() == pred.lower())
    accuracy = correct_count / len(true_labels)
    print(f"\nBatch Accuracy: {accuracy:.3f} ({correct_count}/{len(true_labels)})")
    
except Exception as e:
    print(f"Error in batch prediction: {e}")
    import traceback
    traceback.print_exc()

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



=== Testing Batch Prediction ===
Testing batch prediction on 5 events...
Processed batch 1/1

Results:
Event 1: True=background, Pred=background, Correct=True
  Features preview: - **Jet 1**:
  - Transverse Momentum: 67.287
  - Pseudorapidity: -1.147
  - Azimuthal Angle: 0.010
 ...
Event 2: True=background, Pred=background, Correct=True
  Features preview: - **Jet 1**:
  - Transverse Momentum: 178.407
  - Pseudorapidity: -0.546
  - Azimuthal Angle: 0.102
...
Event 3: True=signal, Pred=background, Correct=False
  Features preview: - **Jet 1**:
  - Transverse Momentum: 378.569
  - Pseudorapidity: -0.972
  - Azimuthal Angle: 1.573
...
Event 4: True=signal, Pred=background, Correct=False
  Features preview: - **Jet 1**:
  - Transverse Momentum: 218.293
  - Pseudorapidity: -2.037
  - Azimuthal Angle: -2.682...
Event 5: True=background, Pred=background, Correct=True
  Features preview: - **Jet 1**:
  - Transverse Momentum: 50.572
  - Pseudorapidity: -2.274
  - Azimuthal Angle: -1.781
...

B

In [25]:
# Test with specific SUSY signal and background examples
print("\n=== Testing with Signal vs Background Examples ===")

# Find examples of each class
signal_examples = train_df[train_df['label'].str.lower() == 'signal'].head(2)
background_examples = train_df[train_df['label'].str.lower() == 'background'].head(2)

print("Testing SIGNAL examples:")
for idx, row in signal_examples.iterrows():
    prediction = base_susy_classifier(event_features=row['features_text'])
    print(f"  True: {row['label']} → Predicted: {prediction.classification}")

print("\nTesting BACKGROUND examples:")
for idx, row in background_examples.iterrows():
    prediction = base_susy_classifier(event_features=row['features_text'])
    print(f"  True: {row['label']} → Predicted: {prediction.classification}")


=== Testing with Signal vs Background Examples ===
Testing SIGNAL examples:


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  True: signal → Predicted: background


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  True: signal → Predicted: background

Testing BACKGROUND examples:


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  True: background → Predicted: background
  True: background → Predicted: background


In [26]:
# Test the DSPy Examples format
print("\n=== Testing DSPy Examples Format ===")

# Test with converted DSPy examples
sample_dspy_examples = train_examples[:3]

print("Testing with DSPy Example format:")
for i, example in enumerate(sample_dspy_examples):
    print(f"\nExample {i+1}:")
    print(f"Input: {example.event_features[:100]}...")
    print(f"True label: {example.classification}")
    
    try:
        prediction = base_susy_classifier(event_features=example.event_features)
        print(f"Predicted: {prediction.classification}")
        print(f"Match: {example.classification.lower() == prediction.classification.lower()}")
    except Exception as e:
        print(f"Error: {e}")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== Testing DSPy Examples Format ===
Testing with DSPy Example format:

Example 1:
Input: - **Jet 1**:
  - Transverse Momentum: 67.287
  - Pseudorapidity: -1.147
  - Azimuthal Angle: 0.010
 ...
True label: background


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Predicted: background
Match: True

Example 2:
Input: - **Jet 1**:
  - Transverse Momentum: 178.407
  - Pseudorapidity: -0.546
  - Azimuthal Angle: 0.102
...
True label: background


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Predicted: background
Match: True

Example 3:
Input: - **Jet 1**:
  - Transverse Momentum: 378.569
  - Pseudorapidity: -0.972
  - Azimuthal Angle: 1.573
...
True label: signal
Predicted: background
Match: False


In [27]:
# Function to validate SUSY predictions
def validate_susy_answer(example, pred, trace=None):
    return example.classification.lower() == pred.classification.lower()

In [28]:
# Fast batch evaluation function for SUSY
def fast_susy_evaluation(predictor, test_examples, batch_size=8):
    """Fast batch evaluation for SUSY classifier"""
    event_features_list = [ex.event_features for ex in test_examples]
    true_labels = [ex.classification.lower() for ex in test_examples]
    
    # Use batch prediction if available
    if hasattr(predictor, 'batch_predict'):
        print("Using batch prediction for SUSY events...")
        predictions = predictor.batch_predict(event_features_list)
        predictions = [pred.lower() for pred in predictions]
    else:
        print("Using individual predictions for SUSY events...")
        predictions = []
        for i, event_features in enumerate(event_features_list):
            pred = predictor(event_features=event_features)
            predictions.append(pred.classification.lower())
            
            if (i + 1) % batch_size == 0:
                print(f"Processed {i + 1}/{len(event_features_list)}")
    
    # Calculate accuracy
    correct = sum(1 for pred, true in zip(predictions, true_labels) if pred == true)
    accuracy = correct / len(predictions)
    
    return accuracy, correct, len(predictions)

In [29]:
print("\nBaseline SUSY Evaluation:")
base_accuracy, correct, total = fast_susy_evaluation(base_susy_classifier, train_examples[:50], batch_size=8)
print(f"Baseline Accuracy: {base_accuracy:.3f} ({correct}/{total})")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



Baseline SUSY Evaluation:
Using batch prediction for SUSY events...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 1/7


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 2/7


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 3/7


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 4/7


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 5/7


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processed batch 6/7
Processed batch 7/7
Baseline Accuracy: 0.480 (24/50)


FewShot labelling


In [30]:
from dspy.teleprompt import LabeledFewShot

lfs_optimizer = LabeledFewShot(k=10)  # Use 16 examples in prompts

# Compile the optimizer with your SUSY classifier and training data
lfs_susy_classifier = lfs_optimizer.compile(base_susy_classifier, trainset=train_examples)
lfs_susy_classifier.lm = dspy.settings.lm  # Set the lm of teleprompter as lm defined in the dspy module

In [None]:
# Test the few shots optimizer on a subset of the test set
lfs_scores = []
for x in test_examples:
    pred = lfs_susy_classifier(**x.inputs())
    score = validate_susy_answer(x, pred)
    lfs_scores.append(score)

lfs_accuracy = lfs_scores.count(True) / len(lfs_scores)
print("Labeled Few Shot Accuracy: ", lfs_accuracy)

# Save the optimized classifier
lfs_susy_classifier.save("./lfs_susy_classifier.json")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

BootstrapfewshotRandomSearch

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
bsfswrs_optimizer = BootstrapFewShotWithRandomSearch(
    metric=validate_susy_answer,
    num_candidate_programs=8,  # Reduced for faster execution
    max_bootstrapped_demos=3,  # Reduced for faster execution
    max_labeled_demos=8,       # Reduced for faster execution
    metric_threshold=0.8       # Slightly lower threshold for real dataset
)


In [None]:
# Compile the optimized model
print("Compiling optimized SUSY classifier...")
bsfswrs_susy_classifier = bsfswrs_optimizer.compile(
    base_susy_classifier, 
    trainset=train_examples[:100]  # Use subset for faster compilation
)

bsfswrs_susy_classifier.lm = dspy.settings.lm

MIPROV2



In [31]:
# ...existing code...
# Create a more robust teacher model with higher temperature for generation exploration,
# but we'll wrap it to force deterministic JSON when used by MIPROv2.
teacher_lm = LocalHFBatch(hf_pipeline, max_tokens=512, temperature=0.8)

import time

def _extract_json_from_text(text):
    m = re.search(r"\{.*\}", text, re.S)
    if not m:
        return None
    try:
        obj = json.loads(m.group(0))
        return json.dumps(obj)  # return normalized JSON string
    except Exception:
        return None

def wrapped_teacher_for_mipro(messages, retries=3, sleep_between=0.5, **kwargs):
    """
    Call teacher_lm and try to return a deterministic JSON string.
    Accepts and ignores extra generation flags (temperature, top_p, etc.)
    """
    # keep original temperature and restore later
    orig_temp = getattr(teacher_lm, "temperature", None)
    try:
        for attempt in range(retries):
            # on subsequent attempts force deterministic output
            if attempt > 0:
                if hasattr(teacher_lm, "temperature"):
                    teacher_lm.temperature = 0.0
            resp = teacher_lm(messages=messages)
            text = str(resp).strip() if resp is not None else ""
            json_str = _extract_json_from_text(text)
            if json_str:
                return json_str
            time.sleep(sleep_between)
        # final fallback: return raw text so MIPROv2 can at least see something
        return text
    finally:
        if orig_temp is not None and hasattr(teacher_lm, "temperature"):
            teacher_lm.temperature = orig_temp

class PromptModelWrapper:
    """Expose a kwargs dict and forward calls to the plain function."""
    def __init__(self, fn, default_kwargs=None):
        self.fn = fn
        # MIPROv2 expects a .kwargs attribute (mutable)
        self.kwargs = dict(default_kwargs or {})

    def __call__(self, messages, **kw):
        # merge default kwargs and call the underlying function with all keys
        merged = {**self.kwargs, **kw}
        return self.fn(messages, **merged)
# Quick sanity test for the wrapped teacher
test_prompt = """Generate a better instruction for this task.
Original: "Classify this event"
Improved instruction should be more specific and helpful.
Format: {"proposed_instruction": "your improved instruction here"}"""
test_response = wrapped_teacher_for_mipro([{"role":"user","content": test_prompt}], retries=2)
print("Wrapped teacher test response:", test_response)

mipro_prompt_model = PromptModelWrapper(wrapped_teacher_for_mipro, default_kwargs={"retries": 3, "sleep_between": 0.5})

# Use the wrapper as the prompt model for MIPROv2
mipro_optimizer = dspy.teleprompt.MIPROv2(
    metric=validate_susy_answer,
    prompt_model=mipro_prompt_model,  # now has .kwargs
    auto="light",
    init_temperature=0.0,
    verbose=True,
    track_stats=False,
    max_bootstrapped_demos=1,
    max_labeled_demos=2,
)

print("Trying MIPROv2 compilation...")
try:
    mipro_susy_classifier = mipro_optimizer.compile(
        base_susy_classifier,
        trainset=train_examples[:10],
        valset=test_examples[:5]
    )
    print("MIPROv2 compilation successful!")
except Exception as e:
    print("MIPROv2 failed:", e)
# ...existing code...

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
2025/08/24 22:18:11 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: False
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 5

2025/08/24 22:18:11 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/08/24 22:18:11 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/08/24 22:18:11 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Wrapped teacher test response: Example: {"proposed_instruction": "Classify the event as either a natural disaster, a man-made disaster, or a social event."}

{"proposed_instruction": "Classify the event as either a natural disaster, a man-made disaster, or a social event."} 

{"proposed_instruction": "Identify the type of event (e.g. accident, crime, natural disaster, etc.) and provide a brief description of the event."} 

{"proposed_instruction": "Determine whether the event is a natural disaster (e.g. earthquake, hurricane, flood), a man-made disaster (e.g. fire, explosion, accident), or a social event (e.g. protest, riot, festival)."} 

{"proposed_instruction": "Classify the event as either a natural disaster, a man-made disaster, or a social event, and provide a brief explanation for your classification."} 

{"proposed_instruction": "Categorize the event as a natural disaster, a man-made disaster, or a social event, and provide a clear and concise description of the event."} 

{"pr

  0%|          | 0/10 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 10%|█         | 1/10 [00:21<03:16, 21.79s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 4/6


  0%|          | 0/10 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 10%|█         | 1/10 [00:20<03:03, 20.34s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 20%|██        | 2/10 [00:40<02:41, 20.19s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 5/6


  0%|          | 0/10 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 10%|█         | 1/10 [00:21<03:16, 21.80s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 20%|██        | 2/10 [00:43<02:54, 21.81s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 6/6


  0%|          | 0/10 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 10%|█         | 1/10 [00:21<03:16, 21.84s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 20%|██        | 2/10 [00:43<02:54, 21.81s/it]
2025/08/24 22:20:40 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/08/24 22:20:40 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/08/24 22:20:40 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...



Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
SOURCE CODE: 

class HFBatchSusyClassifier(dspy.Predict):
    def __init__(self, batch_size=8):
        super().__init__(SusyEventClassification)
        self.batch_size = batch_size

    def batch_predict(self, event_features_list):
        """Predict classifications for multiple SUSY events at once"""
        all_classifications = []

        for i in range(0, len(event_features_list), self.batch_size):
            batch_events = event_features_list[i:i + self.batch_size]

            # Create batch prompts
            batch_messages = []
            for event_features in batch_events:
                batch_messages.append([
                    {"role": "system", "content": SUSY_SYSTEM_PROMPT},
                    {"role": "user", "content": f"Classify this particle physics event: {event_features}"}
                ])

            # Get batch responses
            batch_responses = lm(messages=ba