In [17]:

import random

import numpy as np
import textgrad as tg

In [3]:
def set_seed(seed) :
    np.random.seed(seed)
    random.seed(seed)

In [4]:
from openai import OpenAI
from textgrad.engine.local_model_openai_api import ChatExternalClient

# start a server with lm-studio and point it to the right address; here we use the default address.
client = OpenAI(base_url="http://localhost:11434/v1/" , api_key="ollama")

engine = ChatExternalClient(client=client , model_string='zongwei/gemma3-translator:1b')

In [5]:
tg.set_backward_engine(engine , override=True)

initial_solution = """Hello, how are you?"""

solution = tg.Variable(initial_solution ,
                       requires_grad=True ,
                       role_description="Translate this sentence to French")

loss_system_prompt = tg.Variable("""You will evaluate a solution to a math question.
Do not attempt to solve it yourself, do not give a solution, only identify errors. Be super concise.""" ,
                                 requires_grad=False ,
                                 role_description="system prompt")

loss_fn = tg.TextLoss(loss_system_prompt)
optimizer = tg.TGD([ solution ])

In [6]:
loss = loss_fn(solution)
print(loss.value)

I'm doing well, thank you for asking!


In [7]:
loss.backward( )
optimizer.step( )
print(solution.value)

Bonjour, comment allez-vous?


In [8]:
from datasets import load_dataset

ds = load_dataset("aimped/medical-translation-test-set")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
ds[ "en_fr" ]

Dataset({
    features: ['source', 'target'],
    num_rows: 1049
})

In [12]:
# split into train_set, val_set, test_set
train_size = int(0.8 * len(ds[ "en_fr" ]))
val_size = int(0.1 * len(ds[ "en_fr" ]))
test_size = len(ds[ "en_fr" ]) - train_size - val_size
train_set = ds[ "en_fr" ].select(range(0 , train_size))
val_set = ds[ "en_fr" ].select(range(train_size , train_size + val_size))
test_set = ds[ "en_fr" ].select(range(train_size + val_size , len(ds[ "en_fr" ])))

In [14]:
test_set[ "source" ]

Column(['A method as claimed in Claim 1, wherein the inhibitor added is selected from human α 1 -antitrypsin, human serum or plasma containing human α 1 -antitrypsin, animal serum or plasma containing a substance which can combine with free elastase in such a way that human α 1 -antitrypsin combines therewith to inhibit elastase activity, and a synthetic material containing a substance which can combine with free elastase in such a way that human α 1 -antitrypsin combines therewith to inhibit elastase activity.', 'This invention provides oligonucleotide agents that modulate an immune response by stimulating IFN production and methods of using such agents for therapeutic treatments of mammals.', 'Tel: +386 (0)1 580 00 10 Slovenská republika Eli Lilly Slovakia, s. r. o.', 'A nanoparticle according to claim 18, wherein the osteotropic gene or gene segment is selected from bone morphogenic proteins (BMP2 and 4 and others), transforming growth factor, such as TGF-β1-3, activin, phosphoprote

In [23]:
import textgrad as tg

# 1. Set the TextGrad backend engine (for generating gradients)
# Ensure you have your API key set for OpenAI or your chosen provider
tg.set_backward_engine(engine , override=True)

# 2. Define Variables
# Source Text (English)
source_text = "The quick brown fox jumps over the lazy dog."

# Initial Translation (French)
# Note: This translation is intentionally imperfect to demonstrate optimization.
# "au-dessus le" is a grammatical error; it should be "au-dessus du".
initial_translation = "Le renard brun rapide saute par-dessus du chien paresseux."

# Reference Translations (French)
# These are "Gold Standard" translations for comparison.
reference_translations = [
        "Le vif renard brun saute par-dessus le chien paresseux."
]

# Wrap text in TextGrad Variables
source = tg.Variable(source_text , requires_grad=False , role_description="source text in English")
translation = tg.Variable(initial_translation , requires_grad=True , role_description="French translation to optimize")

# Wrap references in Variables so they can be passed around easily
ref_vars = [ tg.Variable(ref , requires_grad=False , role_description="reference French translation") for ref in reference_translations ]


# 3. Define Custom BLEU Loss Function
def calculate_bleu_loss(prediction , ground_truth) :
    """
    Calculates the BLEU score between the candidate translation (prediction)
    and reference translations (ground_truth).
    Returns a TextGrad Variable representing the negative BLEU score.
    """
    # sacrebleu.corpus_bleu expects:
    # sys_stream: a list of hypothesis strings (candidate translations)
    # ref_streams: a list of lists of reference strings

    # We pass a list of 1 string because we are processing one sentence at a time here
    hypothesis_list = [ prediction.value ]

    # Extract the string values from the reference Variables
    # The inner list represents the multiple references for the single sentence
    references_list = [ [ ref.value for ref in ground_truth ] ]

    # Calculate BLEU score
    # sacrebleu automatically handles French tokenization (lowercasing, splitting "l'animal", etc.)
    bleu = sacrebleu.corpus_bleu(hypothesis_list , references_list)
    bleu_score = bleu.score

    # Return NEGATIVE BLEU score because TextGrad minimizes loss
    return tg.Variable(str(-bleu_score) , requires_grad=True , role_description="negative BLEU score")


# 4. Define Optimizer
# We optimize the 'translation' variable
optimizer = tg.TGD(parameters=[ translation ])

# 5. Optimization Loop
print(f"Source: {source.value}")
print(f"Initial Translation: {translation.value}\n")

for i in range(3) :
    # --- Step 1: Evaluate ---
    # Calculate loss using our custom function
    loss = calculate_bleu_loss(translation , ref_vars)

    print(f"--- Iteration {i + 1} ---")
    # Print the actual BLEU score (negate the loss value to get it back to positive)
    print(f"Current BLEU Score: {-float(loss.value):.2f}")

    # --- Step 2: Backward Pass ---
    # Generate textual gradients based on the loss
    loss.backward( )

    # --- Step 3: Update ---
    # Update the translation variable based on the gradients
    optimizer.step( )

    print(f"Updated Translation: {translation.value}\n")

print("Optimization Complete.")

Source: The quick brown fox jumps over the lazy dog.
Initial Translation: Le renard brun rapide saute par-dessus du chien paresseux.

--- Iteration 1 ---
Current BLEU Score: 23.74
Updated Translation: Le renard brun rapide saute par-dessus du chien paresseux.

--- Iteration 2 ---
Current BLEU Score: 23.74
Updated Translation: Le renard brun rapide saute par-dessus du chien paresseux.

--- Iteration 3 ---
Current BLEU Score: 23.74
Updated Translation: Le renard brun rapide saute par-dessus du chien paresseux.

Optimization Complete.


In [36]:
def eval_sample(x , y , model) :
    """
    This function allows us to evaluate if an answer to a question in the prompt is a good answer.

    """
    x = tg.Variable(x , requires_grad=False , role_description="query to the language model")

    references_list = [ y ]
    y = tg.Variable(y , requires_grad=False , role_description="correct answer for the query")
    response = model(x)

    hypothesis_list = [ response.value ]
    print("Hypothesis: " , response)
    # Calculate BLEU score
    # sacrebleu automatically handles French tokenization (lowercasing, splitting "l'animal", etc.)
    bleu = sacrebleu.corpus_bleu(hypothesis_list , references_list)
    bleu_score = bleu.score
    return tg.Variable(str(-bleu_score) , requires_grad=True , role_description="negative BLEU score")

In [37]:
# test on a sample from the test set
sample = test_set[ 0 ]
x = "Translate this sentence to French: " + sample[ "source" ]
y = sample[ "target" ]

system_prompt = tg.Variable("Translate this sentence to French." ,
                            requires_grad=True ,
                            role_description="system prompt to the language model")
model = tg.BlackboxLLM(engine , system_prompt)
eval_sample(x , y , model)

Hypothesis:  Here’s a translation of the sentence into French, aiming for accuracy and a slightly formal tone suitable for a scientific document:

“Un procédé comme décrit dans la revendication 1, dans lequel l'inhibiteur ajouté est sélectionné à partir d'alpha-1-antitrypsin humain, serum ou plasma contenant de l'alpha-1-antitrypsin humain, serum ou plasma contenant une substance qui combine avec l'elastase d'une manière telle que l'alpha-1-antitrypsin combine avec elle à inhiber l'activité de l'elastase, et un matériau synthétique contenant une substance qui combine avec l'elastase d'une manière telle que l'alpha-1-antitrypsin combine avec elle à inhiber l'activité de l'elastase.”

Here's a breakdown of why I chose these words:

*   **“Un procédé comme décrit dans la revendication 1”** - "A method as described in Claim 1" - This is a standard and accurate way to introduce the subject.
*   **“dans lequel”** - "wherein" -  A more formal way to introduce the concept.
*   **“l'inhibiteur 

Variable(value=-0.0, role=negative BLEU score, grads=set())

In [26]:
sample

{'source': 'A method as claimed in Claim 1, wherein the inhibitor added is selected from human α 1 -antitrypsin, human serum or plasma containing human α 1 -antitrypsin, animal serum or plasma containing a substance which can combine with free elastase in such a way that human α 1 -antitrypsin combines therewith to inhibit elastase activity, and a synthetic material containing a substance which can combine with free elastase in such a way that human α 1 -antitrypsin combines therewith to inhibit elastase activity.',
 'target': "Un procédé selon la Revendication 1, dans lequel l'inhibiteur ajouté est sélectionné parmi l'α 1 -antitrypsine, du sérum ou du plasma humain contenant de l'α 1 -antitrypsine humaine, du sérum ou du plasma animal contenant une substance qui peut se combiner à l'élastase libre de telle sorte que l'α 1 -antitrypsine humaine s'y combine pour inhiber l'activité de l'élastase, et une matière synthétique contenant une substance qui peut se combiner à l'élastase libre d

In [38]:

import dspy

lm = dspy.LM("openai/zongwei/gemma3-translator:1b" , api_key="ollama" , api_base="http://localhost:11434/v1/")
dspy.configure(lm=lm)

In [43]:

rag = dspy.ChainOfThought("question -> response")

question = "Translate this sentence to French: What's the name of the castle that David Gregory inherited?"
rag(question=question)

Prediction(
    reasoning='The sentence is a straightforward question asking for the name of a castle. French doesn’t require a translation – it’s simply asking for the name.',
    response='Le nom du château que David Gregory a hérité ?'
)

In [46]:
import dspy
import sacrebleu

# 1. Configure DSPy with your LLM
# Make sure OPENAI_API_KEY is set in your environment variables
lm = dspy.LM("openai/zongwei/gemma3-translator:1b" , api_key="ollama" , api_base="http://localhost:11434/v1/")
dspy.configure(lm=lm)


# 2. Define the Signature (Input/Output interface)
class FrenchTranslation(dspy.Signature) :
    """Translate English sentences to French accurately and fluently."""
    english_sentence = dspy.InputField(desc="The sentence in English to translate.")
    french_translation = dspy.OutputField(desc="The translation in French.")


# 3. Define the Metric (BLEU Score)
def bleu_metric(gold , pred , trace=None) :
    """
    DSPy metric function.
    - gold: The example from the dataset containing the ground truth.
    - pred: The prediction object from the LLM containing the output.
    """
    # Extract the hypothesis (prediction) and reference (gold)
    hypothesis = [ pred.french_translation ]
    references = [ [ gold.french_translation ] ]

    # Calculate BLEU score using sacrebleu
    bleu = sacrebleu.corpus_bleu(hypothesis , references)
    return bleu.score


# 4. Prepare Data (Trainset)
# DSPy needs a small set of examples to "bootstrap" and optimize the prompt.
trainset = [
        dspy.Example(
                english_sentence="The quick brown fox jumps over the lazy dog." ,
                french_translation="Le vif renard brun saute par-dessus le chien paresseux."
        ).with_inputs("english_sentence") ,

        dspy.Example(
                english_sentence="Hello, how are you?" ,
                french_translation="Bonjour, comment allez-vous ?"
        ).with_inputs("english_sentence") ,

        dspy.Example(
                english_sentence="Machine learning is fascinating." ,
                french_translation="L'apprentissage automatique est fascinant."
        ).with_inputs("english_sentence") ,
]

# 5. Initialize the Student Program
# We start with a basic Predict module.
# DSPy will optimize this by filling in the "few-shot" examples in the prompt.
translator = dspy.Predict(FrenchTranslation)

# 6. Configure the Teleprompter (Optimizer)
# BootstrapFewShot: Selects the best examples from the trainset to include in the prompt
# to maximize the defined metric (BLEU score).
teleprompter = dspy.BootstrapFewShot(
        metric=bleu_metric ,
        max_bootstrapped_demos=3 ,  # Number of examples to include in the final prompt
        max_labeled_demos=1  # Number of fixed examples to always include
)

# 7. Compile (Optimize) the Program
print("Compiling/Optimizing the translator based on BLEU score...")
optimized_translator = teleprompter.compile(student=translator , trainset=trainset)

print("\n--- Optimization Complete ---")

# 8. Test the Optimized Program
test_sentence = "The cat is sleeping on the mat."
print(f"\nSource: {test_sentence}")

# Run the optimized program
result = optimized_translator(english_sentence=test_sentence)

# Print the Result
print(f"\nTranslation: {result.french_translation}")

# --- Safe History Inspection (Debugging) ---
# We wrap this in a check to avoid the TypeError if history is None/empty
history = lm.inspect_history(n=1)
if history :
    print("\n--- Last LLM Call (Prompt) ---")
    # We limit the print length to keep it readable
    print(history[ -1 ][ 'messages' ][ 0 ][ 'content' ][ :500 ] + "...")
else :
    print("\n[Note: LLM history tracking is empty or not available in this configuration]")

# Optional: Calculate BLEU for this specific test if you have a reference
reference = "Le chat dort sur le tapis."
bleu_test = sacrebleu.corpus_bleu([ result.french_translation ] , [ [ reference ] ])
print(f"\nTest BLEU Score (vs reference): {bleu_test.score:.2f}")

Compiling/Optimizing the translator based on BLEU score...


100%|██████████| 3/3 [00:00<00:00, 67.54it/s]

Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 3 attempts.

--- Optimization Complete ---

Source: The cat is sleeping on the mat.

Translation: Le chat dort sur le tapis.




[34m[2026-01-09T18:21:54.387267][0m

[31mSystem message:[0m

Your input fields are:
1. `english_sentence` (str): The sentence in English to translate.
Your output fields are:
1. `french_translation` (str): The translation in French.
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## english_sentence ## ]]
{english_sentence}

[[ ## french_translation ## ]]
{french_translation}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Translate English sentences to French accurately and fluently.


[31mUser message:[0m

[[ ## english_sentence ## ]]
The quick brown fox jumps over the lazy dog.


[31mAssistant message:[0m

[[ ## french_translation ## ]]
Le rapide renard brun saute par-dessus le chien par




In [47]:
qa = dspy.Predict('question: str -> response: str')
response = qa(question="what are high memory and low memory on linux?")

print(response.response)

High and low memory usage on Linux are complex and depend heavily on the specific workload and configuration. However, here’s a breakdown of general trends and factors:

**High Memory Usage (Generally indicates potential for performance issues):**

*   **Memory Leaks:** A common problem is memory leaks – where memory is allocated but never released.
*   **Large Applications/Processes:** Applications or processes that consume a lot of memory, especially during long-running tasks, can cause high memory usage.
*   **Memory-Intensive Tasks:** Tasks like video encoding, large database queries, or scientific simulations often have a significant memory footprint.
*   **Unoptimized Code:** Poorly written code can lead to excessive memory allocation.
*   **Insufficient RAM:** If the overall system has insufficient RAM to handle the load, higher memory usage becomes unavoidable.
*   **Data-Driven Systems:**  If a system relies heavily on reading data from disk (e.g., databases), memory pressure 

In [48]:
dspy.inspect_history(n=1)





[34m[2026-01-09T18:27:34.156401][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str):
Your output fields are:
1. `response` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

Inputs will have the following structure:

[[ ## question ## ]]
{question}

Outputs will be a JSON object with the following fields.

{
  "response": "{response}"
}
In adhering to this structure, your objective is: 
        Given the fields `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## question ## ]]
what are high memory and low memory on linux?

Respond with a JSON object in the following order of fields: `response`.


[31mResponse:[0m

[32m{
  "response": "High and low memory usage on Linux are complex and depend heavily on the specific workload and configuration. However, here’s a breakdown of general trends and factors:\n\n**High Memory Usage (Generally indicates potential for performance issues):**