# core

> Core functionality for synthetic data generation

In [None]:
#| default_exp core

In [None]:
#| hide
from IPython.display import Markdown

In [None]:
#| export
from claudette import *
from fastcore.utils import *
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
import os

import concurrent.futures

In [None]:
#| export
class FastData:
    """Generate structured outputs for many prompts concurrently.

    Prompts are built from a template, sent through `self.cli.structured`
    from a thread pool, and throttled to `calls` requests per `period`
    seconds by the `ratelimit` decorators.
    """

    def __init__(self,
                 model: str = "claude-3-haiku-20240307",
                 calls: int = 100,
                 period: int = 60,
                 api_key: str = None):
        """Create the client and install the rate limiter.

        model: model name handed to `Client`.
        calls / period: at most `calls` requests per `period` seconds.
        api_key: if given, exported as `ANTHROPIC_API_KEY`.
        """
        # Export the key *before* building the client; previously it was set
        # only after `Client(model)` had already been constructed.
        if api_key:
            os.environ["ANTHROPIC_API_KEY"] = api_key
        self.cli = Client(model)
        self.set_rate_limit(calls, period)

    def set_rate_limit(self, calls: int, period: int):
        """Set a new rate limit of `calls` requests per `period` seconds."""
        @sleep_and_retry
        @limits(calls=calls, period=period)
        def rate_limited_call(prompt: str, schema, temp: float, sp: str):
            # `sp` is forwarded so the caller's system prompt is actually used
            # (it used to be accepted here and then dropped).
            return self.cli.structured(
                prompt,
                temp=temp,
                tools=schema,
                sp=sp,
            )[0]

        self._rate_limited_call = rate_limited_call

    def generate(self,
                 prompt_template: str,
                 inputs: list[dict],
                 schema,
                 temp: float = 1.,
                 sp: str = "You are a helpful assistant.",
                 max_workers: int = 64) -> list[dict]:
        """Run one structured call per entry of `inputs`.

        prompt_template: `str.format` template; each dict in `inputs`
            supplies its keyword arguments.
        schema: passed to the client as the tool / output schema.
        temp: sampling temperature.
        sp: system prompt.
        max_workers: size of the thread pool.

        Returns a list with one item per input, in the same order as
        `inputs`. An input whose call raised is reported on stdout and
        yields `None` in its slot.
        """

        def process_input(input_data):
            # Best effort: one failing input must not abort the whole batch.
            try:
                prompt = prompt_template.format(**input_data)
                return self._rate_limited_call(
                    prompt=prompt,
                    schema=schema,
                    temp=temp,
                    sp=sp
                )
            except Exception as e:
                print(f"Error processing input: {e}")
                return None

        # Pre-size the output and remember each future's input index so the
        # results line up with `inputs` even though futures complete in
        # arbitrary order.
        results = [None] * len(inputs)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_idx = {
                executor.submit(process_input, input_data): idx
                for idx, input_data in enumerate(inputs)
            }
            for future in tqdm(concurrent.futures.as_completed(future_to_idx), total=len(inputs)):
                results[future_to_idx[future]] = future.result()

        return results

In [None]:
#| hide
def to_md(ss):
    "Format every item of `ss` as a markdown bullet, one per line."
    bullets = [f'- {s}' for s in ss]
    return '\n'.join(bullets)
def show(ss):
    "Wrap the bullet list built from `ss` in an IPython `Markdown` object."
    md_text = to_md(ss)
    return Markdown(md_text)

In [None]:
# NOTE(review): this class is passed as `schema=` to `FastData.generate`; its
# docstring is presumably used as the schema description -- confirm before rewording.
class Translation():
    "Translation from an English phrase to a Spanish phrase"
    def __init__(self, english: str, spanish: str):
        store_attr()

    def __repr__(self):
        # English phrase, an arrow, then the Spanish phrase in markdown italics.
        en, es = self.english, self.spanish
        return f"{en} ➡ *{es}*"

Translation("Hello, how are you today?", "Hola, ¿cómo estás hoy?")

Hello, how are you today? ➡ *Hola, ¿cómo estás hoy?*

In [None]:
# Hand-written English/Spanish pairs; they are substituted into the
# `{examples}` slot of the prompt template.
examples = [
    Translation(
        english="Hello, my name is Nathan. I am a research scientist at an AI startup.",
        spanish="Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA."),
    Translation(
        english="How much wood could a woodchuck chuck if a woodchuck could chuck wood?",
        spanish="¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?"),
    Translation(
        english="Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See.",
        spanish="Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede."
    ),
]
# Render the pairs as a markdown bullet list.
show(examples)

- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*
- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*
- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.*

In [None]:
from datasets import load_dataset

In [None]:
# Load the `persona` column of the first three rows of the PersonaHub train split
personas = (
    load_dataset("proj-persona/PersonaHub", "persona", split='train')
    .select(range(3))
    ['persona']
)
show(personas)

- A Political Analyst specialized in El Salvador's political landscape.
- A legal advisor who understands the legal implications of incomplete or inaccurate project documentation
- A maternal health advocate focused on raising awareness about postpartum complications.

In [None]:
# System prompt for the generation step.
sp = "You will help generate synthetic data of English and Spanish phrases."
# User prompt template; `{examples}` and `{persona}` are filled in via `str.format`.
prompt_template = """\
<examples>
{examples}
</examples>

Create an English and Spanish translation pair that is similar to the examples and would be appropriate for the following persona:
<persona>{persona}</persona>
"""

Let's see what the prompt looks like in action:

In [None]:
# Preview the fully formatted prompt for the first persona.
examples_md = to_md(examples)
prompt = prompt_template.format(
    persona=personas[0],
    examples=examples_md,
)
print(prompt)

<examples>
- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*
- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*
- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de l

In [None]:
# Generate translations
# Fill `{examples}` with the markdown-rendered `examples_md` (the same text the
# prompt preview uses) rather than the raw list, whose string form would be a
# bracketed Python list repr.
fast_data = FastData(model="claude-3-haiku-20240307")
translations = fast_data.generate(
    prompt_template=prompt_template,
    inputs=[{"persona": persona, "examples": examples_md} for persona in personas],
    schema=Translation,
    sp=sp
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.59it/s]


In [None]:
# Render the generated translations as a markdown bullet list.
show(translations)

- Postpartum complications can be life-threatening. New mothers need support and resources to stay healthy after giving birth. ➡ *Las complicaciones posparto pueden ser mortales. Las nuevas madres necesitan apoyo y recursos para mantener su salud después del parto.*
- Incomplete or inaccurate project documentation can have serious legal consequences. It's important to ensure all project details are properly recorded and communicated. ➡ *La documentación de proyectos incompleta o inexacta puede tener graves consecuencias legales. Es importante asegurarse de que todos los detalles del proyecto se registren y se comuniquen adecuadamente.*
- El Salvador's former president Nayib Bukele has consolidated power and cracked down on opposition, drawing criticism from international observers. However, his populist policies remain popular among many Salvadorans frustrated with high crime rates and economic stagnation. ➡ *El expresidente de El Salvador, Nayib Bukele, ha consolidado el poder y reprimido a la oposición, lo que ha generado críticas de observadores internacionales. Sin embargo, sus políticas populistas siguen siendo populares entre muchos salvadoreños frustrados por las altas tasas de delincuencia y el estancamiento económico.*

In [None]:
# NOTE(review): this class is passed as `schema=` to `FastData.generate`; its
# docstring is presumably used as the schema description -- confirm before rewording.
class TranslationCritique():
    "A critique of the translation."
    def __init__(self, critique: str, score: int):
        store_attr()

    def __repr__(self):
        # Two tab-indented markdown sub-bullets: the critique text, then the score.
        critique, score = self.critique, self.score
        return f"\t- **Critique:** {critique}\n\t- **Score:** {score}"

In [None]:
# System prompt for the critique step (rebinds the `sp` used for generation).
sp = "You will help critique synthetic data of English and Spanish phrases."
# User prompt template for scoring one translation; `{translation}` is filled
# in via `str.format`.
critique_template = """\
Below is an extract of a translation. Evaluate its quality as a senior translator would, considering its suitability for professional use. Use the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:

- Add 1 point if the translation conveys the basic meaning of the source text, even if it includes some minor errors or awkward phrasing.
- Add another point if the translation is generally accurate but lacks refinement in style or fails to capture some nuances of the original. It might use inconsistent terminology or have occasional lapses in register.
- Award a third point if the translation is appropriate for professional use and accurately conveys key concepts of the source text. It demonstrates good understanding of both languages, though it may not be flawless or could include some slight inconsistencies. It resembles the work of a competent translator but may have room for improvement in fluency or precision.
- Grant a fourth point if the translation is highly accurate and reads naturally in the target language, exhibiting a consistent and appropriate style. It could be similar to the work of an experienced translator, offering faithful rendering of content and tone, with minimal errors, and effectively handling complex concepts or cultural references. The result is coherent, well-expressed, and valuable for its intended purpose.
- Bestow a fifth point if the translation is outstanding, demonstrating mastery of both source and target languages. It captures subtle nuances, maintains the author's voice and intent, and reads as if it were originally written in the target language. The translator has made excellent choices in dealing with challenging elements like wordplay, idiomatic expressions, or culture-specific content.

<translation>{translation}</translation>

After examining the translation:

- Briefly justify your total score, up to 100 words.
- Conclude with the score of the translation.
"""

In [None]:
# Critique every generated pair, using a separate FastData client on claude-3-5-sonnet.
fast_data = FastData(model="claude-3-5-sonnet-20240620")
critiques = fast_data.generate(
    critique_template,
    [{"translation": f"{t.english} -> {t.spanish}"} for t in translations],
    TranslationCritique,
    sp=sp,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.33s/it]


In [None]:
# Pair translations with critiques positionally (`zip`), so the two lists
# must be in the same order for each critique to sit under its translation.
show(f'{t}\n\n{c}' for t, c in zip(translations, critiques))

- Postpartum complications can be life-threatening. New mothers need support and resources to stay healthy after giving birth. ➡ *Las complicaciones posparto pueden ser mortales. Las nuevas madres necesitan apoyo y recursos para mantener su salud después del parto.*

	- **Critique:** This translation demonstrates a high level of accuracy and fluency. It effectively conveys the original message, maintaining the tone and emphasis on the importance of proper documentation. The translator has chosen appropriate terminology (e.g., "consecuencias legales" for "legal consequences") and has accurately rendered the passive voice construction. The translation reads naturally in Spanish and captures the nuances of the original text. While it's a strong translation, there's a slight opportunity for improvement in terms of conciseness, which prevents it from reaching the highest level of excellence.
	- **Score:** 4
- Incomplete or inaccurate project documentation can have serious legal consequences. It's important to ensure all project details are properly recorded and communicated. ➡ *La documentación de proyectos incompleta o inexacta puede tener graves consecuencias legales. Es importante asegurarse de que todos los detalles del proyecto se registren y se comuniquen adecuadamente.*

	- **Critique:** The translation accurately conveys the main ideas of the source text, maintaining the structure and key information. It correctly translates complex terms like "consolidated power" and "cracked down on opposition." The translator effectively renders "populist policies" and "economic stagnation." The text reads naturally in Spanish and maintains the original tone. There are no significant errors or awkward phrasings. The translation demonstrates a high level of competence, capturing nuances and providing a faithful rendering of the content. It's suitable for professional use, though it doesn't reach the level of exceptional mastery that would warrant a perfect score.
	- **Score:** 4
- El Salvador's former president Nayib Bukele has consolidated power and cracked down on opposition, drawing criticism from international observers. However, his populist policies remain popular among many Salvadorans frustrated with high crime rates and economic stagnation. ➡ *El expresidente de El Salvador, Nayib Bukele, ha consolidado el poder y reprimido a la oposición, lo que ha generado críticas de observadores internacionales. Sin embargo, sus políticas populistas siguen siendo populares entre muchos salvadoreños frustrados por las altas tasas de delincuencia y el estancamiento económico.*

	- **Critique:** The translation accurately conveys the main message and maintains the tone of the original text. It demonstrates a good understanding of both languages and uses appropriate medical terminology (e.g., "posparto" for "postpartum"). The Spanish version reads naturally and maintains the concise style of the source. It effectively captures the urgency of the situation with "pueden ser mortales" for "life-threatening". The translator made a good choice in translating "stay healthy" to "mantener su salud", which sounds more natural in Spanish. Overall, it's a high-quality translation suitable for professional use, with no significant errors or awkward phrasing.
	- **Score:** 5

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()