# core

> Core functionality for synthetic data generation

In [None]:
#| default_exp core

In [None]:
#| hide
from IPython.display import Markdown

In [None]:
#| export
import concurrent.futures
import json
import shutil
from pathlib import Path
from uuid import uuid4
from typing import Optional, Union

from tqdm import tqdm
from fastcore.utils import *
from ratelimit import limits, sleep_and_retry
from huggingface_hub import CommitScheduler, DatasetCard
from claudette import *

In [None]:
#| export
# Markdown template for the README.md (dataset card) that `FastData.generate_to_hf`
# writes into the local dataset folder. It is filled in with `str.format` using the
# placeholders `title`, `model_id`, `system_prompt`, `prompt_template` and `sample_input`.
# The block between the `---` lines lists the tags attached to the dataset.
DATASET_CARD_TEMPLATE = """
---
tags:
- fastdata
- synthetic
---

# {title}

_Note: This is an AI-generated dataset, so its content may be inaccurate or false._

**Source of the data:**

The dataset was generated using [Fastdata](https://github.com/AnswerDotAI/fastdata) library and {model_id} with the following input:

## System Prompt

```
{system_prompt}
```

## Prompt Template

```
{prompt_template}
```

## Sample Input

```json
{sample_input}
```

"""


class FastData:
    """Generate schema-constrained synthetic data with a claudette `Client`.

    Every input dict is formatted into a prompt template and sent to the model through a
    rate-limited call; the requests run concurrently on a thread pool. Results can be
    returned in memory (`generate`) or additionally written as JSON Lines files that a
    `huggingface_hub.CommitScheduler` uploads to a dataset repository (`generate_to_hf`).
    """

    def __init__(self,
                 model: str = "claude-3-haiku-20240307",
                 calls: int = 100,
                 period: int = 60):
        """Create the client for `model` and allow at most `calls` requests per `period`
        (the `period` unit is whatever `ratelimit.limits` expects, i.e. seconds)."""
        self.cli = Client(model)
        self._set_rate_limit(calls, period)

    def _call_model(self, prompt: str, schema, temp: float, sp: str):
        """Send one structured request and return the first object built from the reply.

        The system prompt `sp` is forwarded to the client; previously it was accepted by
        the rate-limited wrapper but never passed on, so it had no effect.
        """
        return self.cli.structured(
            prompt,
            temp=temp,
            tools=schema,
            sp=sp,
        )[0]

    def _set_rate_limit(self, calls: int, period: int):
        """Set a new rate limit."""
        @sleep_and_retry
        @limits(calls=calls, period=period)
        def rate_limited_call(prompt: str, schema, temp: float, sp: str):
            return self._call_model(prompt, schema, temp, sp)

        self._rate_limited_call = rate_limited_call

    def _process_input(self, prompt_template, schema, temp, sp, input_data):
        """Fill `prompt_template` with `input_data` and run one rate-limited request.

        Any exception is reported with `print` and turned into `None`, so one bad input
        does not abort the whole batch.
        """
        try:
            prompt = prompt_template.format(**input_data)
            return self._rate_limited_call(
                prompt=prompt, schema=schema, temp=temp, sp=sp
            )
        except Exception as e:
            print(f"Error processing input {input_data}: {e}")
            return None

    def _submit_all(self, executor, prompt_template, schema, temp, sp, inputs):
        """Queue one `_process_input` task per input and return the futures in input order."""
        return [
            executor.submit(
                self._process_input,
                prompt_template,
                schema,
                temp,
                sp,
                input_data,
            )
            for input_data in inputs
        ]

    def _save_results(self, results: list[dict], save_path: Path) -> None:
        """Overwrite `save_path` with `results`, one JSON object per line.

        An item may be a plain dict, or an object; for objects the `__stored_args__`
        mapping is used when present and the instance `__dict__` otherwise.
        """
        with open(save_path, "w", encoding="utf-8") as f:
            for res in results:
                if isinstance(res, dict):
                    obj_dict = res
                else:
                    obj_dict = getattr(res, "__stored_args__", None)
                    if obj_dict is None:
                        obj_dict = vars(res)
                f.write(json.dumps(obj_dict) + "\n")

    def generate(self, 
                 prompt_template: str, 
                 inputs: list[dict], 
                 schema,
                 temp: float = 1.,
                 sp: str = "You are a helpful assistant.",
                 max_workers: int = 64) -> list[dict]:
        """For every input in INPUTS, fill PROMPT_TEMPLATE and generate a value fitting SCHEMA

        The returned list has one entry per input, in input order; an entry is `None`
        when the request for that input failed.
        """
        with tqdm(total=len(inputs)) as pbar:
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = self._submit_all(executor, prompt_template, schema, temp, sp, inputs)

                # Completion order only drives the progress bar; results are read from
                # `futures` below, which keeps the submission (input) order.
                for _ in concurrent.futures.as_completed(futures):
                    pbar.update(1)
        return [f.result() for f in futures]

    def generate_to_hf(
        self,
        prompt_template: str,
        inputs: list[dict],
        schema,
        repo_id: str,
        temp: float = 1.0,
        sp: str = "You are a helpful assistant.",
        max_workers: int = 64,
        max_items_per_file: int = 100,
        commit_every: Union[int, float] = 5,
        private: bool = False,
        token: Optional[str] = None,
        delete_files_after: bool = True,
    ) -> tuple[str, list[dict]]:
        """
        Generate data based on a prompt template and schema, and save it to Hugging Face dataset repository.
        This function writes the generated records to multiple files, each containing a maximum of `max_items_per_file` records. 
        Due to the multi-threaded execution of the function, the order of the records in the files is not guaranteed to match the order of the input data. 

        Args:
            prompt_template (str): The template for generating prompts.
            inputs (list[dict]): A list of input dictionaries to be processed.
            schema: The schema to parse the generated data.
            repo_id (str): The HuggingFace dataset name.
            temp (float, optional): The temperature for generation. Defaults to 1.0.
            sp (str, optional): The system prompt for the assistant. Defaults to "You are a helpful assistant.".
            max_workers (int, optional): The maximum number of worker threads. Defaults to 64.
            max_items_per_file (int, optional): The maximum number of items to save in each file. Defaults to 100.
            commit_every (Union[int, float], optional): The number of minutes between each commit. Defaults to 5.
            private (bool, optional): Whether the repository is private. Defaults to False.
            token (Optional[str], optional): The token to use to commit to the repo. Defaults to the token saved on the machine.
            delete_files_after (bool, optional): Whether to delete files after processing. Defaults to True.

        Returns:
            tuple[str, list[dict]]: A tuple with the generated repo_id and the list of generated data dictionaries.
                The list is in input order and holds `None` for inputs whose request failed;
                failed inputs are not written to the uploaded files.
        """
        dataset_dir = Path(repo_id.replace("/", "_"))
        dataset_dir.mkdir(parents=True, exist_ok=True)
        data_dir = dataset_dir / "data"
        data_dir.mkdir(exist_ok=True)

        # Stays None when the scheduler cannot be created, so the cleanup below does not
        # raise a NameError that would hide the original error.
        scheduler = None
        try:
            scheduler = CommitScheduler(
                repo_id=repo_id,
                repo_type="dataset",
                folder_path=dataset_dir,
                every=commit_every,
                private=private,
                token=token,
            )

            readme_path = dataset_dir / "README.md"

            # Write the card under the same lock as the data files so a background push
            # cannot pick up a half-written README.
            with scheduler.lock:
                if not readme_path.exists():
                    DatasetCard(
                        DATASET_CARD_TEMPLATE.format(
                            title=repo_id,
                            model_id=self.cli.model,
                            system_prompt=sp,
                            prompt_template=prompt_template,
                            sample_input=inputs[:2],
                        )
                    ).save(readme_path)

            # Records of the file currently being filled
            results = []

            with tqdm(total=len(inputs)) as pbar:
                with concurrent.futures.ThreadPoolExecutor(
                    max_workers=max_workers
                ) as executor:
                    futures = self._submit_all(
                        executor, prompt_template, schema, temp, sp, inputs
                    )

                    current_file = data_dir / f"train-{uuid4()}.jsonl"
                    for completed_future in concurrent.futures.as_completed(futures):
                        result = completed_future.result()
                        if result is not None:
                            results.append(result)
                            # Rewrite the current file with everything collected for it so far
                            with scheduler.lock:
                                self._save_results(results, current_file)
                        pbar.update(1)
                        # File is full: start a new one and reset the buffer
                        if len(results) >= max_items_per_file:
                            current_file = data_dir / f"train-{uuid4()}.jsonl"
                            results.clear()
        finally:
            try:
                if scheduler is not None:
                    scheduler.trigger().result()  # force upload last result
                    # Stop the periodic background push; it must come after the final
                    # upload and before the local folder is removed.
                    scheduler.stop()
            finally:
                if delete_files_after:
                    shutil.rmtree(dataset_dir)

        return scheduler.repo_id, [f.result() for f in futures if f.done()]

In [None]:
#| hide
def to_md(ss):
    "Format every item of `ss` as one line of a Markdown bullet list"
    return '\n'.join(map('- {}'.format, ss))
def show(ss):
    "Wrap the Markdown bullet list built from `ss` in a `Markdown` display object"
    bullet_list = to_md(ss)
    return Markdown(bullet_list)

In [None]:
class Translation:
    "Translation from an English phrase to a Spanish phrase"

    def __init__(self, english: str, spanish: str):
        store_attr()

    def __repr__(self):
        return f"{self.english} ➡ *{self.spanish}*"

Translation("Hello, how are you today?", "Hola, ¿cómo estás hoy?")

Hello, how are you today? ➡ *Hola, ¿cómo estás hoy?*

In [None]:
# Hand-written English/Spanish pairs; they are rendered into the prompt's
# <examples> section and displayed here as a Markdown bullet list.
examples = [
    Translation(
        english="Hello, my name is Nathan. I am a research scientist at an AI startup.",
        spanish="Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA."),
    Translation(
        english="How much wood could a woodchuck chuck if a woodchuck could chuck wood?",
        spanish="¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?"),
    Translation(
        english="Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See.",
        spanish="Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede."
    ),
]
show(examples)

- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*
- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*
- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.*

In [None]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the "persona" config of the PersonaHub dataset (train split), keep only the
# first three rows, and take their `persona` column (short text descriptions).
personas = load_dataset("proj-persona/PersonaHub", "persona", split='train').select(range(3))['persona']
show(personas)

Generating train split: 100%|█| 200000/200000 [00:00<00:00, 3852934.04 examples/


- A Political Analyst specialized in El Salvador's political landscape.
- A legal advisor who understands the legal implications of incomplete or inaccurate project documentation
- A maternal health advocate focused on raising awareness about postpartum complications.

In [None]:
# System prompt for the translation-generation task
sp = "You will help generate synthetic data of English and Spanish phrases."
# User prompt; `{examples}` and `{persona}` are filled in per input with `str.format`
prompt_template = """\
<examples>
{examples}
</examples>

Create an English and Spanish translation pair that is similar to the examples and would be appropriate for the following persona:
<persona>{persona}</persona>
"""

Let's see what the prompt looks like in action:

In [None]:
# Preview the prompt for the first persona, with the examples rendered as Markdown bullets
examples_md = to_md(examples)
prompt = prompt_template.format(examples=examples_md, persona=personas[0])
print(prompt)

<examples>
- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*
- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*
- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de l

In [None]:
# Generate translations: one request per persona, each input dict supplying the
# `persona` and `examples` placeholders of `prompt_template`.
# NOTE(review): `examples` is the raw list, so `str.format` renders it as a Python list
# repr rather than the `examples_md` bullet list previewed earlier — confirm this is intended.
fast_data = FastData(model="claude-3-haiku-20240307")
translations = fast_data.generate(
    prompt_template=prompt_template,
    inputs=[{"persona": persona, "examples": examples} for persona in personas],
    schema=Translation,
    sp=sp
)

100%|█████████████████████████████████████████████| 3/3 [00:01<00:00,  1.64it/s]


In [None]:
# Display the generated translations as a Markdown bullet list
show(translations)

- The political situation in El Salvador continues to be complex, with ongoing tensions between the ruling party and opposition groups. President Nayib Bukele has consolidated significant power, raising concerns about the state of democracy in the country. ➡ *La situación política en El Salvador sigue siendo compleja, con tensiones persistentes entre el partido gobernante y los grupos de oposición. El presidente Nayib Bukele ha consolidado un poder significativo, lo que genera preocupaciones sobre el estado de la democracia en el país.*
- Thorough documentation is critical for any legal proceedings. Incomplete or inaccurate records can have serious consequences. ➡ *La documentación exhaustiva es fundamental para cualquier proceso legal. Los registros incompletos o inexactos pueden tener consecuencias graves.*
- Postpartum complications can be life-threatening, but with proper care and support, new mothers can recover and thrive. Let's work together to ensure all women have access to the resources they need during this crucial time. ➡ *Las complicaciones posparto pueden poner en riesgo la vida, pero con la atención y el apoyo adecuados, las nuevas madres pueden recuperarse y prosperar. Trabajemos juntos para garantizar que todas las mujeres tengan acceso a los recursos que necesitan durante este momento crucial.*

In [None]:
# Generate translations and push results to Hugging Face Hub as a dataset
# Be sure to have the HF_TOKEN environment variable set to your Hugging Face API token
fast_data = FastData(model="claude-3-haiku-20240307")
repo_id, translations = fast_data.generate_to_hf(
    prompt_template=prompt_template,
    inputs=[{"persona": persona, "examples": examples} for persona in personas],
    schema=Translation,
    sp=sp,
    repo_id=f"personas-translation-{uuid4()}",
    max_items_per_file=2, # Start a new local file after every 2 translations
)
# One entry comes back per input (an entry is None when its request failed)
assert len(translations) == len(personas)

# Reload the dataset from the Hub and check that every successful record was uploaded.
# The original second assert repeated the check above and never looked at `new_dataset`.
new_dataset = load_dataset(repo_id)
assert len(new_dataset["train"]) == sum(t is not None for t in translations)

In [None]:
class TranslationCritique:
    "A critique of the translation."

    def __init__(self, critique: str, score: int):
        store_attr()

    def __repr__(self):
        return f"\t- **Critique:** {self.critique}\n\t- **Score:** {self.score}"

In [None]:
# System prompt for the critique task (rebinds the `sp` used for generation above)
sp = "You will help critique synthetic data of English and Spanish phrases."
# Grading prompt: an additive 5-point rubric followed by the translation to judge;
# `{translation}` is filled in per input with `str.format`.
critique_template = """\
Below is an extract of a translation. Evaluate its quality as a senior translator would, considering its suitability for professional use. Use the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:

- Add 1 point if the translation conveys the basic meaning of the source text, even if it includes some minor errors or awkward phrasing.
- Add another point if the translation is generally accurate but lacks refinement in style or fails to capture some nuances of the original. It might use inconsistent terminology or have occasional lapses in register.
- Award a third point if the translation is appropriate for professional use and accurately conveys key concepts of the source text. It demonstrates good understanding of both languages, though it may not be flawless or could include some slight inconsistencies. It resembles the work of a competent translator but may have room for improvement in fluency or precision.
- Grant a fourth point if the translation is highly accurate and reads naturally in the target language, exhibiting a consistent and appropriate style. It could be similar to the work of an experienced translator, offering faithful rendering of content and tone, with minimal errors, and effectively handling complex concepts or cultural references. The result is coherent, well-expressed, and valuable for its intended purpose.
- Bestow a fifth point if the translation is outstanding, demonstrating mastery of both source and target languages. It captures subtle nuances, maintains the author's voice and intent, and reads as if it were originally written in the target language. The translator has made excellent choices in dealing with challenging elements like wordplay, idiomatic expressions, or culture-specific content.

<translation>{translation}</translation>

After examining the translation:

- Briefly justify your total score, up to 100 words.
- Conclude with the score of the translation.
"""

In [None]:
# Critique every generated translation with a different model.
# NOTE(review): `t.english` raises AttributeError if `translations` contains a None
# (a failed request) — confirm all generations succeeded before running this cell.
fast_data = FastData(model="claude-3-5-sonnet-20240620")
critiques = fast_data.generate(
    prompt_template=critique_template,
    inputs=[{"translation": f"{t.english} -> {t.spanish}"} for t in translations],
    schema=TranslationCritique,
    sp=sp
)

100%|█████████████████████████████████████████████| 3/3 [00:03<00:00,  1.27s/it]


In [None]:
# Show each translation followed by its critique and score
show(f'{t}\n\n{c}' for t, c in zip(translations, critiques))

- The political situation in El Salvador continues to be complex, with ongoing tensions between the ruling party and opposition groups. President Nayib Bukele has consolidated significant power, raising concerns about the state of democracy in the country. ➡ *La situación política en El Salvador sigue siendo compleja, con tensiones persistentes entre el partido gobernante y los grupos de oposición. El presidente Nayib Bukele ha consolidado un poder significativo, lo que genera preocupaciones sobre el estado de la democracia en el país.*

	- **Critique:** The translation accurately conveys the meaning of the source text, capturing the complexity of El Salvador's political situation. It effectively translates key terms and maintains the tone of the original. The translator demonstrates a strong grasp of both languages, rendering the content naturally in Spanish. The phrasing is appropriate and flows well, preserving the nuances of the English version. There are no noticeable errors or awkward constructions. The translation skillfully handles the political terminology and concepts, making it suitable for professional use. It reads as if it were originally written in Spanish, indicating the translator's expertise.
	- **Score:** 5
- Thorough documentation is critical for any legal proceedings. Incomplete or inaccurate records can have serious consequences. ➡ *La documentación exhaustiva es fundamental para cualquier proceso legal. Los registros incompletos o inexactos pueden tener consecuencias graves.*

	- **Critique:** The translation accurately conveys the main message of the source text, maintaining both the meaning and tone. It effectively captures the importance of thorough documentation in legal proceedings and the potential consequences of incomplete or inaccurate records. The translator has chosen appropriate Spanish equivalents for key terms, such as "exhaustiva" for "thorough" and "proceso legal" for "legal proceedings". The sentence structure is natural in Spanish, and the translation maintains the formal register suitable for legal contexts. There are no errors in grammar or vocabulary. The translation demonstrates a high level of competence, accurately conveying complex concepts while reading naturally in the target language. It could be considered the work of an experienced translator.
	- **Score:** 4
- Postpartum complications can be life-threatening, but with proper care and support, new mothers can recover and thrive. Let's work together to ensure all women have access to the resources they need during this crucial time. ➡ *Las complicaciones posparto pueden poner en riesgo la vida, pero con la atención y el apoyo adecuados, las nuevas madres pueden recuperarse y prosperar. Trabajemos juntos para garantizar que todas las mujeres tengan acceso a los recursos que necesitan durante este momento crucial.*

	- **Critique:** The translation accurately conveys the meaning of the original text, maintaining both the informative and encouraging tone. It correctly translates key terms like "postpartum complications" and "life-threatening." The Spanish version flows naturally and captures the nuances of the original, including the call to action. The translator has made excellent choices in vocabulary and structure, resulting in a text that reads as if it were originally written in Spanish. The translation demonstrates a high level of proficiency in both languages and would be suitable for professional use in healthcare communications.
	- **Score:** 5

### Test that generate outputs align with inputs

Let's verify that `FastData.generate` returns results in the same order as the inputs it was passed.

To show this, we will define a new prompt template, where the model is asked only to echo a piece of data
from the input. Then we will verify that the values in the inputs match the values in the outputs, in order and in value.

In [None]:
# System prompt and user prompt for the echo test; `{datum}` is filled in per input.
# NOTE(review): "echo is back" looks like a typo for "echo it back"; it is left unchanged
# here because it is part of the prompt text sent to the model.
sp="You will help with accurate and faithful data processing."
prompt_template = """\
Below you find an item of data, a datum, which is an alphanumeric string:

<datum>{datum}</datum>

After reviewing this datum, please echo is back exactly, without any preamble:
"""

In [None]:
class Datum:
    "A data value"

    def __init__(self, datum: str):
        store_attr()

    def __repr__(self):
        return f"{self.datum}"

First we'll test that the prompt and schema class work as expected.

In [None]:
# Render the echo prompt once with a random UUID to check the template
print(prompt_template.format(**dict(datum=str(uuid4()))))

Below you find an item of data, a datum, which is an alphanumeric string:

<datum>b9121446-e46c-47c0-9e6d-b4df35c0974b</datum>

After reviewing this datum, please echo is back exactly, without any preamble:



In [None]:
# Build a `Datum` from a random UUID to check the schema class and its repr
Datum(str(uuid4()))

04da7de4-cc39-4699-9d25-5a476e366732

Now we ask the model to "generate" (i.e., echo) 100 of these values.

In [None]:
# 100 inputs, each carrying a fresh random UUID string for the model to echo back
in_vals = [{"datum":str(uuid4())} for _ in range(100)]
# Uses the `fast_data` instance created in an earlier cell
out_vals = fast_data.generate(
    prompt_template=prompt_template,
    inputs=in_vals,
    schema=Datum,
    sp=sp
)

100%|█████████████████████████████████████████| 100/100 [00:04<00:00, 24.17it/s]


Now we will see if the inputs and outputs are aligned.

If they are aligned, then that shows two things. First it shows that the model is echoing the values faithfully. Second, it shows _either_ that the model itself returned outputs in the order in which they were submitted, or else that `generate` has returned outputs in submission order.

We are submitting a large enough quantity of items that we _assume_ the model will return some results out of submission order. If you want confidence which does not depend on this assumption, then you could modify the test above to increase the number and complexity of the generation tasks, or simply inspect the implementation.

Let's start by spot checking the first item:

In [None]:
# Flatten inputs and outputs to plain strings so they can be compared position by position.
# NOTE(review): `x.datum` raises AttributeError if any output is None (a failed request).
in_list = [x['datum'] for x in in_vals]
out_list = [x.datum for x in out_vals]
# Spot check: the first input next to the first output
(in_list[0],out_list[0])

('f42ea0db-24ce-4e09-a50d-edf74d0eb611',
 'f42ea0db-24ce-4e09-a50d-edf74d0eb611')

In [None]:
# Locate the first position where input and output disagree (None when there is none)
first_mismatch = next(
    (
        (pos, sent, received)
        for pos, (sent, received) in enumerate(zip(in_list, out_list))
        if sent != received
    ),
    None,
)
if first_mismatch is None:
    print("Success: output items are aligned with input items")
else:
    idx, in_item, out_item = first_mismatch
    print("Failure: output items were not aligned with input items!")
    print(f"\titem {idx} had in={in_item} and out={out_item}")

Success: output items are aligned with input items


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()