# DATASET GENERATOR

## SETUP

In [1]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio

In [2]:
!pip install -U bitsandbytes



In [3]:
import os
import requests
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextStreamer,
    BitsAndBytesConfig,
)
import torch
import gradio as gr
from huggingface_hub import login

In [4]:
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

hf_token = os.getenv("HF_TOKEN")
login(hf_token, add_to_git_credential=False)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## CODE

In [10]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # quantization_config=quant_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.


In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [13]:
def generate_dataset(
    topic,
    number_of_data,
    inst1,
    resp1,
    inst2,
    resp2,
    inst3,
    resp3,
):
    multi_shot_examples = [
        {"instrction": inst1, "response": resp1},
        {"instrction": inst2, "response": resp2},
        {"instrction": inst3, "response": resp3},
    ]

    system_prompt = f"""
        You are a helpful assistant whose main purpose is to generate datasets.
        Topic: {topic}
        Return the dataset in JSON format. Use examples with simple, fun, and easy to understand instructions for kids.
        Include the following examples: {multi_shot_examples}
        Return {number_of_data} examples each time.
        Do not repeat the provided examples.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Please generate my dataset for {topic}"},
    ]

    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
    streamer = TextStreamer(tokenizer)

    outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [14]:
def gradio_interface(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3):
    return generate_dataset(
        topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3
    )

In [15]:
default_topic = "Talking to a (5-8) years old and teaching them manners."
default_number_of_data = 10
default_multi_shot_examples = [
    {
        "instruction": "Why do I have to say please when I want something?",
        "response": "Because it’s like magic! It shows you’re nice, and people want to help you more.",
    },
    {
        "instruction": "What should I say if someone gives me a toy?",
        "response": "You say, 'Thank you!' because it makes them happy you liked it.",
    },
    {
        "instruction": "why should I listen to my parents?",
        "response": "Because parents want the best for you and they love you the most.",
    },
]

In [20]:
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Topic", value=default_topic, lines=2),
        gr.Number(
            label="Number of Examples", value=default_number_of_data, precision=0
        ),
        gr.Textbox(
            label="Instruction 1", value=default_multi_shot_examples[0]["instruction"]
        ),
        gr.Textbox(
            label="Response 1", value=default_multi_shot_examples[0]["response"]
        ),
        gr.Textbox(
            label="Instruction 2", value=default_multi_shot_examples[1]["instruction"]
        ),
        gr.Textbox(
            label="Response 2", value=default_multi_shot_examples[1]["response"]
        ),
        gr.Textbox(
            label="Instruction 3", value=default_multi_shot_examples[2]["instruction"]
        ),
        gr.Textbox(
            label="Response 3", value=default_multi_shot_examples[2]["response"]
        ),
    ],
    outputs=gr.Textbox(label="Generated Dataset"),
)

In [None]:
gr_interface.launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant whose main purpose is to generate datasets.
        Topic: Talking to a 7 year old, teaching them to practise soccer at home.
        Return the dataset in JSON format. Use examples with simple, fun, and easy to understand instructions for kids.
        Include the following examples: [{'instrction': 'Why do I have to say please when I want something?','response': 'Because it’s like magic! It shows you’re nice, and people want to help you more.'}, {'instrction': 'What should I say if someone gives me a toy?','response': "You say, 'Thank you!' because it makes them happy you liked it."}, {'instrction': 'why should I listen to my parents?','response': 'Because parents want the best for you and they love you the most.'}]
        Return 10 examples each time.
        Do not repeat the provided examples.<|eot_id|><|start_header_id|>user<|end



home.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

**Practicing Soccer at Home for 7-year-olds**
