<a href="https://colab.research.google.com/github/AhmedMostafa3m/NLP/blob/main/dataset_generator/MyOwn_data_set_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import requests
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gradio as gr

# Read the Hugging Face access token from the Colab secrets store (stored under
# the name 'HF_TOKEN') so it never appears in the notebook itself, then
# authenticate this session with it.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [3]:
def load_model(model_name):
    """Load a 4-bit quantized causal language model together with its tokenizer.

    Args:
        model_name: Identifier passed unchanged to both
            ``AutoModelForCausalLM.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to its
        end-of-sequence token.
    """
    # NF4 4-bit weights with double quantization; bfloat16 is the compute dtype.
    nf4_settings = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_quant_type": "nf4",
    }

    llm = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(**nf4_settings),
    )

    chat_tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token so a pad token is always defined.
    chat_tokenizer.pad_token = chat_tokenizer.eos_token

    return llm, chat_tokenizer


In [7]:
# Load the instruction-tuned Llama 3.2 1B checkpoint once; `model` and
# `tokenizer` are module-level globals read by generate_dataset().
model, tokenizer = load_model(model_name="meta-llama/Llama-3.2-1B-Instruct")

In [8]:
def generate_dataset(topic, number_of_data, inq1, resp1, inq2, resp2):
    """Ask the loaded chat model to generate a JSON dataset for ``topic``.

    Relies on the module-level ``model`` and ``tokenizer`` globals.

    Args:
        topic: Subject the generated inquiry/response pairs should cover.
        number_of_data: Number of examples the model is asked to return.
        inq1, resp1: First few-shot example (inquiry text and its response).
        inq2, resp2: Second few-shot example (inquiry text and its response).

    Returns:
        The newly generated text only, decoded with special tokens removed.
        The prompt (system and user messages) is not included.
    """
    # Convert user inputs into multi-shot examples
    multi_shot_examples = [
        {"inquiry": inq1, "response": resp1},
        {"inquiry": inq2, "response": resp2}
    ]

    # System prompt
    system_prompt = f"""
    You are a helpful assistant whose main purpose is to generate datasets.
    Topic: {topic}
    Return the dataset in JSON format. Use examples with simple, and easy-to-understand inquiry for foreigners.
    Include the following examples: {multi_shot_examples}
    Return {number_of_data} examples each time.
    Do not repeat the provided examples.
    """

    # Example Messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Please generate my dataset for {topic}"}
    ]

    # Tokenize Input
    # add_generation_prompt=True appends the assistant header so the model
    # answers as the assistant instead of continuing the user's message.
    # return_dict=True also yields the attention mask, which generate() needs
    # because the pad token is the same as the EOS token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)  # follow the model's device rather than assuming "cuda"
    prompt_length = inputs["input_ids"].shape[-1]
    streamer = TextStreamer(tokenizer)

    # Generate Output
    outputs = model.generate(**inputs, max_new_tokens=2000, streamer=streamer)

    # Decode and Return
    # generate() returns prompt + completion; drop the prompt tokens so the
    # caller receives only the generated dataset.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


In [9]:
def gradio_interface(topic, number_of_data, inq1, resp1, inq2, resp2):
    """Gradio callback: forward the form values unchanged to generate_dataset.

    Args:
        topic: Dataset topic entered in the form.
        number_of_data: Requested number of examples.
        inq1, resp1: First few-shot inquiry and response.
        inq2, resp2: Second few-shot inquiry and response.

    Returns:
        Whatever generate_dataset returns for these arguments.
    """
    return generate_dataset(
        topic=topic,
        number_of_data=number_of_data,
        inq1=inq1,
        resp1=resp1,
        inq2=inq2,
        resp2=resp2,
    )

In [10]:
# Default values used to pre-fill the Gradio form.
default_topic = "Talking to foreigners explain to them Ancient Egyptian monuments."
default_number_of_data = 5

# Two few-shot examples, each stored as an {"inquiry", "response"} dict.
default_multi_shot_examples = [
    {"inquiry": inquiry, "response": response}
    for inquiry, response in (
        (
            "Give safety and cultural tips for tourists visiting ancient Egyptian sites.",
            "Wear comfortable clothing and shoes suitable for walking on uneven ground. Always carry water, sunscreen, and a hat to protect yourself from the sun. Respect local customs by dressing modestly, avoid touching or climbing on monuments, and follow guidance from local authorities or tour guides.",
        ),
        (
            "Explain why visiting the Valley of the Kings is a must for history lovers.",
            "The Valley of the Kings offers visitors a chance to step directly into the tombs of ancient pharaohs, including Tutankhamun. The vivid wall paintings and hieroglyphs reveal stories of the afterlife and the beliefs of ancient Egyptians. It’s an extraordinary place for anyone fascinated by archaeology and ancient history.",
        ),
    )
]

In [12]:
# Gradio form: topic, example count, and two editable few-shot examples,
# pre-filled from the defaults; the generated dataset is shown in a text box.
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Topic", value=default_topic, lines=2),
        gr.Number(label="Number of Examples", value=default_number_of_data, precision=0),
        # One inquiry/response textbox pair per default example, in the same
        # order as gradio_interface's inq1, resp1, inq2, resp2 parameters.
        *(
            gr.Textbox(label=f"{caption} {slot}", value=default_multi_shot_examples[slot - 1][field])
            for slot in (1, 2)
            for field, caption in (("inquiry", "inquiry"), ("response", "Response"))
        ),
    ],
    outputs=gr.Textbox(label="Generated Dataset", lines=10, show_copy_button=True),
)

In [13]:
# Start the Gradio app. share=True requests a public share link; inbrowser=True
# asks Gradio to open the interface in a browser tab.
gr_interface.launch(share=True, inbrowser=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2df80bdb183246fd71.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


