In [1]:
# Hugging Face model id used as the default `model_name` in build_model.
LLAMA = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
# Upper bound on the number of samples a single request may ask for
# (checked in generate and used as the maximum of the samples input).
MAX_DATASET_SIZE = 100
# Google Drive folder where build_model looks for / caches model files.
MODEL_STORE_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/Resources/Models'

In [2]:
# Install pinned cu124 builds of torch/torchvision/torchaudio from the PyTorch wheel index,
# then pinned versions of bitsandbytes, transformers and accelerate.
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.2/908.2 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# NOTE(review): gradio is installed without a version pin, unlike the other
# packages in this notebook — consider pinning it for reproducible runs.
!pip install -q gradio

In [4]:
import json
import torch
import os
import gradio as gr
import pandas as pd
from huggingface_hub import login
from google.colab import (userdata, drive)
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TextIteratorStreamer,
    BitsAndBytesConfig,
    AutoModelForCausalLM,
    AutoConfig
)

In [5]:
# Log in to the Hugging Face Hub with the 'HF_TOKEN' value read from Colab userdata,
# so the token is never written into the notebook itself.
login(userdata.get('HF_TOKEN'), add_to_git_credential=True)

In [6]:
# Mount Google Drive at /content/drive; MODEL_STORE_DIRECTORY lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
def build_model(model_name=LLAMA):
  """Load a 4-bit quantized causal language model and its tokenizer.

  The tokenizer is always loaded by `model_name`, and its pad token is set to
  its EOS token. The model weights come from a directory named after the
  model under MODEL_STORE_DIRECTORY when that directory exists; otherwise they
  are loaded by `model_name` with MODEL_STORE_DIRECTORY as the cache directory.

  Args:
    model_name: Model identifier passed to the tokenizer and, when no local
      directory exists, to the model loader.

  Returns:
    A `(model, tokenizer)` tuple.
  """
  local_copy = os.path.join(MODEL_STORE_DIRECTORY, model_name)

  # 4-bit NF4 quantization with double quantization and bfloat16 compute.
  nf4_settings = dict(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
  )
  quantization = BitsAndBytesConfig(**nf4_settings)

  tok = AutoTokenizer.from_pretrained(model_name)
  tok.pad_token = tok.eos_token

  # Decide where the weights come from, then issue a single load call.
  # NOTE(review): files fetched through `cache_dir` may not end up at
  # `local_copy`, in which case the local branch is never taken — confirm.
  if os.path.exists(local_copy):
    source, source_kwargs = local_copy, {"local_files_only": True}
  else:
    source, source_kwargs = model_name, {"cache_dir": MODEL_STORE_DIRECTORY}

  llm = AutoModelForCausalLM.from_pretrained(
    source,
    device_map="auto",
    quantization_config=quantization,
    **source_kwargs,
  )
  return llm, tok

In [27]:
def generate_user_prompt(type_of_dataset, no_of_samples, attrs):
  """Build the user message sent to the model.

  Args:
    type_of_dataset: Description of the data to generate.
    no_of_samples: Number of samples to request.
    attrs: Optional attribute list; ignored when None or blank.

  Returns:
    The formatted `user_prompt_template`, followed by an attribute
    restriction line when `attrs` contains non-whitespace text.
  """
  prompt = user_prompt_template.format(type_of_dataset=type_of_dataset, n=no_of_samples)

  # Nothing to append when no attributes were supplied.
  if attrs is None or attrs.strip() == "":
    return prompt

  return prompt + f"\n Use only these attributes ({attrs})"

# zero-shot: the model gets instructions only, no worked examples.
# The system prompt fixes the reply contract (a JSON array of objects, no commentary).
# Written as adjacent literals so each prompt line is visible on its own source line;
# the resulting string is unchanged.
sys_prompt = (
  "You're a synthetic data generator you provide a certain dataset of the type given by the user, number of samples to be given by the user \n"
  "  reply with only the dataset without talking and without explanation\n"
  "  reply in JSON format (array of objects)"
)

# Filled in by generate_user_prompt via str.format with `n` and `type_of_dataset`.
user_prompt_template = "Generate {n} number of sample of {type_of_dataset}"

In [29]:
def generate(type_of_dataset, no_of_samples, attrs, model, tokenizer, max_new_tokens=2000) -> str:
  """Ask the model for a synthetic dataset and return its text reply.

  Args:
    type_of_dataset: Free-text description of the data to generate.
    no_of_samples: Number of samples to request, from 1 to MAX_DATASET_SIZE.
    attrs: Optional attribute list forwarded to generate_user_prompt.
    model: Model object providing `.device` and `.generate(...)`.
    tokenizer: Tokenizer providing `apply_chat_template` and `decode`.
    max_new_tokens: Cap on the number of generated tokens.

  Returns:
    The decoded reply (prompt tokens excluded, special tokens skipped),
    stripped of surrounding whitespace.

  Raises:
    ValueError: If the dataset type is missing or blank, or if the sample
      count is missing, below 1, or above MAX_DATASET_SIZE.
  """
  if type_of_dataset is None or not type_of_dataset.strip():
    raise ValueError("Must enter dataset type")
  if no_of_samples is None or no_of_samples < 1:
    raise ValueError("Number of samples must be at least 1.")
  if no_of_samples > MAX_DATASET_SIZE:
    raise ValueError("Input value exceeds maximum limit of {max}.".format(max=MAX_DATASET_SIZE))

  messages = [
      {"role": "system", "content": sys_prompt},
      {"role": "user", "content": generate_user_prompt(type_of_dataset, no_of_samples, attrs)}
  ]
  # add_generation_prompt=True ends the prompt with the assistant header, so
  # the model starts directly with the reply. Without it the model writes the
  # header itself and the decoded text begins with the word "assistant".
  inputs = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
  ).to(model.device)
  outputs = model.generate(inputs, max_new_tokens=max_new_tokens)

  # Keep only the tokens produced after the prompt.
  new_tokens = outputs[0, inputs.shape[1]:]
  return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [30]:
def is_valid_response(response) -> bool:
  """Report whether `response` can be parsed as JSON.

  Args:
    response: The text to check; non-string values are treated as invalid.

  Returns:
    True when `json.loads(response)` succeeds, False otherwise.
  """
  try:
    json.loads(response)
  except (TypeError, ValueError, RecursionError):
    # TypeError: not str/bytes; ValueError: malformed JSON (JSONDecodeError);
    # RecursionError: excessively nested input. Anything else (for example
    # KeyboardInterrupt) is deliberately allowed to propagate.
    return False
  return True

In [25]:
# Load the model and tokenizer once; on_click reads these module-level names.
# NOTE(review): the execution counts suggest this cell last ran before the current
# build_model definition — confirm with Restart & Run All.
model, tokenizer = build_model()

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [31]:
def on_click(type_of_dataset, no_of_samples, attrs):
  """Handle a click on the Generate button.

  Calls `generate` with the module-level `model` and `tokenizer`, turns the
  JSON reply into a DataFrame and writes it to "output.csv" in the current
  working directory. Failures never propagate: they are logged with `print`
  and replaced by a single-row table with an "Error" column.

  Args:
    type_of_dataset: Description of the data to generate.
    no_of_samples: Number of samples requested.
    attrs: Optional attribute list.

  Returns:
    A `(DataFrame, csv_path)` tuple.
  """
  generation_failed = '[{"Error": "Sorry, An error occurred while generating your data"}]'
  unusable_reply = '[{"Error": "Sorry, I cannot provide the requested data"}]'

  try:
    response = generate(type_of_dataset, no_of_samples, attrs, model, tokenizer)
  except Exception as exc:
    # `response` is not bound when generate raises, so log the exception itself.
    print("generation failed:", repr(exc))
    response = generation_failed

  if not is_valid_response(response):
    print("invalid model response", response)
    response = unusable_reply

  try:
    df = pd.DataFrame(json.loads(response))
  except (TypeError, ValueError):
    # Valid JSON that is not tabular (for example a bare number or a dict of
    # scalars) cannot be turned into a DataFrame.
    print("invalid model response", response)
    df = pd.DataFrame(json.loads(unusable_reply))

  file_path = "output.csv"
  df.to_csv(file_path, index=False)

  return df, file_path

In [33]:
# Assemble the Gradio app: an input column, an output column, and a button wired to on_click.
with gr.Blocks() as UI:
  gr.Markdown("## Synthetic dataset generator")
  # Inputs; their values are passed to on_click in the order listed in `inputs=` below.
  with gr.Column():
    input_type = gr.Textbox(label="What kind of data?", placeholder="Country Population")
    # Range is bounded by minimum=1 and maximum=MAX_DATASET_SIZE.
    # NOTE(review): the placeholder hard-codes "100" and will drift if MAX_DATASET_SIZE changes.
    input_count = gr.Number(label="Number of samples", placeholder="Max is 100", precision=0, minimum=1, maximum=MAX_DATASET_SIZE)
    input_attr = gr.Textbox(label="Dataset Attributes (optional)", placeholder="Use comma separated column names")
    submit_btn = gr.Button("Generate")

  # Outputs; both components are read-only (interactive=False).
  with gr.Column(show_progress=True):
    download_btn = gr.File(label="Export CSV", interactive=False, height="2em")
    output_grid = gr.Dataframe(label="Result", interactive=False)

  # on_click returns (DataFrame, csv_path), matching the order of `outputs`.
  submit_btn.click(on_click, inputs=[input_type, input_count, input_attr], outputs=[output_grid, download_btn])

# Start the app.
UI.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3649d15d015cdf3282.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


