<a href="https://colab.research.google.com/github/Durgance/LLM-Syn-Data-Gen/blob/main/Data_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q --upgrade bitsandbytes accelerate

In [2]:

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)
  if gpu_info.find('Tesla T4') >= 0:
    print("Success - Connected to a T4")
  else:
    print("NOT CONNECTED TO A T4")

Sun Nov 30 19:39:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   61C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import torch
from transformers import AutoModelForCausalLM , AutoTokenizer, pipeline,  TextStreamer, BitsAndBytesConfig
from huggingface_hub import login
import os
from google.colab import userdata
import gc

# Read the Hugging Face access token from the Colab secrets store.
hf_token = userdata.get('HF_TOKEN')

# hf_token = os.getenv('HF_API_KEY')
# Only the first three characters are echoed so the secret never lands in saved output.
if hf_token:
    print(f"Hugging Face token exists and begins {hf_token[:3]}")
else:
    print("Hugging Face token not set (and this is optional)")

# torch.cuda.empty_cache()

OpenRouter API Key exists and begins hf_


In [4]:
# Authenticate this session with the Hugging Face Hub and store the token for git.
login(token=hf_token, add_to_git_credential=True)

In [5]:
# Shared 4-bit NF4 quantization settings used by generate() when quant=True.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [69]:
def generate(MODEL, messages, quant=True, max_new_tokens=10000):
    """Load a chat model, greedily generate one reply to `messages`, then free the GPU.

    Args:
        MODEL: Model id or path handed to `from_pretrained` (note: inside this
            function the name shadows the module-level `MODEL` list).
        messages: Chat history as a list of {"role": ..., "content": ...} dicts.
        quant: When True the model is loaded with the module-level `quant_config`;
            otherwise it is loaded without quantization and with an "offload" folder.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded reply only (the prompt is excluded), with surrounding
        whitespace stripped.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    if tokenizer.pad_token is None:
        # Some checkpoints ship without a pad token; fall back to EOS.
        tokenizer.pad_token = tokenizer.eos_token

    load_kwargs = dict(device_map="cuda", trust_remote_code=True)
    if quant:
        load_kwargs["quantization_config"] = quant_config
    else:
        load_kwargs["offload_folder"] = "offload"
    model = AutoModelForCausalLM.from_pretrained(MODEL, **load_kwargs)

    try:
        # add_generation_prompt=True appends the tokens that open the assistant turn.
        # return_dict=True yields input_ids together with a matching attention_mask.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        # Greedy decoding; no temperature is passed because it only applies when sampling.
        outputs = model.generate(
            **inputs,
            do_sample=False,
            max_new_tokens=max_new_tokens,
        )

        # Keep only the newly generated tokens. Searching the decoded text for the
        # word "assistant" breaks when the prompt or the reply contains that word,
        # and fails outright for chat templates that label the turn differently.
        reply_tokens = outputs[0][prompt_length:]
        return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()
    finally:
        # Always release the model, even when generation raises, so the next
        # call starts with as much free GPU memory as possible.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()



# tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

In [70]:
# Minimal single-turn conversation used to smoke-test generate().
messages = [{"role": "user", "content": "Tell me a joke please !"}]

In [71]:
# Candidate chat models, indexed by position elsewhere in the notebook (e.g. MODEL[2]).
MODEL = [
    "google/gemma-3-270m-it",
    "microsoft/Phi-3-mini-4k-instruct",
    "Qwen/Qwen3-4B-Instruct-2507",
    "meta-llama/Llama-3.1-8B-Instruct",
]

In [72]:
# generate(MODEL[0],messages,False)

In [73]:
# # Source - https://stackoverflow.com/a
# # Posted by Maunish Dave, modified by community. See post 'Timeline' for change history
# # Retrieved 2025-11-30, License - CC BY-SA 4.0

# with torch.no_grad():
#     torch.cuda.empty_cache()

# import gc
# gc.collect()
# torch.cuda.empty_cache()

# from numba import cuda
# device = cuda.get_current_device()
# device.reset()


# import torch, gc, sys

# gc.collect()
# torch.cuda.empty_cache()
# torch.cuda.ipc_collect()

# # Kill variables in global scope
# for name in dir():
#     if not name.startswith('_'):
#         del globals()[name]



In [96]:
def create_system_prompt(schema_type):
    """Return the system instruction asking for output in `schema_type` only.

    The text is wrapped in a leading and a trailing newline.
    """
    instruction = (
        f"You are an assistant that outputs only valid {schema_type} with headers. "
        "Do NOT add any commentary. Use ISO date format YYYY-MM-DD."
    )
    return "\n" + instruction + "\n"

In [97]:
def create_user_prompt(row, schema, schema_type):
    """Return the user request for `row` synthetic rows following `schema`.

    Args:
        row: Number of rows to ask for (interpolated twice into the text).
        schema: Column description inserted verbatim after "(columns): ".
        schema_type: Output format name, e.g. "CSV" or "JSON".

    Returns:
        The prompt text; it starts with two newlines and ends with one.
    """
    request_line = f"    Create exactly {row} rows of synthetic customer data using this schema (columns): {schema}"
    format_line = f"Return ONLY {schema_type} (header + {row} rows)."
    # Empty strings at the edges reproduce the leading blank lines and trailing newline.
    return "\n".join(["", "", request_line, format_line, ""])


In [98]:
# Column specification inserted verbatim into the user prompt.
# Each line reads "column_name : TYPE, example value".
SCHEMA = """
customer_id : TEXT  , 1234
date_registration : DATE, 2025-01-01
name : TEXT, Michal
address : TEXT, 123 Main St
phone_number : TEXT, +91 2345624901
email : TEXT, william.strong@my-own-personal-domain.com
gender : TEXT, M
Age : INTEGER, 32
city : TEXT, Mumbai
state : TEXT, Maharashtra
Review : TEXT, 'The product was good and the delivery was fast.'
Rating : INTEGER , 4
"""

In [99]:
def get_messages_prompt(rows, schema_type):
    """Build the two-message chat (system + user) for one generation request.

    Args:
        rows: Number of rows to request from the model.
        schema_type: Output format name, e.g. "CSV" or "JSON".

    Returns:
        A list with a system message followed by a user message, each a
        {"role": ..., "content": ...} dict.
    """
    # Each message needs its own dict: putting both "role"/"content" pairs in a
    # single literal lets the later keys overwrite the earlier ones, which
    # silently drops the system message.
    # NOTE: the system prompt text contains the word "assistant", so consumers
    # must not locate the model's reply by searching decoded text for role names.
    return [
        {"role": "system", "content": create_system_prompt(schema_type)},
        {"role": "user", "content": create_user_prompt(rows, SCHEMA, schema_type)},
    ]



In [86]:
# get_messages_prompt requires the row count and the output format; this reply
# is parsed as JSON further down, so request a small JSON sample.
response_1 = generate(MODEL[2], get_messages_prompt(10, "JSON"), True)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [87]:
# Show the raw text returned by generate() for the JSON request.
print(response_1)
# del model , tokenizer


[
  {
    "customer_id": "1234",
    "date_registration": "2025-01-01",
    "name": "Michal",
    "address": "123 Main St",
    "phone_number": "+91 2345624901",
    "email": "william.strong@my-own-personal-domain.com",
    "gender": "M",
    "Age": 32,
    "city": "Mumbai",
    "state": "Maharashtra",
    "Review": "The product was good and the delivery was fast.",
    "Rating": 4
  },
  {
    "customer_id": "1234",
    "date_registration": "2025-01-02",
    "name": "Elena",
    "address": "456 Oak Ave",
    "phone_number": "+91 2345624902",
    "email": "elena.garcia@my-own-personal-domain.com",
    "gender": "F",
    "Age": 28,
    "city": "Delhi",
    "state": "Delhi",
    "Review": "Excellent quality, arrived on time.",
    "Rating": 5
  },
  {
    "customer_id": "1234",
    "date_registration": "2025-01-03",
    "name": "Raj",
    "address": "789 Pine Rd",
    "phone_number": "+91 2345624903",
    "email": "raj.singh@my-own-personal-domain.com",
    "gender": "M",
    "Age": 35,

In [90]:
# Imported here because this cell runs before the cell that imports pandas and
# StringIO when the notebook is executed top to bottom.
import pandas as pd
from io import StringIO

# Read JSON string into a dataframe
df_1 = pd.read_json(StringIO(response_1))

# Save back to a JSON file
df_1.to_json("json_output.json", orient="records", indent=4)


In [79]:
# get_messages_prompt requires the row count and the output format; this reply
# is parsed as CSV further down, so request a small CSV sample.
response = generate(MODEL[2], get_messages_prompt(10, "CSV"), True)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [80]:
# Show the raw text returned by generate() for the CSV request.
print(response)


customer_id,date_registration,name,address,phone_number,email,gender,Age,city,state,Review,Rating
1234,2025-01-01,Michal,123 Main St,+91 2345624901,william.strong@my-own-personal-domain.com,M,32,Mumbai,Maharashtra,"The product was good and the delivery was fast.",4
1235,2025-01-02,Emma,456 Oak Ave,+91 2345624902,emma.jones@my-own-personal-domain.com,F,28,Bangalore,Karnataka,"The quality exceeded expectations.",5
1236,2025-01-03,Liam,789 Pine Rd,+91 2345624903,liam.brown@my-own-personal-domain.com,M,35,Delhi,Uttar Pradesh,"Fast shipping and excellent packaging.",4
1237,2025-01-04,Sophia,321 Elm Blvd,+91 2345624904,sophia.garcia@my-own-personal-domain.com,F,29,Chennai,Tamil Nadu,"Great value for money.",4
1238,2025-01-05,Noah,654 Maple Dr,+91 2345624905,noah.wilson@my-own-personal-domain.com,M,38,Kolkata,West Bengal,"Product arrived on time.",5
1239,2025-01-06,Ava,987 Cedar Ln,+91 2345624906,ava.miller@my-own-personal-domain.com,F,26,Pune,Maharashtra,"Excellent customer service.",5
1240

In [89]:
import pandas as pd
from io import StringIO
# Parse the model's CSV reply from memory, then persist it without the index column.
df = pd.read_csv(filepath_or_buffer=StringIO(response))
df.to_csv(path_or_buf="output.csv", index=False)

In [None]:
# # # Restart the kernel

# import IPython
# IPython.Application.instance().kernel.do_shutdown(True)

In [58]:
# NOTE(review): the saved output of this cell comes from an earlier execution
# (its execution count is lower than the cell that assigns `response`) — re-run or remove.
print(response)

user


    Create exactly 100 rows of synthetic customer data using this schema (columns): 
customer_id : TEXT  , 1234
date_registration : DATE, 2025-01-01
name : TEXT, Michal
address : TEXT, 123 Main St
phone_number : TEXT, +91 2345624901
email : TEXT, william.strong@my-own-personal-domain.com
gender : TEXT, M
Age : INTEGER, 32
city : TEXT, Mumbai
state : TEXT, Maharashtra
Review : TEXT, 'The product was good and the delivery was fast.'
Rating : INTEGER , 4

Return ONLY CSV (header + 10 rows).

    Example of Sample Data  in CSV Format : 
    customer_id,date_registration,name,address,phone_number,email,gender,Age,city,state,Review,Rating
1001,2025-01-01,Michal,123 Main St,+91 2345624901,william.strong@my-own-personal-domain.com
,M,32,Mumbai,Maharashtra,"The product was good and the delivery was fast.",4
1002,2025-02-14,Aisha Khan,45 Lotus Ave,+91 9123456780,aisha.khan.27@my-own-personal-domain.com
,F,28,Delhi,Delhi,"Quality was okay but packaging could improve.",3
1003,2025-03-03,Rah

In [54]:
# NOTE(review): the saved output of this cell comes from an earlier execution
# (its execution count is lower than the cell that assigns `response`) — re-run or remove.
print(response)

user


    Create exactly 10 rows of synthetic customer data using this schema (columns): 
customer_id : TEXT  , 1234
date_registration : DATE, 2025-01-01
name : TEXT, Michal
address : TEXT, 123 Main St
phone_number : TEXT, +91 2345624901
email : TEXT, william.strong@my-own-personal-domain.com
gender : TEXT, M
Age : INTEGER, 32
city : TEXT, Mumbai
state : TEXT, Maharashtra
Review : TEXT, 'The product was good and the delivery was fast.'
Rating : INTEGER , 4

Return ONLY CSV (header + 10 rows).

    
assistant
customer_id,date_registration,name,address,phone_number,email,gender,Age,city,state,Review,Rating
1234,2025-01-01,Michal,123 Main St,+91 2345624901,william.strong@my-own-personal-domain.com,M,32,Mumbai,Maharashtra,"The product was good and the delivery was fast.",4
1235,2025-01-02,Emma,456 Oak Ave,+91 2345624902,emma.jones@my-own-personal-domain.com,F,28,Bangalore,Karnataka,"The product arrived on time and was well-packaged.",5
1236,2025-01-03,Liam,789 Pine Rd,+91 2345624903,liam.

# Creating a UI for the Synthetic Data Generator

In [None]:
import gradio as gr

In [102]:
import gradio as gr
import pandas as pd
from io import StringIO
import json

def _strip_code_fences(text):
    """Return `text` without a surrounding Markdown ``` fence, if one is present."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line, which may carry a language tag such as ```json.
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def generate_data_ui(schema_type, num_rows):
    """Generate synthetic data with the model and parse it for display.

    Args:
        schema_type: "JSON" or "CSV".
        num_rows: Requested row count; converted with int() before prompting.

    Returns:
        For "JSON": the reply re-serialized with two-space indentation (str).
        For "CSV": a pandas DataFrame parsed from the reply.
        On a parse failure or an unsupported `schema_type`: an error message
        string (parse failures include the model's raw reply).
    """
    # Ensure num_rows is an integer for the prompt
    num_rows_int = int(num_rows)

    messages = get_messages_prompt(num_rows_int, schema_type)
    raw_text = generate(MODEL[2], messages, True)
    # Models often wrap structured output in a Markdown code fence, which
    # neither json.loads nor pd.read_csv can parse.
    response_text = _strip_code_fences(raw_text)

    if schema_type == "JSON":
        try:
            # Pretty-print JSON for better readability in the Textbox
            parsed_json = json.loads(response_text)
            return json.dumps(parsed_json, indent=2)
        except json.JSONDecodeError:
            return f"Error: Invalid JSON response.\n{raw_text}"
    elif schema_type == "CSV":
        try:
            # Read CSV string into a DataFrame for gr.Dataframe display
            return pd.read_csv(StringIO(response_text))
        except Exception as e:
            return f"Error parsing CSV: {e}\n{raw_text}"
    else:
        return "Unsupported schema type."

# Gradio front end: pick a format and a row count, then generate and display the data.
with gr.Blocks() as demo:
    gr.Markdown("# Synthetic Data Generator")
    with gr.Row():
        schema_type_radio = gr.Radio(
            ["JSON", "CSV"], label="Schema Type", value="JSON"
        )
        num_rows_slider = gr.Slider(
            minimum=1, maximum=100, step=1, value=10, label="Number of Rows"
        )

    status_message = gr.Textbox(label="Status", interactive=False, value="", visible=True)
    output_json = gr.Textbox(label="Generated JSON Data", lines=20, interactive=False, visible=False)
    output_csv = gr.Dataframe(label="Generated CSV Data", interactive=False, visible=False)

    def conditional_output_fn(schema_type, num_rows):
        """Stream UI updates for one click of the Generate button.

        Yields 3-tuples of updates for (status_message, output_json, output_csv):
        first a "please wait" state, then either the result or an error state.
        """
        yield (
            gr.update(value="Generating data... Please wait.", visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
        )
        try:
            result = generate_data_ui(schema_type, num_rows)
        except Exception as e:
            yield (
                gr.update(value=f"Error: {e}", visible=True),
                gr.update(value=None, visible=False),
                gr.update(value=None, visible=False),
            )
            return

        # generate_data_ui reports parse problems by returning a message string:
        # JSON failures start with "Error:", and for CSV any string (rather than
        # a DataFrame) is a failure message.
        if schema_type == "JSON":
            failed = isinstance(result, str) and result.startswith("Error:")
        else:
            failed = isinstance(result, str)

        if failed:
            # Show the message (including the model's raw reply) in the text box
            # instead of pushing a string into the Dataframe component.
            yield (
                gr.update(value="Data generation failed. Details are shown below.", visible=True),
                gr.update(value=result, visible=True),
                gr.update(value=None, visible=False),
            )
        elif schema_type == "JSON":
            yield (
                gr.update(value="Data generation complete!", visible=True),
                gr.update(value=result, visible=True),
                gr.update(value=None, visible=False),
            )
        else:  # CSV
            yield (
                gr.update(value="Data generation complete!", visible=True),
                gr.update(value=None, visible=False),
                gr.update(value=result, visible=True),
            )

    generate_button = gr.Button("Generate Data")
    generate_button.click(
        fn=conditional_output_fn,
        inputs=[schema_type_radio, num_rows_slider],
        outputs=[status_message, output_json, output_csv],
    )

# debug=True keeps the cell running so errors and logs stay visible in the notebook.
demo.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7aa31f0044e9e551fb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Keyboard interruption in main thread... closing server.


KeyboardInterrupt: 