# Synthetic Dataset Generator

In [1]:
# Importing required libraries

import re
import os
import io
import sys
import json
import ollama
import requests
import tempfile
import subprocess
import pandas as pd
import gradio as gr
from pathlib import Path
from openai import OpenAI
from datetime import datetime
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display

In [2]:
# Load variables from a local .env file into the process environment,
# then look up the OpenAI key.
load_dotenv()
OPENAI = os.getenv('OPENAI_API_KEY')

# Report whether a key was found; only its first 8 characters are echoed.
print(
    f"OpenAI API Key exists and begins with {OPENAI[:8]}"
    if OPENAI
    else "OpenAI API Key is not working properly. Please recheck!"
)

OpenAI API Key exists and begins with sk-proj-


In [3]:
# Read the OLLAMA_MODEL environment variable.
# NOTE: this only checks that the variable is set to a non-empty value;
# no request is sent to an Ollama server at this point.
OLLAMA = os.getenv('OLLAMA_MODEL')

print(
    "Ollama is running..."
    if OLLAMA
    else "Failed to establish connection to ollama. Please recheck!"
)

Ollama is running...


### Creating Prompts

In [4]:
# System message sent with every request: sets the model's role as a
# synthetic-data generator and lists the quality requirements for the output.
system_prompt = """You are a data generation expert specialized in creating high-quality, realistic synthetic datasets for business analysis and machine learning applications. 
Your role is to understand the business context and generate structured data that mimics real-world scenarios, reflecting typical patterns, anomalies, and variability seen in actual business operations. 
You will always generate data in the format requested (e.g., CSV, JSON, Parquet) and make educated assumptions to define relevant and meaningful columns if not specified.

Key requirements:
- Tailor the dataset to the specific business domain.
- Include date/time, categorical, numerical, and text fields where applicable.
- Use realistic ranges and distributions for prices, counts, and dates.
- Inject minor noise or variability where appropriate to simulate real-world behavior.
- Respect the requested output format strictly.
"""

In [5]:
# Few-shot instruction template: general generation instructions followed by
# two worked CSV examples (a watch retailer and a ride-sharing platform).
user_prompt = """Please generate a synthetic dataset for the following business scenario. 
If specific columns are not provided, infer and create suitable ones based on your domain knowledge. 
Format the output as requested. Include about 50-75 rows of data unless specified otherwise.

Example 1:
Business: A luxury watch retail store
Format: CSV
Output:
Item,Price,Quantity,Brand,Sale Date
Superocean II,20000,3,Breitling,2025-04-08
Daytona,32000,2,Rolex,2025-04-10
Portugieser,15000,4,IWC,2025-04-11

Example 2:
Business: A ride-sharing platform operating in urban areas
Format: CSV
Output:
Ride ID,Pickup Location,Dropoff Location,Fare (USD),Driver Rating,Trip Duration (min),Timestamp
R12345,Midtown,Soho,18.50,4.9,16,2025-06-01 14:23:00
R12346,Brooklyn,Downtown,22.10,4.7,23,2025-06-01 15:05:00
"""

In [6]:
def dataset_format(data_format, num_records):
    """Return the format/row-count instruction appended to the user prompt.

    Args:
        data_format: Requested output format. Matching is case-insensitive
            and ignores surrounding whitespace; recognised values are
            "CSV", "JSON" and "Tabular". ``None`` or any other value yields
            the "choose a valid format" message instead of raising.
        num_records: Number of rows to request. A whole-number float such as
            ``50.0`` is rendered as ``50`` so the instruction reads naturally.

    Returns:
        A single sentence pair: the format instruction followed by the
        exact-record-count instruction.
    """
    instructions = {
        'CSV': 'Please provide the dataset in CSV format.',
        'JSON': 'Please provide the dataset in JSON format.',
        'TABULAR': 'Please provide the dataset in a tabular format (markdown style).',
    }
    fallback = 'Please choose a valid dataset format: CSV, JSON, or Tabular.'

    # Normalise defensively: a missing selection must not raise AttributeError.
    key = str(data_format or '').strip().upper()
    format_message = instructions.get(key, fallback)

    # UI sliders may deliver whole numbers as floats; avoid "exactly 50.0 records".
    if isinstance(num_records, float) and num_records.is_integer():
        num_records = int(num_records)

    return f"{format_message} Generate exactly {num_records} realistic records based on the business context."

In [7]:
def complete_user_prompt(user_input, data_format, num_records):
    """Build the chat messages for one dataset-generation request.

    The user message is assembled from three parts, each on its own line:
    the few-shot ``user_prompt`` template, the caller's business scenario,
    and the format/row-count instruction from ``dataset_format``. Previously
    the scenario and the instruction were concatenated with no separator and
    the few-shot template was never sent.

    Args:
        user_input: Free-text description of the business scenario.
        data_format: Requested output format, passed to ``dataset_format``.
        num_records: Number of rows to request, passed to ``dataset_format``.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        message followed by the user message.
    """
    scenario = (user_input or "").strip()
    user_content = (
        f"{user_prompt}\n"
        f"Business: {scenario}\n"
        f"{dataset_format(data_format, num_records)}"
    )

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

### Generating the dataset

In [10]:
def _stream_openai_tokens(messages):
    """Yield text fragments from a streamed gpt-4o chat completion."""
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.7,
        stream=True,
    )
    for chunk in response:
        # Some stream chunks carry no choices; indexing them would raise.
        if not chunk.choices:
            continue
        yield chunk.choices[0].delta.content or ""


def _stream_ollama_tokens(messages):
    """Yield text fragments from a streamed llama3 chat via Ollama."""
    # ollama.chat accepts role/content messages directly, so the system
    # prompt is sent exactly once instead of being flattened into the user
    # text and then repeated as a separate system message.
    stream = ollama.chat(model="llama3", messages=messages, stream=True)
    for chunk in stream:
        yield chunk.get("message", {}).get("content", "") or ""


def generate_dataset(user_input, data_format, model_choice, num_records):
    """Stream a synthetic dataset from the selected model.

    This is a generator: each yielded value is the full response text
    accumulated so far, so a UI bound to it can show the output growing.

    Args:
        user_input: Free-text description of the business scenario.
        data_format: Requested output format ("CSV", "JSON" or "Tabular").
        model_choice: "OpenAI" or "Ollama".
        num_records: Number of rows to request.

    Yields:
        The cumulative response text after each streamed chunk. On failure a
        single "<model> API Error: ..." string is yielded instead; an unknown
        model or a blank scenario yields one explanatory message.
    """
    streamers = {
        "OpenAI": _stream_openai_tokens,
        "Ollama": _stream_ollama_tokens,
    }
    stream_tokens = streamers.get(model_choice)
    if stream_tokens is None:
        yield f"Unsupported model: {model_choice}"
        return

    # Avoid calling a model when there is nothing to generate from.
    if not user_input or not user_input.strip():
        yield "Please describe a business scenario before generating data."
        return

    messages = complete_user_prompt(user_input, data_format, num_records)

    full_response = ""
    try:
        for token in stream_tokens(messages):
            full_response += token
            yield full_response
    except Exception as e:
        # UI boundary: surface the provider error in the output box rather
        # than letting the exception escape into the event handler.
        yield f"{model_choice} API Error: {str(e)}"

### Gradio UI

In [11]:
with gr.Blocks(title="Synthetic Data Generator", theme=gr.themes.Soft()) as ui:
    # Page header and a one-line usage hint.
    gr.Markdown("<h1 style='text-align: center;'>Synthetic Data Generator</h1>")
    gr.Markdown("Describe a business scenario to generate synthetic data using OpenAI or Ollama.")

    with gr.Row():
        # Left column: everything the user fills in before generating.
        with gr.Column(scale=1):
            user_inputs = gr.Textbox(
                label="Business Scenario & Data Requirements",
                placeholder="E.g., A startup analyzing food delivery order trends in urban cities...",
                lines=15,
            )

            model_choice = gr.Dropdown(
                choices=["OpenAI", "Ollama"],
                label="Choose Model",
                value="OpenAI",
            )

            target_format = gr.Radio(
                choices=["CSV", "JSON", "Tabular"],
                label="Output Format",
                value="CSV",
            )

            num_records = gr.Slider(
                minimum=10,
                maximum=200,
                step=10,
                value=50,
                label="Number of Records",
            )

            with gr.Row():
                generate_button = gr.Button("Generate")
                clear_button = gr.Button("Clear")

        # Right column: read-only box that receives the generated text.
        with gr.Column(scale=1):
            output = gr.Textbox(
                label="Generated Synthetic Data",
                lines=30,
                interactive=False,
                show_copy_button=True,
            )

    # Event bindings
    # The input order must match generate_dataset's parameter order:
    # (user_input, data_format, model_choice, num_records).
    generate_button.click(
        fn=generate_dataset,
        inputs=[user_inputs, target_format, model_choice, num_records],
        outputs=output,
    )

    # "Clear" resets only the output box; the form inputs are left as they are.
    clear_button.click(
        fn=lambda: "",
        inputs=[],
        outputs=output,
    )

# share=True asks Gradio for a public share link in addition to the local URL.
if __name__ == "__main__":
    ui.launch(share=True)

* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://09e46aff9428d40a58.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
