# Synthetic Data Generator 

**Generate solid, realistic, and well-structured synthetic tabular data from natural-language prompts using LLMs**

This notebook demonstrates how to create synthetic datasets using the **Meta-LLaMA 3.1 8B-Instruct** model from HuggingFace. You can choose another model on HuggingFace and describe the type of dataset you want; the LLM-powered synthetic data generator (SDG) will then create it.

### Features
- 🎯 Natural language data specification
- 📊 Automatic JSON-to-DataFrame conversion
- 🔧 4-bit quantization for efficient GPU usage
- 🖥️ Interactive Streamlit UI

### Requirements
- Google Colab with GPU runtime (T4 or better)
- HuggingFace account with access to LLaMA models
- HuggingFace API token stored in Colab secrets as `HF_TOKEN`

---
## 1. Install Dependencies

Install PyTorch with CUDA support, HuggingFace Transformers, and Streamlit.

In [None]:
# Install PyTorch with CUDA 12.4 support
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124

# Install HuggingFace libraries, bitsandbytes (4-bit quantization), Streamlit, and pyngrok (public tunnel for the UI)
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 streamlit pyngrok

---
## 2. Import Libraries & Authenticate with HuggingFace

Import all necessary libraries and authenticate with HuggingFace using your API token.

In [None]:
import os
import json
import re

import pandas as pd
import torch
from IPython.display import display

from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Authenticate with HuggingFace
# Set HF_TOKEN in Colab secrets (key icon in left sidebar)
# The token is read from Colab's secret store so it is not hard-coded in the notebook.
hf_token = userdata.get('HF_TOKEN')
# Log in to the HuggingFace Hub with that token so the model files can be
# downloaded later; add_to_git_credential=True additionally asks
# huggingface_hub to store the token for git (see the huggingface_hub docs).
login(hf_token, add_to_git_credential=True)
print('Successfully authenticated with HuggingFace!')

---
## 3. Configure Model

Select which LLaMA model variant to use.

In [None]:
# Choose your model variant
# MODEL_ID is the HuggingFace Hub repo id passed to from_pretrained() below.
# The default is a LLaMA model, so the authenticated account needs access to it.
# NOTE(review): other chat/instruct models should work too, since the code only
# relies on the tokenizer's chat template - confirm per model.
MODEL_ID = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
print(f'Selected model: {MODEL_ID}')

---
## 4. Load Model with 4-bit Quantization

Using 4-bit quantization to reduce memory usage while maintaining quality.

In [None]:
# Configure 4-bit quantization: NF4 weights with double quantization and
# bfloat16 compute, to reduce GPU memory usage while maintaining quality.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4',
)

print('Loading model...')

# trust_remote_code is deliberately left at its default (False): LLaMA is
# implemented natively in transformers, so there is no reason to allow Python
# code from the Hub repository to execute locally. Only re-enable it for a
# model whose card explicitly requires custom code, after reviewing that code.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map='auto',  # let accelerate place the weights on the available device(s)
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Reuse the end-of-sequence token for padding (the tokenizer is assumed not to
# define a dedicated pad token).
tokenizer.pad_token = tokenizer.eos_token

print('Model loaded successfully!')

---
## 5. Define Core Functions

In [None]:
# System prompt sent with every request: it pins the output to a single JSON
# object that maps column names to equally sized arrays of values.
SYSTEM_PROMPT = '''You are a synthetic dataset generator. Generate realistic tabular data based on user descriptions.

CRITICAL RULES:
1. Output ONLY valid JSON - no explanations, no markdown, no extra text
2. EVERY column array MUST have EXACTLY the same number of elements
3. Column names should be lowercase with underscores (snake_case)

OUTPUT FORMAT:
{"column_name": ["value1", "value2"], "another_column": [1, 2]}

Remember: EXACTLY N rows per column where N is the requested count.'''


def build_messages(user_instructions):
    """Build the chat transcript handed to the tokenizer's chat template.

    Args:
        user_instructions: Text placed verbatim in the user turn.

    Returns:
        A two-element list of ``{'role': ..., 'content': ...}`` dicts: the
        system turn carrying ``SYSTEM_PROMPT`` followed by the user turn.
    """
    system_turn = {'role': 'system', 'content': SYSTEM_PROMPT}
    user_turn = {'role': 'user', 'content': user_instructions}
    return [system_turn, user_turn]


def extract_json(text):
    """Extract the last top-level JSON object embedded in ``text``.

    The text is scanned left to right. Every ``{`` that begins a decodable JSON
    object is parsed in full and the scan resumes *after* that object, so
    objects nested inside it (for example a column whose values are dicts) are
    never mistaken for the result. When several top-level objects are present,
    the last one wins.

    Args:
        text: Raw model output, possibly containing prose around the JSON.

    Returns:
        The decoded ``dict``, or ``None`` when no JSON object can be decoded.
    """
    decoder = json.JSONDecoder()
    found = None
    pos = text.find('{')
    while pos != -1:
        try:
            parsed, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not a valid object start (or truncated): try the next brace.
            pos = text.find('{', pos + 1)
            continue
        if isinstance(parsed, dict):
            found = parsed
        # Skip the whole decoded object so its inner braces are not revisited.
        pos = text.find('{', end)
    return found


def normalize_arrays(data, target_length):
    """Force every column in ``data`` to contain exactly ``target_length`` items.

    Args:
        data: Mapping of column name to a list of values. A non-list value is
            treated as a single-element column.
        target_length: Number of rows each column must have.

    Returns:
        A new dict with the same keys in which:
        * columns that are too long are truncated,
        * columns that are too short are extended by cycling their values,
        * empty columns are filled with ``None`` (there is nothing to cycle,
          and dividing by their zero length would otherwise raise
          ``ZeroDivisionError``).
    """
    normalized = {}
    for key, values in data.items():
        if not isinstance(values, list):
            values = [values]
        if not values:
            normalized[key] = [None] * target_length
            continue
        current_len = len(values)
        if current_len < target_length:
            # Ceiling division: the fewest whole repetitions that reach the target.
            repeats = -(-target_length // current_len)
            values = values * repeats
        normalized[key] = values[:target_length]
    return normalized


def generate_data(instructions, n_rows, temperature=0.7):
    """Generate a synthetic table from a natural-language description.

    Args:
        instructions: Free-text description of the desired columns/values.
        n_rows: Number of rows requested from the model; the parsed columns are
            padded or trimmed to exactly this length.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        A ``pandas.DataFrame`` with one column per key of the model's JSON
        answer and ``n_rows`` rows.

    Raises:
        ValueError: If no JSON object can be extracted from the model output.
    """
    user_query = f'Generate EXACTLY {n_rows} rows of data.\n\nRequirements: {instructions.strip()}'
    messages = build_messages(user_query)

    # add_generation_prompt=True appends the assistant header to the prompt so
    # the model starts answering right away instead of having to produce the
    # header itself. The tensor follows the model's device rather than a
    # hard-coded 'cuda'.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors='pt',
    ).to(model.device)

    outputs = model.generate(
        inputs,
        # Single unpadded prompt: every position is a real token. Passing the
        # mask explicitly matters because pad and EOS share the same token id.
        attention_mask=torch.ones_like(inputs),
        max_new_tokens=1500,
        do_sample=True,
        temperature=temperature,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Splitting the full transcript on
    # the word 'assistant' would cut the JSON apart whenever the generated data
    # itself contains that word (e.g. a job title).
    prompt_len = inputs.shape[-1]
    response_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    parsed = extract_json(response_text)

    if parsed is None:
        raise ValueError('Failed to parse model response as JSON')

    normalized = normalize_arrays(parsed, n_rows)
    return pd.DataFrame(normalized)

print('Core functions defined!')

---
## 6. Test Data Generation

In [None]:
# Example: Generate customer data
# Free-text description of the desired columns; generate_data() strips it and
# embeds it in the user prompt after "Requirements:".
test_instructions = '''
Customer data with:
- first_name and last_name
- age between 25 and 55
- email addresses
- city (US cities)
'''

print('Generating data...')
# temperature is left at generate_data's default (0.7); sampling is enabled
# there, so repeated runs can return different rows.
df = generate_data(test_instructions, n_rows=10)
print('Data generated!')
display(df)

---
## 7. Interactive Streamlit UI

Launch an interactive interface for generating synthetic data.

In [None]:
%%writefile streamlit_app.py
import streamlit as st
import pandas as pd
import json
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Browser tab title, icon and wide layout. The icon is a real emoji character;
# the previous value was a mis-encoded (mojibake) byte sequence.
st.set_page_config(page_title='SynthGen', page_icon='🧪', layout='wide')

# Condensed system prompt: asks for a single JSON object that maps each column
# name to an array with exactly the requested number of values.
SYSTEM_PROMPT = '''You are a synthetic dataset generator. Output ONLY valid JSON.
Format: {"column": ["val1", "val2"], "other": [1, 2]}
EXACTLY N rows per column. No explanations.'''

@st.cache_resource
def load_model():
    """Load the 4-bit quantized LLaMA 3.1 8B-Instruct model and its tokenizer.

    The function is wrapped in ``st.cache_resource``, which is expected to keep
    the loaded objects alive between Streamlit script reruns.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to its
        end-of-sequence token.
    """
    model_id = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

    # NF4 weights, double quantization, bfloat16 compute.
    four_bit = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    llm = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map='auto',
        quantization_config=four_bit,
    )

    tok = AutoTokenizer.from_pretrained(model_id)
    tok.pad_token = tok.eos_token
    return llm, tok

def extract_json(text):
    """Extract the last top-level JSON object embedded in ``text``.

    The text is scanned left to right. Every ``{`` that begins a decodable JSON
    object is parsed in full and the scan resumes after that object, so dicts
    nested inside it are never returned in place of the whole answer. When
    several top-level objects are present, the last one wins.

    Args:
        text: Raw model output, possibly containing prose around the JSON.

    Returns:
        The decoded ``dict``, or ``None`` when no JSON object can be decoded.
    """
    decoder = json.JSONDecoder()
    found = None
    pos = text.find('{')
    while pos != -1:
        try:
            parsed, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Only decoding failures are expected here; anything else should
            # surface instead of being swallowed by a bare ``except``.
            pos = text.find('{', pos + 1)
            continue
        if isinstance(parsed, dict):
            found = parsed
        # Skip the whole decoded object so its inner braces are not revisited.
        pos = text.find('{', end)
    return found

st.title('üß™ SynthGen')
st.markdown('Generate synthetic datasets using LLaMA 3.1')

with st.spinner('Loading model...'):
    model, tokenizer = load_model()
st.success('Model loaded!')

if 'dataframe' not in st.session_state:
    st.session_state.dataframe = pd.DataFrame()

col1, col2 = st.columns([1, 2])

with col1:
    st.subheader('Configuration')
    n_rows = st.number_input('Number of Rows', min_value=1, max_value=100, value=10)
    temperature = st.slider('Temperature', 0.1, 1.5, 0.7, 0.1)
    instructions = st.text_area('Describe your data', height=150)
    
    if st.button('Generate', type='primary'):
        if instructions:
            with st.spinner('Generating...'):
                try:
                    messages = [
                        {'role': 'system', 'content': SYSTEM_PROMPT},
                        {'role': 'user', 'content': f'Generate EXACTLY {n_rows} rows. Requirements: {instructions}'}
                    ]
                    inputs = tokenizer.apply_chat_template(messages, return_tensors='pt').to('cuda')
                    outputs = model.generate(inputs, max_new_tokens=1500, temperature=temperature, do_sample=True)
                    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
                    parsed = extract_json(decoded.split('assistant')[-1] if 'assistant' in decoded else decoded)
                    if parsed:
                        st.session_state.dataframe = pd.DataFrame(parsed)
                        st.success('Generated!')
                    else:
                        st.error('Failed to parse response')
                except Exception as e:
                    st.error(f'Error: {e}')
    
    if not st.session_state.dataframe.empty:
        st.download_button('Download CSV', st.session_state.dataframe.to_csv(index=False), 'data.csv')

with col2:
    st.subheader('Generated Dataset')
    if not st.session_state.dataframe.empty:
        st.dataframe(st.session_state.dataframe, use_container_width=True)
    else:
        st.info('Enter instructions and click Generate')

In [None]:
# Launch Streamlit with ngrok tunnel
from pyngrok import ngrok
import time

# Start Streamlit
!nohup streamlit run streamlit_app.py --server.port 8501 > /dev/null 2>&1 &
time.sleep(3)

# Create public URL
public_url = ngrok.connect(8501)
print(f'\nüöÄ Open this URL: {public_url}')

---
## 8. Export Data

In [None]:
# Export to CSV (uncomment the line below to save the DataFrame generated in section 6)
# df.to_csv('my_data.csv', index=False)