In [2]:
# Step 1: Install required libraries and resolve conflicts
!pip install transformers datasets gradio fpdf python-docx fsspec==2024.9.0 gcsfs --upgrade

# Step 2: Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import gradio as gr
from fpdf import FPDF  # For generating PDF files
from docx import Document  # For generating Word documents

# Step 3: Load a more powerful model
# One checkpoint identifier feeds both from_pretrained() calls, so the tokenizer
# and the model weights always come from the same checkpoint.
model_name = "EleutherAI/gpt-neo-1.3B"  # Larger model for better text generation
# `tokenizer` and `model` are module-level objects; the text-generation pipeline
# further down in this cell is built from them.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: in the recorded run this call fetched a 5.31 GB model.safetensors file,
# so the first execution on a fresh runtime is dominated by this download.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 4: Define the function for synthetic data generation
def _get_generator():
    """Return the shared text-generation pipeline, building it on first use.

    The pipeline wraps the module-level ``model`` and ``tokenizer``; building
    it once avoids repeating that setup on every request.
    """
    cached = getattr(_get_generator, "_pipeline", None)
    if cached is None:
        cached = pipeline("text-generation", model=model, tokenizer=tokenizer)
        _get_generator._pipeline = cached
    return cached


def generate_data(prompt, generator=None):
    """Generate synthetic text that continues ``prompt``.

    Args:
        prompt: Seed text handed to the text-generation pipeline.
        generator: Optional callable following the pipeline calling
            convention, i.e. ``generator(prompt, **generation_kwargs)``
            returning a list of dicts with a ``"generated_text"`` key.
            When omitted, the shared pipeline built from the module-level
            ``model`` and ``tokenizer`` is used.

    Returns:
        The ``"generated_text"`` string of the single returned sequence.
    """
    if generator is None:
        generator = _get_generator()

    synthetic_data = generator(
        prompt,
        max_length=500,  # Generate longer outputs
        num_return_sequences=1,
        # temperature/top_p only take effect when sampling is enabled; request
        # it explicitly instead of relying on the checkpoint's defaults.
        do_sample=True,
        temperature=0.8,  # Adjust randomness
        top_p=0.9,        # Use nucleus sampling
    )
    return synthetic_data[0]["generated_text"]

# Step 5: Define functions to save outputs in different formats
def save_as_pdf(text):
    """Render ``text`` into ``synthetic_data.pdf`` and return that path.

    Args:
        text: The text to place on a single flowing page sequence.

    Returns:
        The relative path of the PDF file that was written.
    """
    # The built-in "Arial" font only covers Latin-1. Generated text can contain
    # characters outside it (curly quotes, dashes, emoji), which makes the
    # library fail with an encoding error. Replace them with "?" so the export
    # always succeeds.
    printable_text = text.encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # Width 0 lets the cell span to the right margin; 10 is the line height.
    pdf.multi_cell(0, 10, printable_text)
    file_path = "synthetic_data.pdf"
    pdf.output(file_path)
    return file_path

def save_as_word(text):
    """Store ``text`` as one paragraph of a Word document and return its path.

    Args:
        text: Content of the single paragraph added to the document.

    Returns:
        The relative path of the ``.docx`` file that was written.
    """
    output_name = "synthetic_data.docx"
    word_document = Document()
    word_document.add_paragraph(text)
    word_document.save(output_name)
    return output_name

def save_as_text(text):
    """Write ``text`` to ``synthetic_data.txt`` as UTF-8 and return that path.

    Args:
        text: The text to store.

    Returns:
        The relative path of the text file that was written.
    """
    file_path = "synthetic_data.txt"
    # Pin the encoding: the platform default may not be able to represent every
    # character the model can produce.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)
    return file_path

# Step 6: Create Gradio interface
def gradio_interface(prompt, save_format):
    """Gradio callback: generate text for ``prompt`` and export it to a file.

    Args:
        prompt: Text handed to ``generate_data``.
        save_format: ``"PDF"``, ``"Word"`` or ``"Text"``; any other value
            skips the export step.

    Returns:
        A ``(generated_text, file_path)`` tuple. ``file_path`` is ``None``
        when ``save_format`` matched none of the known formats.
    """
    generated_text = generate_data(prompt)

    # Ordered label -> writer table, checked top to bottom.
    writers = (
        ("PDF", save_as_pdf),
        ("Word", save_as_word),
        ("Text", save_as_text),
    )
    file_path = None
    for label, writer in writers:
        if save_format == label:
            file_path = writer(generated_text)
            break

    return generated_text, file_path

# Widgets feeding gradio_interface(prompt, save_format), in argument order.
interface_inputs = [
    gr.Textbox(label="Enter your prompt", placeholder="E.g., Generate a dataset of customer names..."),
    gr.Radio(["PDF", "Word", "Text"], label="Save format"),
]

# Widgets receiving the (generated_text, file_path) tuple, in return order.
interface_outputs = [
    gr.Textbox(label="Generated Synthetic Data"),
    gr.File(label="Download File"),
]

interface = gr.Interface(
    fn=gradio_interface,
    inputs=interface_inputs,
    outputs=interface_outputs,
    title="Synthetic Data Generator",
    description="Generate synthetic datasets with a powerful model and save the output as PDF, Word, or Text.",
)

# Step 7: Launch the interface
# launch() is called with its defaults; the recorded Colab run reports that
# share=True was set automatically, which serves the app on a public URL.
interface.launch()


Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
INFO: pip is looking at multiple versions of gcsfs to determine which version is compatible with other requirements. This could take a while.
Collecting gcsfs
  Downloading gcsfs-2024.10.0-py2.py3-none-any.whl.metadata (1.6 kB)
  Downloading gcsfs-2024.9.0.post1-py2.py3-none-any.whl.metadata (1.6 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K 

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://463bd0af6ef214f265.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


