In [1]:

# https://ssahuupgrad-93226.medium.com/using-llms-for-synthetic-data-generation-the-definitive-guide-78aab5f506f0
# https://www.confident-ai.com/blog/the-definitive-guide-to-synthetic-data-generation-using-llms
# https://www.confident-ai.com/blog/why-llm-as-a-judge-is-the-best-llm-evaluation-method
# https://arxiv.org/abs/2304.12244

In [1]:
pip install -U deepeval langchain langchain-community jq langchain-core 'ollama<0.4.0' langchain-ollama tqdm pandas

Note: you may need to restart the kernel to use updated packages.


The system cannot find the file specified.


In [2]:
import os
import pandas as pd

In [3]:
from langchain.schema import Document

def metadata_func(record: dict, metadata: dict) -> dict:
    """
    Merges existing metadata with the metadata extracted from the JSON file
    under the 'product_details' key.
    """
    metadata["product_details"] = record.get("metadata", {})
    return metadata

In [5]:
from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(
    file_path='../product_descriptions.json',
    jq_schema=".[]",
    content_key="content",
    metadata_func=metadata_func)

docs = loader.load()

In [6]:
docs[0]

Document(metadata={'source': 'C:\\Users\\dinit\\Documents\\Research\\Development\\RetailARVA\\notebooks\\product_descriptions.json', 'seq_num': 1, 'product_details': {'id': 1, 'name': 'The Ordinary Peeling Solution', 'brand': 'The Ordinary', 'category': 'Exfoliating Peel', 'price': 'LKR 6,350.00'}}, page_content='**Introducing The Ordinary Peeling Solution: A High-Strength Exfoliating Peel for Radiant Skin**\n\nExperience the power of a clinically formulated exfoliating peel with The Ordinary Peeling Solution, designed to improve skin texture, clear pore congestion, and target uneven skin tone. This high-strength exfoliator is packed with 30% Alpha Hydroxy Acids (AHA) and 2% Beta Hydroxy Acids (BHA), including Glycolic Acid, Salicylic Acid, Lactic Acid, Tartaric Acid, and Citric Acid.\n\n**Key Benefits:**\n\n* Improves skin texture for a smoother complexion\n* Clears pore congestion to reduce the appearance of enlarged pores\n* Targets uneven skin tone to reveal brighter, more radiant 

In [7]:
from langchain_ollama.chat_models import ChatOllama
from langchain.embeddings import OllamaEmbeddings

In [20]:
import os

EMBEDDING_MODEL = 'nomic-embed-text:latest'
CHAT_MODEL = 'qwen3:30b-a3b'

In [9]:
ollama_embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

  ollama_embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)


In [None]:
content = [doc.page_content for doc in docs]
embeddings = []
synthetic_dataset = []

In [13]:
from tqdm import tqdm

for text in tqdm(content, desc="Embedding Documents", total=len(content)):
    embedding = ollama_embeddings.embed_query(text)
    embeddings.append(embedding)

print(f"{len(embeddings)} Embeddings generated successfully.")

Embedding Documents: 100%|██████████| 100/100 [09:13<00:00,  5.53s/it]

100 Embeddings generated successfully.





In [14]:
import random

# randomly selecting a chunk of data to act as your focal anchor
reference_index = random.randint(0, len(embeddings) - 1)
reference_embedding = embeddings[reference_index]
contexts = [content[reference_index]]

In [15]:
contexts

['**Introducing Banila Co Clean It Zero Cleansing Balm: A Revolutionary Skincare Essential**\n\nExperience the power of a deep cleanse with the Banila Co Clean It Zero Cleansing Balm, a game-changing skincare product designed to remove makeup and impurities while softening and hydrating your skin. This innovative cleansing balm is perfect for all skin types, including oily, dry, combination, sensitive, and normal skin.\n\n**Key Ingredients and Benefits**\n\nThe Banila Co Clean It Zero Cleansing Balm features a unique blend of key ingredients, including Acerola Fruit Extract, Papaya Extract, and Bambusa Vulgaris Leaf Extract. These powerful ingredients work together to provide exfoliation, tighten pores, and hydrate the skin, leaving you with a smoother and more even-toned complexion.\n\n**How it Works**\n\nTo use, simply scoop a moderate amount of the cleansing balm and massage it onto your dry face. Add warm water to emulsify, then rinse thoroughly. For best results, use the spatula p

In [16]:
# set a similarity threshold and use cosine similarity to identify related chunks to build your context
import numpy as np

similarity_threshold = 0.9
similar_indices = []
for i, embedding in enumerate(embeddings):
    product = np.dot(reference_embedding, embedding)
    norm = np.linalg.norm(reference_embedding) * np.linalg.norm(embedding)
    similarity = product / norm
    if similarity >= similarity_threshold:
        similar_indices.append(i)
for i in similar_indices:
    contexts.append(content[i])

In [17]:
len(contexts)

2

In [21]:
prompt = f"""I want you act as a copywriter. Based on the given context,
which is list of strings, please generate a list of JSON objects
with a `input` key. The `input` can either be a question or a
statement that can be addressed by the given context.
contexts:
{contexts}"""

In [25]:
llm = ChatOllama(
    base_url='http://64.247.196.62:11434', 
    model=CHAT_MODEL, 
    temperature=0.6
)

In [26]:
query = llm.invoke(prompt)

In [27]:
query.content

'<think>\nOkay, I need to generate a list of JSON objects with an "input" key based on the given contexts. The input should be either a question or a statement that can be addressed by the context. Let me look at the contexts provided.\n\nFirst, the contexts are two identical paragraphs about the Banila Co Clean It Zero Cleansing Balm. So, the information is the same in both. The user wants me to create questions or statements that can be answered using this context.\n\nI should start by breaking down the context into different sections. The sections are: Introduction, Key Ingredients and Benefits, How it Works, Skin Suitability and Concerns, Safety Information, What Our Customers Say, and Get Ready to Experience the Power of Clean Skin. Each of these sections has specific information.\n\nFor each section, I can generate possible questions or statements. For example, under Key Ingredients, the ingredients are Acerola Fruit Extract, Papaya Extract, and Bambusa Vulgaris Leaf Extract. So 

### if you are using a reasoning model do this

In [28]:
import re, json

no_think = re.sub(r'<think>.*?</think>', '', query.content, flags=re.DOTALL).strip()

# 2. Locate the JSON array
match = re.search(r'(\[\s*\{.*\}\s*\])', no_think, flags=re.DOTALL)
if not match:
    raise ValueError("Could not find JSON array in the output")

json_text = match.group(1)

# 3. Parse it
result = json.loads(json_text)
print(result)

[{'input': 'What are the key ingredients in the Banila Co Clean It Zero Cleansing Balm?'}, {'input': 'How do you use the Banila Co Clean It Zero Cleansing Balm?'}, {'input': 'Is the Banila Co Clean It Zero Cleansing Balm suitable for sensitive skin?'}, {'input': 'What skin concerns does the Banila Co Clean It Zero Cleansing Balm address?'}, {'input': 'Does the product contain fragrances?'}, {'input': 'What do customers say about the Banila Co Clean It Zero Cleansing Balm?'}, {'input': 'What is the price of the Banila Co Clean It Zero Cleansing Balm?'}, {'input': 'How does the product work when added to water?'}, {'input': 'Can the Banila Co Clean It Zero Cleansing Balm replace double cleansing?'}, {'input': 'What benefits do the key ingredients provide?'}, {'input': 'Why is the Banila Co Clean It Zero Cleansing Balm considered revolutionary?'}, {'input': 'How can users avoid contamination while using the product?'}, {'input': 'What do expert reviewers say about the product?'}, {'input'

In [39]:
multi_context_template = """
I want you to rewrite the given `input` so that it requires readers to use information from all elements in `Context`
1. `Input` should require information from all `Context` elements.
2. `Rewritten Input` must be concise and fully answerable from `Context`.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""
reasoning_template = """
I want you to rewrite the given `input` so that it explicitly requests multi-step reasoning.
1. `Rewritten Input` should require multiple logical connections or inferences.
2. `Rewritten Input` should be concise and understandable.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` must be fully answerable from `Context`.
5. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""
hypothetical_scenario_template = """
I want you to rewrite the given `input` to incorporate a hypothetical or speculative scenario.
1. `Rewritten Input` should encourage applying knowledge from `Context` to deduce outcomes.
2. `Rewritten Input` should be concise and understandable.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` must be fully answerable from `Context`.
5. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""

In [40]:
from langchain_core.prompts import PromptTemplate

multi_context_template = PromptTemplate.from_template(multi_context_template)
reasoning_template = PromptTemplate.from_template(reasoning_template)
hypothetical_scenario_template = PromptTemplate.from_template(hypothetical_scenario_template)

In [41]:
multi_context_template

PromptTemplate(input_variables=['context', 'original_input'], input_types={}, partial_variables={}, template="\nI want you to rewrite the given `input` so that it requires readers to use information from all elements in `Context`\n1. `Input` should require information from all `Context` elements.\n2. `Rewritten Input` must be concise and fully answerable from `Context`.\n3. Do not use phrases like 'based on the provided context.'\n4. `Rewritten Input` should not exceed 15 words.\nContext: {context}\nInput: {original_input}\nRewritten Input:\n")

In [42]:
evolution_templates = [multi_context_template, reasoning_template, hypothetical_scenario_template]
# Number of evolution steps to apply
num_evolution_steps = 3

In [43]:
multi_context_prompt = multi_context_template.format(context=contexts, original_input=result[0]['input'])
reasoning_prompt = reasoning_template.format(context=contexts, original_input=result[0]['input'])
hypothetical_scenario_prompt = hypothetical_scenario_template.format(context=contexts, original_input=result[0]['input'])

In [44]:
multi_context_prompt

'\nI want you to rewrite the given `input` so that it requires readers to use information from all elements in `Context`\n1. `Input` should require information from all `Context` elements.\n2. `Rewritten Input` must be concise and fully answerable from `Context`.\n3. Do not use phrases like \'based on the provided context.\'\n4. `Rewritten Input` should not exceed 15 words.\nContext: [\'**Introducing Banila Co Clean It Zero Cleansing Balm: A Revolutionary Skincare Essential**\\n\\nExperience the power of a deep cleanse with the Banila Co Clean It Zero Cleansing Balm, a game-changing skincare product designed to remove makeup and impurities while softening and hydrating your skin. This innovative cleansing balm is perfect for all skin types, including oily, dry, combination, sensitive, and normal skin.\\n\\n**Key Ingredients and Benefits**\\n\\nThe Banila Co Clean It Zero Cleansing Balm features a unique blend of key ingredients, including Acerola Fruit Extract, Papaya Extract, and Bamb

In [None]:
# Function to perform random evolution steps
def evolve_query(original_input, context, steps):
    current_input = original_input
    for _ in range(steps):
        # Choose a random (or using custom logic) template from the list
        chosen_template = random.choice(evolution_templates)
        # Replace the placeholders with the current context and input
        evolved_prompt = chosen_template.invoke({"context": context, "original_input": current_input})
        # Update the current input with the "Rewritten Input" section
        current_input = llm.invoke(evolved_prompt)
        # commment out if not using a reasoning model
        current_input = re.sub(r'<think>.*?</think>', '', current_input.content, flags=re.DOTALL).strip()
    return current_input



In [None]:
evolved_queries = []

In [None]:
for original_input in result:
    # Evolve the input by randomly selecting the evolution type
    evolved_query = evolve_query(original_input['input'], contexts, num_evolution_steps)
    # evolve_query = re.sub(r'<think>.*?</think>', '', evolve_query.content, flags=re.DOTALL).strip()
    print(f"Original Input: {original_input['input']}")
    print(f"Evolved Query: {evolved_query}")
    print("-" * 50)
    evolved_queries.append(evolved_query)
    

Original Input: What are the key ingredients in the Banila Co Clean It Zero Cleansing Balm?
Evolved Query: What ingredients exfoliate and what skin concerns do they address?
--------------------------------------------------
Original Input: How do you use the Banila Co Clean It Zero Cleansing Balm?
Evolved Query: How do ingredients and usage benefit diverse skin types?
--------------------------------------------------
Original Input: Is the Banila Co Clean It Zero Cleansing Balm suitable for sensitive skin?
Evolved Query: Is it suitable for sensitive skin with fragrance concerns?
--------------------------------------------------
Original Input: What skin concerns does the Banila Co Clean It Zero Cleansing Balm address?
Evolved Query: Which ingredients target concerns and their application steps?
--------------------------------------------------


KeyboardInterrupt: 

In [None]:
expected_output_template = """
I want you to generate an answer for the given `input`. This answer has to be factually aligned to the provided context.
Context: {context}
Input: {evolved_query}
Answer:
"""

In [None]:
expected_outputs = []

In [None]:
for evolved_query in evolved_queries:
    expected_output_template = PromptTemplate.from_template(expected_output_template)
    expected_output_prompt = expected_output_template.invoke({"context": contexts, "evolved_query": evolved_query})
    print(f"Evolved Query: {evolved_query}")
    expected_output = llm.invoke(expected_output_prompt)
    # only if using a reasoning model
    expected_ouput = re.sub(r'<think>.*?</think>', '', expected_output.content, flags=re.DOTALL).strip()
    print(f"Expected Output: {expected_output}")
    print("-" * 50)
    expected_outputs.append(expected_output)

In [None]:
%pip install pydantic



In [None]:
from pydantic import BaseModel
from typing import Optional, List

In [None]:
class SyntheticData(BaseModel):
	query: str
	expected_output: Optional[str]
	context: List[str]


In [None]:
for i, evolved_query in enumerate(evolved_queries):
    synthetic_data = SyntheticData(
	    query=evolved_query,
	    expected_output=expected_outputs[i],
	    context=contexts
)

In [None]:
synthetic_dataset.append(synthetic_data)