In [1]:

# https://ssahuupgrad-93226.medium.com/using-llms-for-synthetic-data-generation-the-definitive-guide-78aab5f506f0
# https://www.confident-ai.com/blog/the-definitive-guide-to-synthetic-data-generation-using-llms
# https://www.confident-ai.com/blog/why-llm-as-a-judge-is-the-best-llm-evaluation-method
# https://arxiv.org/abs/2304.12244

In [None]:
pip install -U deepeval langchain langchain-community jq langchain-core 'ollama<0.4.0' langchain-ollama tqdm pandas pydantic

Note: you may need to restart the kernel to use updated packages.


The system cannot find the file specified.


In [2]:
import os
import pandas as pd

In [3]:
from pydantic import BaseModel
from typing import Optional, List

In [46]:
class SyntheticData(BaseModel):
	query: str
	expected_output: Optional[str]
	context: str
	groundedness_score: Optional[int]
	groundedness_eval: Optional[str]
	relevance_score: Optional[int]
	relevance_eval: Optional[str]

In [5]:
from langchain.schema import Document

def metadata_func(record: dict, metadata: dict) -> dict:
    """
    Merges existing metadata with the metadata extracted from the JSON file
    under the 'product_details' key.
    """
    metadata["product_details"] = record.get("metadata", {})
    return metadata

In [6]:
from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(
    file_path='../product_descriptions.json',
    jq_schema=".[]",
    content_key="content",
    metadata_func=metadata_func)

docs = loader.load()

In [7]:
docs[0]

Document(metadata={'source': 'D:\\research\\RetailARVA\\notebooks\\product_descriptions.json', 'seq_num': 1, 'product_details': {'id': 1, 'name': 'The Ordinary Peeling Solution', 'brand': 'The Ordinary', 'category': 'Exfoliating Peel', 'price': 'LKR 6,350.00'}}, page_content='**Introducing The Ordinary Peeling Solution: A High-Strength Exfoliating Peel for Radiant Skin**\n\nExperience the power of a clinically formulated exfoliating peel with The Ordinary Peeling Solution, designed to improve skin texture, clear pore congestion, and target uneven skin tone. This high-strength exfoliator is packed with 30% Alpha Hydroxy Acids (AHA) and 2% Beta Hydroxy Acids (BHA), including Glycolic Acid, Salicylic Acid, Lactic Acid, Tartaric Acid, and Citric Acid.\n\n**Key Benefits:**\n\n* Improves skin texture for a smoother complexion\n* Clears pore congestion to reduce the appearance of enlarged pores\n* Targets uneven skin tone to reveal brighter, more radiant skin\n\n**How to Use:**\n\nFor optimal

In [8]:
import os

EMBEDDING_MODEL = 'nomic-embed-text:latest'
CHAT_MODEL = 'qwen3:14b-q4_K_M'
OLLAMA_URL = 'http://209.137.198.202:11434'

In [9]:
os.environ["LANGSMITH_TRACING"]='true'
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"]="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"
os.environ["LANGSMITH_PROJECT"]="retailarva-rag-eval"

In [10]:

from langchain_ollama import OllamaEmbeddings

ollama_embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=OLLAMA_URL)

In [11]:
# content = [doc.page_content for doc in docs]
contents = []
embeddings = []
synthetic_dataset = []

In [12]:
for doc in docs:
     contents.append({
          "content": doc.page_content,
          "name": doc.metadata['product_details']['name']
     })

In [13]:
from tqdm import tqdm

for doc in tqdm(contents, desc="Embedding Documents", total=len(contents)):
    embedding = ollama_embeddings.embed_query(doc['content'])
    embeddings.append(embedding)

print(f"{len(embeddings)} Embeddings generated successfully.")

Embedding Documents:   0%|          | 0/100 [00:00<?, ?it/s]

Embedding Documents: 100%|██████████| 100/100 [00:29<00:00,  3.34it/s]

100 Embeddings generated successfully.





In [49]:
import random

# randomly selecting a chunk of data to act as your focal anchor
reference_index = random.randint(0, len(embeddings) - 1)
reference_embedding = embeddings[reference_index]
context = contents[reference_index]

In [50]:
context

{'content': '**Introducing Differin Acne Treatment: A Powerful Solution for Clearer Skin**\n\nSay goodbye to acne, blackheads, and clogged pores with Differin Acne Treatment, a clinically-proven formula designed to regulate skin cell turnover and reduce inflammation. This effective treatment is specifically formulated for oily, combination, and acne-prone skin types, addressing concerns such as sensitivity, redness, and irritation.\n\n**Key Ingredients:**\n\nDifferin Acne Treatment features Adapalene as its key ingredient, a derivative of vitamin A that helps to prevent clogged pores and reduce the appearance of acne. With a concentration of 0.1%, this gel is gentle yet effective in controlling breakouts. The full ingredient list includes:\n\n* Active: Adapalene 0.1%\n* Inactive: Carbomer 940, Edetate Disodium, Methylparaben, Poloxamer 182, Propylene Glycol, Purified Water, Sodium Hydroxide\n\n**Benefits and Claims:**\n\nThis fragrance-free, dye-free, lanolin-free, and sulfate-free tre

In [51]:
prompt = f"""I want you act as a copywriter. Based on the given context,
which is list of strings, please generate a list of 5 JSON objects
with a `input` key. 
- The `input` should be a question that can be addressed by the given context.
- The `input` should Mimic the kind of queries a customer might ask from shop assistant regarding skincare products.

context:
{context['content']}"""

In [52]:
from langchain_ollama import ChatOllama

llm = ChatOllama(
    base_url=OLLAMA_URL, 
    model=CHAT_MODEL, 
    temperature=0.6,
    max_tokens=512,
)

In [54]:
query = llm.invoke(prompt)

In [55]:
query.content

'<think>\nOkay, so I need to generate 5 JSON objects based on the given context about Differin Acne Treatment. Each JSON should have an "input" key that\'s a question a customer might ask a shop assistant regarding skincare products. Let me start by reading through the context carefully to understand the key points.\n\nThe context mentions Differin\'s key ingredient is Adapalene 0.1%, which is a vitamin A derivative. It\'s for oily, combination, and acne-prone skin. Benefits include clearing acne, reducing inflammation, and being gentle. Usage instructions say to apply once daily, start with a small amount, avoid sun exposure, and use sunscreen. It also mentions possible side effects like irritation, dryness, and that it contains Methylparaben and rubber as an allergen. Customer reviews are positive, and the price is 8,950 LKR.\n\nNow, thinking about possible customer questions. They might ask about the main ingredient, how it works, suitability for their skin type, how to use it, side

### if you are using a reasoning model do this

In [56]:
import re, json

no_think = re.sub(r'<think>.*?</think>', '', query.content, flags=re.DOTALL).strip()

# 2. Locate the JSON array
match = re.search(r'(\[\s*\{.*\}\s*\])', no_think, flags=re.DOTALL)
if not match:
    raise ValueError("Could not find JSON array in the output")

json_text = match.group(1)

# 3. Parse it
result = json.loads(json_text)
print(result)

[{'input': 'What is the main active ingredient in Differin Acne Treatment and how does it help with acne?'}, {'input': 'Is Differin Acne Treatment suitable for sensitive skin?'}, {'input': 'How should I apply Differin Acne Treatment for the best results?'}, {'input': 'What are the potential side effects of using Differin Acne Treatment?'}, {'input': 'What is the price of Differin Acne Treatment?'}]


In [57]:
for input in result:
     print(input['input'])

What is the main active ingredient in Differin Acne Treatment and how does it help with acne?
Is Differin Acne Treatment suitable for sensitive skin?
How should I apply Differin Acne Treatment for the best results?
What are the potential side effects of using Differin Acne Treatment?
What is the price of Differin Acne Treatment?


In [58]:
len(result)

5

In [59]:
multi_context_template = """
I want you to rewrite the given `input` so that it requires readers to use information in `Context`
1. `Input` should require information from `Context` elements.
2. `Rewritten Input` must be concise and fully answerable from `Context`.
3. `Rewritten Input` should Mimic the kind of queries a customer might ask from shop assistant regarding skincare products.
4. Do not use phrases like 'based on the provided context.'
5. `Rewritten Input` should not exceed 30 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""

reasoning_template = """
I want you to rewrite the given `input` so that it explicitly requests multi-step reasoning.
1. `Rewritten Input` should require multiple logical connections or inferences.
2. `Rewritten Input` should be concise and understandable.
3. `Rewritten Input` should Mimic the kind of queries a customer might ask from shop assistant regarding skincare products.
4. Do not use phrases like 'based on the provided context.'
5. `Rewritten Input` must be fully answerable from `Context`.
6. `Rewritten Input` should not exceed 30 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""

hypothetical_scenario_template = """
I want you to rewrite the given `input` to incorporate a hypothetical or speculative scenario.
1. scenario is a customer asking information about a skincare or asking alternative products/recommendations from a shop assistant.
2. `Rewritten Input` should encourage applying knowledge from `Context` to deduce outcomes.
3. `Rewritten Input` should be concise and understandable.
4. `Rewritten Input` should Mimic the kind of queries a customer might ask from shop assistant regarding skincare products.
5. Do not use phrases like 'based on the provided context.'
6. `Rewritten Input` must be fully answerable from `Context`.
7. `Rewritten Input` should not exceed 30 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""

In [60]:
from langchain_core.prompts import PromptTemplate

multi_context_template = PromptTemplate.from_template(multi_context_template)
reasoning_template = PromptTemplate.from_template(reasoning_template)
hypothetical_scenario_template = PromptTemplate.from_template(hypothetical_scenario_template)

In [61]:
multi_context_template

PromptTemplate(input_variables=['context', 'original_input'], input_types={}, partial_variables={}, template="\nI want you to rewrite the given `input` so that it requires readers to use information in `Context`\n1. `Input` should require information from `Context` elements.\n2. `Rewritten Input` must be concise and fully answerable from `Context`.\n3. `Rewritten Input` should Mimic the kind of queries a customer might ask from shop assistant regarding skincare products.\n4. Do not use phrases like 'based on the provided context.'\n5. `Rewritten Input` should not exceed 30 words.\nContext: {context}\nInput: {original_input}\nRewritten Input:\n")

In [62]:
evolution_templates = [multi_context_template, reasoning_template, hypothetical_scenario_template]
# Number of evolution steps to apply
num_evolution_steps = 3

In [63]:
multi_context_prompt = multi_context_template.format(context=context['content'], original_input=result[0]['input'])
reasoning_prompt = reasoning_template.format(context=context['content'], original_input=result[0]['input'])
hypothetical_scenario_prompt = hypothetical_scenario_template.format(context=context['content'], original_input=result[0]['input'])

In [64]:
multi_context_prompt

'\nI want you to rewrite the given `input` so that it requires readers to use information in `Context`\n1. `Input` should require information from `Context` elements.\n2. `Rewritten Input` must be concise and fully answerable from `Context`.\n3. `Rewritten Input` should Mimic the kind of queries a customer might ask from shop assistant regarding skincare products.\n4. Do not use phrases like \'based on the provided context.\'\n5. `Rewritten Input` should not exceed 30 words.\nContext: **Introducing Differin Acne Treatment: A Powerful Solution for Clearer Skin**\n\nSay goodbye to acne, blackheads, and clogged pores with Differin Acne Treatment, a clinically-proven formula designed to regulate skin cell turnover and reduce inflammation. This effective treatment is specifically formulated for oily, combination, and acne-prone skin types, addressing concerns such as sensitivity, redness, and irritation.\n\n**Key Ingredients:**\n\nDifferin Acne Treatment features Adapalene as its key ingred

In [65]:
# Function to perform random evolution steps
def evolve_query(original_input, context, steps):
    current_input = original_input
    for _ in range(steps):
        # Choose a random (or using custom logic) template from the list
        chosen_template = random.choice(evolution_templates)
        # Replace the placeholders with the current context and input
        evolved_prompt = chosen_template.invoke({"context": context['content'], "original_input": current_input})
        # Update the current input with the "Rewritten Input" section
        current_input = llm.invoke(evolved_prompt)
        # commment out if not using a reasoning model
        current_input = re.sub(r'<think>.*?</think>', '', current_input.content, flags=re.DOTALL).strip()
    return current_input



In [66]:
evolved_queries = []

In [67]:
for original_input in result:
    # Evolve the input by randomly selecting the evolution type
    evolved_query = evolve_query(original_input['input'], context, num_evolution_steps)
    # evolve_query = re.sub(r'<think>.*?</think>', '', evolve_query.content, flags=re.DOTALL).strip()
    print(f"Original Input: {original_input['input']}")
    print(f"Evolved Query: {evolved_query}")
    print("-" * 50)
    evolved_queries.append(evolved_query)
    

Original Input: What is the main active ingredient in Differin Acne Treatment and how does it help with acne?
Evolved Query: "I have acne-prone skin. How does Differin's key ingredient help prevent breakouts by regulating skin cell turnover?"
--------------------------------------------------
Original Input: Is Differin Acne Treatment suitable for sensitive skin?
Evolved Query: "Can Differin Acne Treatment irritate sensitive skin?"
--------------------------------------------------
Original Input: How should I apply Differin Acne Treatment for the best results?
Evolved Query: What's the correct way to apply Differin Acne Treatment for best results?
--------------------------------------------------
Original Input: What are the potential side effects of using Differin Acne Treatment?
Evolved Query: What side effects might occur with Differin Acne Treatment, and why does Adapalene cause initial irritation?
--------------------------------------------------
Original Input: What is the pri

In [68]:
print(f"Generated {len(evolved_queries)} evolved queries.")

Generated 5 evolved queries.


In [69]:
expected_output_prompt_template = """
I want you to generate an answer for the given `input`. This answer has to be factually aligned to the provided context.
Ensure the answer resembles a shop assistant tasked with helping a customer with the questions they have regarding skincare products. It should pose additional questions if the details are inadequate or provides an answer when the input is sufficiently detailed.
Context: {context}
Input: {evolved_query}
Answer:
"""

In [70]:
expected_outputs = []

In [71]:
for evolved_query in evolved_queries:
    expected_output_template = PromptTemplate.from_template(expected_output_prompt_template)
    expected_output_prompt = expected_output_template.invoke({"context": context['content'], "evolved_query": evolved_query})
    print(f"Evolved Query: {evolved_query}")
    expected_output = llm.invoke(expected_output_prompt)
    # only if using a reasoning model
    expected_output = re.sub(r'<think>.*?</think>', '', expected_output.content, flags=re.DOTALL).strip()
    print(f"Expected Output: {expected_output}")
    print("-" * 50)
    expected_outputs.append(expected_output)

Evolved Query: "I have acne-prone skin. How does Differin's key ingredient help prevent breakouts by regulating skin cell turnover?"
Expected Output: Differin Acne Treatment’s key ingredient, **Adapalene** (a vitamin A derivative), helps prevent breakouts by **regulating skin cell turnover**. Here’s how it works:  
- **Accelerates cell turnover**: Adapalene promotes the shedding of dead skin cells, preventing them from clogging pores and reducing the formation of blackheads and whiteheads.  
- **Normalizes skin texture**: By encouraging healthy cell renewal, it minimizes the buildup of keratin (a protein that can block pores), which is a common cause of acne.  
- **Reduces inflammation**: It also calms inflamed skin, lessening the severity of existing pimples and preventing new ones from forming.  

Since it’s a **0.1% concentration retinoid**, it’s gentler than some other retinoids but still effective for acne-prone skin. For best results, start with a small amount, apply it once dail

In [38]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to sales assistant .
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [39]:
qa_sets = []

In [40]:
for i, evolved_query in enumerate(evolved_queries):
     qa_dict = {
          "question": evolved_query,
          "context": context['content'],
          "expected_output": expected_outputs[i]
     }
     qa_sets.append(qa_dict)

In [42]:
print("Generating critique for each QA couple...")
for data in tqdm(qa_sets):
     groundedness_critic_template = PromptTemplate.from_template(question_groundedness_critique_prompt)
     groundedness_critic_prompt = groundedness_critic_template.invoke({"context": data["context"], "question": data["question"]})
     
     relevance_critique_template = PromptTemplate.from_template(question_relevance_critique_prompt)
     relevance_critique_prompt = relevance_critique_template.invoke({"question": data["question"]})
     
     groundedness_result = llm.invoke(groundedness_critic_prompt)
     relevance_result = llm.invoke(relevance_critique_prompt)
     
     evaluations = {
          "groundedness": re.sub(r'<think>.*?</think>', '', groundedness_result.content, flags=re.DOTALL).strip(),
          "relevance": re.sub(r'<think>.*?</think>', '', relevance_result.content, flags=re.DOTALL).strip()
     }
     
     try:
          for criterion, evaluation in evaluations.items():
               score, eval = (
                    int(evaluation.split("Total rating: ")[-1].strip()),
                    evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
               )
               if criterion == "groundedness":
                    data['groundedness_score'] = score
                    data['groundedness_eval'] = eval
               else:
                    data['relevance_score'] = score
                    data['relevance_eval'] = eval
     except Exception as e:
          continue

Generating critique for each QA couple...


100%|██████████| 5/5 [01:17<00:00, 15.52s/it]


In [43]:
for data in qa_sets:
     print(f"Question: {data['question']}")
     print(f"Expected Output: {data['expected_output']}")
     print(f"Groundedness Score: {data['groundedness_score']}")
     print(f"Relevance Score: {data['relevance_score']}")
     print("-" * 50)

Question: Is this cleanser safe for sensitive skin, considering potential allergens and the need for patch testing?
Expected Output: Yes, the Heimish All Clean Balm is **suitable for sensitive skin** and formulated with a **low-irritation formula** to minimize discomfort. However, as noted in the safety information, it **may contain fragrances and parabens**, which are potential allergens for some individuals. To ensure safety, **patch testing is strongly recommended** before full facial use, especially if you have a history of allergies or sensitive skin.  

If you’d like, I can help you check the full ingredient list for any specific allergens you’re concerned about or guide you on how to perform a patch test. Would you like that?
Groundedness Score: 5
Relevance Score: 5
--------------------------------------------------
Question: Does this balm effectively remove waterproof eye makeup without irritating sensitive skin?
Expected Output: Yes, the Heimish All Clean Balm is specifically

In [47]:
synthetic_dataset.append(SyntheticData(
     query=data['question'],
     expected_output=data['expected_output'],
     context=data['context'],
     groundedness_score=int(data['groundedness_score']),
     groundedness_eval=data['groundedness_eval'],
     relevance_score=int(data['relevance_score']),
     relevance_eval=data['relevance_eval']
))

In [48]:
len(synthetic_dataset)

1