In [None]:

# https://ssahuupgrad-93226.medium.com/using-llms-for-synthetic-data-generation-the-definitive-guide-78aab5f506f0
# https://www.confident-ai.com/blog/the-definitive-guide-to-synthetic-data-generation-using-llms
# https://www.confident-ai.com/blog/why-llm-as-a-judge-is-the-best-llm-evaluation-method
# https://arxiv.org/abs/2304.12244

In [None]:
!pip install -U deepeval langchain langchain-community jq langchain-core 'ollama<0.4.0' langchain-ollama

Collecting langchain-ollama
  Using cached langchain_ollama-0.2.3-py3-none-any.whl.metadata (1.9 kB)
INFO: pip is looking at multiple versions of langchain-ollama to determine which version is compatible with other requirements. This could take a while.
  Using cached langchain_ollama-0.2.2-py3-none-any.whl.metadata (1.9 kB)


In [None]:
import os
import pandas as pd

In [None]:
from langchain.schema import Document

def metadata_func(record: dict, metadata: dict) -> dict:
    """
    Attach the record's own ``"metadata"`` entry to the given metadata dict.

    The value found under ``record["metadata"]`` (an empty dict when the key is
    missing) is stored under the ``"product_details"`` key of ``metadata``.
    ``metadata`` is modified in place and the same object is returned.
    """
    product_details = record.get("metadata", {})
    metadata.update(product_details=product_details)
    return metadata

In [None]:
from langchain_community.document_loaders import JSONLoader

# Read the product descriptions JSON: every element of the top-level array
# (jq schema ".[]") becomes one document whose text is taken from its
# "content" key; `metadata_func` adds the per-product metadata.
# NOTE(review): the path is an absolute Google Drive location, so the notebook
# only runs where that Drive is mounted.
loader = JSONLoader(
    file_path="/content/drive/MyDrive/4th year research/Development/data/product_descriptions.json",
    jq_schema=".[]",
    content_key="content",
    metadata_func=metadata_func,
)

docs = loader.load()

In [None]:
# Show the first loaded document to check its text and attached metadata.
docs[0]

Document(metadata={'source': '/content/drive/MyDrive/4th year research/Development/data/product_descriptions.json', 'seq_num': 1, 'product_details': {'id': 1, 'name': 'Himalaya Purifying Neem Face Wash 100ml', 'brand': 'Himalaya', 'category': 'Face Wash', 'price': 1334.8}}, page_content='Himalaya Purifying Neem Face Wash (100ml) is a natural, daily face wash formulated to purify your skin and remove impurities.  Harnessing the power of Neem and Turmeric, this cleanser effectively addresses acne and oil control, making it suitable for all skin types.  Simply apply to a wet face, lather, and rinse. While generally well-tolerated, some individuals sensitive to Turmeric may experience dryness.  For LKR 1334.80, experience the clarifying benefits of this natural face wash from a trusted brand like Himalaya.\n')

In [None]:
from langchain_ollama.chat_models import ChatOllama
from langchain.embeddings import OllamaEmbeddings

In [None]:
import os

# Model identifiers are taken from the environment; each is None when the
# corresponding variable is not set.
EMBEDDING_MODEL = os.environ.get("OLLAMA_EMBEDDING_MODEL_ID")
CHAT_MODEL = os.environ.get("OLLAMA_CHAT_MODEL_ID")

In [None]:
# Embedding client for the model named by EMBEDDING_MODEL.
# NOTE(review): the recorded output of this cell echoes this source line, which
# looks like a warning (possibly a deprecation notice for the
# `langchain.embeddings` import) — confirm, and consider whether the
# `langchain-ollama` package installed earlier should be used instead.
ollama_embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

  ollama_embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)


In [None]:
# Pull the raw text out of every loaded document, then embed all texts.
# `content[i]` and `embeddings[i]` refer to the same document.
content = [document.page_content for document in docs]
embeddings = ollama_embeddings.embed_documents(content)

In [None]:
# Number of embedding vectors returned (one per entry in `content`).
len(embeddings)

90

In [None]:
import random

# Pick one chunk uniformly at random to serve as the anchor ("reference") that
# the rest of the context is built around. No seed is set, so the choice
# differs between runs.
reference_index = random.randrange(len(embeddings))
reference_embedding = embeddings[reference_index]
contexts = [content[reference_index]]

In [None]:
# Display the current context list (at this point only the reference chunk).
contexts

['Indulge your skin with the luxurious OxyGlow Gold & Saffron Face Wash (100ml).  Infused with the opulence of real gold and the revitalizing power of saffron, this all-natural face wash gently cleanses while nourishing and revitalizing your skin.  Perfect for all skin types, this daily use face wash combats dullness and the signs of aging, revealing a radiant and youthful complexion. Simply apply to a wet face, lather, and rinse.  Experience the transformative benefits of gold and saffron for a truly luxurious skincare experience. Priced at LKR 1885.00.\n']

In [None]:
# Build the context set: the reference chunk followed by every *other* chunk
# whose embedding has a cosine similarity of at least `similarity_threshold`
# with the reference embedding.
import numpy as np

similarity_threshold = 0.8

reference_norm = np.linalg.norm(reference_embedding)

similar_indices = []
for i, embedding in enumerate(embeddings):
    if i == reference_index:
        # The reference chunk always matches itself (similarity 1.0) and is
        # already the first context entry; skipping it avoids a duplicate.
        continue
    norm = reference_norm * np.linalg.norm(embedding)
    if norm == 0:
        # Cosine similarity is undefined for a zero vector; never a match.
        continue
    similarity = np.dot(reference_embedding, embedding) / norm
    if similarity >= similarity_threshold:
        similar_indices.append(i)

# Rebuild the list from scratch instead of appending to it, so re-running this
# cell does not keep growing `contexts`.
contexts = [content[reference_index]] + [content[i] for i in similar_indices]

In [None]:
# Number of chunks that make up the context.
len(contexts)

43

In [None]:
# Seed prompt: asks the chat model for a list of JSON objects, each with an
# `input` key, derived from `contexts`. The list is interpolated as its Python
# repr.
# NOTE(review): the response recorded further down in this notebook is a prose
# summary of the products rather than JSON — the instruction may need to be
# strengthened, or the response validated before it is used as a query.
prompt = f"""I want you act as a copywriter. Based on the given context,
which is list of strings, please generate a list of JSON objects
with a `input` key. The `input` can either be a question or a
statement that can be addressed by the given context.
contexts:
{contexts}"""

In [None]:
# Download the llama3.1 model through the Ollama CLI.
# NOTE(review): the chat calls in this notebook use CHAT_MODEL (read from
# OLLAMA_CHAT_MODEL_ID), not this literal name — confirm both refer to the
# same model.
!ollama pull llama3.1:latest

[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest 
pulling 667b0c1932bc...   0% ▕▏    0 B/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   0% ▕▏    0 B/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   0% ▕▏ 3.7 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   1% ▕▏  49 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   2% ▕▏  73 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   2% ▕▏ 102 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   3% ▕▏ 135 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifes

In [None]:
# Send the seed prompt to the chat model. `query` is the response object; its
# text is read through `query.content`.
query = ChatOllama(model=CHAT_MODEL).invoke(prompt)

In [None]:
# Text of the model's response to the seed prompt.
query.content

"It appears you've provided a large block of text containing product descriptions for various face washes and a moisturizer from different brands. I'll summarize the key information about each product:\n\n**1. Aroma Magic Mint Cleanser (25g)**: Natural cleanser with mint and aloe vera, suitable for all skin types, and priced at LKR869.50.\n\n**2. Himalaya Herbals Clear Complexion Whitening Face Wash**: Formulated with Saffron and Licorice, addresses dullness and uneven skin tone, suitable for all skin types, and priced at LKR1200.00.\n\n**3. Jovees Grape Face Wash (120ml)**: Enriched with Grape Extracts and Vitamin E, suitable for all skin types, and priced at LKR1865.00.\n\n**4. Jovees Strawberry Face Wash (120ml)**: Formulated with natural Strawberry Extracts and soothing Aloe Vera, suitable for normal to dry skin types, and priced at LKR1710.00.\n\n**5. OxyGlow Pearl Whitening Face Wash**: Enriched with natural pearl extracts, gently brightens and cleanses all skin types, effectivel

In [None]:
# Raw prompt templates for the query "evolution" step. Each one has two
# placeholders, {context} and {original_input}, and ends with
# "Rewritten Input:" for the model to complete.

# Rewrite so that answering requires every element of the context.
# NOTE(review): rule 1 names `Input`, while the other rules name
# `Rewritten Input` — confirm which one is intended.
multi_context_template = """
I want you to rewrite the given `input` so that it requires readers to use information from all elements in `Context`
1. `Input` should require information from all `Context` elements.
2. `Rewritten Input` must be concise and fully answerable from `Context`.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""
# Rewrite so that the input explicitly asks for multi-step reasoning.
reasoning_template = """
I want you to rewrite the given `input` so that it explicitly requests multi-step reasoning.
1. `Rewritten Input` should require multiple logical connections or inferences.
2. `Rewritten Input` should be concise and understandable.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` must be fully answerable from `Context`.
5. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""
# Rewrite so that the input poses a hypothetical or speculative scenario.
hypothetical_scenario_template = """
I want you to rewrite the given `input` to incorporate a hypothetical or speculative scenario.
1. `Rewritten Input` should encourage applying knowledge from `Context` to deduce outcomes.
2. `Rewritten Input` should be concise and understandable.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` must be fully answerable from `Context`.
5. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""

In [None]:
from langchain_core.prompts import PromptTemplate

def _ensure_prompt_template(template):
    """Return `template` as a PromptTemplate, converting it only if it is not one already."""
    if isinstance(template, PromptTemplate):
        return template
    return PromptTemplate.from_template(template)


# The three names are rebound from raw strings to PromptTemplate objects. The
# guard above keeps this cell safe to re-run: on a second execution the names
# already hold PromptTemplate objects, which `from_template` cannot parse.
multi_context_template = _ensure_prompt_template(multi_context_template)
reasoning_template = _ensure_prompt_template(reasoning_template)
hypothetical_scenario_template = _ensure_prompt_template(hypothetical_scenario_template)

In [None]:
# Inspect the compiled template and the input variables it detected.
multi_context_template

PromptTemplate(input_variables=['context', 'original_input'], input_types={}, partial_variables={}, template="\nI want you to rewrite the given `input` so that it requires readers to use information from all elements in `Context`\n1. `Input` should require information from all `Context` elements. \n2. `Rewritten Input` must be concise and fully answerable from `Context`. \n3. Do not use phrases like 'based on the provided context.'\n4. `Rewritten Input` should not exceed 15 words.\nContext: {context}\nInput: {original_input}\nRewritten Input:\n")

In [None]:
# Pool of rewrite strategies; each evolution step picks one of these.
evolution_templates = [
    multi_context_template,
    reasoning_template,
    hypothetical_scenario_template,
]
# How many successive rewrites are applied to the seed query.
num_evolution_steps = 3

In [None]:
# Render each evolution template once so the resulting prompt text can be
# inspected. `query` is the chat model's response object, so its text
# (`query.content`) is what has to be substituted for {original_input};
# passing the object itself would embed its repr in the prompt.
multi_context_prompt = multi_context_template.format(context=contexts, original_input=query.content)
reasoning_prompt = reasoning_template.format(context=contexts, original_input=query.content)
hypothetical_scenario_prompt = hypothetical_scenario_template.format(context=contexts, original_input=query.content)

In [None]:
# Rendered multi-context prompt (template with context and input filled in).
multi_context_prompt

'\nI want you to rewrite the given `input` so that it requires readers to use information from all elements in `Context`\n1. `Input` should require information from all `Context` elements. \n2. `Rewritten Input` must be concise and fully answerable from `Context`. \n3. Do not use phrases like \'based on the provided context.\'\n4. `Rewritten Input` should not exceed 15 words.\nContext: [\'Indulge your skin with the luxurious OxyGlow Gold & Saffron Face Wash (100ml).  Infused with the opulence of real gold and the revitalizing power of saffron, this all-natural face wash gently cleanses while nourishing and revitalizing your skin.  Perfect for all skin types, this daily use face wash combats dullness and the signs of aging, revealing a radiant and youthful complexion. Simply apply to a wet face, lather, and rinse.  Experience the transformative benefits of gold and saffron for a truly luxurious skincare experience. Priced at LKR 1885.00.\\n\', \'Himalaya Purifying Neem Face Wash (100ml)

In [None]:
def evolve_query(original_input, context, steps):
    """
    Rewrite an input `steps` times, each time with a randomly chosen evolution template.

    Args:
        original_input: Text of the input to evolve.
        context: Value substituted for the template's ``{context}`` placeholder.
        steps: Number of rewrite rounds to run.

    Returns:
        The chat model's response object from the final round (read its text
        via ``.content``). When ``steps`` is zero or negative no round is run
        and ``original_input`` is returned unchanged.
    """
    if steps <= 0:
        return original_input

    # One client is enough for all rounds.
    llm = ChatOllama(model=CHAT_MODEL)

    current_text = original_input
    response = original_input
    for _ in range(steps):
        # Choose a random (or using custom logic) template from the list
        chosen_template = random.choice(evolution_templates)
        evolved_prompt = chosen_template.invoke(
            {"context": context, "original_input": current_text}
        )
        response = llm.invoke(evolved_prompt)
        # Feed only the response *text* into the next round; substituting the
        # response object would put its repr into the prompt.
        current_text = response.content
    return response



In [None]:
# Evolve the input by randomly selecting the evolution type.
# The text of the seed response is the starting input; the result is the
# value returned by `evolve_query` (its text is read via `.content` below).
evolved_query = evolve_query(query.content, contexts, num_evolution_steps)

In [None]:
# Text of the evolved query.
evolved_query.content

"Here is a rewritten version of the input in a more structured and concise format with clear headings and labels:\n\n**Product List**\n\n1. **Aroma Magic Mint Cleanser (25g)**\n\t* Natural cleanser with mint and aloe vera\n\t* Suitable for all skin types\n\t* Price: LKR869.50\n\n2. **Himalaya Herbals Clear Complexion Whitening Face Wash**\n\t* Formulated with Saffron and Licorice\n\t* Addresses dullness and uneven skin tone\n\t* Suitable for all skin types\n\t* Price: LKR1200.00\n\n3. **Jovees Grape Face Wash (120ml)**\n\t* Enriched with Grape Extracts and Vitamin E\n\t* Suitable for all skin types\n\t* Price: LKR1865.00\n\n4. **Jovees Strawberry Face Wash (120ml)**\n\t* Formulated with natural Strawberry Extracts and soothing Aloe Vera\n\t* Suitable for normal to dry skin types\n\t* Price: LKR1710.00\n\n5. **OxyGlow Pearl Whitening Face Wash**\n\t* Enriched with natural pearl extracts\n\t* Gently brightens and cleanses all skin types\n\t* Effectively addresses dullness and uneven tone

In [None]:
# Raw prompt template for generating the reference answer. Placeholders:
# {context} and {evolved_query}; the prompt ends with "Answer:" for the model
# to complete.
expected_output_template = """
I want you to generate an answer for the given `input`. This answer has to be factually aligned to the provided context.
Context: {context}
Input: {evolved_query}
Answer:
"""

In [None]:
# Compile the raw template string. The isinstance guard keeps this cell safe
# to re-run: after the first execution the name already holds a
# PromptTemplate, which `from_template` cannot parse.
if not isinstance(expected_output_template, PromptTemplate):
    expected_output_template = PromptTemplate.from_template(expected_output_template)

# Fill in the context and the text of the evolved query.
expected_output_prompt = expected_output_template.invoke({"context": contexts, "evolved_query": evolved_query.content})

In [None]:
# Inspect the rendered prompt that will be sent to the model.
expected_output_prompt

StringPromptValue(text='\nI want you to generate an answer for the given `input`. This answer has to be factually aligned to the provided context.\nContext: [\'Indulge your skin with the luxurious OxyGlow Gold & Saffron Face Wash (100ml).  Infused with the opulence of real gold and the revitalizing power of saffron, this all-natural face wash gently cleanses while nourishing and revitalizing your skin.  Perfect for all skin types, this daily use face wash combats dullness and the signs of aging, revealing a radiant and youthful complexion. Simply apply to a wet face, lather, and rinse.  Experience the transformative benefits of gold and saffron for a truly luxurious skincare experience. Priced at LKR 1885.00.\\n\', \'Himalaya Purifying Neem Face Wash (100ml) is a natural, daily face wash formulated to purify your skin and remove impurities.  Harnessing the power of Neem and Turmeric, this cleanser effectively addresses acne and oil control, making it suitable for all skin types.  Simpl

In [None]:
# Ask the chat model for the reference answer; the text is in `.content`.
expected_output = ChatOllama(model=CHAT_MODEL).invoke(expected_output_prompt)

In [None]:
# Text of the generated reference answer.
expected_output.content

"It seems like you provided a list of products with their descriptions, prices, and other details. However, I don't see any specific question or task to be performed on this data.\n\nIf you could provide more context or clarify what you'd like me to do with this information (e.g., extract specific details, calculate totals, etc.), I'll be happy to assist you accordingly!"

In [None]:
!pip install pydantic



In [None]:
from pydantic import BaseModel
from typing import Optional, List

In [None]:
class SyntheticData(BaseModel):
    """One synthetic sample: a query, its reference answer, and the context used."""

    # Query text (in this notebook: the evolved query).
    query: str
    # Reference answer. Explicit `None` default so the field can be omitted;
    # under pydantic v2 an `Optional[...]` annotation without a default is
    # still a required field.
    expected_output: Optional[str] = None
    # Context passages the query and answer were generated from.
    context: List[str]


In [None]:
# Bundle the evolved query text, the generated answer text and the context
# list into a single validated record.
synthetic_data = SyntheticData(
    query=evolved_query.content,
    expected_output=expected_output.content,
    context=contexts,
)

In [None]:
# Start the dataset with the single record generated above.
synthetic_dataset = [synthetic_data]

In [None]:
# TODO: filter the dataset so that only the high-quality goldens are kept