In [None]:
# https://arxiv.org/html/2411.10541v1/
# https://arxiv.org/pdf/2411.10541

In [None]:
%pip install --upgrade langchain langchain-community langchain-core langchainhub langchain-qdrant langchain-text-splitters langsmith langchain-google-genai pandas



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

In [None]:
csv_file_path = '/content/drive/MyDrive/4th year research/Development/data/Data  - New Dataset.csv'

In [None]:
# Load the raw product sheet from Drive.
df = pd.read_csv(csv_file_path)

# Set the 'Id' column to 1..N following the current row order.
df['Id'] = range(1, df.shape[0] + 1)

# Save the re-numbered table (without the DataFrame index) for later cells.
output_file = '/content/drive/MyDrive/4th year research/Development/data/new_dataset.csv'
df.to_csv(output_file, index=False)

# Confirmation message followed by a short preview of the saved frame.
print(
    f"Updated CSV file with new IDs has been saved as '{output_file}'",
    "First few rows of the updated DataFrame:",
    df.head(),
    sep="\n",
)

Updated CSV file with new IDs has been saved as '/content/drive/MyDrive/4th year research/Development/data/new_dataset.csv'
First few rows of the updated DataFrame:
   Id                                     Name         Brand         Price  \
0   1             The Ordinary Peeling Solution  The Ordinary  Rs 6,350.00   
1   2  COSRX Advanced Snail 92 All In One Cream         COSRX  Rs 6,650.00   
2   3          CeraVe Daily Moisturizing Lotion        CeraVe  Rs 7,950.00   
3   4      CeraVe PM Facial Moisturizing Lotion        CeraVe  Rs 7,250.00   
4   5      CeraVe AM Facial Moisturizing Lotion        CeraVe  Rs 7,250.00   

          Category                                        Ingredients   \
0  Exfoliating Peel  Glycolic Acid, Lactic Acid, Tartaric Acid, Cit...   
1       Moisturizer  Snail Secretion Filtrate, Betaine, Caprylic/Ca...   
2       Moisturizer  Purified Water, Glycerin, Caprylic/Capric Trig...   
3       Moisturizer  Aqua/Water, Glycerin, Caprylic/Capric Triglyce...

In [None]:
dataset_path = '/content/drive/MyDrive/4th year research/Development/data/new_dataset.csv'
json_file_path = '/content/drive/MyDrive/4th year research/Development/data/products.json'

In [None]:
df = pd.read_csv(dataset_path)

In [None]:
df['Concentrations'][:5]

Unnamed: 0,Concentrations
0,"AHA 30%, BHA 2%"
1,"Snail Mucin 92%, 1000 ppm Hyaluronate"
2,"Hyaluronic Acid (low %), Ceramides blend"
3,"Niacinamide (medium-high), Ceramides"
4,"Niacinamide (medium), Ceramides, Hyaluronic Acid"


In [None]:
print(df.columns)

Index(['Id', 'Name ', 'Brand ', 'Price', 'Category ', 'Ingredients ',
       'Key Ingredients ', 'Benefit ', 'Potential Side Effects', 'Natural',
       'Concentrations', 'Usage ', 'Application Tips', 'Skin Type ',
       'Skin Concern ', 'Average Rating ', 'Customer Reviews',
       'Expert Review ', 'Allergens ', 'For Sensitive Skin ', 'Claims '],
      dtype='object')


In [None]:
def parse_reviews(review_string):
    """Parse the raw "Customer Reviews" cell into a list of review dicts.

    The cell is expected to hold double-quoted entries shaped like
    ``"<review text> – <rating> stars"`` (en dash separator), for example
    ``"Great product – 5 stars", "Too harsh – 2 stars"``.

    Args:
        review_string: Raw cell value; NaN/None means there are no reviews.

    Returns:
        A list of ``{"review": str, "rating": float}`` dicts in source order.
        Entries without a `` – `` separator, or whose rating is not numeric,
        are skipped instead of aborting the whole conversion.
    """
    if pd.isna(review_string):  # Handle NaN values
        return []

    result = []
    # Splitting on '"' yields the quoted entries plus the text between them;
    # segments that never mention "star" (such as the ", " joiners) are ignored.
    for segment in str(review_string).split('"'):
        segment = segment.strip()
        if "star" not in segment:
            continue

        # Split on the LAST en dash so dashes inside the review text survive.
        text, separator, rating_part = segment.rpartition(" – ")
        if not separator:
            continue

        # Accept both the plural "4 stars" and the singular "1 star".
        rating_token = rating_part.strip()
        for suffix in ("stars", "star"):
            if rating_token.endswith(suffix):
                rating_token = rating_token[: -len(suffix)].strip()
                break

        try:
            rating = float(rating_token)
        except ValueError:
            # Malformed rating (e.g. "n/a stars"): drop this entry only.
            continue

        result.append({"review": text.strip(), "rating": rating})
    return result

In [None]:
def parse_allergens(allergen_string):
    """Turn a ", "-separated allergen cell into a list of trimmed names.

    Returns None (not an empty list) when the cell is missing (NaN/None).
    """
    if not pd.isna(allergen_string):
        pieces = allergen_string.split(", ")
        return list(map(str.strip, pieces))
    # Missing cell: signal "unknown" with None
    return None

In [None]:
import json

def _split_field(value, sep=", "):
  """Split a delimited text cell into a list; missing or empty cells give []."""
  # pandas represents empty cells as NaN, which is truthy, so a bare
  # `if value` guard would still end up calling .split() on a float.
  if pd.isna(value) or not value:
    return []
  return str(value).split(sep)


def _text_field(value):
  """Return a scalar cell unchanged, or "" when the cell is missing (NaN/None)."""
  return "" if pd.isna(value) else value


def to_json(df, json_file_path):
  """Convert every row of the product table into a plain dict.

  Args:
    df: Product DataFrame using the column names as read from the CSV
      (several carry a trailing space, e.g. "Name ").
    json_file_path: Not used by this function; kept so existing calls keep
      working. Writing the file is left to the caller.

  Returns:
    A list with one dict per row. List-like columns are split on ", " (an
    empty or missing cell becomes []), missing text cells become "", and
    "Natural" is True only when the cell is exactly "Yes".
  """
  json_array = []

  for _, row in df.iterrows():
    json_array.append({
        "Id": row["Id"],
        "Name": _text_field(row["Name "]),
        "Brand": _text_field(row["Brand "]),
        "Category": _text_field(row["Category "]),
        "Price": _text_field(row["Price"]),
        "Ingredients": _split_field(row["Ingredients "]),
        "Key Ingredients": _split_field(row["Key Ingredients "]),
        "Benefits": _split_field(row["Benefit "]),
        "Potential Side Effects": _split_field(row["Potential Side Effects"]),
        "Natural": bool(row["Natural"] == "Yes"),
        "Concentrations": _split_field(row["Concentrations"]),
        "Usage": _text_field(row["Usage "]),
        "Application Tips": _text_field(row["Application Tips"]),
        "Skin Type": _split_field(row["Skin Type "]),
        "Skin Concerns": _split_field(row["Skin Concern "]),
        "Average Rating": float(row["Average Rating "]),
        "Customer Reviews": parse_reviews(row["Customer Reviews"]),
        "Expert Review": _text_field(row["Expert Review "]),
        "Allergens": parse_allergens(row["Allergens "]),
        "For Sensitive Skin": _text_field(row["For Sensitive Skin "]),
        "Claims": _split_field(row["Claims "]),
    })
  return json_array

In [None]:
json_arr = to_json(df, json_file_path)
print(len(json_arr))

100


In [None]:
for i in json_arr[:4]:
  print(i)

{'Id': 1, 'Name': 'The Ordinary Peeling Solution', 'Brand': 'The Ordinary', 'Category': 'Exfoliating Peel', 'Price': 'Rs 6,350.00', 'Ingredients': ['Glycolic Acid', 'Lactic Acid', 'Tartaric Acid', 'Citric Acid', 'Salicylic Acid', 'Sodium Hyaluronate Crosspolymer', 'Tasmannia Lanceolata Fruit/Leaf Extract'], 'Key Ingredients': ['Glycolic Acid', 'Salicylic Acid'], 'Benefits': ['Improves texture', 'clears pore congestion', 'and targets uneven skin tone'], 'Potential Side Effects': ['Sun sensitivity', 'tingling', 'redness', 'potential irritation'], 'Natural': False, 'Concentrations': ['AHA 30%', 'BHA 2%'], 'Usage': 'Use once or twice a week on dry skin. Leave on for max 10 minutes and rinse. Avoid eye contour.', 'Application Tips': 'Apply evenly using fingertips on clean, dry skin. Do not use on wet or compromised skin. Patch test recommended. Use sunscreen afterward.', 'Skin Type': ['Normal', 'Oily', 'Combination'], 'Skin Concerns': ['Dullness', 'Uneven Texture', 'Enlarged Pores'], 'Avera

In [None]:
# Serialise the product records as 4-space-indented JSON and save them as UTF-8 text
with open(json_file_path, mode='w', encoding='utf-8') as json_file:
  json_file.write(json.dumps(json_arr, indent=4))

In [None]:
# Read the products back from disk. The file is written with encoding='utf-8',
# so decode it explicitly instead of relying on the platform default.
with open(json_file_path, 'r', encoding='utf-8') as f:
    products_json = json.load(f)

In [None]:
for product in products_json[:5]:
  print(product)

{'Id': 1, 'Name': 'The Ordinary Peeling Solution', 'Brand': 'The Ordinary', 'Category': 'Exfoliating Peel', 'Price': 'Rs 6,350.00', 'Ingredients': ['Glycolic Acid', 'Lactic Acid', 'Tartaric Acid', 'Citric Acid', 'Salicylic Acid', 'Sodium Hyaluronate Crosspolymer', 'Tasmannia Lanceolata Fruit/Leaf Extract'], 'Key Ingredients': ['Glycolic Acid', 'Salicylic Acid'], 'Benefits': ['Improves texture', 'clears pore congestion', 'and targets uneven skin tone'], 'Potential Side Effects': ['Sun sensitivity', 'tingling', 'redness', 'potential irritation'], 'Natural': False, 'Concentrations': ['AHA 30%', 'BHA 2%'], 'Usage': 'Use once or twice a week on dry skin. Leave on for max 10 minutes and rinse. Avoid eye contour.', 'Application Tips': 'Apply evenly using fingertips on clean, dry skin. Do not use on wet or compromised skin. Patch test recommended. Use sunscreen afterward.', 'Skin Type': ['Normal', 'Oily', 'Combination'], 'Skin Concerns': ['Dullness', 'Uneven Texture', 'Enlarged Pores'], 'Avera

In [None]:
def display(value):
  """Return `value` for the profile text, or 'Not specified' when it is missing.

  Missing means None, NaN, or any empty sized value ('' / [] / {}).

  NOTE(review): this name hides the notebook's usual `display` helper for
  every later cell; it is kept because other cells call it by this name.
  """
  # The previous `len(value) == 0 or None` never tested for None: `or None`
  # is always falsy, and len(None) raised TypeError before it was reached.
  if value is None:
    return 'Not specified'
  # NaN is the only float that is not equal to itself.
  if isinstance(value, float) and value != value:
    return 'Not specified'
  if hasattr(value, '__len__') and len(value) == 0:
    return 'Not specified'
  return value

def display_list(lst):
  """Join the items of `lst` with ', ', or return 'Not specified' when it is None or empty."""
  # `len(lst) == 0 or None` did not cover None (len(None) raises TypeError),
  # so the None case is checked explicitly before taking the length.
  if lst is None or len(lst) == 0:
    return 'Not specified'
  # str() leaves string items untouched and keeps join() from failing on others.
  return ', '.join(map(str, lst))

In [None]:
def generate_product_profile(product):
  """Render one product dict as a markdown-style text profile.

  Args:
    product: Product record with the keys read below ("Name", "Category",
      "Customer Reviews", ...).

  Returns:
    The profile text: fixed sections, then one line per customer review (or a
    placeholder line when there are none), then the expert review line.
  """
  # Fixed part of the profile; it ends with a two-space indent left by the
  # line that holds the closing quotes.
  header = f"""
  # {product['Name']} - {product['Category']}
  ---

  ## Product Overview
  - Name: {display(product["Name"])}
  - Brand: {display(product["Brand"])}
  - Category: {display(product["Category"])}
  - Price: {display(product["Price"])} (in LKR)
  - Natural: {product["Natural"]}

  ## Ingredients
  - Key Ingredients: {display_list(product['Key Ingredients'])}
  - Concentrations: {display_list(product['Concentrations'])}
  - Full Ingredient List: {display_list(product['Ingredients'])}

  ## Benefits and Claims
  - Benefits: {display_list(product['Benefits'])}
  - Claims: {display_list(product['Claims'])}

  ## Usage and Application
  - Usage: {display(product['Usage'])}
  - Application Tips: {display(product['Application Tips'])}

  ## Skin Suitability
  - Suitable for Skin Types: {display_list(product['Skin Type'])}
  - Addresses Skin Concerns: {display_list(product['Skin Concerns'])}
  - For Sensitive Skin: {display(product['For Sensitive Skin'])}

  ## Safety Information
  - Potential Side Effects: {display_list(product['Potential Side Effects'])}
  - Allergens: {display_list(product['Allergens'])}

  ## Reviews and Ratings
  - Average Rating: {product['Average Rating']}/5
  - Customer Reviews:
  """
  customer_reviews = product['Customer Reviews']
  if customer_reviews:
    review_lines = []
    for position, entry in enumerate(customer_reviews):
      # The first bullet continues the header's trailing two-space indent, so
      # it needs two fewer spaces than the rest to line up with them.
      lead = "  " if position == 0 else "    "
      review_lines.append(f"{lead}- \"{entry['review']}\" - {entry['rating']} stars\n")
    reviews_section = "".join(review_lines)
  else:
    reviews_section = "    No customer reviews available.\n"

  # The expert review always closes the profile.
  expert_line = f"  - Expert Review: {display(product['Expert Review'])}"

  return header + reviews_section + expert_line

In [None]:
product_profile = generate_product_profile(products_json[0])
print(product_profile)


  # The Ordinary Peeling Solution - Exfoliating Peel
  ---

  ## Product Overview
  - Name: The Ordinary Peeling Solution  
  - Brand: The Ordinary 
  - Category: Exfoliating Peel  
  - Price: Rs 6,350.00 (in LKR)  
  - Natural: False  

  ## Ingredients
  - Key Ingredients: Glycolic Acid, Salicylic Acid
  - Concentrations: AHA 30%, BHA 2%
  - Full Ingredient List: Glycolic Acid, Lactic Acid, Tartaric Acid, Citric Acid, Salicylic Acid, Sodium Hyaluronate Crosspolymer, Tasmannia Lanceolata Fruit/Leaf Extract

  ## Benefits and Claims
  - Benefits: Improves texture, clears pore congestion, and targets uneven skin tone
  - Claims: Clinically formulated, High-strength exfoliator

  ## Usage and Application
  - Usage: Use once or twice a week on dry skin. Leave on for max 10 minutes and rinse. Avoid eye contour.
  - Application Tips: Apply evenly using fingertips on clean, dry skin. Do not use on wet or compromised skin. Patch test recommended. Use sunscreen afterward.

  ## Skin Suitabili

In [None]:
product_docs = []

# Pair each product's rendered profile with a small metadata record.
for product in products_json:
  profile_text = generate_product_profile(product)
  product_docs.append({
    "template": profile_text,
    # Metadata keys are the lower-cased source keys: id, name, brand, category, price.
    "metadata": {
      field.lower(): product[field]
      for field in ("Id", "Name", "Brand", "Category", "Price")
    },
  })

### Generate engaging product descriptions using product profiles

In [None]:
import os
from google.colab import userdata

# Fetch the Gemini key from Colab's user secrets and export it to the process
# environment. NOTE(review): GOOGLE_API_KEY is presumably the variable the
# Gemini client reads — confirm against the client library.
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
os.environ.update(GOOGLE_API_KEY=GEMINI_API_KEY)

In [None]:
# Prompt sent to the copywriter model. `{product_profile}` is the only template
# variable; it is filled with the profile text built by generate_product_profile().
product_copywrite_prompt = """
You are an expert product copywriter. Based on the given product profile, write a professional and engaging product description that covers all the essential details, benefits, usage instructions, and any special considerations.
Create a summary description of a skincare product based on the given structured product details.
Use only the given product information and phrase it as you see fit.
Don't remove any vital information from the product profile.
Be accurate and thorough.

Product Profile:
{product_profile}

Product Description:
"""

### Product Copywriter Agent

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI

def product_copywriter_agent(product_profile, prompt_template, llm=None, model="gemini-1.5-pro"):
  """Generate a product description for one product profile with a Gemini chat model.

  Args:
    product_profile: Formatted profile text substituted into the prompt.
    prompt_template: Prompt text containing a `{product_profile}` placeholder.
    llm: Optional pre-built chat model to reuse across calls. When omitted, a
      new `ChatGoogleGenerativeAI` client is created for this call.
    model: Model name used when `llm` is not supplied.

  Returns:
    The generated description as a string.
  """
  # Imported locally so this function does not depend on the legacy
  # `langchain.prompts` / `langchain.chains` import paths.
  from langchain_core.output_parsers import StrOutputParser
  from langchain_core.prompts import PromptTemplate

  prompt = PromptTemplate(input_variables=["product_profile"], template=prompt_template)

  if llm is None:
    llm = ChatGoogleGenerativeAI(
      model=model,
      temperature=0,
      max_retries=2,
    )

  # `LLMChain(...).run(...)` is deprecated; piping prompt -> model -> string
  # parser is the runnable-based replacement and also yields plain text.
  product_copywrite_chain = prompt | llm | StrOutputParser()

  product_description = product_copywrite_chain.invoke({"product_profile": product_profile})

  return product_description

In [None]:
import time

product_descriptions = []

# One model call per product profile, collected together with its metadata.
for doc in product_docs:
  agent_response = product_copywriter_agent(doc["template"], product_copywrite_prompt)
  doc_meta = doc["metadata"]
  product_description = dict(
      id=doc_meta["id"],
      metadata=doc_meta,
      content=agent_response,
  )
  product_descriptions.append(product_description)
  print(f"Product description for product {doc_meta['id']} generated")
  # Pause 3 seconds between calls (presumably to stay under the API rate limit).
  time.sleep(3)

In [None]:
product_descriptions[:2]

In [None]:
output_file = "/content/drive/MyDrive/4th year research/Development/data/product_descriptions.json"

In [None]:
# Persist the generated descriptions. The encoding is set explicitly to UTF-8,
# matching how products.json is written earlier in this notebook.
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(product_descriptions, file, indent=4)

print(f"Product description written to {output_file}")