In [1]:
# https://arxiv.org/html/2411.10541v1/
# https://arxiv.org/pdf/2411.10541

In [2]:
%pip install --upgrade langchain langchain-community langchain-core langchainhub langchain-qdrant langchain-text-splitters langsmith langchain-google-genai pandas

Collecting langchain-google-genai
  Obtaining dependency information for langchain-google-genai from https://files.pythonhosted.org/packages/59/82/2a5d3fe54df23d6471768b9558f9a73e1a712065e6c20a228aa3254092aa/langchain_google_genai-2.1.2-py3-none-any.whl.metadata
  Downloading langchain_google_genai-2.1.2-py3-none-any.whl.metadata (4.7 kB)
Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Obtaining dependency information for filetype<2.0.0,>=1.2.0 from https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl.metadata
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting goo


[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
import pandas as pd

In [21]:
# Relative path of the raw product dataset read by the following cell.
csv_file_path = "./dataset_v3.csv"

In [22]:
# Read the raw dataset from disk.
df = pd.read_csv(csv_file_path)

# Assign sequential 1-based identifiers to the 'Id' column, one per row.
df['Id'] = range(1, len(df.index) + 1)

# Save the renumbered table to a new CSV, leaving out the pandas index.
output_file = './numbered_dataset_v3.csv'
df.to_csv(output_file, index=False)

# Report where the file was written and preview the first rows.
print(
    f"Updated CSV file with new IDs has been saved as '{output_file}'",
    "First few rows of the updated DataFrame:",
    df.head(),
    sep="\n",
)

Updated CSV file with new IDs has been saved as './numbered_dataset_v3.csv'
First few rows of the updated DataFrame:
   Id                                     Name         Brand      Price  \
0   1             The Ordinary Peeling Solution  The Ordinary  6,350.00   
1   2  COSRX Advanced Snail 92 All In One Cream         COSRX  6,650.00   
2   3          CeraVe Daily Moisturizing Lotion        CeraVe  7,950.00   
3   4      CeraVe PM Facial Moisturizing Lotion        CeraVe  7,250.00   
4   5      CeraVe AM Facial Moisturizing Lotion        CeraVe  7,250.00   

          Category                                        Ingredients   \
0  Exfoliating Peel  Glycolic Acid, Lactic Acid, Tartaric Acid, Cit...   
1       Moisturizer  Snail Secretion Filtrate, Betaine, Caprylic/Ca...   
2       Moisturizer  Purified Water, Glycerin, Caprylic/Capric Trig...   
3       Moisturizer  Aqua/Water, Glycerin, Caprylic/Capric Triglyce...   
4       Moisturizer  Aqua/Water, Glycerin, Caprylic/Capric Tri

In [23]:
# Input: the renumbered CSV written earlier; output: the JSON export target.
dataset_path = "./numbered_dataset_v3.csv"
json_file_path = "./products.json"

In [24]:
# Reload the numbered dataset from disk into a fresh DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [25]:
# Peek at the first five values of the 'Concentrations' column.
df["Concentrations"].head(5)

0                                     AHA 30%, BHA 2%
1               Snail Mucin 92%, 1000 ppm Hyaluronate
2            Hyaluronic Acid (low %), Ceramides blend
3                Niacinamide (medium-high), Ceramides
4    Niacinamide (medium), Ceramides, Hyaluronic Acid
Name: Concentrations, dtype: object

In [26]:
# List the column labels. The output shows that several labels carry a
# trailing space (e.g. 'Name ', 'Brand '), so lookups must use them verbatim.
print(df.columns)

Index(['Id', 'Name ', 'Brand ', 'Price', 'Category ', 'Ingredients ',
       'Key Ingredients ', 'Benefit ', 'Potential Side Effects', 'Natural',
       'Concentrations', 'Usage ', 'Application Tips', 'Skin Type ',
       'Skin Concern ', 'Average Rating ', 'Customer Reviews',
       'Expert Review ', 'Allergens ', 'For Sensitive Skin ', 'Claims '],
      dtype='object')


In [27]:
def parse_reviews(review_string):
    """Parse a cell of double-quoted customer reviews into structured records.

    Each review is expected to look like ``<text> – <N> stars`` (the separator
    is an en dash surrounded by spaces), with individual reviews wrapped in
    double quotes inside the cell.

    Args:
        review_string: Raw cell value; may be NaN/None when the cell is empty.

    Returns:
        list[dict]: One ``{"review": str, "rating": float}`` entry per quoted
        fragment that carries a parseable rating. Missing input and fragments
        without a valid rating contribute nothing.
    """
    import re  # local import keeps this cell independent of other cells' imports

    if pd.isna(review_string):  # Handle NaN values
        return []

    # Accept "5 stars", "4.5 stars" and the singular "1 star". A malformed
    # rating is skipped instead of letting float() abort the whole conversion.
    rating_pattern = re.compile(r"(\d+(?:\.\d+)?)\s*stars?", re.IGNORECASE)

    result = []
    # Reviews are delimited by double quotes; the pieces between them
    # (commas, blanks) have no separator and are skipped below.
    for fragment in str(review_string).split('"'):
        # Split on the LAST " – " so dashes inside the review text are kept.
        text, separator, rating_part = fragment.strip().rpartition(" – ")
        if not separator:
            continue
        match = rating_pattern.fullmatch(rating_part.strip())
        if match is None:
            continue
        result.append({"review": text.strip(), "rating": float(match.group(1))})
    return result

In [28]:
def parse_allergens(allergen_string):
    """Turn a comma-separated allergen cell into a list of trimmed names.

    Missing values (anything ``pd.isna`` reports as NA) yield ``None``;
    otherwise the text is split on ", " and each piece is whitespace-stripped.
    """
    if not pd.isna(allergen_string):
        pieces = allergen_string.split(", ")
        return [piece.strip() for piece in pieces]
    # Missing cell: signal "no data" with None rather than an empty list.
    return None

In [29]:
import json

def _split_list(value, sep=", "):
    """Split a delimited cell into a list, treating missing or empty cells as []."""
    # NaN is truthy, so a plain truthiness test would let missing cells reach
    # .split() and raise AttributeError; check for NA explicitly first.
    if pd.isna(value) or not value:
        return []
    return str(value).split(sep)


def to_json(df, json_file_path):
    """Convert every row of the product DataFrame into a JSON-ready dict.

    Column labels are looked up exactly as spelled here, including the
    trailing spaces some of them carry (e.g. ``"Name "``).

    Args:
        df: DataFrame holding one product per row.
        json_file_path: Not used by this function (nothing is written here);
            kept so existing callers keep working.

    Returns:
        list[dict]: One dict per row. Comma-separated cells become lists
        (``[]`` when the cell is missing or empty), "Natural" becomes a bool,
        and "Average Rating" a float.

    Raises:
        KeyError: If an expected column is absent from ``df``.
        ValueError: If an "Average Rating " cell cannot be converted to float.
    """
    json_array = []

    for _, row in df.iterrows():
        json_array.append({
            "Id": row["Id"],
            "Name": row["Name "],
            "Brand": row["Brand "],
            "Category": row["Category "],
            "Price": row["Price"],
            "Ingredients": _split_list(row["Ingredients "]),
            "Key Ingredients": _split_list(row["Key Ingredients "]),
            "Benefits": _split_list(row["Benefit "]),
            "Potential Side Effects": _split_list(row["Potential Side Effects"]),
            # Only the literal "Yes" counts as natural; anything else is False.
            "Natural": bool(row["Natural"] == "Yes"),
            "Concentrations": _split_list(row["Concentrations"]),
            "Usage": row["Usage "],
            "Application Tips": row["Application Tips"],
            "Skin Type": _split_list(row["Skin Type "]),
            "Skin Concerns": _split_list(row["Skin Concern "]),
            "Average Rating": float(row["Average Rating "]),
            "Customer Reviews": parse_reviews(row["Customer Reviews"]),
            "Expert Review": row["Expert Review "],
            "Allergens": parse_allergens(row["Allergens "]),
            "For Sensitive Skin": row["For Sensitive Skin "],
            "Claims": _split_list(row["Claims "]),
        })
    return json_array

In [30]:
# Build the list of product records and report how many were produced.
json_arr = to_json(df=df, json_file_path=json_file_path)
print(len(json_arr))

100


In [31]:
# Preview the first four converted records, one per line.
for record in json_arr[0:4]:
    print(record)

{'Id': 1, 'Name': 'The Ordinary Peeling Solution', 'Brand': 'The Ordinary', 'Category': 'Exfoliating Peel', 'Price': '6,350.00', 'Ingredients': ['Glycolic Acid', 'Lactic Acid', 'Tartaric Acid', 'Citric Acid', 'Salicylic Acid', 'Sodium Hyaluronate Crosspolymer', 'Tasmannia Lanceolata Fruit/Leaf Extract'], 'Key Ingredients': ['Glycolic Acid', 'Salicylic Acid'], 'Benefits': ['Improves texture', 'clears pore congestion', 'targets uneven skin tone'], 'Potential Side Effects': ['Sun sensitivity', 'tingling', 'redness', 'potential irritation'], 'Natural': False, 'Concentrations': ['AHA 30%', 'BHA 2%'], 'Usage': 'Use once or twice a week on dry skin, Leave on for max 10 minutes and rinse, Avoid eye contour', 'Application Tips': 'Apply evenly using fingertips on clean/dry skin , Do not use on wet or compromised skin , Patch test recommended , Use sunscreen afterward', 'Skin Type': ['Normal', 'Oily', 'Combination'], 'Skin Concerns': ['Dullness', 'Uneven Texture', 'Enlarged Pores'], 'Average Rati

In [32]:
# Serialize the product records to disk as UTF-8 text, pretty-printed with a
# four-space indent.
with open(json_file_path, 'w', encoding='utf-8') as out_handle:
    json.dump(json_arr, fp=out_handle, indent=4)

In [33]:
# Read the content of the JSON file back in. The file is written as UTF-8, so
# decode it explicitly rather than relying on the platform's default encoding
# (which is locale-dependent, e.g. on Windows).
with open(json_file_path, 'r', encoding='utf-8') as f:
    products_json = json.load(f)

In [34]:
# Show the first five products as loaded back from the JSON file.
for loaded_product in products_json[0:5]:
    print(loaded_product)

{'Id': 1, 'Name': 'The Ordinary Peeling Solution', 'Brand': 'The Ordinary', 'Category': 'Exfoliating Peel', 'Price': '6,350.00', 'Ingredients': ['Glycolic Acid', 'Lactic Acid', 'Tartaric Acid', 'Citric Acid', 'Salicylic Acid', 'Sodium Hyaluronate Crosspolymer', 'Tasmannia Lanceolata Fruit/Leaf Extract'], 'Key Ingredients': ['Glycolic Acid', 'Salicylic Acid'], 'Benefits': ['Improves texture', 'clears pore congestion', 'targets uneven skin tone'], 'Potential Side Effects': ['Sun sensitivity', 'tingling', 'redness', 'potential irritation'], 'Natural': False, 'Concentrations': ['AHA 30%', 'BHA 2%'], 'Usage': 'Use once or twice a week on dry skin, Leave on for max 10 minutes and rinse, Avoid eye contour', 'Application Tips': 'Apply evenly using fingertips on clean/dry skin , Do not use on wet or compromised skin , Patch test recommended , Use sunscreen afterward', 'Skin Type': ['Normal', 'Oily', 'Combination'], 'Skin Concerns': ['Dullness', 'Uneven Texture', 'Enlarged Pores'], 'Average Rati