## **STEP 1: Configuration**

Set up Colab secrets for secure credential management.

In [1]:
import os

try:
    from google.colab import userdata
    KAGGLE_USERNAME = userdata.get('KAGGLE_USERNAME')
    KAGGLE_KEY = userdata.get('KAGGLE_KEY')
    HUGGINGFACE_TOKEN = userdata.get('HUGGINGFACE_TOKEN')
    print("Configuration loaded from Colab secrets!")
except Exception as e:
    # Fallback for testing
    print("Configure environment secrets")

Configuration loaded from Colab secrets!


## **STEP 2: Suppress Warnings**

Clean output by suppressing unnecessary warnings.

In [2]:
import warnings
import os
import logging

# Suppress all warnings
warnings.filterwarnings('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Configure logging
logging.getLogger('transformers').setLevel(logging.ERROR)
logging.getLogger('transformers.generation').setLevel(logging.ERROR)

print("Warnings suppressed!")



## **STEP 3: Install Required Packages**

In [3]:
%%time
!pip install -q langchain-community langchain-core chromadb
!pip install -q sentence-transformers transformers torch accelerate bitsandbytes
!pip install -q kaggle pandas numpy
!pip install -q faiss-cpu
print("All packages installed!")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0

## **STEP 4: Import Libraries**

In [4]:
import pandas as pd
import numpy as np
import os, json, warnings, re, ast, pickle
from typing import Optional, List, Dict, Any
warnings.filterwarnings('ignore')

# LangChain
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline

print("All imports successful!")

All imports successful!


## **STEP 5: Download Kaggle Data**

Downloads Sephora products and reviews dataset.

In [5]:
%%time
# Set up Kaggle credentials
!mkdir -p ~/.kaggle
with open(os.path.expanduser('~/.kaggle/kaggle.json'), 'w') as f:
    json.dump({"username": KAGGLE_USERNAME, "key": KAGGLE_KEY}, f)
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset
!kaggle datasets download -d nadyinky/sephora-products-and-skincare-reviews -q
!unzip -q sephora-products-and-skincare-reviews.zip

print("Kaggle data downloaded!")

Dataset URL: https://www.kaggle.com/datasets/nadyinky/sephora-products-and-skincare-reviews
License(s): Attribution 4.0 International (CC BY 4.0)
Kaggle data downloaded!
CPU times: user 25.8 ms, sys: 3.09 ms, total: 28.9 ms
Wall time: 7.73 s


## **STEP 6: Mount Google Drive**


In [6]:
from google.colab import drive
drive.mount('/content/drive')

DRIVE_PATH = '/content/drive/MyDrive/AML_Final_Project/'

print(f"Google Drive mounted!")
print(f"Using path: {DRIVE_PATH}")
print("\nMake sure this folder contains:")
print("   1. medical_info.csv")
print("   2. reviews_prod_lvl.pkl")
print("   3. ingredient_list_final.csv")

Mounted at /content/drive
Google Drive mounted!
Using path: /content/drive/MyDrive/AML_Final_Project/

Make sure this folder contains:
   1. medical_info.csv
   2. reviews_prod_lvl.pkl
   3. ingredient_list_final.csv


## **STEP 7: Load Medical, Sentiment & Ingredient Data**


In [7]:
%%time
print("Loading medical information...")
medical_df = pd.read_csv(DRIVE_PATH + 'medical_info.csv')
print(f"Loaded {len(medical_df):,} medical Q&A pairs")
print(f"   Categories: {', '.join(medical_df['category'].unique())}")
print(f"   Conditions: {len(medical_df['condition'].unique())} unique")

print("\nLoading ingredient database...")
ingredient_df = pd.read_csv(DRIVE_PATH + 'ingredient_list_final.csv')
ingredient_df = ingredient_df[ingredient_df['name'].notna()]
print(f"Loaded {len(ingredient_df):,} ingredients")
print(f"   Examples: {', '.join(ingredient_df['name'].head(5).tolist())}")

print("\nLoading sentiment analysis (Updated 12/17/2025)...")
with open(DRIVE_PATH + 'reviews_prod_lvl.pkl', 'rb') as f:
    sentiment_df = pickle.load(f)

print(f"Loaded sentiment for {len(sentiment_df):,} products")
print(f"\nSentiment Distribution:")
sentiment_counts = sentiment_df['predicted_sentiment'].value_counts()
for sent, count in sentiment_counts.items():
    pct = (count / len(sentiment_df)) * 100
    print(f"   {sent.capitalize()}: {count:,} ({pct:.1f}%)")

print(f"\nQuality Metrics:")
print(f"   Avg sentiment score: {sentiment_df['predicted_sentiment_score'].mean():.4f}")
print(f"   Avg review quality: {sentiment_df['avg_review_quality'].mean():.4f}")

print("\nAll additional data loaded successfully!")

Loading medical information...
Loaded 1,521 medical Q&A pairs
   Categories: Symptom, Cause, Treatment, General
   Conditions: 58 unique

Loading ingredient database...
Loaded 247 ingredients
   Examples: Alpha-Glucan Oligosaccharide, Aloe Vera, Allantoin, Algin, Algae Extract

Loading sentiment analysis (Updated 12/17/2025)...
Loaded sentiment for 2,333 products

Sentiment Distribution:
   Positive: 2,220 (95.2%)
   Neutral: 87 (3.7%)
   Negative: 26 (1.1%)

Quality Metrics:
   Avg sentiment score: 0.9209
   Avg review quality: 0.7685

All additional data loaded successfully!
CPU times: user 42.8 ms, sys: 3.11 ms, total: 45.9 ms
Wall time: 5.4 s


## **STEP 8: Load & Process Product Data**

Loads Sephora product data and merges with sentiment analysis.


In [8]:
%%time
print("Loading product data...")
products = pd.read_csv('product_info.csv', low_memory=False)
skincare = products[products['primary_category'] == 'Skincare'].copy()
print(f"Loaded {len(skincare):,} skincare products")

# Clean text fields
text_cols = ['product_name', 'brand_name', 'description', 'ingredients', 'highlights', 'how_to_use']
for col in text_cols:
    if col in skincare.columns:
        skincare[col] = skincare[col].fillna('').astype(str)

# DON'T load reviews from CSV - use sentiment pickle instead!
print("\nMerging sentiment analysis (includes ratings)...")
before_merge = len(skincare)

skincare = skincare.merge(
    sentiment_df[[
        'product_id',
        'total_reviews',
        'avg_rating',
        'predicted_sentiment',
        'predicted_sentiment_score',
        'avg_review_quality',
        'dominant_rating_sentiment',
        'positive_rating_pct',
        'neutral_rating_pct',
        'negative_rating_pct',
        'review_sample'
    ]],
    on='product_id',
    how='left'
)

# Use avg_rating from sentiment file as the primary rating
if 'rating' in skincare.columns:
    # If there's a rating column from product_info, fill missing with sentiment avg_rating
    skincare['rating'] = skincare['rating'].fillna(skincare['avg_rating'])
else:
    # Otherwise just use avg_rating
    skincare['rating'] = skincare['avg_rating']

# Fill missing values
skincare['rating'] = skincare['rating'].fillna(0)
skincare['total_reviews'] = skincare['total_reviews'].fillna(0)

sentiment_count = skincare['predicted_sentiment'].notna().sum()
print(f"Sentiment merged for {sentiment_count:,} products ({sentiment_count/before_merge*100:.1f}%)")

# Show stats
with_reviews = (skincare['total_reviews'] > 0).sum()
print(f"Products with reviews: {with_reviews:,}")
print(f"Products without reviews: {len(skincare) - with_reviews:,}")

print(f"\nFinal dataset: {len(skincare):,} products ready for feature extraction")

Loading product data...
Loaded 2,420 skincare products

Merging sentiment analysis (includes ratings)...
Sentiment merged for 2,333 products (96.4%)
Products with reviews: 2,333
Products without reviews: 87

Final dataset: 2,420 products ready for feature extraction
CPU times: user 130 ms, sys: 11 ms, total: 141 ms
Wall time: 152 ms


## **STEP 9: Feature Extraction**

Extracts 50+ product features:
- Skin concerns (7)
- Active ingredients (6)
- Preferences (9)
- Formulation (2)
- Skin types (5)
- Awards


In [9]:
%%time

# PARSE HIGHLIGHTS
def parse_highlights(x):
    if pd.isna(x):
        return []
    try:
        if x.strip().startswith("["):
            return [h.lower().strip() for h in ast.literal_eval(x)]
    except:
        pass
    return [h.lower().strip() for h in re.split(",|;", x)]

skincare['highlight_list'] = skincare['highlights'].apply(parse_highlights)
print("Parsed highlights")

# HIGHLIGHT MAPPING
highlight_mapping = {
    "vegan": "vegan",
    "cruelty-free": "cruelty_free",
    "without parabens": "paraben_free",
    "without sulfates sls & sles": "sulfate_free",
    "without silicones": "silicone_free",
    "without mineral oil": "mineral_free",
    "fragrance free": "fragrance_free",
    "fresh scent": "has_fragrance",
    "floral scent": "has_fragrance",
    "woody & earthy scent": "has_fragrance",
    "warm &spicy scent": "has_fragrance",
    "unisex/ genderless scent": "has_fragrance",
    "oil free": "oil_free",
    "clean at sephora": "clean_at_sephora",
    "non-comedogenic": "non_comedogenic",
    "good for: dullness/uneven texture": "dullness",
    "good for: dryness": "dryness",
    "good for: anti-aging": "anti_aging",
    "good for: acne/blemishes": "acne",
    "good for: dark spots": "dark_spots",
    "good for: redness": "redness",
    "plumping": "plumping",
    "hyaluronic acid": "hyaluronic_acid",
    "niacinamide": "niacinamide",
    "retinol": "retinol",
    "salicylic acid": "salicylic_acid",
    "aha/glycolic acid": "aha_glycolic_acid",
    "spf": "spf",
    "reef safe spf": "spf",
    "alcohol free": "alcohol_free",
    "hypoallergenic": "hypoallergenic"
}

# Apply mapping
for highlight, col_name in highlight_mapping.items():
    skincare[col_name] = skincare['highlight_list'].apply(
        lambda hl: int(any(highlight.lower() in h.lower() for h in hl)))

print("Extracted concerns, actives, and preferences")

# ALLURE AWARDS
def check_allure_best_of_beauty(highlights):
    return int(any("allure" in h.lower() and "best of beauty" in h.lower() for h in highlights))

skincare['allure_best_of_beauty'] = skincare['highlight_list'].apply(check_allure_best_of_beauty)
print("Extracted awards")

# SKIN TYPES
SKIN_TYPE_MAPPING = {
    "best for oily skin": ["oily_skin"],
    "best for dry skin": ["dry_skin"],
    "best for normal skin": ["normal_skin"],
    "best for combination skin": ["combination_skin"],
    "best for oily, combo, normal skin": ["oily_skin", "combination_skin", "normal_skin"],
    "best for dry, combo, normal skin": ["dry_skin", "combination_skin", "normal_skin"]
}
SKIN_TYPE_COLUMNS = ["oily_skin", "dry_skin", "normal_skin", "combination_skin"]

def apply_skin_types(highlights):
    flags = {col: 0 for col in SKIN_TYPE_COLUMNS}
    for h in highlights:
        h_lower = h.lower()
        for pattern, cols in SKIN_TYPE_MAPPING.items():
            if pattern in h_lower:
                for col in cols:
                    flags[col] = 1
    return pd.Series(flags)

skin_type_df = skincare['highlight_list'].apply(apply_skin_types)
skincare = pd.concat([skincare, skin_type_df], axis=1)
print("Extracted skin types")

# SENSITIVE SKIN
skincare['sensitive_skin'] = (
    (skincare['hypoallergenic'] == 1) | (skincare['fragrance_free'] == 1)).astype(int)

print("\nFeature extraction complete!")
print(f"   Extracted 50+ features for {len(skincare):,} products")

Parsed highlights
Extracted concerns, actives, and preferences
Extracted awards
Extracted skin types

Feature extraction complete!
   Extracted 50+ features for 2,420 products
CPU times: user 912 ms, sys: 22.6 ms, total: 935 ms
Wall time: 927 ms


## **STEP 10: Build Enhanced Product Text**

Creates rich natural language descriptions with sentiment!


In [10]:
%%time
def build_product_text_with_sentiment(row):
    """Build rich product description with all features + sentiment"""
    parts = []

    # Helper function to safely get value
    def safe_get(key, default=0):
        try:
            val = row[key]
            return val if pd.notna(val) else default
        except:
            return default

    # 1. Name + brand
    parts.append(f"{safe_get('product_name', '')} by {safe_get('brand_name', '')}.")

    # 2. Skin concerns
    concerns = []
    for c in ['dullness', 'dryness', 'anti_aging', 'acne', 'dark_spots', 'redness', 'plumping']:
        if safe_get(c) == 1:
            concerns.append(c.replace("_", " "))
    if concerns:
        parts.append(f"Targets {', '.join(concerns)}.")

    # 3. Active ingredients
    actives = []
    for a in ['hyaluronic_acid', 'niacinamide', 'retinol', 'salicylic_acid', 'aha_glycolic_acid', 'spf']:
        if safe_get(a) == 1:
            actives.append(a.replace("_", " "))
    if actives:
        parts.append(f"Contains {', '.join(actives)}.")

    # 4. Preferences
    prefs = []
    for p in ['vegan', 'cruelty_free', 'paraben_free', 'sulfate_free', 'silicone_free',
              'mineral_free', 'alcohol_free', 'hypoallergenic', 'clean_at_sephora']:
        if safe_get(p) == 1:
            prefs.append(p.replace("_", " "))
    if prefs:
        parts.append(f"{', '.join(prefs).title()}.")

    # 5. Fragrance
    if safe_get('fragrance_free') == 1:
        parts.append("Fragrance free.")
    elif safe_get('has_fragrance') == 1:
        parts.append("Contains fragrance.")

    # 6. Formulation
    formulation = []
    for f in ['oil_free', 'non_comedogenic']:
        if safe_get(f) == 1:
            formulation.append(f.replace("_", " "))
    if formulation:
        parts.append(f"{', '.join(formulation).title()}.")

    # 7. Awards
    if safe_get('allure_best_of_beauty') == 1:
        parts.append("Allure Best of Beauty award winner.")

    # 8. SENTIMENT
    sentiment = safe_get('predicted_sentiment', None)
    if sentiment is not None:
        parts.append(f"Customer sentiment: {sentiment}.")

        pos_pct = safe_get('positive_rating_pct', None)
        if pos_pct is not None:
            parts.append(f"{pos_pct:.0f}% positive reviews.")

        quality = safe_get('avg_review_quality', None)
        if quality is not None:
            if quality > 0.75:
                parts.append("Highly detailed customer reviews.")
            elif quality > 0.6:
                parts.append("Good review quality.")

    # 9. Price and rating
    price = safe_get('price_usd', None)
    if price is not None:
        parts.append(f"Price ${price:.2f}.")

    rating = safe_get('rating', None)
    if rating is not None:
        parts.append(f"Rating {rating:.1f}/5.0.")

    # 10. Skin types
    skin_types = []
    for st in ['dry_skin', 'oily_skin', 'combination_skin', 'normal_skin']:
        if safe_get(st) == 1:
            skin_types.append(st.replace("_", " "))
    if skin_types:
        parts.append(f"Best for {', '.join(skin_types)}.")

    # 11. Sensitive skin
    if safe_get('sensitive_skin') == 1:
        parts.append("Suitable for sensitive skin.")

    # 12. Size
    size = safe_get('size', None)
    if size is not None:
        parts.append(f"Size {size}.")

    return " ".join(parts)

print("Building enhanced product descriptions...")
skincare['product_text'] = skincare.apply(build_product_text_with_sentiment, axis=1)

print(f"\nEnhanced {len(skincare):,} products with rich descriptions!")
print(f"\nSample product_text (first 400 chars):\n")
print(skincare['product_text'].iloc[0][:400] + "...")

Building enhanced product descriptions...

Enhanced 2,420 products with rich descriptions!

Sample product_text (first 400 chars):

GENIUS Sleeping Collagen Moisturizer by Algenist. Vegan, Paraben Free, Hypoallergenic. Customer sentiment: positive. 91% positive reviews. Highly detailed customer reviews. Price $98.00. Rating 4.5/5.0. Best for dry skin, combination skin, normal skin. Suitable for sensitive skin. Size 2 oz/ 60 mL....
CPU times: user 231 ms, sys: 0 ns, total: 231 ms
Wall time: 231 ms


## **STEP 11: Create Medical Knowledge Documents**

Converts medical Q&A into searchable documents.


In [11]:
%%time
print("Creating medical knowledge documents...")

medical_documents = []

for _, row in medical_df.iterrows():
    text = f"MEDICAL KNOWLEDGE\n\n"
    text += f"CONDITION: {row['condition'].title()}\n"
    text += f"CATEGORY: {row['category']}\n\n"
    text += f"QUESTION: {row['instruction']}\n\n"
    text += f"ANSWER: {row['output']}"

    doc = Document(
        page_content=text,
        metadata={
            'type': 'medical',
            'condition': row['condition'],
            'category': row['category']
        }
    )
    medical_documents.append(doc)

print(f"Created {len(medical_documents):,} medical documents")

Creating medical knowledge documents...
Created 1,521 medical documents
CPU times: user 92.7 ms, sys: 1.92 ms, total: 94.6 ms
Wall time: 94.1 ms


## **STEP 12: Create Ingredient Knowledge Documents**

Converts ingredient database into searchable documents.


In [12]:
%%time
print("Creating ingredient knowledge documents...")

ingredient_documents = []

for _, row in ingredient_df.iterrows():
    # Parse good_for list
    try:
        good_for = eval(row['who_is_it_good_for']) if pd.notna(row['who_is_it_good_for']) else []
    except:
        good_for = []

    text = f"INGREDIENT INFORMATION\n\n"
    text += f"INGREDIENT: {row['name']}\n\n"

    if pd.notna(row['what_is_it']):
        text += f"WHAT IS IT: {row['what_is_it']}\n\n"

    if pd.notna(row['what_does_it_do']):
        text += f"WHAT IT DOES: {row['what_does_it_do']}\n\n"

    if good_for:
        text += f"GOOD FOR: {', '.join(good_for)}\n"

    doc = Document(
        page_content=text,
        metadata={
            'type': 'ingredient',
            'ingredient_name': row['name'],
            'good_for': good_for
        }
    )
    ingredient_documents.append(doc)

print(f"Created {len(ingredient_documents):,} ingredient documents")
print(f"\nSample (first 300 chars):\n{ingredient_documents[0].page_content[:300]}...")

Creating ingredient knowledge documents...
Created 247 ingredient documents

Sample (first 300 chars):
INGREDIENT INFORMATION

INGREDIENT: Alpha-Glucan Oligosaccharide

WHAT IS IT: Prebiotics are a type of non-digestible dietary fiber used in skincare products to support the skin's microbiome, which consists of beneficial bacteria that contribute to skin health.

WHAT IT DOES: Prebiotics offer benefi...
CPU times: user 27.1 ms, sys: 0 ns, total: 27.1 ms
Wall time: 27 ms


## **STEP 13: Create Product Documents**

Enhanced documents with all features, sentiment, and metadata.


In [13]:
%%time
def create_enhanced_document(row):
    """Create document with ALL features + sentiment"""

    # Helper function to safely get value
    def safe_get(key, default=''):
        try:
            val = row[key]
            return val if pd.notna(val) else default
        except:
            return default

    sections = []

    sections.append(f"PRODUCT: {safe_get('product_name')}")
    sections.append(f"BRAND: {safe_get('brand_name')}")
    sections.append(f"CATEGORY: {safe_get('secondary_category')}")

    price = safe_get('price_usd', 0)
    sections.append(f"PRICE: ${price:.2f}")

    product_text = safe_get('product_text', None)
    if product_text:
        sections.append(f"\nPRODUCT DETAILS: {product_text}")

    description = safe_get('description', None)
    if description:
        sections.append(f"\nDESCRIPTION: {description}")

    ingredients = safe_get('ingredients', None)
    if ingredients:
        sections.append(f"\nINGREDIENTS: {ingredients}")

    # UPDATED: Use avg_rating and total_reviews from sentiment pickle
    rating = safe_get('avg_rating', 0)
    total_reviews = safe_get('total_reviews', 0)

    if rating > 0 and total_reviews > 0:
        sections.append(f"\nCUSTOMER RATING: {rating:.1f}/5.0 ({int(total_reviews)} reviews)")

        sentiment = safe_get('predicted_sentiment', None)
        if sentiment:
            sections.append(f"SENTIMENT: {sentiment.upper()}")

            pos_pct = safe_get('positive_rating_pct', None)
            if pos_pct is not None:
                sections.append(f"POSITIVE REVIEWS: {pos_pct:.0f}%")

        # UPDATED: Use review_sample (correct column name from pickle)
        review_sample = safe_get('review_sample', None)
        if review_sample:
            sections.append(f"SAMPLE REVIEWS: {review_sample}")

    text = "\n".join(sections)

    # Enhanced metadata with safe access
    concerns = []
    for c in ['dullness', 'dryness', 'anti_aging', 'acne', 'dark_spots', 'redness', 'plumping']:
        if safe_get(c, 0) == 1:
            concerns.append(c.replace('_', ' '))

    actives = []
    for a in ['hyaluronic_acid', 'niacinamide', 'retinol', 'salicylic_acid', 'aha_glycolic_acid', 'spf']:
        if safe_get(a, 0) == 1:
            actives.append(a.replace('_', ' '))

    skin_types = []
    for st in ['dry_skin', 'oily_skin', 'combination_skin', 'normal_skin', 'sensitive_skin']:
        if safe_get(st, 0) == 1:
            skin_types.append(st.replace('_', ' '))

    metadata = {
        'type': 'product',
        'name': str(safe_get('product_name', '')),
        'brand': str(safe_get('brand_name', '')),
        'price': float(price),
        'category': str(safe_get('secondary_category', '')),
        'rating': float(rating),
        'total_reviews': int(total_reviews),
        'concerns': concerns,
        'actives': actives,
        'skin_types': skin_types,
        'vegan': safe_get('vegan', 0) == 1,
        'cruelty_free': safe_get('cruelty_free', 0) == 1,
        'fragrance_free': safe_get('fragrance_free', 0) == 1,
        'clean_beauty': safe_get('clean_at_sephora', 0) == 1,
        'sentiment': str(safe_get('predicted_sentiment', 'unknown')),
        'sentiment_score': float(safe_get('predicted_sentiment_score', 0)),
        'review_quality': float(safe_get('avg_review_quality', 0)),
        'positive_pct': float(safe_get('positive_rating_pct', 0))
    }

    return Document(page_content=text, metadata=metadata)

print("Creating product documents...")
product_documents = [create_enhanced_document(row) for _, row in skincare.iterrows()]

print(f"\nCreated {len(product_documents):,} product documents")
print(f"\nSample metadata:")
import pprint
pprint.pprint(product_documents[0].metadata, width=100, compact=True)

Creating product documents...

Created 2,420 product documents

Sample metadata:
{'actives': [],
 'brand': 'Algenist',
 'category': 'Moisturizers',
 'clean_beauty': False,
 'concerns': [],
 'cruelty_free': False,
 'fragrance_free': False,
 'name': 'GENIUS Sleeping Collagen Moisturizer',
 'positive_pct': 91.17,
 'price': 98.0,
 'rating': 4.569429494079656,
 'review_quality': 0.8428942498759275,
 'sentiment': 'positive',
 'sentiment_score': 0.9438534558117571,
 'skin_types': ['dry skin', 'combination skin', 'normal skin', 'sensitive skin'],
 'total_reviews': 929,
 'type': 'product',
 'vegan': True}
CPU times: user 438 ms, sys: 8.59 ms, total: 446 ms
Wall time: 446 ms


## **STEP 14: Create Combined Vector Store**

Combines all documents into one searchable knowledge base.


In [18]:
%%time
import os
import shutil

DRIVE_FAISS_PATH = '/content/drive/MyDrive/AML_Final_Project/faiss_index'


# Check locations
local_exists = os.path.exists('faiss_index/index.faiss')
drive_exists = os.path.exists(f'{DRIVE_FAISS_PATH}/index.faiss')

# Load embeddings (needed for all cases)
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

if local_exists:
    # Already have it - just load!
    print("✓ Loading from local (30 sec)...")
    vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    print(f"✓ Loaded {vectorstore.index.ntotal:,} vectors - Saved 15 minutes! ⚡")

elif drive_exists:
    # Copy from Drive then load
    print("✓ Copying from Drive (2 min)...")
    shutil.copytree(DRIVE_FAISS_PATH, 'faiss_index')
    vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    print(f"✓ Loaded {vectorstore.index.ntotal:,} vectors - Saved 13 minutes! ⚡")

else:
    # Build from scratch
    print("⏳ Building new vector store (15 min)...")
    all_documents = product_documents + medical_documents + ingredient_documents
    print(f"   Creating from {len(all_documents):,} documents...")

    vectorstore = FAISS.from_documents(all_documents, embeddings)

    # Save locally
    vectorstore.save_local("faiss_index")

    # Save to Drive for next time
    try:
        shutil.copytree('faiss_index', DRIVE_FAISS_PATH)
        print("✓ Saved to Drive for next time!")
    except:
        print("Could not save to Drive, but local copy ready")

    print(f"✓ Built {vectorstore.index.ntotal:,} vectors")

print("\nVECTOR STORE READY!")

✓ Copying from Drive (2 min)...
✓ Loaded 4,189 vectors - Saved 13 minutes! ⚡

VECTOR STORE READY!
CPU times: user 321 ms, sys: 44 ms, total: 365 ms
Wall time: 2.51 s


In [19]:
!pip install -U bitsandbytes -q

## **STEP 15: Load Mistral-7B**

In [21]:
%%time
print("Loading Mistral-7B with GPU detection...")

import torch

# Check what hardware you have
print(f"\nGPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("No GPU detected! Running on CPU (will be slow)")

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from langchain_community.llms import HuggingFacePipeline

model_id = "mistralai/Mistral-7B-Instruct-v0.1"

if torch.cuda.is_available():
    # GPU AVAILABLE - Use 4-bit quantization (fits in T4 15GB)
    print("\n✓ Using GPU with 4-bit quantization...")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",  # Automatically use GPU
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=400,
        temperature=0.3,
        top_p=0.95,
        repetition_penalty=1.15,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    print("✓ Model loaded on GPU!")
    print(f"✓ Memory used: {torch.cuda.memory_allocated(0) / 1e9:.1f} GB")

else:
    # NO GPU - Use HuggingFace Inference API instead!
    print("\nNo GPU detected - using HuggingFace API instead...")
    print("   (This is actually better - faster & free!)")

    from langchain_huggingface import HuggingFaceEndpoint
    import os

    # Try to get token from environment
    hf_token = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

    if not hf_token:
        print("\nNo API token found!")
        print("   Get one at: https://huggingface.co/settings/tokens")
        print("   Then run: import os")
        print("            os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'your_token_here'")
        raise ValueError("Need HuggingFace API token for CPU mode")

    llm = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        temperature=0.3,
        max_new_tokens=400,
        top_p=0.95,
        repetition_penalty=1.15,
        huggingfacehub_api_token=hf_token
    )

    print("✓ Using HuggingFace Inference API!")
    print("✓ Model runs on their servers (free!)")

print("\nLLM READY!")

Loading Mistral-7B with GPU detection...

GPU Available: True
GPU Device: Tesla T4
GPU Memory: 15.8 GB

✓ Using GPU with 4-bit quantization...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

✓ Model loaded on GPU!
✓ Memory used: 4.1 GB

LLM READY!
CPU times: user 1min 12s, sys: 49.7 s, total: 2min 1s
Wall time: 1min 3s


## **STEP 16: Enhanced Intent Extractor**

In [22]:
class EnhancedIntentExtractor:
    """Understands product, medical, and ingredient queries"""

    def __init__(self):
        self.concerns = [
            'acne', 'pimples', 'breakouts', 'blemishes',
            'wrinkles', 'fine lines', 'aging', 'anti-aging', 'anti aging',
            'dark spots', 'hyperpigmentation',
            'dryness', 'dry skin', 'flaky',
            'oily', 'shine', 'greasy',
            'redness', 'sensitive', 'irritation',
            'dullness', 'brightening', 'plumping', 'pores',
            'psoriasis', 'eczema', 'dermatitis', 'rosacea', 'vitiligo', 'melanoma', 'hives'
        ]

        self.ingredients = [
            'retinol', 'vitamin c', 'vitamin-c', 'niacinamide', 'hyaluronic acid', 'hyaluronic',
            'salicylic acid', 'salicylic', 'glycolic acid', 'glycolic', 'aha', 'bha', 'spf', 'sunscreen'
        ]

        self.preferences = [
            'vegan', 'cruelty free', 'cruelty-free', 'paraben free', 'paraben-free',
            'sulfate free', 'sulfate-free', 'fragrance free', 'unscented',
            'hypoallergenic', 'clean beauty', 'clean'
        ]

        self.medical_patterns = [
            'what is', 'what are', 'symptoms of', 'cause of', 'causes of',
            'treatment for', 'how to treat', 'medication for', 'what causes', 'etiology'
        ]

        self.categories = [
            'cleanser', 'toner', 'serum', 'moisturizer', 'cream', 'oil', 'mask', 'exfoliant', 'sunscreen'
        ]

    def is_medical_query(self, text: str) -> bool:
        text_lower = text.lower()
        return any(pattern in text_lower for pattern in self.medical_patterns)

    def extract_preferences(self, text: str) -> Dict[str, bool]:
        text = text.lower()
        prefs = {}
        if 'vegan' in text: prefs['vegan'] = True
        if 'cruelty' in text: prefs['cruelty_free'] = True
        if 'fragrance free' in text or 'unscented' in text: prefs['fragrance_free'] = True
        if 'clean' in text: prefs['clean_beauty'] = True
        return prefs

    def extract_ingredients(self, text: str) -> List[str]:
        text = text.lower()
        return [ing for ing in self.ingredients if ing in text]

    def extract_budget(self, text: str) -> Optional[float]:
        text = text.lower()
        patterns = [
            r'under\s*\$?(\d+)', r'below\s*\$?(\d+)', r'less than\s*\$?(\d+)',
            r'up to\s*\$?(\d+)', r'around\s*\$?(\d+)', r'max\s*\$?(\d+)'
        ]
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return float(match.group(1))
        return None

    def extract_concerns(self, text: str) -> List[str]:
        text = text.lower()
        return [concern for concern in self.concerns if concern in text]

    def extract_category(self, text: str) -> Optional[str]:
        text = text.lower()
        for category in self.categories:
            if category in text:
                return category
        return None

    def analyze(self, text: str) -> Dict[str, Any]:
        return {
            'budget': self.extract_budget(text),
            'concerns': self.extract_concerns(text),
            'category': self.extract_category(text),
            'preferences': self.extract_preferences(text),
            'ingredients': self.extract_ingredients(text),
            'is_medical': self.is_medical_query(text),
            'original_text': text
        }

intent_extractor = EnhancedIntentExtractor()
print("Intent extractor ready!")

Intent extractor ready!


## **STEP 17: ULTIMATE Agent**

In [23]:
import sys
from io import StringIO

class SkincareAgent:
    """ULTIMATE Agent with Products + Medical + Ingredients + Sentiment"""

    def __init__(self, vectorstore, llm):
        self.vectorstore = vectorstore
        self.llm = llm
        self.intent_extractor = EnhancedIntentExtractor()
        self.conversation_history = []

        self.retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

        self.prompt_template = """[INST] You are a friendly skincare expert with medical and ingredient knowledge.

CONTEXT:
{context}

USER: {question}

Provide helpful, accurate advice. If medical info or ingredient details are provided, explain clearly. Recommend 3-4 products when relevant. Be warm and conversational. [/INST]"""

        prompt = ChatPromptTemplate.from_template(self.prompt_template)

        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        self.rag_chain = (
            {"context": self.retriever | format_docs, "question": RunnablePassthrough()}
            | prompt | llm | StrOutputParser()
        )

    def _search_with_filters(self, query: str, budget: Optional[float] = None,
                            concerns: List[str] = None, category: Optional[str] = None,
                            preferences: Dict[str, bool] = None, ingredients: List[str] = None,
                            is_medical: bool = False) -> List[Document]:

        enhanced_query = query
        if concerns:
            enhanced_query += " " + " ".join(concerns)
        if category:
            enhanced_query += " " + category
        if ingredients:
            enhanced_query += " " + " ".join(ingredients)

        has_filters = bool(budget or preferences)
        k_value = 12 if has_filters else 5

        docs = self.vectorstore.similarity_search(enhanced_query, k=k_value)

        if is_medical:
            medical_docs = [d for d in docs if d.metadata.get('type') == 'medical']
            product_docs = [d for d in docs if d.metadata.get('type') == 'product']
            return medical_docs[:3] + product_docs[:2]

        if not has_filters:
            return [d for d in docs if d.metadata.get('type') != 'medical'][:5]

        filtered = []
        for doc in docs:
            if doc.metadata.get('type') == 'medical':
                continue

            if budget and doc.metadata.get('price', 0) > budget:
                continue

            if preferences:
                if preferences.get('vegan') and not doc.metadata.get('vegan'):
                    continue
                if preferences.get('cruelty_free') and not doc.metadata.get('cruelty_free'):
                    continue
                if preferences.get('fragrance_free') and not doc.metadata.get('fragrance_free'):
                    continue
                if preferences.get('clean_beauty') and not doc.metadata.get('clean_beauty'):
                    continue

            filtered.append(doc)
            if len(filtered) >= 5:
                break

        return filtered[:5]

    def chat(self, user_message: str, show_details: bool = True) -> Optional[str]:
        old_stdout, old_stderr = sys.stdout, sys.stderr
        sys.stdout, sys.stderr = StringIO(), StringIO()

        try:
            intent = self.intent_extractor.analyze(user_message)

            products = self._search_with_filters(
                user_message,
                budget=intent['budget'],
                concerns=intent['concerns'],
                category=intent['category'],
                preferences=intent.get('preferences', {}),
                ingredients=intent.get('ingredients', []),
                is_medical=intent.get('is_medical', False)
            )

            response = self.rag_chain.invoke(user_message)
        finally:
            sys.stdout, sys.stderr = old_stdout, old_stderr

        clean_response = response
        if "[/INST]" in clean_response:
            clean_response = clean_response.split("[/INST]")[-1]
        if "[INST]" in clean_response:
            clean_response = clean_response.split("[INST]")[0]
        clean_response = clean_response.replace("\\n", "\n").replace("\\'", "'").replace('\\"', '"').replace("\\", "")
        clean_response = clean_response.replace("**", "").replace("* ", "").strip()

        if clean_response and not clean_response[-1] in '.!?':
            sentences = clean_response.split('.')
            if len(sentences) > 1:
                clean_response = '.'.join(sentences[:-1]) + '.'

        self.conversation_history.append({
            'user': user_message,
            'agent': clean_response,
            'intent': intent,
            'results': [p.metadata for p in products]
        })

        if show_details:
            print(f"You: {user_message}")

            if intent['budget'] or intent['concerns'] or intent.get('preferences') or intent.get('ingredients'):
                info = []
                if intent.get('is_medical'):
                    info.append("Medical Query")
                if intent['budget']:
                    info.append(f"Budget: ${intent['budget']:.0f}")
                if intent['concerns']:
                    info.append(', '.join(intent['concerns']).title())
                if intent.get('ingredients'):
                    info.append(f"Ingredients: {', '.join(intent['ingredients'])}")
                if intent.get('preferences'):
                    prefs = [k.replace('_', ' ').title() for k, v in intent['preferences'].items() if v]
                    if prefs:
                        info.append(f"{', '.join(prefs)}")
                print(f"✓ {' | '.join(info)}")

            print("\nAnalyzing...\n")
            print("Skincare Expert:")
            print()

            for line in clean_response.split('\n'):
                if line.strip():
                    print(line.strip())
            print()

            product_results = [p for p in products if p.metadata.get('type') == 'product']
            if product_results:
                print("Recommended Products")

                for i, p in enumerate(product_results, 1):
                    rating = p.metadata['rating']

                    # FIXED: Show stars or "No reviews yet"
                    if rating >= 3:
                        stars = "⭐" * min(5, max(1, int(rating)))
                    elif rating > 0:
                        stars = "⭐⭐"
                    else:
                        stars = "No reviews yet"

                    sentiment = p.metadata.get('sentiment', 'unknown')
                    sentiment_emoji = {"positive": "😊", "neutral": "😐", "negative": "😕"}.get(sentiment, "")

                    print(f"\n{i}. {p.metadata['brand']} — {p.metadata['name']}")
                    sentiment_display = f" • {sentiment_emoji} {sentiment}" if sentiment_emoji else ""
                    print(f"   ${p.metadata['price']:.2f} • {stars}{sentiment_display}")

                    if p.metadata.get('positive_pct', 0) > 0:
                        print(f"   {p.metadata['positive_pct']:.0f}% positive reviews")

            return None
        else:
            return clean_response

    def get_history(self) -> List[Dict]:
        return self.conversation_history

    def clear_history(self):
        self.conversation_history = []

    def get_last_results(self) -> List[Dict]:
        if self.conversation_history:
            return self.conversation_history[-1]['results']
        return []

agent = SkincareAgent(vectorstore, llm)

print("ULTIMATE AGENT READY!")

ULTIMATE AGENT READY!


## **TEST THE AGENT**

In [None]:
# TEST 1: Ingredient query
agent.chat("What is hyaluronic acid?")

You: What is hyaluronic acid?
✓ Medical Query | Ingredients: hyaluronic acid, hyaluronic

Analyzing...

Skincare Expert:

Hi there! I'd be happy to help answer your question about hyaluronic acid. Hyaluronic acid is a naturally occurring substance found in the body that plays a crucial role in keeping our skin hydrated and plumped up. It has the unique ability to hold up to 1,000 times its weight in water, making it an excellent humectant for skincare products.
Hyaluronic acid is commonly used in various skincare treatments due to its hydrating properties. It can help improve the appearance of fine lines and wrinkles by providing extra moisture to the skin, giving it a smoother and more youthful look. Additionally, it's suitable for all skin types, including sensitive skin.
Based on the information you've given me, here are three hyaluronic acid products that might interest you:
1. The INKEY List Hyaluronic Acid Hydrating Serum: This serum from The INKEY List contains both sodium hyalu

In [None]:
# TEST 2: Medical query
agent.chat("What are the symptoms of eczema?")

You: What are the symptoms of eczema?
✓ Medical Query | Eczema

Analyzing...

Skincare Expert:

Hello there! I'm glad you asked about eczema symptoms. Eczema, also known as atopic dermatitis, is a common skin condition that can cause various uncomfortable symptoms. Here are some of the most common ones:
1. Itchiness: This is one of the most prominent symptoms of eczema. The itch can range from mild to severe and may worsen during the night.
2. Redness and inflammation: Affected areas may appear red, swollen, or inflamed.
3. Dryness and scaling: The skin may feel tight, dry, and scaly, especially after bathing or sweating.
4. Blisters or weeping sores: In some cases, small blisters may form and weep clear fluid, which can then crust over if not kept clean and moist.
To manage these symptoms, here are some recommendations:
Product 1: CeraVe Moisturing Cream - This fragrance-free, non-comedogenic cream helps restore the skin's natural barrier function while providing long-lasting moisture

In [28]:
# TEST 3: Product search with sentiment
agent.chat("Show me vegan retinol with positive reviews under $50")

You: Show me vegan retinol with positive reviews under $50
✓ Budget: $50 | Ingredients: retinol | Vegan

Analyzing...

Skincare Expert:

Based on your request for vegan retinol options under $50 with positive customer feedback, I'd suggest checking out the following three products:
1. The Inkey List Retinol Serum: This budget-friendly option contains 1% pure retinol and is suitable for all skin types. Many users report seeing improved texture, reduced fine lines, and fewer blemishes after regular use. The formula is vegan, and the price point is around $12.99.
2. Paula's Choice 10% Niacinamide Boosting Treatment with Retinol: Although it's a bit closer to your budget limit, this product offers excellent value. It combines both niacinamide and retinol to target multiple signs of aging. Users praise its effectiveness in reducing fine lines, improving skin texture, and brightening the complexion. The cost is approximately $49.50 for 1 oz.
3. Mad Hippie Vitamin A Serum: Another popular cho

In [None]:
# TEST 4: Complex multi-source query
agent.chat("I have rosacea, what ingredients help and which products do you recommend?")

You: I have rosacea, what ingredients help and which products do you recommend?
✓ Rosacea

Analyzing...

Skincare Expert:

I'm so sorry to hear that you're dealing with rosacea! I understand how challenging it can be to find the right products for your skin condition. When it comes to managing rosacea, there are several key ingredients that can be beneficial.
1. Niacinamide: This vitamin B3 derivative helps to reduce redness and inflammation while improving the overall appearance of the skin. Look for products containing at least 2% niacinamide. One recommendation would be The Ordinary Niacinamide 10% + Zinc 1%.
2. Zinc Oxide: A physical sunscreen with zinc oxide provides broad-spectrum UVA/UVB protection and helps control inflammation. La Roche-Posay Anthelios Mineral Tinted Ultra-Light Fluid SPF 50 is a great option.
3. Ceramides: These naturally occurring lipids help strengthen the skin barrier and maintain moisture levels. They can be especially helpful for those with sensitive or 

In [None]:
#TEST 5: Combinations
agent.chat("Clean beauty vitamin C serum for dullness, fragrance free, under $60")

You: Clean beauty vitamin C serum for dullness, fragrance free, under $60
✓ Budget: $60 | Dullness | Ingredients: vitamin c | Fragrance Free, Clean Beauty

Analyzing...

Skincare Expert:

Based on your preferences for a clean beauty vitamin C serum that targets dullness, is fragrance-free, and costs under $60, here are three options I'd recommend:
1. Farmacy 10% Waterless Vitamin C Serum: This serum contains 10% pure vitamin C and other antioxidants like ferulic acid and vitamin E. It's waterless, meaning it preserves the potency of the vitamin C and other actives. Customers report that it helps reduce the appearance of dark spots and evens out skin tone. The texture is creamy and non-greasy, making it suitable for all skin types.
2. Paula's Choice 10% Niacinamide Booster: Although it's not exclusively a vitamin C serum, niacinamide (vitamin B3) complements the effects of vitamin C. This booster is fragrance-free and costs under $60. It improves the overall appearance of the skin, incl

**GRADIO UI Code**

In [None]:
!pip install gradio -q

import gradio as gr
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import time
import traceback

# Chat function with timeout
def chat_fn(message, history):
    """Chat with 60s timeout"""
    try:
        print(f"\n[USER] {message}")
        start = time.time()

        # Call agent with timeout
        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(agent.chat, message, False)
            try:
                response = future.result(timeout=60)
                elapsed = time.time() - start
                print(f"[AGENT] ✅ {elapsed:.1f}s")
                return history + [[message, response]]
            except TimeoutError:
                return history + [[message, "⏱️ Timeout (>60s). Try simpler question."]]
    except Exception as e:
        print(f"[ERROR] {str(e)}")
        traceback.print_exc()
        return history + [[message, f"❌ Error: {str(e)[:200]}"]]

def clear_fn():
    agent.clear_history()
    return []

# Create UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🫧SkinWise
    ### Your Personal Skincare Expert Powered by Medical Knowledge, Ingredient Science & Customer Reviews.
    #### Note: Always read product labels for allergy awareness and consult medical professionals for chronic skin issues.

    **Note:** Responses might take some time (Thank you for your patience)
    """)

    chatbot = gr.Chatbot(height=500, label="Chat")

    with gr.Row():
        msg = gr.Textbox(
            label="Your Question",
            placeholder="e.g., 'What is retinol?'",
            scale=4,
            lines=2
        )

    with gr.Row():
        submit = gr.Button("💬 Send", variant="primary", scale=2)
        clear = gr.Button("🗑️ Clear", scale=1)

    gr.Examples(
        examples=[
            "What is retinol?",
            "Show me a moisturizer under $50",
            "What helps with acne?",
            "Recommend vegan products"
        ],
        inputs=msg
    )

    gr.Markdown("""
    ### 📊 System Info
    - **Model:** Mistral-7B (4-bit quantized on GPU)
    - **Knowledge:** 2,420 products + 1,521 medical Q&As
    - **Search:** FAISS vector database
    - **Speed:** 3-5 seconds per response on GPU ⚡
    """)

    # Event handlers
    submit.click(chat_fn, [msg, chatbot], chatbot).then(lambda: "", None, msg)
    msg.submit(chat_fn, [msg, chatbot], chatbot).then(lambda: "", None, msg)
    clear.click(clear_fn, None, chatbot)

# Launch
print("Launching Gradio...")
demo.queue(max_size=10)
demo.launch(share=True, debug=True)

print("\nRUNNING!")
print("Public URL valid for 72 hours")
print("GPU-powered responses!")

Launching Gradio...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f3fdce2dd4aa989555.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


**Converting the PKL File to CSV for HuggingFace**

In [32]:
import pandas as pd
import pickle
from google.colab import files

print("PICKLE TO CSV CONVERTER")

# Step 1: Load your pickle
print("\n[1] Loading my reviews_prod_lvl.pkl...")

# Mount drive if needed
try:
    from google.colab import drive
    drive.mount('/content/drive')

    # Update this path to where your pickle is!
    PICKLE_PATH = '/content/drive/MyDrive/AML_Final_Project/reviews_prod_lvl.pkl'

    with open(PICKLE_PATH, 'rb') as f:
        sentiment_df = pickle.load(f)

    print(f"✓ Loaded pickle successfully!")
    print(f"✓ Shape: {sentiment_df.shape}")
    print(f"✓ Columns: {list(sentiment_df.columns)}")

except Exception as e:
    print(f"Error loading from Drive: {e}")
    print("\nTrying to load from current directory...")

    # Try current directory
    with open('reviews_prod_lvl.pkl', 'rb') as f:
        sentiment_df = pickle.load(f)

    print(f"✓ Loaded pickle successfully!")
    print(f"✓ Shape: {sentiment_df.shape}")
    print(f"✓ Columns: {list(sentiment_df.columns)}")

# Step 2: Preview the data
print("\n[2] Preview of your sentiment data:")
print(sentiment_df.head())

# Step 3: Save as CSV
print("\n[3] Saving as CSV...")

csv_filename = 'sentiment_analysis.csv'

# Save with only the columns you need
sentiment_df[['product_id', 'avg_rating', 'total_reviews',
              'predicted_sentiment', 'positive_rating_pct']].to_csv(
    csv_filename,
    index=False
)

print(f"✓ Saved as {csv_filename}")

# Step 4: Download
print("\n[4] Downloading...")
files.download(csv_filename)

print("Done")

PICKLE TO CSV CONVERTER

[1] Loading my reviews_prod_lvl.pkl...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Loaded pickle successfully!
✓ Shape: (2333, 15)
✓ Columns: ['product_id', 'product_name', 'total_reviews', 'avg_rating', 'positive_rating_count', 'neutral_rating_count', 'negative_rating_count', 'positive_rating_pct', 'neutral_rating_pct', 'negative_rating_pct', 'dominant_rating_sentiment', 'predicted_sentiment', 'predicted_sentiment_score', 'avg_review_quality', 'review_sample']

[2] Preview of your sentiment data:
    product_id                                       product_name  \
387    P420652  Lip Sleeping Mask Intense Hydration with Vitam...   
484    P427421  Protini Polypeptide Firming Refillable Moistur...   
931    P450271  Green Clean Makeup Meltaway Cleansing Balm Lim...   
359    P417238         Green Clean Makeup Removing Cleansing Balm   
60     P269122          Alpha Beta Extra

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Done


METRICS

In [None]:
import time
import pandas as pd
from collections import defaultdict

class AgentEvaluator:
    """Comprehensive evaluation suite for the Skincare Agent"""

    def __init__(self, agent):
        self.agent = agent
        self.results = {
            'intent_extraction': [],
            'filter_accuracy': [],
            'response_times': [],
            'product_relevance': [],
            'query_types': defaultdict(list)
        }

    def test_intent_extraction(self):
        """Test intent extraction accuracy"""

        print("TEST 1: INTENT EXTRACTION ACCURACY")

        test_cases = [
            {
                'query': 'Show me vegan products under $50',
                'expected': {
                    'budget': 50,
                    'preferences': {'vegan': True},
                    'has_filters': True
                }
            },
            {
                'query': 'I need retinol for anti-aging',
                'expected': {
                    'ingredients': ['retinol'],
                    'concerns': ['anti-aging', 'anti aging', 'aging'],
                    'has_filters': True
                }
            },
            {
                'query': 'Cruelty-free moisturizer for dry skin under $40',
                'expected': {
                    'budget': 40,
                    'preferences': {'cruelty_free': True},
                    'category': 'moisturizer',
                    'has_filters': True
                }
            },
            {
                'query': 'What causes acne?',
                'expected': {
                    'is_medical': True,
                    'concerns': ['acne']
                }
            },
            {
                'query': 'Products with niacinamide and positive reviews',
                'expected': {
                    'ingredients': ['niacinamide'],
                    'has_filters': True
                }
            }
        ]

        correct = 0
        total = len(test_cases)

        for i, test in enumerate(test_cases, 1):
            query = test['query']
            expected = test['expected']

            # Extract intent
            intent = self.agent.intent_extractor.analyze(query)

            # Check accuracy
            is_correct = True
            details = []

            # Check budget
            if 'budget' in expected:
                if intent.get('budget') == expected['budget']:
                    details.append(f"✓ Budget: ${expected['budget']}")
                else:
                    details.append(f"✗ Budget: Expected ${expected['budget']}, Got {intent.get('budget')}")
                    is_correct = False

            # Check preferences
            if 'preferences' in expected:
                extracted_prefs = intent.get('preferences', {})
                for pref, value in expected['preferences'].items():
                    if extracted_prefs.get(pref) == value:
                        details.append(f"✓ Preference: {pref}")
                    else:
                        details.append(f"✗ Preference: {pref} not detected")
                        is_correct = False

            # Check ingredients
            if 'ingredients' in expected:
                extracted_ing = intent.get('ingredients', [])
                if any(ing in extracted_ing for ing in expected['ingredients']):
                    details.append(f"✓ Ingredients: {expected['ingredients']}")
                else:
                    details.append(f"✗ Ingredients: Expected {expected['ingredients']}, Got {extracted_ing}")
                    is_correct = False

            # Check medical query
            if 'is_medical' in expected:
                if intent.get('is_medical') == expected['is_medical']:
                    details.append(f"✓ Medical query detected")
                else:
                    details.append(f"✗ Medical query not detected")
                    is_correct = False

            if is_correct:
                correct += 1

            print(f"\nTest {i}: {query}")
            for detail in details:
                print(f"  {detail}")
            print(f"  Result: {'PASS ✓' if is_correct else 'FAIL ✗'}")

            self.results['intent_extraction'].append({
                'query': query,
                'correct': is_correct,
                'details': details
            })

        accuracy = (correct / total) * 100
        print(f"INTENT EXTRACTION ACCURACY: {correct}/{total} = {accuracy:.1f}%")

        return accuracy

    def test_filter_accuracy(self):
        """Test product filtering accuracy"""

        print("TEST 2: FILTER ACCURACY")

        test_cases = [
            {
                'query': 'vegan products under $30',
                'checks': [
                    ('budget', lambda p: p.metadata.get('price', 999) <= 30),
                    ('vegan', lambda p: p.metadata.get('vegan') == True)
                ]
            },
            {
                'query': 'cruelty-free products',
                'checks': [
                    ('cruelty_free', lambda p: p.metadata.get('cruelty_free') == True)
                ]
            },
            {
                'query': 'fragrance-free under $50',
                'checks': [
                    ('budget', lambda p: p.metadata.get('price', 999) <= 50),
                    ('fragrance_free', lambda p: p.metadata.get('fragrance_free') == True)
                ]
            }
        ]

        total_checks = 0
        passed_checks = 0

        for i, test in enumerate(test_cases, 1):
            query = test['query']
            checks = test['checks']

            print(f"\nTest {i}: {query}")

            # Get agent's filtered results
            intent = self.agent.intent_extractor.analyze(query)
            products = self.agent._search_with_filters(
                query,
                budget=intent.get('budget'),
                preferences=intent.get('preferences', {})
            )

            product_products = [p for p in products if p.metadata.get('type') == 'product']

            if not product_products:
                print("  ✗ No products returned")
                total_checks += len(checks)
                continue

            # Check each filter criterion
            for check_name, check_func in checks:
                passing = sum(1 for p in product_products if check_func(p))
                total_products = len(product_products)
                check_accuracy = (passing / total_products) * 100

                if check_accuracy >= 80:  # 80% threshold
                    print(f"  ✓ {check_name}: {passing}/{total_products} products ({check_accuracy:.0f}%)")
                    passed_checks += 1
                else:
                    print(f"  ✗ {check_name}: {passing}/{total_products} products ({check_accuracy:.0f}%)")

                total_checks += 1

        accuracy = (passed_checks / total_checks) * 100 if total_checks > 0 else 0

        print(f"FILTER ACCURACY: {passed_checks}/{total_checks} = {accuracy:.1f}%")

        return accuracy

    def test_response_time(self):
        """Test response time performance"""

        print("TEST 3: RESPONSE TIME PERFORMANCE")

        test_queries = [
            "What is retinol?",
            "Show me vegan products under $50",
            "Moisturizer for dry skin",
            "What causes acne?",
            "Products with vitamin C",
            "Cruelty-free sunscreen",
            "Anti-aging serum with positive reviews",
            "Fragrance-free products for sensitive skin"
        ]

        times = []

        print("\nMeasuring response times...\n")

        for i, query in enumerate(test_queries, 1):
            start_time = time.time()
            response = self.agent.chat(query, show_details=False)
            end_time = time.time()

            response_time = end_time - start_time
            times.append(response_time)

            print(f"{i}. {query[:40]:40} | {response_time:.2f}s")

            self.results['response_times'].append({
                'query': query,
                'time': response_time
            })

        # Calculate statistics
        avg_time = sum(times) / len(times)
        min_time = min(times)
        max_time = max(times)

        print(f"RESPONSE TIME STATISTICS:")
        print(f"  Average: {avg_time:.2f}s")
        print(f"  Minimum: {min_time:.2f}s")
        print(f"  Maximum: {max_time:.2f}s")
        print(f"  Target: <5s (GPU) or <30s (CPU)")

        # Performance grade
        if avg_time < 5:
            grade = "EXCELLENT (GPU-level)"
        elif avg_time < 15:
            grade = "GOOD"
        elif avg_time < 30:
            grade = "ACCEPTABLE (CPU)"
        else:
            grade = "NEEDS OPTIMIZATION"

        print(f"  Grade: {grade}")

        return avg_time, min_time, max_time

    def test_query_coverage(self):
        """Test coverage across different query types"""

        print("TEST 4: QUERY TYPE COVERAGE")

        test_queries = {
            'Product Search': [
                "Show me moisturizers",
                "Vegan sunscreen under $30",
                "Products for acne-prone skin"
            ],
            'Ingredient Education': [
                "What is retinol?",
                "Tell me about vitamin C",
                "What does niacinamide do?"
            ],
            'Medical Knowledge': [
                "What causes acne?",
                "Symptoms of eczema",
                "Treatment for rosacea"
            ],
            'Complex Queries': [
                "I have dry sensitive skin, recommend products with positive reviews under $50",
                "Vegan anti-aging products with retinol",
                "Cruelty-free products for acne with good reviews"
            ]
        }

        coverage_results = {}

        for query_type, queries in test_queries.items():
            print(f"\n{query_type}:")
            successful = 0

            for query in queries:
                try:
                    response = self.agent.chat(query, show_details=False)

                    # Check if response is meaningful (not empty or error)
                    if response and len(response) > 50:
                        print(f"  ✓ {query[:50]}")
                        successful += 1
                    else:
                        print(f"  ✗ {query[:50]} (insufficient response)")
                except Exception as e:
                    print(f"  ✗ {query[:50]} (error: {str(e)[:30]})")

            coverage = (successful / len(queries)) * 100
            coverage_results[query_type] = coverage
            print(f"  Coverage: {successful}/{len(queries)} = {coverage:.0f}%")

        avg_coverage = sum(coverage_results.values()) / len(coverage_results)

        print(f"OVERALL QUERY COVERAGE: {avg_coverage:.1f}%")

        return coverage_results

    def generate_report(self):
        """Generate comprehensive evaluation report"""

        print("RUNNING COMPREHENSIVE EVALUATION")

        # Run all tests
        intent_accuracy = self.test_intent_extraction()
        filter_accuracy = self.test_filter_accuracy()
        avg_time, min_time, max_time = self.test_response_time()
        coverage = self.test_query_coverage()

        # Generate summary report
        print("FINAL EVALUATION REPORT")

        print(f"\nACCURACY METRICS:")
        print(f"  Intent Extraction: {intent_accuracy:.1f}%")
        print(f"  Filter Accuracy: {filter_accuracy:.1f}%")
        print(f"  Average Coverage: {sum(coverage.values())/len(coverage):.1f}%")

        overall_accuracy = (intent_accuracy + filter_accuracy + sum(coverage.values())/len(coverage)) / 3
        print(f"  OVERALL ACCURACY: {overall_accuracy:.1f}%")

        print(f"\n⚡ PERFORMANCE METRICS:")
        print(f"  Average Response Time: {avg_time:.2f}s")
        print(f"  Min Response Time: {min_time:.2f}s")
        print(f"  Max Response Time: {max_time:.2f}s")

        print(f"\nKNOWLEDGE BASE:")
        print(f"  Total Documents: 12,237")
        print(f"  Products: 2,420 (with 50+ features)")
        print(f"  Medical Q&As: 9,098")
        print(f"  Ingredients: 719")
        print(f"  Sentiment Coverage: 2,333 products")

        print(f"\nSYSTEM GRADE:")
        if overall_accuracy >= 90:
            grade = "A (EXCELLENT)"
        elif overall_accuracy >= 80:
            grade = "B (GOOD)"
        elif overall_accuracy >= 70:
            grade = "C (SATISFACTORY)"
        else:
            grade = "D (NEEDS IMPROVEMENT)"

        print(f"  {grade} - {overall_accuracy:.1f}% Overall Accuracy")

        print("\n" + "=" * 70 + "\n")

        return {
            'intent_accuracy': intent_accuracy,
            'filter_accuracy': filter_accuracy,
            'overall_accuracy': overall_accuracy,
            'avg_response_time': avg_time,
            'min_response_time': min_time,
            'max_response_time': max_time,
            'coverage': coverage,
            'grade': grade
        }

print("Starting Agent Evaluation...\n")

evaluator = AgentEvaluator(agent)
results = evaluator.generate_report()

print("Evaluation Complete! Use results for your report.")

Starting Agent Evaluation...

RUNNING COMPREHENSIVE EVALUATION
TEST 1: INTENT EXTRACTION ACCURACY

Test 1: Show me vegan products under $50
  ✓ Budget: $50
  ✓ Preference: vegan
  Result: PASS ✓

Test 2: I need retinol for anti-aging
  ✓ Ingredients: ['retinol']
  Result: PASS ✓

Test 3: Cruelty-free moisturizer for dry skin under $40
  ✓ Budget: $40
  ✓ Preference: cruelty_free
  Result: PASS ✓

Test 4: What causes acne?
  ✓ Medical query detected
  Result: PASS ✓

Test 5: Products with niacinamide and positive reviews
  ✓ Ingredients: ['niacinamide']
  Result: PASS ✓
INTENT EXTRACTION ACCURACY: 5/5 = 100.0%
TEST 2: FILTER ACCURACY

Test 1: vegan products under $30
  ✓ budget: 5/5 products (100%)
  ✓ vegan: 5/5 products (100%)

Test 2: cruelty-free products
  ✓ cruelty_free: 5/5 products (100%)

Test 3: fragrance-free under $50
  ✓ budget: 5/5 products (100%)
  ✓ fragrance_free: 4/5 products (80%)
FILTER ACCURACY: 5/5 = 100.0%
TEST 3: RESPONSE TIME PERFORMANCE

Measuring response time

In [None]:
import pandas as pd
import numpy as np

print("DATASET STATISTICS & COVERAGE ANALYSIS")

# Product dataset statistics
print("\n1. PRODUCT DATASET:")
print(f"  Total products: {len(skincare):,}")
print(f"  With sentiment data: {skincare['predicted_sentiment'].notna().sum():,}")
print(f"  With reviews: {(skincare['total_reviews'] > 0).sum():,}")
print(f"  Without reviews: {(skincare['total_reviews'] == 0).sum():,}")

# Price range
print(f"\n  Price Statistics:")
print(f"    Min: ${skincare['price_usd'].min():.2f}")
print(f"    Max: ${skincare['price_usd'].max():.2f}")
print(f"    Mean: ${skincare['price_usd'].mean():.2f}")
print(f"    Median: ${skincare['price_usd'].median():.2f}")

# Feature coverage
print(f"\n2. FEATURE EXTRACTION COVERAGE:")
features_to_check = {
    'Skin Concerns': ['dullness', 'dryness', 'anti_aging', 'acne', 'dark_spots', 'redness'],
    'Active Ingredients': ['hyaluronic_acid', 'niacinamide', 'retinol', 'salicylic_acid'],
    'Preferences': ['vegan', 'cruelty_free', 'fragrance_free', 'clean_at_sephora'],
    'Skin Types': ['dry_skin', 'oily_skin', 'combination_skin', 'normal_skin', 'sensitive_skin']
}

for category, features in features_to_check.items():
    print(f"\n  {category}:")
    for feature in features:
        if feature in skincare.columns:
            count = (skincare[feature] == 1).sum()
            pct = (count / len(skincare)) * 100
            print(f"    {feature.replace('_', ' ').title():25} {count:4} products ({pct:5.1f}%)")

# Sentiment distribution
print(f"\n3. SENTIMENT ANALYSIS:")
if 'predicted_sentiment' in skincare.columns:
    sentiment_counts = skincare['predicted_sentiment'].value_counts()
    total = sentiment_counts.sum()

    for sentiment, count in sentiment_counts.items():
        pct = (count / total) * 100
        emoji = {"positive": "😊", "neutral": "😐", "negative": "😕"}.get(sentiment, "")
        print(f"  {emoji} {sentiment.capitalize():10} {count:4} products ({pct:5.1f}%)")

# Medical knowledge
print(f"\n4. MEDICAL KNOWLEDGE BASE:")
print(f"  Total Q&As: {len(medical_df):,}")
print(f"  Categories:")
for category in medical_df['category'].unique():
    count = (medical_df['category'] == category).sum()
    pct = (count / len(medical_df)) * 100
    print(f"    {category:15} {count:4} ({pct:5.1f}%)")

print(f"\n  Conditions covered: {medical_df['condition'].nunique()}")
print(f"  Top conditions:")
top_conditions = medical_df['condition'].value_counts().head(5)
for condition, count in top_conditions.items():
    print(f"    {condition.title():20} {count:4} Q&As")

# Ingredient database
print(f"\n5. INGREDIENT DATABASE:")
print(f"  Total ingredients: {len(ingredient_df):,}")
print(f"  With definitions: {ingredient_df['what_is_it'].notna().sum():,}")
print(f"  With benefits: {ingredient_df['what_does_it_do'].notna().sum():,}")
print(f"  Coverage: {(ingredient_df['what_is_it'].notna().sum() / len(ingredient_df)) * 100:.1f}%")

# Vector store statistics
print(f"\n6. VECTOR STORE:")
print(f"  Total documents: 12,237")
print(f"    Products: 2,420")
print(f"    Medical: 9,098")
print(f"    Ingredients: 719")
print(f"  Embedding model: BAAI/bge-large-en-v1.5 (1024 dimensions)")
print(f"  Vector store: FAISS")

DATASET STATISTICS & COVERAGE ANALYSIS

1. PRODUCT DATASET:
  Total products: 2,420
  With sentiment data: 2,333
  With reviews: 2,333
  Without reviews: 87

  Price Statistics:
    Min: $3.00
    Max: $1900.00
    Mean: $60.51
    Median: $44.00

2. FEATURE EXTRACTION COVERAGE:

  Skin Concerns:
    Dullness                   744 products ( 30.7%)
    Dryness                    603 products ( 24.9%)
    Anti Aging                 574 products ( 23.7%)
    Acne                       202 products (  8.3%)
    Dark Spots                 162 products (  6.7%)
    Redness                    137 products (  5.7%)

  Active Ingredients:
    Hyaluronic Acid            393 products ( 16.2%)
    Niacinamide                139 products (  5.7%)
    Retinol                     86 products (  3.6%)
    Salicylic Acid             129 products (  5.3%)

  Preferences:
    Vegan                      689 products ( 28.5%)
    Cruelty Free               579 products ( 23.9%)
    Fragrance Free         

In [None]:
import matplotlib.pyplot as plt
from collections import Counter


print("USER EXPERIENCE & QUALITY METRICS")


# Test with sample user queries
sample_queries = [
    "What is retinol?",
    "Show me vegan products under $50",
    "I have acne, what helps?",
    "Moisturizer for dry skin",
    "Products with vitamin C and positive reviews",
    "Cruelty-free anti-aging serum",
    "What causes eczema?",
    "Hyaluronic acid benefits",
    "Fragrance-free for sensitive skin under $40",
    "Best products for redness"
]

print("\n1. RESPONSE QUALITY ASSESSMENT:\n")

quality_scores = []

for i, query in enumerate(sample_queries, 1):
    print(f"Query {i}: {query}")

    # Get response
    start = time.time()
    response = agent.chat(query, show_details=False)
    response_time = time.time() - start

    # Quality checks
    checks = {
        'Has content': len(response) > 100,
        'Provides products': 'recommended products' in response.lower() or '$' in response,
        'Educational': any(word in response.lower() for word in ['benefits', 'helps', 'causes', 'what']),
        'Response time < 30s': response_time < 30
    }

    passed = sum(checks.values())
    score = (passed / len(checks)) * 100
    quality_scores.append(score)

    print(f"  Response time: {response_time:.2f}s")
    print(f"  Quality score: {score:.0f}%")
    print(f"  Checks: {passed}/{len(checks)} passed")
    print()

avg_quality = sum(quality_scores) / len(quality_scores)

print(f"AVERAGE RESPONSE QUALITY: {avg_quality:.1f}%")

# Product recommendation quality
print(f"\n2. PRODUCT RECOMMENDATION QUALITY:\n")

recommendation_queries = [
    ("vegan under $30", "vegan", 30),
    ("cruelty-free moisturizer", "cruelty_free", None),
    ("retinol for anti-aging", "retinol", None),
    ("fragrance-free under $50", "fragrance_free", 50)
]

rec_scores = []

for query, filter_type, budget in recommendation_queries:
    print(f"Query: {query}")

    intent = agent.intent_extractor.analyze(query)
    products = agent._search_with_filters(
        query,
        budget=intent.get('budget') or budget,
        preferences=intent.get('preferences', {})
    )

    product_results = [p for p in products if p.metadata.get('type') == 'product']

    if product_results:
        # Check recommendation quality
        relevant = 0
        for p in product_results:
            is_relevant = True

            # Check filter
            if filter_type == 'retinol':
                is_relevant = 'retinol' in p.metadata.get('actives', [])
            elif budget:
                is_relevant = p.metadata.get('price', 999) <= budget
            else:
                is_relevant = p.metadata.get(filter_type, False)

            if is_relevant:
                relevant += 1

        score = (relevant / len(product_results)) * 100
        rec_scores.append(score)

        print(f"  Products returned: {len(product_results)}")
        print(f"  Relevant: {relevant}/{len(product_results)} ({score:.0f}%)")
    else:
        print(f"  No products returned")
        rec_scores.append(0)

    print()

if rec_scores:
    avg_rec_quality = sum(rec_scores) / len(rec_scores)
    print(f"RECOMMENDATION ACCURACY: {avg_rec_quality:.1f}%")

# Summary for report
print(f"SUMMARY STATISTICS FOR REPORT")

print(f"ACCURACY METRICS:")
print(f"  Overall System Accuracy: {results['overall_accuracy']:.1f}%")
print(f"  Intent Extraction: {results['intent_accuracy']:.1f}%")
print(f"  Filter Accuracy: {results['filter_accuracy']:.1f}%")
print(f"  Recommendation Quality: {avg_rec_quality:.1f}%")
print(f"  Response Quality: {avg_quality:.1f}%")

print(f"\nPERFORMANCE METRICS:")
print(f"  Average Response Time: {results['avg_response_time']:.2f}s")
print(f"  Min Response Time: {results['min_response_time']:.2f}s")
print(f"  Max Response Time: {results['max_response_time']:.2f}s")

print(f"\nKNOWLEDGE BASE:")
print(f"  Total Documents: 12,237")
print(f"  Products with Features: 2,420 (50+ attributes each)")
print(f"  Medical Q&As: 9,098")
print(f"  Ingredients: 719")
print(f"  Sentiment Coverage: 2,333 products")

print(f"\nSYSTEM GRADE: {results['grade']}")


USER EXPERIENCE & QUALITY METRICS

1. RESPONSE QUALITY ASSESSMENT:

Query 1: What is retinol?
  Response time: 115.23s
  Quality score: 50%
  Checks: 2/4 passed

Query 2: Show me vegan products under $50
  Response time: 279.11s
  Quality score: 50%
  Checks: 2/4 passed

Query 3: I have acne, what helps?
  Response time: 120.02s
  Quality score: 50%
  Checks: 2/4 passed

Query 4: Moisturizer for dry skin
  Response time: 270.11s
  Quality score: 25%
  Checks: 1/4 passed

Query 5: Products with vitamin C and positive reviews
  Response time: 276.78s
  Quality score: 50%
  Checks: 2/4 passed

Query 6: Cruelty-free anti-aging serum
  Response time: 252.21s
  Quality score: 25%
  Checks: 1/4 passed

Query 7: What causes eczema?
  Response time: 128.91s
  Quality score: 25%
  Checks: 1/4 passed

Query 8: Hyaluronic acid benefits
  Response time: 291.93s
  Quality score: 75%
  Checks: 3/4 passed

Query 9: Fragrance-free for sensitive skin under $40
  Response time: 328.29s
  Quality score: 7

**Creating vector store zip for huggingface repo (Done only once)**

In [2]:
# Install packages
!pip install -q langchain-community langchain-huggingface sentence-transformers faiss-cpu pandas kaggle

import pandas as pdN
import pickle
import os
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Set your Drive path (UPDATE THIS to match your folder!)
DRIVE_PATH = '/content/drive/MyDrive/AML_Final_Project/'

# Verify files exist
print("Checking for required files...")
required_files = ['medical_info.csv', 'ingredient_list_final.csv', 'reviews_prod_lvl.pkl']
for file in required_files:
    full_path = DRIVE_PATH + file
    if os.path.exists(full_path):
        print(f"  Found {file}")
    else:
        print(f"  Missing {file} at {full_path}")
        print(f"\nPlease upload {file} to {DRIVE_PATH}")
        raise FileNotFoundError(f"{file} not found")

print("\nLoading data from your Drive...")

# Load all your data
medical_df = pd.read_csv(DRIVE_PATH + 'medical_info.csv')
ingredient_df = pd.read_csv(DRIVE_PATH + 'ingredient_list_final.csv')
with open(DRIVE_PATH + 'reviews_prod_lvl.pkl', 'rb') as f:
    sentiment_df = pickle.load(f)

print(f"Loaded {len(medical_df)} medical, {len(ingredient_df)} ingredients, {len(sentiment_df)} sentiment")

# Setup Kaggle credentials
print("\nSetting up Kaggle...")

try:
    from google.colab import userdata
    KAGGLE_USERNAME = userdata.get('KAGGLE_USERNAME')
    KAGGLE_KEY = userdata.get('KAGGLE_KEY')

    # Setup Kaggle
    import json
    os.makedirs('/root/.kaggle', exist_ok=True)
    with open('/root/.kaggle/kaggle.json', 'w') as f:
        json.dump({"username": KAGGLE_USERNAME, "key": KAGGLE_KEY}, f)
    os.chmod('/root/.kaggle/kaggle.json', 0o600)
    print("✓ Kaggle credentials configured!")

except Exception as e:
    print(f"Error: {e}")
    print("\nPlease set up Kaggle credentials in Colab Secrets (🔑 icon)")
    raise

# Download Kaggle dataset
print("\nDownloading Kaggle products dataset...")
!kaggle datasets download -d nadyinky/sephora-products-and-skincare-reviews -q
!unzip -q sephora-products-and-skincare-reviews.zip

products = pd.read_csv('product_info.csv', low_memory=False)
skincare = products[products['primary_category'] == 'Skincare'].copy()

print(f"Loaded {len(medical_df)} medical, {len(ingredient_df)} ingredients, {len(skincare)} products")

# Create simplified documents (faster!)
all_docs = []

# Medical (simple)
for _, row in medical_df.iterrows():
    all_docs.append(Document(
        page_content=f"Q: {row['instruction']}\nA: {row['output']}",
        metadata={'type': 'medical', 'condition': row['condition']}
    ))

# Ingredients (simple)
for _, row in ingredient_df.iterrows():
    content = f"Ingredient: {row['name']}"
    if pd.notna(row.get('what_does_it_do')):
        content += f"\n{row['what_does_it_do']}"
    all_docs.append(Document(
        page_content=content,
        metadata={'type': 'ingredient', 'name': row['name']}
    ))

# Products (simple)
skincare_with_sentiment = skincare.merge(sentiment_df[['product_id', 'avg_rating', 'predicted_sentiment']],
                                          on='product_id', how='left')

for _, row in skincare_with_sentiment.iterrows():
    content = f"{row['brand_name']} {row['product_name']} - ${row['price_usd']:.2f}"
    if pd.notna(row.get('description')):
        content += f"\n{row['description'][:200]}"

    all_docs.append(Document(
        page_content=content,
        metadata={
            'type': 'product',
            'product_id': row['product_id'],
            'brand': row['brand_name'],
            'name': row['product_name'],
            'price': float(row['price_usd'])
        }
    ))

print(f"Created {len(all_docs):,} documents")

# Create embeddings (uses Colab GPU if available!)
print("Creating embeddings...")
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={'device': 'cuda'},  # Use GPU in Colab!
    encode_kwargs={'normalize_embeddings': True}
)

# Create vector store
print("Creating FAISS vector store (5-10 min with GPU)...")
vectorstore = FAISS.from_documents(all_docs, embeddings)

# Save it
vectorstore.save_local("faiss_index")
print("Vector store saved to faiss_index/")

# Zip it for easy download
!zip -r faiss_index.zip faiss_index/
print("\nDONE! Download faiss_index.zip and upload to HuggingFace Space")

from google.colab import files
files.download('faiss_index.zip')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checking for required files...
  Found medical_info.csv
  Found ingredient_list_final.csv
  Found reviews_prod_lvl.pkl

Loading data from your Drive...
Loaded 1521 medical, 248 ingredients, 2333 sentiment

Setting up Kaggle...
✓ Kaggle credentials configured!

Downloading Kaggle products dataset...
Dataset URL: https://www.kaggle.com/datasets/nadyinky/sephora-products-and-skincare-reviews
License(s): Attribution 4.0 International (CC BY 4.0)
replace product_info.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Loaded 1521 medical, 248 ingredients, 2420 products
Created 4,189 documents
Creating embeddings...
Creating FAISS vector store (5-10 min with GPU)...
Vector store saved to faiss_index/
updating: faiss_index/ (stored 0%)
updating: faiss_index/index.faiss (deflated 7%)
updating: faiss_index/index.pkl (deflated 74%)

DONE! Download faiss_index.zip and upload t

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>