In [2]:
import os
import pandas as pd
from pydantic import BaseModel
from openai import OpenAI
import dotenv
import numpy as np

dotenv.load_dotenv()



client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

df_products = pd.read_json('../data/processed_data_with_embeddings.jsonl', orient='records', lines=True)    

In [5]:
class QueryList(BaseModel):
    queries: list[str]

def expand_query(query):
        response = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": """
                                You are a helpful assistant that expands queries. 
                                Based on the user query, you will expand the query to include more relevant products. 
                                Your output will be used to do a similarity search on a product database.
                                Return a list of expanded queries with only three queries.
                                """,
                },
                {"role": "user", "content": query},
            ],
            response_format=QueryList,
        )
        return response.choices[0].message.parsed

In [6]:
original_query = "I am looking for an outfit for a wedding for me. I am a groomsman. Adult size."

queries = expand_query(original_query).queries

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [7]:
def get_embeddings(input):
    print(input)
    response = client.embeddings.create(input=input, model=os.getenv("EMBEDDING_MODEL"))
    return [data.embedding for data in response.data]

embeddings = get_embeddings(queries)

NameError: name 'queries' is not defined

In [67]:
def find_similar_products(
    query_embeddings: list[list[float]], top_k: int = 5
):
    product_embeddings = np.array(list(df_products["embedding"].apply(lambda x: list(x))))
    # Calculate cosine similarity between query embeddings and product embeddings
    similarities = np.dot(query_embeddings, product_embeddings.T)

    # Get the top 5 most similar products for each query
    top_indices = np.argsort(similarities, axis=1)[:, -1 * top_k :]

    top_indices = top_indices.ravel()

    # Get the products for the top indices
    products = df_products.iloc[top_indices]

    return products

In [40]:
product_results = find_similar_products(embeddings)

In [68]:
# Filter out products with average rating less than 3
def filter_products(products):
  return products[products["average_rating"] >= 3]

product_results = filter_products(product_results)

In [69]:
product_results

Unnamed: 0,title,average_rating,rating_number,features,price,store,thumbnail,embedding
799,Epoint Men's Fashion Marriage Paisley Microfib...,4.1,37,"[100% Woven Microfiber, Drawstring closure, Th...",24.99,Epoint,https://m.media-amazon.com/images/I/41cFdjEAtm...,"[0.00313825, 0.0041489243000000006, -0.0520087..."
3537,"Black Bow Tie, Cummerbund and Cufflinks & Stud...",4.5,5,"[Satin, Drawstring closure, HIGH QUALITY SOFT ...",33.99,S.H. Churchill & Co.,https://m.media-amazon.com/images/I/318KU0C+oY...,"[0.0157968514, 0.0562614575, -0.0178959817, 0...."
3437,Little Gents Boys Suits Dark Grey - Kids Suit ...,4.0,7,"[Cotton Blend, Button closure, YOUR SON WILL L...",46.99,Little Gents,https://m.media-amazon.com/images/I/3193Wv4k57...,"[-0.0163080767, 0.0172005594, -0.0390055142, -..."
620,Tailorsun Tweed Vest Vintage Rustic Wedding Ve...,4.2,25,"[Imported, Button closure, Sizes are now corre...",29.0,Tailorsun,https://m.media-amazon.com/images/I/51L65WniXC...,"[-0.012405319100000001, 0.0764714554, -0.04585..."
620,Tailorsun Tweed Vest Vintage Rustic Wedding Ve...,4.2,25,"[Imported, Button closure, Sizes are now corre...",29.0,Tailorsun,https://m.media-amazon.com/images/I/51L65WniXC...,"[-0.012405319100000001, 0.0764714554, -0.04585..."
717,Epoint Men's Fashion Paisley Microfiber Dress ...,4.4,156,"[100% Woven Microfiber, Drawstring closure, Th...",22.93,Epoint,https://m.media-amazon.com/images/I/41pMZ8C9EJ...,"[0.009126815, -0.0051772515000000005, -0.05662..."
799,Epoint Men's Fashion Marriage Paisley Microfib...,4.1,37,"[100% Woven Microfiber, Drawstring closure, Th...",24.99,Epoint,https://m.media-amazon.com/images/I/41cFdjEAtm...,"[0.00313825, 0.0041489243000000006, -0.0520087..."
3479,Sportoli Men's Formal Satin Adjustable Solid P...,3.9,12,[Satin],6.79,Sportoli,https://m.media-amazon.com/images/I/31fGBJrN3e...,"[-0.0165207591, 0.0515770055, -0.0199793726, 0..."
1037,"Tuxedo Shirt, Cummerbund, Bow Tie, Cufflink & ...",3.6,25,[Drawstring closure],26.0,Neil Allyn,https://m.media-amazon.com/images/I/41IUCjYr2y...,"[0.0013640962, 0.0185012035, -0.0066230986, 0...."
799,Epoint Men's Fashion Marriage Paisley Microfib...,4.1,37,"[100% Woven Microfiber, Drawstring closure, Th...",24.99,Epoint,https://m.media-amazon.com/images/I/41cFdjEAtm...,"[0.00313825, 0.0041489243000000006, -0.0520087..."


In [70]:
import base64
import requests

def get_and_encode_image(image_url):
    response = requests.get(image_url)
    return base64.b64encode(response.content).decode('utf-8')


In [86]:
from pydantic import BaseModel

class ProductValidationResponse(BaseModel):
    answer: bool


def validate_product_with_query(query, product_title, product_image_base64):
    response = client.beta.chat.completions.parse(
        model=os.getenv("LLM_MODEL"),
        messages=[
            {
            "role": "user",
            "content": [
                {
                "type": "text",
                "text": f"""You are a fashion expert analyzing a clothing item.
                            You will be shown an image of a clothing item and a user text query.
                            Evaluate if this item matches what the user was intending for their query.

                            Example: If the user is looking for a "mens wedding outfit", make sure the item is for an adult male and not a child.

                            If the user is providing an event or location, account for the formality that would be needed for the event and location.
                            
                            Provide your analysis in JSON format with one fields:
                            - "answer": Must be "True" or "False" indicating if the item matches the query
                
                            Query: {query}
                            """,
                },
                {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{product_image_base64}",
                },
                },
            ],
            }
        ],
        response_format=ProductValidationResponse,
    )

    return response.choices[0].message.parsed



In [87]:

product = product_results.iloc[0]

product_image_base64 = get_and_encode_image(product["thumbnail"])


In [89]:

validate_product_with_query(original_query, product["title"], product_image_base64)

ProductValidationResponse(answer=True)

In [None]:
def validate_single_product(product_row):
    """
    Validates a single product against the original query using its thumbnail image.
    Returns the validation response.
    """
    product_image_base64 = get_and_encode_image(product_row["thumbnail"])
    return validate_product_with_query(original_query, product_row["title"], product_image_base64)

# Filter products to only those that match the query according to the vision model
validated_products = product_results[
    product_results.apply(validate_single_product, axis=1).map(lambda x: x.answer)
]

validated_products

Unnamed: 0,title,average_rating,rating_number,features,price,store,thumbnail,embedding
799,Epoint Men's Fashion Marriage Paisley Microfib...,4.1,37,"[100% Woven Microfiber, Drawstring closure, Th...",24.99,Epoint,https://m.media-amazon.com/images/I/41cFdjEAtm...,"[0.00313825, 0.0041489243000000006, -0.0520087..."
620,Tailorsun Tweed Vest Vintage Rustic Wedding Ve...,4.2,25,"[Imported, Button closure, Sizes are now corre...",29.0,Tailorsun,https://m.media-amazon.com/images/I/51L65WniXC...,"[-0.012405319100000001, 0.0764714554, -0.04585..."
620,Tailorsun Tweed Vest Vintage Rustic Wedding Ve...,4.2,25,"[Imported, Button closure, Sizes are now corre...",29.0,Tailorsun,https://m.media-amazon.com/images/I/51L65WniXC...,"[-0.012405319100000001, 0.0764714554, -0.04585..."
717,Epoint Men's Fashion Paisley Microfiber Dress ...,4.4,156,"[100% Woven Microfiber, Drawstring closure, Th...",22.93,Epoint,https://m.media-amazon.com/images/I/41pMZ8C9EJ...,"[0.009126815, -0.0051772515000000005, -0.05662..."
799,Epoint Men's Fashion Marriage Paisley Microfib...,4.1,37,"[100% Woven Microfiber, Drawstring closure, Th...",24.99,Epoint,https://m.media-amazon.com/images/I/41cFdjEAtm...,"[0.00313825, 0.0041489243000000006, -0.0520087..."
1037,"Tuxedo Shirt, Cummerbund, Bow Tie, Cufflink & ...",3.6,25,[Drawstring closure],26.0,Neil Allyn,https://m.media-amazon.com/images/I/41IUCjYr2y...,"[0.0013640962, 0.0185012035, -0.0066230986, 0...."
799,Epoint Men's Fashion Marriage Paisley Microfib...,4.1,37,"[100% Woven Microfiber, Drawstring closure, Th...",24.99,Epoint,https://m.media-amazon.com/images/I/41cFdjEAtm...,"[0.00313825, 0.0041489243000000006, -0.0520087..."


In [60]:
def generate_natural_language_response(validated_products, original_query):
    """
    Generates a natural language response summarizing the validated product results.
    
    Args:
        validated_products (pd.DataFrame): DataFrame containing the validated products
        original_query (str): The original search query
        
    Returns:
        str: A natural language response describing the search results
    """
    messages = [
        {
            "role": "system",
            "content": f"""You are a helpful shopping assistant. Summarize the search results in a natural, 
            conversational way. Include key details like number of results, price ranges if available, 
            and notable brands or features. Avoid any negative language when describing the products.

            Do not list the number of products that you are showing. Mention that you have found a few options that you think will work.
            encourage the user to look at the products and see if they like any of them and say there are more options if they want to see more.

            Include all products provided in the results.

            Here is a search query: {original_query}
            
            Here are the matching products:
            {validated_products.to_string()}
            
            Please provide a natural language summary of these results.
            """
        }
    ]
    
    response = client.chat.completions.create(
        model=os.getenv("LLM_MODEL"),
        messages=messages,
        temperature=0.5,
    )
    
    return response.choices[0].message.content

def convert_results_to_json(validated_products):
    """
    Converts the validated products DataFrame to a JSON format suitable for API responses.
    
    Args:
        validated_products (pd.DataFrame): DataFrame containing the validated products
        
    Returns:
        dict: JSON-compatible dictionary containing the product results
    """
    products_list = []
    
    for _, row in validated_products.iterrows():
        product_dict = {
            "title": row["title"],
            "brand": row["brand"],
            "thumbnail": row["thumbnail"],
            "average_rating": float(row["average_rating"]),
            "embedding": row["embedding"].tolist() if isinstance(row["embedding"], (list, np.ndarray)) else None
        }
        products_list.append(product_dict)
    
    return {
        "total_results": len(products_list),
        "products": products_list
    }

# Example usage:
natural_response = generate_natural_language_response(validated_products, original_query)
# json_results = convert_results_to_json(validated_products)

print("Natural Language Response:")
print(natural_response)
# print("\nJSON Results:")
# print(json_results)


Natural Language Response:
I found a few great options for your groomsman outfit for the wedding! Here are some stylish picks that you might like:

1. **Epoint Men's Fashion Marriage Paisley Microfiber Tuxedo Vest Necktie Set** - This set is made from 100% woven microfiber and features a drawstring closure. The vest is designed to offer a blend of elegance and casual style, perfect for various occasions, including weddings. It's priced at **$24.99** and has a solid rating of **4.1** from 37 reviews. ![View Product](https://m.media-amazon.com/images/I/41cFdjEAtmL._AC_SR38,50_.jpg)

2. **Tailorsun Tweed Vest Vintage Rustic Wedding Vest** - This rustic-style vest is great for a vintage-themed wedding. It features a slim fit with three pockets and is priced at **$29.00**. It has a rating of **4.2** based on 25 reviews, and it's noted for its good quality. ![View Product](https://m.media-amazon.com/images/I/51L65WniXCL._AC_SR38,50_.jpg)

3. **Epoint Men's Fashion Paisley Microfiber Dress Tu

In [10]:
def generate_recommendation_response(original_query):
    """
    Generates a natural language response summarizing the validated product results.

    Args:
        validated_products (pd.DataFrame): DataFrame containing the validated products
        original_query (str): The original search query

    Returns:
        str: A natural language response describing the search results
    """
    messages = [
        {
            "role": "user",
            "content": f"""You are a helpful shopping assistant. Summarize the search results in a natural, 
            conversational way. Include key details like number of results, price ranges if available, 
            and notable brands or features. Avoid any negative language when describing the products.

            Do not list the number of products that you are showing. Mention that you have found a few options that you think will work.
            encourage the user to look at the products and see if they like any of them and say there are more options if they want to see more.

            Compare and contrast the results. 
            
            Example: This shirt is a great option but this other shirt has a higher rating.

            Search Query: {original_query}
            
            Here are the matching products:
            Please provide a natural language summary of these results. If there are no products,
            Suggest ways the user can improve their search query.
            """,
        }
    ]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )

    return response.choices[0].message.content

In [11]:
generate_recommendation_response("Test")

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [12]:

@retry(wait=wait_random_exponential(min=1, max=30), stop=stop_after_attempt(5))
def validate_product_with_query(query, product_row, product_image_base64):
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f""" You are a fashion expert analyzing a clothing item.
                            You will be shown an image of a clothing item and given a text query.
                            Evaluate if this item matches the description in the query.

                            Example: If the user is looking for a "mens wedding outfit", make sure the item is for an adult male and not a child.

                            If the user is providing an event or location, account for the formality that would be needed for the event and location.

                            If the user provided a price or a range in their query, take it into account as to whether is should be shown.
                            
                            Provide your analysis in JSON format with field:
                            - "answer": Must be "True" or "False" indicating if the item matches the query
                            
                            Do not describe the item itself. Focus only on its relevance to the query.
                            
                            Query: {query}
                            Product Title: {product_row.title}
                            Price: {product_row.price}
                            Average Rating: {product_row.average_rating}
                            """,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{product_image_base64}",
                    },
                },
            ],
        }
    ]

    response = client.beta.chat.completions.parse(
        model=settings.LLM_MODEL,
        messages=messages,
        temperature=0,
        response_format=ProductValidationResponse,
    )

    return response.choices[0].message.parsed.answer


def validate_single_product(product_row, query):
    """
    Validates a single product against the original query using its thumbnail image.
    Returns the validation response.
    """
    product_image_base64 = get_and_encode_image(product_row.thumbnail)
    return validate_product_with_query(
        query, product_row, product_image_base64
    )



NameError: name 'retry' is not defined

In [13]:
import json
def validate_product_with_query(query, product_row):

    prompt = f"""
        You are a fashion expert analyzing a clothing item.
        You will be given a text query about a clothing item along with its details.
        Evaluate if this item matches the description in the query.

        Example: If the user is looking for a "mens wedding outfit", ensure the item is for an adult male and not a child.
        If the query mentions an event or location, consider the appropriate formality.
        If the query provides a price or range, take that into account when determining relevance.

        Provide your analysis in JSON format with a single field:
        - "answer": Must be "True" or "False" indicating if the item matches the query.

        Do not describe the item itself; focus solely on its relevance to the query.

        Query: {query}
        Product Title: {product_row.title}
        Price: {product_row.price}
        Average Rating: {product_row.average_rating}
    """

    messages = [
        {
            "role": "user",
            "content": prompt.strip()
        }
    ]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )

    answer_text = response.choices[0].message.content.strip()
    try:
        result = json.loads(answer_text)
        return result.get("answer")
    except json.JSONDecodeError:
        return False


def validate_single_product(product_row, query):
    """
    Validates a single product against the original query using its thumbnail image.
    Returns the validation response.
    """
    product_image_base64 = get_and_encode_image(product_row.thumbnail)
    return validate_product_with_query(query, product_row, product_image_base64)

In [15]:
# Testing regex
import re
answer_text = "True    "
print(answer_text)


match = re.search(r'^(True|False)$', answer_text)
if match:
    answer_str = match.group(0).lower()
    answer_bool = True if answer_str == "True" else False
    print(answer_bool)


True    
