In [None]:
! pip install numpy
! pip install openai==1.2.3
! pip install pymongo
! pip install python-dotenv
! pip install tenacity

## Set up

In [35]:
import json
import openai
import pymongo

from dotenv import dotenv_values
from openai import AzureOpenAI

env_name = "myconfig.env" 
config = dotenv_values(env_name)


# Connection string
cosmos_conn = config['cosmos_connection_string']
cosmos_client = pymongo.MongoClient(cosmos_conn)

# Database name
DATABASE_NAME = "ProductRecommendation"
db = cosmos_client[DATABASE_NAME]

# Collection names
actual_ratings = db["ActualRating"]
predicted_ratings = db["PredictedRating"]
product_catalog = db['ProductCollection']


openai.api_type = config['openai_type']
openai.api_key = config['openai_api_key']
openai.api_base = config['openai_api_endpoint']
openai.api_version = config['openai_api_version']

client = AzureOpenAI(
    api_key=openai.api_key,
    api_version=openai.api_version,
    azure_endpoint = openai.api_base
)

In [36]:
def generate_embeddings(text):
    try:
        response = client.embeddings.create(
            input=text, model="embeddings") # need to read from config
        
        embeddings = response.data[0].embedding
        
        return embeddings
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [29]:
# Simple function to assist with vector search
def vector_search(query, collection, num_results=3):
    query_embedding = generate_embeddings(query)
    
    pipeline = [
        {
            '$search': {
                "cosmosSearch": {
                    "vector": query_embedding,
                    "numLists": 1,
                    "path": "Embedding",
                    "k": num_results 
                    #, "efsearch": 40 # optional for HNSW only 
                },
                "returnStoredSource": True }},
        {'$project': { 'similarityScore': { '$meta': 'searchScore' }, 'document' : '$$ROOT' } }
    ]
    results = collection.aggregate(pipeline)
    return results

little test here

In [None]:
query = "snowboard"
col = db['ProductCollection']
results = vector_search(query,col, 10)

print("\nResults:\n")
if True: 
    for result in results:
        print(f"Similarity Score: {result.get('similarityScore')}")
        document = result.get('document', {})
        print(f"Id: {document.get('Id')}")
        print(f"Type: {document.get('Type')}")
        print(f"Brand: {document.get('Brand')}")
        print(f"Name: {document.get('Name')}")
else:
    print("No results found. Please check your query or data setup.")


In [21]:
# User search for products but exclude products of the same type as well as products already rated by user.
# Example, if user searches for snowboard, don't return any more snowboards or products already rated by the user.

def get_vector_based_recommendations_excluding_type(user_query, exclude_type, rated_products, num_results=10):
    
    query_embedding = generate_embeddings(user_query)

    # Filter criteria to exclude the type of product and the products already rated by the user
    filter_criteria = {
        "Type": {"$ne": exclude_type}, 
        "Id": {"$ne": rated_products}
    }

    results = product_catalog.aggregate([
        {
            '$search': {
                "cosmosSearch": {
                    "vector": query_embedding,
                    "path": "Embedding",
                    "k": num_results,
                    "filter": filter_criteria
                },
                "returnStoredSource": True
            }},
        {'$project': { 'similarityScore': { '$meta': 'searchScore' }, 'document' : '$$ROOT' }
    }])

    return list(results)

In [11]:
def get_rated_product_ids(user_id):
    """
    Get products already purchased and rated by user.
    Used to remove products from vector search results and recommendations.
    """
    rated_product_ids = actual_ratings.find({"UserId": user_id}).distinct("ProductId")
    
    return list(set(rated_product_ids))

def get_better_recommendation(user_id, recommendations, num_results):
    """
    This function filters a list of recommendations based on vector search. Then, it retrieves the user's predicted 
    ratings for products from a collection named predicted_ratings (trained using collaborative filtering). 
    The function matches these predictions with the initial list of recommendations, ensuring only items with a 
    predicted rating are considered. Finally, it returns the top num_results product IDs from this filtered list, 
    aiming to provide more personalized output.
      
    """
    # get the product ids from the recommendations
    recommendation_ids = [rec['document']['Id'] for rec in recommendations]

    # Get the predicted ratings for the user (this is a point read for all of the user's predicted ratings)
    predicted_rating_products = predicted_ratings.find_one({"UserId": user_id})
    
    # Filter predictions to find those matching the recommended product IDs
    filtered_predictions = [
        prediction for prediction in predicted_rating_products['Predictions']
        if prediction['ProductId'] in recommendation_ids
    ]

    # Return the Product IDs for the top N filtered predictions
    return [prediction['ProductId'] for prediction in filtered_predictions[:num_results]]

def recommend_products(user_query, user_id, num_results=3):
    """
    Recommends products based on a user query, excluding products of the same type as the top result and
    products the user has already rated.

    This function identifies the type of this top result and retrieves a list of products the user has already rated. 
    With this information, it obtains a broader set of vector-based recommendations excluding products of the same type 
    as the top result and those already rated by the user. Finally, it narrows down these recommendations using the 
    get_better_recommendation function to select the top num_results items based on predicted user preferences.
    """
    
    top_result_cursor = vector_search(user_query, product_catalog, 1)
    top_result = next(top_result_cursor, None)
    top_product_type = top_result.get('document', {}).get('Type')  
    
    # get the products the user has already rated so we can exclude them in the vector search
    rated_products = get_rated_product_ids(user_id)
    
    recommendations_vs = get_vector_based_recommendations_excluding_type(user_query, top_product_type, rated_products, 20)
    
    return get_better_recommendation(user_id, recommendations_vs, num_results), recommendations_vs

## Final Function to recommend products

In [None]:
# actual_user_ids = [144, 496, 189, 232, 194, 950, 370, 980, 190, 404, 737, 959, 142, 795, 121, 743, 307, 365, 30, 726, 339, 536]
user_query = "i just bought a snowboard, what other products recommend me more products to buy?"
user_id = '189'
user_id = int(user_id)
num_results = 10

# this needs to return an entire product, not just an id which then needs another query to the product collection
recommended_products_id, vector_search_recommendations = recommend_products(user_query, user_id, num_results)

print("---------Vector Search Results: --------")
for product in vector_search_recommendations[:num_results]:
    print(f"{product['document']['Id']}: {product['document']['Name']}")

print("\n--------Model + VS Results: ---------")
for recommend_product in recommended_products_id:
    product = (product_catalog.find_one({'Id':recommend_product}))
    print(f"{product.get('Id')}: {product.get('Name', 'No name')}, Price: {product.get('Price', 'No price')}")


rated_products = actual_ratings.find({'UserId': user_id})

print("\n-------Rated Products: --------")
for product in rated_products:
    id = product['ProductId']
    print(f"{id}: {product_catalog.find_one({'Id':id}).get('Name')} | Rating: {product.get('Rating')}")


My stuff below

In [102]:
def predictions_from_current_product_page(user_id, current_product_id, num_results=3):
    """
    This function recommends similar products predicted for this user excluding the current product.
    """
    
    # Get the predicted products for the user, limit results
    user_predicted_products = predicted_ratings.find_one( 
        { "UserId": user_id },
        {"Predictions": {"$slice": num_results}})

    # Filter out the current product if exists and return a list of product ids
    product_ids = [prediction['ProductId'] for prediction in user_predicted_products['Predictions'] 
        if prediction['ProductId'] != current_product_id]
    
    predicted_products = []

    # loop through the product ids to look up the product details in the product_catalog maintaining the order of the product_ids
    for product_id in product_ids:
        product = product_catalog.find_one({"Id": product_id})
        if product:
            predicted_products.append(product)


    # TO-DO: should do vector search on the current product versus just returning all the predicted products  

    # Query the product_catalog and return the full product details
    # predicted_products = product_catalog.find({"Id": {"$in": product_ids}}).limit(num_results)

    predicted_products = list(predicted_products)

    return predicted_products

In [38]:
def predictions_from_vector_search(user_id, user_query, num_results=10):
    """ 
    This function takes a user prompt search for products and returns products that are predicted for the user. 
    """
    
    # Generate the embedding for the user query
    query_embedding = generate_embeddings(user_query)

    # Get the predicted products for the user
    predicted_products = predicted_ratings.find_one( { "UserId": user_id } )

    # Extract the ProductId from the Predictions array
    product_ids = [prediction['ProductId'] for prediction in predicted_products['Predictions']]

    # Filter criteria to include predicted products
    filter_criteria = { 
        "Id": {"$in": product_ids}
    }

    results = product_catalog.aggregate([
        {
            '$search': {
                "cosmosSearch": {
                    "vector": query_embedding,
                    "path": "Embedding",
                    "k": num_results,
                    "filter": filter_criteria
                },
                "returnStoredSource": True
            }},
        {'$project': { 'similarityScore': { '$meta': 'searchScore' }, 'document' : '$$ROOT' }
    }])

    predicted_products = list(results)

    return predicted_products

In [103]:
# actual_user_ids = [144, 496, 189, 232, 194, 950, 370, 980, 190, 404, 737, 959, 142, 795, 121, 743, 307, 365, 30, 726, 339, 536]
# snowboards product_ids = 73, 5, 92, 83, 43, 12, 53, 62, 22
user_query = "i would like to buy a snowboard."
user_id = '189'
user_id = int(user_id)
product_id = int(73) # TO-DO: find snowboards in the product catalog
num_results = 10

print("---------Vector Search Results: --------")
vector_search_with_predictions = predictions_from_vector_search(user_id, user_query, num_results)

for product in vector_search_with_predictions:
    print(f"{product['document']['Id']}: {product['document']['Name']} - {product['document']['Price']}")


print("\n--------Current Page Results: ---------")
on_page_predictions = predictions_from_current_product_page(user_id, product_id, num_results)

for product in on_page_predictions:
    print(f"{product['Id']}: {product['Name']} - {product['Price']}")



---------Vector Search Results: --------


92: Blizzard Snowboard - 449.99
73: Omni-Snow Dual Snowboard - 289.99
43: Glacier Frost Snowboard - 419.99
62: Shadow Black Snowboard - 379.0
12: Powder Pro Snowboard - 399.0
32: Cosmic Purple Snowboard - 419.99
22: Venture 2022 Snowboard - 499.0
39: Midnight Blue Goggles - 89.99
42: Gravity 5000 All-Mountain Skis - 699.0
72: GravityZone All-Mountain Skis - 699.0

--------Current Page Results: ---------
42: Gravity 5000 All-Mountain Skis - 699.0
72: GravityZone All-Mountain Skis - 699.0
92: Blizzard Snowboard - 449.99
22: Venture 2022 Snowboard - 499.0
62: Shadow Black Snowboard - 379.0
32: Cosmic Purple Snowboard - 419.99
12: Powder Pro Snowboard - 399.0
43: Glacier Frost Snowboard - 419.99
69: Expedition 200 GPS Navigator - 299.0
