In [None]:
! pip install numpy
! pip install openai==1.2.3
! pip install pymongo
! pip install python-dotenv
! pip install azure-core
! pip install azure-cosmos
! pip install tenacity

! pip install scikit-surprise
! pip install scikit-learn
! pip install numpy

## Set up

In [2]:
import json
import openai
import pymongo

from dotenv import dotenv_values
from openai import AzureOpenAI

# Load settings (connection string and Azure OpenAI credentials) from a local
# dotenv file; the keys read below must be present in that file.
env_name = "myconfig.env" 
config = dotenv_values(env_name)


# Connection string used to open the MongoDB client
mongo_conn = config['cosmos_connection_string']
mongo_client = pymongo.MongoClient(mongo_conn)

# Database name
DATABASE_NAME = "ProductRecommendation"
db = mongo_client[DATABASE_NAME]

# Collection handles shared by the cells below
collection_actual_rating = db["ActualRating"]
collection_predicted_rating = db["PredictedRating"]
collection_product = db['ProductCollection']


# Azure OpenAI settings are stored as attributes on the openai module and then
# passed explicitly to the client constructed below.
openai.api_type = config['openai_type']
openai.api_key = config['openai_api_key']
openai.api_base = config['openai_api_endpoint']
openai.api_version = config['openai_api_version']

# Client object used by generate_embeddings() for every embeddings request
client = AzureOpenAI(
    api_key=openai.api_key,
    api_version=openai.api_version,
    azure_endpoint = openai.api_base
)

In [None]:
def generate_embeddings(text, model="embeddings"):
    """
    Request an embedding vector for ``text`` from the module-level ``client``.

    Args:
        text: The text to embed.
        model: Name passed as ``model`` to ``client.embeddings.create``.
            Defaults to "embeddings", the value that used to be hard-coded,
            so existing callers are unaffected.

    Returns:
        ``response.data[0].embedding`` on success, or ``None`` if the request
        raised. Callers must check for ``None`` before using the result.
    """
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        # Deliberately best-effort: report the failure and signal it with None
        # so one failed request does not abort the whole notebook run.
        print(f"An error occurred: {e}")
        return None

# Smoke test: embed one sample query and show a short preview instead of
# dumping the entire vector into the cell output.
embeddings = generate_embeddings("i just bought a snowboard, recommend me more products to buy?")

if embeddings is not None:
    print(f"Embedding dimension: {len(embeddings)}")
    print(f"First values: {embeddings[:5]}")

In [5]:
# Simple function to assist with vector search
def vector_search(query, collection, num_results=3):
    """
    Run a vector similarity search for ``query`` against ``collection``.

    Args:
        query: Free-text query; it is embedded with ``generate_embeddings``.
        collection: Collection to aggregate over. Documents are searched on
            their "Embedding" field.
        num_results: Value passed as ``k`` to the search stage.

    Returns:
        An iterator of result documents, each with a ``similarityScore`` and
        the matched ``document``. If the query could not be embedded, an
        empty iterator is returned so callers can still use ``next()`` or a
        ``for`` loop on the result.
    """
    query_embedding = generate_embeddings(query)
    if query_embedding is None:
        # generate_embeddings has already reported the error; there is no
        # vector to search with, so do not send a null vector to the server.
        return iter(())

    pipeline = [
        {
            '$search': {
                "cosmosSearch": {
                    "vector": query_embedding,
                    # NOTE(review): numLists is normally an index-creation
                    # option; confirm it is needed in the query itself.
                    "numLists": 1,
                    "path": "Embedding",
                    "k": num_results
                    #, "efsearch": 40 # optional for HNSW only
                },
                "returnStoredSource": True
            }
        },
        {'$project': {'similarityScore': {'$meta': 'searchScore'}, 'document': '$$ROOT'}}
    ]
    return collection.aggregate(pipeline)

A quick test of the vector search:

In [None]:
query = "snowboard"
col = db['ProductCollection']
# Materialise the results so that "no hits" can actually be detected; the
# search result can only be iterated once and has no usable truth value.
results = list(vector_search(query, col, 5))

print("\nResults:\n")
if results:
    for result in results:
        print(f"Similarity Score: {result.get('similarityScore')}")
        document = result.get('document', {})
        print(f"Id: {document.get('Id')}")
        print(f"Type: {document.get('Type')}")
        print(f"Brand: {document.get('Brand')}")
        print(f"Name: {document.get('Name')}")
else:
    print("No results found. Please check your query or data setup.")


### Evaluates the predicted rating vs actual (enhanced) rating

In [None]:
# Reset the ActualRating collection from the JSON file. Clearing it before
# inserting means that re-running this cell does not accumulate duplicate
# rating documents.

# Load and validate the data first, so that a missing, malformed or empty file
# cannot leave the collection emptied.
with open("./data/ratings/actualRatings.json", "r", encoding="utf-8") as f:
    actual_rating = json.load(f)

if not actual_rating:
    raise ValueError("actualRatings.json contains no ratings; ActualRating was left unchanged")

# Replace the collection contents with the freshly loaded ratings
collection_actual_rating.delete_many({})
result = collection_actual_rating.insert_many(actual_rating)

print(f"Number of data points added: {len(result.inserted_ids)} in ActualRating")


## Logic: 
Using vector similarity search, look up the top candidate products for the query (10 by default; `recommend_products` requests 20).
Exclude items that have the same type as the top result (so if the user buys a snowboard, it doesn't recommend any more snowboards) as well as products the user has already rated.

Then feed these candidate products into the recommendation model to pick the top 3 products the user should buy, based on the products they have rated before; products already rated by the user (those in the ActualRating collection) are not recommended.

In [8]:
def get_vector_based_recommendations_excluding_type(user_query, exclude_type, rated_products, num_results=10):
    """
    Vector-search the product collection, filtering out unwanted products.

    Args:
        user_query: Free-text query; it is embedded with ``generate_embeddings``.
        exclude_type: Products whose "Type" equals this value are filtered out.
        rated_products: Iterable of product Ids to filter out (may be empty
            or None).
        num_results: Value passed as ``k`` to the search stage.

    Returns:
        A list of result documents, each with a ``similarityScore`` and the
        matched ``document``; an empty list if the query could not be embedded.
    """
    query_embedding = generate_embeddings(user_query)
    if query_embedding is None:
        return []

    filter_criteria = {
        "Type": {"$ne": exclude_type},
        # $nin excludes every Id in the list. $ne would compare Id against
        # the list as a single value, so no rated product would be excluded.
        "Id": {"$nin": list(rated_products or [])},
    }

    results = collection_product.aggregate([
        {
            '$search': {
                "cosmosSearch": {
                    "vector": query_embedding,
                    "path": "Embedding",
                    "k": num_results,
                    "filter": filter_criteria
                },
                "returnStoredSource": True
            }
        },
        {'$project': {'similarityScore': {'$meta': 'searchScore'}, 'document': '$$ROOT'}}
    ])

    return list(results)

- get_rated_product_ids(user_id): This function fetches and returns a list of product IDs that the specified user has already rated. It queries a database collection named collection_actual_rating to find all ratings associated with the user's ID and extracts distinct product IDs from those ratings. The purpose is to identify products the user has interacted with, ensuring the recommendation system does not suggest items the user has already reviewed.

- get_better_recommendation(user_id, recommendations, num_results): This function filters a list of recommendations based on vector search. Then, it retrieves the user's predicted ratings for products from a collection named collection_predicted_rating (trained using collaborative filtering). The function matches these predictions with the initial list of recommendations, ensuring only items with a predicted rating are considered. Finally, it returns the top num_results product IDs from this filtered list, aiming to provide more personalized output.

- recommend_products(user_query, user_id, num_results=3): It first runs a vector search for the user query and takes the single top result. It identifies the type of this top result and retrieves a list of products the user has already rated. With this information, it obtains a broader set of vector-based recommendations excluding products of the same type as the top result and those already rated by the user. Finally, it narrows down these recommendations using the get_better_recommendation function to select the top num_results items based on predicted user preferences.

In [9]:
def get_rated_product_ids(user_id):
    """
    Return the IDs of the products that ``user_id`` has already rated.

    Looks up the user's documents in the actual-ratings collection and
    returns their distinct "ProductId" values as a list.
    """
    user_filter = {"UserId": user_id}
    user_ratings = collection_actual_rating.find(user_filter)
    unique_ids = set(user_ratings.distinct("ProductId"))
    return list(unique_ids)

def get_better_recommendation(user_id, recommendations, num_results):
    """
    Narrow vector-search candidates to those with a stored prediction for the user.

    Args:
        user_id: Value matched against "UserId" in the predicted-ratings
            collection.
        recommendations: Search results, each carrying the product under
            ``rec['document']`` with an "Id" field.
        num_results: Maximum number of product IDs to return.

    Returns:
        Up to ``num_results`` product IDs that appear both in the user's
        stored "Predictions" and in ``recommendations``. Returns an empty
        list when the user has no predicted-ratings document.
    """
    candidate_ids = {rec['document']['Id'] for rec in recommendations}

    user_predictions = collection_predicted_rating.find_one({"UserId": user_id})
    if not user_predictions:
        # No predictions stored for this user: nothing to rank with.
        return []

    # Keep only predictions for products that are among the candidates.
    # Order follows the stored "Predictions" list; presumably it is already
    # sorted best-first -- TODO confirm against the code that writes it.
    matching_ids = [
        prediction['ProductId']
        for prediction in user_predictions.get('Predictions', [])
        if prediction['ProductId'] in candidate_ids
    ]

    return matching_ids[:num_results]

def recommend_products(user_query, user_id, num_results=3, num_candidates=20):
    """
    Recommends products based on a user query, excluding products of the same type as the top result and
    products the user has already rated.

    Args:
        user_query: Free-text query describing what the user bought or wants.
        user_id: User whose rated products and stored predictions are used.
        num_results: Maximum number of product IDs to recommend.
        num_candidates: Number of vector-search candidates to fetch before
            narrowing them down (defaults to 20, the value previously
            hard-coded here).

    Returns:
        A tuple ``(recommended_ids, candidates)``: the product IDs chosen by
        ``get_better_recommendation`` and the list of vector-search results
        they were chosen from. Both are empty if the query has no top match.
    """
    top_result = next(vector_search(user_query, collection_product, 1), None)
    if top_result is None:
        # No match for the query (or it could not be embedded), so there is
        # no product type to exclude and nothing to recommend from.
        return [], []

    top_product_type = top_result.get('document', {}).get('Type')
    rated_products = get_rated_product_ids(user_id)
    recommendations_vs = get_vector_based_recommendations_excluding_type(
        user_query, top_product_type, rated_products, num_candidates
    )

    return get_better_recommendation(user_id, recommendations_vs, num_results), recommendations_vs

## Final Function to recommend products

In [13]:
# actual_user_ids = [144, 496, 189, 232, 194, 950, 370, 980, 190, 404, 737, 959, 142, 795, 121, 743, 307, 365, 30, 726, 339, 536]
user_query = "i just bought a snowboard, what other products recommend me more products to buy?"
user_id = 189
num_results = 10
recommended_products_id, vector_search_recommendations = recommend_products(user_query, user_id, num_results)

print("---------Vector Search Results: --------")
for product in vector_search_recommendations[:num_results]:
    print(f"{product['document']['Id']}: {product['document']['Name']}")

print("\n--------Model + VS Results: ---------")
for recommend_product in recommended_products_id:
    product = collection_product.find_one({'Id': recommend_product})
    if product is None:
        # A predicted ProductId with no matching product document
        print(f"{recommend_product}: (not found in ProductCollection)")
        continue
    print(f"{product.get('Id')}: {product.get('Name', 'No name')}, Price: {product.get('Price', 'No price')}")


rated_products = collection_actual_rating.find({'UserId': user_id})
print("\n-------Rated Products: --------")
for product in rated_products:
    # Named product_id (not id) so the builtin id() is not shadowed in the kernel
    product_id = product['ProductId']
    rated_product = collection_product.find_one({'Id': product_id}) or {}
    print(f"{product_id}: {rated_product.get('Name')} | Rating: {product.get('Rating')}")


---------Vector Search Results: --------
55: Vigor 2.0 Insulated Jacket
71: Explorer Frost Boots
47: Edge Pro Ice Axe
8: Frostbite Insulated Jacket
79: Everest Insulated Jacket
15: Summit Pro Insulated Jacket
28: Alpine Peak Down Jacket
2: Summit Pro Harness
49: Arctic Shield Insulated Jacket
89: Summit Pro Down Jacket

--------Model + VS Results: ---------
88: Alpine AlpinePack Backpack, Price: 129.0
7: Explorer 45L Backpack, Price: 149.99
28: Alpine Peak Down Jacket, Price: 249.99
55: Vigor 2.0 Insulated Jacket, Price: 189.99
15: Summit Pro Insulated Jacket, Price: 249.99
65: Sprint PRO Carbon Cycling Helmet, Price: 179.99
49: Arctic Shield Insulated Jacket, Price: 169.99
71: Explorer Frost Boots, Price: 149.99
89: Summit Pro Down Jacket, Price: 239.99
8: Frostbite Insulated Jacket, Price: 179.99

-------Rated Products: --------
