In [1]:
!pip install faiss-cpu==1.7.4  # FAISS can only load database from same FAISS version
!pip install transformers
!pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import ast
import os
import textwrap

import faiss
import numpy as np
import openai
import torch

In [3]:
class VectorDataBase:
    """
    A FAISS-backed vector database of review embeddings that can rank items
    by how similar their reviews are to a query.

    Attributes:
        _storage: The FAISS index; its rows are scored as reviews.
        _id: Array compared element-wise against a target id in ``filter_with_id``.
        _metadata: Array compared against positions in ``_metadata_storage``
            (see ``filter_with_metadata``) to turn matching items into a row mask.
        _review: Array loaded from ``review_file_path``; not read inside this class.
        _metadata_storage: Sequence of per-item metadata dicts.
        _ntotal: Number of vectors held by ``_storage``.
        _item: ``_item[i]`` is the number of consecutive rows that belong to item ``i``.
    """
    # Quoted so that the annotation is not evaluated while the class body runs.
    _storage: "faiss.Index"
    _id: np.ndarray
    _metadata: np.ndarray
    _review: np.ndarray
    _metadata_storage: np.ndarray
    _ntotal: int
    _item: np.ndarray

    def __init__(self, database_file_path: str, id_file_path: str, metadata_file_path: str, review_file_path: str, metadata_storage_file_path: str, item_file_path: str):
        """
        Load the FAISS index and its companion numpy arrays from disk.

        :param database_file_path: Path of the FAISS index file
        :param id_file_path: Path of the ``.npy`` file loaded into ``_id``
        :param metadata_file_path: Path of the ``.npy`` file loaded into ``_metadata``
        :param review_file_path: Path of the ``.npy`` file loaded into ``_review``
        :param metadata_storage_file_path: Path of the pickled ``.npy`` file loaded into ``_metadata_storage``
        :param item_file_path: Path of the ``.npy`` file loaded into ``_item``
        """
        self._storage = faiss.read_index(database_file_path)
        self._id = np.load(id_file_path)
        self._metadata = np.load(metadata_file_path)
        self._review = np.load(review_file_path)
        # NOTE(review): allow_pickle=True executes arbitrary code from the file;
        # only load metadata files that come from a trusted source.
        self._metadata_storage = np.load(metadata_storage_file_path, allow_pickle=True)
        self._ntotal = self._storage.ntotal
        self._item = np.load(item_file_path)

    def search_for_index(self, query: np.ndarray, k: int):
        """
        Search the database

        :param query: This is the query vector
        :param k: This is how many items to retrieve
        :return: The indexes of the most similar vectors, exactly as FAISS returns them
        """
        # First output stores the distance between query and retrieved vectors
        # I stores the index of retrieved vectors
        _, I = self._storage.search(query, k)

        return I

    def search_for_vector(self, query: np.ndarray, k: int):
        """
        Search the database and return the actual embedding vectors

        :param query: This is the query vector
        :param k: This is how many items to retrieve
        :return: A numpy array containing the actual most similar vectors
        """
        # First output stores the distance between query and retrieved vectors
        # I stores the index of retrieved vectors
        _, I = self._storage.search(query, k)

        # FAISS pads the labels with -1 when fewer than k neighbours exist;
        # those are not real rows and cannot be reconstructed.
        return np.array([self._storage.reconstruct(int(index)) for index in I[0] if index >= 0])

    def filter_with_id(self, target_id: str) -> np.ndarray:
        """
        This function serves as the filter for id

        :param target_id: A string representing the id you are searching for

        :return: A boolean numpy array with the same shape as the stored ids,
        True where the stored id equals target_id and False elsewhere
        """
        return self._id == target_id

    @staticmethod
    def _record_matches(info: dict, target: str) -> bool:
        """Tell whether one metadata record mentions ``target`` in any of its values."""
        for value in info.values():
            if isinstance(value, dict):
                # Nested dicts match on an exact value
                if target in value.values():
                    return True
            elif target == value or (isinstance(value, str) and target in value):
                # Plain values match exactly; strings also match on a substring
                return True
        return False

    def filter_with_metadata(self, target: str) -> np.ndarray:
        """
        This function examines the stored metadata records and finds the ones
        that mention the target. It then builds a boolean mask over
        ``_metadata`` that is True wherever the entry refers to one of those
        records.

        :param target: A string representing the target we are filtering for

        :return: A 1d boolean numpy array, True where the owning item's
        metadata contains the target and False otherwise
        """
        matching_items = [
            i for i, info in enumerate(self._metadata_storage)
            if self._record_matches(info, target)
        ]

        indexes = np.zeros((self._ntotal), dtype=bool)
        # A single membership test replaces one full-array OR per matching item
        return np.logical_or(indexes, np.isin(self._metadata, matching_items))

    def _count_items_in_mask(self, mask: np.ndarray) -> int:
        """Count the items that keep at least one row under ``mask``."""
        # Items own consecutive row ranges whose lengths are given by _item
        bounds = np.concatenate(([0], np.cumsum(self._item, dtype=np.int64)))
        return sum(
            bool(mask[bounds[i]:bounds[i + 1]].any())
            for i in range(self._item.shape[0])
        )

    def search_with_filter(self, query: np.ndarray, top_k_items: int, top_k_revirw:int,  target_id: list = None, target_metadata: list = None) -> "tuple[np.ndarray, np.ndarray] | None":
        """
        This function restricts the database to rows whose id is one of
        target_id and whose item metadata contains every entry of
        target_metadata, then ranks the remaining items against the query.

        :param query: The query embedding; it is reshaped to [1, vector size]
        :param top_k_items: The number of items we want to return
        :param top_k_revirw: The number of reviews we want to use for late fusion
        :param target_id: The ids we are looking for (any of them); None disables the id filter
        :param target_metadata: The metadata we are looking for (all of them); None disables the metadata filter

        :return: A tuple of (indexes of the best items, indexes of their most
        similar reviews), or None (after printing an explanation) when fewer
        than top_k_items items pass the filters
        """
        # Create id filter
        # If user did not specify what id they are looking for
        # we are not going to filter out anything
        id_filter = np.ones((self._ntotal), dtype=bool)
        if target_id is not None:
            id_filter = np.zeros((self._ntotal), dtype=bool)
            for wanted_id in target_id:
                id_filter = np.logical_or(id_filter, self.filter_with_id(wanted_id))

        # Create metadata filter
        # If user did not specify the kind of metadata they are looking for
        # we are not going to filter out anything
        metadata_filter = np.ones((self._ntotal), dtype=bool)
        if target_metadata is not None:
            for requirement in target_metadata:
                metadata_filter = np.logical_and(metadata_filter, self.filter_with_metadata(requirement))

        mask = np.logical_and(id_filter, metadata_filter)

        # The mask has one entry per row, but top_k_items counts items, so the
        # guard has to compare items with items.
        count = self._count_items_in_mask(mask)
        if count == 0:
            # If the user specifies a filter that no item can satisfy
            print("""The filter you have entered appears to exclude all available
            options. Please review your filter criteria to ensure that it allows
            for the selection of relevant items.""")
            return None
        if count < top_k_items:
            # If the user ask for more retrieved item than there is
            print("""The number of items you want to retrieve is more than number of items that satisfies
            your requirements.""")
            return None

        # Actually searching
        return self.most_similar_item(query, top_k_items, top_k_revirw, mask)

    def _find_similarity_vector(self, query: np.ndarray, filter: np.ndarray) -> np.ndarray:
        """
        Score every stored vector against the query.

        :param query: The query embedding; reshaped to [1, vector size]
        :param filter: Boolean mask with one entry per stored vector
        :return: A 1d array of scores in storage order; vectors excluded by the
        filter (or not returned by FAISS) are set to -inf so they can never
        outrank an allowed vector, even when real scores are negative
        """
        query = query.reshape(1, self._storage.d)
        D, I = self._storage.search(query, self._storage.ntotal)
        # FAISS returns one row of results per query; there is a single query here
        D = D[0]
        I = I[0]

        output = np.full(self._ntotal, -np.inf, dtype=D.dtype)
        found = I >= 0  # -1 labels are padding, not real rows
        output[I[found]] = D[found]
        output[~np.asarray(filter, dtype=bool)] = -np.inf
        return output

    def _find_similarity_item(self, similarity_score: np.ndarray, top_k_review: int) -> tuple[np.ndarray, np.ndarray]:
        """
        This function finds the similarity score of each item by late fusion of its review scores

        :param similarity_score: The similarity score between each review and the query, in storage order
        :param top_k_review: The number of most similar reviews to look at when doing late fusion(k)
        :return: A tuple with element 0 being an array that contains the similarity score for each item
                and element 1 being an array that contains the indexes of the top k reviews for each item.
                Element 1 is a 2d array when every item contributes the same number of reviews and a
                1d object array of lists otherwise.
        """
        similarity_score = torch.tensor(similarity_score)

        item_score = []
        item_index = []

        # start is the row at which the current item's reviews begin
        start = 0
        for review_count in self._item:
            review_count = int(review_count)
            # Cut out the review scores related to one item
            scores_of_item = similarity_score[start:start + review_count]

            # Use the top k review scores, or all of them if the item has fewer than k reviews
            k = min(top_k_review, scores_of_item.shape[0])
            if k < 1:
                # Nothing to average: the item can never be recommended
                item_score.append(float("-inf"))
                item_index.append([])
            else:
                values, index_topk = scores_of_item.topk(k)
                # Get the item score by finding the mean of the selected review scores
                item_score.append(values.mean(dim=0).item())
                # topk indexes are relative to the slice; shift them back to storage order
                item_index.append((index_topk + start).tolist())

            start += review_count

        item_score = np.array(item_score)
        if len({len(row) for row in item_index}) <= 1:
            item_index = np.array(item_index)
        else:
            # Rows of different length cannot form a rectangular array
            item_index = np.array(item_index, dtype=object)
        return item_score, item_index

    def most_similar_item(self, query: np.ndarray, top_k_items: int, top_k_revirw: int, filter: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """
        This function returns the indexes of the items most similar to the query

        :param query: The query embedding
        :param top_k_items: Number of items to return
        :param top_k_revirw: Number of reviews to look at
        :param filter: Boolean mask with one entry per stored vector; False entries are ignored
        :return: The indices of the most similar item, beginning from the most similar to the least similar,
         and their corresponding most similar review index
        """
        similarity_score_review = self._find_similarity_vector(query, filter)
        similarity_score_item, most_similar_review_index = self._find_similarity_item(similarity_score_review, top_k_revirw)

        # torch.topk returns the indexes of the largest elements, best first
        similarity_score_item = torch.tensor(similarity_score_item)
        # Never ask for more items than exist
        wanted = min(top_k_items, similarity_score_item.shape[0])
        _, top_k_item_index = similarity_score_item.topk(wanted)

        top_k_item_index = top_k_item_index.numpy()
        top_k_item_most_similar_review = most_similar_review_index[top_k_item_index]

        return top_k_item_index, top_k_item_most_similar_review

    def get_database_size(self):
        """
        This function finds how many vectors this database is storing

        :return: The size of the database
        """
        return self._ntotal

    def get_vector_size(self):
        """
        This function finds the size of the vector this database is storing

        :return: The size of the vector
        """
        return self._storage.d

In [4]:
!unzip data.zip

unzip:  cannot find or open data.zip, data.zip.zip or data.zip.ZIP.


In [5]:
# Load the FAISS index and its companion arrays from the extracted data folder
database = VectorDataBase(
    database_file_path="data/vector_database.faiss",
    id_file_path="data/id.npy",
    metadata_file_path="data/metadata.npy",
    review_file_path="data/review.npy",
    metadata_storage_file_path="data/metadata_database.npy",
    item_file_path="data/item.npy",
)

In [6]:
# This is not my code!!!!!!!!

from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from tensorflow import keras
from keras import layers
import torch
import transformers
transformers.logging.set_verbosity_error()

"""
   Modified based on  https://github.com/D3Mlab/rir/blob/main/prefernce_matching/LM.py
"""

class BERT_model:
    """
    Wraps a Hugging Face encoder in a Keras model and uses the hidden state at
    position 0 of each text as that text's embedding.
    """

    _BERT_name: str
    _name1: str
    _name2: str
    _device: torch.device

    def __init__(self, BERT_name, tokenizer_name, from_pt=False):
        """
        :param BERT_name: name or address of language prefernce_matching
        :param tokenizer_name: name or address of the tokenizer
        :param from_pt: forwarded to ``_create_model``, which currently ignores it
        """
        self._BERT_name = BERT_name
        self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self._bert_model, self._name1, self._name2 = self._create_model(BERT_name, from_pt)
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def embed(self, texts: list[str], strategy=None, bs=48, verbose=0):
        """
        Embed a list of texts.

        :param texts: list of strings to be embedded
        :param strategy (optional): object whose ``scope()`` context manager wraps
            the prediction when given. Defaults to None.
        :param bs (optional): batch size used for prediction. Defaults to 48.
        :param verbose (optional): verbosity passed to ``predict``. Defaults to 0.
        :return: embeddings of texts, one row per text
        """
        tokenized_review = self._tokenizer.batch_encode_plus(
            texts,
            max_length=512,
            add_special_tokens=True,
            truncation=True,
            # truncation_strategy='longest_first',
            padding="max_length",
            return_token_type_ids=True,
        )

        # The keys are the names of the Keras input layers built in _create_model
        data = {self._name1: tokenized_review['input_ids'],
                self._name2: tokenized_review['attention_mask'],
                # 'input_3': tokenized_review['token_type_ids']
                }

        if strategy is not None:
            with strategy.scope():
                return self._predict_first_token(data, bs, verbose)
        return self._predict_first_token(data, bs, verbose)

    def _predict_first_token(self, data, bs, verbose):
        """Run the model over ``data`` in batches and keep the position-0 hidden state."""
        dataset = tf.data.Dataset.from_tensor_slices(data).batch(bs, drop_remainder=False).prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)
        outputs = self._bert_model.predict(dataset, verbose=verbose)
        first_token = outputs['last_hidden_state'][:, 0, :]
        # Take the width from the output itself instead of assuming 768
        return first_token.reshape(-1, first_token.shape[-1])

    def get_tensor_embedding(self, query: str):
        """
        Get a tensor embedding of a string.

        :param query: string to be embedded
        :return: tensor embedding of query, on the device chosen in ``__init__``
        """
        query_embedding = self.embed([query])
        query_embedding = torch.tensor(query_embedding).to(self._device)
        # embed returns one row per text; drop the leading axis for the single query
        query_embedding = query_embedding.squeeze(0)

        return query_embedding

    def _create_model(self, BERT_name, from_pt=True):
        """
        Build the Keras model around the pretrained encoder.

        :param BERT_name: name or address of the encoder to load
        :param from_pt: currently unused (see the note in the body)
        :return: the compiled Keras model and the names of its two input layers
            (input ids, attention mask)
        """
        ## BERT encoder
        # NOTE(review): the from_pt argument is ignored and True is always used.
        # __init__ defaults it to False, so honouring it would change which
        # weights existing callers load; confirm the intent before wiring it up.
        encoder = TFAutoModel.from_pretrained(BERT_name, from_pt=True)

        ## Model
        input_ids = layers.Input(shape=(None,), dtype=tf.int32)
        attention_mask = layers.Input(shape=(None,), dtype=tf.int32)
        # token_type_ids = layers.Input(shape=(None,), dtype=tf.int32)

        embedding = encoder(
            # input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
            input_ids=input_ids, attention_mask=attention_mask
        )

        model = keras.Model(
            # inputs=[input_ids, attention_mask, token_type_ids],
            inputs = [input_ids, attention_mask],
            outputs = embedding)

        model.compile()
        return model, input_ids.name, attention_mask.name



In [7]:
# The same checkpoint provides both the encoder weights and the tokenizer
data_preprocessing = BERT_model(
    BERT_name="sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco",
    tokenizer_name="sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco",
)

In [8]:
def create_response_single_item(item_index: int, review_index: list, query: str, requirements: list) -> str:
    """
    Ask the chat model to explain why one restaurant is a good choice.

    :param item_index: Position of the restaurant in ``database._metadata_storage``
    :param review_index: Indexes into ``database._review`` of the reviews to quote
    :param query: The user's query, inserted into the prompt as text
    :param requirements: The requirements the user specified
    :return: The text of the model's reply
    """
    setting = "You are a helpful assistant, helping me give a recommendation to a customer about this restaurant"
    # Keep the reviews apart so the model can tell where one ends and the next begins
    concatenated_reviews = "\n".join(str(database._review[review]) for review in review_index)
    prompt = (
        "These are the restaurant information: " + str(database._metadata_storage[item_index])
        + ". User sepeifies these requirements: " + str(requirements)
        + ". This is user's query: " + str(query)
        + ". These are the available reviews you can use:\n" + concatenated_reviews
        + "\nHelp me generate a response explaining why this restaurant is a good choice."
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",  # Please verify the current version of GPT model on OpenAI's website
        messages=[
            # The system message sets the assistant's role; the request itself is a user message
            {"role": "system", "content": setting},
            {"role": "user", "content": prompt},
        ]
    )

    return response['choices'][0]['message']['content']

In [9]:
def create_response_multiple_item(item_index: list, item_review: list, query: str, requirements: list):
    """
    Summarise each restaurant with ``create_response_single_item`` and ask the
    chat model to merge the summaries into one recommendation.

    :param item_index: Positions of the restaurants in ``database._metadata_storage``
    :param item_review: For each restaurant, the indexes of the reviews to quote
    :param query: The user's query, forwarded to the per-restaurant prompt
    :param requirements: The requirements the user specified
    :return: The text of the model's reply
    """
    # A blank line between summaries keeps each restaurant's text distinguishable
    concatenated_response = "\n\n".join(
        create_response_single_item(item_index[i], item_review[i], query, requirements)
        for i in range(len(item_index))
    )

    setting = "You are a helpful assistant giving recommendation to a customer, these are the summaries of " + str(len(item_index)) + " restaurants. Help me generate a recommendation to customer explaining why all of these restaurants are a good choice, in a casual short conversation."
    minor_setting = "Give the recommendation in the next response. Do not mention I gave you the information above. Do not greet the customer, go straight to recommendation. Do not say things like 'Hey There!'"
    # join() avoids the dangling ", " that appending in a loop leaves behind
    info = "The names of the restaurants are as follows in respective order. " + ", ".join(
        database._metadata_storage[index]["name"] for index in item_index
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Please verify the current version of GPT model on OpenAI's website
        messages=[
            {"role": "system", "content": setting},
            {"role": "system", "content": info},
            {"role": "user", "content": concatenated_response},
            {"role": "system", "content": minor_setting}
        ]
    )

    return response['choices'][0]['message']['content']

# Welcome to the filtering demo

Query: Enter restaurant description

Requirements: Enter requirements that the restaurant MUST meet

OpenAI API key: Used to generate the response.

In [16]:
# Enter restaurant description
query = "I want to have some sushi" #@param {type:"string"}
# Keep the text and its embedding under separate names: the embedding drives
# the vector search, the text is what goes into the LLM prompt.
query_embedding = data_preprocessing.embed([query])

# Enter requirements that you MUST have
requirements = "['Japanese']" #@param {type:"string"}
if(requirements.strip() == ''):
    requirements = None
else:
    # literal_eval only accepts Python literals, so form input cannot run code
    requirements = ast.literal_eval(requirements)

# Enter your openai API key (or leave it empty and set the OPENAI_API_KEY environment variable)
openai_api_key = "''" #@param {type:"string"}
# The form value may be wrapped in quotes; strip them instead of evaluating the string
openai_api_key = openai_api_key.strip().strip("'\"")
if(openai_api_key == ""):
    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if(openai_api_key == ""):
    raise Exception("No api key has been entered")

openai.api_key = openai_api_key

# Number of restaurants to return
restaurant_count = 2
# Number of reviews to look at
review_count = 2

search_result = database.search_with_filter(query_embedding, restaurant_count, review_count, target_metadata = requirements)

# search_with_filter prints an explanation and returns None when the filters
# leave too few restaurants, so there is nothing to recommend in that case.
if search_result is not None:
    top_k_item_index, top_k_item_most_similar_review = search_result
    recommendation = create_response_multiple_item(top_k_item_index, top_k_item_most_similar_review, query, requirements)
    print(textwrap.fill(recommendation, width=50))

Based on the reviews and information available, I
have two great restaurants to recommend for you.
The first one is "I Love Sushi" which offers
authentic Japanese cuisine with a cozy atmosphere,
friendly staff, and delicious, fresh food. It's a
perfect spot for a quick lunch or a relaxed dinner
with friends. They also have great takeout service
if you want to enjoy their tasty dishes at home.
The second recommendation is "Zen All-You-Can-Eat
Sushi & Grill" which is known for its extensive
all-you-can-eat sushi menu. Customers have praised
the generous portion sizes, delicious food, and
friendly service. They also offer non-sushi
options to cater to different tastes. It might not
have the fanciest ambiance, but it offers
exceptional value for money with their affordable
all-you-can-eat option. Plus, they even have fruit
sushi, which is a unique and tasty twist.  Both of
these restaurants provide a great dining
experience, so you can choose the one that suits
your preferences. Enjoy your

In [None]:
# Example outputs

# Input query: I am so hungry, I want some food
# Requirements: Chinese

# I would recommend Jumbo Dim Sum
# Dining for a Chinese dining experience. One
# reviewer described the food as really good and the
# portion size as generous, offering value for
# money. While there was a comment about rudeness
# from the staff, it is important to consider that
# dim sum places often have fast-paced and efficient
# service, which can sometimes come across as curt.
# Overall, Jumbo Dim Sum Dining seems like a good
# option for those seeking a satisfying, authentic
# Chinese dining experience.

In [None]:
# Example outputs

# Input query: I want some yummy pizza
# Requirements: Pizza

# Both LovePizza and Famoso Neapolitan Pizzeria are
# excellent choices for pizza lovers. LovePizza
# stands out for its unique and delicious toppings,
# such as truffle Parmesan sauce and Mac and cheese
# pizza. The flavorful and substantial dough is also
# a highlight. The staff is friendly and the modern
# interior creates a hip atmosphere. On the other
# hand, Famoso Neapolitan Pizzeria offers delicious
# thin crust pizzas with rave reviews about the
# Margarita Pizza and Ham and Pineapple Pizza with
# fresh feta toppings. The artisan bread and
# prosciutto-wrapped mozzarella balls are also
# highly recommended. The atmosphere is great, and
# the menu includes other tasty options like salads
# and gelato. Overall, both restaurants guarantee a
# satisfying and enjoyable pizza dining experience.

In [None]:
# Example outputs

# Input query: I want to have breakfast
# Requirements: Vegan

# I've got a couple of great restaurant
# recommendations for you based on your preferences.
# The first one is Highlevel Diner. They offer a
# wide variety of comfort food, sandwiches, and
# Canadian cuisine, with a special focus on vegan
# and vegetarian options. Their breakfast specials
# are especially popular, with rave reviews about
# the taste and quality. Plus, if you're into
# cycling, they even offer a discount for customers
# who arrive by bike.   The second option is Cafe
# Mosaics, which is known for its delicious vegan
# and vegetarian dishes. They have an extensive menu
# featuring pancakes, tofu scramble, sandwiches,
# burgers, and even vegan desserts like chocolate
# cake. Customers have praised their fast service,
# generous portions, and reasonable prices. The
# atmosphere at Cafe Mosaics is cozy and the staff
# is friendly, making it a great spot to enjoy a
# satisfying meal.  Both Highlevel Diner and Cafe
# Mosaics are highly recommended options that cater
# to your preference for vegan-friendly restaurants.
# So, whether you're craving a comforting Canadian
# dish or a delicious vegan breakfast, you can't go
# wrong with either of these choices. Enjoy your
# meal!