In [27]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [28]:
# This cell reads the PMD file and creates a CSV file recording which query is related to which restaurant
import pandas as pd

class FindQueryRelatedRestaurants():
    """Turn the long-format PMD file into per-query relevance tables stored as CSV."""

    def build_true_labels(self, source_file_path, destination_file_path):
        """
        Pivot the PMD file into a query x restaurant table of labels.

        :param source_file_path: CSV containing the columns "query", "Restaurant name"
            and "If only Low or  High"
        :param destination_file_path: CSV to write; the queries are written as the index column
        """
        pmd = pd.read_csv(source_file_path)
        label_table = pd.DataFrame(
            index=pmd["query"].unique(),
            columns=pmd["Restaurant name"].unique(),
        )

        # Rows are applied in file order, so a repeated (query, restaurant)
        # pair keeps the label of its last occurrence.
        for _, record in pmd.iterrows():
            query, restaurant = record["query"], record["Restaurant name"]
            label_table.at[query, restaurant] = record["If only Low or  High"]

        label_table.to_csv(destination_file_path)

    def find_R_value(self, source_file_path, destination_file_path):
        """
        Append a "num_of_1" column holding, for every row, how many cells equal 1.

        :param source_file_path: CSV to read
        :param destination_file_path: CSV to write (without the pandas index)
        """
        table = pd.read_csv(source_file_path)

        def count_ones(row):
            # Number of cells in this row that compare equal to 1.
            return sum(row == 1)

        table['num_of_1'] = table.apply(count_ones, axis=1)
        table.to_csv(destination_file_path, index=False)

In [29]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel
import tensorflow as tf
from tensorflow import keras
from keras import layers
import torch

"""
   Modified based on  https://github.com/D3Mlab/rir/blob/main/prefernce_matching/LM.py
"""

class BERT_model:
    """
    Text embedder: a Hugging Face encoder wrapped in a Keras model.

    A text is represented by the encoder's last hidden state at the first
    token position.
    """

    _BERT_name: str
    _name1: str
    _name2: str
    _device: torch.device

    def __init__(self, BERT_name, tokenizer_name, from_pt=False):
        """
        :param BERT_name: name or address of the language model
        :param tokenizer_name: name or address of the tokenizer
        :param from_pt: forwarded to ``_create_model``, which currently ignores it
            (see the note there)
        """
        self._BERT_name = BERT_name
        self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self._bert_model, self._name1, self._name2 = self._create_model(BERT_name, from_pt)
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def _cls_embeddings(outputs):
        """
        Extract the first-token vector of every sequence from the model outputs.

        :param outputs: mapping with a 'last_hidden_state' array of shape (batch, tokens, hidden)
        :return: array of shape (batch, hidden)
        """
        first_token = outputs['last_hidden_state'][:, 0, :]
        # Take the width from the data instead of assuming 768, so encoders
        # with a different hidden size are neither rejected nor reshuffled.
        return first_token.reshape(-1, first_token.shape[-1])

    def _predict_cls(self, data, bs, verbose):
        """Run the Keras model over the tokenized inputs in batches of ``bs`` and return the first-token vectors."""
        # Batch first and prefetch whole batches; prefetching before batching
        # would only buffer single examples.
        dataset = tf.data.Dataset.from_tensor_slices(data).batch(bs, drop_remainder=False).prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)
        outputs = self._bert_model.predict(dataset, verbose=verbose)
        return self._cls_embeddings(outputs)

    def embed(self, texts: list[str], strategy=None, bs=48, verbose=0):
        """
        Embed a list of texts.

        Every text is truncated or padded to 512 tokens before it is fed to the model.

        :param texts: list of strings to be embedded
        :param strategy (optional): object whose ``scope()`` context the prediction is run in
            (presumably a tf.distribute strategy). Defaults to None.
        :param bs (optional): batch size used for prediction. Defaults to 48.
        :param verbose (optional): verbosity passed to ``predict``. Defaults to 0.
        :return: embeddings of texts, one row per text
        """
        tokenized_review = self._tokenizer.batch_encode_plus(
            texts,
            max_length=512,
            add_special_tokens=True,
            truncation=True,
            # truncation_strategy='longest_first',
            padding="max_length",
            return_token_type_ids=True,
        )

        # The keys are the names of the Keras input layers created in _create_model.
        data = {self._name1: tokenized_review['input_ids'],
                self._name2: tokenized_review['attention_mask'],
                # 'input_3': tokenized_review['token_type_ids']
                }

        if strategy is not None:
            with strategy.scope():
                return self._predict_cls(data, bs, verbose)
        return self._predict_cls(data, bs, verbose)

    def get_tensor_embedding(self, query: str):
        """
        Get a tensor embedding of a string.

        :param query: string to be embedded
        :return: 1-D tensor embedding of query, placed on ``self._device``
        """
        query_embedding = self.embed([query])
        query_embedding = torch.tensor(query_embedding).to(self._device)
        # embed() returns one row per text; drop the batch dimension.
        query_embedding = query_embedding.squeeze(0)

        return query_embedding

    def _create_model(self, BERT_name, from_pt=True):
        """
        Build the Keras model that wraps the pretrained encoder.

        :param BERT_name: name or address of the pretrained encoder
        :param from_pt: accepted for compatibility but not used
        :return: tuple of (compiled Keras model, name of the input_ids input layer,
            name of the attention_mask input layer)
        """
        ## BERT encoder
        # NOTE(review): `from_pt` is ignored and the weights are always loaded
        # with from_pt=True. This is left unchanged on purpose: __init__
        # defaults to from_pt=False, so honouring the argument would change
        # how every default caller loads its checkpoint.
        encoder = TFAutoModel.from_pretrained(BERT_name, from_pt=True)

        ## Model
        input_ids = layers.Input(shape=(None,), dtype=tf.int32)
        attention_mask = layers.Input(shape=(None,), dtype=tf.int32)
        # token_type_ids = layers.Input(shape=(None,), dtype=tf.int32)

        embedding = encoder(
            # input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
            input_ids=input_ids, attention_mask=attention_mask
        )

        model = keras.Model(
            # inputs=[input_ids, attention_mask, token_type_ids],
            inputs = [input_ids, attention_mask],
            outputs = embedding)

        model.compile()
        return model, input_ids.name, attention_mask.name

In [30]:
import pandas as pd
import torch

class EmbeddingCreator():
    """Replace the text in the 'review_text' column of a CSV with its embedding."""

    def __init__(self, embedding_model:BERT_model):
        """
        :param embedding_model: model whose ``embed`` method turns a list of texts into vectors
        """
        self.embedder = embedding_model

    def embed(self, source_file_path:str, destination_file_path:str):
        """
        Embed every review of the source CSV, one review per model call.

        The embedding is stored as the string form of a Python list, in place
        of the original text.

        :param source_file_path: CSV with a 'review_text' column
        :param destination_file_path: CSV to write (without the pandas index)
        """
        reviews = pd.read_csv(source_file_path)

        for row_label, record in reviews.iterrows():
            # Progress indicator: one line per processed row.
            print(row_label)

            # embed() returns one row per input text; drop the batch dimension
            # and convert the vector to a plain list of floats.
            vector = torch.tensor(self.embedder.embed([record['review_text']]))
            as_list = vector.squeeze(0).tolist()

            reviews.loc[row_label, 'review_text'] = str(as_list)

        reviews.to_csv(destination_file_path, index=False)

In [31]:
import pandas as pd

class SortEmbedding():
    """Reorder an embedding CSV so that the rows of each business are contiguous."""

    def sort_embedding(self, source_file, destination_file):
        """
        Sort the rows of a CSV by the 'business_id' column.

        :param source_file: CSV to read
        :param destination_file: CSV to write (without the pandas index)
        """
        (
            pd.read_csv(source_file)
            .sort_values(by='business_id')
            .to_csv(destination_file, index=False)
        )

In [32]:
#Create the matrix for our information retrieval
import pandas as pd
import torch
import ast

class CreateMatrix():
    """Build the review-embedding matrix used for retrieval."""

    def create_matrix(self, source_file, destination_file):
        """
        Parse the stringified embeddings of a CSV and save them as one stacked tensor.

        :param source_file: CSV whose 'review_text' column holds list literals of numbers
        :param destination_file: path the stacked tensor is saved to with torch.save
        """
        frame = pd.read_csv(source_file)

        # Each cell is the text form of a Python list; literal_eval parses it
        # without executing arbitrary code.
        vectors = [
            torch.tensor(ast.literal_eval(serialized))
            for serialized in frame["review_text"]
        ]

        torch.save(torch.stack(vectors), destination_file)

In [33]:
import pandas as pd
import torch

class CreateItemSeperation():
    """Record how many rows (reviews) each business has."""

    def get_item_seperation(self, source_file_path:str, destination_file_path:str):
        """
        Count the rows per 'business_id' and save the counts as a tensor.

        The counts follow the order in which groupby returns the business ids.

        :param source_file_path: CSV with a 'business_id' column
        :param destination_file_path: path the count tensor is saved to with torch.save
        """
        reviews = pd.read_csv(source_file_path)
        reviews_per_business = reviews.groupby('business_id').size().to_list()
        torch.save(torch.tensor(reviews_per_business), destination_file_path)

In [34]:
import numpy as np
import pandas as pd
import torch

class NeuralSearchEngine():
    """
    Class that is responsible for searching for the top-k most relevant restaurants using BERT_model.

    :param embedder: BERT_model used to embed the query
    """

    # The annotation is quoted so that defining this class does not require
    # BERT_model to be defined first.
    def __init__(self, embedder: "BERT_model"):
        self._embedder = embedder

    def search_for_topk(self, query: str, topk_restaurants: int, topk_reviews: int, panda_object:pd.DataFrame,
                        matrix: torch.Tensor, item_review_count: torch.Tensor):
        """
        Rank the restaurants by similarity to a query.

        A restaurant's score is the mean of its ``topk_reviews`` highest review scores.

        :param query: The query text
        :param topk_restaurants: Number of restaurants to be returned (capped at the number of restaurants)
        :param topk_reviews: Number of best reviews averaged into each restaurant's score
        :param panda_object: DataFrame read from the sorted review embedding file
        :param matrix: A pytorch tensor with one review embedding per row
        :param item_review_count: A pytorch tensor with the number of reviews of each restaurant,
            in the same order as the rows of ``matrix``
        :return: list with the "name" of the most similar restaurants, best first
        """
        query_embedding = self._embedder.get_tensor_embedding(query)
        similarity_score_review = self._similarity_score_each_review(query_embedding, matrix)
        # The per-item review indices are only needed by _get_review, which is
        # not part of the returned result.
        similarity_score_item, _ = self._similarity_score_each_item(similarity_score_review,
                                                                    item_review_count,
                                                                    topk_reviews)
        most_similar_item_index = self._most_similar_item(similarity_score_item, topk_restaurants)
        return self._get_topk_restaurant_business_id(most_similar_item_index, panda_object)

    def _similarity_score_each_review(self, query: torch.Tensor, reviews: torch.Tensor):
        """
        This function finds and returns a tensor that contains the similarity score for each review

        :param query: A tensor containing the query embedding
        :param reviews: A matrix of all the review embeddings
        :return: A pytorch tensor that contains the similarity score (dot product) for each review
        """

        # Both operands are moved to the GPU when one is available.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        reviews = reviews.to(device)
        query = query.to(device)

        return torch.matmul(reviews, query)

    def _similarity_score_each_item(self, similarity_score: torch.Tensor, item_review_count: torch.Tensor, k: int):
        """
        This function finds and returns a tensor that contains the similarity score for each item

        :param similarity_score: A tensor of similarity scores between each review and the query
        :param item_review_count: A tensor containing how many reviews each item has
        :param k: How many of the most similar reviews are averaged per item (late fusion)
        :return: A tuple with element 0 being a tensor that contains the similarity score for each item
                and element 1 being a tensor of shape (items, k) with the indices of the top reviews of
                each item; rows of items with fewer than k reviews are padded with -1
        :raises ValueError: if an item has no reviews
        """

        item_score = []
        item_index = []

        # Plain Python ints keep the running offset an int instead of a tensor.
        start = 0
        for review_count in item_review_count.tolist():
            if review_count < 1:
                raise ValueError("every item needs at least one review")

            # Scores of the reviews that belong to this item
            scores_of_item = similarity_score[start:start + review_count]

            # An item with fewer than k reviews contributes all of them
            # instead of making topk raise.
            k_item = min(k, review_count)
            values, index_topk = scores_of_item.topk(k_item)

            # Convert positions within the item into row numbers of the matrix
            index_topk = index_topk + start
            if k_item < k:
                padding = index_topk.new_full((k - k_item,), -1)
                index_topk = torch.cat([index_topk, padding])

            # The item score is the mean of the selected review scores
            item_score.append(values.mean(dim=0))
            item_index.append(index_topk)

            start += review_count

        item_score = torch.stack(item_score)
        item_index = torch.stack(item_index)
        return item_score, item_index

    def _most_similar_item(self, similarity_score_item: torch.Tensor, top_k_restaurants: int):
        """
        This function returns the most similar items' indices given the item similarity scores

        :param similarity_score_item: The similarity score for each item
        :param top_k_restaurants: Number of restaurants to return
        :return: The indices of the most similar items, beginning from the most similar to the least similar
        """

        # topk raises when asked for more elements than exist, so cap the request.
        k = min(top_k_restaurants, similarity_score_item.numel())
        _, indices = similarity_score_item.topk(k)
        return indices

    def _get_topk_restaurant_business_id(self, most_similar_item_index, df: pd.DataFrame):
        """
        Get the "name" of the most similar items

        :param most_similar_item_index: A tensor containing the indices of the most similar items
        :param df: DataFrame read from the review embedding file
        :return: A list with the name of the most similar items. Beginning from the most
            similar to the least.
        """
        if "business_id" in df.columns:
            # One name per business id, ordered by business id. Unlike
            # df["name"].unique(), this stays aligned with the item indices
            # when two businesses share a name.
            # assumes the item order comes from a groupby on business_id — TODO confirm
            names = df.groupby("business_id")["name"].first().to_numpy()
        else:
            names = df["name"].unique()

        return [names[i] for i in most_similar_item_index.tolist()]

    def _get_review(self, most_similar_item_index, index_most_similar_review, panda_object: pd.DataFrame):
        """
        Return the most similar reviews for those top k restaurants

        :param most_similar_item_index: A tensor containing the indices of the most similar items
        :param index_most_similar_review: A tensor containing the top review indices of all the items
            (not just the most similar items)
        :param panda_object: DataFrame read from the review embedding file
        :return: a list[list[str]] where dim 0 is the top k most similar items and dim 1 holds the
            top reviews of the corresponding item
        """
        # NOTE(review): this reads a "Review" column, while the other cells of
        # this notebook use "review_text" — confirm the column exists.

        most_similar_item_index = most_similar_item_index.tolist()
        index_most_similar_review = index_most_similar_review.tolist()
        review_list = []
        for i in most_similar_item_index:
            # Negative entries are padding for items with fewer than k reviews.
            review_list.append([panda_object["Review"][j]
                                for j in index_most_similar_review[i] if j >= 0])

        return review_list

In [35]:
import torch
import pandas as pd

class InformationRetrievalRankCSV():
    """Run the search engine on every query and store the resulting ranks as CSV."""

    # The annotation is quoted so that defining this class does not require
    # NeuralSearchEngine to be defined first.
    def __init__(self, information_retrieval: "NeuralSearchEngine"):
        """
        :param information_retrieval: engine whose ``search_for_topk`` returns the ranked names
        """
        self.information_retrieval = information_retrieval

    def rank_item_to_csv(self, df_embedding:pd.DataFrame, matrix:torch.Tensor, item:torch.Tensor, source_file_path:str, destination_file_path:str):
        """
        Write, for every query, the rank of every restaurant column.

        The output has the columns of the source file except 'num_of_1'. The
        "Unnamed: 0" column holds the query; every other column holds the
        1-based rank of that column's name in the search result, or 0 when the
        name is not in the result.

        :param df_embedding: DataFrame handed to the search engine
        :param matrix: review embedding matrix handed to the search engine
        :param item: tensor with the number of reviews per restaurant; its number of
            elements is used as the number of restaurants to retrieve
        :param source_file_path: CSV with the queries in the "Unnamed: 0" column
        :param destination_file_path: CSV to write (without the pandas index)
        """
        df_query = pd.read_csv(source_file_path)
        query_column = "Unnamed: 0"
        output_columns = [col for col in df_query.columns if col != 'num_of_1']

        rows = []
        for query in df_query[query_column]:
            ranking = self.information_retrieval.search_for_topk(query, item.numel(), 1, df_embedding, matrix, item)

            # name -> 1-based rank; setdefault keeps the first (best) position
            # of a repeated name, as list.index would.
            rank_of = {}
            for position, name in enumerate(ranking, start=1):
                rank_of.setdefault(name, position)

            row = {col: rank_of.get(col, 0) for col in output_columns}
            row[query_column] = query
            rows.append(row)

        # Build the frame once instead of writing it cell by cell.
        new_df = pd.DataFrame(rows, columns=output_columns)
        new_df.to_csv(destination_file_path, index=False)

In [36]:
import pandas as pd

class MAP():
    """Compute the average precision of every query."""

    def find_MAP_score(self, relative_file_path:str, ranking_file_path:str, destination_file_path:str):
        """
        Write a CSV with a single "MAP" column holding the average precision of each query.

        Both input files are matched by position: row i and column j describe the
        same query and restaurant in each file. The first column (the query) and
        the last column of the relevance file are skipped.

        :param relative_file_path: CSV in which a cell equal to 1 marks a relevant restaurant
        :param ranking_file_path: CSV holding the rank of every restaurant
            (a rank of 0 or less is treated as "not retrieved")
        :param destination_file_path: CSV to write (without the pandas index)
        """
        df_relative = pd.read_csv(relative_file_path)
        df_ranking = pd.read_csv(ranking_file_path)

        n_rows, n_columns = df_relative.shape

        scores = []
        for i in range(n_rows):
            # Ranks of the restaurants that are relevant for query i
            relevant_ranks = [
                df_ranking.iloc[i, j]
                for j in range(1, n_columns - 1)
                if df_relative.iloc[i, j] == 1
            ]
            scores.append(self._average_precision(relevant_ranks))

        pd.DataFrame({"MAP": scores}).to_csv(destination_file_path, index=False)

    @staticmethod
    def _average_precision(relevant_ranks):
        """
        Average precision of one query.

        :param relevant_ranks: ranks of the relevant items
        :return: mean over all relevant items of the precision at the rank of each retrieved
            relevant item; 0.0 when there is no relevant item
        """
        if not relevant_ranks:
            # No relevant item: avoid dividing by zero.
            return 0.0

        # A relevant item that was not retrieved adds no precision but still
        # counts in the denominator.
        retrieved = sorted(rank for rank in relevant_ranks if rank > 0)

        precision_sum = 0.0
        for hits, rank in enumerate(retrieved, start=1):
            precision_sum += hits / rank

        return precision_sum / len(relevant_ranks)

In [37]:
import pandas as pd

class MRP():
    """Compute, for every query, the share of its R relevant items ranked within the top R."""

    def find_MRP_score(self, relative_file_path:str, ranking_file_path:str, destination_file_path:str):
        """
        Add an "MRP" column to the destination CSV.

        For a query with R relevant restaurants the score is the number of those
        restaurants ranked between 1 and R, divided by R.

        Both input files are matched by position: row i and column j describe the
        same query and restaurant in each file. The first column (the query) and
        the last column of the relevance file are skipped.

        :param relative_file_path: CSV in which a cell equal to 1 marks a relevant restaurant
        :param ranking_file_path: CSV holding the rank of every restaurant
            (a rank of 0 or less is treated as "not retrieved")
        :param destination_file_path: CSV that receives the "MRP" column; it is created
            when it does not exist yet
        """
        df_relative = pd.read_csv(relative_file_path)
        df_ranking = pd.read_csv(ranking_file_path)

        n_rows, n_columns = df_relative.shape

        scores = []
        for i in range(n_rows):
            # Ranks of the restaurants that are relevant for query i
            relevant_ranks = [
                df_ranking.iloc[i, j]
                for j in range(1, n_columns - 1)
                if df_relative.iloc[i, j] == 1
            ]
            scores.append(self._r_precision(relevant_ranks))

        # Add the scores as a new column of the evaluation file
        try:
            df = pd.read_csv(destination_file_path)
        except FileNotFoundError:
            df = pd.DataFrame(index=range(len(scores)))
        df["MRP"] = scores
        df.to_csv(destination_file_path, index=False)

    @staticmethod
    def _r_precision(relevant_ranks):
        """
        Share of the relevant items ranked within the top R, R being the number of relevant items.

        :param relevant_ranks: ranks of the relevant items
        :return: value between 0 and 1; 0.0 when there is no relevant item
        """
        total = len(relevant_ranks)
        if total == 0:
            # No relevant item: avoid dividing by zero.
            return 0.0

        # Ranks start at 1; a rank of 0 marks an item that was not retrieved
        # and must not count as a hit.
        hits = sum(1 for rank in relevant_ranks if 1 <= rank <= total)
        return hits / total

In [38]:
import os
import pandas as pd
import scipy.stats
import numpy as np

def check_file_exists(file_path):
    """
    Tell whether a path points to an existing regular file.

    :param file_path: path to check
    :return: True if ``file_path`` is an existing file, False otherwise
    """
    is_regular_file = os.path.isfile(file_path)
    return is_regular_file

class Evaluate():
    """
    Run the whole evaluation pipeline for one embedding model.

    Every intermediate result is cached in a file with a fixed name in the
    working directory and is only computed when that file is missing. The file
    names do not depend on the model name, so files written for one model are
    reused as they are when another model is evaluated in the same directory.
    """

    def __init__(self, model_name:str):
        """
        :param model_name: name or address used for both the language model and its tokenizer
        """
        self.embedding_model = BERT_model(model_name, model_name)
        self.query_related_restaurants = FindQueryRelatedRestaurants()
        self.embed_review = EmbeddingCreator(self.embedding_model)
        self.sort_embedding = SortEmbedding()
        self.matrix_creator = CreateMatrix()
        self.item_creator = CreateItemSeperation()
        self.information_retrieval = NeuralSearchEngine(self.embedding_model)
        self.information_retrieval_ranking = InformationRetrievalRankCSV(self.information_retrieval)
        self.get_map = MAP()
        self.get_mrp = MRP()

    def mean_confidence_interval(self, data, confidence=0.90):
        """
        Half-width of the Student-t confidence interval of the mean.

        :param data: sequence of numbers
        :param confidence: two-sided confidence level. Defaults to 0.90.
        :return: distance between the sample mean and either bound of the interval
        """
        samples = 1.0 * np.array(data)
        degrees_of_freedom = len(samples) - 1
        standard_error = scipy.stats.sem(samples)
        t_value = scipy.stats.t.ppf((1 + confidence) / 2., degrees_of_freedom)
        return standard_error * t_value

    def output(self, source_file_path:str):
        """
        Print the mean and the confidence interval of the "MAP" and "MRP" columns of a CSV.

        :param source_file_path: CSV holding one "MAP" and one "MRP" value per query
        """
        scores = pd.read_csv(source_file_path)

        MAP_mean = scores["MAP"].mean()
        MRP_mean = scores["MRP"].mean()
        MAP_half_width = self.mean_confidence_interval(scores["MAP"].tolist())
        MRP_half_width = self.mean_confidence_interval(scores["MRP"].tolist())

        print("The MAP is: ", MAP_mean)
        print("The MRP is: ", MRP_mean)
        print("Confidence interval of MAP is: ", MAP_mean-MAP_half_width, "to", MAP_mean+MAP_half_width)
        print("Confidence interval of MRP is: ", MRP_mean-MRP_half_width, "to", MRP_mean+MRP_half_width)

    def evaluate(self, file_path_PMD, file_path_review):
        """
        Build every missing intermediate file, then print the evaluation scores.

        :param file_path_PMD: CSV with the query / restaurant relevance labels
        :param file_path_review: CSV with the reviews to embed
        :return: None; returns early, after printing a message, when an input file is missing
        """
        # Relevance labels
        if not check_file_exists(file_path_PMD):
            # Nothing can be evaluated without the PMD file
            print("The PMD file does not exist")
            return None
        if not check_file_exists("PMD_Relativity.csv"):
            self.query_related_restaurants.build_true_labels(file_path_PMD, "PMD_Relativity.csv")

        if not check_file_exists("PMD_Relativity_With_R.csv"):
            self.query_related_restaurants.find_R_value("PMD_Relativity.csv", "PMD_Relativity_With_R.csv")

        # Review embeddings
        if not check_file_exists(file_path_review):
            # Nothing can be embedded without the review file
            print("The 50_restaurants_all_rates.csv file does not exist")
            return None
        if not check_file_exists("embedded_file.csv"):
            self.embed_review.embed(file_path_review, "embedded_file.csv")

        if not check_file_exists("embedded_file_sorted.csv"):
            self.sort_embedding.sort_embedding("embedded_file.csv", "embedded_file_sorted.csv")

        # Embedding matrix and number of reviews per restaurant
        if not check_file_exists("matrix.pt"):
            self.matrix_creator.create_matrix("embedded_file_sorted.csv", "matrix.pt")

        if not check_file_exists("item.pt"):
            self.item_creator.get_item_seperation("embedded_file_sorted.csv", "item.pt")

        matrix = torch.load("matrix.pt")
        item = torch.load("item.pt")
        sorted_reviews = pd.read_csv("embedded_file_sorted.csv")

        # Rank the restaurants for every query
        if not check_file_exists("information_retrieval_ranking.csv"):
            self.information_retrieval_ranking.rank_item_to_csv(sorted_reviews, matrix, item, "PMD_Relativity_With_R.csv", "information_retrieval_ranking.csv")

        # Both scores are written to the same file; MAP creates it and MRP adds a column
        if not check_file_exists("evaluation.csv"):
            self.get_map.find_MAP_score("PMD_Relativity_With_R.csv", "information_retrieval_ranking.csv", "evaluation.csv")
            self.get_mrp.find_MRP_score("PMD_Relativity_With_R.csv", "information_retrieval_ranking.csv", "evaluation.csv")

        self.output("evaluation.csv")

In [39]:
# Evaluate the given checkpoint (used for both the model and its tokenizer)
# on the PMD labels and the review file.
my_obj = Evaluate("sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco")
# Intermediate files found in the working directory are reused instead of recomputed.
my_obj.evaluate("PMD.csv", "50_restaurants_all_rates.csv")

All PyTorch model weights were used when initializing TFDistilBertModel.

All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


The MAP is:  0.5634705053035411
The MRP is:  0.4889448666178525
Confidence interval of MAP is:  0.520852925158267 to 0.6060880854488153
Confidence interval of MRP is:  0.44118275622156317 to 0.5367069770141419
