# Data preprocessing for restaurants

#### Output files
1. item_metadata.json
2. top_all_restaurants_review_sorted.csv

In [None]:
!pip install transformers
!pip install faiss-cpu==1.7.4  # Version pinned: a FAISS database can only be loaded by the same FAISS version that wrote it

In [None]:
import json
import pandas as pd

class RetrieveTopRestaurant():
    """Select open restaurants of one city from a JSON-lines business file."""

    def get_top_restaurant(self, source_file_path:str, destination_file_path:str, num_restaurant:int,
                           city:str='Edmonton'):
        """Copy the first matching businesses into a new JSON-lines file.

        A business matches when its ``categories`` string contains
        "Restaurants", ``is_open`` equals 1, ``city`` equals ``city`` and both
        ``attributes`` and ``hours`` are present (not null).

        Args:
            source_file_path: JSON-lines file with one business object per line.
            destination_file_path: JSON-lines file that receives the matches.
            num_restaurant: stop after this many matches. A value that is never
                reached (for example -1 or 0) keeps every match.
            city: city name a business must have to be kept.
        """
        selected = []

        # Stream the file instead of loading every record into memory first;
        # this also lets us stop reading as soon as the limit is reached.
        with open(source_file_path, 'r') as source:
            for line in source:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue

                item = json.loads(line)
                categories = item['categories']
                if categories is None:
                    continue

                if ("Restaurants" in categories and item["is_open"] == 1 and item['city'] == city
                        and not pd.isna(item['attributes']) and not pd.isna(item['hours'])):
                    selected.append(item)
                    if len(selected) == num_restaurant:
                        break

        # Write the filtered records, one JSON object per line
        with open(destination_file_path, 'w') as destination:
            for item in selected:
                destination.write(json.dumps(item) + '\n')

In [None]:
import csv

class Json_to_CSV():
    """Flattens a JSON-lines business file into a CSV file."""

    def get_csv(self, source_file_path:str, destination_file_path:str):
        """Write one CSV row per JSON object found in the source file.

        The header row is written first. The ``business_id`` field is exported
        under the column name ``item_id``; every other column keeps the name of
        the JSON key it is read from.

        Args:
            source_file_path:str : input json file path (one object per line)
            destination_file_path:str : output csv file path
        """
        # JSON keys in the order their values appear in each CSV row.
        json_keys = ("business_id", "name", "address", "city", "state", "postal_code", "latitude",
                     "longitude", "stars", "review_count", "is_open", "attributes", "categories", "hours")
        # Only the first column is renamed on the way out.
        column_names = ["item_id"] + list(json_keys[1:])

        with open(destination_file_path, "w", newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(column_names)

            with open(source_file_path, 'r') as json_file:
                for raw_line in json_file:
                    record = json.loads(raw_line)
                    csv_writer.writerow([record[key] for key in json_keys])

In [None]:
import pandas as pd

class Remove_Weird_Stuff():
    """Normalises the stringified ``attributes`` column of the restaurant CSV."""

    # Attribute names as they appear in the input -> names used in the output.
    _RENAMED_KEYS = {
        "RestaurantsReservations": "HasReservations",
        "RestaurantsGoodForGroups": "GoodForGroups",
        "BestNights": "PopularNights",
        "RestaurantsPriceRange2": "PriceRange",
        "RestaurantsTableService": "HasTableService",
        "RestaurantsDelivery": "HasDelivery",
        "RestaurantsAttire": "Attire",
        "RestaurantsTakeOut": "HasTakeOut",
        "ByAppointmentOnly": "MustMakeReservation",
    }

    # Replacement used when one of these (already renamed) attributes is "None"/"none".
    _NONE_DEFAULTS = {
        "Music": "{dj: False, background_music: False, no_music: False, jukebox: False, live: False, video: False, karaoke: False}",
        "Ambience": "{touristy: False, hipster: False, romantic: False, intimate: False, trendy: False, upscale: False, classy: False, casual: False, divey: False}",
        "PopularNights": "{monday: False, tuesday: False, friday: False, wednesday: False, thursday: False, sunday: False, saturday: False}",
        "GoodForMeal": "{dessert: False, latenight: False, lunch: False, dinner: False, brunch: False, breakfast: False}",
        "BusinessParking": "{garage: False, street: False, validated: False, lot: False, valet: False}",
        "DietaryRestrictions": "{dairy-free: False, gluten-free: False, vegan: False, kosher: False, halal: False, soy-free: False, vegetarian: False}",
    }

    def _clean_attributes(self, attributes: dict) -> dict:
        """Return a cleaned copy of one restaurant's attribute dictionary.

        Keys listed in ``_RENAMED_KEYS`` are renamed, values equal to "{}" are
        dropped, "None"/"none" values are replaced by an all-False default when
        one is known for the key, and every other string value is stripped of
        ``u'`` prefixes and of single and double quote characters.
        """
        cleaned = {}
        for key, value in attributes.items():
            key = self._RENAMED_KEYS.get(key, key)
            if value == "{}":
                # An empty nested dictionary carries no information.
                continue
            if value in ("None", "none"):
                # Keys without a known default keep their "None"/"none" value.
                value = self._NONE_DEFAULTS.get(key, value)
            elif isinstance(value, str):
                value = value.replace("u'", "").replace("'", "").replace("\"", "")
            cleaned[key] = value
        return cleaned

    def clean(self, source_file_path:str):
        """
        Takes in a csv file and rewrites its ``attributes`` column in place so
        that every attribute has a normal name and value.

        Each cell of the column is expected to hold the ``str()`` form of a
        dictionary; cells that are not strings (for example empty cells read
        back as NaN) are left untouched.

        :param source_file_path: file path to the csv file, which is overwritten
        :return: None; the file no longer contains the weird u' markers
        """
        # Local import keeps this cell self-contained. literal_eval only
        # accepts Python literals, so file content can never run as code.
        import ast

        df = pd.read_csv(source_file_path)

        # Build the whole column at once rather than assigning cell by cell
        # through chained indexing, which pandas does not guarantee to write
        # back to the frame.
        df["attributes"] = [
            str(self._clean_attributes(ast.literal_eval(raw))) if isinstance(raw, str) else raw
            for raw in df["attributes"]
        ]

        df.to_csv(source_file_path, index=False)

In [None]:
class FilterReview():
    """Keeps only the reviews that belong to a given set of businesses."""

    def filter_review(self, metadata_source_file_path:str, reviews_source_file_path: str, destination_file_path:str):
        """Write the reviews of the listed businesses to a CSV file.

        Args:
            metadata_source_file_path: JSON-lines file; the ``business_id`` of
                every line defines which businesses are kept.
            reviews_source_file_path: JSON-lines file with one review per line.
            destination_file_path: CSV file to create. The review's
                ``business_id`` is written under the column name ``item_id``.
        """
        # A set gives constant-time membership tests; the reviews file is
        # scanned line by line, so a list lookup would cost O(businesses) each.
        with open(metadata_source_file_path, "r") as metadata_file:
            wanted_business_ids = {json.loads(line)["business_id"] for line in metadata_file}

        # Review keys in the order their values appear in each CSV row.
        review_keys = ("review_id", "user_id", "business_id", "stars", "useful", "funny", "cool", "text", "date")

        with open(destination_file_path, "w", newline='') as destination:
            writer = csv.writer(destination)
            writer.writerow(["review_id", "user_id", "item_id", "stars", "useful", "funny", "cool", "text", "date"])

            # Loop through the review file and write the matching reviews
            with open(reviews_source_file_path, 'r') as reviews_file:
                for line in reviews_file:
                    review = json.loads(line)
                    if review["business_id"] in wanted_business_ids:
                        writer.writerow([review[key] for key in review_keys])


In [None]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel
import tensorflow as tf
from tensorflow import keras
from keras import layers
import transformers
# transformers.logging.set_verbosity_error()

"""
   Taken from  https://github.com/D3Mlab/rir/blob/main/prefernce_matching/LM.py
"""

def create_model(BERT_name, from_pt=True):
    """Wrap a pretrained encoder in a compiled two-input Keras model.

    :param BERT_name: name or path handed to ``TFAutoModel.from_pretrained``
    :param from_pt: NOTE(review): accepted but not used - the encoder is always
        loaded with ``from_pt=True`` regardless of this argument; confirm
        whether it should be forwarded
    :return: tuple ``(model, input_ids_name, attention_mask_name)`` where the
        two names are the ``.name`` of the model's input layers
    """
    # Pretrained encoder; loaded before the input layers are created.
    bert_encoder = TFAutoModel.from_pretrained(BERT_name, from_pt=True)

    # Variable-length integer inputs: token ids first, attention mask second.
    ids_in = layers.Input(shape=(None,), dtype=tf.int32)
    mask_in = layers.Input(shape=(None,), dtype=tf.int32)

    # Only ids and mask are fed to the encoder; token type ids are not used.
    encoder_out = bert_encoder(input_ids=ids_in, attention_mask=mask_in)

    wrapped_model = keras.Model(inputs=[ids_in, mask_in], outputs=encoder_out)
    wrapped_model.compile()

    return wrapped_model, ids_in.name, mask_in.name

class BERT_model:
    """Pairs a tokenizer with the Keras model built by ``create_model``."""

    def __init__(self, BERT_name, tokenizer_name, from_pt=False):
        """
        :param BERT_name: name or address of the language model
        :param tokenizer_name: name or address of the tokenizer
        :param from_pt: passed on to ``create_model``
        """
        self.BERT_name = BERT_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        built = create_model(BERT_name, from_pt)
        self.bert_model, self.name1, self.name2 = built

    def embed(self, texts, strategy=None, bs=48, verbose=0):
        """Tokenize ``texts`` and return one vector per text.

        :param texts: texts handed to the tokenizer's ``batch_encode_plus``
        :param strategy: optional object providing ``scope()``; when given, the
            dataset is built and predicted inside that scope
        :param bs: batch size of the dataset fed to ``predict``
        :param verbose: forwarded to ``predict``
        :return: position 0 of ``last_hidden_state`` for every text, reshaped
            to ``(-1, 768)``
        """
        # Every text is truncated/padded to exactly 512 tokens.
        encoded = self.tokenizer.batch_encode_plus(
            texts,
            max_length=512,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True,
        )

        # Keyed by the names of the model's input layers.
        model_inputs = {
            self.name1: encoded['input_ids'],
            self.name2: encoded['attention_mask'],
        }
        autotune = tf.data.experimental.AUTOTUNE

        if strategy is None:
            dataset = tf.data.Dataset.from_tensor_slices(model_inputs).prefetch(
                buffer_size=autotune).batch(bs, drop_remainder=False)
            return self._first_position_vectors(dataset, verbose)

        with strategy.scope():
            dataset = tf.data.Dataset.from_tensor_slices(model_inputs).batch(bs, drop_remainder=False).prefetch(
                buffer_size=autotune)
            return self._first_position_vectors(dataset, verbose)

    def _first_position_vectors(self, dataset, verbose):
        """Run ``predict`` and keep position 0 of ``last_hidden_state`` per row."""
        predictions = self.bert_model.predict(dataset, verbose=verbose)
        return predictions['last_hidden_state'][:, 0, :].reshape(-1, 768)


In [None]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel
import pandas as pd
import os

class EmbedderCreator():
    """Embeds every review of a review CSV and stores the result in a CSV."""

    def __init__(self, model:"BERT_model"):
        # This model is used to convert the restaurant reviews into embeddings.
        # It only needs an ``embed(list_of_texts)`` method whose result offers
        # ``.tolist()``.
        self.embedding_model=model

    def embed(self, review_file_path:str, embedding_file_path:str, batch_size:int=128):
        """Embed the reviews batch by batch and append them to the output CSV.

        The output has the columns ``Review``, ``item_id`` and ``Embedding``
        (the vector written as a list). Rows are appended after every batch.
        If the output file already exists, the number of rows it holds is
        taken as the number of reviews already processed and work resumes
        after them; nothing is embedded when all reviews are already there.

        NOTE(review): an existing output file produced without the
        ``Embedding`` column should be deleted before resuming, otherwise the
        appended rows will not match its header.

        Args:
            review_file_path: CSV with at least the columns ``text`` and ``item_id``.
            embedding_file_path: CSV to create or to continue.
            batch_size: number of reviews embedded per model call.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        if os.path.exists(embedding_file_path):
            already_embedded = pd.read_csv(embedding_file_path).shape[0]
        else:
            # Create the file with only the header row
            pd.DataFrame(columns=["Review", "item_id", "Embedding"]).to_csv(embedding_file_path, index=False)
            already_embedded = 0

        review_dataset = pd.read_csv(review_file_path)
        texts = review_dataset["text"].tolist()
        item_ids = review_dataset["item_id"].tolist()
        size = len(texts)

        # Slicing by absolute position keeps resuming correct at any offset and
        # naturally handles the last (possibly shorter) batch.
        for start in range(already_embedded, size, batch_size):
            stop = min(start + batch_size, size)
            batch_texts = texts[start:stop]

            vectors = self.embedding_model.embed(batch_texts).tolist()
            batch_frame = pd.DataFrame({
                'Review': batch_texts,
                "item_id": item_ids[start:stop],
                "Embedding": vectors,
            })

            # Append the batch to the existing csv file
            batch_frame.to_csv(embedding_file_path, mode='a', header=False, index=False)
            print(f"Embedded {stop}/{size} reviews")

In [None]:
import pandas as pd

class SortMetaData():
    """Orders a metadata CSV by its ``item_id`` column."""

    def sort_meta_data(self, source_file, destination_file):
        """Read ``source_file``, sort its rows by ``item_id`` and save them.

        :param source_file: CSV file containing an ``item_id`` column
        :param destination_file: CSV file written without the index column
        """
        # Load, order by item_id and write back out in a single chain.
        pd.read_csv(source_file).sort_values(by='item_id').to_csv(destination_file, index=False)

In [None]:
import pandas as pd

class SortEmbedding():
    """Orders an embedding CSV by its ``item_id`` column."""

    def sort_embedding(self, source_file, destination_file):
        """Save a copy of ``source_file`` whose rows are sorted by ``item_id``.

        :param source_file: CSV file containing an ``item_id`` column
        :param destination_file: CSV file written without the index column
        """
        unsorted_rows = pd.read_csv(source_file)

        # Rows are ordered by the item they belong to.
        ordered_rows = unsorted_rows.sort_values(by='item_id')

        ordered_rows.to_csv(destination_file, index=False)

In [None]:
import pandas as pd
import torch

class CreateMatrix():
    """Turns the ``Embedding`` column of a CSV into one saved tensor."""

    def create_matrix(self, source_file, destination_file):
        """Stack every embedding of the CSV into a tensor and save it.

        :param source_file: CSV file whose ``Embedding`` column holds each
            vector written as a list of numbers, e.g. ``"[0.1, 0.2]"``
        :param destination_file: path handed to ``torch.save``; row ``i`` of
            the saved tensor is the embedding of CSV row ``i``
        """
        # Local import keeps this cell self-contained. json.loads parses the
        # list text without executing it (unlike eval).
        import json

        df = pd.read_csv(source_file)

        # Iterate over the column values directly instead of indexing by
        # position for every row.
        rows = [torch.tensor(json.loads(text)) for text in df["Embedding"]]

        torch.save(torch.stack(rows), destination_file)

In [None]:
import pandas as pd
import torch

class CreateItemSeperation():
    """Counts how many CSV rows belong to each ``item_id``."""

    def get_item_seperation(self, source_file_path:str, destination_file_path:str):
        """Save a tensor holding the number of rows per ``item_id``.

        :param source_file_path: CSV file containing an ``item_id`` column
        :param destination_file_path: path handed to ``torch.save``; the saved
            tensor lists the group sizes in the order ``groupby`` yields them
        """
        rows = pd.read_csv(source_file_path)

        # One count per distinct item_id, converted to a plain Python list.
        rows_per_item = rows.groupby('item_id').size().to_list()

        torch.save(torch.tensor(rows_per_item), destination_file_path)

In [None]:
import pandas as pd
import torch
import numpy as np
import faiss

class CreateDatabase():
    """Builds a FAISS inner-product index from the ``Embedding`` column of a CSV."""

    def create_database(self, source_embedding_file_path: str, faiss_destination_file_path: str,
                        dimension_size: int = 768):
        """Index every embedding of the CSV and write the index to disk.

        Args:
            source_embedding_file_path: CSV file whose ``Embedding`` column
                holds each vector written as a list of numbers.
            faiss_destination_file_path: path handed to ``faiss.write_index``.
            dimension_size: length every embedding must have.

        Raises:
            ValueError: if the embeddings do not all have ``dimension_size`` values.
        """
        # Local import keeps this cell self-contained. json.loads parses the
        # list text without executing it (unlike eval).
        import json

        df_embedding = pd.read_csv(source_embedding_file_path)

        # Create the index, uses dot product to measure similarity
        index = faiss.IndexFlatIP(dimension_size)

        if len(df_embedding) > 0:
            # FAISS stores float32 vectors; build one matrix and add it in a
            # single call instead of one call per row.
            vectors = np.array([json.loads(text) for text in df_embedding["Embedding"]], dtype=np.float32)
            if vectors.ndim != 2 or vectors.shape[1] != dimension_size:
                raise ValueError(
                    f"expected embeddings of length {dimension_size}, got array of shape {vectors.shape}"
                )
            index.add(np.ascontiguousarray(vectors))

        faiss.write_index(index, faiss_destination_file_path)

In [None]:
import pandas as pd
import json

class CreateMetadataStorage():
    """Converts the cleaned restaurant metadata CSV into a JSON-lines file."""

    # Numeric price level -> price label written to the output.
    _PRICE_RANGE_LABELS = {
        "1": "$0-$10",
        "2": "$11-$30",
        "3": "$31-60",
        "4": "$60+",
    }

    def _format_optional(self, data: dict):
        """Make the values of one attribute dictionary human readable, in place.

        String values equal to "true"/"false" (any casing) become "Yes"/"No" and
        a ``PriceRange`` of "1".."4" becomes a price label. Values that are not
        strings are left untouched.

        :param data: attribute dictionary of one restaurant
        :return: the same dictionary object
        """
        # Only existing keys are reassigned, so iterating while updating is safe.
        for key, value in data.items():
            if not isinstance(value, str):
                continue
            lowered = value.lower()
            if lowered == 'true':
                data[key] = 'Yes'
            elif lowered == 'false':
                data[key] = 'No'

        price = data.get('PriceRange')
        if isinstance(price, str) and price in self._PRICE_RANGE_LABELS:
            data['PriceRange'] = self._PRICE_RANGE_LABELS[price]

        return data

    def create_json(self, metadata_source_file: str, destination_file_path: str):
        """Write one JSON object per CSV row to ``destination_file_path``.

        The ``attributes`` column is exported as ``optional``; ``hours`` and
        ``optional`` are parsed from their dictionary text, ``categories`` is
        split on commas (surrounding spaces are kept) and the numeric columns
        are converted to float/int/bool.

        :param metadata_source_file: metadata CSV file
        :param destination_file_path: JSON-lines file to create
        """
        # Local import keeps this cell self-contained. literal_eval only
        # accepts Python literals, so file content can never run as code.
        import ast

        df = pd.read_csv(metadata_source_file)
        df = df.rename(columns={'attributes': 'optional'})

        df['latitude'] = df['latitude'].apply(float)
        df['longitude'] = df['longitude'].apply(float)
        df['stars'] = df['stars'].apply(float)
        df['review_count'] = df['review_count'].apply(int)
        df['is_open'] = df['is_open'].apply(bool)
        df['categories'] = df['categories'].apply(lambda x: list(x.split(",")))
        df['hours'] = df['hours'].apply(ast.literal_eval)
        df['optional'] = df['optional'].apply(ast.literal_eval)
        df['optional'] = df['optional'].apply(self._format_optional)
        df.to_json(destination_file_path, orient='records', lines=True)

In [None]:
class PreprocessData():
    """Runs the preprocessing steps that produce the restaurant output files."""

    def __init__(self, model_name:str):
        """Create the embedding model and one instance of every helper class.

        :param model_name: used both as model name and as tokenizer name of
            the ``BERT_model``
        """
        self.model_name=model_name
        self.embedding_model=BERT_model(self.model_name, self.model_name)
        self.top_restaurants_retriever=RetrieveTopRestaurant()
        self.convert_to_CSV=Json_to_CSV()
        self.clean = Remove_Weird_Stuff()
        self.find_review=FilterReview()
        self.create_embedding=EmbedderCreator(self.embedding_model)
        self.sort_meta_data=SortMetaData()
        self.sort_embedding=SortEmbedding()
        self.matrix=CreateMatrix()
        self.item=CreateItemSeperation()
        self.vector_database = CreateDatabase()
        self.metadata_database = CreateMetadataStorage()

    def preprocess_data(self, source_restaurant_info:str, source_restaurant_review:str):
        """Produce the intermediate and final files in the working directory.

        Every step except the cleaning step is skipped when its output file
        already exists; the cleaning step runs whenever the restaurant CSV
        exists.

        :param source_restaurant_info: business info data file
        :param source_restaurant_review: reviews data file
        """
        # "all" restaurants are kept: the label goes into the file names and
        # -1 is the limit handed to the retriever.
        label = "all"
        limit = -1

        prefix = "top_" + label + "_restaurants"
        restaurants_json = prefix + ".json"
        restaurants_csv = prefix + ".csv"
        restaurants_sorted_csv = prefix + "_sorted.csv"
        reviews_csv = prefix + "_review.csv"
        reviews_sorted_csv = prefix + "_review_sorted.csv"
        metadata_json = "item_metadata.json"

        # Filter out all the restaurants in Edmonton that are open and have opening hours
        if not os.path.isfile(restaurants_json):
            self.top_restaurants_retriever.get_top_restaurant(source_restaurant_info, restaurants_json, limit)

        # Change the json file into a csv file
        if not os.path.isfile(restaurants_csv):
            self.convert_to_CSV.get_csv(restaurants_json, restaurants_csv)

        # Clean up the csv file (done on every run in which the file exists)
        if os.path.isfile(restaurants_csv):
            self.clean.clean(restaurants_csv)

        # Find all the reviews for all the filtered restaurants
        if not os.path.isfile(reviews_csv):
            self.find_review.filter_review(restaurants_json, source_restaurant_review, reviews_csv)

        # Sort the reviews by restaurant
        if not os.path.isfile(reviews_sorted_csv):
            self.sort_file_by_item_id(reviews_csv, reviews_sorted_csv)

        # Sort the meta data for all filtered restaurants
        if not os.path.isfile(restaurants_sorted_csv):
            self.sort_file_by_item_id(restaurants_csv, restaurants_sorted_csv)

        # Data preprocessing for metadata storage
        if not os.path.isfile(metadata_json):
            self.metadata_database.create_json(restaurants_sorted_csv, metadata_json)


    def sort_file_by_item_id(self, source_file: str, destination_file: str):
        """Save a copy of the CSV ``source_file`` with rows sorted by ``item_id``."""
        pd.read_csv(source_file).sort_values(by='item_id').to_csv(destination_file, index=False)


In [None]:
# Entry point: build the pipeline and run it on the two input files.
if __name__ == "__main__":
    # The string is used both as model name and as tokenizer name.
    data_preprocessing=PreprocessData("sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco")
    # The two arguments are placeholders; replace them with the paths of the
    # business info file and of the reviews file before running.
    data_preprocessing.preprocess_data("< business info data file from yelp academic dataset >", "< reviews data file from yelp academic dataset >")