In [119]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [120]:
import json
import pandas as pd

class RetrieveTopRestaurant():
    def get_top_restaurant(self, source_file_path:str, destination_file_path:str, num_restaurant:int):
        # Open the original JSON file and load each line
        with open(source_file_path, 'r') as file:
            data = [json.loads(line) for line in file]

        filtered_data=[]

        counter=0

        # Filter out lines with the desired tag
        for item in data:
            if(item['categories']==None):
                continue
            if("Restaurants" in item['categories'] and item['city'] == "Edmonton" and item["is_open"]==1 and 
            not pd.isna(item['attributes']) and not pd.isna(item['hours'])):
                counter+=1
                try:
                    filtered_data.append(item)
                except:
                    print(filtered_data)
                
                if(counter == num_restaurant):
                    break

        # Write the filtered lines to a new JSON file
        with open(destination_file_path, 'w') as file:
            for item in filtered_data:
                file.write(json.dumps(item) + '\n')

In [121]:
import csv

class Json_to_CSV():
    def get_csv(self, source_file_path:str, destination_file_path:str):
        """This function takes in a json file as an input(source_file_path) and outputs a csv file(destination_file_path)

        Args:
            source_file_path:str : input json file path
            destination_file_path:str : output csv file path
        """
        with open(destination_file_path, "w", newline='') as file:
            writer=csv.writer(file)
            writer.writerow(["business_id", "name", "address", "city", "state", "postal_code", "latitude",
                            "longitude", "stars", "review_count", "is_open", "attributes", "categories", "hours"])

            with open(source_file_path, 'r') as f:
                for line in f:
                    # Load the JSON object from the line
                    json_obj = json.loads(line)

                    business_id=json_obj["business_id"]
                    name=json_obj["name"]
                    address=json_obj["address"]
                    city=json_obj["city"]
                    state=json_obj["state"]
                    postal_code=json_obj["postal_code"]
                    latitude=json_obj["latitude"]
                    longitude=json_obj["longitude"]
                    stars=json_obj["stars"]
                    review_count=json_obj["review_count"]
                    is_open=json_obj["is_open"]
                    attributes=json_obj["attributes"]
                    categories=json_obj["categories"]
                    hours=json_obj["hours"]

                    writer.writerow([business_id, name, address, city, state, postal_code, latitude, longitude,
                                    stars, review_count, is_open, attributes, categories, hours])

In [122]:
import pandas as pd
import ast

class Remove_Weird_Stuff():
    def clean(self, source_file_path:str):
        """
        Takes in a csv file and change all the attributes values inside the file into normal values

        :param source_file_path: file path to the file
        :return: A file without the weird u'
        """
        df = pd.read_csv(source_file_path)

        size = len(df["attributes"])

        for i in range(size):
            for key, value in ast.literal_eval(df["attributes"][i]).items():
                if(value == "{}"):
                    dic = ast.literal_eval(df["attributes"][i])
                    dic.pop(key)
                else:
                    value = value.replace("u'", "")
                    value = value.replace("'", "")
                    value = value.replace("\"", "")

                    dic = ast.literal_eval(df["attributes"][i])
                    dic[key] = value

                df["attributes"][i] = str(dic)

        df.to_csv(source_file_path, index=False)

In [123]:
class FilterReview():
    def filter_review(self, source_file_path:str, destination_file_path:str):
        #Records the list of unique business_id
        list_of_unique_business_id=[]

        #Loop through the restaurant info file to collect all the business_id
        with open(source_file_path, "r") as f:
            for line in f:
                dict=json.loads(line)
                list_of_unique_business_id.append(dict["business_id"])

        with open(destination_file_path, "w", newline='') as file:
            writer=csv.writer(file)
            writer.writerow(["review_id", "user_id", "business_id", "stars", "useful", "funny", "cool", "text", "date"])
            #Loop through the restaurant review file and write to our output file
            with open('/content/drive/MyDrive/business_review.json', 'r') as f:
                for line in f:
                    dict=json.loads(line)
                    if(dict["business_id"] in list_of_unique_business_id):
                        review_id=dict["review_id"]
                        user_id=dict["user_id"]
                        business_id=dict["business_id"]
                        stars=dict["stars"]
                        useful=dict["useful"]
                        funny=dict["funny"]
                        cool=dict["cool"]
                        text=dict["text"]
                        date=dict["date"]
                        writer.writerow([review_id, user_id, business_id, stars, useful, funny, cool, text, date])


In [124]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel
import tensorflow as tf
from tensorflow import keras
from keras import layers
import transformers
transformers.logging.set_verbosity_error()

"""
   Taken from  https://github.com/D3Mlab/rir/blob/main/prefernce_matching/LM.py
"""

def create_model(BERT_name, from_pt=True):
    ## BERT encoder
    encoder = TFAutoModel.from_pretrained(BERT_name, from_pt=True)

    ## Model
    input_ids = layers.Input(shape=(None,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(None,), dtype=tf.int32)
    # token_type_ids = layers.Input(shape=(None,), dtype=tf.int32)

    embedding = encoder(
        # input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        input_ids=input_ids, attention_mask=attention_mask
    )

    model = keras.Model(
        # inputs=[input_ids, attention_mask, token_type_ids],
        inputs=[input_ids, attention_mask],
        outputs=embedding, )

    model.compile()
    return model, input_ids.name, attention_mask.name

class BERT_model:
    def __init__(self, BERT_name, tokenizer_name, from_pt=False):
        """
        :param BERT_name: name or address of language prefernce_matching
        :param tokenizer_name: name or address of the tokenizer
        """
        self.BERT_name = BERT_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.bert_model, self.name1, self.name2 = create_model(BERT_name, from_pt)
           
    def embed(self, texts, strategy=None, bs=48, verbose=0):
        tokenized_review = self.tokenizer.batch_encode_plus(
            texts,
            max_length=512,
            add_special_tokens=True,
            truncation=True,
            # truncation_strategy='longest_first',
            padding="max_length",
            return_token_type_ids=True,
        )

        data = {self.name1: tokenized_review['input_ids'],
                self.name2: tokenized_review['attention_mask'],
                # 'input_3': tokenized_review['token_type_ids']
                }
        
        if strategy is not None:
            with strategy.scope():
                dataset = tf.data.Dataset.from_tensor_slices(data).batch(bs, drop_remainder=False).prefetch(
                    buffer_size=tf.data.experimental.AUTOTUNE)
                outputs = self.bert_model.predict(dataset, verbose=verbose)
                return outputs['last_hidden_state'][:, 0, :].reshape(-1, 768)
        else:
            dataset = tf.data.Dataset.from_tensor_slices(data).prefetch(
                buffer_size=tf.data.experimental.AUTOTUNE).batch(bs, drop_remainder=False)
            outputs = self.bert_model.predict(dataset, verbose=verbose)
            return outputs['last_hidden_state'][:, 0, :].reshape(-1, 768)


In [125]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel
import pandas as pd
import os

class EmbedderCreator():
    def __init__(self, model:BERT_model):
        #This model is the model used to convert the restaurant review file into
        #embeddings
        self.embedding_model=model

    def embed(self, review_file_path:str, embedding_file_path:str):
        df = ""
        index = 0
        if(os.path.exists(embedding_file_path)):
            df = pd.read_csv(embedding_file_path)
            index = df.shape[0]
        else:
            # Creating the column title
            df = pd.DataFrame({
            'Review': [],
            'Embedding': [],
            "Business_ID":[]
            })

            # Writing the DataFrame to CSV
            df.to_csv(embedding_file_path, index=False)

        review_dataset=pd.read_csv(review_file_path)

        size=len(review_dataset["text"])

        # Writing the embedding into the file
        df = pd.read_csv(embedding_file_path)

        for i in range(index, size):
            review=review_dataset["text"][i]
            embedding=self.embedding_model.embed([review])
            embedding=torch.tensor(embedding)
            embedding=embedding.squeeze(0)
            embedding=embedding.tolist()

            # Defining new data as a dictionary
            new_data = {'Review': review_dataset["text"][i], 'Embedding': str(embedding), "Business_ID":review_dataset["business_id"][i]}

            # Transforming it into a DataFrame
            new_data_df = pd.DataFrame(new_data, index=[0])

            # Appending the new data
            df = pd.concat([df, new_data_df], ignore_index=False)

            if(i%100==0):
                print(i/100)
                df.to_csv(embedding_file_path, index=False)

        # Writing the updated data back to CSV
        df.to_csv(embedding_file_path, index=False)


In [126]:
import pandas as pd

class SortMetaData():
    def sort_meta_data(self, source_file, destination_file):
        # load your data
        df = pd.read_csv(source_file)

        # sort the dataframe by the column of interest
        df = df.sort_values(by='business_id')

        # save your data back to csv
        df.to_csv(destination_file, index=False)

In [127]:
import pandas as pd

class SortEmbedding():
    def sort_embedding(self, source_file, destination_file):
        # load your data
        df = pd.read_csv(source_file)

        # sort the dataframe by the column of interest
        df = df.sort_values(by='Business_ID')

        # save your data back to csv
        df.to_csv(destination_file, index=False)

In [128]:
import pandas as pd
import torch
import ast

class CreateMatrix():
    def create_matrix(self, source_file, destination_file):
        # Loop through the sorted embedding csv file
        df=pd.read_csv(source_file)

        container=[]

        size=len(df["Embedding"])

        for i in range(size):
            embedding=ast.literal_eval(df["Embedding"][i])
            embedding=torch.tensor(embedding)
            container.append(embedding)

        container=torch.stack(container)

        torch.save(container, destination_file)

In [129]:
import pandas as pd
import torch

class CreateItemSeperation():
    def get_item_seperation(self, source_file_path:str, destination_file_path:str):
        # Load your CSV file into a pandas DataFrame
        df = pd.read_csv(source_file_path)

        # Group by the specified column and count the number of rows in each group
        value_counts = df.groupby('Business_ID').size()

        #Convert it into a list
        value_counts = value_counts.to_list()

        #Change it into a tensor
        tensor = torch.tensor(value_counts)

        torch.save(tensor, destination_file_path)

In [130]:
class PreprocessData():
    def __init__(self, model_name:str):
        self.model_name=model_name
        self.embedding_model=BERT_model(self.model_name, self.model_name)
        self.top_restaurants_retriever=RetrieveTopRestaurant()
        self.convert_to_CSV=Json_to_CSV()
        self.clean = Remove_Weird_Stuff()
        self.find_review=FilterReview()
        self.create_embedding=EmbedderCreator(self.embedding_model)
        self.sort_meta_data=SortMetaData()
        self.sort_embedding=SortEmbedding()
        self.matrix=CreateMatrix()
        self.item=CreateItemSeperation()

    def preprocess_data(self, source_restaurant_info:str, source_restaurant_review:str):
        size = "all"
        size_str = str(size)
        if(size == "all"):
            size = -1
        
        #Filter out all the restaurants in Edmonton that are open and has opening hours
        if not os.path.isfile("top_"+size_str+"_restaurants.json"):
            self.top_restaurants_retriever.get_top_restaurant(source_restaurant_info, "top_"+size_str+"_restaurants.json", size)
        
        #Change the json file into a csv file
        if not os.path.isfile("top_"+size_str+"_restaurants.csv"):
            self.convert_to_CSV.get_csv("top_"+size_str+"_restaurants.json", "top_"+size_str+"_restaurants.csv")

        #Clean up the csv file
        if os.path.isfile("top_"+size_str+"_restaurants.csv"):
            self.clean.clean("top_"+size_str+"_restaurants.csv")
        
        #Find all the reviews for all the filtered restaurants
        if not os.path.isfile("top_"+size_str+"_restaurants_review.csv"):
            self.find_review.filter_review("top_"+size_str+"_restaurants.json", "top_"+size_str+"_restaurants_review.csv")

        #Sort the meta data for all filtered restaurants
        if not os.path.isfile("top_"+size_str+"_restaurants_sorted.csv"):
            self.sort_meta_data.sort_meta_data("top_"+size_str+"_restaurants.csv", "top_"+size_str+"_restaurants_sorted.csv")
        
        #Embed all the reviews
        if not os.path.isfile("top_"+size_str+"_restaurants_review_embedding.csv"):
            self.create_embedding.embed("top_"+size_str+"_restaurants_review.csv", "top_"+size_str+"_restaurants_review_embedding.csv")

        #Sort all the embedding
        if not os.path.isfile("top_"+size_str+"_restaurants_review_embedding_sorted.csv"):
            self.sort_embedding.sort_embedding("top_"+size_str+"_restaurants_review_embedding.csv", "top_"+size_str+"_restaurants_review_embedding_sorted.csv")

        #Get the matrix and item tensor
        if not os.path.isfile("matrix.pt"+size_str):
            self.matrix.create_matrix("top_"+size_str+"_restaurants_review_embedding_sorted.csv", "matrix.pt"+size_str)

        if not os.path.isfile("item.pt"+size_str):
            self.item.get_item_seperation("top_"+size_str+"_restaurants_review_embedding_sorted.csv", "item.pt"+size_str)

In [131]:
if __name__ == "__main__":
    data_preprocessing=PreprocessData("sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco")
    data_preprocessing.preprocess_data("/content/drive/MyDrive/business_info.json", "/content/drive/MyDrive/business_review.json")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["attributes"][i] = str(dic)


Music
0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0


KeyboardInterrupt: ignored