In [5]:
import os
import json

def loadbase():
    """Load the base restaurant, hotel and attraction records from JSONL files.

    Reads ``Dataset/Base/Restaurants.jsonl``, ``Dataset/Base/Hotels.jsonl`` and
    ``Dataset/Base/Attractions.jsonl`` (relative to the current working
    directory) and decodes one JSON value per non-blank line.

    Returns:
        tuple: ``(restaurants, hotels, attractions)``; each element is a list
        of the decoded values in file order.

    Raises:
        FileNotFoundError: if one of the three files does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    def _read_jsonl(path):
        # Blank lines (most commonly a trailing newline at the end of the
        # file) are skipped; handing '' to json.loads would raise.
        # The encoding is pinned so decoding does not depend on the platform
        # default.
        with open(path, 'r', encoding='utf-8') as file:
            return [json.loads(line) for line in file if line.strip()]

    restaurants = _read_jsonl('Dataset/Base/Restaurants.jsonl')
    hotels = _read_jsonl('Dataset/Base/Hotels.jsonl')
    attractions = _read_jsonl('Dataset/Base/Attractions.jsonl')

    return restaurants, hotels, attractions

def clean_attraction(attraction):
    """Turn per-aspect numeric levels into short descriptive phrases.

    ``attraction`` maps an aspect name to a mapping that has a ``'level'``
    entry (0-3). In the result every aspect name has its underscores replaced
    by spaces, and its value is the level word (``no`` / ``low`` / ``medium``
    / ``high``) followed by a space and that same readable name.

    Raises:
        KeyError: if an entry has no ``'level'`` or the level is not 0-3.
    """
    level_map = dict(enumerate(("no", "low", "medium", "high")))

    # First pass: pull every level out, so a missing 'level' is reported
    # before any level is translated.
    extracted = [(aspect, details['level']) for aspect, details in attraction.items()]

    # Second pass: build the readable label and its phrase.
    parsed = {}
    for aspect, level in extracted:
        label = aspect.replace('_', ' ')
        parsed[label] = level_map[level] + ' ' + label

    return parsed

def parseAttraction(model,attractions_base):
    """Merge review-derived attraction levels into the base attraction records.

    Every ``*.json`` file in ``preprocess/{model}/attractions`` is matched to
    base records by business id. The id is the last 22 characters of the file
    name once its 5-character ``.json`` suffix is removed. Each base record
    whose ``'business_id'`` equals that id is merged with the result of
    ``clean_attraction`` for the file; on a key clash the review-derived value
    wins. The merged records are written, one JSON object per line, to
    ``Dataset/{model}/attractions.jsonl`` and also returned.

    Args:
        model: name of the sub-directory under ``preprocess/`` and
            ``Dataset/`` to read from and write to.
        attractions_base: iterable of dicts, each with a ``'business_id'`` key.

    Returns:
        list: the merged dicts, ordered by review file name.

    Raises:
        FileNotFoundError: if ``preprocess/{model}/attractions`` is missing.
    """
    folder_path = f'preprocess/{model}/attractions'
    output_dir = f'Dataset/{model}'

    # Index the base records once instead of rescanning the whole list for
    # every file. A list per id keeps duplicate base records, so each one
    # still yields its own merged row.
    base_by_id = {}
    for attraction in attractions_base:
        base_by_id.setdefault(attraction['business_id'], []).append(attraction)

    attractions_final = []
    # os.listdir returns entries in arbitrary order; sorting makes the output
    # file reproducible from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        business_id_fromReviews = filename[:-5][-22:]
        matches = base_by_id.get(business_id_fromReviews)
        if not matches:
            # Files without a matching base record are never opened.
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            attraction_fromReviews = json.load(file)
        attraction_fromReviews_short = clean_attraction(attraction_fromReviews)
        for attraction in matches:
            attractions_final.append({**attraction, **attraction_fromReviews_short})

    # The per-model output directory may not exist on a first run.
    os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_dir}/attractions.jsonl', 'w', encoding='utf-8') as file:
        for item in attractions_final:
            json.dump(item, file)
            file.write('\n')

    return attractions_final


def clean_hotel(hotel):
    """Turn per-aspect numeric ratings into short descriptive phrases.

    ``hotel`` maps an aspect name to a mapping that has a ``'rating'`` entry
    (1-5). The result keeps each aspect name unchanged as the key; its value
    is the rating word (``bad`` ... ``excellent``) followed by a space and
    the aspect name.

    Raises:
        KeyError: if an entry has no ``'rating'`` or the rating is not 1-5.
    """
    rating_words = ("bad", "below average", "average", "good", "excellent")
    level_map = dict(enumerate(rating_words, start=1))

    # Collect every rating first, so a missing 'rating' is reported before
    # any rating is translated.
    collected = [(aspect, details['rating']) for aspect, details in hotel.items()]

    return {aspect: level_map[rating] + ' ' + aspect for aspect, rating in collected}

def parseHotel(model,hotels_base):
    """Merge review-derived hotel ratings into the base hotel records.

    Every ``*.json`` file in ``preprocess/{model}/hotels`` is matched to base
    records by business id. The id is the last 22 characters of the file name
    once its 5-character ``.json`` suffix is removed. Each base record whose
    ``'business_id'`` equals that id is merged with the result of
    ``clean_hotel`` for the file; on a key clash the review-derived value
    wins. The merged records are written, one JSON object per line, to
    ``Dataset/{model}/hotels.jsonl`` and also returned.

    Args:
        model: name of the sub-directory under ``preprocess/`` and
            ``Dataset/`` to read from and write to.
        hotels_base: iterable of dicts, each with a ``'business_id'`` key.

    Returns:
        list: the merged dicts, ordered by review file name.

    Raises:
        FileNotFoundError: if ``preprocess/{model}/hotels`` is missing.
    """
    folder_path = f'preprocess/{model}/hotels'
    output_dir = f'Dataset/{model}'

    # Index the base records once instead of rescanning the whole list for
    # every file. A list per id keeps duplicate base records, so each one
    # still yields its own merged row.
    base_by_id = {}
    for hotel in hotels_base:
        base_by_id.setdefault(hotel['business_id'], []).append(hotel)

    hotels_final = []
    # os.listdir returns entries in arbitrary order; sorting makes the output
    # file reproducible from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        business_id_fromReviews = filename[:-5][-22:]
        matches = base_by_id.get(business_id_fromReviews)
        if not matches:
            # Files without a matching base record are never opened.
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            hotel_fromReviews = json.load(file)
        hotel_fromReviews_short = clean_hotel(hotel_fromReviews)
        for hotel in matches:
            hotels_final.append({**hotel, **hotel_fromReviews_short})

    # The per-model output directory may not exist on a first run.
    os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_dir}/hotels.jsonl', 'w', encoding='utf-8') as file:
        for item in hotels_final:
            json.dump(item, file)
            file.write('\n')

    return hotels_final


# Driver: load the base records, then build and write the merged attraction
# and hotel datasets for one model.
if __name__ == '__main__':
    # Selects the preprocess/<model>/ input folders and the Dataset/<model>/
    # output folder used by parseAttraction and parseHotel.
    model = 'gpt4o'
    # restaurants_base is loaded here but not used further in this cell.
    restaurants_base, hotels_base, attractions_base = loadbase()

    # Each call writes its .jsonl file and returns the merged records.
    attractions_final = parseAttraction(model,attractions_base)
    hotels_final = parseHotel(model,hotels_base)

In [6]:
# Display the merged hotel records returned by parseHotel in the cell above.
hotels_final

[{'business_id': '-D_3emciINpjvYXsHCf8OA',
  'name': 'AKA Rittenhouse Square',
  'address': '135 S 18th St',
  'latitude': 39.9503652,
  'longitude': -75.1704529,
  'stars': 4.5,
  'price': '$$$',
  'quality': 'good quality',
  'location': 'excellent location',
  'service': 'average service',
  'safety': 'average safety'},
 {'business_id': 'oZxSL3PFX15Lzr8cjszmgQ',
  'name': 'AKA University City',
  'address': '2929 Walnut St',
  'latitude': 39.951936,
  'longitude': -75.183911,
  'stars': 4.0,
  'price': 'Unknown price',
  'quality': 'good quality',
  'location': 'excellent location',
  'service': 'good service',
  'safety': 'good safety'},
 {'business_id': '3QPAh9VvYNTnqAFgBeBcng',
  'name': 'Alexander Inn',
  'address': '301 S 12th St',
  'latitude': 39.946396818,
  'longitude': -75.1609669488,
  'stars': 4.5,
  'price': '$$',
  'quality': 'average quality',
  'location': 'excellent location',
  'service': 'good service',
  'safety': 'good safety'},
 {'business_id': '6I_DA3uqOox50rB