In [1]:
import os
from openai import OpenAI
from pydantic import BaseModel
import json

In [2]:
# Shared OpenAI client for every request in this notebook; the API key is
# looked up in the OPEN_AI_API environment variable (None when it is unset).
client = OpenAI(api_key=os.environ.get('OPEN_AI_API'))

In [3]:
# Structured-output schema for hotel review summaries. HotelEvaluation is the
# type passed as `response_format` to the chat-completions parse call, so the
# field names below are the keys of the saved summary JSON.
# Every aspect carries an integer `rating` and a text `reason`; clean_hotel()
# later maps ratings 1-5 to "1 star" .. "5 stars" and raises KeyError otherwise.
# NOTE(review): documentation is kept in `#` comments on purpose - pydantic
# presumably copies class docstrings into the generated JSON schema, which
# would change what is sent to the API. Confirm before adding docstrings.

# Rating/reason pair for the `quality` aspect.
class rr_quality(BaseModel):
    rating: int
    reason: str

# Rating/reason pair for the `location` aspect.
class rr_location(BaseModel):
    rating: int
    reason: str

# Rating/reason pair for the `service` aspect.
# NOTE: the name rr_service is defined a second time, with the same fields,
# in the restaurant schema cell further down.
class rr_service(BaseModel):    
    rating: int
    reason: str

# Rating/reason pair for the `safety` aspect.
class rr_safety(BaseModel):
    rating: int
    reason: str

# Top-level hotel summary: one rating/reason object per aspect.
class HotelEvaluation(BaseModel):
    quality: rr_quality
    location: rr_location
    service: rr_service
    safety: rr_safety

In [4]:
# Structured-output schema for attraction review summaries.
# AttractionEvaluation is the type passed as `response_format` to the
# chat-completions parse call, so the field names below are the keys of the
# saved summary JSON. Every aspect carries an integer `level` and a text
# `reason`. clean_attraction() maps levels 0-3 to no/low/medium/high, while the
# price cell maps `price_level` levels 1-4 to "$" .. "$$$$".
# NOTE(review): documentation is kept in `#` comments on purpose - pydantic
# presumably copies class docstrings into the generated JSON schema, which
# would change what is sent to the API. Confirm before adding docstrings.

# Level/reason pair for `family_oriented`.
class rr_family(BaseModel):
    level: int
    reason: str

# Level/reason pair for `history_oriented`.
class rr_history(BaseModel):
    level: int
    reason: str

# Level/reason pair for `activity_oriented`.
class rr_activity(BaseModel):    
    level: int
    reason: str

# Level/reason pair for `nature_oriented`.
class rr_nature(BaseModel):
    level: int
    reason: str

# Level/reason pair for `food_oriented`.
class rr_food(BaseModel):
    level: int
    reason: str

# Level/reason pair for `shopping_oriented`.
class rr_shopping(BaseModel):
    level: int
    reason: str

# Level/reason pair for `price_level` (converted to "$" symbols separately).
class rr_price_level(BaseModel):
    level: int
    reason: str

# Top-level attraction summary: one level/reason object per aspect.
class AttractionEvaluation(BaseModel):
    family_oriented: rr_family
    history_oriented: rr_history
    activity_oriented: rr_activity
    nature_oriented: rr_nature
    food_oriented: rr_food
    shopping_oriented: rr_shopping
    price_level: rr_price_level

In [5]:
# Structured-output schema for restaurant review summaries.
# RestaurantEvaluation is the type passed as `response_format` to the
# chat-completions parse call, so the field names below are the keys of the
# saved summary JSON. Every aspect carries an integer `rating` and a text
# `reason`; clean_restaurant() later maps ratings 1-5 to "1 star" .. "5 stars"
# and raises KeyError otherwise.
# NOTE(review): documentation is kept in `#` comments on purpose - pydantic
# presumably copies class docstrings into the generated JSON schema, which
# would change what is sent to the API. Confirm before adding docstrings.

# Rating/reason pair for the `flavor` aspect.
class rr_flavor(BaseModel):
    rating: int
    reason: str

# Rating/reason pair for the `freshness` aspect.
class rr_freshness(BaseModel):
    rating: int
    reason: str

# Rating/reason pair for the `healthy` aspect.
class rr_healthy(BaseModel):
    rating: int
    reason: str

# Rating/reason pair for the `service` aspect.
# NOTE: this rebinds the name rr_service that the hotel schema cell already
# defined; both definitions have the same fields.
class rr_service(BaseModel):    
    rating: int
    reason: str

# Rating/reason pair for the `enviornment` aspect. The spelling is part of the
# field name used in the saved data, so it must not be "corrected" in isolation.
class rr_enviornment(BaseModel):
    rating: int
    reason: str

# Rating/reason pair for the `value` aspect.
class rr_value(BaseModel):
    rating: int
    reason: str

# Top-level restaurant summary: one rating/reason object per aspect.
class RestaurantEvaluation(BaseModel):
    flavor: rr_flavor
    freshness: rr_freshness
    healthy: rr_healthy
    service: rr_service
    enviornment: rr_enviornment
    value: rr_value

In [5]:
# Load the system prompt for hotel review summarization into `system_prompt`.
# The encoding is explicit so the prompt decodes identically on every platform
# instead of depending on the locale's default encoding.
with open('Prompts/Review Summarization Prompts/system_prompt_hotel_JSON.txt', 'r', encoding='utf-8') as file:
    system_prompt = file.read()

In [15]:
folder_path = 'Datasets/Reviews/Hotels'
output_folder = 'Outputs/Reviews_Summarization/Hotels'

# Create the output folder up front so a missing directory is not discovered
# only after an API call has already been paid for.
os.makedirs(output_folder, exist_ok=True)

# Summarize every .txt review file with the hotel schema and save one JSON per file.
for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, filename)
    # Explicit UTF-8: review text should not depend on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        user_prompt = file.read()

    # generation into json format
    chat_completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=HotelEvaluation
    )

    message = chat_completion.choices[0].message
    if message.parsed is None:
        # Nothing was parsed (e.g. the model refused); skip this file instead
        # of aborting the whole batch with an AttributeError.
        print(f"Skipping {filename}: no parsed output (refusal: {getattr(message, 'refusal', None)})")
        continue

    # "<name>.txt" -> "<name>.json"
    filename_jsontype = filename[:-4] + '.json'
    result = json.loads(message.parsed.json())

    with open(os.path.join(output_folder, filename_jsontype), 'w') as file:
        json.dump(result, file)
        


In [7]:
# Load the system prompt for attraction review summarization into `system_prompt`
# (this replaces whatever prompt was loaded before).
# The encoding is explicit so the prompt decodes identically on every platform
# instead of depending on the locale's default encoding.
with open('Prompts/Review Summarization Prompts/system_prompt_attraction_JSON.txt', 'r', encoding='utf-8') as file:
    system_prompt = file.read()

In [8]:
folder_path = 'Datasets/Reviews/Attractions'
output_folder = 'Outputs/Reviews_Summarization/Attractions'

# Create the output folder up front so a missing directory is not discovered
# only after an API call has already been paid for.
os.makedirs(output_folder, exist_ok=True)

# Summarize every .txt review file with the attraction schema and save one JSON per file.
for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, filename)
    # Explicit UTF-8: review text should not depend on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        user_prompt = file.read()

    # generation into json format
    chat_completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=AttractionEvaluation
    )

    message = chat_completion.choices[0].message
    if message.parsed is None:
        # Nothing was parsed (e.g. the model refused); skip this file instead
        # of aborting the whole batch with an AttributeError.
        print(f"Skipping {filename}: no parsed output (refusal: {getattr(message, 'refusal', None)})")
        continue

    # "<name>.txt" -> "<name>.json"
    filename_jsontype = filename[:-4] + '.json'
    result = json.loads(message.parsed.json())

    with open(os.path.join(output_folder, filename_jsontype), 'w') as file:
        json.dump(result, file)
    

In [6]:
# Load the system prompt for restaurant review summarization into `system_prompt`
# (this replaces whatever prompt was loaded before).
# The encoding is explicit so the prompt decodes identically on every platform
# instead of depending on the locale's default encoding.
with open('Prompts/Review Summarization Prompts/system_prompt_restaurant_JSON.txt', 'r', encoding='utf-8') as file:
    system_prompt = file.read()

In [None]:
folder_path = 'Datasets/Reviews/Restaurants'
output_folder = 'Outputs/Reviews_Summarization/Restaurants'

# Create the output folder up front so a missing directory is not discovered
# only after an API call has already been paid for.
os.makedirs(output_folder, exist_ok=True)

# Summarize every .txt review file with the restaurant schema and save one JSON per file.
for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, filename)
    # Explicit UTF-8: review text should not depend on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        user_prompt = file.read()

    # generation into json format
    chat_completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=RestaurantEvaluation
    )

    message = chat_completion.choices[0].message
    if message.parsed is None:
        # Nothing was parsed (e.g. the model refused); skip this file instead
        # of aborting the whole batch with an AttributeError.
        print(f"Skipping {filename}: no parsed output (refusal: {getattr(message, 'refusal', None)})")
        continue

    # "<name>.txt" -> "<name>.json"
    filename_jsontype = filename[:-4] + '.json'
    result = json.loads(message.parsed.json())

    with open(os.path.join(output_folder, filename_jsontype), 'w') as file:
        json.dump(result, file)

In [30]:
# Load one restaurant's review text into `user_prompt` for a single trial request.
# The encoding is explicit so the review decodes identically on every platform
# instead of depending on the locale's default encoding.
with open("Datasets/Reviews/Restaurants/Bistrot La Minette_4_-IcMpkF_sBRHomWZHNzA.txt", 'r', encoding='utf-8') as file:
    user_prompt = file.read()

In [31]:
# Single structured-output request for the review currently held in
# `user_prompt`, using whichever `system_prompt` was loaded last.
chat_completion = client.beta.chat.completions.parse(
    response_format=RestaurantEvaluation,
    model="gpt-4o-mini",
    temperature=0,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
)

Merge the parsed review summaries into the base datasets

In [5]:
import json
import os

In [None]:
# Load the base attraction records; each line of the file is parsed as one JSON object.
attractions_base = []
with open("Datasets/Attractions_task1.jsonl", 'r') as file:
    # strip() trims surrounding whitespace (including the newline) before parsing
    attractions_base.extend(json.loads(raw_line.strip()) for raw_line in file)

In [94]:
def clean_attraction(attraction):
    """Flatten an attraction review summary into ``"<level>_<aspect>"`` labels.

    Args:
        attraction: dict mapping an aspect name (e.g. ``'family_oriented'``)
            to a dict that has an integer ``'level'`` entry.

    Returns:
        dict mapping each aspect name to ``"<no|low|medium|high>_<aspect>"``,
        in the input's key order. ``'price_level'`` is skipped: it is
        converted to "$" symbols on a 1-4 scale in a separate step, so it does
        not fit the 0-3 map used here (level 4 would raise ``KeyError``).

    Raises:
        KeyError: if an aspect has no ``'level'`` or its level is not 0-3.
    """
    level_map = {
        0: "no",
        1: "low",
        2: "medium",
        3: "high"
    }
    return {
        key: level_map[value['level']] + '_' + key
        for key, value in attraction.items()
        if key != 'price_level'
    }

In [96]:
folder_path = 'Outputs/Reviews_Summarization/Attractions'
attractions_final = []

# Merge each review summary into the base record(s) with the same business_id.
for filename in os.listdir(folder_path):
    if not filename.endswith('.json'):
        continue

    # The last 22 characters of the file stem are compared against business_id.
    review_business_id = filename[:-len('.json')][-22:]

    for attraction in attractions_base:
        if attraction['business_id'] != review_business_id:
            continue
        with open(f"{folder_path}/{filename}", 'r') as file:
            review_summary = json.load(file)
        # Summary labels win over same-named keys already on the base record.
        attractions_final.append({**attraction, **clean_attraction(review_summary)})

In [97]:
attractions_final[0]

{'business_id': 'DSGHZnDLRbR9wk-u1trRUQ',
 'name': '16th Street Seafood',
 'address': '1542 Tasker St',
 'latitude': 39.9307459,
 'longitude': -75.1715522,
 'stars': 4.0,
 'price': 'Unknown price',
 'attraction_type': 'Local Flavor',
 'perks': 'Seafood',
 'family_oriented': 'low_family_oriented',
 'history_oriented': 'no_history_oriented',
 'activity_oriented': 'low_activity_oriented',
 'nature_oriented': 'no_nature_oriented',
 'food_oriented': 'high_food_oriented',
 'shopping_oriented': 'no_shopping_oriented'}

In [98]:
# Save attractions_final as JSON Lines (one object per line).
# This overwrites the same file that attractions_base was loaded from.
with open('Datasets/Attractions_task1.jsonl', 'w') as file:
    for attraction in attractions_final:
        file.write(json.dumps(attraction) + '\n')

Extra step for attractions: if the original attraction has no price ('Unknown price'), replace it with the price level summarized by the LLM.

In [28]:
# Reload the attraction records; each line of the file is parsed as one JSON object.
attractions_base = []
with open("Datasets/Attractions_task1.jsonl", 'r') as file:
    for raw_line in file:
        # strip() trims surrounding whitespace (including the newline) before parsing
        attractions_base.append(json.loads(raw_line.strip()))

In [32]:
attractions_base[0]

{'business_id': 'DSGHZnDLRbR9wk-u1trRUQ',
 'name': '16th Street Seafood',
 'address': '1542 Tasker St',
 'latitude': 39.9307459,
 'longitude': -75.1715522,
 'stars': 4.0,
 'price': '$$',
 'attraction_type': 'Local Flavor',
 'perks': 'Seafood',
 'family_oriented': 'low family oriented',
 'history_oriented': 'no history oriented',
 'activity_oriented': 'low activity oriented',
 'nature_oriented': 'no nature oriented',
 'food_oriented': 'high food oriented',
 'shopping_oriented': 'no shopping oriented'}

In [31]:
folder_path = 'Outputs/Reviews_Summarization/Attractions'
# Price symbol for each price_level value found in the review summaries.
level_map = {
    1: "$",
    2: "$$",
    3: "$$$",
    4: "$$$$"
}

# Fill in missing prices from the review summaries. Only attractions_base is
# touched (its records are updated in place); attractions_final is left alone.
for filename in os.listdir(folder_path):
    if not filename.endswith('.json'):
        continue

    # The last 22 characters of the file stem are compared against business_id.
    business_id_fromReviews = filename[:-5][-22:]

    for attraction in attractions_base:
        if attraction['business_id'] != business_id_fromReviews:
            continue
        if attraction['price'] != 'Unknown price':
            # A real price is already present; no need to read the summary.
            continue
        with open(folder_path + '/' + filename, 'r') as file:
            attraction_fromReviews = json.load(file)
        # Raises KeyError if the summarized level is outside 1-4.
        attraction['price'] = level_map[attraction_fromReviews['price_level']['level']]

In [33]:
# Save attractions_base (with the filled-in prices) as JSON Lines,
# overwriting the file it was loaded from.
with open('Datasets/Attractions_task1.jsonl', 'w') as file:
    file.writelines(json.dumps(attraction) + '\n' for attraction in attractions_base)

In [99]:
# Load the base hotel records; each line of the file is parsed as one JSON object.
hotels_base = []
with open("Datasets/Hotels_task1.jsonl", 'r') as file:
    # strip() trims surrounding whitespace (including the newline) before parsing
    hotels_base.extend(json.loads(raw_line.strip()) for raw_line in file)

In [None]:
def clean_hotel(hotel):
    """Flatten a hotel review summary into ``"<n> star(s) <aspect>"`` labels.

    Args:
        hotel: dict mapping an aspect name (e.g. ``'quality'``) to a dict that
            has an integer ``'rating'`` entry.

    Returns:
        dict with the same keys, in the same order, whose values look like
        ``"3 stars quality"``.

    Raises:
        KeyError: if an aspect has no ``'rating'`` or its rating is not 1-5.
    """
    level_map = {1: "1 star", 2: "2 stars", 3: "3 stars", 4: "4 stars", 5: "5 stars"}

    parsed = {}
    for aspect, details in hotel.items():
        parsed[aspect] = level_map[details['rating']] + ' ' + aspect
    return parsed

In [104]:
folder_path = 'Outputs/Reviews_Summarization/Hotels'
hotels_final = []

# Merge each review summary into the base record(s) with the same business_id.
for filename in os.listdir(folder_path):
    if not filename.endswith('.json'):
        continue

    # The last 22 characters of the file stem are compared against business_id.
    review_business_id = filename[:-len('.json')][-22:]

    for hotel in hotels_base:
        if hotel['business_id'] != review_business_id:
            continue
        with open(f"{folder_path}/{filename}", 'r') as file:
            review_summary = json.load(file)
        # Summary labels win over same-named keys already on the base record.
        hotels_final.append({**hotel, **clean_hotel(review_summary)})

In [105]:
hotels_final[0]

{'business_id': '-D_3emciINpjvYXsHCf8OA',
 'name': 'AKA Rittenhouse Square',
 'address': '135 S 18th St',
 'latitude': 39.9503652,
 'longitude': -75.1704529,
 'stars': 4.5,
 'price': '$$$',
 'quality': '3 stars quality',
 'location': '5 stars location',
 'service': '4 stars service',
 'safety': '3 stars safety'}

In [107]:
# Save hotels_final as JSON Lines (one object per line).
# This overwrites the same file that hotels_base was loaded from.
with open('Datasets/Hotels_task1.jsonl', 'w') as file:
    for hotel in hotels_final:
        file.write(json.dumps(hotel) + '\n')

In [108]:
# Load the base restaurant records; each line of the file is parsed as one JSON object.
restaurants_base = []
with open("Datasets/Restaurants_task1.jsonl", 'r') as file:
    for raw_line in file:
        # strip() trims surrounding whitespace (including the newline) before parsing
        restaurants_base.append(json.loads(raw_line.strip()))

In [None]:
def clean_restaurant(restaurant):
    """Flatten a restaurant review summary into ``"<n> star(s) <aspect>"`` labels.

    Args:
        restaurant: dict mapping an aspect name (e.g. ``'flavor'``) to a dict
            that has an integer ``'rating'`` entry.

    Returns:
        dict with the same keys, in the same order, whose values look like
        ``"4 stars flavor"``.

    Raises:
        KeyError: if an aspect has no ``'rating'`` or its rating is not 1-5.
    """
    level_map = {
        1: "1 star",
        2: "2 stars",
        3: "3 stars",
        4: "4 stars",
        5: "5 stars",
    }
    return {
        aspect: level_map[details['rating']] + ' ' + aspect
        for aspect, details in restaurant.items()
    }

In [110]:
folder_path = 'Outputs/Reviews_Summarization/Restaurants'
restaurants_final = []

# Merge each review summary into the base record(s) with the same business_id.
for filename in os.listdir(folder_path):
    if not filename.endswith('.json'):
        continue

    # The last 22 characters of the file stem are compared against business_id.
    review_business_id = filename[:-len('.json')][-22:]

    for restaurant in restaurants_base:
        if restaurant['business_id'] != review_business_id:
            continue
        with open(f"{folder_path}/{filename}", 'r') as file:
            review_summary = json.load(file)
        # Summary labels win over same-named keys already on the base record.
        restaurants_final.append({**restaurant, **clean_restaurant(review_summary)})

In [111]:
restaurants_final[0]

{'business_id': 'wuH4TPUo8oJo4E59xZKsNg',
 'name': '&pizza - Walnut',
 'address': '430 Walnut St',
 'latitude': 39.9492067124,
 'longitude': -75.1659201062,
 'stars': 4.5,
 'good_for_meal': 'latenight, lunch, dinner',
 'price': '$$',
 'restaurant_type_1': 'Restaurants',
 'restaurant_type_2': 'Not Applicable',
 'cuisine_1': 'Pizza',
 'cuisine_2': 'Vegetarian',
 'perks': 'Vegan',
 'flavor': '4 stars flavor',
 'freshness': '5 stars freshness',
 'healthy': '4 stars healthy',
 'service': '3 stars service',
 'enviornment': '4 stars enviornment',
 'value': '4 stars value'}

In [114]:
# Save restaurants_final as JSON Lines (one object per line).
# This overwrites the same file that restaurants_base was loaded from.
with open('Datasets/Restaurants_task1.jsonl', 'w') as file:
    file.writelines(json.dumps(restaurant) + '\n' for restaurant in restaurants_final)

Change star ratings to words for hotels and restaurants; replace `_` with spaces for attractions.


In [None]:
import json

# Load the merged hotel records; each line of the file is parsed as one JSON object.
hotels = []
with open ('Datasets/Hotels_task1.jsonl', 'r') as file:
    # strip() trims surrounding whitespace (including the newline) before parsing
    hotels.extend(json.loads(raw_line.strip()) for raw_line in file)

In [4]:
hotels[0]

{'business_id': '-D_3emciINpjvYXsHCf8OA',
 'name': 'AKA Rittenhouse Square',
 'address': '135 S 18th St',
 'latitude': 39.9503652,
 'longitude': -75.1704529,
 'stars': 4.5,
 'price': '$$$',
 'quality': '3 stars quality',
 'location': '5 stars location',
 'service': '4 stars service',
 'safety': '3 stars safety'}

In [22]:
# Lookup keyed by the first 7 characters of values such as "3 stars quality".
# '1 star ' keeps its trailing space so the singular form is also 7 characters.
level_map = {
    '1 star ' : "bad",
    '2 stars' : "below average",
    '3 stars' : "average",
    '4 stars' : "good",
    '5 stars' : "excellent"
}

# Only these keys hold star labels; every other field is copied unchanged.
rated_keys = ('quality', 'location', 'service', 'safety')

# Rebuild each hotel with "<word> <aspect>" labels, e.g. "average quality".
# Raises KeyError if a rated value does not start with one of the keys above
# (for instance when the values were already converted).
hotels_updated = []
for hotel in hotels:
    hotels_updated.append({
        key: (level_map[value[:7]] + ' ' + key) if key in rated_keys else value
        for key, value in hotel.items()
    })

In [27]:
# Save hotels_updated as JSON Lines, overwriting the file `hotels` was loaded from.
with open('Datasets/Hotels_task1.jsonl', 'w') as file:
    file.writelines(json.dumps(hotel) + '\n' for hotel in hotels_updated)

In [28]:
# Load the merged restaurant records; each line of the file is parsed as one JSON object.
restaurants = []
with open ('Datasets/Restaurants_task1.jsonl', 'r') as file:
    for raw_line in file:
        # strip() trims surrounding whitespace (including the newline) before parsing
        restaurants.append(json.loads(raw_line.strip()))

In [29]:
restaurants[0]

{'business_id': 'wuH4TPUo8oJo4E59xZKsNg',
 'name': '&pizza - Walnut',
 'address': '430 Walnut St',
 'latitude': 39.9492067124,
 'longitude': -75.1659201062,
 'stars': 4.5,
 'good_for_meal': 'latenight, lunch, dinner',
 'price': '$$',
 'restaurant_type_1': 'Restaurants',
 'restaurant_type_2': 'Not Applicable',
 'cuisine_1': 'Pizza',
 'cuisine_2': 'Vegetarian',
 'perks': 'Vegan',
 'flavor': '4 stars flavor',
 'freshness': '5 stars freshness',
 'healthy': '4 stars healthy',
 'service': '3 stars service',
 'enviornment': '4 stars enviornment',
 'value': '4 stars value'}

In [30]:
# Lookup keyed by the first 7 characters of values such as "4 stars flavor".
# '1 star ' keeps its trailing space so the singular form is also 7 characters.
level_map = {
    '1 star ' : "bad",
    '2 stars' : "below average",
    '3 stars' : "average",
    '4 stars' : "good",
    '5 stars' : "excellent"
}

# Only these keys hold star labels; every other field is copied unchanged.
rated_keys = ('flavor', 'freshness', 'healthy', 'service', 'enviornment', 'value')

# Rebuild each restaurant with "<word> <aspect>" labels, e.g. "good flavor".
# Raises KeyError if a rated value does not start with one of the keys above
# (for instance when the values were already converted).
restaurants_updated = []
for restaurant in restaurants:
    restaurants_updated.append({
        key: (level_map[value[:7]] + ' ' + key) if key in rated_keys else value
        for key, value in restaurant.items()
    })

In [33]:
restaurants_updated[99]

{'business_id': 'nIAbuktMEzVjT4P9pG89rQ',
 'name': 'Buddakan',
 'address': '325 Chestnut St',
 'latitude': 39.9489186,
 'longitude': -75.1471667,
 'stars': 4.0,
 'good_for_meal': 'dinner',
 'price': '$$$',
 'restaurant_type_1': 'Restaurants',
 'restaurant_type_2': 'Not Applicable',
 'cuisine_1': 'Asian Fusion',
 'cuisine_2': 'Chinese',
 'perks': 'Not Applicable',
 'flavor': 'good flavor',
 'freshness': 'good freshness',
 'healthy': 'average healthy',
 'service': 'good service',
 'enviornment': 'excellent enviornment',
 'value': 'average value'}

In [None]:
# Save restaurants_updated as JSON Lines, overwriting the file `restaurants` was loaded from.
with open('Datasets/Restaurants_task1.jsonl', 'w') as file:
    file.writelines(json.dumps(restaurant) + '\n' for restaurant in restaurants_updated)

In [5]:
# Load the merged attraction records; each line of the file is parsed as one JSON object.
attractions = []
with open ('Datasets/Attractions_task1.jsonl', 'r') as file:
    # strip() trims surrounding whitespace (including the newline) before parsing
    attractions.extend(json.loads(raw_line.strip()) for raw_line in file)

In [16]:
attractions[1]

{'business_id': 'AGR4G6RCCjzmtVk0xjLTrg',
 'name': '76 Carriage Company',
 'address': '1350 Schuylkill Ave',
 'latitude': 39.9375741,
 'longitude': -75.2037673,
 'stars': 4.5,
 'price': 'Unknown price',
 'attraction_type': 'Tours',
 'perks': 'Local Flavor',
 'family_oriented': 'high_family_oriented',
 'history_oriented': 'high_history_oriented',
 'activity_oriented': 'low_activity_oriented',
 'nature_oriented': 'low_nature_oriented',
 'food_oriented': 'no_food_oriented',
 'shopping_oriented': 'no_shopping_oriented'}

In [14]:
# Only these keys hold "<level>_<aspect>" labels; every other field is copied unchanged.
oriented_keys = (
    'family_oriented',
    'history_oriented',
    'activity_oriented',
    'nature_oriented',
    'food_oriented',
    'shopping_oriented',
)

# Rebuild each attraction with underscores in the labels replaced by spaces,
# e.g. "high_family_oriented" -> "high family oriented".
attractions_updated = []
for attraction in attractions:
    attractions_updated.append({
        key: value.replace('_', ' ') if key in oriented_keys else value
        for key, value in attraction.items()
    })

In [17]:
attractions_updated[1]

{'business_id': 'AGR4G6RCCjzmtVk0xjLTrg',
 'name': '76 Carriage Company',
 'address': '1350 Schuylkill Ave',
 'latitude': 39.9375741,
 'longitude': -75.2037673,
 'stars': 4.5,
 'price': 'Unknown price',
 'attraction_type': 'Tours',
 'perks': 'Local Flavor',
 'family_oriented': 'high family oriented',
 'history_oriented': 'high history oriented',
 'activity_oriented': 'low activity oriented',
 'nature_oriented': 'low nature oriented',
 'food_oriented': 'no food oriented',
 'shopping_oriented': 'no shopping oriented'}

In [18]:
# Save attractions_updated as JSON Lines, overwriting the file `attractions` was loaded from.
with open('Datasets/Attractions_task1.jsonl', 'w') as file:
    file.writelines(json.dumps(attraction) + '\n' for attraction in attractions_updated)