In [2]:
import os
from Utils.Review import (hotel_review_extraction_agent,
                          attraction_review_extraction_agent, 
                          restaurant_review_extraction_agent,
                          JSON_EXTRACTION_HOTEL_PROMPT,
                          JSON_EXTRACTION_ATTRACTION_PROMPT,
                          JSON_EXTRACTION_RESTAURANT_PROMPT
                          )
from langchain_openai import ChatOpenAI
from openai import OpenAI
from pydantic import BaseModel
from langchain.schema import (HumanMessage)
import json

# OpenAI API key, read once at import time from the OPEN_AI_API environment
# variable. os.getenv returns None when the variable is not set.
OPENAI_API_KEY = os.getenv('OPEN_AI_API')

#summarization
class ExtractAgent:
    """Summarize raw review text by sending a filled-in prompt to a chat LLM.

    The agent is configured with a model name. Only the names listed in
    ``SUPPORTED_MODELS`` get an LLM client; for any other name ``self.llm``
    stays ``None`` and ``summarize`` raises ``ValueError``.
    """

    # Model names this agent knows how to build a client for.
    SUPPORTED_MODELS = ('gpt-4o-2024-11-20',)

    def __init__(self,
            summarize_llm) -> None:
        """Store the model name and build the chat client when it is supported.

        Args:
            summarize_llm: Name of the chat model to use for summarization.
        """
        self.summarize_llm = summarize_llm
        # Always define the attribute so an unsupported model name produces a
        # clear error in summarize() rather than an AttributeError or
        # UnboundLocalError further down.
        self.llm = None

        if self.summarize_llm in self.SUPPORTED_MODELS:
            self.llm = ChatOpenAI(temperature=0,
                        model_name=self.summarize_llm,
                        openai_api_key=OPENAI_API_KEY)

    def summarize(self, reviews, extraction_agent):
        """Fill the prompt template with ``reviews`` and return the LLM reply.

        Args:
            reviews: Review text substituted into the template.
            extraction_agent: Prompt template containing a ``{reviews}``
                placeholder.

        Returns:
            The ``content`` of the message returned by the chat model.

        Raises:
            ValueError: If the agent was created with an unsupported model
                name, so no LLM client exists.
        """
        if self.llm is None:
            raise ValueError(
                f"Unsupported summarize_llm {self.summarize_llm!r}; "
                f"supported models: {', '.join(self.SUPPORTED_MODELS)}"
            )

        # build_agent_prompt() reads these two attributes, so they are set
        # on the instance before it is called.
        self.agent_prompt = extraction_agent
        self.reviews = reviews
        content = self.build_agent_prompt()
        return self.llm.invoke([HumanMessage(content)]).content

    def build_agent_prompt(self) -> str:
        """Return ``self.agent_prompt`` formatted with ``self.reviews``."""
        return self.agent_prompt.format(reviews = self.reviews)


#extract json    
# Hotel 
# Hotel "quality" entry: an integer rating plus the textual reason for it.
# Used as the type of HotelEvaluation.quality.
class rr_quality(BaseModel):
    rating: int
    reason: str

# Hotel "location" entry: an integer rating plus the textual reason for it.
# Used as the type of HotelEvaluation.location.
class rr_location(BaseModel):
    rating: int
    reason: str

# Hotel "service" entry: an integer rating plus the textual reason for it.
# Used as the type of HotelEvaluation.service.
# NOTE(review): a second class with this exact name is defined further down
# in the restaurant section; it rebinds the module-level name `rr_service`.
class rr_service(BaseModel):    
    rating: int
    reason: str

# Hotel "safety" entry: an integer rating plus the textual reason for it.
# Used as the type of HotelEvaluation.safety.
class rr_safety(BaseModel):
    rating: int
    reason: str

# Structured hotel evaluation with one rating/reason entry per aspect.
# The script section passes this class to JSONAgent.parse as the response
# format for hotel reviews.
class HotelEvaluation(BaseModel):
    quality: rr_quality
    location: rr_location
    service: rr_service
    safety: rr_safety

#attraction
# Attraction "family" entry: an integer level plus the textual reason for it.
# Used as the type of AttractionEvaluation.family_oriented.
class rr_family(BaseModel):
    level: int
    reason: str

# Attraction "history" entry: an integer level plus the textual reason for it.
# Used as the type of AttractionEvaluation.history_oriented.
class rr_history(BaseModel):
    level: int
    reason: str

# Attraction "activity" entry: an integer level plus the textual reason for it.
# Used as the type of AttractionEvaluation.activity_oriented.
class rr_activity(BaseModel):    
    level: int
    reason: str

# Attraction "nature" entry: an integer level plus the textual reason for it.
# Used as the type of AttractionEvaluation.nature_oriented.
class rr_nature(BaseModel):
    level: int
    reason: str

# Attraction "food" entry: an integer level plus the textual reason for it.
# Used as the type of AttractionEvaluation.food_oriented.
class rr_food(BaseModel):
    level: int
    reason: str

# Attraction "shopping" entry: an integer level plus the textual reason for it.
# Used as the type of AttractionEvaluation.shopping_oriented.
class rr_shopping(BaseModel):
    level: int
    reason: str

# Structured attraction evaluation with one level/reason entry per orientation.
# The script section passes this class to JSONAgent.parse as the response
# format for attraction reviews.
class AttractionEvaluation(BaseModel):
    family_oriented: rr_family
    history_oriented: rr_history
    activity_oriented: rr_activity
    nature_oriented: rr_nature
    food_oriented: rr_food
    shopping_oriented: rr_shopping

#restaurant
# Restaurant "flavor" entry: an integer rating plus the textual reason for it.
# Used as the type of RestaurantEvaluation.flavor.
class rr_flavor(BaseModel):
    rating: int
    reason: str

# Restaurant "freshness" entry: an integer rating plus the textual reason for it.
# Used as the type of RestaurantEvaluation.freshness.
class rr_freshness(BaseModel):
    rating: int
    reason: str

# Restaurant "service" entry: an integer rating plus the textual reason for it.
# Used as the type of RestaurantEvaluation.service.
# NOTE(review): this redefines `rr_service`, which the hotel section already
# declared with the same two fields. HotelEvaluation was built before this
# point and keeps its reference to the earlier class; only the module-level
# name is rebound here. Consider sharing one definition.
class rr_service(BaseModel):    
    rating: int
    reason: str

# Restaurant "environment" entry: an integer rating plus the textual reason for it.
# Used as the type of RestaurantEvaluation.environment.
class rr_environment(BaseModel):
    rating: int
    reason: str

# Restaurant "value" entry: an integer rating plus the textual reason for it.
# Used as the type of RestaurantEvaluation.value.
class rr_value(BaseModel):
    rating: int
    reason: str

# Structured restaurant evaluation with one rating/reason entry per aspect.
# The script section passes this class to JSONAgent.parse as the response
# format for restaurant reviews.
class RestaurantEvaluation(BaseModel):
    flavor: rr_flavor
    freshness: rr_freshness
    service: rr_service
    environment: rr_environment
    value: rr_value

class JSONAgent:
    """Turn free-text input into a JSON object via the OpenAI structured-output API."""

    def __init__(self) -> None:
        """Create the OpenAI client using the OPEN_AI_API environment variable."""
        self.client = OpenAI(
            api_key = os.getenv('OPEN_AI_API')
        )

    def parse(self, user_prompt, system_prompt, format):
        """Request a structured completion and return it as decoded JSON.

        Args:
            user_prompt: Text sent as the user message.
            system_prompt: Text sent as the system message.
            format: Response format handed to the API as ``response_format``
                (the script in this file passes pydantic model classes). The
                name shadows the builtin but is kept for keyword callers.

        Returns:
            The result of ``json.loads`` on the first choice's message content.

        Raises:
            ValueError: If the model refused the request or returned no
                content. ``json.JSONDecodeError`` (a ``ValueError`` subclass)
                is raised if the content is not valid JSON.
        """
        completion = self.client.beta.chat.completions.parse(
            model="gpt-4o-2024-11-20",
            temperature=0,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format=format,
        )
        message = completion.choices[0].message

        # A refusal leaves `content` empty; report it explicitly instead of
        # letting json.loads(None) fail with an unrelated TypeError.
        refusal = getattr(message, "refusal", None)
        if refusal:
            raise ValueError(f"Model refused to produce structured output: {refusal}")
        if message.content is None:
            raise ValueError("Model returned no content to decode as JSON")

        return json.loads(message.content)

def _extract_review_file(review_path, output_path, extraction_prompt, json_prompt, schema):
    """Summarize one review file, convert the summary to JSON and save it.

    Args:
        review_path: Path of the text file holding the raw reviews.
        output_path: Path of the JSON file to write.
        extraction_prompt: Summarization prompt template passed to
            ExtractAgent.summarize.
        json_prompt: System prompt passed to JSONAgent.parse.
        schema: Response format passed to JSONAgent.parse.

    Returns:
        The decoded JSON object that was written to ``output_path``.
    """
    with open(review_path, 'r') as file:
        reviews = file.read()

    agent = ExtractAgent(summarize_llm='gpt-4o-2024-11-20')
    summarization = agent.summarize(reviews, extraction_prompt)

    json_agent = JSONAgent()
    result = json_agent.parse(summarization, json_prompt, schema)

    with open(output_path, 'w') as outfile:
        json.dump(result, outfile)
    return result


if __name__ == '__main__':
    # One entry per review category:
    #   (review dir, output dir, summarization prompt, JSON prompt, schema,
    #    only_file, enabled)
    # `only_file` restricts a run to a single file (None = every file).
    # `enabled` switches a whole category on or off. Hotels and attractions
    # are off because the previous version of this script skipped every file
    # in those folders; restaurants are limited to one test file.
    jobs = [
        ('Dataset/Reviews/Hotels', 'preprocess/gpt4o/hotels',
         hotel_review_extraction_agent, JSON_EXTRACTION_HOTEL_PROMPT, HotelEvaluation,
         'AKA Rittenhouse Square_-D_3emciINpjvYXsHCf8OA.txt', False),
        ('Dataset/Reviews/Attractions', 'preprocess/gpt4o/attractions',
         attraction_review_extraction_agent, JSON_EXTRACTION_ATTRACTION_PROMPT, AttractionEvaluation,
         'Benjamin Franklin Museum_ubYUSAHPt4vNjZDEn44eTA.txt', False),
        ('Dataset/Reviews/Restaurants', 'preprocess/gpt4o/restaurants',
         restaurant_review_extraction_agent, JSON_EXTRACTION_RESTAURANT_PROMPT, RestaurantEvaluation,
         '&pizza - Walnut_wuH4TPUo8oJo4E59xZKsNg.txt', True),
    ]

    for (file_path, output_dir, extraction_prompt, json_prompt,
         schema, only_file, enabled) in jobs:
        if not enabled:
            continue

        for filename in os.listdir(file_path):
            if only_file is not None and filename != only_file:
                continue

            # splitext drops the real extension instead of assuming it is
            # exactly four characters long.
            stem = os.path.splitext(filename)[0]
            # `output` stays a module-level name so it can be inspected
            # after the run (e.g. from a following notebook cell).
            output = _extract_review_file(
                os.path.join(file_path, filename),
                os.path.join(output_dir, stem + '.json'),
                extraction_prompt,
                json_prompt,
                schema,
            )





You are an assistant designed to summarize reviews of businesses for travel planning purposes. Your goal is to provide **faithful, concise, and relevant information** based on the following reviews compiled into the txt file. Follow these principles:  

1. Focus on Travel-Relevant Details: Prioritize aspects crucial to travelers, such as food quality, location convenience (proximity to landmarks and transportation options), ambiance, cleanliness, service quality, amenities, and overall reliability.
2. Avoid Bias: Provide balanced evaluations that reflect the consensus of available reviews. Clearly indicate when opinions are mixed, and refrain from fabricating, exaggerating, or omitting key details.
3. Clarify Nuances: Highlight notable trends in feedback (e.g., "frequent mentions of slow service" or "consistent praise for convenient location") to provide an accurate overview.
4. Respect Context: Differentiate between subjective opinions (e.g., “some diners found the portions small”) and objective facts.

In [None]:
output