In [17]:
import ast
import json
from typing import List, Optional

import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field

In [18]:
# Initialize the module-level OpenAI client (used by call_openai_api below).
# NOTE(review): the api_key value is a placeholder string; a real key has to be
# supplied before running. Consider reading it from the environment rather than
# committing it to source.
client = OpenAI(api_key="your API key")

In [19]:
# Schema for a single review category in the structured model response.
#   category                - category name (the description lists the five expected values)
#   group_keywords          - keywords assigned to this category
#   representative_sentence - optional Korean summary sentence; defaults to None
class CategoryData(BaseModel):
    category: str = Field(..., description="Category name (e.g., 맛, 위생, 서비스, 분위기, 가성비)")
    group_keywords: List[str] = Field(..., description="List of keywords belonging to this category.")
    representative_sentence: Optional[str] = Field(None, description="A summary sentence for the category in Korean.")

In [20]:
# Top-level schema for one (store, sentiment) analysis result; passed as the
# response_format when calling the OpenAI API in call_openai_api.
#   store_name - name of the restaurant
#   sentiment  - "Positive" or "Negative" per the field description
#   categories - one CategoryData entry per category
class RestaurantKeywords(BaseModel):
    store_name: str = Field(..., description="Name of the restaurant.")
    sentiment: str = Field(..., description="Sentiment type (Positive or Negative).")
    categories: List[CategoryData] = Field(..., description="List of categorized keywords with summaries.")

In [21]:
def load_data(file_path):
    """Read a UTF-8 encoded CSV file and return it as a DataFrame.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The pandas DataFrame produced by ``pd.read_csv``.

    Raises:
        Exception: Whatever ``pd.read_csv`` raised; the error is printed
            first and then re-raised unchanged.
    """
    try:
        return pd.read_csv(file_path, encoding='utf-8')
    except Exception as err:
        # Report the problem, then let the caller see the original exception.
        print(f"Error loading file: {err}")
        raise

In [37]:
def call_openai_api(store_name, keywords, sentiment):
    """Send a request to the OpenAI API and parse the response into a Pydantic model.

    Builds a prompt that asks the model to (1) classify the keywords into five
    categories (taste, service, cleanliness, cost-effectiveness, ambiance) and
    (2) write a Korean representative sentence per category, then requests a
    completion whose response format is ``RestaurantKeywords``.

    Args:
        store_name: Interpolated into the "store_name" field of the JSON
            template inside the prompt.
        keywords: Interpolated at the end of the prompt via f-string
            formatting (so its ``str()`` form is what the model sees).
        sentiment: Interpolated into the "sentiment" field of the JSON
            template; callers in this file pass "Positive" or "Negative".

    Returns:
        The parsed response converted to a dict with ``model_dump()``, or
        ``None`` if any exception was raised (the error is printed).
    """
    try:
        # The prompt is an f-string: doubled braces ({{ }}) are literal braces of
        # the JSON template, single braces interpolate the arguments.
        prompt = f"""
       You are tasked with analyzing a list of positive and negative keywords related to restaurant reviews and performing two tasks: “Category Classification” and “Representative Sentence Generation”. Follow these instructions carefully:

        **Input**:
        - A list of positive and negative keywords extracted from restaurant reviews.
        
        **Output**:
        Perform the following two tasks:
        
        ---
        
        **Task 1: Category Classification**
        0. Do Not Miss Any Category
        1. Classify each keyword into one of the following five categories:
           - **맛 (Taste)**: Keywords related to food quality, menu, flavor, freshness, or presentation.
           - **서비스 (Service)**: Keywords related to staff behavior, attentiveness, speed, or customer experience.
           - **위생 (Cleanliness)**: Keywords related to the cleanliness of the restaurant, utensils, or environment.
           - **가성비 (Cost-Effectiveness)**: Keywords related to the price, portion size, or overall value for money.
           - **분위기 (Ambiance)**: Keywords related to the restaurant’s atmosphere, interior design, noise levels, or overall mood.
        2. Save the classified keywords under the corresponding category in the "Group Keywords" field.
        3. If a keyword does not match any of the five categories, exclude it from classification and do not include it in any category.
        
        ---
        
        **Task 2: Representative Sentence Generation**
        1. For each category from Task 1, generate a **Representative Sentence** in Korean based on the keywords in "Group Keywords".
        2. The **Representative Sentence** must:
           - Be a complete and natural sentence in Korean.
           - Summarize the most semantically frequent and important information from the "Group Keywords".
           - Specifically include **relevant details** such as mentioned food items, dishes, or specific features that appear in the keywords (e.g., 김치찌개, 계란말이, 기름지다, 날파리가 날라다닌다).
           - Reflect the overall sentiment (positive/negative) of the keywords in the category.
        3. **Do not generate a Representative Sentence** for a category if it has fewer than 5 keywords in "Group Keywords". Leave the field empty in such cases.
        
        ---
        
        **Additional Guidelines**:
        - Avoid simply listing or rephrasing the keywords; instead, create a meaningful summary that incorporates key details from the keywords.
        - Use concise and contextually accurate expressions to highlight the unique points of the reviews, particularly specific items or features mentioned in the keywords.
        Return the results in the following JSON format:
        {{
            "store_name": "{store_name}",
            "sentiment": "{sentiment}",
            "categories": [
                {{
                    "category": "맛",
                    "group_keywords": ["<Keyword1>", "<Keyword2>", ...],
                    "representative_sentence": "<Representative Sentence in Korean>"
                }},
                {{
                    "category": "위생",
                    "group_keywords": ["<Keyword1>", "<Keyword2>", ...],
                    "representative_sentence": "<Representative Sentence in Korean>"
                }},
                {{
                    "category": "서비스",
                    "group_keywords": ["<Keyword1>", "<Keyword2>", ...],
                    "representative_sentence": "<Representative Sentence in Korean>"
                }},
                {{
                    "category": "분위기",
                    "group_keywords": ["<Keyword1>", "<Keyword2>", ...],
                    "representative_sentence": "<Representative Sentence in Korean>"
                }},
                {{
                    "category": "가성비",
                    "group_keywords": ["<Keyword1>", "<Keyword2>", ...],
                    "representative_sentence": "<Representative Sentence in Korean>"
                }}
            ]
        }}
        Keywords: {keywords}
        """
        # Request a completion whose reply is parsed into RestaurantKeywords.
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", 
                 "content": "You are a helpful assistant that classifies restaurant keywords into predefined categories and generates concise summaries for each category."},
                {"role": "user", "content": prompt}
            ],
            response_format=RestaurantKeywords,  # Specify the response format as a Pydantic model
        )

        # Only the first choice is used.
        # NOTE(review): presumably `parsed` can be None (e.g. on a refusal); in that
        # case model_dump() would raise and fall into the except below — confirm.
        parsed = completion.choices[0].message.parsed
        return parsed.model_dump()  # Convert the Pydantic object to a dict
    except Exception as e:
        # Best-effort: any failure is printed and reported to the caller as None.
        print(f"Error calling OpenAI API: {e}")
        return None

In [38]:
def _parse_keywords(raw):
    """Convert one keyword cell into a Python object without executing code.

    Args:
        raw: Cell value from the keyword columns; normally the string form of
            a Python list literal (e.g. ``"['tasty', 'fresh']"``).

    Returns:
        The literal value encoded by the string. Containers that are already
        parsed are returned as a list, and missing (None/NaN) or blank cells
        yield an empty list.

    Raises:
        ValueError: If the cell is not a valid Python literal or has an
            unsupported type.
    """
    if isinstance(raw, (list, tuple, set)):
        return list(raw)
    if isinstance(raw, str):
        text = raw.strip()
        if not text:
            return []
        try:
            # SECURITY: the cell comes from an external CSV file, so it must not be
            # passed to eval(); literal_eval only accepts Python literals.
            return ast.literal_eval(text)
        except (ValueError, SyntaxError) as err:
            raise ValueError(f"Cannot parse keyword cell {raw!r}: {err}") from err
    # pandas represents empty CSV cells as NaN (or None in object columns).
    if raw is None or pd.isna(raw):
        return []
    raise ValueError(f"Unsupported keyword cell type: {type(raw).__name__}")


def process_keywords(dataframe):
    """Process the keyword data through the OpenAI API.

    For every row, the positive and negative keyword cells are parsed and, when
    non-empty, sent to ``call_openai_api`` with the matching sentiment label.

    Args:
        dataframe: DataFrame with the columns 'StoreName', 'Positive_Keywords'
            and 'Negative_Keywords'.

    Returns:
        A list with one result dict per successful API call, positive before
        negative for each store. Calls that returned a falsy value (failure)
        are skipped.

    Raises:
        ValueError: If a keyword cell cannot be parsed as a Python literal.
    """
    results = []
    for _, row in dataframe.iterrows():
        store_name = row['StoreName']
        # Parse both cells before any API call so a malformed row fails fast,
        # before spending a request on it.
        keyword_sets = [
            ("Positive", _parse_keywords(row['Positive_Keywords'])),
            ("Negative", _parse_keywords(row['Negative_Keywords'])),
        ]
        for sentiment, keywords in keyword_sets:
            if not keywords:
                continue
            response = call_openai_api(store_name, keywords, sentiment)
            if response:
                results.append(response)

    return results

In [39]:
def save_to_csv(results, output_path):
    """Flatten the analysis results into rows and write them to a CSV file.

    Each category of each result becomes one row with the columns StoreName,
    Sentiment, Category, Group Keywords (comma-joined) and Representative
    Sentence (empty string when missing).

    Args:
        results: List of result dicts with the keys "store_name", "sentiment"
            and "categories".
        output_path: Destination path of the CSV file.

    Returns:
        None. When ``results`` is empty, a message is printed and nothing is
        written.
    """
    if not results:
        print("No data to save. Exiting...")
        return

    records = []
    for entry in results:
        store, mood = entry["store_name"], entry["sentiment"]
        records.extend(
            {
                "StoreName": store,
                "Sentiment": mood,
                "Category": cat["category"],
                "Group Keywords": ", ".join(cat["group_keywords"]),
                "Representative Sentence": cat["representative_sentence"] or "",
            }
            for cat in entry["categories"]
        )

    table = pd.DataFrame(records)
    print("Saving the following data:")
    print(table.head())  # debugging output
    # utf-8-sig writes a BOM at the start of the file.
    table.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"Results saved to {output_path}")

In [40]:
def main():
    """Run the full pipeline: load the keyword CSV, analyze it, save the output.

    Reads "store_pos_neg_keywords.csv", passes the data through
    ``process_keywords`` and writes the results to
    "store_analysis_output.csv" via ``save_to_csv``.
    """
    source_csv = "store_pos_neg_keywords.csv"
    target_csv = "store_analysis_output.csv"

    # load -> process -> save, evaluated inside-out in that order.
    save_to_csv(process_keywords(load_data(source_csv)), target_csv)

In [None]:
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()