In [1]:
import pandas as pd
import json
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Module-level OpenAI client, constructed with no arguments; `llm` uses it for every request.
# NOTE(review): presumably the API key is picked up from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

In [3]:
def get_joined_reviews(json_data):
    """Join the text of every review of one restaurant into a single string.

    Args:
        json_data: Dict describing one restaurant. Its "reviews" entry is
            expected to be a list of review dicts, each holding the review
            body under review["text"]["text"].

    Returns:
        The review texts in their original order, separated by a blank
        line. An empty string when there is no usable review.
    """
    # A missing or null "reviews" entry means "no reviews" rather than an error.
    reviews = json_data.get("reviews") or []

    texts = []
    for review in reviews:
        # Skip malformed entries instead of crashing: a review that is not a
        # dict, or whose "text" is null / a bare string, carries no nested body.
        if not isinstance(review, dict):
            continue
        text_field = review.get("text")
        if not isinstance(text_field, dict):
            continue
        body = text_field.get("text")
        # Only strings can be joined; anything else would break str.join.
        if isinstance(body, str):
            texts.append(body)

    # Each review becomes its own paragraph.
    return "\n\n".join(texts)

In [4]:
def build_prompt(raw_reviews):
    """Return the summarization prompt with `raw_reviews` placed after "REVIEWS:".

    The instruction text is stripped of its surrounding newlines before the
    reviews are inserted, and the finished prompt is stripped once more, so
    leading/trailing whitespace never reaches the caller.
    """
    # Only the template is parsed by str.format; the inserted reviews are
    # used verbatim, so braces inside them are harmless.
    filled = """
You are an expert at summarizing text. You will be presented with some reviews of some restaurants. 
I need you to summarize the information in a single paragraph with between 7 and 10 sentences. 
Try to include the names of the dishes, i.e., eggs, lasagna, steak, among others. 
Also include the type of meal they mention, i.e., breakfast, brunch, dinner. 
I want the text to be redacted as a general review from the restaurant.

REVIEWS: 
{reviews}
""".strip().format(reviews=raw_reviews)

    return filled.strip()

In [5]:
def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text used as the content of the only message in the request.
        model: Model identifier forwarded to the chat-completions call.

    Returns:
        The `content` of the message in the first choice of the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])

    # Only the first choice is read.
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
def summarized(raw_reviews):
    """Return the model-written summary of one restaurant's reviews.

    Pipeline: join the review texts of `raw_reviews` (a restaurant record),
    wrap them in the summarization prompt, and send that prompt to `llm`
    with its default model.
    """
    return llm(build_prompt(get_joined_reviews(raw_reviews)))

In [7]:
def data_cleaner(restaurants_data):
    """Reduce raw restaurant records to flat dicts carrying a review summary.

    Restaurants whose "reviews" entry is missing or empty are skipped. For
    every other restaurant `summarized` is called once to fill the "reviews"
    field of the output record.

    Args:
        restaurants_data: Iterable of restaurant dicts. "id" and
            displayName["text"] are read unconditionally; every other field
            is optional and becomes None when absent.

    Returns:
        A list of dicts with the keys id, nationalPhoneNumber,
        formattedAddress, rating, websiteUri, regularOpeningHours,
        displayName, primaryType, editorialSummary and reviews.

    Raises:
        KeyError: If a restaurant with reviews lacks "id" or
            displayName["text"].
    """
    clean_data = []

    # tqdm wraps the iterable only to show progress.
    for restaurant in tqdm(restaurants_data, desc="Processing restaurants"):
        # Skip restaurants with no reviews: there is nothing to summarize.
        if not restaurant.get("reviews"):
            continue

        # `or {}` also covers an explicit null value, which a plain
        # .get(key, {}) default would let through and then fail on.
        opening_hours = restaurant.get("regularOpeningHours") or {}
        editorial_summary = restaurant.get("editorialSummary") or {}

        clean_data.append({
            "id": restaurant["id"],
            "nationalPhoneNumber": restaurant.get("nationalPhoneNumber"),
            "formattedAddress": restaurant.get("formattedAddress"),
            "rating": restaurant.get("rating"),
            "websiteUri": restaurant.get("websiteUri"),
            "regularOpeningHours": opening_hours.get("weekdayDescriptions", None),
            "displayName": restaurant["displayName"]["text"],
            "primaryType": restaurant.get("primaryType"),
            "editorialSummary": editorial_summary.get("text", None),
            # Evaluated last, so a missing required key fails before the
            # summary is requested.
            "reviews": summarized(restaurant)
        })

    return clean_data

In [8]:
# Path of the raw restaurants JSON file that the following cell opens and parses.
data_unclean = 'data_unclean.json'

In [9]:
# Parse the raw restaurant records. The encoding is stated explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# mis-decode non-ASCII characters in the JSON file.
with open(data_unclean, 'r', encoding='utf-8') as archivo:
    restaurants_data = json.load(archivo)

In [10]:
# Build the cleaned records. Every restaurant that has reviews goes through `summarized`,
# so this cell is slow: the run recorded below took 1:21:44 for 2039 records.
clean_data = data_cleaner(restaurants_data)

Processing restaurants: 100%|█████████████████████████████████████████████████████████████████████████████████████| 2039/2039 [1:21:44<00:00,  2.41s/it]


In [12]:
# One row per cleaned restaurant; columns come from the keys of the dicts in `clean_data`.
df = pd.DataFrame(clean_data)

In [14]:
# Persist the cleaned table; index=False leaves the DataFrame index out of the file.
# NOTE(review): regularOpeningHours presumably holds lists, which a CSV can only keep as text — confirm this is intended.
df.to_csv('clean_data.csv', index=False)