In [None]:
import pandas as pd
import os
from langchain_openai import ChatOpenAI
import ast
import requests
import time
import numpy as np
from pathlib import Path
import re
from haversine import haversine, Unit
from math import radians, cos
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain_core.output_parsers import StrOutputParser
from haversine import haversine, Unit

In [None]:
# Credentials are read from the environment when available, so real keys never
# have to be written into (and committed with) the notebook. The placeholders
# below are only a fallback: export the variables or replace the placeholders
# before running the cells that call OpenAI or Mapbox.
# OpenAI key: an already exported OPENAI_API_KEY is kept, never overwritten.
os.environ.setdefault("OPENAI_API_KEY", "your_api_key")
# Mapbox access token (get one from https://www.mapbox.com/), taken from the
# MAPBOX_ACCESS_TOKEN environment variable when it is set.
geo_api_key = os.environ.get("MAPBOX_ACCESS_TOKEN", "your_api_key")

In [None]:
# Source tables; adjust the two file names if the CSVs live somewhere else.
# Later cells read the 'city' and 'business_id' columns of the first file and
# the 'business_id' and 'text' columns of the second.
df, df_tip = (pd.read_csv(csv_name) for csv_name in ('business.csv', 'tip.csv'))

In [None]:
# Number of businesses per city, limited to the first ten entries of the count table.
df['city'].value_counts().iloc[:10]

In [None]:
# City to process; it must match a value of the 'city' column exactly.
selected_city = 'Santa Barbara'
# Keep only the businesses located in that city.
df_city = df.loc[df['city'].eq(selected_city)]
df_city.head()

In [None]:
# Collapse the tips table to one row per business, with all of that business's
# tip texts gathered into a Python list.
tip_preprocessed = (
    df_tip
    .groupby('business_id')['text']
    .agg(list)
    .reset_index()
)
# Inner join: businesses of the selected city that have no tips are dropped.
merged_df = df_city.merge(tip_preprocessed, on='business_id', how='inner')
merged_df.head()

In [None]:
# Businesses whose 'text' entry is a list holding more than five tips.
filtered_df = merged_df[
    merged_df['text'].apply(
        lambda tips: len(tips) > 5 if isinstance(tips, list) else False
    )
]

In [None]:
# Row counts, one per line: city subset, subset with tips, subset with > 5 tips.
print(len(df_city), len(merged_df), len(filtered_df), sep='\n')

In [None]:
# System prompt for the tip summariser: a short instruction followed by two
# worked "list: ... / summary: ..." examples. The text contains no curly-brace
# placeholders; the tips to summarise are supplied through a separate message.
template = """ You are a master of summarising reviews, now I have some tips, they are in the form of lists in python and split with comma, I would like you to help me make a summary for each lists. Here are some example of summary:
list:  [\'Love their pastries and drinks!\', \'Really good egg tart and bubble tea.  Best we have had in Chinatown so far\', \'Was told they stopped making almond tarts... Unfortunately that was one of their best items I looked forward to when visiting Philly.\', \'After 6 pm the bread is on sale!\', \'Best steamed pork bun in chinatown!\', \'great cupcakes & almond cookies\', "Order the rainbow cake it\'s pretty and not too sweet", \'bun is sucked here and the waitress was really mean and cheap\', \'I found chilled pork buns that I could take home and steam. They turned out great.\', \'The cold tea w milk makes my day.\']
summary: Customers praise this establishment for its excellent pastries and drinks, highlighting the egg tart, bubble tea, steamed pork bun, cupcakes, almond cookies, and a particularly pretty, not too sweet rainbow cake. The best experiences include finding high-quality items like chilled pork buns for home steaming and enjoying discounts on bread after 6 pm. However, there's disappointment over the discontinuation of almond tarts and negative feedback about the quality of buns and customer service from the staff.
list:['Love sonic but orders are constantly wrong...', 'Foods always been good. Shakes r delicious!']
summary: The feedback highlights a mix of experiences at Sonic. While there is love for the brand and appreciation for the quality of food and delicious shakes, there is also frustration over frequent inaccuracies in order fulfillment.
now it is your turn.
"""


In [None]:
# Chat prompt = the fixed system instructions followed by a human turn that
# carries the tips list in its {input} placeholder.
human_template = "list: {input}, summary:"
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
prompt_template = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Final stage of the chain built from these pieces.
output_parser = StrOutputParser()


In [None]:
# Chat model (temperature 0) placed between the prompt and the output parser;
# the resulting chain is invoked with a {"input": ...} mapping.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
chain = prompt_template | llm | output_parser

def summarize_tip(tip):
    """Run the summarisation chain on one business's tips.

    `tip` is passed to the chain under the "input" key and the chain's result
    is returned unchanged.
    """
    return chain.invoke({"input": tip})

def process_chunk(chunk):
    """Return a copy of `chunk` with a 'tips_summary' column added.

    Each value of the 'text' column is passed to `summarize_tip`, and the
    results are stored under 'tips_summary'. The input frame is left untouched:
    callers usually pass a slice of a larger DataFrame, and writing a new
    column into such a slice triggers pandas' SettingWithCopyWarning and
    mutates the caller's object.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows to summarise; must contain a 'text' column.

    Returns
    -------
    pandas.DataFrame
        Independent copy of `chunk` including the new 'tips_summary' column.
    """
    summaries = chunk['text'].apply(summarize_tip)
    result = chunk.copy()
    result['tips_summary'] = summaries
    return result



# Rows are summarised in batches so that results reach disk incrementally
# instead of only after every API call has finished.
chunk_size = 200

# First chunk to process. Leave at 0 for a full run; to resume an interrupted
# run, set it to the index of the first unfinished chunk (earlier chunks are
# skipped and the new rows are appended to the existing CSV).
start_chunk_index = 0
# Each chunk is an independent copy, so adding a column to it never writes
# into a slice of merged_df.
chunks = [
    merged_df.iloc[i:i + chunk_size].copy()
    for i in range(0, merged_df.shape[0], chunk_size)
]
processed_chunks = []


for i, chunk in enumerate(chunks):
    if i < start_chunk_index:
        continue
    print(f"Processing chunk {i+1}/{len(chunks)}...")
    processed_chunk = process_chunk(chunk)
    processed_chunks.append(processed_chunk)
    # Chunk 0 starts the file over (overwrite + header); later chunks append
    # without a header. Always appending would duplicate every row and bury a
    # second header line inside the data each time this cell is re-run.
    first_chunk = i == 0
    processed_chunk.to_csv(
        "summary_n.csv",
        mode='w' if first_chunk else 'a',
        header=first_chunk,
        index=False,
    )


print("Processing complete.")

In [None]:
# Read the summaries written by the batch loop back from disk and preview them.
df_P = pd.read_csv(filepath_or_buffer='summary_n.csv')
df_P.head(n=5)

In [None]:
def complete_address(lat, lon,api = geo_api_key, timeout = 10):
    """Reverse-geocode a coordinate pair with the Mapbox Geocoding v6 endpoint.

    Parameters
    ----------
    lat, lon :
        Latitude and longitude of the point to look up.
    api :
        Mapbox access token. Defaults to the notebook-level `geo_api_key`
        (the default is bound when this function is defined).
    timeout :
        Seconds to wait for the HTTP response. Without a timeout a stalled
        connection would block the caller indefinitely.

    Returns
    -------
    dict
        On success, a dict with the keys 'neighborhood', 'postcode' and
        'full_address' (each None when the response lacks that entry).
        On any failure, a dict with a single 'error' key holding a message;
        request and parsing problems are reported this way, not raised.
    """
    url = f"https://api.mapbox.com/search/geocode/v6/reverse?longitude={lon}&latitude={lat}&limit=1&access_token={api}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        if not data.get('features'):
            return {"error": "No features found in the API response."}

        # limit=1 is requested, so only the first feature is inspected.
        feature = data['features'][0]
        properties = feature.get('properties') or {}
        context = properties.get('context') or {}

        def context_name(key):
            # A context entry is read as a mapping with a 'name' field; a
            # missing or null entry yields None instead of an AttributeError.
            return (context.get(key) or {}).get('name')

        # Fall back to a top-level 'neighborhood' property when the context
        # carries no neighbourhood name.
        neighborhood = context_name('neighborhood') or properties.get('neighborhood')

        return {
            'neighborhood': neighborhood,
            'postcode': context_name('postcode'),
            'full_address': context_name('address')
        }

    except requests.exceptions.HTTPError as http_err:
        return {"error": f"HTTP error occurred: {http_err}"}
    except Exception as err:
        # Deliberately broad: one failed lookup (timeout, bad JSON, unexpected
        # payload shape) must not abort a whole batch of lookups.
        return {"error": f"Other error: {err}"}

In [None]:
chunk_size = 200
num_chunks = int(np.ceil(len(df_P) / chunk_size))

# Create a directory for temporary results
temp_dir = Path("./yelp_temp_results_n")
temp_dir.mkdir(parents=True, exist_ok=True)

# First chunk to geocode. 0 processes every row, which is what a fresh run
# needs; raise it only to resume an interrupted run whose earlier chunk files
# already exist in temp_dir.
start_chunk_index = 0

# Keys returned by complete_address() mapped to the column each one is stored in.
address_columns = {
    'neighborhood': 'neighbourhood',
    'postcode': 'postcode',
    'full_address': 'full_address',
}

for i in range(start_chunk_index, num_chunks):
    start_index = i * chunk_size
    end_index = (i + 1) * chunk_size

    # Work on an independent copy so the new columns are not written into a
    # slice of df_P.
    chunk = df_P.iloc[start_index:end_index].copy()

    # One lookup per row. A failed lookup returns {"error": ...} without the
    # address keys, so each field is read with .get() and stays empty for that
    # row instead of changing the number of columns being assigned.
    lookups = [
        complete_address(lat, lon)
        for lat, lon in zip(chunk['latitude'], chunk['longitude'])
    ]
    for key, column in address_columns.items():
        chunk[column] = [lookup.get(key) for lookup in lookups]

    failed = sum('error' in lookup for lookup in lookups)
    if failed:
        print(f"Chunk {i}: {failed} of {len(lookups)} address lookups failed.")

    # Save the processed chunk to a file
    temp_file = temp_dir / f"chunk_{i}.csv"
    chunk.to_csv(temp_file, index=False)

    print(f"Chunk {i} processed and saved.")

In [None]:
# Folder holding the per-chunk geocoding results (chunk_<n>.csv).
temp_dir = Path("./yelp_temp_results_n")

chunks_paths = list(temp_dir.glob("chunk_*.csv"))

if not chunks_paths:
    # Report the directory that was actually searched.
    raise FileNotFoundError(f"No chunk files found in '{temp_dir}' directory.")

# Sort numerically on the <n> suffix so chunk_10 follows chunk_9 rather than
# chunk_1, then stack the chunks into a single frame with a fresh index.
all_chunks = [
    pd.read_csv(chunk_path)
    for chunk_path in sorted(chunks_paths, key=lambda path: int(path.stem.split('_')[-1]))
]
merged_results_loc = pd.concat(all_chunks, ignore_index=True)




In [None]:
def _build_description(row):
    """Compose the one-paragraph text description of a single business row.

    Uses the row's 'name', 'full_address', 'categories', 'hours' and
    'tips_summary' values exactly as stored (missing values are rendered by
    normal string formatting).
    """
    return (
        f"{row['name']} is located at {row['full_address']} and primarily serves the category of {row['categories']}. "
        f"It is open for business at these hours: {row['hours']}. "
        f"Customers often highlight: '{row['tips_summary']}'."
    )


# One description per business, stored alongside the source columns.
merged_results_loc['description'] = merged_results_loc.apply(_build_description, axis=1)

In [None]:
# Keep rows whose 'text' value is longer than 20 and that have both an address
# and opening hours.
# NOTE(review): 'text' was read back from CSV, so it presumably holds the string
# form of each tips list and .str.len() counts characters, not tips — confirm
# that a 20-character threshold is what is intended.
filtered_df = (
    merged_results_loc[merged_results_loc['text'].str.len() > 20]
    .dropna(subset=['full_address', 'hours'])
)

# Two column selections of the same rows, written to separate files later on.
final_input1 = filtered_df[
    ['business_id','name','latitude','longitude','description']
]
final_input2 = filtered_df[
    ['business_id','name','longitude','latitude','full_address','categories','stars','tips_summary','description']
]


# Row counts before filtering, after filtering, and of the two selections.
print(
    len(merged_results_loc),
    len(filtered_df),
    len(final_input1),
    len(final_input2),
    sep='\n',
)


In [None]:
# Output file names embed the city name with spaces replaced by underscores.
city_formatted = selected_city.replace(" ", "_")
file_path1 = f"input/yelp_test_input_{city_formatted}.csv"
file_path2 = f"input/yelp_demo_input_{city_formatted}.csv"
file_path3 = f"yelp_full_{city_formatted}.csv"

# to_csv does not create missing folders, so make sure each target folder
# (notably 'input/') exists before writing.
for output_path in (file_path1, file_path2, file_path3):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

final_input1.to_csv(file_path1,index=False)
final_input2.to_csv(file_path2,index=False)
# Was `to_scv`, which raises AttributeError and left the full table unwritten.
filtered_df.to_csv(file_path3,index=False)
