In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from operator import itemgetter

import pandas as pd
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import models
from langchain_core.runnables import chain
from langchain_core.output_parsers import StrOutputParser
import os
import ast
from langchain_openai import ChatOpenAI
import json
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
import numpy as np

import requests
from langchain.prompts import ChatPromptTemplate
import re
from math import radians, cos

In [None]:
# Configuration: replace the placeholder paths and the API key below before running.
# NOTE(review): all three paths are the same placeholder. If input_path and
# output_path are left identical, the final to_csv call overwrites the POI file.
input_path = 'your_path.csv' # POI data (read into `input` and indexed into the vector store)
output_path = 'your_path.csv' # where to save the evaluated results

test = pd.read_csv('your_path.csv') # test set; later cells read its Query, Answer, latitude and longitude columns
# NOTE(review): `input` shadows the Python builtin of the same name. A later cell
# reads `input.name` and `input.business_id`, so a rename must update that cell too.
input = pd.read_csv(input_path)

# Set the OpenAI API key. This assignment overwrites any OPENAI_API_KEY value
# already present in the process environment.
# NOTE(review): do not commit a real key inside the notebook.
os.environ["OPENAI_API_KEY"] = "your_api_key"

In [None]:
# Keep only the test queries whose Answer column is not the literal string '[]',
# then show how many rows remain.
test = test[test['Answer'].ne('[]')]
len(test)

In [None]:
# Embedding model used to index the POI documents and to embed the queries.
# embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1", cache_folder=embedding_path)
embeddings = OpenAIEmbeddings(model = 'text-embedding-3-small')


# Load the POI CSV: the columns listed in metadata_columns become document
# metadata, the remaining columns form the page content.
file_path = input_path
metadata_columns = ['business_id','name','longitude', 'latitude']
loader = CSVLoader(
    file_path=file_path,
    metadata_columns=metadata_columns,
    csv_args={
        'delimiter': ',',
        'quotechar': '"',
    }
)

data = loader.load()
# Cast the coordinates to float so the numeric range filter used at query time
# (on metadata.latitude / metadata.longitude) can compare them.
for doc in data:
    doc.metadata['longitude'] = float(doc.metadata['longitude'])
    doc.metadata['latitude'] = float(doc.metadata['latitude'])

# Store the documents in an on-disk Qdrant collection.
# force_recreate=True rebuilds the collection on every run. With
# force_recreate=False an existing collection under `path` is reused and all
# documents are appended again, so each re-run of this cell duplicates every
# POI and the top-k search results fill up with repeated entries. The
# documents are embedded on every run either way, so rebuilding costs no
# additional embedding calls.
vectorstore = Qdrant.from_documents(
    documents=data,
    embedding=embeddings,
    path = 'vectoreDB',
    collection_name="yelp_colls",
    force_recreate=True)


In [None]:
# LLM used to filter and rank the retrieved locations: gpt-4o with temperature=0.
llm = ChatOpenAI(temperature=0, model="gpt-4o")
# Prompt template: the system message carries the ranking instructions and the
# required output format (a Python dictionary mapping location name -> reason);
# the human message supplies the retrieved {context} and the user {question}.
# NOTE: the backslash line continuations are inside the string literal, so the
# leading spaces of each continued line are part of the prompt text.
rag_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system","You are an assistant for location information sorting tasks. Below is the location information retrieved from the database,\
            which will be given to you in JSON format. You are asked to filter and sort this information based on the question asked. You first need to determine whether the information is relevant to the question, and then sort all the relevant information. The ones that best match the \
            question and help answer it have the highest priority. The format of your output must be a Python dictionary, where the key is the name of the location and the value is the reason why you chose this location and ranked it there. The location with the highest priority is placed higher, i.e., index is 0. \
            Please note that there could be more than one result in the dictionary. If the information about a location could only partially match the question asked, you could also put it in the dictionary, but specify the advantages and disadvantages of this place in the value of the dictionary. \
            If you could not complete the task or do not know the answer, just return the empty dictionary and do not refer to any additional knowledge"
     
        ),
        ("human", "information:{context}\nquestion:{question}"),
    ]
)


In [None]:
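# Alternative (disabled): o1-preview with temperature=1; the instructions are sent as a "human" message instead of "system".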
# llm = ChatOpenAI(temperature=1, model="o1-preview")
# rag_prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "human","You are an assistant for location information sorting tasks. Below is the location information retrieved from the database,\
#             which will be given to you in JSON format. You are asked to filter and sort this information based on the question asked. You first need to determine whether the information is relevant to the question, and then sort all the relevant information. The ones that best match the \
#             question and help answer it have the highest priority. The format of your output must be a Python dictionary, where the key is the name of the location and the value is the reason why you chose this location and ranked it there. The location with the highest priority is placed higher, i.e., index is 0. \
#             Please note that there could be more than one result in the dictionary. If the information about a location could only partially match the question asked, you could also put it in the dictionary, but specify the advantages and disadvantages of this place in the value of the dictionary. \
#             If you could not complete the task or do not know the answer, just return the empty dictionary and do not refer to any additional knowledge"
     
#         ),
#         ("human", "information:{context}\nquestion:{question}"),
#     ]
# )

In [None]:
# `ans` is a module-level variable: find_query stores the raw hits of its most
# recent search there so code outside the chain can read them.
global ans
@chain
def find_query(_dict,vectorstore=vectorstore):
    """Search the vector store for documents matching a query near a point.

    Args:
        _dict: mapping with the keys 'query' (search text), 'lat' and 'lon'
            (centre of the search box, in degrees).
        vectorstore: store to search; defaults to the module-level
            ``vectorstore`` as bound when this function is defined.

    Returns:
        A JSON string (indent=4, non-ASCII kept as-is) holding the list of
        ``page_content`` values of at most 10 matching documents.

    Side effects:
        Rebinds the module-level ``ans`` to the list of returned documents.
    """
    global ans

    query, lat, lon = _dict['query'], _dict['lat'], _dict['lon']

    # Square search box with a 5 km side centred on (lat, lon). One degree of
    # latitude is treated as 111 km; a degree of longitude is scaled by
    # cos(latitude).
    side_km = 5
    half_side_km = side_km / 2
    delta_lat = half_side_km / 111
    delta_lon = half_side_km / (111 * np.cos(radians(lat)))

    def _within(field, centre, delta):
        # Inclusive range condition [centre - delta, centre + delta] on a metadata field.
        return models.FieldCondition(
            key=field,
            range=models.Range(gte=centre - delta, lte=centre + delta),
        )

    bounding_box = models.Filter(
        must=[
            _within("metadata.latitude", lat, delta_lat),
            _within("metadata.longitude", lon, delta_lon),
        ]
    )

    ans = vectorstore.similarity_search(query, k=10, filter=bounding_box)
    page_texts = [hit.page_content for hit in ans]
    return json.dumps(page_texts, ensure_ascii=False, indent=4)



# Retrieval branch: reshape the chain input into the dict find_query expects
# ('query', 'lat', 'lon') and run the geo-filtered similarity search.
retrieval_branch = {
    "query": itemgetter("question"),
    "lat": itemgetter("lat"),
    "lon": itemgetter("lon"),
} | find_query

# Full pipeline: retrieved context + original question -> prompt -> LLM -> plain string.
qa_chain = (
    {"context": retrieval_branch, "question": itemgetter("question")}
    | rag_prompt
    | llm
    | StrOutputParser()
)


# Run the chain once per test query and collect the raw outputs.
df = test.copy()

qa_pairs = []
for _, record in df.iterrows():
    llm_output = qa_chain.invoke(
        {
            "question": record['Query'],
            "lat": record['latitude'],
            "lon": record['longitude'],
        }
    )

    # `ans` is the module-level variable that find_query rebinds during the
    # invoke above; it holds the documents retrieved for this query.
    # The LLM output is stored as raw text and parsed in a later cell.
    qa_pairs.append(
        {
            "question": record['Query'],
            "ans": llm_output,
            "correct_ans": record['Answer'],
            "docs": ans,
        }
    )

qa_df = pd.DataFrame(qa_pairs)

In [None]:
# Preview the first rows of the collected question / answer / retrieved-docs table.
qa_df.head()

In [None]:
def str_to_list(s):
    """Split a bracketed, comma-separated string into a list of trimmed items.

    Leading and trailing '[' / ']' characters are removed, the remainder is
    split on ',' and each piece is stripped of surrounding whitespace.
    Quote characters are left untouched, and an empty payload (e.g. '[]')
    yields [''] rather than an empty list.
    """
    return [piece.strip() for piece in s.strip('[]').split(',')]

In [None]:
# Turn the ground-truth answer strings into lists of business ids.
qa_df['id_list'] = qa_df['correct_ans'].map(str_to_list)

# Lookup table business_id -> name built from the POI data
# (for a repeated business_id the last row wins).
id_to_name = dict(zip(input['business_id'], input['name']))
def map_ids_to_names(id_list, mapping):
    """Return the name for each id in id_list, using 'Unknown' for ids missing from mapping."""
    names = []
    for id_ in id_list:
        names.append(mapping.get(id_, 'Unknown'))
    return names


# Translate each id list into the corresponding list of POI names, then drop
# the raw answer string, which is no longer needed.
qa_df['name_list'] = qa_df['id_list'].apply(map_ids_to_names, mapping=id_to_name)
qa_df = qa_df.drop(columns=['correct_ans'])

In [None]:
def extract_dict_from_codeblock(code_str):
    """Parse the dictionary contained in an LLM reply.

    The reply may be the bare dictionary text, or it may wrap the dictionary
    in a Markdown code fence. Any fence language tag is accepted (python,
    json, or none), as are Windows line endings, a payload on the same line
    as the opening fence, and extra text before or after the fence. The
    payload is parsed as JSON first and, if that fails, as a Python literal.

    Args:
        code_str: raw reply text.

    Returns:
        The parsed object (a dict for well-formed replies).

    Raises:
        ValueError: if a code fence is opened but never closed, or if the
            payload is neither valid JSON nor a valid Python literal.
    """
    text = code_str.strip()

    # Opening fence, optional language tag, optional line break, then the
    # shortest payload up to the next closing fence.
    fenced = re.search(r"```[\w+-]*[ \t]*\r?\n?(.*?)```", text, re.DOTALL)
    if fenced:
        dict_str = fenced.group(1).strip()
    elif '```' in text:
        raise ValueError("Error, no ending ``` after opening ```")
    else:
        dict_str = text

    try:
        return json.loads(dict_str)
    except json.JSONDecodeError:
        # Not strict JSON (e.g. single-quoted keys); try a Python literal next.
        pass

    try:
        return ast.literal_eval(dict_str)
    except Exception as e:
        # Chain the original error so the real parse failure stays visible.
        raise ValueError(f"JSON Error: {e}") from e

In [None]:
# Drop rows whose only ground-truth name could not be resolved (exactly
# ['Unknown']), then drop rows where the LLM returned the literal string '{}'.
qa_df = qa_df[qa_df['name_list'].apply(lambda names: names != ['Unknown'])]
qa_df = qa_df[qa_df['ans'] != '{}']

In [None]:
# Parse every raw LLM reply into a dictionary (raises ValueError on the first unparseable reply).
qa_df['ans_dict'] = qa_df['ans'].apply(extract_dict_from_codeblock)

In [None]:
def compute_metrics(row):
    """Compute set-based precision, recall and F1 for one evaluated query.

    Args:
        row: mapping/Series with 'name_list' (iterable of ground-truth names)
            and 'ans_dict' (dict whose keys are the predicted names).

    Returns:
        pd.Series with the entries 'precision', 'recall' and 'f1'. A metric
        whose denominator would be zero is reported as 0. Names are compared
        by exact string equality.
    """
    expected = set(row['name_list'])
    predicted = set(row['ans_dict'].keys())
    hits = len(expected & predicted)

    precision = hits / len(predicted) if predicted else 0
    recall = hits / len(expected) if expected else 0

    total = precision + recall
    f1 = 2 * precision * recall / total if total > 0 else 0

    return pd.Series({'precision': precision, 'recall': recall, 'f1': f1})


# Compute per-row precision / recall / F1 and store them as three columns of qa_df.
qa_df.loc[:, ['precision', 'recall', 'f1']] = qa_df.apply(compute_metrics, axis=1)


# Average over the rows that survived the earlier filters: each query carries
# equal weight, regardless of how many names it contains.
avg_precision = qa_df['precision'].mean()
avg_recall = qa_df['recall'].mean()
avg_f1 = qa_df['f1'].mean()

print(f'Average Precision: {avg_precision:.4f}')
print(f'Average Recall: {avg_recall:.4f}')
print(f'Average F1 Score: {avg_f1:.4f}')


In [None]:
# Write the evaluated table to output_path without the index column.
# NOTE(review): output_path and input_path are set to the same placeholder in the
# configuration cell; if they stay identical this call overwrites the POI file.
qa_df.to_csv(output_path,index=False)