In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from operator import itemgetter

import pandas as pd
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import models
from langchain_core.runnables import chain
from langchain_core.output_parsers import StrOutputParser
import os
import ast
from langchain_openai import ChatOpenAI
import json
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
import numpy as np

import requests
from langchain.prompts import ChatPromptTemplate
import re
from math import radians, cos


In [None]:
# Configuration: edit the placeholder paths and API key below before running.
# NOTE(review): input_path and output_path are the same placeholder string as
# shipped; if both end up pointing at one file, the final
# qa_df.to_csv(output_path) will overwrite the POI data.
input_path = 'your_path.csv' # POI data
output_path = 'your_path.csv' # where to save the data

# Test set; later cells read its 'Query', 'Answer', 'latitude' and 'longitude' columns.
test = pd.read_csv('your_path.csv') #testset
# POI table; later cells read its 'business_id' and 'name' columns.
# NOTE: the name `input` shadows the Python builtin for the rest of the notebook.
input = pd.read_csv(input_path)

# change openai api key
# (placeholder value; this assignment replaces any OPENAI_API_KEY already set in the environment)
os.environ["OPENAI_API_KEY"] = "your_api_key"

In [None]:
# Keep only the test rows whose 'Answer' is not the literal string '[]',
# then show how many rows remain.
has_answer = test['Answer'] != '[]'
test = test.loc[has_answer]
len(test)

In [None]:
# Embedding model used both to index the POI documents and to embed queries.
# Alternative local model (note: `embedding_path` is not defined in this notebook):
# embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1", cache_folder=embedding_path)
embeddings = OpenAIEmbeddings(model = 'text-embedding-3-small')


# Load the POI CSV: one document per row; the listed columns are moved into
# each document's metadata instead of its page content.
file_path = input_path
metadata_columns = ['business_id','name','longitude', 'latitude']
loader = CSVLoader(
    file_path=file_path,
    metadata_columns=metadata_columns,
    csv_args={
        'delimiter': ',',
        'quotechar': '"',
    }
)

data = loader.load()
# CSVLoader yields metadata values as strings; the Qdrant range filter used in
# find_query needs numeric payload values, so convert the coordinates.
for doc in data:
    doc.metadata['longitude'] = float(doc.metadata['longitude'])
    doc.metadata['latitude'] = float(doc.metadata['latitude'])

# store data in vector database
# force_recreate=True: from_documents always embeds and inserts every document
# under freshly generated ids. With force_recreate=False and a persistent
# on-disk `path`, each re-run of this cell would append a second (third, ...)
# copy of the whole corpus to the existing collection, and the duplicates would
# then crowd the top-k results. Recreating keeps the cell idempotent.
vectorstore = Qdrant.from_documents(
    documents=data,
    embedding=embeddings,
    path = 'VectorDB_EM',
    collection_name="yelp_colls",
    force_recreate=True)


In [None]:
def find_query(query,lat,lon,vectorstore=vectorstore,side_km=5,k=10):
    """Retrieve the documents most similar to `query` inside a box around (lat, lon).

    The search is restricted to documents whose ``metadata.latitude`` and
    ``metadata.longitude`` fall inside a square of side ``side_km`` kilometres
    centred on the given point.

    Args:
        query: Free-text query passed to the vector store.
        lat: Latitude of the box centre, in degrees.
        lon: Longitude of the box centre, in degrees.
        vectorstore: Store to search; defaults to the module-level ``vectorstore``
            (bound when this function is defined).
        side_km: Side length of the bounding box in kilometres (default 5).
        k: Maximum number of documents to return (default 10).

    Returns:
        A tuple ``(docs_content, ans_name)``: the page contents of the hits and
        the ``name`` metadata value of each hit, in the same order.
    """
    half_side_km = side_km / 2
    # 111 is used as the number of kilometres per degree of latitude; a degree
    # of longitude is scaled by cos(latitude).
    delta_lat = half_side_km / 111
    delta_lon = half_side_km / (111 * np.cos(radians(lat)))

    # Named geo_filter rather than `filter` to avoid shadowing the builtin.
    geo_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.latitude",
                range=models.Range(
                    gte=lat - delta_lat,
                    lte=lat + delta_lat,
                ),
            ),
            models.FieldCondition(
                key="metadata.longitude",
                range=models.Range(
                    gte=lon - delta_lon,
                    lte=lon + delta_lon,
                ),
            )
        ]
    )
    ans = vectorstore.similarity_search(query,k=k,filter=geo_filter)
    docs_content = [doc.page_content for doc in ans]
    ans_name = [doc.metadata['name'] for doc in ans]

    return docs_content,ans_name



# Work on a copy so the filtered test set itself is left untouched.
df = test.copy()

# One record per test query: the query text, what was retrieved for it, and
# the ground-truth answer string from the test set.
qa_pairs = []
for _, record in df.iterrows():
    retrieved_docs, retrieved_names = find_query(
        record['Query'],
        record['latitude'],
        record['longitude'],
    )
    qa_pairs.append(dict(
        question=record['Query'],
        docs=retrieved_docs,
        ans=retrieved_names,
        correct_ans=record['Answer'],
    ))

qa_df = pd.DataFrame(qa_pairs)

In [None]:
# Preview the retrieval results (question, docs, ans, correct_ans) before ids are mapped to names.
qa_df.head()

In [None]:
def str_to_list(s):
    """Parse a stringified list of ids into a list of clean id strings.

    Accepts both unquoted (``"[a, b]"``) and quoted (``"['a', 'b']"``) forms.
    Surrounding whitespace, brackets and quote characters are removed from each
    entry, and empty entries are dropped, so ``"[]"`` yields ``[]`` instead of
    ``['']``.

    Args:
        s: String such as ``"[id1, id2]"``.

    Returns:
        List of id strings, in their original order.
    """
    # Strip outer whitespace first; otherwise it would stop the bracket strip.
    inner = s.strip().strip('[]')
    # Quotes must go as well: an id left as "'abc'" never matches a lookup key.
    cleaned = (piece.strip().strip('\'"').strip() for piece in inner.split(','))
    return [piece for piece in cleaned if piece]

In [None]:
# Turn each stringified ground-truth answer into a list of business ids.
qa_df['id_list'] = qa_df['correct_ans'].map(str_to_list)


# Lookup table from business_id to name, built from the POI table.
id_to_name = input.set_index('business_id')['name'].to_dict()
def map_ids_to_names(id_list, mapping):
    """Translate each id in `id_list` through `mapping`.

    Ids that are missing from `mapping` are reported as 'Unknown'. The result
    has the same length and order as `id_list`.
    """
    names = []
    for business_id in id_list:
        names.append(mapping.get(business_id, 'Unknown'))
    return names


# Resolve every ground-truth id list to the corresponding business names.
qa_df['name_list'] = qa_df['id_list'].map(
    lambda ids: map_ids_to_names(ids, id_to_name)
)
# The raw answer string is no longer needed once id_list/name_list exist.
qa_df.drop(columns=['correct_ans'], inplace=True)

In [None]:
# Preview again: id_list and name_list have been added and correct_ans has been dropped.
qa_df.head()

In [None]:
def compute_metrics(row):
    """Score one row's retrieved names against its ground-truth names.

    Both `row['name_list']` (ground truth) and `row['ans']` (retrieved) are
    compared as sets, so duplicates and ordering do not matter.

    Returns:
        pd.Series with 'precision', 'recall' and 'f1'. Each is 0 when its
        denominator would be zero.
    """
    expected = set(row['name_list'])
    retrieved = set(row['ans'])
    hits = len(expected.intersection(retrieved))

    precision = hits / len(retrieved) if retrieved else 0
    recall = hits / len(expected) if expected else 0

    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    return pd.Series({'precision': precision, 'recall': recall, 'f1': f1})


# Attach per-row precision / recall / F1 to the results table.
metric_cols = ['precision', 'recall', 'f1']
qa_df.loc[:, metric_cols] = qa_df.apply(compute_metrics, axis=1)


# Macro averages: the unweighted mean of each metric over all test queries.
avg_precision, avg_recall, avg_f1 = (qa_df[col].mean() for col in metric_cols)

for label, value in (
    ('Precision', avg_precision),
    ('Recall', avg_recall),
    ('F1 Score', avg_f1),
):
    print(f'Average {label}: {value:.4f}')


In [None]:
# Write the evaluation table (retrieved docs/names, ground-truth ids/names and
# per-row metrics) to output_path, without the DataFrame index column.
qa_df.to_csv(output_path,index=False)