In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from operator import itemgetter
import pandas as pd
import re
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.util import ngrams
import math

In [None]:
# word_tokenize (used for both the corpus and the queries) needs the Punkt
# tokenizer data in addition to the stop-word list. Recent NLTK releases ship
# it as 'punkt_tab', older ones as 'punkt'; requesting both keeps a fresh
# environment working either way (an unknown id is reported, not raised).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
# Configuration: change the paths here.
# NOTE: keep output_path different from input_path. The final cell writes
# qa_df to output_path and would otherwise overwrite the POI file.
input_path = 'your_path.csv'   # POI data (needs business_id, name, longitude, latitude columns)
test_path = 'your_path.csv'    # test set (needs Query, Answer, latitude, longitude columns)
output_path = 'your_path.csv'  # where to save the data

test = pd.read_csv(test_path)
# NOTE: this name shadows the built-in input(); it is kept because a later
# cell builds the id -> name mapping from `input`.
input = pd.read_csv(input_path)


In [None]:
# Load the POI CSV as documents; the columns listed in metadata_columns are
# exposed through each document's `metadata` dict.
file_path = input_path
metadata_columns = ['business_id', 'name', 'longitude', 'latitude']
loader = CSVLoader(
    file_path=file_path,
    metadata_columns=metadata_columns,
    csv_args={'delimiter': ',', 'quotechar': '"'},
)
data = loader.load()

# Convert the coordinate metadata to float once, so later cells can compare
# them numerically.
for doc in data:
    for coordinate in ('longitude', 'latitude'):
        doc.metadata[coordinate] = float(doc.metadata[coordinate])

In [None]:
# Raw text of every loaded document.
documents = [doc.page_content for doc in data]

# Lower-case and tokenise each document, keeping only alphabetic tokens that
# are not stop words.
tokenized_documents = [
    [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    for text in documents
]

In [None]:
# Build the token <-> id dictionary, then encode every tokenised document as a
# bag of words.
dictionary = corpora.Dictionary(tokenized_documents)
corpus = list(map(dictionary.doc2bow, tokenized_documents))

In [None]:
# LDA training is stochastic: without a fixed seed the topics, and therefore
# every retrieval result and metric below, change from run to run.
lda = models.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Topic distribution of every corpus document, as a list of (topic_id, probability) pairs.
doc_topics = [lda.get_document_topics(bow) for bow in corpus]

In [None]:
def lda_query(query, target_latitude, target_longitude, side_km=5, top_k=10):
    """Return the best-matching documents inside a square search window.

    The query is preprocessed like the corpus (lower-cased, stop words and
    non-alphabetic tokens removed) and projected onto the LDA topic space.
    Each document whose coordinates fall inside the window is scored with the
    dot product of its topic distribution and the query's.

    Args:
        query: Free-text query.
        target_latitude: Latitude of the window centre, in degrees.
        target_longitude: Longitude of the window centre, in degrees.
        side_km: Side length of the square window, in kilometres.
        top_k: Maximum number of documents to return; ``None`` returns every
            document inside the window.

    Returns:
        A list of documents from ``data`` ordered by decreasing score.
        Documents with equal scores keep their corpus order.
    """
    if top_k is not None and top_k <= 0:
        return []

    query_tokens = [
        token
        for token in word_tokenize(query.lower())
        if token not in stop_words and token.isalpha()
    ]
    query_topics = lda.get_document_topics(dictionary.doc2bow(query_tokens))

    # Half-extent of the window in degrees; 111 is the approximate number of
    # kilometres per degree of latitude.
    km_per_degree = 111
    half_side_km = side_km / 2
    delta_lat = half_side_km / km_per_degree

    # A degree of longitude shrinks with cos(latitude).
    cos_lat = math.cos(math.radians(target_latitude))
    if cos_lat != 0:
        delta_lon = half_side_km / (km_per_degree * cos_lat)
    else:
        delta_lon = 180

    lat_min, lat_max = target_latitude - delta_lat, target_latitude + delta_lat
    lon_min, lon_max = target_longitude - delta_lon, target_longitude + delta_lon

    # Apply the geographic filter first so only candidates are scored and sorted.
    scored = []
    for doc, doc_distribution in zip(data, doc_topics):
        latitude = float(doc.metadata['latitude'])
        longitude = float(doc.metadata['longitude'])
        if not (lat_min <= latitude <= lat_max and lon_min <= longitude <= lon_max):
            continue

        # Dict lookup replaces a linear scan per query topic; topics missing
        # from the document contribute 0.
        doc_topic_probs = dict(doc_distribution)
        score = sum(
            query_prob * doc_topic_probs.get(topic_id, 0)
            for topic_id, query_prob in query_topics
        )
        scored.append((score, doc))

    # list.sort is stable, so ties stay in corpus order even with reverse=True.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in scored[:top_k]]


In [None]:
def str_to_list(s):
    """Parse a bracketed, comma-separated list of IDs stored as text.

    Accepts both bare (``"[a, b]"``) and quoted (``"['a', 'b']"``) entries.
    Empty entries are dropped, so ``"[]"`` yields ``[]`` instead of ``['']``.

    Args:
        s: Text representation of the ID list.

    Returns:
        The IDs in their original order, without surrounding whitespace or
        quote characters.
    """
    inner = s.strip().strip('[]')
    # Trim whitespace and any quotes around each entry.
    candidates = (part.strip().strip('\'"').strip() for part in inner.split(','))
    return [id_ for id_ in candidates if id_]

In [None]:
# Parse the textual 'Answer' column into a list of IDs per test row.
test['id_list'] = test['Answer'].apply(str_to_list)

# Lookup table from business_id to POI name.
id_to_name = input.set_index('business_id')['name'].to_dict()
def map_ids_to_names(id_list, mapping):
    """Translate each ID into its name; IDs missing from ``mapping`` become 'Unknown'."""
    names = []
    for identifier in id_list:
        names.append(mapping.get(identifier, 'Unknown'))
    return names


# Resolve every ID list to the matching POI names; the raw 'Answer' column is
# no longer needed afterwards.
test['name_list'] = test['id_list'].apply(map_ids_to_names, mapping=id_to_name)
test.drop(columns=['Answer'], inplace=True)

In [None]:
# Run the LDA retriever for every test query and record the predicted names
# next to the expected ones.
qa_pairs = []
for question, lat, lon, correct_name in zip(
    test['Query'], test['latitude'], test['longitude'], test['name_list']
):
    lda_results = lda_query(question, lat, lon)
    predicted_names = [doc.metadata['name'] for doc in lda_results]
    qa_pairs.append({
        "question": question,
        "LDA_answer": predicted_names,
        "correct_name": correct_name
    })

qa_df = pd.DataFrame(qa_pairs)


In [None]:
def compute_metrics(row):
    """Compute set-based precision, recall and F1 for one result row.

    Args:
        row: Mapping with 'correct_name' (expected names) and 'LDA_answer'
            (predicted names); duplicates are ignored.

    Returns:
        pd.Series with 'precision', 'recall' and 'f1'. Each value is 0 when
        its denominator would be zero.
    """
    expected = set(row['correct_name'])
    predicted = set(row['LDA_answer'])
    hits = len(expected.intersection(predicted))

    precision = hits / len(predicted) if predicted else 0
    recall = hits / len(expected) if expected else 0

    total = precision + recall
    f1 = 2 * precision * recall / total if total > 0 else 0

    return pd.Series({'precision': precision, 'recall': recall, 'f1': f1})


# Attach the per-query metrics to the results table.
metric_columns = ['precision', 'recall', 'f1']
qa_df.loc[:, metric_columns] = qa_df.apply(compute_metrics, axis=1)

# Average each metric over all queries.
avg_precision, avg_recall, avg_f1 = (
    qa_df[column].mean() for column in metric_columns
)

print(f'Average Precision: {avg_precision:.4f}')
print(f'Average Recall: {avg_recall:.4f}')
print(f'Average F1 Score: {avg_f1:.4f}')

In [None]:
# Persist the per-query answers and metrics, without the DataFrame index.
qa_df.to_csv(path_or_buf=output_path, index=False)