# In [None]:
import os
import csv
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

def _read_base_jsonl(path):
    """Return the JSON objects stored one per line in the file at *path*.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped; passing them to ``json.loads`` would raise.
    """
    # JSON text is UTF-8 by specification, so do not depend on the locale.
    with open(path, 'r', encoding='utf-8') as file:
        return [json.loads(line.strip()) for line in file if line.strip()]


def loadbase():
    """Load the base restaurant, hotel and attraction records.

    Reads ``Restaurants.jsonl``, ``Hotels.jsonl`` and ``Attractions.jsonl``
    from ``Dataset/Base`` (relative to the current working directory).

    Returns:
        A ``(restaurants, hotels, attractions)`` tuple. Each element is the
        list of objects decoded from the corresponding file, in file order.
    """
    restaurants = _read_base_jsonl('Dataset/Base/Restaurants.jsonl')
    hotels = _read_base_jsonl('Dataset/Base/Hotels.jsonl')
    attractions = _read_base_jsonl('Dataset/Base/Attractions.jsonl')
    return restaurants, hotels, attractions

def clean_attraction(attraction):
    """Turn review-derived attraction levels into short descriptive phrases.

    Args:
        attraction: Mapping of aspect name to a mapping that has a ``'level'``
            entry whose value is 0, 1, 2 or 3.

    Returns:
        A dict keyed by the aspect name with underscores replaced by spaces.
        Each value is the level word (``no``/``low``/``medium``/``high``)
        followed by a space and that same spaced aspect name, for example
        ``{'family oriented': 'high family oriented'}``.

    Raises:
        KeyError: If an entry has no ``'level'`` or the level is not 0-3.
    """
    words_by_level = {0: "no", 1: "low", 2: "medium", 3: "high"}

    # Read every level first so a missing 'level' is reported before any
    # level-to-word lookup is attempted.
    pairs = [(aspect, details['level']) for aspect, details in attraction.items()]

    described = {}
    for aspect, level in pairs:
        label = aspect.replace('_', ' ')
        described[label] = f"{words_by_level[level]} {label}"
    return described

def parseAttraction(model,attractions_base):
    """Merge review-derived attraction levels into the base attraction records.

    Every ``*.json`` file in ``preprocess/{model}/attractions`` is matched to
    base records by business id, where the id is the last 22 characters of the
    file name without its ``.json`` suffix. For each match the file content is
    condensed with ``clean_attraction`` and laid over a copy of the base
    record. The merged records are written one JSON object per line to
    ``Dataset/{model}/attractions.jsonl``.

    Args:
        model: Name of the sub-folder under ``preprocess`` and ``Dataset``.
        attractions_base: Iterable of dicts, each with a ``'business_id'`` key.

    Returns:
        The list of merged attraction dicts that was written.
    """
    folder_path = f'preprocess/{model}/attractions'

    # Index the base records once so each file is a dict lookup instead of a
    # scan over the whole base list. Lists keep duplicate ids, which each
    # yield their own merged record.
    base_by_id = {}
    for attraction in attractions_base:
        base_by_id.setdefault(attraction['business_id'], []).append(attraction)

    attractions_final = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        business_id_fromReviews = filename[:-5][-22:]
        matches = base_by_id.get(business_id_fromReviews)
        if not matches:
            continue
        with open(os.path.join(folder_path, filename), 'r') as file:
            attraction_fromReviews = json.load(file)
        attraction_fromReviews_short = clean_attraction(attraction_fromReviews)
        for attraction in matches:
            # Review-derived keys win over base keys of the same name.
            attractions_final.append({**attraction, **attraction_fromReviews_short})

    output_dir = f'Dataset/{model}'
    # Create the output folder on first use instead of failing on open().
    os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_dir}/attractions.jsonl', 'w') as file:
        for item in attractions_final:
            json.dump(item, file)
            file.write('\n')

    return attractions_final

def clean_hotel(hotel):
    """Turn review-derived hotel ratings into short descriptive phrases.

    Args:
        hotel: Mapping of aspect name to a mapping that has a ``'rating'``
            entry whose value is an integer from 1 to 5.

    Returns:
        A dict with the same keys, in the same order. Each value is the rating
        word (``bad`` … ``excellent``) followed by a space and the aspect
        name, for example ``{'service': 'good service'}``.

    Raises:
        KeyError: If an entry has no ``'rating'`` or the rating is not 1-5.
    """
    rating_words = {
        1: "bad",
        2: "below average",
        3: "average",
        4: "good",
        5: "excellent",
    }
    # First pass pulls out the raw ratings; second pass words them.
    ratings = {aspect: details['rating'] for aspect, details in hotel.items()}
    return {
        aspect: rating_words[score] + ' ' + aspect
        for aspect, score in ratings.items()
    }

def parseHotel(model,hotels_base):
    """Merge review-derived hotel ratings into the base hotel records.

    Every ``*.json`` file in ``preprocess/{model}/hotels`` is matched to base
    records by business id, where the id is the last 22 characters of the file
    name without its ``.json`` suffix. For each match the file content is
    condensed with ``clean_hotel`` and laid over a copy of the base record.
    The merged records are written one JSON object per line to
    ``Dataset/{model}/hotels.jsonl``.

    Args:
        model: Name of the sub-folder under ``preprocess`` and ``Dataset``.
        hotels_base: Iterable of dicts, each with a ``'business_id'`` key.

    Returns:
        The list of merged hotel dicts that was written.
    """
    folder_path = f'preprocess/{model}/hotels'

    # Index the base records once so each file is a dict lookup instead of a
    # scan over the whole base list. Lists keep duplicate ids, which each
    # yield their own merged record.
    base_by_id = {}
    for hotel in hotels_base:
        base_by_id.setdefault(hotel['business_id'], []).append(hotel)

    hotels_final = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        business_id_fromReviews = filename[:-5][-22:]
        matches = base_by_id.get(business_id_fromReviews)
        if not matches:
            continue
        with open(os.path.join(folder_path, filename), 'r') as file:
            hotel_fromReviews = json.load(file)
        hotel_fromReviews_short = clean_hotel(hotel_fromReviews)
        for hotel in matches:
            # Review-derived keys win over base keys of the same name.
            hotels_final.append({**hotel, **hotel_fromReviews_short})

    output_dir = f'Dataset/{model}'
    # Create the output folder on first use instead of failing on open().
    os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_dir}/hotels.jsonl', 'w') as file:
        for item in hotels_final:
            json.dump(item, file)
            file.write('\n')

    return hotels_final

def clean_restaurant(restaurant):
    """Turn review-derived restaurant ratings into short descriptive phrases.

    Args:
        restaurant: Mapping of aspect name to a mapping that has a
            ``'rating'`` entry whose value is an integer from 1 to 5.

    Returns:
        A dict with the same keys, in the same order. Each value is the rating
        word (``bad`` … ``excellent``) followed by a space and the aspect
        name, for example ``{'flavor': 'excellent flavor'}``.

    Raises:
        KeyError: If an entry has no ``'rating'`` or the rating is not 1-5.
    """
    words_by_rating = {
        1: "bad",
        2: "below average",
        3: "average",
        4: "good",
        5: "excellent",
    }

    # Gather all aspects and their raw ratings before wording any of them.
    aspects = list(restaurant)
    scores = [restaurant[aspect]['rating'] for aspect in aspects]

    described = {}
    for aspect, score in zip(aspects, scores):
        described[aspect] = words_by_rating[score] + ' ' + aspect
    return described

def parseRestaurant(model,restaurants_base):
    """Merge review-derived restaurant ratings into the base restaurant records.

    Every ``*.json`` file in ``preprocess/{model}/restaurants`` is matched to
    base records by business id, where the id is the last 22 characters of the
    file name without its ``.json`` suffix. For each match the file content is
    condensed with ``clean_restaurant`` and laid over a copy of the base
    record. The merged records are written one JSON object per line to
    ``Dataset/{model}/restaurants.jsonl``.

    Args:
        model: Name of the sub-folder under ``preprocess`` and ``Dataset``.
        restaurants_base: Iterable of dicts, each with a ``'business_id'`` key.

    Returns:
        The list of merged restaurant dicts that was written.
    """
    folder_path = f'preprocess/{model}/restaurants'

    # Index the base records once so each file is a dict lookup instead of a
    # scan over the whole base list. Lists keep duplicate ids, which each
    # yield their own merged record.
    base_by_id = {}
    for restaurant in restaurants_base:
        base_by_id.setdefault(restaurant['business_id'], []).append(restaurant)

    restaurants_final = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        business_id_fromReviews = filename[:-5][-22:]
        matches = base_by_id.get(business_id_fromReviews)
        if not matches:
            continue
        with open(os.path.join(folder_path, filename), 'r') as file:
            restaurant_fromReviews = json.load(file)
        restaurant_fromReviews_short = clean_restaurant(restaurant_fromReviews)
        for restaurant in matches:
            # Review-derived keys win over base keys of the same name.
            restaurants_final.append({**restaurant, **restaurant_fromReviews_short})

    output_dir = f'Dataset/{model}'
    # Create the output folder on first use instead of failing on open().
    os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_dir}/restaurants.jsonl', 'w') as file:
        for item in restaurants_final:
            json.dump(item, file)
            file.write('\n')

    return restaurants_final

def _flatten_csv_to_txt(csv_path, txt_path):
    """Copy *csv_path* to *txt_path* as plain comma-joined lines; return the text.

    Each CSV row is parsed and re-joined with bare commas, which drops any CSV
    quoting, and is terminated with a newline.
    """
    # pandas writes CSV as UTF-8 by default, so read it back the same way
    # rather than with the locale's default encoding. The file is opened with
    # universal newlines, so every line ending reaching the join is '\n'.
    with open(csv_path, 'r', encoding='utf-8') as file:
        text = ''.join(','.join(row) + '\n' for row in csv.reader(file))
    with open(txt_path, 'w', encoding='utf-8') as file:
        file.write(text)
    return text


def createAllData(model):
    """Export the merged records of *model* as CSV, TXT and one combined JSON.

    Reads ``attractions.jsonl``, ``hotels.jsonl`` and ``restaurants.jsonl``
    from ``Dataset/{model}``. For each it writes a CSV restricted to a fixed
    column list (``Attractions.csv``, ``Hotels.csv``, ``Restaurants.csv``) and
    a TXT copy of that CSV without quoting. Finally it writes
    ``all_data.json``: a list of three ``{"Description", "Content"}`` entries
    (accommodations, attractions, restaurants) whose content is the TXT text
    with every newline replaced by ``'. '``.

    Args:
        model: Name of the sub-folder under ``Dataset``.

    Raises:
        KeyError: If a record set lacks one of the expected columns.
    """
    dataset_dir = f'Dataset/{model}'

    # Output file stem -> columns kept, in output order.
    columns_by_kind = {
        'Attractions': ['name', 'address', 'latitude','longitude','stars', 'price','family oriented','history oriented','activity oriented','nature oriented','food oriented','shopping oriented'],
        'Hotels': ['name', 'address', 'latitude','longitude', 'stars', 'price','quality','location','service','safety'],
        'Restaurants': ['name', 'address', 'latitude','longitude', 'stars', 'good_for_meal', 'price','cuisine_1','cuisine_2','flavor','freshness','service','environment','value'],
    }

    #load jsonl (all inputs are read before anything is written)
    records_by_kind = {}
    for kind in columns_by_kind:
        with open(f'{dataset_dir}/{kind.lower()}.jsonl', 'r') as file:
            records_by_kind[kind] = [json.loads(line.strip()) for line in file]

    #make into csv
    for kind, columns in columns_by_kind.items():
        frame = pd.DataFrame(records_by_kind[kind])
        frame = frame.loc[:, columns]
        frame.to_csv(f'{dataset_dir}/{kind}.csv', index=False)

    #make into txt
    text_by_kind = {
        kind: _flatten_csv_to_txt(f'{dataset_dir}/{kind}.csv', f'{dataset_dir}/{kind}.txt')
        for kind in columns_by_kind
    }

    #make into one txt file
    data = [
        {
            "Description": "Accommodations in Philadelphia",
            "Content": text_by_kind['Hotels'].replace('\n', '. ')
        },
        {
            "Description": "Attractions in Philadelphia",
            "Content": text_by_kind['Attractions'].replace('\n', '. ')
        },
        {
            "Description": "Restaurants in Philadelphia",
            "Content": text_by_kind['Restaurants'].replace('\n', '. ')
        }
    ]

    with open(f'{dataset_dir}/all_data.json', 'w') as f:
        json.dump(data, f, indent=4)

def _filtered_csv_as_text(path):
    """Return the CSV file at *path* as unquoted, comma-joined lines."""
    # pandas writes CSV as UTF-8 by default, so read it back the same way
    # rather than with the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return ''.join(','.join(row) + '\n' for row in csv.reader(f))


def createFilteredData(model):
    """Build per-prompt filtered candidate data and write it as JSON Lines.

    Loads ``Restaurants.csv``, ``Hotels.csv`` and ``Attractions.csv`` from
    ``Dataset/{model}`` and the prompts from ``Prompts/evals.jsonl``. For each
    prompt's ``eval_info`` it:

    * keeps attractions, restaurants and hotels whose ``price`` is in the
      price set mapped from the first ``price`` entry;
    * keeps attractions rated ``medium``/``high`` for the first ``attraction``
      preference (lower-cased, used as the column name);
    * keeps restaurants whose ``cuisine_1`` or ``cuisine_2`` matches the first
      ``cuisine`` entry (``'US'`` expands to three American cuisine labels);
    * for every ``restaurant``/``hotel`` preference, takes the second
      space-separated word (lower-cased) as the column name and keeps rows
      rated ``good``/``excellent`` in it;
    * groups the remaining attractions and hotels into spatial clusters with
      KMeans on latitude/longitude, using one cluster per five businesses and
      at least one cluster.

    The filtered tables are also written to
    ``preprocess/{model}/filtered*.csv`` (overwritten for every prompt). The
    result, one object per prompt with a 1-based ``index``, is written to
    ``Dataset/{model}/filtered_data.jsonl``.

    Args:
        model: Name of the sub-folder under ``Dataset`` and ``preprocess``.
    """
    restaurants = pd.read_csv(f'Dataset/{model}/Restaurants.csv')
    hotels = pd.read_csv(f'Dataset/{model}/Hotels.csv')
    attractions = pd.read_csv(f'Dataset/{model}/Attractions.csv')

    with open ('Prompts/evals.jsonl', 'r') as file:
        evals = [json.loads(line.strip()) for line in file if line.strip()]

    # Budget wording -> acceptable price symbols; constant across prompts.
    price_map = {'cheap budget':['$','$$'],'moderate budget':['$','$$','$$$'],'expensive budget':['$$','$$$','$$$$']}

    filteredData = []
    for index, evaluation in enumerate(evals, start=1):
        prompt = evaluation['eval_info']

        #attractionSearch
        #budget
        budget = prompt['price'][0]
        price_limit = price_map[budget.lower()]
        attractions_filtered = attractions[attractions['price'].isin(price_limit)]

        #preference
        preference = prompt['attraction'][0].lower()
        pref_list = ['medium ' + preference, 'high ' + preference]
        attractions_filtered = attractions_filtered[attractions_filtered[preference].isin(pref_list)]

        #restaurants
        #budget
        restaurants_filtered = restaurants[restaurants['price'].isin(price_limit)]

        #cuisine
        cuisine = prompt['cuisine'][0]
        if cuisine == 'US':
            cuisine = ['American','American (New)','American (Traditional)']
        else:
            cuisine = [cuisine]

        restaurants_filtered = restaurants_filtered[(restaurants_filtered['cuisine_1'].isin(cuisine)) | (restaurants_filtered['cuisine_2'].isin(cuisine))]
        #preference
        for pref in prompt['restaurant']:
            col = pref.split(' ')[1].lower()
            pref_list = ['good ' + col, 'excellent ' + col]
            restaurants_filtered = restaurants_filtered[restaurants_filtered[col].isin(pref_list)]

        #hotel search
        #budget
        hotels_filtered = hotels[hotels['price'].isin(price_limit)]

        #preference
        for pref in prompt['hotel']:
            col = pref.split(' ')[1].lower()
            pref_list = ['good ' + col, 'excellent ' + col]
            hotels_filtered = hotels_filtered[hotels_filtered[col].isin(pref_list)]

        #spatial clustering of the filtered attractions and hotels
        #(restaurants are intentionally not part of the clusters)
        allBusiness_names = attractions_filtered['name'].tolist() + hotels_filtered['name'].tolist()
        allBusiness_cordinates = (
            attractions_filtered[['latitude', 'longitude']].values.tolist()
            + hotels_filtered[['latitude', 'longitude']].values.tolist()
        )
        coordinates = np.array(allBusiness_cordinates)

        if len(coordinates) == 0:
            # Nothing survived the filters, so there is nothing to cluster.
            cluster_dict = {}
        else:
            # One cluster per five businesses, but never zero: KMeans rejects
            # n_clusters=0, which the plain division gave for fewer than five.
            k = max(1, len(coordinates) // 5)
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(coordinates)
            # Cluster labels
            allBusiness_clusterNumbers = kmeans.labels_

            clusterInfo = pd.DataFrame({'business': allBusiness_names, 'cluster number': allBusiness_clusterNumbers})
            clusterInfo['cluster number'] = clusterInfo['cluster number'].apply(lambda x: 'Cluster_' + str(x))
            clusterInfo = clusterInfo.sort_values('cluster number')

            cluster_dict = clusterInfo.groupby('cluster number')['business'].apply(list).to_dict()

        #prepare the input data
        attractions_filtered.to_csv(f'preprocess/{model}/filteredAttractions.csv', index=False)
        attractions_txt = _filtered_csv_as_text(f'preprocess/{model}/filteredAttractions.csv')

        restaurants_filtered.to_csv(f'preprocess/{model}/filteredRestaurants.csv', index=False)
        restaurants_txt = _filtered_csv_as_text(f'preprocess/{model}/filteredRestaurants.csv')

        hotels_filtered.to_csv(f'preprocess/{model}/filteredHotels.csv', index=False)
        hotels_txt = _filtered_csv_as_text(f'preprocess/{model}/filteredHotels.csv')

        data = [
            {
                "Description": "Filtered Accommodations in Philadelphia",
                "Content": hotels_txt
            },
            {
                "Description": "Filtered Attractions in Philadelphia",
                "Content": attractions_txt
            },{
                "Description": "Filtered Restaurants in Philadelphia",
                "Content": restaurants_txt
            },
            {
                "Description": "Near by businesses in clusters about attractions and hotels",
                "Content": cluster_dict
            }
        ]

        filteredData.append({"index": index, "filtered_data": data})

    with open(f'Dataset/{model}/filtered_data.jsonl', 'w') as f:
        for item in filteredData:
            json.dump(item, f)
            f.write('\n')

if __name__ == '__main__':
    # Sub-folder name used under preprocess/ and Dataset/ for this run.
    model = 'gpt4o'

    restaurants_base, hotels_base, attractions_base = loadbase()

    # Earlier pipeline stages, left disabled. Their outputs are the files the
    # next stage reads: the parse* calls write Dataset/{model}/*.jsonl, which
    # createAllData turns into the CSV files createFilteredData loads.
    # attractions_final = parseAttraction(model, attractions_base)
    # hotels_final = parseHotel(model, hotels_base)
    # restaurants_final = parseRestaurant(model, restaurants_base)
    # createAllData(model)

    createFilteredData(model)