In [77]:
# Name of the model whose generated plans are evaluated; it is used as the
# sub-directory name under Outputs/evals/ by daywiseTSP.
generated_model = 'Llama317B'

In [78]:
import json
from fuzzywuzzy import fuzz
import numpy as np
import os
import sys

In [79]:
# Candidate pools searched by getID: one JSON object per line in each file.
with open('../Datasets/Restaurants_task1.jsonl', 'r') as file:
    restaurants = list(map(json.loads, map(str.strip, file)))

with open('../Datasets/Hotels_task1.jsonl', 'r') as file:
    hotels = list(map(json.loads, map(str.strip, file)))

with open('../Datasets/Attractions_task1.jsonl', 'r') as file:
    attractions = list(map(json.loads, map(str.strip, file)))

def _match_business_id(name, address, pool, exact_name_wins=False, min_score=120):
    """Find the business in `pool` that best matches `name` / `address`.

    Args:
        name: business name to look up.
        address: street part of the address to look up.
        pool: list of dicts with 'name', 'address' and 'business_id' keys.
        exact_name_wins: when True, a perfect fuzzy name score (100) is
            accepted on its own, regardless of the address score.
        min_score: minimum combined (name + address) fuzzy score, out of 200,
            required to accept the best candidate.

    Returns:
        The matching 'business_id', or -1 when nothing in the pool is close
        enough (including when the pool is empty).
    """
    name_lc = name.lower()
    address_lc = address.lower()

    # Exact (case-insensitive) agreement between name and address.
    ids_by_name = {b['business_id'] for b in pool if b['name'].lower() == name_lc}
    ids_by_address = {b['business_id'] for b in pool if b['address'].lower() == address_lc}
    agreed = ids_by_name & ids_by_address
    if len(agreed) == 1:
        return next(iter(agreed))

    # Nothing to score against: report "out of pool" instead of crashing in max().
    if not pool:
        return -1

    # Fall back to fuzzy similarity (each score is in 0..100).
    name_scores = np.array([fuzz.ratio(name_lc, b['name'].lower()) for b in pool])
    address_scores = np.array([fuzz.ratio(address_lc, b['address'].lower()) for b in pool])

    if exact_name_wins and name_scores.max() == 100:
        return pool[int(np.argmax(name_scores))]['business_id']

    combined = name_scores + address_scores
    best = int(np.argmax(combined))  # first candidate on ties
    if combined[best] >= min_score:
        return pool[best]['business_id']
    return -1


def getID(name, address, category):
    """Resolve a generated place (name + address) to a business_id.

    Args:
        name: place name taken from the generated plan.
        address: place address; only the part before the first comma is used.
        category: one of 'restaurants', 'attractions' or 'hotels'; selects the
            module-level pool that is searched.

    Returns:
        -2 when both `name` and `address` are the placeholder "-",
        -1 when no business in the pool matches well enough,
        otherwise the matching 'business_id'.

    Raises:
        ValueError: if `category` is not one of the three known pools.
    """
    # NOTE(review): the original comment said "as long as there is a '-'",
    # but the check requires BOTH fields to be "-"; confirm which is intended.
    if name == "-" and address == "-":
        return -2

    if category == 'restaurants':
        pool, exact_name_wins = restaurants, False
    elif category == 'attractions':
        # Attractions accept a perfect name match even if the address differs.
        pool, exact_name_wins = attractions, True
    elif category == 'hotels':
        pool, exact_name_wins = hotels, False
    else:
        raise ValueError(f"unknown category: {category!r}")

    street = address.split(",")[0]
    return _match_business_id(name, street, pool, exact_name_wins)


def prepareEval(plan):
    """Replace every place in a generated plan with its resolved business id.

    Args:
        plan: dict with an 'itinerary' list; each entry holds 'days', the three
            meals, the three attraction lists and 'accommodation', where every
            place is a dict with 'name' and 'address'.

    Returns:
        A list with one dict per itinerary entry, using the same keys, whose
        place values are whatever getID returns for them.
    """
    def resolve(place, category):
        return getID(place['name'], place['address'], category)

    def resolve_all(places):
        return [resolve(place, 'attractions') for place in places]

    resolved_days = []
    for entry in plan['itinerary']:
        resolved_days.append({
            'days': entry['days'],
            'breakfast': resolve(entry['breakfast'], 'restaurants'),
            'morning_attractions': resolve_all(entry['morning_attractions']),
            'lunch': resolve(entry['lunch'], 'restaurants'),
            'afternoon_attractions': resolve_all(entry['afternoon_attractions']),
            'dinner': resolve(entry['dinner'], 'restaurants'),
            'night_attractions': resolve_all(entry['night_attractions']),
            'accommodation': resolve(entry['accommodation'], 'hotels'),
        })
    return resolved_days

In [80]:
def populateCordinates(plan_eval, data, data_hotel):
    """Build, for every day, the list of coordinates to be routed.

    Args:
        plan_eval: list of per-day dicts with 'accommodation' and the
            'morning_attractions' / 'afternoon_attractions' /
            'night_attractions' id lists (as built by prepareEval).
        data: attraction records searched by getCordinate.
        data_hotel: hotel records searched by getCordinate_Hotel.

    Returns:
        One list per day. Index 0 is whatever getCordinate_Hotel returns for
        the day's accommodation (None if the hotel is not found); it is
        followed by the coordinates of every attraction that could be
        resolved, in morning / afternoon / night order.
    """
    attraction_slots = ('morning_attractions', 'afternoon_attractions', 'night_attractions')
    cordinates = []
    for day in plan_eval:
        # The hotel is always kept at index 0, even when unresolved, so that
        # positions in the day list stay stable for callers.
        cordinate_one_day = [getCordinate_Hotel(day['accommodation'], data_hotel)]

        for slot in attraction_slots:
            for attraction in day[slot]:
                # -1 = out of pool, -2 = missing information: nothing to look up.
                if attraction in (-1, -2):
                    continue
                location = getCordinate(attraction, data)
                # An id absent from `data` yields None; skip it rather than
                # letting None reach the distance computation.
                if location is not None:
                    cordinate_one_day.append(location)

        cordinates.append(cordinate_one_day)
    return cordinates

def getCordinate(id,data):
    """Return (latitude, longitude) of the first record in `data` whose
    'business_id' equals `id`, or None when there is no such record."""
    found = next((record for record in data if record['business_id'] == id), None)
    if found is None:
        return None
    return (found['latitude'], found['longitude'])
def getCordinate_Hotel(id,data_hotel):
    """Look up a hotel by 'business_id' in `data_hotel`.

    Returns the (latitude, longitude) tuple of the first matching record, or
    None if no record matches.
    """
    for entry in data_hotel:
        if entry['business_id'] != id:
            continue
        return entry['latitude'], entry['longitude']
    return None
        
def getDistanceMatrix(cordinates):
    """Return the symmetric matrix of pairwise distances between points.

    Each point is read as point[0], point[1]. Both components are scaled by
    1000 and the straight-line (Euclidean) distance of the scaled values is
    stored; the diagonal stays 0.
    """
    size = len(cordinates)
    matrix = np.zeros((size, size))
    for row in range(size - 1):
        for col in range(row + 1, size):
            origin, target = cordinates[row], cordinates[col]
            delta_first = origin[0]*1000 - target[0]*1000
            delta_second = origin[1]*1000 - target[1]*1000
            separation = (delta_first**2 + delta_second**2)**0.5
            matrix[row][col] = separation
            matrix[col][row] = separation
    return matrix

def populateShortestDistanceOneDay(cordinates):
    """Compute the shortest round trip for every day independently.

    For each day's point list a distance matrix is built and totalCost is
    started at point 0 with only point 0 marked visited.

    Returns:
        (shortest_distance_list, shortest_distance_info_lists): the optimal
        length per day and, per day, the list of records totalCost appended
        while searching.
    """
    best_lengths, search_logs = [], []
    for day_points in cordinates:
        costs = getDistanceMatrix(day_points)
        day_log = []
        best_lengths.append(totalCost(1, 0, len(costs), costs, day_log))
        search_logs.append(day_log)
    return best_lengths, search_logs

def totalCost(mask, pos, n, cost, info_lists):
    """Exhaustively search the cheapest tour that visits every remaining city.

    Args:
        mask: bitmask of already visited cities (bit i set = city i visited).
        pos: index of the city currently occupied.
        n: number of cities.
        cost: n x n matrix of travel costs.
        info_lists: output list; for every non-terminal call a record
            [pos, unvisited_cities, cost_via_each_unvisited_city] is appended
            after its children have been explored.

    Returns:
        The minimum cost of visiting all unvisited cities from `pos` and then
        returning to city 0.
    """
    # Every city visited: only the leg back to the start (city 0) remains.
    if mask == (1 << n) - 1:
        return cost[pos][0]

    unvisited = [city for city in range(n) if not mask & (1 << city)]

    # Cost of the best completion when `city` is visited next.
    tour_costs = []
    for city in unvisited:
        remainder = totalCost(mask | (1 << city), city, n, cost, info_lists)
        tour_costs.append(cost[pos][city] + remainder)

    info_lists.append([pos, unvisited, tour_costs])
    return min(tour_costs)

def populatePlannedDistanceOneDay(cordinates):
    """Length of each day's route when points are visited in the given order.

    For every day the distances between consecutive points are summed and the
    leg from the last point back to the first one closes the loop.

    Returns:
        A list with one total distance per day.
    """
    planned_lengths = []
    for day_points in cordinates:
        legs = getDistanceMatrix(day_points)
        stops = len(legs)
        # (here + 1) % stops wraps the final stop back to index 0.
        planned_lengths.append(sum(legs[here][(here + 1) % stops] for here in range(stops)))
    return planned_lengths

def getDistanceGapRatio(shortest_distances_by_day, planned_distances_by_day):
    """Share of the planned travel distance that exceeds the optimum.

    Both arguments hold one sequence of per-day distances per plan. The
    result is sum(planned - shortest) over all plans and days divided by
    sum(planned).
    """
    per_plan = list(zip(shortest_distances_by_day, planned_distances_by_day))
    excess = sum(
        np.sum(np.array(planned) - np.array(shortest)) for shortest, planned in per_plan
    )
    travelled = sum(np.sum(np.array(planned)) for _, planned in per_plan)
    return excess / travelled

def getOptimizedOrder(shortest_distance_info_lists):
    """Reconstruct the optimal visiting order of every day from search records.

    Args:
        shortest_distance_info_lists: one list per day of records
            [pos, unvisited_cities, cost_via_each_unvisited_city], as appended
            by totalCost. The last record of a day is taken to be the one for
            the starting position, so its candidate list gives the number of
            places to visit.

    Returns:
        One entry per day: [forward_route, reverse_route], both starting at
        index 0. A day without records (nothing but the start point) yields
        [[0], [0]].

    Raises:
        ValueError: if the records of a day contain no entry for a state the
            reconstruction reaches (previously this looped forever).
    """
    order_list = []
    for day in shortest_distance_info_lists:

        if len(day) == 0:
            order_list.append([[0],[0]])
            continue

        pos = 0
        # Places still to be visited: 1 .. number of candidates of the root record.
        candidates = list(range(1, len(day[-1][1]) + 1))
        moves = []

        while len(candidates) > 0:
            advanced = False
            for record in day:
                # The record describing the current state: same position and
                # exactly the remaining candidates.
                if record[0] == pos and record[1] == candidates:
                    next_move = record[1][np.argmin(record[2])]
                    pos = next_move
                    moves.append(next_move)
                    candidates.remove(next_move)
                    advanced = True
            if not advanced:
                # A full pass without a match can never succeed later either.
                raise ValueError(
                    f"no search record for position {pos} with remaining places {candidates}"
                )

        # The tour is a cycle, so the reversed order has the same length.
        optimized_route = [[0] + moves, [0] + moves[::-1]]
        order_list.append(optimized_route)

    return order_list
        
def getPositionDeviationRatio(shortest_order_by_day):
    """Fraction of positions at which the planned order differs from the optimum.

    `shortest_order_by_day` holds, per plan and per day, the pair
    [forward_route, reverse_route]. The planned order is 0, 1, ..., n-1; for
    every day the smaller mismatch count against the two optimal routes is
    used. The result is total mismatches divided by the total number of places.
    """
    place_count = 0
    mismatch_count = 0
    for plan in shortest_order_by_day:
        for day in plan:
            forward, backward = day[0], day[1]
            size = len(forward)
            place_count += size
            mismatches = [
                sum(planned != optimal for planned, optimal in zip(range(size), route))
                for route in (forward, backward)
            ]
            mismatch_count += min(mismatches)
    return mismatch_count / place_count

In [81]:
# Attraction and hotel records used for the coordinate look-ups
# (one JSON object per line).
with open('../Datasets/Attractions_task1.jsonl', 'r') as f:
    data = list(map(json.loads, f))

with open('../Datasets/Hotels_task1.jsonl', 'r') as f:
    data_hotel = list(map(json.loads, f))

def daywiseTSP(generated_model, only_filename='Plan_Eval_3.json'):
    """Evaluate how far each day's planned route is from the optimal route.

    Args:
        generated_model: sub-directory of Outputs/evals/ holding the plans.
        only_filename: when set, only this file is evaluated (the default
            keeps the single-plan run this function was written with). Pass
            None to evaluate every Plan_Eval_*.json file in the directory.

    Returns:
        (distance_gap_ratio, position_deviation_ratio) as computed by
        getDistanceGapRatio and getPositionDeviationRatio over all evaluated
        plans.
    """
    plan_dir = f'Outputs/evals/{generated_model}'
    shortest_distances_by_day = []
    planned_distances_by_day = []
    shortest_order_by_day = []

    # sorted(): os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(plan_dir)):
        if only_filename is not None:
            if filename != only_filename:
                continue
        elif not (filename.startswith('Plan_Eval_') and filename.endswith('.json')):
            # Ignore anything that is not a plan file (checkpoints, notes, ...).
            continue

        # Load the plan; the context manager closes the file handle.
        with open(f'{plan_dir}/{filename}', 'r') as plan_file:
            plan = json.load(plan_file)

        # Resolve every place of the plan to its business id.
        plan_eval = prepareEval(plan)
        print(plan_eval)

        # Per-day coordinates (hotel first, then resolved attractions).
        cordinates = populateCordinates(plan_eval, data, data_hotel)
        print(cordinates)

        # Optimal length and order of each day.
        shortest_distance_list_each_day, shortest_distance_info_lists = populateShortestDistanceOneDay(cordinates)
        print(shortest_distance_info_lists)
        shortest_distances_by_day.append(shortest_distance_list_each_day)
        shortest_order_by_day.append(getOptimizedOrder(shortest_distance_info_lists))

        # Length of each day in the order the plan proposes.
        planned_distances_by_day.append(populatePlannedDistanceOneDay(cordinates))

    distance_gap_ratio = getDistanceGapRatio(shortest_distances_by_day, planned_distances_by_day)
    position_deviation_ratio = getPositionDeviationRatio(shortest_order_by_day)

    return distance_gap_ratio, position_deviation_ratio

In [82]:
#shortest_distances_by_day
#planned_distances_by_day
#shortest_order_by_day

In [83]:
# Run the per-day evaluation for the configured model; daywiseTSP prints its
# intermediate results (resolved ids, coordinates, search records) below.
distance_gap_ratio, position_deviation_ratio = daywiseTSP(generated_model)

[{'days': 'Day 1', 'breakfast': '0RuvlgTnKFbX3IK0ZOOocA', 'morning_attractions': ['mdD_JS-xG5EX_QxVGD1dYw'], 'lunch': 'cOXc8c85Ms6dMEAJazLXHQ', 'afternoon_attractions': ['Qw7tz-UkPrpXaVidWuab4Q', '8F6URCmvOYbzX4ErKgmQ0w'], 'dinner': 'CuyQt_cUFS1i738GXTNciw', 'night_attractions': ['Ib9HV7ekw459jM1Ksdiyiw'], 'accommodation': '0FMOvA2Noq-gieWfnXB7aQ'}, {'days': 'Day 2', 'breakfast': '0RuvlgTnKFbX3IK0ZOOocA', 'morning_attractions': [-1], 'lunch': -1, 'afternoon_attractions': [-1], 'dinner': 'AKrFJ7vuBbLPfE9u2HVEkQ', 'night_attractions': [-1], 'accommodation': '0FMOvA2Noq-gieWfnXB7aQ'}]
[[(39.9091912, -75.1646617), (39.9987353985, -75.198543291), (39.965573, -75.180969), (39.9488054891, -75.1817243795), (39.9488980012, -75.1500296367)], [(39.9091912, -75.1646617)]]
[[[3, [4], [np.float64(74.01186421372778)]], [4, [3], [np.float64(74.82754446726452)]], [2, [3, 4], [np.float64(90.79638148323198), np.float64(109.97437627882442)]], [2, [4], [np.float64(77.4638182112512)]], [4, [2], [np.float64(

Multiday

In [84]:
def getHotelIndex(day,cordinates):
    """Return the position of `day`'s first entry in the flattened point list,
    i.e. the total number of points in all earlier days (0 for day 0)."""
    if not day > 0:
        return 0
    return sum(len(cordinates[earlier]) for earlier in range(day))

def totalCost_multiday(mask, pos, day, cordinates, n, visited, cost, info_lists, memo):
    """Recursive search for the cheapest multi-day route over all points.

    Args:
        mask: bitmask of visited points (bit i set = flattened point i visited).
        pos: flattened index of the point currently occupied.
        day: index of the day being filled.
        cordinates: per-day point lists; only their lengths are used here.
        n: total number of points (size of `cost`).
        visited: number of points counted for the current day so far.
        cost: n x n matrix of travel costs between flattened points.
        info_lists: output list; records [pos, unvisited, costs] are appended
            for every expanded state.
        memo: table indexed as memo[pos][mask]; -1 marks "not computed".

    Returns:
        The minimum remaining cost from the given state.
    """
    # Number of points the current day must contain before moving on.
    visit_requirement = len(cordinates[day])
    distance_list = []
    i_list = []

    # Flattened index of the first entry of the current day.
    # presumably that entry is the day's hotel — confirm against populateCordinates
    hotel_index = getHotelIndex(day,cordinates)
    # Base case: every point is visited, so only the leg from the current
    # position back to the current day's first entry remains.

    if mask == (1 << n) - 1:
        return cost[pos][hotel_index]
    
    # Reuse a previously computed result for this (position, visited-set) pair.
    # NOTE(review): the key ignores `day` and `visited` — confirm they are
    # fully determined by `mask`.
    if memo[pos][mask] != -1:
        return memo[pos][mask]

    # The current day is full: close it and start the next day.
    if visit_requirement == visited:
        for i in range(n):
            if (mask & (1 << i)) == 0: 
                i_list.append(i)
                # The first leg of the next day is measured from the current
                # day's first entry; the next day starts with visited=2.
                distance_list.append(cost[hotel_index][i] + totalCost_multiday(mask | (1 << i), i, day + 1, cordinates, n, 2, cost, info_lists,memo))
        
        info_list = [pos,i_list, distance_list]
        info_lists.append(info_list)
        
        # Add the leg that closes the current day; this result is not stored in memo.
        return min(distance_list) + cost[pos][hotel_index] # change this to the old hotel position
    
    # Try visiting every city that has not been visited yet
    for i in range(n):
        if (mask & (1 << i)) == 0: 

            i_list.append(i)
            # If city i is not visited, visit it and 
             #  update the mask
            distance_list.append(cost[pos][i] +
                      totalCost_multiday(mask | (1 << i), i, day, cordinates, n, visited + 1, cost, info_lists,memo))
        

    # Record the explored options for later route reconstruction.
    info_list = [pos,i_list, distance_list]
    info_lists.append(info_list)
    
    memo[pos][mask] = min(distance_list)

    return min(distance_list)

def getDistanceMatrix_by_plan(cordinates):
    """Pairwise distance matrix over all points of a plan.

    The per-day point lists are concatenated in day order; distances are the
    straight-line distances of the coordinates scaled by 1000.
    """
    points = [location for day in cordinates for location in day]
    total = len(points)
    matrix = np.zeros((total, total))
    for row in range(total - 1):
        for col in range(row + 1, total):
            start, end = points[row], points[col]
            separation = ((start[0]*1000 - end[0]*1000)**2 + (start[1]*1000 - end[1]*1000)**2)**0.5
            matrix[row][col] = separation
            matrix[col][row] = separation
    return matrix

def getOptimizedDistance_by_plan(cordinates,distance_matrix):
    """Run the multi-day search for one plan.

    The start mask marks the first entry of every day as already visited, the
    memo table holds one row per point and one column per visited-set, and
    the search starts at point 0 on day 0 with one point counted.

    Returns:
        (optimized_distance, info_lists): the value returned by
        totalCost_multiday and the records it appended.
    """
    total_points = len(distance_matrix)
    search_log = []

    # Walk the days backwards (all but the last) and shift the mask by each
    # day's size so that bit 0 of every day ends up set.
    start_mask = 1
    for day_idx in reversed(range(len(cordinates) - 1)):
        start_mask = (start_mask << len(cordinates[day_idx])) + 1

    table = [[-1] * (1 << total_points) for _ in range(total_points)]
    best = totalCost_multiday(start_mask, 0, 0, cordinates, total_points, 1,
                              distance_matrix, search_log, table)
    return best, search_log

def getOptimizedOrder_by_plan(info_lists):
    """Reconstruct the optimal visiting order of a whole plan.

    Args:
        info_lists: records [pos, unvisited, cost_via_each_unvisited] as
            appended by totalCost_multiday; the last record is taken to be
            the one of the starting state.

    Returns:
        The flattened point indices in the order they are visited. An empty
        `info_lists` (nothing had to be visited) yields [].

    Raises:
        ValueError: if no record exists for a state the reconstruction
            reaches (previously this looped forever).
    """
    # No records means the search returned immediately: nothing to order.
    if not info_lists:
        return []

    pos = 0
    lookfor = info_lists[-1][1].copy()
    moves = []
    while len(lookfor) > 0:
        advanced = False
        for record in info_lists:
            # The record of the current state: same position, same remaining points.
            if record[0] == pos and record[1] == lookfor:
                nextmove = record[1][np.argmin(record[2])]
                pos = nextmove
                moves.append(nextmove)
                lookfor.remove(pos)
                advanced = True
        if not advanced:
            # A full pass without a match can never succeed later either.
            raise ValueError(
                f"no search record for position {pos} with remaining points {lookfor}"
            )
    return moves

def getPlannedDistance_by_plan(cordinates, distance_matrix):
    """Length of the plan when all points are visited in their listed order.

    Consecutive flattened points are chained; from the very last point the
    route goes to the first entry of the last day.
    """
    size = len(distance_matrix)
    total = 0
    for here in range(size):
        if here != size - 1:
            following = here + 1
        else:
            # Last point overall: return to the first entry of the last day.
            following = size - len(cordinates[-1])
        total += distance_matrix[here][following]
    return total

def getDistanceGapRatio_by_plan(optimized_distances_by_plan, planned_distances_by_plan):
    """Share of the total planned distance that exceeds the optimum.

    Takes one optimal and one planned distance per plan and returns
    sum(planned - optimized) / sum(planned).
    """
    per_plan_excess = [
        planned - optimized
        for optimized, planned in zip(optimized_distances_by_plan, planned_distances_by_plan)
    ]
    excess_total = np.sum(np.append(np.array([]), per_plan_excess))
    planned_total = np.sum(planned_distances_by_plan)
    return excess_total / planned_total

In [85]:
# NOTE(review): this cell redefines getDistanceGapRatio_by_plan, which is
# already defined at the end of the previous cell; the later definition wins.
def getDistanceGapRatio_by_plan(optimized_distances_by_plan, planned_distances_by_plan):
    """Share of the total planned distance that exceeds the optimum.

    Takes one optimal and one planned distance per plan and returns
    sum(planned - optimized) / sum(planned).
    """
    per_plan_excess = [
        planned - optimized
        for optimized, planned in zip(optimized_distances_by_plan, planned_distances_by_plan)
    ]
    excess_total = np.sum(np.append(np.array([]), per_plan_excess))
    planned_total = np.sum(planned_distances_by_plan)
    return excess_total / planned_total

In [86]:
# Attraction and hotel records used for the coordinate look-ups
# (one JSON object per line).
with open('../Datasets/Attractions_task1.jsonl', 'r') as f:
    data = list(map(json.loads, f))

with open('../Datasets/Hotels_task1.jsonl', 'r') as f:
    data_hotel = list(map(json.loads, f))

def planwiseTSP(generated_model, only_filename=None):
    """Evaluate how far each whole plan's route is from the optimal route.

    Args:
        generated_model: sub-directory of Outputs/evals/ holding the plans.
        only_filename: when set, only this file is evaluated; by default every
            Plan_Eval_*.json file in the directory is evaluated.

    Returns:
        The distance gap ratio computed by getDistanceGapRatio_by_plan over
        all evaluated plans. Plans without days, or with any day whose
        accommodation could not be resolved (-1 or -2), are skipped.
    """
    plan_dir = f'Outputs/evals/{generated_model}'
    optimized_distances_by_plan = []
    optimized_orders_by_plan = []
    planned_distances_by_plan = []

    # sorted(): os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(plan_dir)):
        if only_filename is not None:
            if filename != only_filename:
                continue
        elif not (filename.startswith('Plan_Eval_') and filename.endswith('.json')):
            # Ignore anything that is not a plan file (checkpoints, notes, ...).
            continue

        # Load the plan; the context manager closes the file handle.
        with open(f'{plan_dir}/{filename}', 'r') as plan_file:
            plan = json.load(plan_file)

        # Resolve every place of the plan to its business id.
        plan_eval = prepareEval(plan)

        # Every day needs a resolved hotel: an unresolved one has no
        # coordinates and would break the distance computation.
        if not plan_eval or any(day['accommodation'] in (-1, -2) for day in plan_eval):
            continue

        cordinates = populateCordinates(plan_eval, data, data_hotel)
        print(cordinates)
        distance_matrix = getDistanceMatrix_by_plan(cordinates)

        optimized_distance, info_lists = getOptimizedDistance_by_plan(cordinates, distance_matrix)
        optimized_distances_by_plan.append(optimized_distance)

        optimized_orders_by_plan.append(getOptimizedOrder_by_plan(info_lists))

        planned_distances_by_plan.append(getPlannedDistance_by_plan(cordinates, distance_matrix))

    distance_gap_ratio_by_plan = getDistanceGapRatio_by_plan(optimized_distances_by_plan, planned_distances_by_plan)
    return distance_gap_ratio_by_plan

In [87]:
# Evaluate the model selected in the configuration cell (`generated_model`)
# so that both evaluations always run on the same model.
ratio = planwiseTSP(generated_model)
ratio

[[(39.9517461, -75.1602286), (39.9488980012, -75.1500296367), (39.9533414645, -75.1588545174), (39.9472668, -75.14594)], [(39.9517461, -75.1602286), (39.952091, -75.1401399646), (39.9605399867, -75.1725628227)], [(39.9517461, -75.1602286), (39.9495474, -75.1426984), (39.9683684, -75.1726552), (39.9533414645, -75.1588545174)]]
[[(39.9491236494, -75.1488953439), (39.9533414645, -75.1588545174), (39.9605399867, -75.1725628227), (39.9582109, -75.1731373822)], [(39.9491236494, -75.1488953439), (39.942712, -75.159313), (39.948212, -75.1455516), (39.9582109, -75.1731373822)], [(39.9491236494, -75.1488953439), (39.9605399867, -75.1725628227), (39.9582109, -75.1731373822)], [(39.9491236494, -75.1488953439), (39.948212, -75.1455516), (39.942712, -75.159313)]]
[[(39.956853, -75.169558), (39.9495474, -75.1426984), (39.9683684, -75.1726552), (39.9582109, -75.1731373822)], [(39.956853, -75.169558), (39.9488980012, -75.1500296367), (39.942712, -75.159313), (39.9556999, -75.1501373), (39.9467905885, -

np.float64(0.3014048502226079)

In [88]:
# My thought: instead of measuring the difference in visiting order, we measure the difference in clustering.
# Originally, 123 and 456 are the two clusters (i.e. the two days); now we want 126 and 357, which reduces
# the distance a lot. So we need a way to calculate the difference between the two clusterings.