In [166]:
import json
from fuzzywuzzy import fuzz
import numpy as np

In [167]:
# Load the generated plan; its 'itinerary' entries are resolved to business ids
# further down. JSON text is UTF-8, so pin the encoding rather than relying on
# the platform's locale default.
with open('Outputs/Task1_json/Task1_json_1.json', encoding='utf-8') as f:
    plan = json.load(f)

In [168]:
# JSON text is UTF-8, so pin the encoding rather than relying on the platform's
# locale default.
# NOTE(review): the variable name shadows the builtin eval(); it is kept as-is
# because code outside this cell may already refer to it.
with open('Prompts/Task1_eval/Task1_eval_1.json', encoding='utf-8') as f:
    eval = json.load(f)

In [169]:
def _read_jsonl(path):
    """Load a JSON Lines file into a list, one parsed object per non-blank line."""
    # JSON text is UTF-8; pin it so decoding does not depend on the OS locale.
    with open(path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline at EOF) are skipped because
        # json.loads() rejects an empty document.
        return [json.loads(line) for line in file if line.strip()]


# Candidate pools searched by getID(); each record is read for its 'name',
# 'address' and 'business_id' keys.
restaurants = _read_jsonl('Datasets/Restaurants_task1.jsonl')
hotels = _read_jsonl('Datasets/Hotels_task1.jsonl')
attractions = _read_jsonl('Datasets/Attractions_task1.jsonl')

def _is_missing_field(value):
    """Return True when an extracted field carries no usable text."""
    # "-" is the placeholder checked for missing information; None, non-string
    # values and blank strings are treated the same way instead of crashing
    # later on .split()/.lower().
    return not isinstance(value, str) or value.strip() in ("", "-")


def _match_business_id(name, address, pool, min_score):
    """Resolve a (name, street address) pair to a business_id within one pool.

    Returns the matched record's 'business_id', or -1 when nothing in the pool
    matches well enough.
    """
    if not pool:
        # Nothing to compare against, so the business cannot be in the pool.
        return -1

    name_key = name.lower()
    address_key = address.lower()

    # Exact (case-insensitive) matches on each field separately.
    ids_by_name = {entry['business_id'] for entry in pool
                   if entry['name'].lower() == name_key}
    ids_by_address = {entry['business_id'] for entry in pool
                      if entry['address'].lower() == address_key}

    # Accept the exact match only when name and address agree on a single id.
    agreed = ids_by_name & ids_by_address
    if len(agreed) == 1:
        return next(iter(agreed))

    # Otherwise rank every record by the SUM of its name and address similarity
    # (each fuzz.ratio score is 0-100, so the sum is 0-200).
    scores = np.array([
        fuzz.ratio(name_key, entry['name'].lower())
        + fuzz.ratio(address_key, entry['address'].lower())
        for entry in pool
    ])
    best = int(np.argmax(scores))
    if scores[best] >= min_score:
        return pool[best]['business_id']
    # Best combined score is below the threshold: treat as outside the pool.
    return -1


def getID(name, address, category, min_score=120):
    """Map an extracted business (name + address) to its id in the dataset pool.

    Parameters
    ----------
    name : str
        Business name as extracted; "-" marks a missing value.
    address : str
        Business address as extracted; only the part before the first comma is
        compared against the pool's 'address' field. "-" marks a missing value.
    category : str
        One of 'restaurants', 'attractions' or 'hotels'; selects the pool.
    min_score : int, optional
        Minimum combined (name + address) fuzzy score, out of 200, required to
        accept a fuzzy match. Defaults to 120.

    Returns
    -------
    The matching record's 'business_id', or
    -1 if no record in the pool matches well enough (outside the pool), or
    -2 if the name or address is missing (None, blank, or "-").

    Raises
    ------
    ValueError
        If ``category`` is not one of the three known pools (previously this
        silently returned None, which downstream checks treated as a valid id).
    """
    # Missing information takes precedence over everything else.
    if _is_missing_field(name) or _is_missing_field(address):
        return -2

    if category == 'restaurants':
        pool = restaurants
    elif category == 'attractions':
        pool = attractions
    elif category == 'hotels':
        pool = hotels
    else:
        raise ValueError(
            "unknown category %r; expected 'restaurants', 'attractions' or 'hotels'"
            % (category,)
        )

    # Keep only the street part of the address and drop stray whitespace.
    street = address.split(",")[0].strip()
    return _match_business_id(name.strip(), street, pool, min_score)

In [170]:
# Resolve every slot of each itinerary day to a business id via getID():
# meals against the restaurant pool, attraction lists against the attraction
# pool, and the accommodation against the hotel pool.
plan_eval = []
for day_plan in plan['itinerary']:
    plan_eval.append({
        'days': day_plan['days'],
        'breakfast': getID(day_plan['breakfast']['name'],
                           day_plan['breakfast']['address'],
                           'restaurants'),
        'morning_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['morning_attractions']
        ],
        'lunch': getID(day_plan['lunch']['name'],
                       day_plan['lunch']['address'],
                       'restaurants'),
        'afternoon_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['afternoon_attractions']
        ],
        'dinner': getID(day_plan['dinner']['name'],
                        day_plan['dinner']['address'],
                        'restaurants'),
        'night_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['night_attractions']
        ],
        'accommodation': getID(day_plan['accommodation']['name'],
                               day_plan['accommodation']['address'],
                               'hotels'),
    })
plan_eval

[{'days': '1',
  'breakfast': 'TwnzM8mJn_nT2PJf1x-9kQ',
  'morning_attractions': ['Ib9HV7ekw459jM1Ksdiyiw'],
  'lunch': 'wHkYLlZyPXllrQRlvidUlg',
  'afternoon_attractions': ['kbEVlzQLcYS3JSQPG9QMOQ',
   '8_O6LXLyMgpq1g9CIwcW4w'],
  'dinner': 'S8ZFYEgMejpChID8tzKo9A',
  'night_attractions': ['pxZAz8pv18wK_t-m8WpN0g'],
  'accommodation': '3QPAh9VvYNTnqAFgBeBcng'},
 {'days': '2',
  'breakfast': 'rYqmaOIULRouz_1db07OdQ',
  'morning_attractions': ['Qw7tz-UkPrpXaVidWuab4Q'],
  'lunch': '6ajnOk0GcY9xbb5Ocaw8Gw',
  'afternoon_attractions': ['4mWzXhD8vo0bABVCGAhlqA',
   'ytynqOUb3hjKeJfRj5Tshw'],
  'dinner': '05ev984NYfimRN0UiFrxaA',
  'night_attractions': ['dBRWKIS7h-qZCi8EjUJ2HA'],
  'accommodation': '3QPAh9VvYNTnqAFgBeBcng'}]

### Failure Rate

1. Outside the pool

In [171]:
# Variant of the plan used for the "outside the pool" check. JSON text is
# UTF-8, so pin the encoding rather than relying on the platform's locale default.
with open('Outputs/Task1_json/Task1_json_1_outOfPool.json', encoding='utf-8') as f:
    plan_outOfPool = json.load(f)

In [172]:
# Resolve every slot of each itinerary day to a business id via getID();
# entries getID() cannot place come back as -1 (outside the pool) or -2
# (missing name/address).
plan_eval_outOfPool = []
for day_plan in plan_outOfPool['itinerary']:
    plan_eval_outOfPool.append({
        'days': day_plan['days'],
        'breakfast': getID(day_plan['breakfast']['name'],
                           day_plan['breakfast']['address'],
                           'restaurants'),
        'morning_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['morning_attractions']
        ],
        'lunch': getID(day_plan['lunch']['name'],
                       day_plan['lunch']['address'],
                       'restaurants'),
        'afternoon_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['afternoon_attractions']
        ],
        'dinner': getID(day_plan['dinner']['name'],
                        day_plan['dinner']['address'],
                        'restaurants'),
        'night_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['night_attractions']
        ],
        'accommodation': getID(day_plan['accommodation']['name'],
                               day_plan['accommodation']['address'],
                               'hotels'),
    })
plan_eval_outOfPool

[{'days': '1',
  'breakfast': -2,
  'morning_attractions': ['Ib9HV7ekw459jM1Ksdiyiw'],
  'lunch': -1,
  'afternoon_attractions': [],
  'dinner': 'S8ZFYEgMejpChID8tzKo9A',
  'night_attractions': ['pxZAz8pv18wK_t-m8WpN0g'],
  'accommodation': -2},
 {'days': '2',
  'breakfast': 'rYqmaOIULRouz_1db07OdQ',
  'morning_attractions': ['Qw7tz-UkPrpXaVidWuab4Q'],
  'lunch': '6ajnOk0GcY9xbb5Ocaw8Gw',
  'afternoon_attractions': [-1, 'ytynqOUb3hjKeJfRj5Tshw'],
  'dinner': '05ev984NYfimRN0UiFrxaA',
  'night_attractions': [],
  'accommodation': -2}]

In [173]:
# Print one line for every resolved id equal to -1 (getID's "outside the pool"
# code), whether it sits directly in a slot or inside an attraction list.
for resolved_day in plan_eval_outOfPool:
    for slot_value in resolved_day.values():
        # Treat a scalar slot as a one-element list so both shapes share one path.
        candidates = slot_value if isinstance(slot_value, list) else [slot_value]
        for business_id in candidates:
            if business_id == -1:
                print('outside the pool')

outside the pool
outside the pool


2. Missing info

In [174]:
# Variant of the plan used for the "missing info" check. JSON text is UTF-8,
# so pin the encoding rather than relying on the platform's locale default.
with open('Outputs/Task1_json/Task1_json_1_missingInfo.json', encoding='utf-8') as f:
    plan_missingInfo = json.load(f)

In [175]:
# Resolve every slot of each itinerary day to a business id via getID();
# slots whose name or address is "-" come back as -2.
plan_eval_missingInfo = []
for day_plan in plan_missingInfo['itinerary']:
    plan_eval_missingInfo.append({
        'days': day_plan['days'],
        'breakfast': getID(day_plan['breakfast']['name'],
                           day_plan['breakfast']['address'],
                           'restaurants'),
        'morning_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['morning_attractions']
        ],
        'lunch': getID(day_plan['lunch']['name'],
                       day_plan['lunch']['address'],
                       'restaurants'),
        'afternoon_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['afternoon_attractions']
        ],
        'dinner': getID(day_plan['dinner']['name'],
                        day_plan['dinner']['address'],
                        'restaurants'),
        'night_attractions': [
            getID(spot['name'], spot['address'], 'attractions')
            for spot in day_plan['night_attractions']
        ],
        'accommodation': getID(day_plan['accommodation']['name'],
                               day_plan['accommodation']['address'],
                               'hotels'),
    })
plan_eval_missingInfo

[{'days': '1',
  'breakfast': -2,
  'morning_attractions': ['Ib9HV7ekw459jM1Ksdiyiw'],
  'lunch': 'wHkYLlZyPXllrQRlvidUlg',
  'afternoon_attractions': [],
  'dinner': 'S8ZFYEgMejpChID8tzKo9A',
  'night_attractions': ['pxZAz8pv18wK_t-m8WpN0g'],
  'accommodation': -2},
 {'days': '2',
  'breakfast': 'rYqmaOIULRouz_1db07OdQ',
  'morning_attractions': ['Qw7tz-UkPrpXaVidWuab4Q'],
  'lunch': '6ajnOk0GcY9xbb5Ocaw8Gw',
  'afternoon_attractions': ['4mWzXhD8vo0bABVCGAhlqA',
   'ytynqOUb3hjKeJfRj5Tshw'],
  'dinner': '05ev984NYfimRN0UiFrxaA',
  'night_attractions': [],
  'accommodation': -2}]

In [176]:
# Flag itinerary slots whose information is missing: a -2 id from getID(), or
# an attraction list that is empty or contains a -2 entry.
miss_info = False

for day in plan_eval_missingInfo:
    for key, value in day.items():
        # night attractions can be skipped, so they never count as missing
        if key == 'night_attractions':
            continue

        if isinstance(value, list):
            # An empty list means no attraction was extracted for the slot; a
            # -2 inside the list means one attraction came without a usable
            # name/address (previously this case went unreported).
            if len(value) == 0 or -2 in value:
                print('miss list info')
                miss_info = True
        elif value == -2:
            print('miss other info')
            miss_info = True

if miss_info:
    print("miss info")

miss other info
miss list info
miss other info
miss other info
miss info
