In [None]:
# Read data/airports2.csv into a DataFrame and preview its first rows.
import pandas as pd

airports = pd.read_csv(filepath_or_buffer='data/airports2.csv')

# Last expression of the cell, so the notebook renders the preview.
airports.head(5)

In [None]:
# Build `flights`: one row per Destination_airport with a `flights` column holding
# the number of rows in `airports` that share that destination.
# NOTE(review): .size() counts rows per airport; it does not sum any per-row
# flight-count column the CSV might contain -- confirm this is the intended metric.
flights = (
    airports
    .groupby('Destination_airport')
    .size()
    .reset_index(name='flights')
)

# Show the 10 destination airports with the highest count.
flights.sort_values(by='flights', ascending=False).head(10)

In [None]:
# Load data/airports.json into a DataFrame and preview the first 5 entries.
import json
with open('data/airports.json') as airports_file:
    airports_json = json.loads(airports_file.read())

# Transposing turns the top-level keys of the loaded object into the row index.
# NOTE(review): assumes the file is a JSON object keyed per airport -- confirm.
airports_json = pd.DataFrame(airports_json).T
airports_json.head(5)


In [None]:
# Keep only the airports whose `iata` value is not null, then count the distinct codes.
# NOTE(review): notnull() keeps empty strings, so blank codes (if the data has any)
# pass this filter -- confirm against the data.
has_iata = airports_json['iata'].notnull()
airports_json = airports_json.loc[has_iata]
airports_json['iata'].nunique()

In [None]:
# Inner-join the per-destination counts in `flights` with the airport metadata,
# matching Destination_airport against iata. Rows without a match on either side are dropped.
merged = pd.merge(
    left=flights,
    right=airports_json,
    how='inner',
    left_on='Destination_airport',
    right_on='iata',
)
merged.head(5)

In [None]:
# Write the merged table to data/merged.json as a JSON array with one object per row.
merged.to_json(
    path_or_buf='data/merged.json',
    orient='records',
)

Process IATA codes -> city descriptions (blurbs)

In [None]:
from openai import OpenAI
import json
client = OpenAI()

def getBlurb(city: str, client):
    """Ask the chat model for a short tourist-oriented description of a city.

    Args:
        city: Location text sent as the user message, in "city, state" format.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).

    Returns:
        The message content of the first choice in the response.
    """
    instructions = "You will be provided with a city in the United States. Please write a 2-3 sentence descriptive blurb, with the goal of providing an accurate description to potential tourists. Mention the area's climate and key destinations."
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": city},
    ]
    # Sampling settings are kept together so they are easy to scan and tune.
    request = dict(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=1,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Read data/merged.json back in.
# Note: this rebinds `merged` to the parsed JSON (plain Python objects), not a DataFrame.
with open('data/merged.json') as merged_file:
    merged = json.loads(merged_file.read())

In [None]:
# Single live call to getBlurb for one fixed city; the returned text is kept in `res`.
# NOTE(review): presumably a smoke test before the full batch run below -- confirm.
res = getBlurb('Minneapolis, Minnesota', client)

In [None]:

import time
from tqdm import tqdm

# Generate one blurb per airport in `merged`, resuming from earlier runs when possible.
#   data/blurb.json - list of {'iata', 'city', 'blurb'} records produced so far
#   data/failed.txt - IATA codes whose request raised, one per line
# Rows already present in blurb.json are skipped; codes that failed before are retried.

BLURB_REQUESTS_PER_PAUSE = 30  # pause (and checkpoint) after this many requests
BLURB_PAUSE_SECONDS = 5

# Only a missing file means "first run". Any other problem (for example a corrupt
# blurb.json) is raised instead of silently starting over and overwriting the cache.
try:
    with open('data/failed.txt') as f:
        failed = f.read().splitlines()
except FileNotFoundError:
    failed = []
try:
    with open('data/blurb.json') as f:
        blurb = json.load(f)
except FileNotFoundError:
    blurb = []


def _save_blurb_progress(blurb, failed):
    """Write the blurbs collected so far and the failed IATA codes to disk."""
    with open('data/blurb.json', 'w') as f:
        json.dump(blurb, f)
    with open('data/failed.txt', 'w') as f:
        f.write('\n'.join(failed))


# A set gives constant-time "already done?" checks instead of rebuilding a list per row.
done_iata = {b['iata'] for b in blurb}
# Drop blank lines, duplicates and codes that have succeeded since they were recorded.
failed = list(dict.fromkeys(code for code in failed if code and code not in done_iata))

requests_made = 0
try:
    # Wrapping the list itself (not enumerate) lets tqdm show the total and an ETA.
    for row in tqdm(merged):
        iata = row['iata']
        if iata in done_iata:
            continue
        try:
            res = getBlurb(row['city'] + ', ' + row['state'], client)
        except Exception as e:
            # Best effort: record the failure and move on to the next airport.
            print(e)
            if iata not in failed:
                failed.append(iata)
        else:
            blurb.append({'iata': iata, 'city': row['city'], 'blurb': res})
            done_iata.add(iata)
            if iata in failed:
                failed.remove(iata)
        # Throttle on requests actually attempted, so skipped rows neither trigger
        # nor hide a pause.
        requests_made += 1
        if requests_made % BLURB_REQUESTS_PER_PAUSE == 0:
            time.sleep(BLURB_PAUSE_SECONDS)
            # Checkpoint in case the process crashes.
            _save_blurb_progress(blurb, failed)
finally:
    # Always persist what was collected, including when the loop is interrupted.
    _save_blurb_progress(blurb, failed)

In [None]:
def trimEmbedding(embedding, dim=512):
    """Return the first ``dim`` components of ``embedding``.

    Args:
        embedding: Sliceable sequence of numbers (e.g. a list of floats).
        dim: Number of leading components to keep. Defaults to 512, the size
            this notebook has always used. Sequences shorter than ``dim`` are
            returned in full.

    Returns:
        A slice of ``embedding`` holding at most ``dim`` leading components.

    Raises:
        ValueError: If ``dim`` is negative (a negative slice bound would drop
            components from the end instead of keeping a prefix).

    NOTE(review): the truncated vector is not re-normalized; confirm that the
    downstream similarity search uses a scale-invariant metric such as cosine.
    """
    if dim < 0:
        raise ValueError(f"dim must be non-negative, got {dim}")
    return embedding[:dim]

def getEmbedding(client, text):
    """Embed ``text`` with the "text-embedding-3-small" model and trim the result.

    Args:
        client: Object exposing ``embeddings.create`` (an OpenAI client).
        text: Input passed to the embeddings endpoint.

    Returns:
        The embedding of the first item in the response, after ``trimEmbedding``.
    """
    response = client.embeddings.create(model="text-embedding-3-small", input=text)
    first_item = response.data[0]
    return trimEmbedding(first_item.embedding)


import os
from supabase import create_client, Client

# Supabase connection settings are read from the environment; a variable that is
# not set yields None here, which is then passed on to create_client.
url: str = os.getenv("SUPABASE_URL")
key: str = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(url, key)

In [None]:
def addRows(client, table, rows):
    """Insert ``rows`` into ``table`` using the given Supabase client.

    Args:
        client: Supabase client (anything exposing ``table(name).insert(rows).execute()``).
            The insert goes through this argument rather than a module-level global,
            so the function works with whichever client the caller passes.
        table: Name of the destination table.
        rows: List of row dictionaries to insert.

    Returns:
        Whatever ``execute()`` returns, so callers can inspect the inserted data.
        Existing callers that ignore the return value are unaffected.
    """
    return client.table(table).insert(rows).execute()

def processJsonEntryToRow(entry):
    """Convert one blurb record into a row dictionary for the database table.

    Args:
        entry: Mapping with the keys 'iata', 'city' and 'blurb'.

    Returns:
        A dict with 'iata', 'location' (taken from the entry's 'city'), 'blurb',
        and 'embedding' (from ``getEmbedding`` on the blurb text, using the
        module-level ``client``).
    """
    iata, city, text = entry['iata'], entry['city'], entry['blurb']
    vector = getEmbedding(client, text)
    return {
        'iata': iata,
        'location': city,
        'blurb': text,
        'embedding': vector,
    }

In [None]:
# Embed every entry of data/blurb.json and insert it into the Supabase `blurb` table,
# one row per request, pausing periodically to avoid rate limiting.
# Progress is tracked in two text files (one IATA code per line) so an interrupted run
# can be resumed without inserting the same airport twice:
#   data/succeeded_emb.txt - codes already inserted (skipped on later runs)
#   data/failed_emb.txt    - codes whose embedding or insert raised (retried on later runs)
from tqdm import tqdm
import time

EMB_ENTRIES_PER_PAUSE = 100  # pause (and checkpoint) after this many processed entries
EMB_PAUSE_SECONDS = 5


def _read_codes(path):
    """Return the non-empty lines of ``path``, or an empty list if the file does not exist."""
    try:
        with open(path) as f:
            return [line for line in f.read().splitlines() if line]
    except FileNotFoundError:
        # Only a missing file means "nothing recorded yet"; other errors are raised.
        return []


def _save_embedding_progress(succeeded, failed):
    """Write the succeeded and failed IATA code lists to their progress files."""
    with open('data/succeeded_emb.txt', 'w') as f:
        f.write('\n'.join(succeeded))
    with open('data/failed_emb.txt', 'w') as f:
        f.write('\n'.join(failed))


succeeded_emb = _read_codes('data/succeeded_emb.txt')
succeeded_codes = set(succeeded_emb)  # constant-time membership checks
# Drop duplicates and codes that have succeeded since they were recorded as failed.
failed_emb = list(dict.fromkeys(
    code for code in _read_codes('data/failed_emb.txt') if code not in succeeded_codes
))

with open('data/blurb.json') as f:
    blurb = json.load(f)

processed = 0
try:
    # Wrapping the list itself (not enumerate) lets tqdm show the total and an ETA.
    for entry in tqdm(blurb):
        iata = entry['iata']
        if iata in succeeded_codes:
            continue
        try:
            row = processJsonEntryToRow(entry)
            addRows(supabase, 'blurb', [row])
        except Exception as e:
            # Best effort: record the failure and continue with the next entry.
            print(e)
            if iata not in failed_emb:
                failed_emb.append(iata)
        else:
            succeeded_emb.append(iata)
            succeeded_codes.add(iata)
            if iata in failed_emb:
                failed_emb.remove(iata)
        # Throttle on entries actually processed, so skipped entries neither trigger
        # nor hide a pause.
        processed += 1
        if processed % EMB_ENTRIES_PER_PAUSE == 0:
            time.sleep(EMB_PAUSE_SECONDS)
            _save_embedding_progress(succeeded_emb, failed_emb)
finally:
    # Always persist progress, including when the loop is interrupted.
    _save_embedding_progress(succeeded_emb, failed_emb)


JSON experiments: turning natural-language flight queries into structured JSON

In [5]:
from openai import OpenAI
client = OpenAI()

def get_JSON_format(query: str, date: str, openai_client=None):
    """Turn a natural-language flight query into a JSON string via the chat model.

    Args:
        query: The user's request, sent as the user message.
        date: Today's date, interpolated into the system prompt so the model can
            resolve relative dates. The prompt asks for YYYY-MM-DD output dates.
        openai_client: Optional object exposing ``chat.completions.create``.
            When omitted, the module-level ``client`` is used, as before.

    Returns:
        The message content of the first choice: a JSON string that the prompt
        asks to contain the keys departure_airport, destination_flavor_text,
        departure_start_date and departure_end_date.
    """
    # The examples are written as real JSON objects ({{ }} is a literal brace in an
    # f-string). The dated example states the date it assumes, so its hard-coded
    # dates cannot contradict the actual `date` given above it in the prompt.
    SYSTEM = f"""
    You are a helpful assistant designed turn queries into JSON format. Users will be asking for information about booking flights, 
    and you will be provided with their natural language query. This may contain multiple components -- primarily it will be a description of 
    where the users want to go, either natural language (e.g. "I want to go somewhere warm"), or specific (e.g. "I want to go to Orlando"). 
    They may also choose to provide information about where they want to depart from, this may be as a city or an airport IATA code. Finally,
    there may be information about the date range they are happy to take the departing flight during.
    
    The JSON has the following keys:
    departure_airport: The airport of departure, as an IATA code.
    destination_flavor_text: Natural language details about where the user wants to go, to be processed later.
    departure_start_date: The first valid date of departure, in the format YYYY-MM-DD.
    departure_end_date: The last valid date of departure, in the format YYYY-MM-DD.

    Note that today's date is {date}.

    If you are not provided with details that would fit to any of these keys, please return that key as an empty string.

    Some examples:
    Query: "I want to go to Orlando"
    JSON: {{"departure_airport": "", "destination_flavor_text": "I want to go to Orlando", "departure_start_date": "", "departure_end_date": ""}}

    Query: "I want to go somewhere warm, with a dry desert climate"
    JSON: {{"departure_airport": "", "destination_flavor_text": "I want to go somewhere warm, with a dry desert climate", "departure_start_date": "", "departure_end_date": ""}}

    Query: "I'm leaving from Chicago and I want to go to somewhere cold that is known for skiing"
    JSON: {{"departure_airport": "ORD", "destination_flavor_text": "I want to go to somewhere cold that is known for skiing", "departure_start_date": "", "departure_end_date": ""}}

    Query (asked when today's date is 2024-04-23): "I want to leave in the next week from LAX and go to New York"
    JSON: {{"departure_airport": "LAX", "destination_flavor_text": "I want to go to New York", "departure_start_date": "2024-04-23", "departure_end_date": "2024-04-30"}}
    """
    # Fall back to the notebook-level client only when none was passed in.
    api = client if openai_client is None else openai_client
    response = api.chat.completions.create(
        model="gpt-4-turbo",
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": query}
        ]
    )
    return response.choices[0].message.content

In [7]:
# Fixed reference date handed to get_JSON_format as "today's date".
DATE = "2024-04-23"

# `jsonres` holds the string returned by get_JSON_format for this sample query.
jsonres = get_JSON_format("I want to go to Orlando, today or tomorrow", DATE)

In [9]:
# Parse jsonres (a JSON string) into a dictionary and display it.
# The isinstance guard keeps the cell re-runnable: once jsonres has been parsed,
# running the cell again leaves it unchanged instead of raising from json.loads.
import json
if isinstance(jsonres, str):
    jsonres = json.loads(jsonres)
jsonres

{'departure_airport': '',
 'destination_flavor_text': 'I want to go to Orlando',
 'departure_start_date': '2024-04-23',
 'departure_end_date': '2024-04-24'}