In [81]:
import json
from pathlib import Path

import pandas as pd

In [130]:
# Maps a fine-grained slot name to a coarse entity label.
# The table is written per coarse label and flattened into {slot_name: label};
# key order matches the grouped listing below. Slot names that are missing
# here are not an error for process_entity, which falls back to the raw name.
ENTITY_MAPPER = {
    slot_name: label
    for label, slot_names in {
        # Places
        "Location": (
            "city_name",
            "airport_name",
            "airport_code",
            "state_name",
            "state_code",
            "country_name",
        ),
        # Carriers
        "Airline": (
            "airline_name",
            "airline_code",
        ),
        # Dates and times
        "DateTime": (
            "day_name",
            "day_number",
            "month_name",
            "year",
            "date_relative",
            "today_relative",
            "days_code",
            "time",
            "start_time",
            "end_time",
            "flight_time",
        ),
        # Qualifiers
        "Modifier": (
            "period_of_day",
            "period_mod",
            "time_relative",
            "flight_mod",
            "mod",
        ),
        # Fares and cost
        "Price": (
            "round_trip",
            "cost_relative",
            "fare_amount",
            "fare_basis_code",
        ),
        # Flight attributes
        "FlightDetails": (
            "class_type",
            "transport_type",
            "flight_stop",
            "flight_days",
            "connect",
            "restriction_code",
            "economy",
        ),
        # Flight / aircraft identifiers
        "Identifier": (
            "flight_number",
            "aircraft_code",
        ),
        # Meals
        "Meal": (
            "meal",
            "meal_description",
            "meal_code",
        ),
        # Everything else collapses to the placeholder label "X"
        "X": (
            "or",
            "O",
        ),
    }.items()
    for slot_name in slot_names
}


# Maps a raw intent label (including "+"-joined multi-intent labels) to a
# coarse intent class. Written per coarse class and flattened into
# {raw_intent: class}; key order matches the grouped listing below.
# process_data keeps any label that is missing here unchanged.
INTENT_MAPPER = {
    raw_intent: label
    for label, raw_intents in {
        # Plain flight queries
        "Flight": (
            "flight",
        ),
        # Fares and ground service
        "Fare": (
            "airfare",
            "ground_service",
            "flight+airfare",
            "ground_service+ground_fare",
            "ground_fare",
            "cheapest",
        ),
        # Flight information (airline, aircraft, flight numbers)
        "FlightInfo": (
            "airline",
            "abbreviation",
            "airline+flight_no",
            "aircraft",
            "flight_no",
            "aircraft+flight+flight_no",
        ),
        # Location-based queries
        "Location": (
            "airport",
            "city",
            "distance",
        ),
        # Scheduling/time queries
        "Schedule": (
            "flight_time",
            "airfare+flight_time",
        ),
        # Capacity/quantity queries
        "Capacity": (
            "capacity",
            "quantity",
        ),
        # Meal-related queries
        "Meal": (
            "meal",
        ),
        # Restriction or rule queries
        "Restriction": (
            "restriction",
        ),
    }.items()
    for raw_intent in raw_intents
}


In [138]:
def process_entity(st):
    """Turn a space-separated slot string into a list of coarse entity labels.

    Each token keeps only the text after its last "." and, of that, the text
    after its last "-" (e.g. "B-fromloc.city_name" -> "city_name"). The
    reduced name is looked up in ENTITY_MAPPER; names without an entry are
    returned as-is.

    Args:
        st: Slot tags for one utterance, separated by single spaces.

    Returns:
        A list with one label per token of ``st.split(" ")``.
    """
    def coarse_label(tag):
        # Strip everything up to the last "." and then up to the last "-".
        slot_name = tag.rsplit(".", 1)[-1].rsplit("-", 1)[-1]
        return ENTITY_MAPPER.get(slot_name, slot_name)

    return [coarse_label(tag) for tag in st.split(" ")]

def process_data(df):
    """Build JSON-ready records with coarse intent and entity labels.

    The input frame is left untouched: the mapped columns are added to a
    derived frame, so calling this function repeatedly (or on a slice of
    another frame) neither changes the caller's data nor depends on earlier
    calls.

    Args:
        df: DataFrame with ``text``, ``slots`` and ``intent`` columns.

    Returns:
        A list of dicts, one per row, with keys ``text``, ``entity`` (list of
        labels produced by ``process_entity``) and ``intent`` (looked up in
        ``INTENT_MAPPER``; labels without an entry are kept unchanged).
    """
    prepared = df.assign(
        intent=df["intent"].map(lambda raw: INTENT_MAPPER.get(raw, raw)),
        entity=df["slots"].map(process_entity),
    )

    # Round-trip through JSON so the result holds only plain Python types.
    serialized = prepared[["text", "entity", "intent"]].to_json(orient="records")
    return json.loads(serialized)
    


In [139]:
# Download the ATIS train and test splits as CSV straight from the Hugging Face
# dataset repository (requires network access).
train = pd.read_csv("https://huggingface.co/datasets/tuetschek/atis/raw/main/atis_train.csv")
test  = pd.read_csv("https://huggingface.co/datasets/tuetschek/atis/raw/main/atis_test.csv")


In [133]:
# process_data returns a list of record dicts; wrap it in a DataFrame so the
# column-based summaries (df.intent, df.entity) can be computed.
df = pd.DataFrame(process_data(train))

In [134]:
# Distribution of coarse intent labels.
df["intent"].value_counts()

intent
Flight         3666
Fare            719
FlightInfo      400
Capacity         67
Location         59
Schedule         55
Meal              6
Restriction       6
Name: count, dtype: int64

In [135]:
# Flatten the per-utterance label lists to one row per token, then count labels.
df["entity"].explode().value_counts()

entity
X                36112
Location         11766
DateTime          3525
Modifier          1610
Price             1268
Airline           1263
FlightDetails      818
Identifier         115
Meal               114
Name: count, dtype: int64

In [136]:
# Attach the coarse entity labels to the training frame (one list per utterance).
train.loc[:, "entity"] = train["slots"].map(process_entity)

In [137]:
# Token-level label distribution on the training frame.
train["entity"].explode().value_counts()

entity
X                36112
Location         11766
DateTime          3525
Modifier          1610
Price             1268
Airline           1263
FlightDetails      818
Identifier         115
Meal               114
Name: count, dtype: int64

In [140]:
# Convert each split into a list of {text, entity, intent} records.
train_records = process_data(train)
test_records  = process_data(test)

In [141]:
# Persist both splits as pretty-printed JSON. Create the output folder first so
# this cell also works on a fresh checkout where data/ does not exist yet.
Path("data").mkdir(parents=True, exist_ok=True)

with open("data/train.json", 'w') as f:
    json.dump(train_records, f, indent=4)

with open("data/test.json", 'w') as f:
    json.dump(test_records, f, indent=4)


In [89]:
# Preview a few records only; displaying the whole list floods the notebook output.
train_records[:5]

[{'text': 'i want to fly from boston at 838 am and arrive in denver at 1110 in the morning',
  'entity': ['X',
   'X',
   'X',
   'X',
   'X',
   'city_name',
   'X',
   'time',
   'time',
   'X',
   'X',
   'X',
   'city_name',
   'X',
   'time',
   'X',
   'X',
   'X'],
  'intent': 'flight'},
 {'text': 'what flights are available from pittsburgh to baltimore on thursday morning',
  'entity': ['X',
   'X',
   'X',
   'X',
   'X',
   'city_name',
   'X',
   'city_name',
   'X',
   'day_name',
   'X'],
  'intent': 'flight'},
 {'text': 'cheapest airfare from tacoma to orlando',
  'entity': ['X', 'X', 'X', 'city_name', 'X', 'city_name'],
  'intent': 'airfare'},
 {'text': 'round trip fares from pittsburgh to philadelphia under 1000 dollars',
  'entity': ['X', 'X', 'X', 'X', 'city_name', 'X', 'city_name', 'X', 'X', 'X'],
  'intent': 'airfare'},
 {'text': 'i need a flight tomorrow from columbus to minneapolis',
  'entity': ['X', 'X', 'X', 'X', 'X', 'X', 'city_name', 'X', 'city_name'],
  'int