In [106]:
import os
import requests
from datetime import datetime
import pandas as pd
import holidays

## weather

In [107]:
def is_alert(code: int) -> bool:
    """
    Returns True if the weather condition code should trigger an alert, False otherwise.
    Optionally uses the description for more granularity.
    """

    # Thunderstorms (Group 2xx)
    if 200 <= code <= 232:
        return True  # All thunderstorms are potentially hazardous

    # Drizzle (Group 3xx)
    if 300 <= code <= 321:
        return False  # Generally light, not hazardous

    # Rain (Group 5xx)
    if code in {502, 503, 504, 511, 522, 531}:
        return True  # Heavy, extreme, or freezing rain
    elif 500 <= code <= 531:
        return False  # Light/moderate rain usually not alert-level

    # Snow (Group 6xx)
    if code in {602, 622, 616, 613, 620}:
        return True  # Heavy or mixed snow/ice conditions
    elif 600 <= code <= 622:
        return False  # Light snow/sleet usually advisory only

    # Atmosphere (Group 7xx)
    if code in {741, 762, 771, 781}:
        return True  # Fog, ash, squalls, tornado
    elif 700 <= code <= 781:
        return False  # Mist, haze, etc.

    # Clear & Clouds (Group 800-804)
    if code == 800:
        return False
    if 801 <= code <= 804:
        return False  # Cloud cover, no hazard

    return False  # Default to non-alert if unknown

In [108]:
city_coords = {
    "new york": (40.7128, -74.0060),
    "los angeles": (34.0522, -118.2437),
    "chicago": (41.8781, -87.6298),
    "houston": (29.7604, -95.3698),
    "phoenix": (33.4484, -112.0740),
    "philadelphia": (39.9526, -75.1652),
    "san antonio": (29.4241, -98.4936),
    "san diego": (32.7157, -117.1611),
    "dallas": (32.7767, -96.7970),
    "san jose": (37.3382, -121.8863),
    "austin": (30.2672, -97.7431),
    "jacksonville": (30.3322, -81.6557),
    "fort worth": (32.7555, -97.3308),
    "columbus": (39.9612, -82.9988),
    "charlotte": (35.2271, -80.8431)
}

In [109]:
coords = [value for key, value in city_coords.items()]
coords

[(40.7128, -74.006),
 (34.0522, -118.2437),
 (41.8781, -87.6298),
 (29.7604, -95.3698),
 (33.4484, -112.074),
 (39.9526, -75.1652),
 (29.4241, -98.4936),
 (32.7157, -117.1611),
 (32.7767, -96.797),
 (37.3382, -121.8863),
 (30.2672, -97.7431),
 (30.3322, -81.6557),
 (32.7555, -97.3308),
 (39.9612, -82.9988),
 (35.2271, -80.8431)]

## city

In [35]:
df_city = pd.DataFrame({
    'id': range(1, len(city_coords)+1),
    'city_name': city_coords.keys()})

df_city

Unnamed: 0,id,city_name
0,1,new york
1,2,los angeles
2,3,chicago
3,4,houston
4,5,phoenix
5,6,philadelphia
6,7,san antonio
7,8,san diego
8,9,dallas
9,10,san jose


In [110]:
# from geopy.geocoders import Nominatim

# def get_city_name(lat, lon):
#     geolocator = Nominatim(user_agent="weather_alert_app")
#     location = geolocator.reverse((lat, lon), language="en")
#     if location and 'address' in location.raw:
#         address = location.raw['address']
#         # return address.get('city') or address.get('town') or address.get('village') or address.get('state')
#         return address.get('city')
#     return None

# # Example usage:
# city = get_city_name(39.099724, -94.578331)  # Washington, DC
# print(city)

In [123]:


def get_city_id(df):
    def find_city(lat, lon, lookup):
        target = (lat, lon)
        return next((k for k, v in lookup.items() if v == target), None)
    
    city_to_id = dict(zip(df_city['city_name'], df_city['id']))
    df['city'] = df.apply(lambda row: find_city(row["lat"], row["lon"], city_coords), axis=1)
    df['city_id'] = df['city'].map(city_to_id)
    df = df.drop(columns='city')

    return df

In [111]:
from datetime import datetime, timedelta, timezone

# Start and end dates
start_date = datetime(2025, 1, 1, tzinfo=timezone.utc)
end_date = datetime(2025, 6, 1, tzinfo=timezone.utc)

# Generate list of timestamps at 00:00:00 UTC each day
timestamps = []
current_date = start_date
while current_date <= end_date:
    timestamps.append(int(current_date.timestamp()))
    current_date += timedelta(days=1)

print(timestamps)

[1735689600, 1735776000, 1735862400, 1735948800, 1736035200, 1736121600, 1736208000, 1736294400, 1736380800, 1736467200, 1736553600, 1736640000, 1736726400, 1736812800, 1736899200, 1736985600, 1737072000, 1737158400, 1737244800, 1737331200, 1737417600, 1737504000, 1737590400, 1737676800, 1737763200, 1737849600, 1737936000, 1738022400, 1738108800, 1738195200, 1738281600, 1738368000, 1738454400, 1738540800, 1738627200, 1738713600, 1738800000, 1738886400, 1738972800, 1739059200, 1739145600, 1739232000, 1739318400, 1739404800, 1739491200, 1739577600, 1739664000, 1739750400, 1739836800, 1739923200, 1740009600, 1740096000, 1740182400, 1740268800, 1740355200, 1740441600, 1740528000, 1740614400, 1740700800, 1740787200, 1740873600, 1740960000, 1741046400, 1741132800, 1741219200, 1741305600, 1741392000, 1741478400, 1741564800, 1741651200, 1741737600, 1741824000, 1741910400, 1741996800, 1742083200, 1742169600, 1742256000, 1742342400, 1742428800, 1742515200, 1742601600, 1742688000, 1742774400, 174

In [112]:
# Create DataFrames
df_coords = pd.DataFrame(coords, columns=["lat", "lon"])
df_times = pd.DataFrame(timestamps, columns=["timestamp"])

# Add keys to enable cartesian join
df_coords["key"] = 1
df_times["key"] = 1

# Cartesian product via merge
df_coord_time = pd.merge(df_coords, df_times, on="key").drop("key", axis=1)

# Optional: convert timestamp to readable date
df_coord_time["datetime"] = pd.to_datetime(df_coord_time["timestamp"], unit="s")

print(df_coord_time)

          lat      lon   timestamp   datetime
0     40.7128 -74.0060  1735689600 2025-01-01
1     40.7128 -74.0060  1735776000 2025-01-02
2     40.7128 -74.0060  1735862400 2025-01-03
3     40.7128 -74.0060  1735948800 2025-01-04
4     40.7128 -74.0060  1736035200 2025-01-05
...       ...      ...         ...        ...
2710  35.2271 -80.8431  1750896000 2025-06-26
2711  35.2271 -80.8431  1750982400 2025-06-27
2712  35.2271 -80.8431  1751068800 2025-06-28
2713  35.2271 -80.8431  1751155200 2025-06-29
2714  35.2271 -80.8431  1751241600 2025-06-30

[2715 rows x 4 columns]


In [113]:
weather_api = os.getenv('OpenWeatherMap_API')


# url = f"https://api.openweathermap.org/data/3.0/onecall/timemachine?lat=39.099724&lon=-94.578331&dt=1735689600&&appid={weather_api}"

# response = requests.get(url)

# if response.status_code == 200:
#     data_w = response.json()
#     # print("Current temperature:", data['current']['temp'])
#     print(data_w)
# else:
#     print("Error:", response.status_code, response.text)

In [114]:
import time

In [115]:
df_w = pd.DataFrame()


for index, row in df_coord_time[:3].iterrows():
    lat = row['lat']
    lon = row['lon']
    timestamp = row['timestamp']
    url = f"https://api.openweathermap.org/data/3.0/onecall/timemachine?lat={lat}&lon={lon}&dt={timestamp}&appid={weather_api}"

    response = requests.get(url)

    time.sleep(1)

    if response.status_code == 200:
        data_w = response.json()
        # print("Current temperature:", data['current']['temp'])
        print(index, data_w)
    else:
        print("Error:", response.status_code, response.text)
    
    weather_list = data_w['data'][0]['weather']

    # Build a list of records for each weather condition
    records = [{'lat': data_w['lat'], 'lon': data_w['lon'], 'date': data_w['data'][0]['dt'], 'weather_id': w['id'], 'weather': w['main']} for w in weather_list]

    # Convert to DataFrame
    df_w = pd.concat([df_w, pd.DataFrame(records)])

df_w.head()

0 {'lat': 40.7128, 'lon': -74.006, 'timezone': 'America/New_York', 'timezone_offset': -18000, 'data': [{'dt': 1735689600, 'sunrise': 1735647596, 'sunset': 1735681097, 'temp': 282.46, 'feels_like': 278.72, 'pressure': 1007, 'humidity': 77, 'dew_point': 278.64, 'clouds': 0, 'visibility': 10000, 'wind_speed': 8.8, 'wind_deg': 100, 'wind_gust': 0, 'weather': [{'id': 800, 'main': 'Clear', 'description': 'clear sky', 'icon': '01n'}]}]}
1 {'lat': 40.7128, 'lon': -74.006, 'timezone': 'America/New_York', 'timezone_offset': -18000, 'data': [{'dt': 1735776000, 'sunrise': 1735734002, 'sunset': 1735767545, 'temp': 279.14, 'feels_like': 273.74, 'pressure': 1006, 'humidity': 62, 'dew_point': 272.51, 'clouds': 20, 'visibility': 10000, 'wind_speed': 11.3, 'wind_deg': 290, 'wind_gust': 0, 'weather': [{'id': 801, 'main': 'Clouds', 'description': 'few clouds', 'icon': '02n'}]}]}
2 {'lat': 40.7128, 'lon': -74.006, 'timezone': 'America/New_York', 'timezone_offset': -18000, 'data': [{'dt': 1735862400, 'sunri

Unnamed: 0,lat,lon,date,weather_id,weather
0,40.7128,-74.006,1735689600,800,Clear
0,40.7128,-74.006,1735776000,801,Clouds
0,40.7128,-74.006,1735862400,800,Clear


In [116]:
# Extracting relevant data

# weather_list = data_w['data'][0]['weather']

# # Build a list of records for each weather condition
# records = [{'lat': data_w['lat'], 'lon': data_w['lon'], 'date': data_w['data'][0]['dt'], 'weather_id': w['id'], 'weather': w['main']} for w in weather_list]

# # Convert to DataFrame
# df_w = pd.DataFrame(records)
# print(df_w)

In [124]:


# Apply it row-wise

# df_w['utc_date'] = pd.to_datetime(df_w['date'], unit='s', utc=True)
# df_w['date'] = pd.to_datetime(df_w['date'], unit='s').dt.date
df_weather = df_w.copy()
df_weather['date'] = pd.to_datetime(df_w['date'], unit='s').dt.strftime('%Y-%m-%d')
df_weather['alert'] = df_weather['weather_id'].apply(lambda x: is_alert(x))
df_weather = get_city_id(df_weather)

df_weather['id'] = range(1, len(df_weather)+1)

df_weather = df_weather.drop(['lat', 'lon'], axis=1)


In [125]:

# adjusted order
desired_order = ['id', 'date', 'city_id', 'weather_id', 'weather', 'alert']
df_weather = df_weather[desired_order]

df_weather

Unnamed: 0,id,date,city_id,weather_id,weather,alert
0,1,2025-01-01,1,800,Clear,False
0,2,2025-01-02,1,801,Clouds,False
0,3,2025-01-03,1,800,Clear,False


### national holidays

In [8]:
# cal_api = os.getenv('Calendarific_API')
# url = f"https://calendarific.com/api/v2/holidays?api_key={cal_api}&country=US&year=2025"

# response = requests.get(url)

# if response.status_code == 200:
#     data_h = response.json()
#     # print("Current temperature:", data['current']['temp'])
#     print(data_h)
# else:
#     print("Error:", response.status_code, response.text)

In [9]:
# df_holiday = pd.DataFrame(data_h['response']['holidays'])
# df_holiday

In [10]:
# for ptype in df_holiday['primary_type'].unique():
#     print(f"\n--- {ptype} ---")
#     print(df_holiday[df_holiday['primary_type'] == ptype]['name'].unique())


In [11]:
# df_holiday = df_holiday[df_holiday['primary_type'].isin(['Federal Holiday', 'State Holiday'])][['name', 'locations']]
# df_holiday = df_holiday[df_holiday['primary_type'].isin(['Federal Holiday', 'State Holiday'])][['name', 'locations']]

In [12]:
# # Example: merging 'locations' column by 'name' using newline separator
# merged = df_holiday.groupby('name', as_index=False).agg({
#     'locations': lambda x: '\\'.join(str(v) for v in x)
#     # For non-text columns you might want to keep the first or max, etc.
#     # 'date': 'first',  # Or 'min', 'max', etc., depending on your goal
# })

# merged

In [13]:
# def keep_all_location(group):
#     if 'All' in group['locations'].values:
#         return group[group['locations'] == 'All']
#     else:
#         return group

# result = df_holiday.groupby('name', group_keys=False).apply(keep_all_location)
# result

In [None]:
# # data placeholder
# df_holidays = pd.DataFrame()

# # get 23-25 holiday data
# for year in [2023, 2024, 2025]:
#     us_holidays = holidays.UnitedStates(years=year)
#     df_holidays = pd.concat([df_holidays, pd.DataFrame(list(us_holidays.items()), columns=['date', 'name'])], ignore_index=True)

# # add id column
# df_holidays['id'] = range(1, len(df_holidays)+1)
# df_holidays['date'] = pd.to_datetime(df_holidays['date'], errors='coerce')

# # adjust order
# desired_order = ['id', 'name', 'date']
# df_holidays = df_holidays[desired_order]
# df_holidays['state'] = 'placeholder'

# df_holidays

Unnamed: 0,id,name,date,state
0,1,New Year's Day,2023-01-01,placeholder
1,2,New Year's Day (observed),2023-01-02,placeholder
2,3,Memorial Day,2023-05-29,placeholder
3,4,Juneteenth National Independence Day,2023-06-19,placeholder
4,5,Independence Day,2023-07-04,placeholder
5,6,Labor Day,2023-09-04,placeholder
6,7,Veterans Day,2023-11-11,placeholder
7,8,Veterans Day (observed),2023-11-10,placeholder
8,9,Thanksgiving Day,2023-11-23,placeholder
9,10,Christmas Day,2023-12-25,placeholder


## airport

In [27]:
# # city - airport
# iata_results = []


# for city, (lat, lon) in city_coords.items():
#     try:
#         response = amadeus.reference_data.locations.airports.get(latitude=lat, longitude=lon)
#         if response.data:
#             nearest = response.data[0]
#             iata_results.append({
#                 "iata_code": nearest["iataCode"]
#             })
#     except Exception as e:
#         print(f"Failed for {city}: {e}")

# # Convert to DataFrame
# df_iata = pd.DataFrame(iata_results)
# df_iata['city_id'] = range(1, len(df_iata)+1)
# df_iata['id'] = range(1, len(df_iata)+1)
# desired_order = ['id', 'iata_code', 'city_id']
# df_iata = df_iata[desired_order]

# print(df_iata)

   iata_code  id
0        EWR   1
1        LAX   2
2        ORD   3
3        IAH   4
4        PHX   5
5        PHL   6
6        SAT   7
7        SAN   8
8        DFW   9
9        SJC  10
10       AUS  11
11       JAX  12
12       DFW  13
13       CMH  14
14       CLT  15


In [41]:

# df_iata.to_csv('data/intermediate/city_airport.csv')
df_iata = pd.read_csv('data/intermediate/city_airport.csv', index_col=0)
df_iata

Unnamed: 0,id,iata_code,city_id
0,1,EWR,1
1,2,LAX,2
2,3,ORD,3
3,4,IAH,4
4,5,PHX,5
5,6,PHL,6
6,7,SAT,7
7,8,SAN,8
8,9,DFW,9
9,10,SJC,10


## route

In [50]:
from itertools import permutations

# Generate all ordered pairs (departure ≠ destination)
routes = list(permutations(df_iata["id"], 2))

# Convert to DataFrame
routes_df = pd.DataFrame(routes, columns=["departure_airport_id", "destination_airport_id"])

# Optional: Add a unique route_id (if you'll insert into SQL table)
routes_df["id"] = range(1, len(routes_df) + 1)
desired_order = ['id', "departure_airport_id", "destination_airport_id"]
routes_df = routes_df[desired_order]

routes_df

Unnamed: 0,id,departure_airport_id,destination_airport_id
0,1,1,2
1,2,1,3
2,3,1,4
3,4,1,5
4,5,1,6
...,...,...,...
205,206,15,10
206,207,15,11
207,208,15,12
208,209,15,13


## flight

In [2]:
# !pip install amadeus

Collecting amadeus
  Downloading amadeus-12.0.0.tar.gz (36 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: amadeus
  Building wheel for amadeus (pyproject.toml) ... [?25ldone
[?25h  Created wheel for amadeus: filename=amadeus-12.0.0-py2.py3-none-any.whl size=67555 sha256=ec3f4e4a1af961ea7b2b931eef658bb7005d9a82c81c47ac5e5c5c7f3749dbdb
  Stored in directory: /Users/kay/Library/Caches/pip/wheels/12/3d/ca/0c01ed3dde8eadb37517c80c1bab31308fc99f0ee31e539850
Successfully built amadeus
Installing collected packages: amadeus
Successfully installed amadeus-12.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [104]:
def airport_to_route(df):
    # Create mapping from IATA to airport ID
    iata_to_id = df_iata.set_index('iata_code')['id'].to_dict()

    # Map airport IDs to flights
    df['departure_airport_id'] = df['origin'].map(iata_to_id)
    df['arrival_airport_id'] = df['destination'].map(iata_to_id)

    # Create route lookup: {(depart_id, arrive_id): route_id}
    route_lookup = {
        (row['departure_airport_id'], row['destination_airport_id']): row['id']
        for _, row in routes_df.iterrows()
    }

    # Apply to get route_id
    df['route_id'] = df.apply(
        lambda row: route_lookup.get((row['departure_airport_id'], row['arrival_airport_id'])),
        axis=1
    )
    print(df)
    return(df)

In [76]:
# # historical price trial
# from amadeus import Client, ResponseError

# price_api = os.getenv('AMADEUS_API_KEY')
# price_secret = os.getenv('AMADEUS_SECRET')

# amadeus = Client(
#     client_id=price_api,
#     client_secret=price_secret
# )

# try: 
#     response = amadeus.analytics.itinerary_price_metrics.get(originIataCode='EWR', destinationIataCode='LAX',
#                                             departureDate='2025-01-02')
# except ResponseError as error:
#     print(error)

# response.data

[{'type': 'itinerary-price-metric',
  'origin': {'iataCode': 'EWR'},
  'destination': {'iataCode': 'LAX'},
  'departureDate': '2025-01-02',
  'transportType': 'FLIGHT',
  'currencyCode': 'EUR',
  'oneWay': False,
  'priceMetrics': [{'amount': '63.87', 'quartileRanking': 'MINIMUM'},
   {'amount': '363.63', 'quartileRanking': 'FIRST'},
   {'amount': '378.45', 'quartileRanking': 'MEDIUM'},
   {'amount': '493.62', 'quartileRanking': 'THIRD'},
   {'amount': '588.18', 'quartileRanking': 'MAXIMUM'}]}]

In [119]:
airport_route = pd.DataFrame(list(permutations(df_iata["iata_code"], 2)))
# airport_route

dates = pd.date_range(start="2025-01-01", end="2025-06-01", freq="D", tz="UTC")
df_dates = pd.DataFrame(dates)

route_dates = airport_route.merge(df_dates, how="cross")
route_dates.columns = ['departure_iata', 'arrival_iata', 'departure_date']
route_dates['departure_date'] = pd.to_datetime(route_dates['departure_date']).dt.strftime('%Y-%m-%d')


route_dates

Unnamed: 0,departure_iata,arrival_iata,departure_date
0,EWR,LAX,2025-01-01
1,EWR,LAX,2025-01-02
2,EWR,LAX,2025-01-03
3,EWR,LAX,2025-01-04
4,EWR,LAX,2025-01-05
...,...,...,...
31915,CLT,CMH,2025-05-28
31916,CLT,CMH,2025-05-29
31917,CLT,CMH,2025-05-30
31918,CLT,CMH,2025-05-31


In [120]:
from amadeus import Client, ResponseError

price_api = os.getenv('AMADEUS_API_KEY')
price_secret = os.getenv('AMADEUS_SECRET')

amadeus = Client(
    client_id=price_api,
    client_secret=price_secret
)
# Quartile label mapping
quartile_map = {
    "MINIMUM": "price_quantile_minimum",
    "FIRST": "price_quantile_low",
    "MEDIUM": "price_quantile_middle",
    "THIRD": "price_quantile_high",
    "MAXIMUM": "price_quantile_maximum"
}


# Parse each item in the list
response_data = []
for _, route in route_dates[:2].iterrows():
    departure_iata = route['departure_iata']
    arrival_iata = route['arrival_iata']
    departure_date = route['departure_date']
    # print(arrival_iata)
    try: 
        response = amadeus.analytics.itinerary_price_metrics.get(
            originIataCode=departure_iata, 
            destinationIataCode=arrival_iata,
            departureDate=departure_date,
            currencyCode="USD")
        # print(response.data)
        
        # Only keep first record for simplicity
        if response.data:
            # print(response_data)
            response_data.append(response.data[0])

    except ResponseError as error:
        print(f"Error for {departure_iata} to {arrival_iata} on {departure_date}: {error}")

rows = []
for item in response_data:
    row = {
        "origin": item['origin']['iataCode'],
        "destination": item['destination']['iataCode'],
        "departure_date": item['departureDate']
    }
    for metric in item['priceMetrics']:
        key = quartile_map.get(metric['quartileRanking'])
        row[key] = float(metric['amount'])
    rows.append(row)

# Convert to DataFrame
df_f = pd.DataFrame(rows)

# Show result
print(df_f)

  origin destination departure_date  price_quantile_minimum  \
0    EWR         LAX     2025-01-01                   74.99   
1    EWR         LAX     2025-01-02                   74.99   

   price_quantile_low  price_quantile_middle  price_quantile_high  \
0              426.94                 444.33               579.56   
1              426.94                 444.33               579.56   

   price_quantile_maximum  
0                  690.58  
1                  690.58  


In [105]:
df_flight = airport_to_route(df_f)
df_flight = df_flight.iloc[:, 2:]
df_flight['id'] = range(1, len(df_flight)+1)
desired_order = ['id', 'route_id', 'departure_date', 'price_quantile_minimum', 'price_quantile_low', 'price_quantile_middle', 'price_quantile_high', 'price_quantile_maximum']
df_flight = df_flight[desired_order]

df_flight

  origin destination departure_date  price_quantile_minimum  \
0    EWR         LAX     2025-01-01                   74.99   
1    EWR         LAX     2025-01-02                   74.99   

   price_quantile_low  price_quantile_middle  price_quantile_high  \
0              426.94                 444.33               579.56   
1              426.94                 444.33               579.56   

   price_quantile_maximum  departure_airport_id  arrival_airport_id  route_id  
0                  690.58                     1                   2         1  
1                  690.58                     1                   2         1  


Unnamed: 0,id,route_id,departure_date,price_quantile_minimum,price_quantile_low,price_quantile_middle,price_quantile_high,price_quantile_maximum
0,1,1,2025-01-01,74.99,426.94,444.33,579.56,690.58
1,2,1,2025-01-02,74.99,426.94,444.33,579.56,690.58


In [126]:
# import pandas as pd


# def parse_flight_price(flight_offers: list) -> pd.DataFrame:
#     rows = []

#     for offer in flight_offers:
#         total_price = offer.get("price", {}).get("grandTotal")
#         traveler_pricing = offer.get("travelerPricings", [{}])[0]
#         cabin = traveler_pricing.get("fareDetailsBySegment", [{}])[0].get("cabin")

#         for itinerary in offer.get("itineraries", []):
#             for segment in itinerary.get("segments", []):
#                 row = {
#                     "departure_iata": segment["departure"]["iataCode"],
#                     "arrival_iata": segment["arrival"]["iataCode"],
#                     "departure_time": segment["departure"]["at"],
#                     "arrival_time": segment["arrival"]["at"],
#                     "carrier_code": segment["carrierCode"],
#                     "total_price": total_price,
#                     "cabin": cabin
#                 }
#                 rows.append(row)

#     return pd.DataFrame(rows)

# df = parse_flight_price(response.data)
# print(df)

## Database setting

In [198]:
import duckdb as dd

# Create an in-memory DuckDB connection
con = dd.connect('data/static.duckdb')


In [199]:
# optional: clear database
drop_order = ["flight", "weather", "route", "nearest_airport", "city"]
for table_name in drop_order:
    con.execute(f"DROP TABLE IF EXISTS {table_name}")

In [200]:
# Read SQL schema from file
with open("schema/static.sql", "r") as f:
    sql_script = f.read()

# Execute the schema to build tables
con.execute(sql_script)
# drop a useless table
con.execute("DROP TABLE IF EXISTS location")

# Verify tables were created
print(con.execute("SHOW TABLES").fetchdf())

              name
0             city
1           flight
2  nearest_airport
3            route
4          weather


In [201]:
## insert locations info
# Register and insert into the location table
con.register("df_city", df_city)
con.execute("INSERT INTO city (id, city_name) SELECT id, city_name FROM df_city")

con.execute("PRAGMA table_info('city')").fetchdf()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,BIGINT,True,,True
1,1,city_name,VARCHAR,True,,False


In [202]:
## get all key info
# con.execute("SELECT * FROM duckdb_constraints()").fetchdf()

In [203]:
df_weather

Unnamed: 0,id,date,city_id,weather_id,weather,alert
0,1,2025-01-01,1,800,Clear,False
0,2,2025-01-02,1,801,Clouds,False
0,3,2025-01-03,1,800,Clear,False


In [204]:
### insert weather data
con.register("df_weather", df_weather)
con.execute("INSERT INTO weather SELECT * FROM df_weather")
con.execute("PRAGMA table_info('weather')").fetchdf()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,BIGINT,True,,True
1,1,date,DATE,True,,False
2,2,city_id,BIGINT,True,,False
3,3,weather_id,BIGINT,True,,False
4,4,weather,VARCHAR,True,,False
5,5,alert,BOOLEAN,True,,False


In [205]:
### insert airport data
con.register("df_iata", df_iata)
con.execute("INSERT INTO nearest_airport SELECT * FROM df_iata")
con.execute("PRAGMA table_info('nearest_airport')").fetchdf()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,BIGINT,True,,True
1,1,iata_code,VARCHAR,True,,False
2,2,city_id,BIGINT,True,,False


In [206]:
### insert route data
con.register("routes_df", routes_df)
con.execute("INSERT INTO route SELECT * FROM routes_df")
con.execute("PRAGMA table_info('route')").fetchdf()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,BIGINT,True,,True
1,1,departure_airport_id,BIGINT,True,,False
2,2,destination_airport_id,BIGINT,True,,False


In [207]:
### insert holiday data
con.register("df_flight", df_flight)
con.execute("INSERT INTO flight SELECT * FROM df_flight")
con.execute("PRAGMA table_info('flight')").fetchdf()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,BIGINT,True,,True
1,1,route_id,BIGINT,True,,False
2,2,departure_date,DATE,True,,False
3,3,price_quartile_minimum,SMALLINT,True,,False
4,4,price_quartile_low,SMALLINT,True,,False
5,5,price_quartile_middle,SMALLINT,True,,False
6,6,price_quartile_high,SMALLINT,True,,False
7,7,price_quartile_maximum,SMALLINT,True,,False


In [208]:
tables = con.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

for (table_name,) in tables:
    print(f"Table: {table_name}")
    rows = con.execute(f"SELECT * FROM {table_name} LIMIT 3").fetchdf()
    print(rows)
    print("\n" + "-"*40 + "\n")

Table: city
   id    city_name
0   1     new york
1   2  los angeles
2   3      chicago

----------------------------------------

Table: flight
   id  route_id departure_date  price_quartile_minimum  price_quartile_low  \
0   1         1     2025-01-01                      75                 427   
1   2         1     2025-01-02                      75                 427   

   price_quartile_middle  price_quartile_high  price_quartile_maximum  
0                    444                  580                     691  
1                    444                  580                     691  

----------------------------------------

Table: nearest_airport
   id iata_code  city_id
0   1       EWR        1
1   2       LAX        2
2   3       ORD        3

----------------------------------------

Table: route
   id  departure_airport_id  destination_airport_id
0   1                     1                       2
1   2                     1                       3
2   3                     

In [209]:
# end the database
con.commit()  # Optional, but ensures changes are flushed
con.close()   # Closes the DB and safely handles the WAL