In [1]:
# ===========================
# Sustainable Flight Advisor MVP
# ===========================

# --- 1) Imports ---
import pandas as pd
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import streamlit as st
from tqdm import tqdm  # optional for progress bars

# ===========================
# 2) Load Airports & Routes
# ===========================
# Airports (OpenFlights)
airports_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
airports = pd.read_csv(
    airports_url,
    header=None,
    names=["id","name","city","country","iata","icao","lat","lon","alt","tz","dst","tzdb"],
    quotechar='"'
)
airports['iata'] = airports['iata'].astype(str).str.strip().str.upper()
airports = airports[airports['iata'].str.len() == 3]  # keep only real 3-letter IATA codes

# Routes (OpenFlights)
routes_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat"
routes = pd.read_csv(
    routes_url,
    header=None,
    names=["airline","airline_id","source_airport","source_airport_id",
           "dest_airport","dest_airport_id","codeshare","stops","equipment"],
    quotechar='"'
)

# Merge coordinates
routes = routes.merge(
    airports[['iata','lat','lon']],
    left_on='source_airport',
    right_on='iata',
    how='left'
).rename(columns={'lat':'source_lat','lon':'source_lon'}).drop(columns='iata')

routes = routes.merge(
    airports[['iata','lat','lon']],
    left_on='dest_airport',
    right_on='iata',
    how='left'
).rename(columns={'lat':'dest_lat','lon':'dest_lon'}).drop(columns='iata')

# Drop routes with missing coordinates
routes = routes.dropna(subset=['source_lat','source_lon','dest_lat','dest_lon']).reset_index(drop=True)

# ===========================
# 3) Distance & CO2 Calculator
# ===========================
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0
    phi1 = math.radians(lat1); phi2 = math.radians(lat2)
    dphi = math.radians(lat2 - lat1); dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi/2)**2 + math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    return R * 2 * math.asin(math.sqrt(a))

DEFAULT_FACTOR_G_PER_PKM = 83.0  # g CO2 per pax-km

distances = []
co2s = []

for _, row in tqdm(routes.iterrows(), total=len(routes)):
    d = haversine_km(row['source_lat'], row['source_lon'], row['dest_lat'], row['dest_lon'])
    distances.append(d)
    co2s.append(d * DEFAULT_FACTOR_G_PER_PKM / 1000.0)  # kg CO2

routes['distance_km'] = distances
routes['co2_kg'] = co2s

# ===========================
# 4) BTS On-Time Delay Model (AI)
# ===========================
# Placeholder: replace with actual BTS data
# Expected columns: origin, dest, carrier, dep_hour, day_of_week, month, label_delay (>15min)
# bts_df = pd.read_csv("BTS_OnTimeData.csv")

# For MVP demo, create a dummy dataset
# Normally you would load thousands/millions of BTS rows here
bts_df = pd.DataFrame({
    'origin':['JFK','LAX','SFO','JFK'],
    'dest':['SFO','JFK','LAX','SFO'],
    'carrier':['AA','AA','UA','DL'],
    'dep_hour':[8,12,15,9],
    'day_of_week':[1,2,3,4],
    'month':[1,2,3,4],
    'delayed':[0,1,0,1]  # label
})

features = ['dep_hour','day_of_week','month']  # extend with origin/dest/carrier encoding
X = bts_df[features]
y = bts_df['delayed']

model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X, y)

# Example: predict delay probability for a route
bts_df['delay_prob'] = model.predict_proba(X)[:,1]

# ===========================
# 5) Combine with Routes
# ===========================
# For simplicity, merge on origin/dest/carrier for MVP
routes_demo = routes.merge(
    bts_df[['origin','dest','carrier','delay_prob']],
    left_on=['source_airport','dest_airport','airline'],
    right_on=['origin','dest','carrier'],
    how='left'
)

# Fill NaN delay probabilities with average or 0.1 for demo
routes_demo['delay_prob'] = routes_demo['delay_prob'].fillna(0.1)

# ===========================
# 6) Streamlit UI
# ===========================
st.title("Sustainable Flight Advisor MVP")

origin_input = st.text_input("Origin IATA:", value="JFK").upper()
dest_input = st.text_input("Destination IATA:", value="SFO").upper()

filtered = routes_demo[
    (routes_demo['source_airport']==origin_input) &
    (routes_demo['dest_airport']==dest_input)
]

# Rank by CO2 + delay probability
filtered['score'] = filtered['co2_kg'] * 0.5 + filtered['delay_prob'] * 100  # simple weighted score
filtered = filtered.sort_values('score')

st.write(f"Available routes from {origin_input} to {dest_input}:")
st.dataframe(filtered[['airline','source_airport','dest_airport','distance_km','co2_kg','delay_prob','score']])


0it [00:00, ?it/s]
2025-10-19 15:59:31.390 
  command:

    streamlit run C:\Users\gregr\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [3]:
import pandas as pd

url_airports = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"

airports = pd.read_csv(
    url_airports,
    header=None,
    names=["id","name","city","country","iata","icao","lat","lon","alt","tz","dst","tzdb"],
    quotechar='"'
)

print("Total rows loaded:", len(airports))
print("Sample rows:")
print(airports.head())

# Keep only valid 3-letter IATA codes
airports['iata'] = airports['iata'].astype(str).str.strip().str.upper()
airports_valid = airports[airports['iata'].str.len() == 3]
print("Valid airports:", len(airports_valid))


Total rows loaded: 7698
Sample rows:
                                                         id              name  \
1 Goroka Airport                                     Goroka  Papua New Guinea   
2 Madang Airport                                     Madang  Papua New Guinea   
3 Mount Hagen Kagamuga Airport                  Mount Hagen  Papua New Guinea   
4 Nadzab Airport                                     Nadzab  Papua New Guinea   
5 Port Moresby Jacksons International Airport  Port Moresby  Papua New Guinea   

                                              city country      iata  \
1 Goroka Airport                               GKA    AYGA -6.081690   
2 Madang Airport                               MAG    AYMD -5.207080   
3 Mount Hagen Kagamuga Airport                 HGU    AYMH -5.826790   
4 Nadzab Airport                               LAE    AYNZ -6.569803   
5 Port Moresby Jacksons International Airport  POM    AYPY -9.443380   

                                           