In [1]:
import pandas as pd
import networkx as nx
#import folium
import matplotlib.pyplot as plt


In [2]:
AIRPORTS_FILE = "airports.dat"
ROUTES_FILE = "routes.dat"

# Load airports dataset
cols_airports = ["Airport ID", "Airport Name", "City", "IATA", "ICAO", "Latitude", "Longitude", "Altitude", 
                 "Timezone", "DST", "TzDatabaseTime", "Type", "Source"]

airports_df = pd.read_csv(AIRPORTS_FILE, header=None, names=cols_airports)

# Load routes dataset
cols_routes = ["Airline", "AirlineID", "SourceAirport", "SourceAirportID", 
               "DestinationAirport", "DestinationAirportID", "Codeshare", "Stops", "Equipment"]

routes_df = pd.read_csv(ROUTES_FILE, header=None, names=cols_routes)

# Display the first few rows
display(airports_df.head(), routes_df.head())


Unnamed: 0,Airport ID,Airport Name,City,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,TzDatabaseTime,Type,Source
1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


Unnamed: 0,Airline,AirlineID,SourceAirport,SourceAirportID,DestinationAirport,DestinationAirportID,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


In [3]:
print("Missing values in airports dataset:\n", airports_df.isnull().sum())
print("\nMissing values in routes dataset:\n", routes_df.isnull().sum())


Missing values in airports dataset:
 Airport ID         0
Airport Name      49
City               0
IATA               0
ICAO               0
Latitude           0
Longitude          0
Altitude           0
Timezone           0
DST                0
TzDatabaseTime     0
Type               0
Source             0
dtype: int64

Missing values in routes dataset:
 Airline                     0
AirlineID                   0
SourceAirport               0
SourceAirportID             0
DestinationAirport          0
DestinationAirportID        0
Codeshare               53066
Stops                       0
Equipment                  18
dtype: int64


In [4]:
airports_df = airports_df.dropna(subset=["Airport Name"])

routes_df = routes_df.drop(columns=["Codeshare", "Equipment"])

print("Missing values in airports dataset:\n", airports_df.isnull().sum())
print("Missing values in routes dataset:\n", routes_df.isnull().sum())



Missing values in airports dataset:
 Airport ID        0
Airport Name      0
City              0
IATA              0
ICAO              0
Latitude          0
Longitude         0
Altitude          0
Timezone          0
DST               0
TzDatabaseTime    0
Type              0
Source            0
dtype: int64
Missing values in routes dataset:
 Airline                 0
AirlineID               0
SourceAirport           0
SourceAirportID         0
DestinationAirport      0
DestinationAirportID    0
Stops                   0
dtype: int64


In [5]:
print("Total Airports:", len(airports_df))
print("Total Routes:", len(routes_df))
print("\nTop 10 busiest airports (by outgoing routes):")
print(routes_df["SourceAirport"].value_counts().head(10))


Total Airports: 7649
Total Routes: 67663

Top 10 busiest airports (by outgoing routes):
SourceAirport
ATL    915
ORD    558
PEK    535
LHR    527
CDG    524
FRA    497
LAX    492
DFW    469
JFK    456
AMS    453
Name: count, dtype: int64


In [6]:
# Convert invalid latitude and longitude values to NaN
airports_df["Latitude"] = pd.to_numeric(airports_df["Latitude"], errors="coerce")
airports_df["Longitude"] = pd.to_numeric(airports_df["Longitude"], errors="coerce")

# Drop rows where Latitude or Longitude is NaN
airports_df = airports_df.dropna(subset=["Latitude", "Longitude"])

print(f"Cleaned dataset: {len(airports_df)} airports remaining.")


Cleaned dataset: 7649 airports remaining.


In [8]:
# # Create a map centered at an approximate global location
# import folium
# world_map = folium.Map(location=[20, 0], zoom_start=2)

# # Add airport markers
# for _, row in airports_df.iterrows():
#     folium.CircleMarker(
#         location=[row["Latitude"], row["Longitude"]],
#         radius=2,
#         color="blue",
#         fill=True,
#         fill_color="blue",
#     ).add_to(world_map)

# # Save the map
# world_map.save("airport_distribution.html")
# print("Airport distribution map saved as 'airport_distribution.html'")


ModuleNotFoundError: No module named 'folium'

In [16]:
def build_flight_graph(routes_df):
    """Builds a directed graph of the flight network."""
    G = nx.DiGraph()
    for _, row in routes_df.iterrows():
        G.add_edge(row["SourceAirport"], row["DestinationAirport"])
    return G

flight_graph = build_flight_graph(routes_df)

# Find the most connected airports (hubs)
degree_centrality = nx.degree_centrality(flight_graph)
top_hubs = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
print("\nTop 10 hubs (most connected airports):")
for airport, centrality in top_hubs:
    print(f"Airport {airport}: {centrality:.4f}")



Top 10 hubs (most connected airports):
Airport FRA: 0.1393
Airport CDG: 0.1373
Airport AMS: 0.1352
Airport IST: 0.1335
Airport ATL: 0.1265
Airport PEK: 0.1203
Airport ORD: 0.1195
Airport MUC: 0.1110
Airport DME: 0.1104
Airport DFW: 0.1086


In [17]:
def find_shortest_route(graph, source, destination):
    """Finds the shortest path between two airports."""
    try:
        return nx.shortest_path(graph, source=str(source), target=str(destination))
    except nx.NetworkXNoPath:
        return None
    except nx.NodeNotFound:
        return None

source_airport = "507"  # Example: ATL (Atlanta)
destination_airport = "3797"  # Example: LAX (Los Angeles)

shortest_route = find_shortest_route(flight_graph, source_airport, destination_airport)
print("Shortest route:", shortest_route)


Shortest route: None


In [18]:
# Airports that have outbound flights but no inbound flights
outbound_airports = set(routes_df["SourceAirport"])
inbound_airports = set(routes_df["DestinationAirport"])
airports_with_no_incoming = outbound_airports - inbound_airports
airports_with_no_outgoing = inbound_airports - outbound_airports

print("Airports with only outbound flights:", len(airports_with_no_incoming))
print("Airports with only inbound flights:", len(airports_with_no_outgoing))


Airports with only outbound flights: 7
Airports with only inbound flights: 16


In [22]:
from networkx.algorithms.community import greedy_modularity_communities

# Detect communities in the flight network
communities = greedy_modularity_communities(flight_graph)

print(f"Detected {len(communities)} airport clusters.")
print("Example of first community (first 10 airports):", list(communities[0])[:10])


Detected 46 airport clusters.
Example of first community (first 10 airports): ['ASM', 'LNB', 'RXS', 'CPD', 'BYN', 'WJU', 'KUL', 'JZH', 'THL', 'NTL']


In [23]:
display(airports_df.head(50), routes_df.head(50))


Unnamed: 0,Airport ID,Airport Name,City,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,TzDatabaseTime,Type,Source
1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10.0,U,Pacific/Port_Moresby,airport,OurAirports
2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10.0,U,Pacific/Port_Moresby,airport,OurAirports
3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10.0,U,Pacific/Port_Moresby,airport,OurAirports
4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10.0,U,Pacific/Port_Moresby,airport,OurAirports
5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10.0,U,Pacific/Port_Moresby,airport,OurAirports
6,Wewak International Airport,Wewak,Papua New Guinea,WWK,AYWK,-3.58383,143.669006,19,10.0,U,Pacific/Port_Moresby,airport,OurAirports
7,Narsarsuaq Airport,Narssarssuaq,Greenland,UAK,BGBW,61.1605,-45.425999,112,-3.0,E,America/Godthab,airport,OurAirports
8,Godthaab / Nuuk Airport,Godthaab,Greenland,GOH,BGGH,64.190903,-51.678101,283,-3.0,E,America/Godthab,airport,OurAirports
9,Kangerlussuaq Airport,Sondrestrom,Greenland,SFJ,BGSF,67.012222,-50.711603,165,-3.0,E,America/Godthab,airport,OurAirports
10,Thule Air Base,Thule,Greenland,THU,BGTL,76.531197,-68.703201,251,-4.0,E,America/Thule,airport,OurAirports


Unnamed: 0,Airline,AirlineID,SourceAirport,SourceAirportID,DestinationAirport,DestinationAirportID,Stops
0,2B,410,AER,2965,KZN,2990,0
1,2B,410,ASF,2966,KZN,2990,0
2,2B,410,ASF,2966,MRV,2962,0
3,2B,410,CEK,2968,KZN,2990,0
4,2B,410,CEK,2968,OVB,4078,0
5,2B,410,DME,4029,KZN,2990,0
6,2B,410,DME,4029,NBC,6969,0
7,2B,410,DME,4029,TGK,\N,0
8,2B,410,DME,4029,UUA,6160,0
9,2B,410,EGO,6156,KGD,2952,0


In [30]:
AIRLINE_FILE = "airlines.dat"

airline_df = pd.read_csv(AIRLINE_FILE)


In [34]:
airline_df.head(50)


Unnamed: 0,-1,Unknown,\N,-,N/A,\N.1,\N.2,Y
0,1,Private flight,\N,-,,,,Y
1,2,135 Airways,\N,,GNL,GENERAL,United States,N
2,3,1Time Airline,\N,1T,RNX,NEXTIME,South Africa,Y
3,4,2 Sqn No 1 Elementary Flying Training School,\N,,WYT,,United Kingdom,N
4,5,213 Flight Unit,\N,,TFU,,Russia,N
5,6,223 Flight Unit State Airline,\N,,CHD,CHKALOVSK-AVIA,Russia,N
6,7,224th Flight Unit,\N,,TTF,CARGO UNIT,Russia,N
7,8,247 Jet Ltd,\N,,TWF,CLOUD RUNNER,United Kingdom,N
8,9,3D Aviation,\N,,SEC,SECUREX,United States,N
9,10,40-Mile Air,\N,Q5,MLA,MILE-AIR,United States,Y
