In [24]:
import pandas as pd
import networkx as nx
#import folium
import matplotlib.pyplot as plt


In [25]:
AIRPORTS_FILE = "airports.dat"
ROUTES_FILE = "routes.dat"
AIRLINE_FILE = "airlines.dat"

cols_airlines = ['ID', 'Name', 'Alias', 'IATA', 'ICAO', 'Callsign', 'Country', 'Active']

airline_df = pd.read_csv(AIRLINE_FILE, header=None, names=cols_airlines)

# Load airports dataset
cols_airports = ["Airport ID", "Airport Name", "City", "IATA", "ICAO", "Latitude", "Longitude", "Altitude", 
                 "Timezone", "DST", "TzDatabaseTime", "Type", "Source"]

airports_df = pd.read_csv(AIRPORTS_FILE, header=None, names=cols_airports)

# Load routes dataset
cols_routes = ["Airline", "AirlineID", "SourceAirport", "SourceAirportID", 
               "DestinationAirport", "DestinationAirportID", "Codeshare", "Stops", "Equipment"]

routes_df = pd.read_csv(ROUTES_FILE, header=None, names=cols_routes)

# Display the first few rows
display(airports_df.head(), routes_df.head(), airline_df.head())


Unnamed: 0,Airport ID,Airport Name,City,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,TzDatabaseTime,Type,Source
1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


Unnamed: 0,Airline,AirlineID,SourceAirport,SourceAirportID,DestinationAirport,DestinationAirportID,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


Unnamed: 0,ID,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,-1,Unknown,\N,-,,\N,\N,Y
1,1,Private flight,\N,-,,,,Y
2,2,135 Airways,\N,,GNL,GENERAL,United States,N
3,3,1Time Airline,\N,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,\N,,WYT,,United Kingdom,N


In [26]:
print("Missing values in airports dataset:\n", airports_df.isnull().sum())
print("\nMissing values in routes dataset:\n", routes_df.isnull().sum())
print("\nMissing values in routes dataset:\n", airline_df.isnull().sum())

Missing values in airports dataset:
 Airport ID         0
Airport Name      49
City               0
IATA               0
ICAO               0
Latitude           0
Longitude          0
Altitude           0
Timezone           0
DST                0
TzDatabaseTime     0
Type               0
Source             0
dtype: int64

Missing values in routes dataset:
 Airline                     0
AirlineID                   0
SourceAirport               0
SourceAirportID             0
DestinationAirport          0
DestinationAirportID        0
Codeshare               53066
Stops                       0
Equipment                  18
dtype: int64

Missing values in routes dataset:
 ID             0
Name           0
Alias        506
IATA        4627
ICAO          87
Callsign     808
Country       15
Active         0
dtype: int64


In [27]:
airports_df = airports_df.dropna(subset=["Airport Name"])

routes_df = routes_df.drop(columns=["Codeshare", "Equipment"])

airline_df = airline_df.drop(columns=["ID"])

print("Missing values in airports dataset:\n", airports_df.isnull().sum())
print("\nMissing values in routes dataset:\n", routes_df.isnull().sum())
print("\nMissing values in airline dataset:\n", airline_df.isnull().sum())



Missing values in airports dataset:
 Airport ID        0
Airport Name      0
City              0
IATA              0
ICAO              0
Latitude          0
Longitude         0
Altitude          0
Timezone          0
DST               0
TzDatabaseTime    0
Type              0
Source            0
dtype: int64

Missing values in routes dataset:
 Airline                 0
AirlineID               0
SourceAirport           0
SourceAirportID         0
DestinationAirport      0
DestinationAirportID    0
Stops                   0
dtype: int64

Missing values in airline dataset:
 Name           0
Alias        506
IATA        4627
ICAO          87
Callsign     808
Country       15
Active         0
dtype: int64


In [28]:
airline_df.head()

Unnamed: 0,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,Unknown,\N,-,,\N,\N,Y
1,Private flight,\N,-,,,,Y
2,135 Airways,\N,,GNL,GENERAL,United States,N
3,1Time Airline,\N,1T,RNX,NEXTIME,South Africa,Y
4,2 Sqn No 1 Elementary Flying Training School,\N,,WYT,,United Kingdom,N


In [29]:
# Airline Dataset Cleaning

# Drop the first row unconditionally
airline_df = airline_df.iloc[1:]
airline_df = airline_df.reset_index(drop=True)

# Replace empty strings with NaN
airline_df = airline_df.replace('', "NaN")
airline_df = airline_df.replace('\\N', "NaN")

# Standardize "N/A" to NaN
airline_df = airline_df.replace('N/A', "NaN")

# Handle "-" specifically in the IATA column, if it represents missing data
airline_df['IATA'] = airline_df['IATA'].replace('-', "NaN")  # Corrected: Applies only to IATA column

# Convert 'Active' to boolean (True/False) and handle NaNs
airline_df['Active'] = airline_df['Active'].map({'Y': True, 'N': False})
airline_df['Active'] = airline_df['Active'].fillna(False)

print("\nMissing values in airline dataset:\n", airline_df.isnull().sum())
airline_df.head()


Missing values in airline dataset:
 Name           0
Alias        506
IATA        4627
ICAO          86
Callsign     808
Country       15
Active         0
dtype: int64


  airline_df['Active'] = airline_df['Active'].fillna(False)


Unnamed: 0,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,Private flight,,,,,,True
1,135 Airways,,,GNL,GENERAL,United States,False
2,1Time Airline,,1T,RNX,NEXTIME,South Africa,True
3,2 Sqn No 1 Elementary Flying Training School,,,WYT,,United Kingdom,False
4,213 Flight Unit,,,TFU,,Russia,False


In [30]:
# Drop Airports that are not active anymore (reduntant airports)

# Drop rows where 'Active' is False
airline_df = airline_df[airline_df['Active'] != False]  # Keep only rows where 'Active' is NOT False

# Reset the index after dropping rows
airline_df = airline_df.reset_index(drop=True)

In [31]:
print("\nMissing values in airline dataset:\n", airline_df.isnull().sum())
airline_df.head()



Missing values in airline dataset:
 Name          0
Alias       393
IATA        240
ICAO         28
Callsign    323
Country       2
Active        0
dtype: int64


Unnamed: 0,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,Private flight,,,,,,True
1,1Time Airline,,1T,RNX,NEXTIME,South Africa,True
2,40-Mile Air,,Q5,MLA,MILE-AIR,United States,True
3,Ansett Australia,,AN,AAA,ANSETT,Australia,True
4,Abacus International,,1B,,,Singapore,True


In [32]:
# print("Total Airports:", len(airports_df))
# print("Total Routes:", len(routes_df))
# print("\nTop 10 busiest airports (by outgoing routes):")
# print(routes_df["SourceAirport"].value_counts().head(10))


In [33]:
# # Convert invalid latitude and longitude values to NaN
# airports_df["Latitude"] = pd.to_numeric(airports_df["Latitude"], errors="coerce")
# airports_df["Longitude"] = pd.to_numeric(airports_df["Longitude"], errors="coerce")

# # Drop rows where Latitude or Longitude is NaN
# airports_df = airports_df.dropna(subset=["Latitude", "Longitude"])

# print(f"Cleaned dataset: {len(airports_df)} airports remaining.")


In [34]:
# def build_flight_graph(routes_df):
#     """Builds a directed graph of the flight network."""
#     G = nx.DiGraph()
#     for _, row in routes_df.iterrows():
#         G.add_edge(row["SourceAirport"], row["DestinationAirport"])
#     return G

# flight_graph = build_flight_graph(routes_df)

# # Find the most connected airports (hubs)
# degree_centrality = nx.degree_centrality(flight_graph)
# top_hubs = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
# print("\nTop 10 hubs (most connected airports):")
# for airport, centrality in top_hubs:
#     print(f"Airport {airport}: {centrality:.4f}")


In [35]:
# def find_shortest_route(graph, source, destination):
#     """Finds the shortest path between two airports."""
#     try:
#         return nx.shortest_path(graph, source=str(source), target=str(destination))
#     except nx.NetworkXNoPath:
#         return None
#     except nx.NodeNotFound:
#         return None

# source_airport = "507"  # Example: ATL (Atlanta)
# destination_airport = "3797"  # Example: LAX (Los Angeles)

# shortest_route = find_shortest_route(flight_graph, source_airport, destination_airport)
# print("Shortest route:", shortest_route)


In [36]:
# # Airports that have outbound flights but no inbound flights
# outbound_airports = set(routes_df["SourceAirport"])
# inbound_airports = set(routes_df["DestinationAirport"])
# airports_with_no_incoming = outbound_airports - inbound_airports
# airports_with_no_outgoing = inbound_airports - outbound_airports

# print("Airports with only outbound flights:", len(airports_with_no_incoming))
# print("Airports with only inbound flights:", len(airports_with_no_outgoing))


In [37]:
# from networkx.algorithms.community import greedy_modularity_communities

# # Detect communities in the flight network
# communities = greedy_modularity_communities(flight_graph)

# print(f"Detected {len(communities)} airport clusters.")
# print("Example of first community (first 10 airports):", list(communities[0])[:10])


In [38]:
# display(airports_df.head(50), routes_df.head(50))
