In [19]:
import pandas as pd

In [20]:
# Step 1: Load raw data and concatenate
# Each file is read once and only the columns used later are kept. The
# per-file slices are collected in a list and concatenated a single time;
# growing a DataFrame with pd.concat inside the loop re-copies all rows
# accumulated so far on every iteration.
raw_data_list = []

files = [
    "../Data/Airport_pas_datasets/avia_par_de.csv",
    "../Data/Airport_pas_datasets/avia_par_nl.csv",
    "../Data/Airport_pas_datasets/avia_par_lu.csv",
    "../Data/Airport_pas_datasets/avia_par_be.csv",
    "../Data/Airport_pas_datasets/avia_par_fr.csv",
    "../Data/Airport_pas_datasets/avia_par_es.csv"
]

for file in files:
    temp = pd.read_csv(file, encoding='Windows-1252')
    raw_data_list.append(temp[['airp_pr', 'TIME_PERIOD', 'OBS_VALUE']])

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data = pd.concat(raw_data_list, ignore_index=True)

data.head(3)

Unnamed: 0,airp_pr,TIME_PERIOD,OBS_VALUE
0,DE_EDDB_BE_EBBR,2016,428055
1,DE_EDDB_BE_EBBR,2017,302160
2,DE_EDDB_BE_EBBR,2018,160682


In [21]:
# Step 2: Extract Origin and Destination columns (by airport code)
# An airp_pr value such as "DE_EDDB_BE_EBBR" is split on "_"; the second
# piece is stored as the origin airport code and the fourth as the
# destination airport code. The split is vectorised, so it does not depend
# on the index being 0..n-1 and avoids one .loc write per row.
airport_parts = data["airp_pr"].str.split("_")
data["Origin"] = airport_parts.str[1]
data["Destination"] = airport_parts.str[3]

In [22]:
# Step 3: Group data by year
# One sub-frame per reporting year, pulled from a single groupby object.
grouped_years = data.groupby("TIME_PERIOD")
(
    grouped_years_2016,
    grouped_years_2017,
    grouped_years_2018,
    grouped_years_2019,
) = (grouped_years.get_group(year) for year in (2016, 2017, 2018, 2019))

In [23]:
# Step 4: Import the airport codes
# Lookup table whose "Airport_Code" and "City" columns are used below to
# translate airport codes into city names.
file_codes = "../Data/Airport_codes_cities.csv"
codes = pd.read_csv(
    filepath_or_buffer=file_codes,
    encoding="Windows-1252",
)

# Step 5: Function to get the city name from the airport code
def get_city(airport_code):
    """Look up the city for an airport code in the module-level ``codes`` table.

    Parameters
    ----------
    airport_code : value compared against the ``Airport_Code`` column.

    Returns
    -------
    The ``City`` value of the first matching row, or ``None`` when no row
    of ``codes`` has that airport code.
    """
    cities = codes.loc[codes["Airport_Code"] == airport_code, "City"]
    if cities.empty:
        return None
    return cities.values[0]

# Step 6: Add city names to the data (for both origin and destination airports)
def add_city_names(dataframe):
    """Return a copy of ``dataframe`` with ``Origin_City`` and ``Destination_City`` added.

    The airport codes in the ``Origin`` and ``Destination`` columns are
    translated through the module-level ``codes`` table (``Airport_Code`` ->
    ``City``). The lookup is built once and applied with ``Series.map``
    instead of filtering the whole ``codes`` table for every row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``Origin`` and ``Destination`` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two city columns. Codes that are absent from
        ``codes`` get a missing value (NaN), which ``dropna`` treats as
        missing.
    """
    city_by_code = (
        codes.dropna(subset=["Airport_Code"])
        # Keep the first row per code so a duplicated code resolves to the
        # first listed city.
        .drop_duplicates(subset="Airport_Code", keep="first")
        .set_index("Airport_Code")["City"]
    )
    # assign() builds a new frame, so the caller's data is left untouched.
    return dataframe.assign(
        Origin_City=dataframe["Origin"].map(city_by_code),
        Destination_City=dataframe["Destination"].map(city_by_code),
    )

# Add city names for each year's data
(
    grouped_years_2016,
    grouped_years_2017,
    grouped_years_2018,
    grouped_years_2019,
) = map(
    add_city_names,
    (grouped_years_2016, grouped_years_2017, grouped_years_2018, grouped_years_2019),
)

In [24]:
# Step 7: Function to calculate total passengers between city pairs
def passenger_data_by_city(dataset):
    """Sum passengers per undirected city pair and airport pair.

    Every route is put into a canonical orientation so that A -> B and
    B -> A land in the same group: ``City_A`` is the alphabetically smaller
    city and ``Airport_A`` is the airport belonging to it. When both airports
    are in the same city, the airport codes themselves decide the order, so
    the two directions of such a route are merged as well.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns ``Origin``, ``Destination``, ``Origin_City``,
        ``Destination_City`` and ``OBS_VALUE``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``City_A``, ``City_B``, ``Airport_A``, ``Airport_B`` and
        ``Total passengers`` (the sum of ``OBS_VALUE`` per group).
    """
    # Rows without a resolved city on either end cannot be assigned to a pair.
    routes = dataset.dropna(subset=['Origin_City', 'Destination_City'])

    origin_city = routes['Origin_City']
    destination_city = routes['Destination_City']
    origin = routes['Origin']
    destination = routes['Destination']

    # A route is flipped when its origin city sorts after its destination
    # city; for same-city routes the airport codes break the tie.
    flipped = (origin_city > destination_city) | (
        (origin_city == destination_city) & (origin > destination)
    )
    keep = ~flipped

    # Cities and airports are swapped with the same mask, so each airport
    # always stays attached to its own city.
    pairs = pd.DataFrame({
        'City_A': origin_city.where(keep, destination_city),
        'City_B': destination_city.where(keep, origin_city),
        'Airport_A': origin.where(keep, destination),
        'Airport_B': destination.where(keep, origin),
        'OBS_VALUE': routes['OBS_VALUE'],
    })

    # Group by city pairs and airport codes, and sum the passengers
    citypairs = pairs.groupby(
        ['City_A', 'City_B', 'Airport_A', 'Airport_B'], as_index=False
    ).agg({'OBS_VALUE': 'sum'})

    return citypairs.rename(columns={'OBS_VALUE': 'Total passengers'})

# Step 8: Get city pair data for each year
(
    citypairs_2016,
    citypairs_2017,
    citypairs_2018,
    citypairs_2019,
) = map(
    passenger_data_by_city,
    (grouped_years_2016, grouped_years_2017, grouped_years_2018, grouped_years_2019),
)

In [25]:
# Step 9: Sort city pairs by total passengers
def sort_cities(citypairs):
    """Return ``citypairs`` ordered from the highest to the lowest passenger total.

    Parameters
    ----------
    citypairs : pandas.DataFrame
        Must contain a ``Total passengers`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``Total passengers`` in descending order; the
        original index labels are kept.
    """
    ranked = citypairs.sort_values("Total passengers", ascending=False)
    return ranked

# Rank each year's city pairs from busiest to quietest.
(
    citypairs_sorted_2016,
    citypairs_sorted_2017,
    citypairs_sorted_2018,
    citypairs_sorted_2019,
) = map(
    sort_cities,
    (citypairs_2016, citypairs_2017, citypairs_2018, citypairs_2019),
)

In [28]:
# Step 10: Export results to Excel
# One workbook per year, written to the current working directory without
# the index column.
excel_exports = {
    "2016_cities.xlsx": citypairs_sorted_2016,
    "2017_cities.xlsx": citypairs_sorted_2017,
    "2018_cities.xlsx": citypairs_sorted_2018,
    "2019_cities.xlsx": citypairs_sorted_2019,
}
for excel_path, yearly_pairs in excel_exports.items():
    yearly_pairs.to_excel(excel_path, index=False)

ModuleNotFoundError: No module named 'openpyxl'