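This notebook preprocesses the distance data between NUTS 3 regions so that it is ready for analysis in our model: the distance matrix is reduced to the regions linked to our airports, the distances are added to the origin-destination (OD) data, and city pairs with an existing railway connection are flagged.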

In [1]:
import pandas as pd

# Load the matrix holding the distance in meters between every pair of NUTS 3 regions
DISTANCE_SOURCE = "Distance_data_mod.xlsx"
distances = pd.read_excel(DISTANCE_SOURCE)
distances.head()

Unnamed: 0,NUTS_3_code,NUTS_3_name,Level_code,Country_code,FRK23,FRK24,AT313,FI200,FR102,EL611,...,UKL21,UKL22,UKL23,UKL24,SE125,SE211,SE212,SE332,SI031,SI032
0,BE334,Arr. Waremme,3,BE,664853.301476,601083.235441,707639.457403,1407366.0,279408.791725,1787538.0,...,576278.135514,603574.939872,629562.45448,624095.419127,1222729.0,969351.4,917127.936158,1989939.0,919276.036951,903881.092166
1,BE335,Arr. Verviers - communes francophones,3,BE,649008.475436,582750.420169,654450.292995,1394150.0,299330.374136,1736905.0,...,629381.776463,656372.395362,682249.29665,677360.923825,1214264.0,957196.0,901528.127301,1991252.0,866213.518475,851055.599751
2,BE336,Bezirk Verviers - Deutschsprachige Gemeinschaft,3,BE,633229.956099,566050.712775,630854.232115,1398653.0,302374.692927,1710628.0,...,652976.080354,679329.572696,707793.154298,701874.417231,1221222.0,962561.0,905210.09687,2002439.0,841108.719594,825630.871077
3,BE341,Arr. Arlon,3,BE,557380.107676,491315.393987,643459.035309,1476048.0,237793.752546,1689355.0,...,651288.816319,673433.528068,722011.111482,706428.023493,1299638.0,1040454.0,982257.593141,2080974.0,840254.131376,821429.448546
4,BE342,Arr. Bastogne,3,BE,602051.264178,536239.090918,654189.187836,1439286.0,260377.819972,1718031.0,...,632287.558641,656724.947879,695169.492919,684292.156195,1260627.0,1002781.0,946135.641189,2038812.0,858485.117134,841360.038984


In [2]:
# Load the lookup table that links each airport code to its NUTS 3 code
AIRPORT_SOURCE = "Airport_codes_cities.xlsx"
df_two = pd.read_excel(AIRPORT_SOURCE)
df_two.head()

Unnamed: 0,Airport Code,NUTS_3_code,City,Country
0,EBAW,BE211,Antwerp,Belgium
1,EBBR,BE100,Brussels,Belgium
2,EBCI,BE322,Charleroi,Belgium
3,EBLG,BE332,Liege,Belgium
4,EBOS,BE255,Ostend,Belgium


In [5]:
# Keep only the rows whose NUTS 3 code is linked to one of the airports in df_two
distances = distances[distances['NUTS_3_code'].isin(df_two['NUTS_3_code'])]

# Keep only the distance columns whose header is the code of one of the remaining rows;
# every other region is outside the scope of our research.
all_codes = distances["NUTS_3_code"].astype(str).unique()
current_codes = distances.columns.astype(str)
codes_in_scope = set(all_codes)  # set lookup instead of scanning the array once per column
codes_to_keep = [code for code in current_codes if code in codes_in_scope]
codes_to_keep = ["NUTS_3_code", "NUTS_3_name", "Country_code"] + codes_to_keep
distances_updated = distances[codes_to_keep]

# Index the rows by NUTS 3 code so a distance can be read as dis.at[code_a, code_b]
dis = distances_updated.set_index('NUTS_3_code')

# Export the file to an excel document
dis.to_excel('distance_data_processed.xlsx', index=True)

# Preview the result; only the last expression of a cell is displayed, so this goes last
dis.head()

In [6]:
# Now create a function that makes it easy to access the distance between two places using this new file
def get_distance(nuts_code_1, nuts_code_2, df):
    """Look up the distance between two NUTS 3 regions in a distance matrix.

    Parameters
    ----------
    nuts_code_1 : str
        NUTS 3 code of the first region; selects the row of ``df``.
    nuts_code_2 : str
        NUTS 3 code of the second region; selects the column of ``df``.
    df : pandas.DataFrame
        Distance matrix indexed by NUTS 3 code, with one column per NUTS 3 code.

    Returns
    -------
    The value stored at (nuts_code_1, nuts_code_2), or a message string naming
    the code that could not be found in ``df``.
    """
    # Return an error message when one of the NUTS codes is not in the dataset
    if nuts_code_1 not in df.index:
        return f"NUTS code '{nuts_code_1}' not found in the data."
    # The second code is used as a column label below, so it must be checked against
    # the columns: a code can exist as a row without having a column of its own.
    if nuts_code_2 not in df.columns:
        return f"NUTS code '{nuts_code_2}' not found in the data."

    return df.at[nuts_code_1, nuts_code_2]

In [23]:
# Use the OD data to add the column 'Distance'
# NOTE: This cell needs to be executed for each data file: 2016, 2017, 2018, 2019.
# Change OD_FILE here and the name of the exported file at the end of the notebook accordingly.
OD_FILE = "2019_cities.xlsx"
df = pd.read_excel(OD_FILE)


def _attach_nuts_code(od, city_col, new_col):
    """Return `od` with the NUTS 3 code of the airport in `city_col` added as column `new_col`.

    Rows whose airport code has no match in df_two are kept (left merge) and get a
    missing value in `new_col`. `validate='many_to_one'` makes pandas raise a
    MergeError when an airport code occurs more than once in df_two, instead of
    silently duplicating OD rows.
    """
    merged = od.merge(
        df_two[['Airport Code', 'NUTS_3_code']],
        how='left',
        left_on=city_col,
        right_on='Airport Code',
        validate='many_to_one',
    )
    return merged.drop(['Airport Code'], axis=1).rename(columns={'NUTS_3_code': new_col})


# Merge the files to add the NUTS 3 code of each region to the OD dataset
new_df = _attach_nuts_code(df, 'City_A', 'NUTS_3_code_A')
new_df = _attach_nuts_code(new_df, 'City_B', 'NUTS_3_code_B')

# Then using the NUTS 3 data and the above defined function add the distance between the two regions
# as a new column. For codes that are missing from the distance data, get_distance returns a
# "not found" message instead of a number, so the column can contain text for those rows.
new_df['Distance'] = new_df.apply(
    lambda row: get_distance(row['NUTS_3_code_A'], row['NUTS_3_code_B'], dis),
    axis=1,
)

# Preview the result; only the last expression of a cell is displayed
new_df.head()

In [24]:
# Additionally, to add the already existing railways we read the railways map
railways_df = pd.read_excel("railways.xlsx")
# NOTE: this preview is not the last expression of the cell, so it is not displayed
railways_df.head()

# First group the railway data by column and make a set out of it:
#   .iloc[1:]          skips the first data row (presumably a label row rather than city names - TODO confirm)
#   .stack()           flattens the table into one long Series indexed by (row, column)
#   .groupby(level=1)  groups the stacked values by their column label
#   .apply(set)        collects the values of each column into a set
# The result is a Series holding one set of values per column of the railways sheet.
railways_cities_by_column = railways_df.iloc[1:].stack().groupby(level=1).apply(set)  

def check_connection_by_column(City_A, City_B):
    """Return 1 if both cities occur together in at least one railway column group, otherwise 0."""
    # Each element of railways_cities_by_column is the set of values found in one column
    return int(any(
        City_A in group and City_B in group
        for group in railways_cities_by_column
    ))

# Flag each OD pair with 1 when both city names occur in the same railway column group, else 0.
# Iterating over the two name columns directly avoids building a Series for every row and
# also works when new_df has no rows.
new_df['has_connection'] = [
    check_connection_by_column(city_a, city_b)
    for city_a, city_b in zip(new_df['City_A_Name'], new_df['City_B_Name'])
]

# Export the enriched OD data. The file name belongs to the 2019 input; change it when
# processing another year.
new_df.to_excel('2019_cities_final.xlsx', index=False)

# Preview the result; only the last expression of a cell is displayed, so this goes last
new_df.head()