In [1]:
# Creating scatterplot graph comparing Flight Passengers between City Pairs vs. Distance

from pathlib import Path

import openpyxl  # Excel engine used by pandas.read_excel for .xlsx workbooks
import pandas as pd
import plotly.express as px

In [2]:
# Folder that holds the project workbook. Kept relative so the notebook runs on
# any machine; this assumes the notebook is started from the project folder that
# contains CITIES_FINAL.xlsx -- change DATA_DIR if the data lives elsewhere.
DATA_DIR = Path(".")
file_path = DATA_DIR / "CITIES_FINAL.xlsx"

# Quick preview of the workbook's default (first) sheet to check the column layout.
df = pd.read_excel(file_path)
df.head()

Unnamed: 0,City_A_Name,City_B_Name,City_A,City_B,Total passengers,NUTS_3_code_A,NUTS_3_code_B,Distance,has_connection
0,Paris,Toulouse,LFPO,LFBO,4716804,FR107,FRJ23,610672.306409,0
1,Barcelona,Madrid,LEBL,LEMD,4657590,ES511,ES300,496744.78662,1
2,Nice,Paris,LFMN,LFPO,4249404,FRL03,FR107,645376.754517,0
3,Berlin,Munich,EDDT,EDDM,3878625,DE300,DE212,501031.167189,0
4,Berlin,Frankfurt,EDDT,EDDF,3869795,DE300,DE712,423686.712363,0


In [50]:
# Build one tidy DataFrame (`df_all`) holding the busiest city pairs of every year.

# Worksheets to load; each sheet name is also used as the value of the 'Year' column.
sheets = ['2016', '2017', '2018', '2019']

# Number of highest-volume city pairs kept per year.
# NOTE(review): the previous comment said "Top 50" while the code kept 100 rows.
# The executed value (100) is preserved here -- confirm which one was intended.
TOP_N_PER_YEAR = 100

# Columns needed downstream for the plot axes, colouring and hover text.
PLOT_COLUMNS = [
    'Total passengers',
    'NUTS_3_code_A',
    'NUTS_3_code_B',
    'Distance',
    'Year',
    'City_A_Name',
    'City_B_Name',
    'has_connection',
]

# Read every yearly sheet and tag its rows with the year. Using assign() inside a
# comprehension avoids overwriting the `df` preview frame created in an earlier cell.
data_frames = [
    pd.read_excel(file_path, sheet_name=sheet).assign(Year=sheet)
    for sheet in sheets
]

# Concatenate all yearly data into a single DataFrame (done once).
df_all = pd.concat(data_frames, ignore_index=True)

# Convert Distance from meters to kilometers
df_all['Distance'] = df_all['Distance'] / 1000

# Keep only the relevant columns. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
df_all = df_all[PLOT_COLUMNS].copy()

# Combine the two city names into a single label used as the hover name
df_all['City_Pair'] = df_all['City_A_Name'] + " & " + df_all['City_B_Name']

# Convert 'has_connection' to strings so Plotly Express treats it as a discrete
# (binary) category rather than a continuous numeric colour scale
df_all['has_connection'] = df_all['has_connection'].astype(str)

# Keep only the TOP_N_PER_YEAR city pairs with the highest passenger volume in each year
df_all = (
    df_all
    .sort_values(by='Total passengers', ascending=False)
    .groupby('Year')
    .head(TOP_N_PER_YEAR)
)

# Order rows by year (ascending) so the animation slider is laid out chronologically,
# and by passenger volume (descending) within each year
df_all = df_all.sort_values(by=['Year', 'Total passengers'], ascending=[True, False])

# Animated scatter plot: passenger volume vs. distance, one animation frame per year,
# coloured by whether the city pair is flagged in 'has_connection'.

# Colours for the two 'has_connection' categories (the column holds the strings '0'/'1').
# NOTE(review): if the red ('1') points still do not show up, check that the column
# values are exactly '0'/'1' (e.g. not '1.0') and that '1' rows are present in the
# first year's frame -- Plotly Express animations assume the categories mapped to
# colour are the same in every frame.
color_map = {'0': 'blue', '1': '#F31F1F'}  # Medium-dark red for 1

# Human-readable legend labels for the raw category values
legend_labels = {"0": "No pre-existing connection", "1": "Already connected"}

# Custom hover text. This replaces the template Plotly Express generates, so only
# the city pair, distance and passenger count are shown on hover.
hover_template = "<br>".join([
    "City Pair: %{hovertext}",
    "Distance: %{x} km",
    "Passengers: %{y}"
])

# Create a scatter plot using Plotly Express
fig = px.scatter(
    df_all,
    x='Distance',
    y='Total passengers',
    color='has_connection',
    color_discrete_map=color_map,
    # Fix the category order so legend/trace order does not depend on row order
    category_orders={'has_connection': ['0', '1']},
    animation_frame='Year',
    hover_name='City_Pair',
    hover_data={'NUTS_3_code_A': True, 'NUTS_3_code_B': True, 'Distance': True, 'Total passengers': True, 'Year': False, 'City_A_Name': False, 'City_B_Name': False},
    title="Air Passenger Volumes vs Distance Between City Pairs (2016-2019)",
    labels={"Distance": "Distance (km)", "Total passengers": "Total Yearly Passengers"},
)


def _apply_trace_style(trace):
    """Set the custom hover template and the readable legend label on one trace."""
    trace.update(
        hovertemplate=hover_template,
        # Unknown names are left unchanged instead of being labelled "Already connected"
        name=legend_labels.get(trace.name, trace.name),
    )


# Style the traces of the initially displayed figure ...
fig.for_each_trace(_apply_trace_style)

# ... and the traces stored in every animation frame. Frames carry their own copies
# of the traces, so without this the hover text and legend names would revert to the
# Plotly Express defaults as soon as the slider moves or the animation plays.
for frame in fig.frames:
    for frame_trace in frame.data:
        _apply_trace_style(frame_trace)

fig.update_layout(legend_title_text='HSR Connection Status')

fig.show()

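## Note: Does Paris to Toulouse have a TGV connection? The pair is listed with has_connection = 0 in the data (see the preview table above) -- TODO: verify.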