Scatter Plot Distance-Population

In [1]:
import pandas as pd
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Load the population table and one city-pair sheet per analysis year
pop_filepath = '../Data/AirportCodes_Cities_NUTS_Population.xlsx'
citypairs_filepath = '../Data/CITIES_FINAL.xlsx'

# The first column of the population workbook becomes the index, named 'City_Index'
pop = pd.read_excel(pop_filepath)
pop = pop.rename(columns={pop.columns[0]: 'City_Index'})
pop = pop.set_index('City_Index')

# The city-pair workbook holds one sheet per year, named after the year
years = [2016, 2017, 2018, 2019]
citypairs = [
    pd.read_excel(citypairs_filepath, sheet_name=str(year))
    for year in years
]

In [3]:
# Merge population data with city pairs
#
# For every year, look up the population of the NUTS-3 region of both cities of
# each pair, derive the combined population and the distance in km, and collect
# the resulting tables in `dp_data` (same order as `years`). The tables are also
# exported to one Excel workbook with a sheet per year.
pair_columns = ['City_A_Name', 'City_B_Name', 'NUTS_3_code_A', 'NUTS_3_code_B', 'Distance']
dp_data = []

for year, pairs_raw in zip(years, citypairs):
    year_col = str(year)

    # Population lookup for this year. Identical (code, population) rows are
    # collapsed up front so that a NUTS-3 code appearing more than once in `pop`
    # does not multiply the city-pair rows in the merges below.
    pop_lookup = pop[['NUTS_3_code', year_col]].drop_duplicates()

    # Work on an explicit copy so the per-year source table is never modified
    df = pairs_raw[pair_columns].copy()
    df.insert(loc=2, column='City_Pair_Name',
              value=df['City_A_Name'] + ' - ' + df['City_B_Name'])

    # Attach the population of city A and city B. Renaming the lookup to the
    # target column names avoids the leftover 'NUTS_3_code_x'/'_y' helper columns.
    for side in ('A', 'B'):
        side_lookup = pop_lookup.rename(columns={
            'NUTS_3_code': f'NUTS_3_code_{side}',
            year_col: f'Population_{side}',
        })
        df = df.merge(side_lookup, on=f'NUTS_3_code_{side}', how='left')

    df = df.drop_duplicates()

    # Keep 'Combined_Pop' directly in front of 'Population_A' in the exported sheet.
    # Pairs without a population match on either side end up with NaN here.
    df.insert(loc=df.columns.get_loc('Population_A'), column='Combined_Pop',
              value=df['Population_A'] + df['Population_B'])
    df['Distance_km'] = df['Distance'] / 1000  # assumes 'Distance' is in metres — TODO confirm
    df['Year'] = year

    dp_data.append(df)

filepath = '../Data/city_pairs_population.xlsx'

with pd.ExcelWriter(filepath) as writer:
    for y, pairs in zip(years, dp_data):
        pairs.to_excel(writer, sheet_name=str(y), index=False)


In [None]:
# Build one distance-vs-population scatter figure per year
# Red dashed guide lines are drawn at these distances, with the given captions
distance_markers = ((150, "150 km"), (1200, "1200 km"))

figures = []

for year, year_df in zip(years, dp_data):
    axis_labels = {
        'Distance_km': 'Distance (km)',
        'Combined_Pop': f'Combined Population ({year})',
    }

    # One colour (and therefore one trace) per city pair; the legend is hidden below
    yearly_fig = px.scatter(
        year_df,
        x='Distance_km',
        y='Combined_Pop',
        color='City_Pair_Name',
        hover_name='City_Pair_Name',
        labels=axis_labels,
        title=f'Distance vs Combined Population ({year})',
    )
    yearly_fig.update_layout(showlegend=False)

    for x_km, caption in distance_markers:
        yearly_fig.add_vline(x=x_km, line_dash="dash", line_color="red",
                             annotation_text=caption, annotation_position="top")

    figures.append(yearly_fig)

In [None]:
# Show figures
#
# Stack the per-year scatter plots from `figures` into a single figure with one
# subplot row per year and display it.

n_years = len(years)

# Create subplots: one row per year, a single column
fig = make_subplots(rows=n_years, cols=1,
                    subplot_titles=[f"Scatter Plot for {year}" for year in years])

# Define the vertical lines' x-coordinates (km)
vertical_lines_x = [150, 1200]

# Add each figure to the corresponding subplot
for row, scatter_fig in enumerate(figures, start=1):
    # Copy the scatter traces of this year's figure into its row
    for trace in scatter_fig.data:
        fig.add_trace(
            go.Scatter(
                x=trace.x,
                y=trace.y,
                mode='markers',
                name=trace.name,
                # Carry over the hover label of the source trace so the city
                # pair is still identifiable when hovering in the combined figure
                hovertext=trace.hovertext,
            ),
            row=row, col=1  # Position each scatter plot in a new row
        )

    # Add vertical lines to the corresponding subplot
    for x in vertical_lines_x:
        fig.add_vline(x=x, line=dict(color='red', dash='dash'), row=row, col=1)

# Define a height for the overall figure
total_height = 1800

# Update layout for the entire figure
fig.update_layout(
    title="Distance vs Combined Population Over the Years",
    height=total_height,
    showlegend=False
)

# Label the axes of every subplot (without row/col the update applies to all of them)
fig.update_xaxes(title_text="Distance (km)")
fig.update_yaxes(title_text="Combined Population")

# Show the combined figure with subplots
fig.show()

In [6]:
# Build the yearly top-10 ranking of city pairs by combined population

# Only pairs whose distance lies inside this range (km, bounds included) are ranked
min_distance = 150
max_distance = 1200

# One table per year, in the same order as `years`: keep the pairs inside the
# distance range, order them by combined population (largest first), take the
# first ten rows and stamp them with the year
top_city_pairs = [
    year_df.loc[year_df['Distance_km'].between(min_distance, max_distance)]
    .sort_values(by='Combined_Pop', ascending=False)
    .head(10)
    .assign(Year=year)
    for year, year_df in zip(years, dp_data)
]

# Export the rankings, one sheet per year
filepath = '../Data/top_city_pairs_population.xlsx'

with pd.ExcelWriter(filepath) as writer:
    for sheet_year, ranking in zip(years, top_city_pairs):
        ranking.to_excel(writer, sheet_name=str(sheet_year), index=False)


In [None]:
# Horizontal bar chart of the top-10 city pairs, one figure per year

for year, ranking in zip(years, top_city_pairs):
    # Ascending order so the rows are passed to the chart smallest-first
    ranking_sorted = ranking.sort_values(by='Combined_Pop', ascending=True)

    fig = px.bar(
        ranking_sorted,
        x='Combined_Pop',
        y='City_Pair_Name',
        color='City_Pair_Name',
        orientation='h',
        labels={'Combined_Pop': 'Combined Population', 'City_Pair_Name': 'City Pair'},
        title=f'Top 10 City Pairs by Population - {year}',
    )
    fig.show()

In [None]:
# Bar graph with slider
#
# Animated horizontal bar chart of the yearly top-10 rankings; the slider /
# play button steps through the 'Year' column.

# Combine all years' top city pairs data into a single DataFrame
all_years_data = pd.concat(top_city_pairs, ignore_index=True)

# Fix the x-axis over all frames. Without an explicit range the axis is scaled to
# the first frame only, so longer bars in later years could be cut off.
max_combined_pop = all_years_data['Combined_Pop'].max()
bar_x_range = [0, max_combined_pop * 1.05] if pd.notna(max_combined_pop) else None

# NOTE(review): Plotly animations are built on the traces of the first frame; city
# pairs that enter the top 10 only in a later year may not render as expected —
# verify visually.

# Create the animated bar chart with a slider for years
fig = px.bar(
    all_years_data,
    x='Combined_Pop',
    y='City_Pair_Name',
    color='City_Pair_Name',
    orientation='h',
    animation_frame='Year',
    range_x=bar_x_range,
    title='Top 10 City Pairs by Population Over Years',
    labels={'Combined_Pop': 'Combined Population', 'City_Pair_Name': 'City Pair'}
)

# Adjust layout for better spacing and visibility
fig.update_layout(
    xaxis_title='Combined Population',
    yaxis_title='City Pair',
    transition={'duration': 500},
    width=800,
    height=600
)

# Show the figure
fig.show()

In [None]:
# Scatter plot of all city pairs with a slider that switches between years

dp_concat = pd.concat(dp_data, ignore_index=True)

# Axis limits span the data of all years so the view does not change with the slider
x_range = [dp_concat['Distance_km'].min(), dp_concat['Distance_km'].max()]
y_range = [dp_concat['Combined_Pop'].min(), dp_concat['Combined_Pop'].max()]

# Give every city pair a colour from the qualitative Plotly palette. The palette
# is cycled, so colours repeat once there are more pairs than palette entries.
palette = px.colors.qualitative.Plotly
city_pairs = dp_concat['City_Pair_Name'].unique()
color_map = {pair: palette[pos % len(palette)] for pos, pair in enumerate(city_pairs)}
dp_concat['Color'] = dp_concat['City_Pair_Name'].map(color_map)

slider_years = dp_concat['Year'].unique()

# One hidden trace per year; the slider decides which one is shown
fig = go.Figure()

for year in slider_years:
    year_rows = dp_concat[dp_concat['Year'] == year]

    # Hover label: pair name, distance and combined population on separate lines
    hover_text = (
        "City Pair: " + year_rows['City_Pair_Name'] + "<br>" +
        "Distance: " + year_rows['Distance_km'].astype(str) + " km<br>" +
        "Combined Population: " + year_rows['Combined_Pop'].astype(str)
    )

    year_trace = go.Scatter(
        x=year_rows['Distance_km'],
        y=year_rows['Combined_Pop'],
        mode='markers',
        name=str(year),
        visible=False,
        marker=dict(color=year_rows['Color']),
        hovertext=hover_text,
        hoverinfo="text",
    )
    fig.add_trace(year_trace)

# Start with the first year's trace switched on
fig.data[0].visible = True

# One slider step per year: show exactly the trace at the same position
n_traces = len(fig.data)
steps = [
    dict(
        method="update",
        label=str(year),
        args=[{"visible": [pos == shown for pos in range(n_traces)]}],
    )
    for shown, year in enumerate(slider_years)
]

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Year: "},
    pad={"t": 50},
    steps=steps
)]

# Attach the slider and apply the fixed axis ranges and the constant title
fig.update_layout(
    sliders=sliders,
    title="Combined Population vs Distance of City Pairs (2016-2019)",
    xaxis=dict(title="Distance (km)", range=x_range),
    yaxis=dict(title="Combined Population", range=y_range)
)

fig.show()
