In [1]:
# -----------------------------
# Data manipulation
# -----------------------------
import pandas as pd
import numpy as np
import json
import ast

# -----------------------------
# Plotting
# -----------------------------
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from io import StringIO

## Data Cleaning

In [2]:
# URL for rank_by_year.csv
url_rank = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-09-09/rank_by_year.csv"

# Fetch CSV via requests (disable SSL verification temporarily)
response = requests.get(url_rank, verify=False)
csv_data = StringIO(response.text)

# Read into pandas DataFrame
rank_by_year = pd.read_csv(csv_data)

# Quick check
print("Rank by Year:")
print(rank_by_year)



Rank by Year:
     code                country       region  rank  visa_free_count  year
0      AF            Afghanistan         ASIA   116               26  2021
1      AF            Afghanistan         ASIA   106               26  2020
2      AF            Afghanistan         ASIA   106               30  2018
3      AF            Afghanistan         ASIA   104               24  2017
4      AF            Afghanistan         ASIA   104               25  2016
...   ...                    ...          ...   ...              ...   ...
3945   PS  Palestinian Territory  MIDDLE EAST   102               37  2019
3946   PS  Palestinian Territory  MIDDLE EAST   105               37  2022
3947   PS  Palestinian Territory  MIDDLE EAST   103               38  2023
3948   PS  Palestinian Territory  MIDDLE EAST    98               40  2024
3949   PS  Palestinian Territory  MIDDLE EAST    93               39  2025

[3950 rows x 6 columns]


In [3]:

# --- Count NaN (missing) values per column ---
print("Missing (NaN) values per column:")
print(rank_by_year.isna().sum())

# --- Count total NaN values ---
total_missing = rank_by_year.isna().sum().sum()
print(f"\nTotal missing (NaN) values in the DataFrame: {total_missing}")

# --- Count zero values per column ---
print("\nZero values per column:")
print((rank_by_year == 0).sum())

# --- Count total zero values ---
total_zeros = (rank_by_year == 0).sum().sum()
print(f"\nTotal zero values in the DataFrame: {total_zeros}")


Missing (NaN) values per column:
code               20
country             0
region              0
rank                0
visa_free_count     0
year                0
dtype: int64

Total missing (NaN) values in the DataFrame: 20

Zero values per column:
code                 0
country              0
region               0
rank                 0
visa_free_count    447
year                 0
dtype: int64

Total zero values in the DataFrame: 447


In [4]:

# Replace all 0s with NaN so we can drop them together
rank_by_year = rank_by_year.replace(0, np.nan)

# Drop all rows that have any NaN (including those converted from 0)
rank_by_year = rank_by_year.dropna()

# Confirm cleanup
print(f"Remaining NaN values: {rank_by_year .isna().sum().sum()}")
print(f"Remaining zero values: {(rank_by_year == 0).sum().sum()}")
print(rank_by_year.head())



Remaining NaN values: 0
Remaining zero values: 0
  code      country region  rank  visa_free_count  year
0   AF  Afghanistan   ASIA   116             26.0  2021
1   AF  Afghanistan   ASIA   106             26.0  2020
2   AF  Afghanistan   ASIA   106             30.0  2018
3   AF  Afghanistan   ASIA   104             24.0  2017
4   AF  Afghanistan   ASIA   104             25.0  2016


In [5]:
# URL for country_lists.csv
url_country = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-09-09/country_lists.csv"

# Fetch CSV via requests (disable SSL verification temporarily)
response = requests.get(url_country, verify=False)
csv_data = StringIO(response.text)

# Read into pandas DataFrame
country_lists = pd.read_csv(csv_data)

# Quick check
print("Country Lists:")
country_lists.head()



Country Lists:


Unnamed: 0,code,country,visa_required,visa_online,visa_on_arrival,visa_free_access,electronic_travel_authorisation
0,PS,Palestinian Territory,"[[{""code"":""AF"",""name"":""Afghanistan""},{""code"":""...","[[{""code"":""AG"",""name"":""Antigua and Barbuda""},{...","[[{""code"":""BD"",""name"":""Bangladesh""},{""code"":""B...","[[{""code"":""BO"",""name"":""Bolivia""},{""code"":""CK"",...","[[{""code"":""LK"",""name"":""Sri Lanka""},{""code"":""KE..."
1,AD,Andorra,"[[{""code"":""AF"",""name"":""Afghanistan""},{""code"":""...","[[{""code"":""AO"",""name"":""Angola""},{""code"":""AZ"",""...","[[{""code"":""BH"",""name"":""Bahrain""},{""code"":""BD"",...","[[{""code"":""JP"",""name"":""Japan""},{""code"":""AL"",""n...","[[{""code"":""AU"",""name"":""Australia""},{""code"":""CA..."
2,VA,Vatican City,"[[{""code"":""AF"",""name"":""Afghanistan""},{""code"":""...","[[{""code"":""AZ"",""name"":""Azerbaijan""},{""code"":""B...","[[{""code"":""BH"",""name"":""Bahrain""},{""code"":""BD"",...","[[{""code"":""AL"",""name"":""Albania""},{""code"":""AD"",...","[[{""code"":""AU"",""name"":""Australia""},{""code"":""CA..."
3,SM,San Marino,"[[{""code"":""AF"",""name"":""Afghanistan""},{""code"":""...","[[{""code"":""AZ"",""name"":""Azerbaijan""},{""code"":""B...","[[{""code"":""BH"",""name"":""Bahrain""},{""code"":""BD"",...","[[{""code"":""JP"",""name"":""Japan""},{""code"":""AL"",""n...","[[{""code"":""AU"",""name"":""Australia""},{""code"":""CA..."
4,MC,Monaco,"[[{""code"":""AF"",""name"":""Afghanistan""},{""code"":""...","[[{""code"":""AZ"",""name"":""Azerbaijan""},{""code"":""B...","[[{""code"":""BH"",""name"":""Bahrain""},{""code"":""BD"",...","[[{""code"":""JP"",""name"":""Japan""},{""code"":""AL"",""n...","[[{""code"":""AU"",""name"":""Australia""},{""code"":""CA..."


In [6]:
json_cols = [
    'visa_required',
    'visa_online',
    'visa_on_arrival',
    'visa_free_access',
    'electronic_travel_authorisation'
]

import json

def clean_json_field(text):
    """Convert the messy stringified JSON fields into clean Python lists of dicts."""
    if pd.isna(text):
        return []
    try:
        data = json.loads(text)
        # Many fields are [[{...}]] — unwrap the extra list
        if isinstance(data, list) and len(data) == 1 and isinstance(data[0], list):
            data = data[0]
        return data
    except Exception as e:
        # Optional: print which rows failed to parse
        # print("Error parsing:", text[:80], e)
        return []
    
for col in json_cols:
    country_lists[col] = country_lists[col].apply(clean_json_field)

# Quick check
print("Cleaned Country Lists:")
country_lists.head()


Cleaned Country Lists:


Unnamed: 0,code,country,visa_required,visa_online,visa_on_arrival,visa_free_access,electronic_travel_authorisation
0,PS,Palestinian Territory,"[{'code': 'AF', 'name': 'Afghanistan'}, {'code...","[{'code': 'AG', 'name': 'Antigua and Barbuda'}...","[{'code': 'BD', 'name': 'Bangladesh'}, {'code'...","[{'code': 'BO', 'name': 'Bolivia'}, {'code': '...","[{'code': 'LK', 'name': 'Sri Lanka'}, {'code':..."
1,AD,Andorra,"[{'code': 'AF', 'name': 'Afghanistan'}, {'code...","[{'code': 'AO', 'name': 'Angola'}, {'code': 'A...","[{'code': 'BH', 'name': 'Bahrain'}, {'code': '...","[{'code': 'JP', 'name': 'Japan'}, {'code': 'AL...","[{'code': 'AU', 'name': 'Australia'}, {'code':..."
2,VA,Vatican City,"[{'code': 'AF', 'name': 'Afghanistan'}, {'code...","[{'code': 'AZ', 'name': 'Azerbaijan'}, {'code'...","[{'code': 'BH', 'name': 'Bahrain'}, {'code': '...","[{'code': 'AL', 'name': 'Albania'}, {'code': '...","[{'code': 'AU', 'name': 'Australia'}, {'code':..."
3,SM,San Marino,"[{'code': 'AF', 'name': 'Afghanistan'}, {'code...","[{'code': 'AZ', 'name': 'Azerbaijan'}, {'code'...","[{'code': 'BH', 'name': 'Bahrain'}, {'code': '...","[{'code': 'JP', 'name': 'Japan'}, {'code': 'AL...","[{'code': 'AU', 'name': 'Australia'}, {'code':..."
4,MC,Monaco,"[{'code': 'AF', 'name': 'Afghanistan'}, {'code...","[{'code': 'AZ', 'name': 'Azerbaijan'}, {'code'...","[{'code': 'BH', 'name': 'Bahrain'}, {'code': '...","[{'code': 'JP', 'name': 'Japan'}, {'code': 'AL...","[{'code': 'AU', 'name': 'Australia'}, {'code':..."


In [7]:
visa_cols = [
    'visa_required',
    'visa_online',
    'visa_on_arrival',
    'visa_free_access',
    'electronic_travel_authorisation'
]

from pandas import json_normalize

flat_frames = []

for col in visa_cols:
    temp = country_lists[['code', 'country', col]].explode(col)
    temp = temp.dropna(subset=[col])
    temp['visa_type'] = col
    temp['to_code'] = temp[col].apply(lambda x: x.get('code') if isinstance(x, dict) else None)
    temp['to_name'] = temp[col].apply(lambda x: x.get('name') if isinstance(x, dict) else None)
    temp = temp.drop(columns=[col])
    flat_frames.append(temp)

# Combine all into one DataFrame
flat_df = pd.concat(flat_frames, ignore_index=True)

# Remove rows missing target country codes
flat_df = flat_df.dropna(subset=['to_code'])

# Drop duplicates if any
flat_df = flat_df.drop_duplicates(subset=['code', 'to_code', 'visa_type']).reset_index(drop=True)

# Optional: rename columns for clarity
flat_df.rename(columns={
    'code': 'from_code',
    'country': 'from_country'
}, inplace=True)


# Quick check
print("Flattened Visa Data:")
flat_df.head()

Flattened Visa Data:


Unnamed: 0,from_code,from_country,visa_type,to_code,to_name
0,PS,Palestinian Territory,visa_required,AF,Afghanistan
1,PS,Palestinian Territory,visa_required,DZ,Algeria
2,PS,Palestinian Territory,visa_required,AD,Andorra
3,PS,Palestinian Territory,visa_required,AO,Angola
4,PS,Palestinian Territory,visa_required,AI,Anguilla


In [8]:
visa_summary = flat_df.groupby('visa_type').size().reset_index(name='count')
print(visa_summary)


                         visa_type  count
0  electronic_travel_authorisation   1382
1                 visa_free_access  15066
2                  visa_on_arrival   5316
3                      visa_online   5817
4                    visa_required  17392


In [9]:
visa_by_country = (
    flat_df.groupby(['from_country', 'visa_type'])
    .size()
    .reset_index(name='destination_count')
    .sort_values(['from_country', 'visa_type'])
)

visa_pivot = visa_by_country.pivot(
    index='from_country',
    columns='visa_type',
    values='destination_count'
).fillna(0).astype(int)
visa_pivot = visa_pivot.reset_index()
print("Visa Counts by Country:")
visa_pivot.head()

Visa Counts by Country:


visa_type,from_country,electronic_travel_authorisation,visa_free_access,visa_on_arrival,visa_online,visa_required
0,Afghanistan,3,6,16,43,158
1,Albania,6,88,29,29,74
2,Algeria,2,26,27,39,132
3,Andorra,16,120,35,23,32
4,Angola,2,26,20,37,141


In [10]:
# Group by year and region
region_stats_yearly = rank_by_year.groupby(['year', 'region']).agg(
    avg_visa_free=('visa_free_count', 'mean'),      # average visa-free count
    max_visa_free=('visa_free_count', 'max'),       # strongest passport in the region
    top_country=('country', lambda x: rank_by_year.loc[x.index, 'visa_free_count'].idxmax())
).reset_index()

# Fix top_country to show the country name
region_stats_yearly['top_country'] = region_stats_yearly['top_country'].apply(lambda idx: rank_by_year.loc[idx, 'country'])

print(region_stats_yearly)



     year       region  avg_visa_free  max_visa_free           top_country
0    2006       AFRICA      34.021739           65.0          South Africa
1    2006     AMERICAS      80.818182          130.0         United States
2    2006         ASIA      43.387097          128.0                 Japan
3    2006    CARIBBEAN      54.076923           71.0               Bahamas
4    2006       EUROPE      96.409091          130.0               Denmark
..    ...          ...            ...            ...                   ...
121  2025         ASIA      86.000000          193.0             Singapore
122  2025    CARIBBEAN     126.230769          163.0              Barbados
123  2025       EUROPE     165.265306          189.0               Denmark
124  2025  MIDDLE EAST      77.733333          184.0  United Arab Emirates
125  2025      OCEANIA     124.357143          187.0           New Zealand

[126 rows x 5 columns]


## Data Visualisation

In [11]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# --- Filter to 2015–2025 only ---
rank_filtered = rank_by_year[(rank_by_year['year'] >= 2015) & (rank_by_year['year'] <= 2025)]
years = sorted(rank_filtered['year'].unique())
regions = rank_filtered['region'].unique()

# --- Assign consistent colors per region ---
colors = px.colors.qualitative.Plotly[:len(regions)]
region_colors = dict(zip(regions, colors))

# --- Initialize figure ---
fig = go.Figure()

# --- Box plots per region per year ---
for year in years:
    df_year = rank_filtered[rank_filtered['year'] == year]
    for region in regions:
        df_region = df_year[df_year['region'] == region]
        fig.add_trace(go.Box(
            y=df_region['visa_free_count'],
            name=region,
            marker=dict(color=region_colors[region]),
            line=dict(color=region_colors[region], width=2),
            boxmean='sd',
            fillcolor='rgba(0,0,0,0)',
            customdata=df_region['country'],
            hovertemplate="<b>Region:</b> "+region+
                          "<br><b>Country:</b> %{customdata}"+
                          "<br><b>Visa-Free:</b> %{y}<extra></extra>",
            visible=True if year == years[0] else False
        ))

# --- Bar plots per region per year (initially hidden) ---
for year in years:
    df_year = rank_filtered[rank_filtered['year'] == year]
    for region in regions:
        df_region = df_year[df_year['region'] == region].sort_values('visa_free_count', ascending=False)
        fig.add_trace(go.Bar(
            x=df_region['country'],
            y=df_region['visa_free_count'],
            marker_color=region_colors[region],
            name=f"{region} - {year}",
            hovertemplate="Country: %{x}<br>Visa-Free: %{y}<extra></extra>",
            visible=False  # initially hidden
        ))

# --- Slider steps ---
steps = []
box_traces_per_year = len(regions)
bar_traces_per_year = len(regions)

for i, year in enumerate(years):
    visibility = [False] * len(fig.data)
    start = i * box_traces_per_year
    end = start + box_traces_per_year
    for j in range(start, end):
        visibility[j] = True
    steps.append(dict(
        method="update",
        label=str(year),
        args=[{"visible": visibility},
              {"title": f"Visa-Free Access Distribution by Region — {year}",
               "yaxis": {"title": "Visa-Free Destinations", "range": [0, 210],
                         "showgrid": True, "gridcolor": "lightgrey", "dtick": 10},
               "xaxis": {"tickangle": 0, "showgrid": True, "gridcolor": "lightgrey"}}]
    ))

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Year: "},
    pad={"t": 70},
    steps=steps
)]

# --- Buttons to switch between views ---
buttons = []

# Box plot button
buttons.append(dict(
    label="📦 Box Plot",
    method="update",
    args=[{"visible": [True]*box_traces_per_year + [False]*(len(fig.data)-box_traces_per_year)},
          {"title": f"Visa-Free Access Distribution by Region — {years[0]}",
           "yaxis": {"title": "Visa-Free Destinations", "range": [0, 210],
                     "showgrid": True, "gridcolor": "lightgrey", "dtick": 10},
           "xaxis": {"tickangle": 0, "showgrid": True, "gridcolor": "lightgrey"}}]
))

# Region bar plot buttons (show 2025 data by default)
year_2025_index = years.index(2025)  # find index of 2025
for r_idx, region in enumerate(regions):
    visibility = [False]*len(fig.data)
    # Calculate bar trace index for 2025
    bar_start = len(years)*box_traces_per_year + year_2025_index*bar_traces_per_year + r_idx
    if bar_start < len(fig.data):
        visibility[bar_start] = True
    buttons.append(dict(
        label=f"🏳️ {region}",
        method="update",
        args=[{"visible": visibility},
              {"title": f"Visa-Free Access by Country — {region} (2025)",
               "yaxis": {"title": "Visa-Free Destinations", "range": [0, 210],
                         "showgrid": True, "gridcolor": "lightgrey", "dtick": 10},
               "xaxis": {"tickangle": -45, "showgrid": True, "gridcolor": "lightgrey"}}]
    ))

# --- Final layout ---
fig.update_layout(
    sliders=sliders,
    updatemenus=[dict(
        type="dropdown",
        showactive=True,
        buttons=buttons,
        x=1.02,
        xanchor="left",
        y=1.15,
        yanchor="top"
    )],
    width=1500,
    height=850,
    template='plotly_white',
    plot_bgcolor='rgba(245,245,245,1)',
    paper_bgcolor='white',
    margin=dict(l=80, r=40, t=100, b=180),
    xaxis=dict(
        showgrid=True,
        gridcolor='lightgrey',
        tickangle=0,
        tickfont=dict(size=10)
    ),
    yaxis=dict(
        showgrid=True,
        gridcolor='lightgrey',
        tick0=0,
        dtick=10,
        range=[0, 210]
    )
)

fig.show()




In [None]:
import plotly.express as px
import pandas as pd

# Sort and filter data
rank_part2 = rank_by_year.sort_values(by=["country", "year"], ascending=True)
filtered_data = rank_part2[(rank_part2["year"] >= 2015) & (rank_part2["year"] <= 2025)]

# --- Create animated choropleth ---
fig = px.choropleth(
    filtered_data,
    locations="country",
    locationmode="country names",
    color="visa_free_count",
    animation_frame="year",
    color_continuous_scale=[
        "#fff5f0", "#fcbba1", "#fc9272", "#fb6a4a",
        "#ef3b2c", "#cb181d", "#a50f15", "#67000d"
    ],  # vivid red scale — clearer contrasts
    range_color=(filtered_data['visa_free_count'].min(), filtered_data['visa_free_count'].max()),
    projection="natural earth",
    title="Global Visa-Free Access Evolution (2015–2025)",
    hover_name="country",
    hover_data={"visa_free_count": True, "year": True}
)

# --- Layout customization ---
fig.update_layout(
    geo=dict(
        showframe=False,
        showcoastlines=True,
        coastlinecolor="gray",
        landcolor="rgb(240,240,240)",
        projection_type="natural earth"
    ),
    width=1600,
    height=900,
    margin=dict(l=0, r=0, t=60, b=0),
    coloraxis_colorbar=dict(
        title=dict(
            text="Visa-Free<br>Destinations",
            font=dict(size=14, family="Arial Black")
        ),
        tickfont=dict(size=12)
    ),
    title=dict(
        font=dict(size=26, family="Arial Black"),
        x=0.5
    )
)

# --- Smooth animation transitions ---
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 800   # ms per frame
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 500
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['easing'] = 'cubic-in-out'

fig.show()



NameError: name 'data' is not defined