# 2023 Holidays

<font size="4">As usual, 2023 wasn't easy. But we got through it, and now I propose to explore its holidays - the days when we had the opportunity to take a break and spend time with family, friends, pets, or just ourselves. Have fun!</font>

In [1]:
from lets_plot import *
from lets_plot.mapping import as_discrete
from lets_plot.geo_data import *

The geodata is provided by © OpenStreetMap contributors and is made available here under the Open Database License (ODbL).


In [2]:
# One-time Lets-Plot setup call, run before any plot is built in this notebook
# (presumably enables HTML rendering of plots in the notebook - see Lets-Plot docs).
LetsPlot.setup_html()

In [3]:
# Number of rows kept by get_top_df(); also interpolated into the "Top N" plot title.
top_size = 10
# Month name -> hex fill colour, passed to scale_fill_manual() wherever a plot
# is filled by "month_name". Keys must match the names produced by dt.month_name().
month_color = {
    "December": "#ff0000",
    "January": "#c71585",
    "February": "#ee82ee",
    "March": "#8a2be2",
    "April": "#0000ff",
    "May": "#0d98ba",
    "June": "#00ff00",
    "July": "#9acd32",
    "August": "#ffff00",
    "September": "#ffae42",
    "October": "#ffa500",
    "November": "#ff5349",
}
# Religion label -> hex fill colour for the religions pie chart. The keys are the
# same labels that prepare_data() extracts into the "religion" column.
religion_color = {
    "Orthodox": "#8856a7",
    "Christian": "#e6550d",
    "Hebrew": "#2ca25f",
    "Muslim": "#3182bd",
    "Hinduism": "#fa9fb5",
}

In [4]:
def get_data(data_dir="data"):
    """Read every regular file in ``data_dir`` as CSV and stack them into one frame.

    Files are read in sorted path order. ``os.listdir`` returns entries in
    arbitrary order, and the row order of the result decides which row a later
    "keep first" de-duplication retains, so sorting makes the whole pipeline
    reproducible across machines and file systems.

    Args:
        data_dir: Directory holding the CSV files. Sub-directories are skipped.
            Defaults to ``"data"``, the location the notebook has always used.

    Returns:
        A single DataFrame with the rows of all files and a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist (from ``os.listdir``).
        ValueError: If ``data_dir`` contains no regular files.
    """
    import os
    import pandas as pd

    candidates = sorted(os.path.join(data_dir, name) for name in os.listdir(data_dir))
    file_paths = [path for path in candidates if os.path.isfile(path)]
    if not file_paths:
        # pd.concat([]) would raise a ValueError too, but with a message that
        # does not say which directory was empty.
        raise ValueError(f"No data files found in {data_dir!r}")
    return pd.concat([pd.read_csv(path) for path in file_paths], ignore_index=True)

def prepare_data(df):
    """Turn the raw holidays table into the frame used by every plot.

    The input must have exactly five columns, in this order: date, holiday
    name, type, country name, country code (they are renamed positionally).
    The caller's frame is left untouched; all work happens on a copy.

    Derived columns:
        month, month_name, day_of_year, day_of_month, day_of_week (Monday=0),
        day_of_week_name, date_name ("<Month>, <day>"),
        is_national ("yes"/"no", from the literal "'National holiday'" in type),
        religion (first of Orthodox/Christian/Hebrew/Muslim/Hinduism found in
        type, else NaN), is_religious ("yes"/"no"),
        holiday_type ("Season", "DST", "Sport", or "Holiday" for everything else).

    The raw ``type`` and ``country_code`` columns are dropped, and rows that
    repeat the same (country_name, holiday_name, day_of_year) are removed,
    keeping the first occurrence. The index is not renumbered afterwards.

    Args:
        df: Raw DataFrame as loaded from the CSV files.

    Returns:
        A new, prepared DataFrame.
    """
    import pandas as pd

    name_replace = {
        "New Year": "New Year's Day",
    }
    type_replace = {
        "['Season']": "Season",
        "['Clock change/Daylight Saving Time']": "DST",
        "['Sporting event']": "Sport",
    }

    # Work on a copy so the frame passed in by the caller is never modified.
    df = df.copy()
    df.columns = ["date", "holiday_name", "type", "country_name", "country_code"]

    # 'mixed' lets each value be parsed with its own format; utc=True yields
    # one timezone-aware dtype for the whole column.
    df["date"] = pd.to_datetime(df["date"], format='mixed', utc=True)
    df["month"] = df["date"].dt.month
    df["month_name"] = df["date"].dt.month_name()
    df["day_of_year"] = df["date"].dt.dayofyear
    df["day_of_month"] = df["date"].dt.day
    df["day_of_week"] = df["date"].dt.dayofweek
    df["day_of_week_name"] = df["date"].dt.day_name()
    df["date_name"] = df["month_name"].str.cat(df["day_of_month"].astype(str), sep=", ")

    # Whole-value replacement only: "New Year" becomes "New Year's Day", but a
    # name that merely contains "New Year" is left alone.
    df["holiday_name"] = df["holiday_name"].replace(name_replace)

    # The type column holds a stringified list, so the quoted token is matched
    # as plain text rather than as a regular expression.
    is_national = df["type"].str.contains("'National holiday'", regex=False)
    df["is_national"] = is_national.map({True: "yes", False: "no"})

    # expand=False returns a Series (one capture group) instead of a
    # one-column DataFrame.
    df["religion"] = df["type"].str.extract(
        r"(Orthodox|Christian|Hebrew|Muslim|Hinduism)", expand=False
    )
    df["is_religious"] = df["religion"].notna().map({True: "yes", False: "no"})

    # Only the three exact type strings above get their own label; any other
    # type value falls back to "Holiday".
    df["holiday_type"] = df["type"].map(type_replace).fillna("Holiday").astype(str)

    df = df.drop(columns=["type", "country_code"])
    df = df.drop_duplicates(subset=["country_name", "holiday_name", "day_of_year"])
    return df

def filter_df(df, col, value):
    """Return the rows of ``df`` where ``col`` equals ``value``, minus ``col`` itself.

    The result is a new frame with a fresh 0..n-1 index; ``df`` is not modified.
    """
    matching = df.loc[df[col] == value]
    return matching.drop(columns=[col]).reset_index(drop=True)

def get_distinct_holidays_df(df):
    """Keep one row per (holiday_name, day_of_year) pair, first occurrence wins.

    Returns a new frame with the index renumbered from 0.
    """
    identity_cols = ["holiday_name", "day_of_year"]
    unique_rows = df.drop_duplicates(subset=identity_cols)
    return unique_rows.reset_index(drop=True)

def get_countries_gdf(df, country_col):
    """Geocode the distinct values of ``df[country_col]`` and return their boundaries.

    The request is built with ``ignore_not_found()`` and ``inc_res()`` before
    the boundaries are fetched (presumably: skip unresolved names and raise the
    geometry resolution - confirm against the Lets-Plot geocoding docs).
    """
    country_names = df[country_col].unique()
    geocoder = geocode_countries(country_names).ignore_not_found().inc_res()
    return geocoder.get_boundaries()

def get_top_df(df, col, n=None):
    """Return the ``n`` most frequent values of ``df[col]`` with their counts.

    Args:
        df: Source DataFrame.
        col: Name of the column whose values are counted.
        n: How many rows to keep. When omitted, the notebook-wide ``top_size``
            is used, which is the original behaviour.

    Returns:
        A DataFrame with one row per value, most frequent first, holding the
        value and its count, truncated to the first ``n`` rows.
    """
    if n is None:
        n = top_size
    return df[col].value_counts().to_frame().reset_index().iloc[:n]

In [5]:
# Load the raw CSV files and derive the analysis columns. The frames used by
# the later cells are all filtered from full_df.
full_df = prepare_data(get_data())
print(full_df.shape)
full_df.head()

(7098, 14)


Unnamed: 0,date,holiday_name,country_name,month,month_name,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,is_national,religion,is_religious,holiday_type
0,2023-02-15 00:00:00+00:00,Liberation Day,Afghanistan,2,February,46,15,2,Wednesday,"February, 15",yes,,no,Holiday
1,2023-03-20 21:24:20+00:00,March Equinox,Afghanistan,3,March,79,20,0,Monday,"March, 20",no,,no,Season
2,2023-03-21 00:00:00+00:00,Nauruz,Afghanistan,3,March,80,21,1,Tuesday,"March, 21",no,,no,Holiday
3,2023-03-23 00:00:00+00:00,First Day of Ramadan,Afghanistan,3,March,82,23,3,Thursday,"March, 23",no,,no,Holiday
4,2023-04-22 00:00:00+00:00,Eid al-Fitr,Afghanistan,4,April,112,22,5,Saturday,"April, 22",yes,,no,Holiday


In [6]:
# Boundaries for every distinct country name in the data. get_map_plot() uses
# this frame as the map geometry, joined on its "country" column.
countries_gdf = get_countries_gdf(full_df, "country_name")
print(countries_gdf.shape)
countries_gdf.head()

(207, 3)


Unnamed: 0,country,found name,geometry
0,Afghanistan,Afghanistan,"MULTIPOLYGON (((60.87286 29.85855, 61.80162 30..."
1,Albania,Albania,"MULTIPOLYGON (((19.37279 41.85000, 19.40242 42..."
2,Algeria,Algeria,"MULTIPOLYGON (((-4.83333 25.00000, -8.66812 27..."
3,Andorra,Andorra,"MULTIPOLYGON (((1.72589 42.50270, 1.51284 42.4..."
4,Angola,Angola,"MULTIPOLYGON (((11.99615 -15.63033, 12.15321 -..."


In [7]:
# Rows whose holiday_type is "Holiday", i.e. everything prepare_data() did not
# label Season / DST / Sport. filter_df() drops the holiday_type column.
holidays_df = filter_df(full_df, "holiday_type", "Holiday")
print(holidays_df.shape)
holidays_df.head()

(6089, 13)


Unnamed: 0,date,holiday_name,country_name,month,month_name,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,is_national,religion,is_religious
0,2023-02-15 00:00:00+00:00,Liberation Day,Afghanistan,2,February,46,15,2,Wednesday,"February, 15",yes,,no
1,2023-03-21 00:00:00+00:00,Nauruz,Afghanistan,3,March,80,21,1,Tuesday,"March, 21",no,,no
2,2023-03-23 00:00:00+00:00,First Day of Ramadan,Afghanistan,3,March,82,23,3,Thursday,"March, 23",no,,no
3,2023-04-22 00:00:00+00:00,Eid al-Fitr,Afghanistan,4,April,112,22,5,Saturday,"April, 22",yes,,no
4,2023-04-23 00:00:00+00:00,Eid al-Fitr Holiday,Afghanistan,4,April,113,23,6,Sunday,"April, 23",yes,,no


In [8]:
# One row per (holiday_name, day_of_year) pair, so a holiday shared by many
# countries on the same day is counted once.
distinct_holidays_df = get_distinct_holidays_df(holidays_df)
print(distinct_holidays_df.shape)
distinct_holidays_df.head()

(3177, 13)


Unnamed: 0,date,holiday_name,country_name,month,month_name,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,is_national,religion,is_religious
0,2023-02-15 00:00:00+00:00,Liberation Day,Afghanistan,2,February,46,15,2,Wednesday,"February, 15",yes,,no
1,2023-03-21 00:00:00+00:00,Nauruz,Afghanistan,3,March,80,21,1,Tuesday,"March, 21",no,,no
2,2023-03-23 00:00:00+00:00,First Day of Ramadan,Afghanistan,3,March,82,23,3,Thursday,"March, 23",no,,no
3,2023-04-22 00:00:00+00:00,Eid al-Fitr,Afghanistan,4,April,112,22,5,Saturday,"April, 22",yes,,no
4,2023-04-23 00:00:00+00:00,Eid al-Fitr Holiday,Afghanistan,4,April,113,23,6,Sunday,"April, 23",yes,,no


In [9]:
# Rows flagged is_national == 'yes'. This filters full_df (not holidays_df), so
# the holiday_type column is kept and Season/DST/Sport rows are not excluded
# explicitly.
national_holidays_df = filter_df(full_df, "is_national", 'yes')
print(national_holidays_df.shape)
national_holidays_df.head()

(3365, 13)


Unnamed: 0,date,holiday_name,country_name,month,month_name,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,religion,is_religious,holiday_type
0,2023-02-15 00:00:00+00:00,Liberation Day,Afghanistan,2,February,46,15,2,Wednesday,"February, 15",,no,Holiday
1,2023-04-22 00:00:00+00:00,Eid al-Fitr,Afghanistan,4,April,112,22,5,Saturday,"April, 22",,no,Holiday
2,2023-04-23 00:00:00+00:00,Eid al-Fitr Holiday,Afghanistan,4,April,113,23,6,Sunday,"April, 23",,no,Holiday
3,2023-04-24 00:00:00+00:00,Eid al-Fitr Holiday,Afghanistan,4,April,114,24,0,Monday,"April, 24",,no,Holiday
4,2023-04-28 00:00:00+00:00,Afghan Victory Day,Afghanistan,4,April,118,28,4,Friday,"April, 28",,no,Holiday


In [10]:
# National holidays reduced to one row per (holiday_name, day_of_year) pair.
distinct_national_holidays_df = get_distinct_holidays_df(national_holidays_df)
print(distinct_national_holidays_df.shape)
distinct_national_holidays_df.head()

(1670, 13)


Unnamed: 0,date,holiday_name,country_name,month,month_name,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,religion,is_religious,holiday_type
0,2023-02-15 00:00:00+00:00,Liberation Day,Afghanistan,2,February,46,15,2,Wednesday,"February, 15",,no,Holiday
1,2023-04-22 00:00:00+00:00,Eid al-Fitr,Afghanistan,4,April,112,22,5,Saturday,"April, 22",,no,Holiday
2,2023-04-23 00:00:00+00:00,Eid al-Fitr Holiday,Afghanistan,4,April,113,23,6,Sunday,"April, 23",,no,Holiday
3,2023-04-24 00:00:00+00:00,Eid al-Fitr Holiday,Afghanistan,4,April,114,24,0,Monday,"April, 24",,no,Holiday
4,2023-04-28 00:00:00+00:00,Afghan Victory Day,Afghanistan,4,April,118,28,4,Friday,"April, 28",,no,Holiday


In [11]:
def get_map_plot(data, title, trans=None):
    """Build an interactive map with each country filled by its ``count`` value.

    ``data`` needs a ``country_name`` column (joined to the ``country`` column
    of the global ``countries_gdf``) and a ``count`` column. ``trans`` is
    forwarded unchanged to the fill scale; ``title`` becomes the plot title.
    """
    count_tooltips = (
        layer_tooltips()
        .title("@country_name")
        .format("@count", 'd')
        .line("holidays count|@count")
    )
    countries_layer = geom_map(
        aes(fill="count"),
        data=data,
        map=countries_gdf,
        map_join=["country_name", "country"],
        tooltips=count_tooltips,
    )
    return (
        ggplot()
        + geom_livemap(const_size_zoomin=0)
        + countries_layer
        + scale_fill_brewer(type='seq', palette='YlOrRd', trans=trans)
        + ggtitle(title)
        + theme_void()
    )

# Three stacked maps of holidays per country: all holidays (sqrt-transformed
# fill scale), national holidays only, and holidays whose name contains
# "New Year" (case-insensitive). Each source frame is reduced to a
# country_name / count table before plotting.
gggrid([
    get_map_plot(
        frame["country_name"].value_counts().to_frame().reset_index(),
        title=title,
        trans=trans,
    )
    for frame, title, trans in [
        (holidays_df, "All holidays", 'sqrt'),
        (national_holidays_df, "Only national holidays", None),
        (
            holidays_df[holidays_df["holiday_name"].str.contains("New Year", case=False)],
            "New Year holidays",
            None,
        ),
    ]
], ncol=1) + ggsize(600, 1200)

The first plot tells us that if you count all holidays (not just days off), the US has a suspiciously high number of holidays (more than the number of days in a year).

The second plot shows how many national holidays (these are usually days off) each country has.

The third plot shows how many New Year's holidays (not necessarily days off) each country has.

In [12]:
# Holidays of the United States only, with the index renumbered.
us_holidays = holidays_df.loc[holidays_df["country_name"] == "United States"].reset_index(drop=True)

# Dot plot over the day of the year in 7-day bins, dots coloured by month.
(
    ggplot(us_holidays)
    + geom_dotplot(
        aes("day_of_year", fill="month_name"),
        binwidth=7,
        boundary=5,
        show_legend=False,
        tooltips=(
            layer_tooltips()
            .title("@month_name")
            .format("@..count..", 'd')
            .line("week holidays count|@..count..")
        ),
    )
    + scale_fill_manual(values=month_color)
    + xlab("day of year")
    + ylab("count")
    + ggtitle("US holidays")
)

Let's take a closer look at the US. Here are all of its holidays, grouped by week.

In [13]:
# Density of distinct national holidays over the day of the year.
(
    ggplot(distinct_national_holidays_df)
    + geom_density(aes("day_of_year"), fill="#2c7fb8")
    + xlab("day of year")
    + ggtitle("Distribution of national holidays during the year")
)

Over the course of a year, the greatest density of holidays is reached roughly between the 100th and 130th day.

In [14]:
# Density of distinct national holidays over the day of the month, one panel
# per month, with the area filled by quantile.
# NOTE(review): `order` is given a 12-element list for a single facet variable;
# confirm against the Lets-Plot facet_wrap docs that this is the intended usage.
(
    ggplot(distinct_national_holidays_df)
    + geom_density(aes("day_of_month", fill="..quantile.."))
    + scale_fill_brewer(type='seq', palette='YlOrRd')
    + facet_wrap(facets="month", order=list(range(1, 13)))
    + xlab("day of month")
    + ggtitle("Distribution of national holidays per month")
)

And here's how the holidays are distributed in each month. For some reason, they are not usually concentrated around the median.

In [15]:
# Donut chart of distinct national holidays per month; slices are ordered by
# month number and coloured with month_color.
(
    ggplot(distinct_national_holidays_df)
    + geom_pie(
        aes(fill=as_discrete("month_name", order_by="month")),
        hole=.65,
        size=35,
        color='black',
        stroke=2,
        show_legend=False,
        tooltips=layer_tooltips().line("@..proppct.."),
        labels=layer_labels().line("@month_name").size(10),
    )
    + scale_fill_manual(values=month_color)
    + ggtitle("Proportion of holidays count per month")
    + theme_void()
    + ggsize(600, 400)
)

The least festive month is December and the most festive month is April.

In [16]:
# The days of the year with the most national-holiday rows, sorted by day and
# joined with their readable "Month, day" label.
top_of_days_with_holiday_df = (
    get_top_df(national_holidays_df, "day_of_year")
    .sort_values(by="day_of_year")
    .merge(
        national_holidays_df[["day_of_year", "date_name"]].drop_duplicates(),
        on="day_of_year",
    )
)

# Horizontal bars: one per day, bar length = count, labelled with the date name.
(
    ggplot(top_of_days_with_holiday_df)
    + geom_bar(
        aes("count", as_discrete("day_of_year")),
        stat='identity',
        orientation='y',
        fill="#2c7fb8",
        labels=layer_labels().line("@date_name"),
    )
    + ggtitle("Number of all holidays in this day")
    + theme(axis_title_y='blank', axis_text_y='blank')
)

If we look at which days of the year have the most national holidays (from all countries), the 1st of January is the most festive. 25 December and 1 May are not far behind.

In [17]:
# Pie chart of distinct national holidays per day of the week; slices are
# ordered by the numeric day_of_week column.
(
    ggplot(distinct_national_holidays_df)
    + geom_pie(
        aes(fill=as_discrete("day_of_week_name", order_by="day_of_week")),
        hole=.3,
        size=35,
        color='black',
        stroke=2,
        show_legend=False,
        tooltips=layer_tooltips().line("@..proppct.."),
        labels=layer_labels().line("@day_of_week_name"),
    )
    + ggtitle("Proportion of holidays count per week")
    + theme_void()
    + ggsize(600, 400)
)

Looking at the distribution of national holidays by day of the week (relevant only for the year 2023), the most holidays fall on Monday and the least on Saturday.

In [18]:
# National-holiday rows whose name is among the most frequent holiday names,
# sorted by day of the year.
top_of_holidays_df = (
    national_holidays_df.loc[
        national_holidays_df["holiday_name"].isin(
            get_top_df(national_holidays_df, "holiday_name")["holiday_name"].to_list()
        )
    ]
    .sort_values(by="day_of_year")
    .reset_index(drop=True)
)

# One point per (day, holiday name); point size is the number of rows counted there.
(
    ggplot(top_of_holidays_df)
    + geom_count(
        aes("day_of_year", "holiday_name", group="date_name"),
        alpha=.5,
        color="#2c7fb8",
        show_legend=False,
        tooltips=(
            layer_tooltips()
            .title("@holiday_name")
            .line("date|@date_name")
            .line("number of countries|@..n..")
        ),
    )
    + scale_size(range=[2, 20])
    + xlab("day of year")
    + ylab("holiday name")
    + ggtitle("Top {0} national holidays".format(top_size))
    + ggsize(1000, 400)
)

These are the top 10 most popular national holidays. The most common is New Year's Day. For some holidays, the date can float - depending on the country. The most dispersed holiday is Independence Day - of course it is different for every country.

This plot should be interpreted with caution: holidays can only be distinguished by name, and in the raw data essentially the same holiday is sometimes named slightly differently in different countries.

In [19]:
def get_religious_pie(df, title):
    """Pie chart of ``df`` split by its ``is_religious`` column.

    Slices are labelled with their percentage share, tooltips are disabled,
    and the legend is titled "is religious". ``title`` becomes the plot title.
    """
    pie_layer = geom_pie(
        aes(fill="is_religious"),
        size=20,
        color='black',
        stroke=2,
        tooltips='none',
        labels=layer_labels().line("@..proppct.."),
    )
    return (
        ggplot(df)
        + pie_layer
        + scale_fill_discrete(name="is religious")
        + ggtitle(title)
        + theme_void()
    )

# Religious vs. non-religious share, side by side: all distinct holidays and
# distinct national holidays only.
gggrid([
    get_religious_pie(distinct_holidays_df, "Overall holidays"),
    get_religious_pie(distinct_national_holidays_df, "National holidays"),
])

The vast majority of holidays are secular. Especially if only national holidays are taken into account.

In [20]:
def get_religions_pie(df, title):
    """Donut chart of the religious rows of ``df``, split by ``religion``.

    Only rows with ``is_religious == "yes"`` are plotted. Slices are ordered
    by their count, coloured with ``religion_color`` and labelled with their
    percentage share; tooltips are disabled.
    """
    religious_rows = df.loc[df["is_religious"] == "yes"]
    pie_layer = geom_pie(
        aes(fill=as_discrete("religion", order_by="..count..")),
        hole=.5,
        size=30,
        color='black',
        stroke=2,
        tooltips='none',
        labels=layer_labels().line("@..proppct.."),
    )
    return (
        ggplot(religious_rows)
        + pie_layer
        + scale_fill_manual(values=religion_color)
        + ggtitle(title)
        + theme_void()
    )

# Breakdown by religion, side by side: all distinct holidays and distinct
# national holidays only.
gggrid([
    get_religions_pie(distinct_holidays_df, "Overall religious holidays"),
    get_religions_pie(distinct_national_holidays_df, "Overall national religious holidays"),
])

If we look only at religious holidays, Hebrew has the most of them. However, if we consider only national holidays, the greatest diversity is among the Orthodox.

<font size="4">Now 2024 is coming, and the entire Lets-Plot team wishes you all the best, and especially more time for yourself while AI does all the boring and routine part of the job for you!</font>