In [189]:
# Import libraries
import pandas as pd
import holidays
import pycountry
import re

In [190]:
cols = ["country", "alpha2", "date", "holiday"]
us_holidays = holidays.country_holidays("US", years=2010)
print(us_holidays)
for holiday in us_holidays:
    print(us_holidays[holiday])

{datetime.date(2010, 1, 1): "New Year's Day", datetime.date(2010, 12, 31): "New Year's Day (observed)", datetime.date(2010, 5, 31): 'Memorial Day', datetime.date(2010, 7, 4): 'Independence Day', datetime.date(2010, 7, 5): 'Independence Day (observed)', datetime.date(2010, 9, 6): 'Labor Day', datetime.date(2010, 11, 11): 'Veterans Day', datetime.date(2010, 11, 25): 'Thanksgiving', datetime.date(2010, 12, 25): 'Christmas Day', datetime.date(2010, 12, 24): 'Christmas Day (observed)', datetime.date(2010, 1, 18): 'Martin Luther King Jr. Day', datetime.date(2010, 2, 15): "Washington's Birthday", datetime.date(2010, 10, 11): 'Columbus Day'}
New Year's Day
New Year's Day (observed)
Memorial Day
Independence Day
Independence Day (observed)
Labor Day
Veterans Day
Thanksgiving
Christmas Day
Christmas Day (observed)
Martin Luther King Jr. Day
Washington's Birthday
Columbus Day


In [191]:
all_holidays = []
for country in pycountry.countries:
    try:
        cur_holidays = holidays.country_holidays(country.alpha_2, years=range(2010, 2025))
        for date in cur_holidays:
            all_holidays.append([country.name, country.alpha_2, date, cur_holidays[date]])
    except NotImplementedError as e:
        pass
        print(f'\t{country.name} has no holidays in the database')

	Afghanistan has no holidays in the database
	Anguilla has no holidays in the database
	Åland Islands has no holidays in the database
	Antarctica has no holidays in the database
	French Southern Territories has no holidays in the database
	Antigua and Barbuda has no holidays in the database
	Benin has no holidays in the database
	Bonaire, Sint Eustatius and Saba has no holidays in the database
	Saint Barthélemy has no holidays in the database
	Bermuda has no holidays in the database
	Bhutan has no holidays in the database
	Bouvet Island has no holidays in the database
	Central African Republic has no holidays in the database
	Cocos (Keeling) Islands has no holidays in the database
	Côte d'Ivoire has no holidays in the database
	Congo, The Democratic Republic of the has no holidays in the database
	Congo has no holidays in the database
	Cook Islands has no holidays in the database
	Comoros has no holidays in the database
	Cabo Verde has no holidays in the database
	Christmas Island has 



In [192]:
len(all_holidays)

30958

In [193]:
df = pd.DataFrame(all_holidays, columns=cols)
df.head()

Unnamed: 0,country,alpha2,date,holiday
0,Aruba,AW,2016-01-01,New Year's Day
1,Aruba,AW,2016-01-25,Betico Day
2,Aruba,AW,2016-02-08,Carnival Monday
3,Aruba,AW,2016-03-18,National Anthem and Flag Day
4,Aruba,AW,2016-03-25,Good Friday


In [194]:
df.dtypes

country    object
alpha2     object
date       object
holiday    object
dtype: object

In [195]:
df["date"] = pd.to_datetime(df["date"])
df.dtypes

country            object
alpha2             object
date       datetime64[ns]
holiday            object
dtype: object

In [196]:
df.set_index("date").loc["2024-06"]["country"].drop_duplicates().size

# We see that 88 countries celebrate a holiday in June 

88

In [197]:
# Which holidays are celebrated in more than one country?
df.set_index("date").loc["2024-06"].groupby("holiday")["country"].count().sort_values(ascending=False).head(10)

holiday
Eid al-Adha (estimated)                 30
Eid al-Adha Holiday (estimated)         18
Arafat Day (estimated)                   7
Juneteenth National Independence Day     7
Sunday                                   5
Eid al-Adha                              5
Independence Day                         4
Midsummer Day                            4
Eid-ul-Adha (estimated)                  3
Eid al-Adha (observed, estimated)        3
Name: country, dtype: int64

In [198]:
# dummy = df.set_index("date")
# filter_df = dummy.loc["2024-06"]
# filter_df.loc[filter_df["holiday"].str.contains("Juneteenth")]
df.set_index("date").loc["2024-06"].loc[lambda df: df["holiday"].str.contains("Juneteenth")]

Unnamed: 0_level_0,country,alpha2,holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-06-19,American Samoa,AS,Juneteenth National Independence Day
2024-06-19,Guam,GU,Juneteenth National Independence Day
2024-06-19,Northern Mariana Islands,MP,Juneteenth National Independence Day
2024-06-19,Puerto Rico,PR,Juneteenth National Independence Day
2024-06-19,United States Minor Outlying Islands,UM,Juneteenth National Independence Day
2024-06-19,United States,US,Juneteenth National Independence Day
2024-06-19,"Virgin Islands, U.S.",VI,Juneteenth National Independence Day


In [199]:
# Remove all the words inbetween parantheses as well as the parantheses 
df["holiday"] = df["holiday"].str.replace(r"\([^)]*\)", "", regex=True).str.strip()

In [200]:
# Remove the word holiday from the holiday column
df["holiday"] = df["holiday"].str.replace(r"\s*Holiday\s*$", "", regex=True, flags=re.IGNORECASE)

In [201]:
df.set_index("date").loc["2024-06"].groupby("holiday")["country"].count().sort_values(ascending=False).head(10)

holiday
Eid al-Adha                             59
Arafat Day                               7
Juneteenth National Independence Day     7
Independence Day                         5
Sunday                                   5
Midsummer Day                            4
Midsummer Eve                            3
Eid-ul-Adha                              3
Eid-el-Kabir                             3
Youth Day                                2
Name: country, dtype: int64

In [202]:
new_years_celebrating = df.set_index("date").loc["2024-01-01"]["country"]

In [203]:
df.loc[lambda x: ~x["country"].isin(new_years_celebrating)]["country"].drop_duplicates()

3083                    Bangladesh
9079                      Ethiopia
12377                        India
12787    Iran, Islamic Republic of
13428                       Israel
19820                     Malaysia
21135                     Pakistan
23631                 Saudi Arabia
28322                      Ukraine
Name: country, dtype: object

In [204]:
df[~df["country"].isin(new_years_celebrating)]["country"].drop_duplicates()

3083                    Bangladesh
9079                      Ethiopia
12377                        India
12787    Iran, Islamic Republic of
13428                       Israel
19820                     Malaysia
21135                     Pakistan
23631                 Saudi Arabia
28322                      Ukraine
Name: country, dtype: object

In [206]:
df.loc[lambda df_: df_["date"].dt.year == "2024"].set_index(["country", "holiday"])

Unnamed: 0_level_0,Unnamed: 1_level_0,alpha2,date
country,holiday,Unnamed: 2_level_1,Unnamed: 3_level_1
