In [1]:
import requests
from bs4 import BeautifulSoup

# fetch wiki page
URL = "https://en.wikipedia.org/wiki/Federal_holidays_in_the_United_States"
# a timeout keeps the cell from hanging indefinitely on a stalled connection
page = requests.get(URL, timeout=30)
# stop here on a non-2xx response instead of parsing an error page further down
page.raise_for_status()

# build beautiful soup obj
html_page = BeautifulSoup(page.content, "html.parser")

# select holiday table: find() gives the first <table class="wikitable">, or None if there is none
holiday_table = html_page.find("table", class_="wikitable")
if holiday_table is None:
    raise ValueError(f"no table with class 'wikitable' found at {URL}")

In [2]:
# accumulator for the scraped values: one independent, initially empty list per column
dict_holidays = {column: [] for column in ("date", "name")}

In [3]:
# extract from table
# NOTE: this cell appends to dict_holidays, so running it twice without re-creating
# the dict first duplicates every row.
# The style filter leaves out rows carrying the shaded inline style.
for row in holiday_table.find_all("tr", attrs={"style": lambda s: s != "background:#efefef;"}):
    # the first <th> of the row holds the date, the first <td> the holiday name
    date_cell = row.find("th")
    name_cell = row.find("td")

    # find() returns None when the row has no such cell; skip those rows
    # instead of failing on `.text` with an AttributeError
    if date_cell is None or name_cell is None:
        continue

    # save the raw cell text (still contains newlines and parenthesised notes)
    dict_holidays["date"].append(date_cell.text)
    dict_holidays["name"].append(name_cell.text)

In [4]:
# build the holiday DataFrame: each key of dict_holidays becomes a column
import pandas as pd

df = pd.DataFrame(data=dict_holidays)

In [5]:
# display the freshly built frame (raw cell text, no cleanup applied yet)
df

Unnamed: 0,date,name
0,January 1(Fixed)\n,New Year's Day\n
1,January 15–21(Floating Monday)\n,"Birthday of Martin Luther King, Jr.\n"
2,February 15–21(Floating Monday)\n,Washington's Birthday\n
3,May 25–31(Floating Monday)\n,Memorial Day\n
4,June 19(Fixed)\n,Juneteenth National Independence Day\n
5,July 4(Fixed)\n,Independence Day\n
6,September 1–7(Floating Monday)\n,Labor Day\n
7,October 8–14(Floating Monday)\n,Columbus Day\n
8,November 11(Fixed)\n,Veterans Day\n
9,November 22–28(Floating Thursday)\n,Thanksgiving Day\n


In [6]:
# common preprocessing, applied to both text columns:
# drop every parenthesised note "(...)" and every newline
import re

pattern = r'\((.*?)\)|\n'

for column in ("date", "name"):
    df[column] = df[column].map(lambda text: re.sub(pattern, "", text))

In [7]:
# check: display df again to confirm the newlines and parenthesised notes are gone
df

Unnamed: 0,date,name
0,January 1,New Year's Day
1,January 15–21,"Birthday of Martin Luther King, Jr."
2,February 15–21,Washington's Birthday
3,May 25–31,Memorial Day
4,June 19,Juneteenth National Independence Day
5,July 4,Independence Day
6,September 1–7,Labor Day
7,October 8–14,Columbus Day
8,November 11,Veterans Day
9,November 22–28,Thanksgiving Day


In [8]:
# extract month, start day, duration

# Splits e.g. "January 15–21" into ["January", "15", "21"] on a space or an en dash.
# Compiled once, outside the loop, under its own name so it does not overwrite
# the cleanup `pattern` defined in the preprocessing cell.
date_split_pattern = re.compile(r'\ |–')

dict_month_day_duration = {
    "month": [],
    "start": [],
    "duration": []
}

# Iterate over the column values directly: df.loc[i, "date"] with i in range(len(df))
# only works while df still has the default 0..n-1 index.
for holiday_date in df["date"]:
    # first token is the month name, the remaining one or two tokens are day numbers
    month, *day = date_split_pattern.split(holiday_date)

    # get month num (the year is only there to make the string parseable)
    month_num = pd.Timestamp(f"{month} 2000").month

    # get start / end day; a single-day entry yields start == end, hence duration 1
    start, end = int(day[0]), int(day[-1])

    # append only after everything parsed, so an error part-way through a row
    # cannot leave the three lists with different lengths
    dict_month_day_duration["month"].append(month_num)
    dict_month_day_duration["start"].append(start)
    dict_month_day_duration["duration"].append(end - start + 1)

In [9]:
# inspect the collected lists; the loop appends one value per list for each row of df
dict_month_day_duration

{'month': [1, 1, 2, 5, 6, 7, 9, 10, 11, 11, 12],
 'start': [1, 15, 15, 25, 19, 4, 1, 8, 11, 22, 25],
 'duration': [1, 7, 7, 7, 1, 1, 7, 7, 1, 7, 1]}

In [15]:
# arrange in df and concat

# Every list must hold exactly one entry per row of df. A mismatch means the lists
# were built from a different version of df (e.g. cells run out of order), and a
# column-wise concat would then silently pad the shorter side with NaN.
list_lengths = {key: len(values) for key, values in dict_month_day_duration.items()}
if any(length != len(df) for length in list_lengths.values()):
    raise ValueError(
        f"dict_month_day_duration lengths {list_lengths} do not match "
        f"the {len(df)} rows of df; re-run the extraction cell"
    )

# reuse df's index so the concat pairs rows by position whatever labels df carries
df_extra_info = pd.DataFrame(dict_month_day_duration, index=df.index)

df_holidays = pd.concat([df, df_extra_info], axis=1)

In [16]:
# display the combined frame: the columns of df followed by month, start and duration
df_holidays

Unnamed: 0,date,name,month,start,duration
0,January 1,New Year's Day,1,1,1
1,January 15–21,"Birthday of Martin Luther King, Jr.",1,15,7
2,February 15–21,Washington's Birthday,2,15,7
3,May 25–31,Memorial Day,5,25,7
4,June 19,Juneteenth National Independence Day,6,19,1
5,July 4,Independence Day,7,4,1
6,September 1–7,Labor Day,9,1,7
7,October 8–14,Columbus Day,10,8,7
8,November 11,Veterans Day,11,11,1
9,November 22–28,Thanksgiving Day,11,22,7


In [17]:
# write the combined table to CSV, leaving the row index out of the file
df_holidays.to_csv(path_or_buf="../data/holidays.csv", index=False)

In [10]:
# scratch check of starred unpacking: the first item goes to `a`,
# everything after it is collected into the list `b`
(a, *b) = (1, 2)
b

[2]

In [11]:
import re

# scratch check of the cleanup regex on a sample sentence
s = "NNights are (not) the best (worst) team.\n"

# removes each "(...)" group and the trailing newline
cleanup_regex = re.compile(r'\((.*?)\)|\n')
s_re = cleanup_regex.sub("", s)

# the spaces on both sides of a removed group stay, so double spaces remain in the result
s_re

'NNights are  the best  team.'