# Choose a Data Set

You can choose to analyze any data that you would like! Remember, you need 1000 rows of non-null data in order to get 5 points for the "Data" criteria of my [rubric](https://docs.google.com/document/d/1s3wllcF3LLnytxwD8mZ-BCypXKnfaahnizWGNojT-B4/edit?usp=sharing). Consider looking at [Kaggle](https://www.kaggle.com/datasets) or [free APIs](https://free-apis.github.io/#/browse) for datasets of this size. Alternatively, you can scrape the web to make your own dataset! :D

Once you have chosen your dataset, please read your data into a dataframe and call `.info()` below. If you don't call `info` I will give you 0 points for the first criteria described on the [rubric](https://docs.google.com/document/d/1s3wllcF3LLnytxwD8mZ-BCypXKnfaahnizWGNojT-B4/edit?usp=sharing).

In [43]:
import requests
import pandas as pd

# Base API URL
BASE_URL = "https://openholidaysapi.org"

# Function to fetch holidays (public or school)
def fetch_holidays(endpoint, country_code, valid_from, valid_to, language="EN"):
    url = f"{BASE_URL}/{endpoint}?countryIsoCode={country_code}&languageIsoCode={language}&validFrom={valid_from}&validTo={valid_to}"
    response = requests.get(url)
    if response.status_code == 200:
        try:
            return response.json()
        except Exception as e:
            print(f"Error parsing JSON for {endpoint} in {country_code}: {e}")
            return []
    else:
        print(f"Failed to fetch {endpoint} for {country_code}: {response.text}")
        return []

# List of supported countries
countries = [
    "AL", "AD", "AT", "BY", "BE", "BG", "HR", "CZ", "EE", "FR", "DE", "HU", "IE", "IT",
    "LV", "LI", "LT", "LU", "MT", "MD", "MC", "NL", "PL", "PT", "RO", "SM", "RS", "SK",
    "SI", "ES", "CH", "VA"
]

# Date range for holidays
valid_from = "2023-01-01"
valid_to = "2023-12-31"

# Initialize list to collect all holidays
all_holidays = []

# Fetch data for each country and holiday type
for country in countries:
    # Fetch Public Holidays
    public_holidays = fetch_holidays("PublicHolidays", country, valid_from, valid_to)
    for holiday in public_holidays:
        all_holidays.append({
            "country": country,
            "date": holiday.get("date", "N/A"),
            "name": holiday.get("name", "Unknown"),
            "type": "Public"
        })
    
    # Fetch School Holidays
    school_holidays = fetch_holidays("SchoolHolidays", country, valid_from, valid_to)
    for holiday in school_holidays:
        all_holidays.append({
            "country": country,
            "date": holiday.get("date", "N/A"),
            "name": holiday.get("name", "Unknown"),
            "type": "School"
        })

# Convert to DataFrame
df = pd.DataFrame(all_holidays)

# Clean data: remove rows with missing dates
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"])

# Save to CSV for reuse
df.to_csv("holidays_combined_data.csv", index=False)

# Summary of the dataset
print("Data collection complete.")
print(df.info())


Data collection complete.
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   country  0 non-null      object        
 1   date     0 non-null      datetime64[ns]
 2   name     0 non-null      object        
 3   type     0 non-null      object        
dtypes: datetime64[ns](1), object(3)
memory usage: 0.0+ bytes
None


In [13]:
# Load the dataset
df = pd.read_csv("debugged_holidays_data.csv")

# Quick overview of the data
print(df.info())
print(df.head())

# Filter valid data (optional, based on structure)
df = df[df["date"] != "N/A"]
df["date"] = pd.to_datetime(df["date"])
df["month"] = df["date"].dt.month

# Prepare data for analysis
holidays_by_country = df.groupby("country")["name"].count().reset_index(name="holiday_count")
holidays_by_type = df.groupby(["country", "type"])["name"].count().reset_index(name="type_count")
holidays_by_month = df.groupby("month")["name"].count().reset_index(name="monthly_count")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   country  652 non-null    object 
 1   date     0 non-null      float64
 2   name     652 non-null    object 
 3   type     652 non-null    object 
dtypes: float64(1), object(3)
memory usage: 20.5+ KB
None
  country  date                                               name    type
0      AL   NaN     [{'language': 'EN', 'text': "New Year's Day"}]  Public
1      AL   NaN  [{'language': 'EN', 'text': "New Year's Day Ho...  Public
2      AL   NaN         [{'language': 'EN', 'text': 'Summer Day'}]  Public
3      AL   NaN  [{'language': 'EN', 'text': 'Nowruz (Persian N...  Public
4      AL   NaN      [{'language': 'EN', 'text': 'Easter Sunday'}]  Public


In [31]:
import requests

# API base URL
BASE_URL = "https://openholidaysapi.org"

# Fetch list of countries
countries_url = f"{BASE_URL}/Countries"
response = requests.get(countries_url)
if response.status_code == 200:
    countries = response.json()
    country_codes = [country["isoCode"] for country in countries]
    print("Supported countries:", country_codes)
else:
    print("Failed to fetch countries:", response.status_code)


Supported countries: ['AD', 'AL', 'AT', 'BE', 'BG', 'BY', 'CH', 'CZ', 'DE', 'EE', 'ES', 'FR', 'HR', 'HU', 'IE', 'IT', 'LI', 'LT', 'LU', 'LV', 'MC', 'MD', 'MT', 'NL', 'PL', 'PT', 'RO', 'RS', 'SI', 'SK', 'SM', 'VA']


In [47]:
import requests
import pandas as pd

def fetch_holidays(endpoint, country_code, valid_from, valid_to, language="EN"):
    url = f"https://openholidaysapi.org/{endpoint}?countryIsoCode={country_code}&languageIsoCode={language}&validFrom={valid_from}&validTo={valid_to}"
    response = requests.get(url)
    
    # Debugging: Print the raw response
    print(f"Fetching {endpoint} for {country_code}: Status Code {response.status_code}")
    if response.status_code == 200:
        try:
            data = response.json()
            print(f"Response data for {country_code} from {endpoint}: {data[:5]}")  # Show a snippet of the data
            return data
        except Exception as e:
            print(f"Error parsing JSON for {country_code} in {endpoint}: {e}")
            return []
    else:
        print(f"Failed to fetch data for {country_code} from {endpoint}: {response.text}")
        return []

# Fetch holidays for a single country to test
test_country = "DE"  # Example: Germany
test_valid_from = "2023-01-01"
test_valid_to = "2023-12-31"

# Fetch and print both public and school holidays
public_holidays = fetch_holidays("PublicHolidays", test_country, test_valid_from, test_valid_to)
school_holidays = fetch_holidays("SchoolHolidays", test_country, test_valid_from, test_valid_to)

print(f"Public holidays: {public_holidays}")
print(f"School holidays: {school_holidays}")


Fetching PublicHolidays for DE: Status Code 200
Response data for DE from PublicHolidays: [{'id': '94498ba1-28e3-4dcd-b28f-d7aaf4cc56d7', 'startDate': '2023-01-01', 'endDate': '2023-01-01', 'type': 'Public', 'name': [{'language': 'EN', 'text': "New Year's Day"}], 'regionalScope': 'Regional', 'temporalScope': 'FullDay', 'nationwide': True}, {'id': '04f86fc2-424a-46e5-befd-2dce33158470', 'startDate': '2023-01-06', 'endDate': '2023-01-06', 'type': 'Public', 'name': [{'language': 'EN', 'text': 'Epiphany'}], 'regionalScope': 'Regional', 'temporalScope': 'FullDay', 'nationwide': False, 'subdivisions': [{'code': 'DE-BY', 'shortName': 'BY'}, {'code': 'DE-BW', 'shortName': 'BW'}, {'code': 'DE-ST', 'shortName': 'ST'}]}, {'id': '88848d07-197b-4b4f-91f6-3cb24a378384', 'startDate': '2023-03-08', 'endDate': '2023-03-08', 'type': 'Public', 'name': [{'language': 'EN', 'text': "International Women's Day"}], 'regionalScope': 'Regional', 'temporalScope': 'FullDay', 'nationwide': False, 'subdivisions': [{

In [49]:
import requests
import pandas as pd

# Fetch holidays function
def fetch_holidays(endpoint, country_code, valid_from, valid_to, language="EN"):
    url = f"https://openholidaysapi.org/{endpoint}?countryIsoCode={country_code}&languageIsoCode={language}&validFrom={valid_from}&validTo={valid_to}"
    response = requests.get(url)
    if response.status_code == 200:
        try:
            return response.json()
        except Exception as e:
            print(f"Error parsing JSON for {endpoint}: {e}")
            return []
    else:
        print(f"Failed to fetch {endpoint} for {country_code}: {response.status_code}")
        return []

# Parameters
country_code = "DE"  # Germany
valid_from = "2023-01-01"
valid_to = "2023-12-31"

# Fetch public and school holidays
public_holidays = fetch_holidays("PublicHolidays", country_code, valid_from, valid_to)
school_holidays = fetch_holidays("SchoolHolidays", country_code, valid_from, valid_to)

# Combine data
all_holidays = []

# Process public holidays
for holiday in public_holidays:
    all_holidays.append({
        "country": country_code,
        "date": holiday.get("startDate"),
        "name": holiday.get("name")[0]["text"] if holiday.get("name") else "Unknown",
        "type": "Public",
    })

# Process school holidays
for holiday in school_holidays:
    all_holidays.append({
        "country": country_code,
        "date": holiday.get("startDate"),
        "name": holiday.get("name")[0]["text"] if holiday.get("name") else "Unknown",
        "type": "School",
    })

# Create DataFrame
df = pd.DataFrame(all_holidays)

# Clean and format dates
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"])  # Remove rows with invalid dates

# Count total data points
print(f"Total data points: {len(df)}")

# Save to CSV
df.to_csv("combined_holidays.csv", index=False)

# Display summary
print(df.info())
print(df.head())


Total data points: 147
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   country  147 non-null    object        
 1   date     147 non-null    datetime64[ns]
 2   name     147 non-null    object        
 3   type     147 non-null    object        
dtypes: datetime64[ns](1), object(3)
memory usage: 4.7+ KB
None
  country       date                       name    type
0      DE 2023-01-01             New Year's Day  Public
1      DE 2023-01-06                   Epiphany  Public
2      DE 2023-03-08  International Women's Day  Public
3      DE 2023-04-07                Good Friday  Public
4      DE 2023-04-09              Easter Sunday  Public


# My Question

### How do the frequency and timing of public and school holidays vary across countries?

# My Analysis

### Analays type 1: Frequency Table (Number of Holidays by Country)

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Load the cleaned dataset
try:
    df = pd.read_csv("holidays_data_cleaned.csv")
except FileNotFoundError:
    raise FileNotFoundError("The file 'holidays_data_cleaned.csv' was not found. Ensure the file exists.")

# Ensure all necessary columns exist
required_columns = {"date", "type", "country"}
if not required_columns.issubset(df.columns):
    raise ValueError(f"Dataset does not have the required columns: {required_columns}.")

# Convert 'date' column to datetime and handle missing or invalid dates
df["date"] = pd.to_datetime(df["date"], errors="coerce")
if df["date"].isna().all():
    raise ValueError("All entries in the 'date' column are invalid or missing.")

# Drop rows with missing required data
df = df.dropna(subset=["date", "type", "country"])

# Analysis 1: Frequency Analysis - Count Holidays by Type
holiday_counts = df["type"].value_counts()
print("Holiday counts by type:")
print(holiday_counts)

# Bar chart for holiday types
plt.figure(figsize=(10, 6))
plt.bar(holiday_counts.index, holiday_counts.values, color="skyblue", edgecolor="black")
plt.title("Frequency of Holidays by Type")
plt.xlabel("Holiday Type")
plt.ylabel("Number of Holidays")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

# Analysis 2: Statistical Analysis - Mean and Median Number of Holidays per Country
holidays_per_country = df["country"].value_counts()
mean_holidays = holidays_per_country.mean()
median_holidays = holidays_per_country.median()

print(f"Mean holidays per country: {mean_holidays}")
print(f"Median holidays per country: {median_holidays}")

# Box plot to visualize the distribution of holidays across countries
plt.figure(figsize=(10, 6))
plt.boxplot(holidays_per_country, vert=False, patch_artist=True, boxprops=dict(facecolor="lightblue"))
plt.title("Distribution of Holidays Across Countries")
plt.xlabel("Number of Holidays")
plt.tight_layout()
plt.show()

# Analysis 3: Histogram - Distribution of Holidays Across Months
# Extract month from date
df["month"] = df["date"].dt.month
if df["month"].isna().all():
    print("No valid data available for 'month'. Histogram cannot be created.")
else:
    # Count holidays by month
    holidays_by_month = df["month"].value_counts().sort_index()

    # Histogram for holidays by month
    plt.figure(figsize=(10, 6))
    plt.bar(holidays_by_month.index, holidays_by_month.values, color="lightgreen", edgecolor="black")
    plt.title("Distribution of Holidays Across Months")
    plt.xlabel("Month")
    plt.ylabel("Number of Holidays")
    plt.xticks(range(1, 13))
    plt.tight_layout()
    plt.show()

# Analysis 4 (Bonus): Relative Frequency of Holidays by Type
total_holidays = len(df)
relative_frequency = holiday_counts / total_holidays

print("\nRelative frequency of holidays by type:")
print(relative_frequency)

# Pie chart for relative frequencies
plt.figure(figsize=(8, 8))
plt.pie(relative_frequency, labels=holiday_counts.index, autopct="%1.1f%%", colors=plt.cm.Paired.colors)
plt.title("Relative Frequency of Holidays by Type")
plt.tight_layout()
plt.show()


ValueError: All entries in the 'date' column are invalid or missing.

### Analysis 2: Probability Distribution (Holidays Across Months)

In [29]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("debugged_holidays_data.csv")

# Check the first few rows of the 'date' column
print("First 20 rows of 'date' column:")
print(df["date"].head(20))

# Check unique values in the 'date' column
print("\nUnique values in 'date' column:")
print(df["date"].unique())

# Convert 'date' column to datetime, coercing errors to NaT (invalid dates will be set to NaT)
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Check the number of NaN values after conversion
print("\nNumber of NaN values in 'date':", df["date"].isna().sum())

# Drop rows with NaN values in 'date'
df = df.dropna(subset=["date"])

# Extract the month from the 'date' column
df["month"] = df["date"].dt.month

# Check for any invalid months
print("\nValue counts for 'month':")
print(df["month"].value_counts())

# Ensure 'month' column contains valid integers (1-12)
df = df[(df["month"] >= 1) & (df["month"] <= 12)]

# Plot the histogram if valid data is available
if len(df) > 0:
    plt.hist(df["month"], bins=12, edgecolor="black")
    plt.title("Distribution of Holidays Across Months")
    plt.xlabel("Month")
    plt.ylabel("Number of Holidays")
    plt.xticks(range(1, 13))
    plt.show()
else:
    print("\nNo valid data available for 'month'. Histogram cannot be created.")


First 20 rows of 'date' column:
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
15   NaN
16   NaN
17   NaN
18   NaN
19   NaN
Name: date, dtype: float64

Unique values in 'date' column:
[nan]

Number of NaN values in 'date': 652

Value counts for 'month':
Series([], Name: count, dtype: int64)

No valid data available for 'month'. Histogram cannot be created.


# My Answer

### Write your answer here.