# Healthcare Graphs Analysis
This Jupyter Notebook retrieves, processes, and visualises life expectancy, healthcare costs, and gross income over the years in Singapore, using data from:
- [**Life Expectancy dataset**](https://tablebuilder.singstat.gov.sg/table/TS/M810501#!)
- [**CPI dataset**](https://data.gov.sg/datasets/d_de7e93a1d0e22c790516a632747bf7f0/view?dataExplorerPage=9)
- [**Gross Income dataset**](https://data.gov.sg/datasets/d_52760e82e8786bac11cca40eb29d1a93/view)
## Libraries

In [None]:
from typing import List, Optional

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import pandas as pd
import requests
import seaborn as sns

# Base URLs of the two public APIs queried by the retrieval functions below.
# data.gov.sg: the dataset is selected through the `resource_id` query parameter.
BASE_DATA_GOV_URL = "https://data.gov.sg/api/action/datastore_search"
# SingStat Table Builder: the table ID is appended to this path.
BASE_SINGSTAT_URL = "https://tablebuilder.singstat.gov.sg/api/table/tabledata"

## Data Retrieval Functions 

In [None]:
# function to fetch data from data.gov.sg 
def fetch_datagov_dataset(dataset_id: str, limit: int = 10_000_000) -> pd.DataFrame:
    """
    Fetch a dataset from the data.gov.sg datastore search API.

    Args:
        dataset_id (str): The dataset (resource) ID from data.gov.sg.
        limit (int, optional): Maximum number of records to request. Defaults to 10 million.

    Returns:
        pd.DataFrame: One row per record returned by the API.

    Raises:
        RuntimeError: If the API response does not report success.
    """
    response = requests.get(
        BASE_DATA_GOV_URL,
        params={"resource_id": dataset_id, "limit": limit},
        # Without a timeout, a stalled connection would block the notebook forever.
        timeout=60,
    )
    data = response.json()

    # Raise a builtin exception: `Error` is not a defined name, so using it
    # would hide the real failure message behind a NameError.
    if not data.get("success"):
        raise RuntimeError(f"Failed to fetch dataset ({dataset_id}).")

    return pd.DataFrame(data["result"]["records"])

In [None]:
def parse_singstat_2d_data(raw_data: dict) -> pd.DataFrame:
    """
    Parse the JSON data from the SingStat API into a structured DataFrame.

    Each entry of ``raw_data["Data"]["row"]`` becomes one DataFrame row. Every
    item of the entry's ``columns`` list contributes a column named after the
    item's ``key`` and holding its ``value``; the entry's ``rowText`` is
    stored in a ``Series`` column.

    ``Series`` is deliberately kept as a regular column (not the index)
    because later cells filter on ``df["Series"]``.

    Args:
        raw_data (dict): Raw JSON data from the SingStat API.

    Returns:
        pd.DataFrame: One row per series, one column per key, plus a
        ``Series`` column. The columns axis is named "Year". When the payload
        contains no rows, an empty DataFrame with only the ``Series`` column
        is returned.
    """
    rows = raw_data.get("Data", {}).get("row", [])
    records = [
        {
            **{col["key"]: col["value"] for col in row.get("columns", [])},
            "Series": row.get("rowText", ""),
        }
        for row in rows
    ]

    # Guarantee the "Series" column even for an empty payload so callers that
    # filter on it do not fail with a KeyError.
    df = pd.DataFrame(records) if records else pd.DataFrame(columns=["Series"])
    df.columns.name = "Year"
    return df

def fetch_singstat_dataset(dataset_id: str, limit: int = 10_000_000) -> pd.DataFrame:
    """
    Fetch a table from the SingStat Table Builder API and parse it.

    Args:
        dataset_id (str): The dataset (table) ID from SingStat.
        limit (int, optional): Number of records to fetch. Defaults to 10 million.

    Returns:
        pd.DataFrame: The table as parsed by ``parse_singstat_2d_data``.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """
    response = requests.get(
        f"{BASE_SINGSTAT_URL}/{dataset_id}",
        params={"limit": limit},
        headers={
            "Accept": "application/json",
            # Need to fake the user agent because SingStat blocks `python-requests`.
            "User-Agent": "curl/8.11.1",
        },
        # Without a timeout, a stalled connection would block the notebook forever.
        timeout=60,
    )
    # Fail on HTTP errors here instead of with a confusing JSON decoding error
    # (or a silently empty table) further down.
    response.raise_for_status()
    return parse_singstat_2d_data(response.json())

## Fetch Data

In [None]:
# Fetch the Consumer Price Index (CPI) dataset from data.gov.sg.
datagov_dataset_id = "d_de7e93a1d0e22c790516a632747bf7f0"
cpi_df = fetch_datagov_dataset(datagov_dataset_id)

# Fetch the Life Expectancy table from SingStat.
sing_stat_dataset_id = "M810501"
lx_df = fetch_singstat_dataset(sing_stat_dataset_id)

# Fetch the gross income dataset from data.gov.sg.
income_dataset_id = "d_52760e82e8786bac11cca40eb29d1a93"
income_df = fetch_datagov_dataset(income_dataset_id)

## Data Exploration CPI

In [None]:
# Preview the first rows of the CPI data to check its layout.
cpi_df.head()

In [None]:
# Summary statistics for each CPI column.
cpi_df.describe()

In [None]:
# Count missing values per CPI column.
cpi_df.isnull().sum()

In [None]:
# Column dtypes of the CPI data (shows whether values need numeric conversion).
cpi_df.dtypes

## Data Exploration Life Expectancy

In [None]:
# Preview the first rows of the life expectancy data to check its layout.
lx_df.head()

In [None]:
# Summary statistics for each life expectancy column.
lx_df.describe()

In [None]:
# Count missing values per life expectancy column.
lx_df.isnull().sum()

In [None]:
# Column dtypes of the life expectancy data.
lx_df.dtypes

## Data Exploration Income

In [None]:
# Preview the first rows of the income data to check its layout.
income_df.head()

In [None]:
# Summary statistics for each income column.
income_df.describe()

In [None]:
# Count missing values per income column (this section explores the income
# data, so inspect `income_df` rather than the life expectancy frame).
income_df.isnull().sum()

In [None]:
# Column dtypes of the income data (this section explores the income data,
# so inspect `income_df` rather than the life expectancy frame).
income_df.dtypes

## Life Expectancy vs Healthcare Cost
## Data Cleaning & Processing

In [None]:
def get_yearly_avg(cpi_df: pd.DataFrame, y_range: int = 11, current_year: int = 2023) -> pd.DataFrame:
    """
    Calculate yearly average healthcare costs from monthly CPI data.

    Monthly columns are recognised by their name starting with the year
    (for example every column starting with "2023" belongs to 2023). Years
    without any matching column are left out of the result.

    The input DataFrame is not modified; numeric conversion is done on a copy
    of the month columns.

    Args:
        cpi_df (pd.DataFrame): CPI dataset with one column per month.
        y_range (int, optional): Number of years to consider, counting
            backwards from ``current_year``. Defaults to 11.
        current_year (int, optional): Most recent year to include. Defaults to 2023.

    Returns:
        pd.DataFrame: One column per year (most recent first) holding the mean
        of that year's monthly values for each input row. Missing monthly
        values are ignored in the mean.

    Raises:
        ValueError: If a monthly value cannot be parsed as a number.
    """
    yearly_avg = {}
    for offset in range(y_range):
        year = str(current_year - offset)
        month_cols = [col for col in cpi_df.columns if col.startswith(year)]
        if not month_cols:
            continue

        # Convert on a temporary frame so the caller's data keeps its dtypes.
        monthly = cpi_df[month_cols].apply(pd.to_numeric)
        # `mean` divides by the number of months actually present in each row,
        # so a missing month does not drag the average down.
        yearly_avg[year] = monthly.mean(axis=1)

    return pd.DataFrame(yearly_avg)


In [None]:
def format_life_expectancy(lx_df: pd.DataFrame, series_name: str, y_range: int = 11) -> pd.DataFrame:
    """
    Extract one series from the Life Expectancy dataset for recent years.

    Args:
        lx_df (pd.DataFrame): Life Expectancy dataset with a "Series" column
            and one column per year.
        series_name (str): Exact value of "Series" identifying the rows to keep.
        y_range (int, optional): Number of years to consider, counting
            backwards from 2023. Defaults to 11.

    Returns:
        pd.DataFrame: The matching rows without the "Series" column, limited
        to the requested year columns (most recent first).
    """
    wanted_years = [str(year) for year in range(2023, 2023 - y_range, -1)]

    is_requested_series = lx_df["Series"] == series_name
    selected = lx_df.loc[is_requested_series]

    without_label = selected.drop(columns=["Series"])
    return without_label[wanted_years]

In [None]:
# Keep only the CPI rows whose `DataSeries` label contains "Health Care".
# `reset_index(drop=True)` renumbers the remaining rows from 0.
health_care_df = cpi_df[cpi_df['DataSeries'].str.contains("Health Care")].reset_index(drop=True)

# Average the monthly values per year over a 22-year window.
hc_yearly_df = get_yearly_avg(health_care_df, y_range=22)

In [None]:
# Extract the total life expectancy series for the same 22-year window
# used for the healthcare averages.
series_name = "Total Life Expectancy At Birth (Residents)"
lx_yearly_df = format_life_expectancy(lx_df, series_name, y_range=22)

In [None]:
# Stack the life expectancy row on top of the healthcare average row.
lx_yearly_df = lx_yearly_df.reset_index(drop=True)
merged_df = pd.concat([lx_yearly_df, hc_yearly_df])
# Exactly two labels are assigned, so this requires each input to hold a single row.
merged_df.index = ["Life Expectancy", "Healthcare Average"]

In [None]:
# Transpose so each year becomes a row, and move the year labels into a
# regular "Year" column for plotting.
merged_df = merged_df.T.reset_index().rename(columns={"index": "Year"})

In [None]:
# Convert every column, including "Year", to a numeric dtype so the year
# filter and the plots below operate on numbers.
merged_df = merged_df.apply(pd.to_numeric)

In [None]:
# Keep 2014 onwards, then display the result.
merged_df = merged_df[merged_df["Year"] >= 2014]
merged_df

## Visualisation

In [None]:
# Dual-axis line chart: life expectancy on the left y-axis and the healthcare
# cost average on the right y-axis, sharing one x-axis of years.
fig1, ax1 = plt.subplots(figsize=(10, 5))

# First line: life expectancy (left axis).
# `legend=False` suppresses the per-axes legend; a combined one is built below.
sns.lineplot(data=merged_df, x="Year", y="Life Expectancy", marker="o", ax=ax1, color="blue", label="Life Expectancy", legend=False)  
ax1.set_ylabel("Life Expectancy", color="blue")
ax1.tick_params(axis='y', labelcolor="blue")

# Second line: healthcare cost average, drawn on a twin axes that shares the x-axis.
ax2 = ax1.twinx()
sns.lineplot(data=merged_df, x="Year", y="Healthcare Average", marker='s', ax=ax2, color="red", label="Healthcare  Cost Average", legend=False)
ax2.set_ylabel(" Healthcare Cost Average", color="red")
ax2.tick_params(axis='y', labelcolor="red")

# Legend & labels: merge the handles of both axes into a single legend.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc=0)

# Title and layout.
ax1.set_xlabel("Year")
ax1.set_title("Life Expectancy vs. Healthcare Cost Average (Dual-Axis line chart)")
# Confine the layout to the left 85% of the figure width.
fig1.tight_layout(rect=[0, 0, 0.85, 1])

# Make x-axis ticks use a step size of 1 (one tick per year).
ax1.xaxis.set_major_locator(plticker.MultipleLocator(base=1.0))

# Show plot
plt.show()

## Healthcare Cost vs Income Growth
## Data Cleaning & Processing

In [None]:
def process_income(income_df: pd.DataFrame, row_name: str, year_range: int = 11, skip_years: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Extract a single income series for recent years.

    The row is selected by matching ``row_name`` against the whitespace-stripped
    ``DataSeries`` labels. The input DataFrame is not modified.

    Args:
        income_df (pd.DataFrame): Income dataset with a "DataSeries" column
            and one column per year.
        row_name (str): Pattern identifying the row to keep. It is passed to
            ``Series.str.contains``, so it is treated as a regular expression.
        year_range (int, optional): Number of years to consider, counting
            backwards from 2024. Defaults to 11.
        skip_years (List[str], optional): Years to leave out. Defaults to none.

    Returns:
        pd.DataFrame: A single row labelled "Income" with one column per
        selected year (most recent first). Years missing from the dataset are
        silently omitted.

    Raises:
        ValueError: If ``row_name`` does not match exactly one row.
    """
    skipped = {str(year) for year in (skip_years or [])}

    # Match on a stripped copy of the labels; rows without a label never match.
    labels = income_df["DataSeries"].str.strip()
    gross_income_df = income_df[labels.str.contains(row_name, na=False)].reset_index(drop=True)
    gross_income_df = gross_income_df.drop(columns=["_id"], errors="ignore")

    # The result is labelled with a single "Income" row, so anything other
    # than one match is an error; report it explicitly.
    if len(gross_income_df) != 1:
        raise ValueError(
            f"Expected exactly one income row matching {row_name!r}, "
            f"found {len(gross_income_df)}."
        )

    current_year = 2024
    available_years = set(map(str, income_df.columns))
    candidate_years = (str(current_year - offset) for offset in range(year_range))
    y_range = [
        year
        for year in candidate_years
        if year in available_years and year not in skipped
    ]

    gross_income_df.index = ["Income"]

    return gross_income_df[y_range]

In [None]:
def process_healthcare(cpi_df: pd.DataFrame, row_lists: List[str], year_range: int = 11, skip_years: List[str] = []) -> pd.DataFrame: 
    cpi_df["DataSeries"] = cpi_df["DataSeries"].str.strip()
    hc_df = cpi_df[cpi_df["DataSeries"].isin(row_lists)].reset_index(drop=True)
    hc_df = hc_df.drop(columns=["_id"], errors="ignore")

    current_year = 2024
    # y_range = [str(current_year-x) for x in range(year_range)]
    available_years = {col[:4] for col in cpi_df.columns if col[:4].isdigit()}
    y_range = [
        str(current_year - x)
        for x in range(year_range)
        if str(current_year - x) in available_years and str(current_year - x) not in map(str, skip_years)
    ]

    yearly_avg = {}
    row_names  = []
    for index, row in hc_df.iterrows():
        row_name = row["DataSeries"]
        row_names.append(row_name)

        yearly_avg[row_name] = {}


        for year in y_range: 
            month_col = [col for col in hc_df.columns if col.startswith(year)]
            if month_col: 
                row_data = pd.to_numeric(row[month_col], errors="coerce").dropna()

                if not row_data.empty: 
                    yearly_avg[row_name][year] = row_data.mean()
                else:
                    yearly_avg[row_name][year] = None

    return pd.DataFrame(yearly_avg).T

In [None]:
# Select the income row whose label contains "Median" over a 22-year window,
# leaving out 2005.
# NOTE(review): the reason for skipping 2005 is not visible here — presumably
# a gap in the source data; confirm.
process_income_df = process_income(income_df, "Median", year_range=22, skip_years=["2005"])
process_income_df

In [None]:
# CPI series to average per year; labels must match `DataSeries` exactly
# (after stripping whitespace).
health_care_col = [
    "Medicines & Health Products",
    "Outpatient Services",
    "Hospital Services",
    "Health Insurance"
]
# Same window and skipped year as the income data so both tables line up.
process_healthcare_df = process_healthcare(cpi_df=cpi_df, row_lists=health_care_col, year_range=22, skip_years=["2005"])
process_healthcare_df

### Merge income and healthcare

In [None]:
# Stack the income row on top of the healthcare rows, flip the table so each
# year is a row, and reverse it (both inputs list the most recent year first).
income_healthcare_df = (
    pd.concat([process_income_df, process_healthcare_df])
    .transpose()
    .iloc[::-1]
    # The values arrive as strings; anything unparseable becomes NaN.
    .apply(pd.to_numeric, errors="coerce")
)

# Turn the year labels into integers so they can be compared numerically.
income_healthcare_df.index = income_healthcare_df.index.astype(int)

# Keep 2014 onwards.
income_healthcare_df = income_healthcare_df.loc[income_healthcare_df.index >= 2014]

income_healthcare_df

## Visualisation

In [None]:
fig2, ax3 = plt.subplots(figsize=(12,6))

# Stacked bar chart of the healthcare cost indices, one bar per year.
income_healthcare_df[["Medicines & Health Products", "Outpatient Services", "Hospital Services", "Health Insurance"]].plot(kind="bar", stacked=True, alpha=0.5, ax=ax3, cmap="coolwarm")
ax3.set_ylabel("Healthcare Cost Index")

# Label and title.
ax3.set_xlabel("Year")
ax3.set_title("Healthcare Costs vs Wage Growth Over Time")

# Place the legend outside the plot area, to the right.
ax3.legend(title="Healthcare Cost Categories", bbox_to_anchor=(1.05, 1), loc="upper left")


# Line chart for income on a twin axes sharing the x-axis.
# `ax3.get_xticks()` must be specified as the x values because the bar chart
# positions its bars at 0..n-1 rather than at the year values.
# See: https://stackoverflow.com/questions/49894161/line-doesnt-show-over-barplot
ax4 = ax3.twinx()
sns.lineplot(x=ax3.get_xticks(), y=income_healthcare_df["Income"], marker="o", color="black", linewidth=2, ax=ax4)
ax4.set_ylabel("Income", color="black")

# Rotate the x-axis labels on the bar axes explicitly. `plt.xticks` acts on
# the current axes, which after `twinx()` is the twin whose x-axis is hidden,
# so it would leave the visible labels untouched.
ax3.tick_params(axis="x", labelrotation=45)

plt.show()

### Percentage Change: Healthcare Costs vs Wage Growth

In [None]:
# Overall "Health Care" CPI series only.
row_list = ["Health Care"]

# Yearly averages with one row per year, reversed because
# `process_healthcare` lists the most recent year first.
healthcare_df = process_healthcare(
    cpi_df=cpi_df,
    row_lists=row_list,
    year_range=22,
    skip_years=["2005"],
).transpose()[::-1]

# Year-over-year change, expressed in percent.
healthcare_df["Health Care % Change"] = healthcare_df["Health Care"].pct_change().mul(100)

# Name the index "Year" and make its labels numeric.
healthcare_df.index = pd.to_numeric(healthcare_df.index.set_names("Year"))
healthcare_df

In [None]:
# Reshape the single-row income table into long form: one row per year with
# numeric "Year" and "Income" columns, ordered by year.
temp_income_df = (
    process_income_df.stack()
    .reset_index()
    .drop("level_0", axis=1)
    .rename({"level_1": "Year", 0: "Income"}, axis=1)
    .apply(pd.to_numeric)
    .sort_values("Year")
)

# Year-over-year change, expressed in percent.
temp_income_df["Income % Change"] = temp_income_df["Income"].pct_change() * 100
temp_income_df = temp_income_df.set_index("Year")
temp_income_df

In [None]:
# Align the healthcare and income tables on their year index, keeping years
# that appear in either table.
healthcare_income_percentage_df = pd.merge(healthcare_df, temp_income_df, left_index=True, right_index=True, how="outer")

# Convert year to integer.
healthcare_income_percentage_df.index = healthcare_income_percentage_df.index.astype(int)

# Drop 2003 if present. The index holds integer years, so the label must be
# the integer 2003; the string "2003" would never match.
# NOTE(review): presumably dropped because it is the first year of the window
# and has no percentage change — confirm.
healthcare_income_percentage_df = healthcare_income_percentage_df.drop(2003, errors="ignore")

# Set index name to "Year".
healthcare_income_percentage_df.index = healthcare_income_percentage_df.index.set_names(["Year"])

# Filter to 2014 and beyond.
healthcare_income_percentage_df = healthcare_income_percentage_df[
    healthcare_income_percentage_df.index >= 2014
]
healthcare_income_percentage_df

In [None]:
# Unpivot to long format in preparation for the category plot: one row per
# (Year, Category) pair with the percentage change as the value.
healthcare_income_percentage_df = healthcare_income_percentage_df.reset_index().melt(
    id_vars="Year",
    value_vars=["Health Care % Change", "Income % Change"],
    var_name="Category",
    value_name="% Change"
)
# Shorten the category names; they are used as legend labels in the plot.
healthcare_income_percentage_df = healthcare_income_percentage_df.replace("Health Care % Change", "Healthcare")
healthcare_income_percentage_df = healthcare_income_percentage_df.replace("Income % Change", "Income")
healthcare_income_percentage_df

### Visualisation 

In [None]:
# Grouped bar chart: for each year, one bar per category showing its
# year-over-year percentage change.
sns.catplot(
    data=healthcare_income_percentage_df,
    x="Year",
    y="% Change",
    hue="Category",
    kind="bar",
    height=5,
    aspect=2.5,
)

## Breakdown of Healthcare CPI 

In [None]:
# Healthcare CPI series to break down; labels must match `DataSeries` exactly
# (after stripping whitespace).
healthcare_cols = [
    "Medicines & Health Products",
    "Medicines & Vitamins",
    "Medical Products",
    "Outpatient Services",
    "Fees At Polyclinics",
    "Fees At GP Clinics",
    "Fees At Specialist Clinics",
    "Dental Services",
    "Paramedical Services",
    "Hospital Services",
    "Health Insurance"
]

all_healthcare_df = process_healthcare(cpi_df=cpi_df, row_lists=healthcare_cols, year_range=22)
# `fillna(method="ffill")` is deprecated in recent pandas releases; `ffill()`
# is the direct equivalent.
# NOTE(review): at this point rows are series and columns are years, so the
# fill runs down the rows — a missing value is taken from the previous series
# for the same year, not from an adjacent year. Confirm this is intended.
all_healthcare_df = all_healthcare_df.ffill()
# One row per year, reversed because `process_healthcare` lists the most
# recent year first.
all_healthcare_df = all_healthcare_df.transpose()
all_healthcare_df = all_healthcare_df[::-1]

# Convert year index to integer.
all_healthcare_df.index = all_healthcare_df.index.astype(int)

# Filter to 2014 and beyond.
all_healthcare_df = all_healthcare_df[all_healthcare_df.index >= 2014]

all_healthcare_df

### Visualisation 

In [None]:
fig4, ax7 = plt.subplots(figsize=(12,6))

# Stacked bar chart: one bar per year, one segment per healthcare CPI series.
all_healthcare_df.plot(kind="bar", stacked=True, ax=ax7, cmap="coolwarm", alpha=0.75)

ax7.set_xlabel("Year")
ax7.set_ylabel("CPI Value")
ax7.set_title("Breakdown of Healthcare CPI Components Over Time")

# Place the legend outside the plot area, to the right.
ax7.legend(title="Healthcare Components", bbox_to_anchor=(1.05, 1), loc="upper left")

# Rotate the x-axis labels of the current axes (ax7, the only axes in this figure).
plt.xticks(rotation=45)

plt.show()