# Necessity Graphs 
## Datasets
- [General CPI](https://data.gov.sg/datasets/d_ba8a05c8908b5e1dc13540286d585f8a/view)
- [Gross Income](https://data.gov.sg/datasets/d_52760e82e8786bac11cca40eb29d1a93/view)

In [None]:
import re
from typing import List, Optional

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

## Global Variables

In [None]:
# Endpoint of the data.gov.sg datastore search API; the dataset to read is
# selected through query parameters rather than the URL path.
BASE_DATA_GOV_URL = "https://data.gov.sg/api/action/datastore_search"

## Retrieve Data

In [None]:
# function to fetch data from data.gov.sg 
def fetch_datagov_dataset(dataset_id: str, limit: int = 10_000_000) -> pd.DataFrame:
    """
    Fetch a dataset from the data.gov.sg datastore search API.

    Args:
        dataset_id (str): The dataset (resource) ID from data.gov.sg.
        limit (int, optional): Maximum number of records to request. Defaults to 10 million.

    Returns:
        pd.DataFrame: One row per record returned by the API.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        RuntimeError: If the response body does not report success.
    """
    response = requests.get(BASE_DATA_GOV_URL, params={"resource_id": dataset_id, "limit": limit})
    # Surface HTTP-level failures before attempting to decode the body.
    response.raise_for_status()
    data = response.json()

    # `success` is the API's own status flag; a missing flag is treated as a failure too.
    if not data.get("success"):
        raise RuntimeError(f"Failed to fetch dataset ({dataset_id}).")

    return pd.DataFrame(data["result"]["records"])

In [None]:
# Fetch Consumer Price Index (CPI) dataset from data.gov.sg
# NOTE(review): this resource ID differs from the one in the "General CPI" link
# in the Datasets section at the top of the notebook — confirm which is intended.
datagov_dataset_id = "d_de7e93a1d0e22c790516a632747bf7f0"
cpi_df = fetch_datagov_dataset(datagov_dataset_id)

# Fetch Gross Income dataset from data.gov.sg
# (same resource ID as the "Gross Income" link in the Datasets section)
income_dataset_id = "d_52760e82e8786bac11cca40eb29d1a93"
gross_income_df = fetch_datagov_dataset(income_dataset_id)

## Data Exploration CPI

In [None]:
# Preview the first rows of the raw CPI frame.
cpi_df.head()

In [None]:
# Summary statistics for each CPI column.
cpi_df.describe()

In [None]:
# Count missing values per CPI column.
cpi_df.isnull().sum()

In [None]:
# Inspect the dtype pandas assigned to each CPI column.
cpi_df.dtypes

In [None]:
# List every series label available in the CPI data (used to pick rows later).
cpi_df["DataSeries"].to_list()

## Data Exploration Gross Income

In [None]:
# Preview the first rows of the raw gross income frame.
gross_income_df.head()

In [None]:
# Summary statistics for each gross income column.
gross_income_df.describe()

In [None]:
# Count missing values per gross income column.
gross_income_df.isnull().sum()

In [None]:
# Inspect the dtype pandas assigned to each gross income column.
gross_income_df.dtypes

## Data Cleaning & Processing

In [None]:
# Show the CPI rows whose series label contains "Health Care".
cpi_df[cpi_df["DataSeries"].str.contains("Health Care")]

## Rows to use 
- Food Excl Food Serving Services
- Transport
- Housing & Utilities
- Telecommunication Services
- Health Care

In [None]:
def process_cpi(
    cpi_df: pd.DataFrame,
    row_list: List[str],
    year_range: int = 11,
    skip_years: Optional[List[str]] = None,
    current_year: int = 2024,
) -> pd.DataFrame:
    """
    Compute yearly average CPI values for selected data series.

    Columns whose first four characters are digits are treated as period
    columns belonging to that year; all period columns of a year are averaged.

    Args:
        cpi_df (pd.DataFrame): CPI data with a "DataSeries" label column and
            period columns. The frame is not modified.
        row_list (List[str]): Series labels to keep (matched after stripping whitespace).
        year_range (int, optional): Number of years to look back from
            ``current_year``, inclusive of it. Defaults to 11.
        skip_years (List[str], optional): Years to leave out; values are
            compared as strings. Defaults to no skipped years.
        current_year (int, optional): Most recent year to include. Defaults to 2024.

    Returns:
        pd.DataFrame: One row per selected series and one column per year,
        holding the mean of that year's numeric values (missing when the year
        has no numeric values for the series).
    """
    skipped = {str(year) for year in (skip_years or [])}

    # Strip labels on a separate Series so the caller's frame is left untouched.
    series_names = cpi_df["DataSeries"].str.strip()
    mask = series_names.isin(row_list)
    selected = cpi_df[mask].drop(columns=["_id"], errors="ignore").copy()
    selected["DataSeries"] = series_names[mask].to_numpy()
    selected = selected.reset_index(drop=True)

    # Group the period columns by their year prefix once, instead of
    # rescanning every column for each row and year.
    year_columns = {}
    for col in selected.columns:
        prefix = col[:4]
        if prefix.isdigit():
            year_columns.setdefault(prefix, []).append(col)

    years = [
        year
        for year in (str(current_year - offset) for offset in range(year_range))
        if year in year_columns and year not in skipped
    ]

    yearly_avg = {}
    for _, row in selected.iterrows():
        averages = {}
        for year in years:
            # Non-numeric placeholders are coerced to NaN and ignored.
            values = pd.to_numeric(row[year_columns[year]], errors="coerce").dropna()
            averages[year] = values.mean() if not values.empty else None
        yearly_avg[row["DataSeries"]] = averages

    return pd.DataFrame(yearly_avg).T

In [None]:
def process_income(
    income_df: pd.DataFrame,
    row_name: str,
    year_range: int = 11,
    skip_years: Optional[list[str]] = None,
    current_year: int = 2024,
) -> pd.DataFrame:
    """
    Extract a single income series for the most recent years.

    Args:
        income_df (pd.DataFrame): Income data with a "DataSeries" label column
            and one column per year (column names are year strings). The frame
            is not modified.
        row_name (str): Pattern identifying the series; it must match exactly
            one "DataSeries" label.
        year_range (int, optional): Number of years to look back from
            ``current_year``, inclusive of it. Defaults to 11.
        skip_years (list[str], optional): Years to leave out; values are
            compared as strings. Defaults to no skipped years.
        current_year (int, optional): Most recent year to include. Defaults to 2024.

    Returns:
        pd.DataFrame: A single row labelled "Income" with one column per
        selected year, most recent year first.

    Raises:
        ValueError: If ``row_name`` does not match exactly one series.
    """
    skipped = {str(year) for year in (skip_years or [])}

    # Match against stripped labels; rows with a missing label never match.
    series_names = income_df["DataSeries"].str.strip()
    matches = income_df[series_names.str.contains(row_name, na=False)]
    if len(matches) != 1:
        # The result is labelled as one "Income" row, so the match must be unique.
        raise ValueError(
            f"Expected exactly one DataSeries row matching {row_name!r}, found {len(matches)}."
        )
    matches = matches.drop(columns=["_id"], errors="ignore")

    available_years = set(map(str, income_df.columns))
    years = [
        year
        for year in (str(current_year - offset) for offset in range(year_range))
        if year in available_years and year not in skipped
    ]

    result = matches[years].copy()
    result.index = ["Income"]
    return result

In [None]:
# CPI series treated as basic necessities in this analysis.
n_list = [
    "Food Excl Food Serving Services",
    "Transport",
    "Housing & Utilities",
    "Telecommunication Services",
    "Health Care",
]

# Yearly averages for the selected series: one row per series, one column per year.
necessity_df = process_cpi(cpi_df, n_list)
necessity_df

In [None]:
# Pull the "Median" income series and flip it so that years become the row index.
income_df = process_income(gross_income_df, "Median").T
income_df

In [None]:
# NOTE(review): a "Total Necessity" aggregate (mean across the necessity series)
# was sketched here but is disabled; the affordability section further down
# refers to a column of that name.
#
# Flip to years-as-rows so the frame can be joined with income on the year index.
# Re-running this cell flips the frame back, so run it exactly once.
necessity_df = necessity_df.T
necessity_df

In [None]:
# Join CPI components and income on their shared index; the outer join keeps
# index labels that appear in only one of the two frames. Every column is then
# coerced to numeric, with unparseable values becoming NaN.
necessity_income_df = (
    necessity_df
    .merge(income_df, left_index=True, right_index=True, how="outer")
    .apply(pd.to_numeric, errors="coerce")
)
necessity_income_df

# convert year to integer
# necessity_income_df.index = necessity_income_df.index.astype(int)

# set index name to year 
# necessity_income_df.index = necessity_income_df.index.set_names(["Year"])

# necessity_income_df["Income"] = pd.to_numeric(necessity_income_df["Income"], errors="coerce")

# necessity_income_df

## Visualisation

In [None]:
fig, ax1 = plt.subplots(figsize=(12, 6))

# Left y-axis: one line per CPI component.
cpi_components = [
    "Food Excl Food Serving Services",
    "Housing & Utilities",
    "Health Care",
    "Transport",
    "Telecommunication Services",
]
for component in cpi_components:
    ax1.plot(
        necessity_income_df.index,
        necessity_income_df[component],
        label=component,
    )

ax1.set_xlabel("Year")
ax1.set_ylabel("CPI Index", color="blue")
ax1.tick_params(axis="y", labelcolor="blue")
ax1.grid(True)

# Right y-axis: income shares the x-axis but gets its own scale.
ax2 = ax1.twinx()
ax2.plot(
    necessity_income_df.index,
    necessity_income_df["Income"],
    color="red",
    label="Income",
    linestyle="--",
)
ax2.set_ylabel("Income", color="red")
ax2.tick_params(axis="y", labelcolor="red")

# Each axis carries its own legend, placed in opposite corners.
plt.title("Cost of Living and Income Trends (2014-2024)")
ax1.legend(loc="upper left")
ax2.legend(loc="upper right")

plt.show()

## Affordability Index Percentage Over Time 

### Data Cleaning & Processing

**Affordability Index Algorithm**
$$\text{Affordability Index} = \left(\frac{\text{Necessity Avg}}{\text{Income}}\right) \times 100$$

In [None]:
affordability_df = necessity_income_df.copy()

# "Total Necessity" is the per-year average across the necessity CPI components
# (the "Necessity Avg" of the formula above). Derive it when the merged frame
# does not already carry it, so the index can be computed.
if "Total Necessity" not in affordability_df.columns:
    affordability_df["Total Necessity"] = affordability_df.drop(columns=["Income"]).mean(axis=1)

affordability_df["Affordability Index"] = (affordability_df["Total Necessity"] / affordability_df["Income"]) * 100

# Keep only the derived index for plotting.
affordability_df = affordability_df[["Affordability Index"]]

affordability_df

### Visualisation

In [None]:
fig2, ax3 = plt.subplots(figsize=(12, 6))

# Single line with a marker at each index label.
sns.lineplot(
    x=affordability_df.index,
    y=affordability_df["Affordability Index"],
    marker="o",
    color="blue",
    label="Affordability Index",
    ax=ax3,
)

# Title and axis labels
ax3.set_title("Affordability Index Over Time")
ax3.set_xlabel("Year")
ax3.set_ylabel("Affordability Index (%)")

# Light dashed grid for readability
ax3.grid(True, linestyle="--", alpha=0.6)

# Tilt the x tick labels
plt.xticks(rotation=45)

plt.show()

## Necessity Breakdown 

### Data Cleaning & Processing

In [None]:
necessity_breakdown_df = necessity_df.copy()

# The stacked bar chart needs years as rows (x-axis) and components as columns
# (stacked segments). Transpose only when the rows are not already year labels,
# so the result does not depend on how often the earlier transpose cell was run.
if not all(str(label).isdigit() for label in necessity_breakdown_df.index):
    necessity_breakdown_df = necessity_breakdown_df.transpose()

# Chronological order, oldest year first.
necessity_breakdown_df = necessity_breakdown_df.sort_index()
necessity_breakdown_df

### Visualisation

In [None]:
fig3, ax4 = plt.subplots(figsize=(12, 6))

# Stacked bars: one bar per row of the frame, one segment per column.
necessity_breakdown_df.plot.bar(stacked=True, ax=ax4, cmap="coolwarm", alpha=0.75)

# Title and axis labels
ax4.set_title("Breakdown of Basic Necessities Over Time")
ax4.set_xlabel("Year")
ax4.set_ylabel("CPI Value")

# Anchor the legend just outside the right edge of the axes
ax4.legend(
    title="Basic Necessities Components",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
)

plt.xticks(rotation=45)

plt.show()