In [13]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Merging National Data

In [14]:
# population 
# ======================
URL_DATA = './staging/silver/national_population_silver_data.parquet'
national_population = pd.read_parquet(URL_DATA)

# gdp 
# ======================
URL_DATA = './staging/silver/gdp_national_silver_data.parquet'
gdp_national = pd.read_parquet(URL_DATA)


# inflation 
# ======================
URL_DATA = './staging/silver/inflation_national_silver_data.parquet'
inflation_national = pd.read_parquet(URL_DATA)

# labour force 
# ======================
URL_DATA = './staging/silver/Labour_force_national_silver_data.parquet'
Labour_force_national = pd.read_parquet(URL_DATA)

# household mean income 
# ======================
URL_DATA = './staging/silver/house_hold_income_national_silver_data.parquet'
house_hold_income_national = pd.read_parquet(URL_DATA)

# relative poverty 
# ======================
URL_DATA = './staging/silver/house_hold_poverty_national_silver_data.parquet'
house_hold_poverty_national = pd.read_parquet(URL_DATA)



In [15]:

national_data = national_population.merge(gdp_national, on='year', how='left')

national_data = national_data.merge(inflation_national, on='year', how='left')

national_data = national_data.merge(Labour_force_national, on='year', how='left')

national_data = national_data.merge(house_hold_income_national, on='year', how='left')

national_data = national_data.merge(house_hold_poverty_national, on='year', how='left')

national_data.fillna(0, inplace=True)

national_data

Unnamed: 0,year,population_national,gdp_national(RM),gdp_national_per_capita(RM),Inflation(%),labour_force_national,employed_persons_national,unemployed_persons_national,outside_labour_force_national,unemployed_rate_national(%),participation_rate_national(%),income_mean_national(RM),income_median_national(RM),poverty_relative_national_median_income(%)
0,1970,10881800,73709680000.0,6773390000.0,1.8,0.0,0.0,0.0,0.0,0.0,0.0,264.0,166.0,0.0
1,1971,11159700,81106200000.0,7265660000.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,288.5,181.2,0.0
2,1972,11441300,88720810000.0,7751699000.0,3.2,0.0,0.0,0.0,0.0,0.0,0.0,313.0,196.5,0.0
3,1973,11719800,99102100000.0,8455832000.0,10.6,0.0,0.0,0.0,0.0,0.0,0.0,337.5,211.8,0.0
4,1974,12001300,107346100000.0,8941871000.0,17.3,0.0,0.0,0.0,0.0,0.0,0.0,362.0,227.0,0.0
5,1975,12300300,108206000000.0,8798552000.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,433.5,267.5,0.0
6,1976,12588100,120718400000.0,9589923000.0,2.6,0.0,0.0,0.0,0.0,0.0,0.0,505.0,308.0,0.0
7,1977,12901100,130077700000.0,10082190000.0,4.8,0.0,0.0,0.0,0.0,0.0,0.0,562.7,348.3,0.0
8,1978,13200200,138733000000.0,10509520000.0,4.9,0.0,0.0,0.0,0.0,0.0,0.0,620.3,388.7,0.0
9,1979,13518300,151703300000.0,11223580000.0,3.7,0.0,0.0,0.0,0.0,0.0,0.0,678.0,429.0,0.0


### Nationa data extrapolation 

In [16]:
### National data extrapolation 

# Updated list of features to forecast
# List of features to forecast
features_to_extend = [
    'population_national',
    'gdp_national(RM)',
    'gdp_national_per_capita(RM)',
    'Inflation(%)',
    'labour_force_national',
    'employed_persons_national',
    'unemployed_persons_national',
    'outside_labour_force_national',
    'unemployed_rate_national(%)',
    'participation_rate_national(%)',
    'income_mean_national(RM)',
    'income_median_national(RM)',
    'poverty_relative_national_median_income(%)'
]



# Sort data by year
df = national_data.sort_values('year').reset_index(drop=True)

# Define future years to forecast
target_years = [2023, 2024, 2025]

# Loop through each feature to forecast its values
for feature in features_to_extend:
    # Keep only non-zero, non-null values
    clean = df[df[feature] != 0][['year', feature]].dropna()

    # Calculate average yearly change (slope)
    slope = (clean[feature].iloc[-1] - clean[feature].iloc[0]) / \
            (clean['year'].iloc[-1] - clean['year'].iloc[0])

    # Get last known value and year
    last_val = clean[feature].iloc[-1]
    last_year = clean['year'].iloc[-1]

    # Generate projected values for each target year
    for year in target_years:
        if year >= last_year + 1:
            value = last_val + slope * (year - last_year)

            # If the year already exists, only update if value is 0
            if year in df['year'].values:
                if df.loc[df['year'] == year, feature].values[0] == 0:
                    df.loc[df['year'] == year, feature] = value
            else:
                # Add new row if year doesn't exist
                df = pd.concat([
                    df,
                    pd.DataFrame([{'year': year, feature: value}])
                ], ignore_index=True)

# Final cleanup: sort by year and fill any remaining NaNs with 0
df = df.sort_values('year').reset_index(drop=True).fillna(0)

national_data = df.round(0)
national_data


Unnamed: 0,year,population_national,gdp_national(RM),gdp_national_per_capita(RM),Inflation(%),labour_force_national,employed_persons_national,unemployed_persons_national,outside_labour_force_national,unemployed_rate_national(%),participation_rate_national(%),income_mean_national(RM),income_median_national(RM),poverty_relative_national_median_income(%)
0,1970,10881800,73709680000.0,6773390000.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,264.0,166.0,0.0
1,1971,11159700,81106200000.0,7265660000.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,288.0,181.0,0.0
2,1972,11441300,88720810000.0,7751699000.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,313.0,196.0,0.0
3,1973,11719800,99102100000.0,8455832000.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,338.0,212.0,0.0
4,1974,12001300,107346100000.0,8941871000.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,362.0,227.0,0.0
5,1975,12300300,108206000000.0,8798552000.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,434.0,268.0,0.0
6,1976,12588100,120718400000.0,9589923000.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,505.0,308.0,0.0
7,1977,12901100,130077700000.0,10082190000.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,563.0,348.0,0.0
8,1978,13200200,138733000000.0,10509520000.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,620.0,389.0,0.0
9,1979,13518300,151703300000.0,11223580000.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,678.0,429.0,0.0


### National data column aggregation

In [17]:
# gdp growth growth rate %
# ===========================

national_data['gdp_national_growth_rate(%)'] = national_data['gdp_national(RM)'].pct_change() * 100
national_data['gdp_national_growth_rate(%)'] = national_data['gdp_national_growth_rate(%)'].round(0)

# gdp per capita growth rate %
# ===========================

national_data['gdp_national_per_capita_growth_rate(%)'] = national_data['gdp_national_per_capita(RM)'].pct_change() * 100
national_data['gdp_national_per_capita_growth_rate(%)'] = national_data['gdp_national_per_capita_growth_rate(%)'].round(0)

# median income growth rate %
# ===========================

national_data['income_median_growth_rate(%)'] = national_data['income_median_national(RM)'].pct_change() * 100
national_data['income_median_growth_rate(%)'] = national_data['income_median_growth_rate(%)'].round(0)

# income inequality %
# ===========================

national_data['income_inequality(%)'] = (
    (national_data['income_mean_national(RM)'] - national_data['income_median_national(RM)']) /
    national_data['income_mean_national(RM)']
) * 100

national_data['income_inequality(%)'] = national_data['income_inequality(%)'].round(0)
national_data.fillna(0, inplace=True)
national_data

Unnamed: 0,year,population_national,gdp_national(RM),gdp_national_per_capita(RM),Inflation(%),labour_force_national,employed_persons_national,unemployed_persons_national,outside_labour_force_national,unemployed_rate_national(%),participation_rate_national(%),income_mean_national(RM),income_median_national(RM),poverty_relative_national_median_income(%),gdp_national_growth_rate(%),gdp_national_per_capita_growth_rate(%),income_median_growth_rate(%),income_inequality(%)
0,1970,10881800,73709680000.0,6773390000.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,264.0,166.0,0.0,0.0,0.0,0.0,37.0
1,1971,11159700,81106200000.0,7265660000.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,288.0,181.0,0.0,10.0,7.0,9.0,37.0
2,1972,11441300,88720810000.0,7751699000.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,313.0,196.0,0.0,9.0,7.0,8.0,37.0
3,1973,11719800,99102100000.0,8455832000.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,338.0,212.0,0.0,12.0,9.0,8.0,37.0
4,1974,12001300,107346100000.0,8941871000.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,362.0,227.0,0.0,8.0,6.0,7.0,37.0
5,1975,12300300,108206000000.0,8798552000.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,434.0,268.0,0.0,1.0,-2.0,18.0,38.0
6,1976,12588100,120718400000.0,9589923000.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,505.0,308.0,0.0,12.0,9.0,15.0,39.0
7,1977,12901100,130077700000.0,10082190000.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,563.0,348.0,0.0,8.0,5.0,13.0,38.0
8,1978,13200200,138733000000.0,10509520000.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,620.0,389.0,0.0,7.0,4.0,12.0,37.0
9,1979,13518300,151703300000.0,11223580000.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,678.0,429.0,0.0,9.0,7.0,10.0,37.0


In [18]:
growth_df = national_data[['year', 'income_median_national(RM)', 'income_median_growth_rate(%)']]
growth_df

Unnamed: 0,year,income_median_national(RM),income_median_growth_rate(%)
0,1970,166.0,0.0
1,1971,181.0,9.0
2,1972,196.0,8.0
3,1973,212.0,8.0
4,1974,227.0,7.0
5,1975,268.0,18.0
6,1976,308.0,15.0
7,1977,348.0,13.0
8,1978,389.0,12.0
9,1979,429.0,10.0


### National data quality check

In [19]:
# data formatting
# ========================

national_data['income_inequality(%)'] = national_data['income_inequality(%)'].astype('float64')
national_data['income_mean_national(RM)'] = national_data['income_mean_national(RM)'].astype('float64')
national_data['income_median_national(RM)'] = national_data['income_median_national(RM)'].astype('float64')
national_data['income_median_growth_rate(%)'] = national_data['income_median_growth_rate(%)'].astype('float64')

national_data['poverty_relative_national_median_income(%)'] = national_data['poverty_relative_national_median_income(%)'] / 100
national_data['Inflation(%)'] = national_data['Inflation(%)'] / 100
national_data['unemployed_rate_national(%)'] = national_data['unemployed_rate_national(%)'] / 100
national_data['income_inequality(%)'] = national_data['income_inequality(%)'] / 100
national_data['income_median_growth_rate(%)'] = national_data['income_median_growth_rate(%)'] / 100
national_data['gdp_national_growth_rate(%)'] = national_data['gdp_national_growth_rate(%)'] / 100
national_data['gdp_national_per_capita_growth_rate(%)'] = national_data['gdp_national_per_capita_growth_rate(%)'] / 100
national_data['participation_rate_national(%)'] = national_data['participation_rate_national(%)'] / 100

national_data['labour_force_national'] = national_data['labour_force_national'].astype(int)
national_data['employed_persons_national'] = national_data['employed_persons_national'].astype(int)
national_data['unemployed_persons_national'] = national_data['unemployed_persons_national'].astype(int)
national_data['outside_labour_force_national'] = national_data['outside_labour_force_national'].astype(int)

national_data.replace([np.inf, -np.inf], np.nan, inplace=True)
national_data = national_data.fillna(0)

national_data


Unnamed: 0,year,population_national,gdp_national(RM),gdp_national_per_capita(RM),Inflation(%),labour_force_national,employed_persons_national,unemployed_persons_national,outside_labour_force_national,unemployed_rate_national(%),participation_rate_national(%),income_mean_national(RM),income_median_national(RM),poverty_relative_national_median_income(%),gdp_national_growth_rate(%),gdp_national_per_capita_growth_rate(%),income_median_growth_rate(%),income_inequality(%)
0,1970,10881800,73709680000.0,6773390000.0,0.02,0,0,0,0,0.0,0.0,264.0,166.0,0.0,0.0,0.0,0.0,0.37
1,1971,11159700,81106200000.0,7265660000.0,0.02,0,0,0,0,0.0,0.0,288.0,181.0,0.0,0.1,0.07,0.09,0.37
2,1972,11441300,88720810000.0,7751699000.0,0.03,0,0,0,0,0.0,0.0,313.0,196.0,0.0,0.09,0.07,0.08,0.37
3,1973,11719800,99102100000.0,8455832000.0,0.11,0,0,0,0,0.0,0.0,338.0,212.0,0.0,0.12,0.09,0.08,0.37
4,1974,12001300,107346100000.0,8941871000.0,0.17,0,0,0,0,0.0,0.0,362.0,227.0,0.0,0.08,0.06,0.07,0.37
5,1975,12300300,108206000000.0,8798552000.0,0.04,0,0,0,0,0.0,0.0,434.0,268.0,0.0,0.01,-0.02,0.18,0.38
6,1976,12588100,120718400000.0,9589923000.0,0.03,0,0,0,0,0.0,0.0,505.0,308.0,0.0,0.12,0.09,0.15,0.39
7,1977,12901100,130077700000.0,10082190000.0,0.05,0,0,0,0,0.0,0.0,563.0,348.0,0.0,0.08,0.05,0.13,0.38
8,1978,13200200,138733000000.0,10509520000.0,0.05,0,0,0,0,0.0,0.0,620.0,389.0,0.0,0.07,0.04,0.12,0.37
9,1979,13518300,151703300000.0,11223580000.0,0.04,0,0,0,0,0.0,0.0,678.0,429.0,0.0,0.09,0.07,0.1,0.37


In [20]:
national_data.dtypes

year                                            int32
population_national                             int64
gdp_national(RM)                              float64
gdp_national_per_capita(RM)                   float64
Inflation(%)                                  float64
labour_force_national                           int64
employed_persons_national                       int64
unemployed_persons_national                     int64
outside_labour_force_national                   int64
unemployed_rate_national(%)                   float64
participation_rate_national(%)                float64
income_mean_national(RM)                      float64
income_median_national(RM)                    float64
poverty_relative_national_median_income(%)    float64
gdp_national_growth_rate(%)                   float64
gdp_national_per_capita_growth_rate(%)        float64
income_median_growth_rate(%)                  float64
income_inequality(%)                          float64
dtype: object

In [21]:
# save data
national_data.to_parquet("./staging/gold/national_data_gold_data.parquet", index=False)

## Merging State Data

In [22]:
# population 
# ======================
URL_DATA = './staging/silver/state_population_silver_data.parquet'
state_population = pd.read_parquet(URL_DATA)

# gdp 
# ======================
URL_DATA = './staging/silver/gdp_state_silver_data.parquet'
gdp_state = pd.read_parquet(URL_DATA)

# labour force 
# ======================
URL_DATA = './staging/silver/Labour_force_state_silver_data.parquet'
Labour_force_state = pd.read_parquet(URL_DATA)

# household mean income 
# ======================
URL_DATA = './staging/silver/house_hold_income_state_silver_data.parquet'
house_hold_income_state = pd.read_parquet(URL_DATA)

# household expenditure 
# ======================
URL_DATA = './staging/silver/house_hold_expenditure_silver_data.parquet'
house_hold_expenditure = pd.read_parquet(URL_DATA)

# relative poverty 
# ======================
URL_DATA = './staging/silver/house_hold_poverty_state_silver_data.parquet'
house_hold_poverty_state = pd.read_parquet(URL_DATA)


In [23]:

state_data = state_population.merge(gdp_state, on=['year', 'state'], how='left')

state_data = state_data.merge(Labour_force_state, on=['year', 'state'], how='left')

state_data = state_data.merge(house_hold_income_state, on=['year', 'state'], how='left')

state_data = state_data.merge(house_hold_expenditure, on=['year', 'state'], how='left')

state_data = state_data.merge(house_hold_poverty_state, on=['year', 'state'], how='left')

state_data.fillna(0, inplace=True)

state_data


Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),income_median_state(RM),expenditure_mean_state(RM),expenditure_median_state(RM),poverty_relative_state_median_income(%)
0,1970,Johor,1325600,0.000000e+00,0.0,0.0,0.0,0.0,0.0,237.0,269.0,0.0,0.0,0.0
1,1971,Johor,1355400,0.000000e+00,0.0,0.0,0.0,0.0,0.0,273.2,269.0,0.0,0.0,0.0
2,1972,Johor,1385300,0.000000e+00,0.0,0.0,0.0,0.0,0.0,309.5,269.0,0.0,0.0,0.0
3,1973,Johor,1414500,0.000000e+00,0.0,0.0,0.0,0.0,0.0,345.8,269.0,0.0,0.0,0.0
4,1974,Johor,1444400,0.000000e+00,0.0,0.0,0.0,0.0,0.0,382.0,269.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769,2021,Kuala Lumpur,1964000,2.198837e+11,1040700.0,992500.0,48300.0,414000.0,5.0,12526.5,9663.5,7531.0,6061.0,11.0
770,2022,Kuala Lumpur,1961200,2.405175e+11,1060300.0,1019700.0,40500.0,397300.0,4.0,13325.0,10234.0,7823.0,6232.0,13.0
771,2023,Kuala Lumpur,2005700,2.493015e+11,1111700.0,1077900.0,33900.0,387200.0,3.0,0.0,0.0,0.0,0.0,0.0
772,2024,Kuala Lumpur,2067500,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
tes = state_data[state_data["year"] == 2025]
tes["income_median_state(RM)"]

9       7260.0
19      4640.0
29      3807.0
39      6553.0
49      5512.0
59      5012.0
69      4738.0
79      4969.0
89      6862.0
99      4823.0
109     5250.0
119    10543.0
129     6202.0
139    10898.0
Name: income_median_state(RM), dtype: float64

## Data extrapolation 

In [25]:
# Make a copy of your full dataset
df_all_states = state_data.copy()

# List of features to extrapolate
features_to_extend = [
    'population_state',
    'gdp_per_state(RM)',
    'labour_force_state',
    'employed_persons_state',
    'unemployed_persons_state',
    'outside_labour_force_state',
    'unemployed_rate_state(%)',
    'income_mean_state(RM)',
    'income_median_state(RM)',
    'expenditure_mean_state(RM)',
    'expenditure_median_state(RM)',
    'poverty_relative_state_median_income(%)'
]

# Years to extrapolate
target_years = [2023, 2024, 2025]

# Final results list
final_result = []

# Loop through each unique state
for state in df_all_states['state'].unique():
    # Subset for the state
    state_df = df_all_states[df_all_states["state"] == state].copy()
    state_df = state_df.sort_values('year').reset_index(drop=True)

    for feature in features_to_extend:
        clean = state_df[(state_df[feature] != 0) & (~state_df[feature].isna())][['year', feature]]

        if clean.empty:
            continue  # Skip feature if no valid data

        slope = (clean[feature].iloc[-1] - clean[feature].iloc[0]) / \
                (clean['year'].iloc[-1] - clean['year'].iloc[0])

        last_val = clean[feature].iloc[-1]
        last_year = clean['year'].iloc[-1]

        for year in target_years:
            if year > last_year:
                forecast_val = last_val + slope * (year - last_year)

                if year in state_df['year'].values:
                    row_index = state_df[state_df['year'] == year].index[0]
                    if pd.isna(state_df.loc[row_index, feature]) or state_df.loc[row_index, feature] == 0:
                        state_df.loc[row_index, feature] = forecast_val
                else:
                    # Add a new row
                    new_row = {col: 0 for col in state_df.columns}
                    new_row['year'] = year
                    new_row['state'] = state
                    new_row[feature] = forecast_val
                    state_df = pd.concat([state_df, pd.DataFrame([new_row])], ignore_index=True)

    # Finalize this state's extrapolated data
    state_df = state_df.sort_values('year').reset_index(drop=True)
    state_df = state_df.fillna(0)

    # Append to full list
    final_result.append(state_df)

# Combine all states back
extrapolated_df = pd.concat(final_result, ignore_index=True)

state_data = extrapolated_df.round(0)

In [26]:
tes = state_data[state_data["year"] == 2016]
tes

Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),income_median_state(RM),expenditure_mean_state(RM),expenditure_median_state(RM),poverty_relative_state_median_income(%)
46,2016,Johor,3651800,116682200000.0,1639100.0,1580600.0,58500.0,820700.0,4.0,6928.0,5652.0,4167.0,3635.0,14.0
102,2016,Kedah,2119700,41155720000.0,910900.0,884300.0,26600.0,508400.0,3.0,4971.0,3811.0,3071.0,2428.0,16.0
158,2016,Kelantan,1796700,22475730000.0,657800.0,632900.0,24900.0,454800.0,4.0,4214.0,3079.0,2884.0,2444.0,12.0
214,2016,Melaka,901100,37712740000.0,400800.0,397300.0,3500.0,211900.0,1.0,6849.0,5588.0,4394.0,3617.0,11.0
270,2016,Negeri Sembilan,1099300,41771250000.0,479700.0,466900.0,12800.0,262100.0,3.0,5887.0,4579.0,3700.0,3128.0,16.0
326,2016,Pahang,1626700,50875220000.0,710500.0,692200.0,18300.0,366500.0,3.0,5012.0,3979.0,3332.0,2761.0,8.0
382,2016,Perak,2482100,65958410000.0,1023100.0,988200.0,34800.0,635500.0,3.0,5065.0,4006.0,3251.0,2850.0,14.0
438,2016,Perlis,251000,5569584000.0,103100.0,100100.0,3000.0,64400.0,3.0,4998.0,4204.0,3090.0,2927.0,12.0
494,2016,Pulau Pinang,1717700,82493280000.0,845500.0,827400.0,18100.0,379800.0,2.0,6771.0,5409.0,4219.0,3411.0,7.0
550,2016,Sabah,3802800,77518050000.0,1927100.0,1823700.0,103300.0,793400.0,5.0,5354.0,4110.0,2601.0,2111.0,18.0


### State data column aggregation


In [27]:
# GDP per capita 
# ===========================

state_data["gdp_per_capita_state(RM)"] = state_data["gdp_per_state(RM)"] / state_data["population_state"]
state_data['gdp_per_capita_state(RM)'] = state_data['gdp_per_capita_state(RM)'].round(0)

# gdp per capita growth rate %
# ===========================

state_data['gdp_state_per_capita_growth_rate(%)'] = state_data['gdp_per_capita_state(RM)'].pct_change() * 100
state_data['gdp_state_per_capita_growth_rate(%)'] = state_data['gdp_state_per_capita_growth_rate(%)'].round(0)

# median income growth rate %
# ===========================

state_data['state_income_median_growth_rate(%)'] = state_data['income_median_state(RM)'].pct_change() * 100
state_data['state_income_median_growth_rate(%)'] = state_data['state_income_median_growth_rate(%)'].round(0)

# median expenditure growth rate %
# ===========================

state_data['expenditure_median_state_growth_rate(%)'] = state_data['expenditure_median_state(RM)'].pct_change() * 100
state_data['expenditure_median_state_growth_rate(%)'] = state_data['expenditure_median_state_growth_rate(%)'].round(0)


### Financial health status of every state

In [28]:
from scipy.stats import zscore

# Load the dataset
df = state_data.copy()

# Filter rows with valid values
df_filtered = df[
    (df["income_median_state(RM)"] > 0) &
    (df["expenditure_median_state(RM)"] > 0) &
    (df["unemployed_rate_state(%)"] > 0) &
    (df["gdp_per_capita_state(RM)"] > 0)
].copy()

# --------------------------
# Survival income threshold and financial health analysis
# --------------------------
inflation_rate = 0.025  # 2.5%
buffer_factor = 1.15    # 15% safety buffer

# 1. Compute survival threshold
df_filtered["survival_income_state(RM)"] = (
    df_filtered["expenditure_median_state(RM)"] * (1 + inflation_rate) * buffer_factor
)

# 2. Compute z-scores for relevant features
z_income = zscore(df_filtered["income_median_state(RM)"])
z_expenditure = zscore(df_filtered["expenditure_median_state(RM)"])
z_unemployed = zscore(df_filtered["unemployed_rate_state(%)"])
z_gdp_per_capita = zscore(df_filtered["gdp_per_capita_state(RM)"])

# 3. Composite z-score for financial resilience
df_filtered["financial_resilience_score"] = (
    z_income - z_expenditure - z_unemployed + z_gdp_per_capita
)

# 4. Classify financial health based on score
def classify_financial_health(z):
    if z < -0.5:
        return "High Risk"
    elif z <= 0.5:
        return "Stable"
    else:
        return "Affluent"

df_filtered["financial_health_status"] = df_filtered["financial_resilience_score"].apply(classify_financial_health)

cols_to_round = ["gdp_per_capita_state(RM)", "survival_income_state(RM)", "financial_resilience_score"]
df_filtered[cols_to_round] = df_filtered[cols_to_round].round(2)

# Final result
state_data=df_filtered


In [29]:
test = state_data[state_data["year"]==2016] 
test

Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),...,expenditure_mean_state(RM),expenditure_median_state(RM),poverty_relative_state_median_income(%),gdp_per_capita_state(RM),gdp_state_per_capita_growth_rate(%),state_income_median_growth_rate(%),expenditure_median_state_growth_rate(%),survival_income_state(RM),financial_resilience_score,financial_health_status
46,2016,Johor,3651800,116682200000.0,1639100.0,1580600.0,58500.0,820700.0,4.0,6928.0,...,4167.0,3635.0,14.0,31952.0,5.0,4.0,inf,4284.76,-0.54,High Risk
102,2016,Kedah,2119700,41155720000.0,910900.0,884300.0,26600.0,508400.0,3.0,4971.0,...,3071.0,2428.0,16.0,19416.0,3.0,5.0,inf,2862.0,-0.1,Stable
158,2016,Kelantan,1796700,22475730000.0,657800.0,632900.0,24900.0,454800.0,4.0,4214.0,...,2884.0,2444.0,12.0,12509.0,3.0,6.0,inf,2880.86,-1.55,High Risk
214,2016,Melaka,901100,37712740000.0,400800.0,397300.0,3500.0,211900.0,1.0,6849.0,...,4394.0,3617.0,11.0,41852.0,3.0,5.0,inf,4263.54,2.12,Affluent
270,2016,Negeri Sembilan,1099300,41771250000.0,479700.0,466900.0,12800.0,262100.0,3.0,5887.0,...,3700.0,3128.0,16.0,37998.0,3.0,5.0,inf,3687.13,0.37,Stable
326,2016,Pahang,1626700,50875220000.0,710500.0,692200.0,18300.0,366500.0,3.0,5012.0,...,3332.0,2761.0,8.0,31275.0,2.0,8.0,inf,3254.53,0.13,Stable
382,2016,Perak,2482100,65958410000.0,1023100.0,988200.0,34800.0,635500.0,3.0,5065.0,...,3251.0,2850.0,14.0,26574.0,4.0,7.0,inf,3359.44,-0.12,Stable
438,2016,Perlis,251000,5569584000.0,103100.0,100100.0,3000.0,64400.0,3.0,4998.0,...,3090.0,2927.0,12.0,22190.0,3.0,9.0,inf,3450.2,-0.26,Stable
494,2016,Pulau Pinang,1717700,82493280000.0,845500.0,827400.0,18100.0,379800.0,2.0,6771.0,...,4219.0,3411.0,7.0,48025.0,4.0,7.0,inf,4020.72,1.7,Affluent
550,2016,Sabah,3802800,77518050000.0,1927100.0,1823700.0,103300.0,793400.0,5.0,5354.0,...,2601.0,2111.0,18.0,20384.0,3.0,5.0,inf,2488.34,-1.12,High Risk


In [30]:

national_data_selected = national_data[["year", "income_median_national(RM)", "income_mean_national(RM)"]]

state_data = state_data.merge(national_data_selected, on='year', how='left')
# state_data


# Compare and assign performance category
state_data['income_mean_compare_national'] = state_data.apply(
    lambda row: '1' if row['income_mean_state(RM)'] > row['income_mean_national(RM)'] else '0',
    axis=1
)

state_data['income_median_compare_national'] = state_data.apply(
    lambda row: '1' if row['income_median_state(RM)'] > row['income_median_national(RM)'] else '0',
    axis=1
)

state_data.drop(columns=["income_mean_national(RM)", "income_median_national(RM)"], inplace=True)


state_data

Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),...,poverty_relative_state_median_income(%),gdp_per_capita_state(RM),gdp_state_per_capita_growth_rate(%),state_income_median_growth_rate(%),expenditure_median_state_growth_rate(%),survival_income_state(RM),financial_resilience_score,financial_health_status,income_mean_compare_national,income_median_compare_national
0,2016,Johor,3651800,1.166822e+11,1639100.0,1580600.0,58500.0,820700.0,4.0,6928.0,...,14.0,31952.0,5.0,4.0,inf,4284.76,-0.54,High Risk,0,1
1,2017,Johor,3697000,1.235613e+11,1673800.0,1616700.0,57100.0,824400.0,3.0,7290.0,...,14.0,33422.0,5.0,5.0,4.0,4442.71,0.29,Stable,1,1
2,2018,Johor,3749400,1.305859e+11,1745100.0,1693300.0,51900.0,788200.0,3.0,7651.0,...,15.0,34828.0,4.0,4.0,4.0,4601.84,0.35,Stable,1,1
3,2019,Johor,3761200,1.342259e+11,1805700.0,1756100.0,49600.0,761600.0,3.0,8013.0,...,15.0,35687.0,2.0,4.0,3.0,4759.79,0.39,Stable,1,1
4,2020,Johor,4009700,1.280736e+11,1990900.0,1920500.0,70300.0,826900.0,4.0,7264.0,...,14.0,31941.0,-10.0,-11.0,5.0,4994.36,-1.11,High Risk,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2021,Kuala Lumpur,1964000,2.198837e+11,1040700.0,992500.0,48300.0,414000.0,5.0,12526.0,...,11.0,111957.0,2.0,6.0,3.0,7144.40,1.66,Affluent,1,1
136,2022,Kuala Lumpur,1961200,2.405175e+11,1060300.0,1019700.0,40500.0,397300.0,4.0,13325.0,...,13.0,122638.0,10.0,6.0,3.0,7345.97,2.98,Affluent,1,1
137,2023,Kuala Lumpur,2005700,2.493015e+11,1111700.0,1077900.0,33900.0,387200.0,3.0,13607.0,...,13.0,124297.0,1.0,2.0,3.0,7544.00,3.77,Affluent,1,1
138,2024,Kuala Lumpur,2067500,2.578562e+11,1127288.0,1092883.0,34507.0,391617.0,3.0,13889.0,...,13.0,124719.0,0.0,2.0,3.0,7743.21,3.74,Affluent,1,1


In [31]:
test = state_data[state_data["year"]==2025] 
test

Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),...,poverty_relative_state_median_income(%),gdp_per_capita_state(RM),gdp_state_per_capita_growth_rate(%),state_income_median_growth_rate(%),expenditure_median_state_growth_rate(%),survival_income_state(RM),financial_resilience_score,financial_health_status,income_mean_compare_national,income_median_compare_national
9,2025,Johor,4205900,157755400000.0,2140480.0,2085593.0,54888.0,857429.0,3.0,8995.0,...,17.0,37508.0,3.0,2.0,3.0,6054.06,-0.15,Stable,1,1
19,2025,Kedah,2228000,54840170000.0,981988.0,953768.0,28215.0,520661.0,3.0,5859.0,...,10.0,24614.0,2.0,2.0,4.0,4400.27,-0.71,High Risk,0,0
29,2025,Kelantan,1907700,29208010000.0,703149.0,678505.0,24744.0,525663.0,4.0,5158.0,...,12.0,15311.0,2.0,2.0,3.0,3948.81,-1.92,High Risk,0,0
39,2025,Melaka,1052500,49647500000.0,524393.0,516024.0,8473.0,222393.0,2.0,8507.0,...,14.0,47171.0,2.0,2.0,4.0,6463.09,0.28,Stable,0,0
49,2025,Negeri Sembilan,1244600,54783630000.0,570051.0,551700.0,18246.0,290963.0,3.0,7163.0,...,12.0,44017.0,2.0,2.0,3.0,4997.9,0.03,Stable,0,0
59,2025,Pahang,1678200,68814720000.0,757949.0,742885.0,15059.0,406034.0,2.0,6094.0,...,8.0,41005.0,2.0,2.0,3.0,4584.16,0.75,Affluent,0,0
69,2025,Perak,2574900,87404540000.0,1173688.0,1128600.0,44983.0,604046.0,4.0,6098.0,...,14.0,33945.0,3.0,2.0,2.0,4227.0,-0.91,High Risk,0,0
79,2025,Perlis,297800,6590754000.0,134137.0,128276.0,5861.0,83317.0,4.0,5983.0,...,14.0,22131.0,2.0,2.0,2.0,4206.96,-1.23,High Risk,0,0
89,2025,Pulau Pinang,1803300,125409700000.0,952063.0,931127.0,20832.0,380446.0,2.0,8727.0,...,19.0,69545.0,4.0,2.0,4.0,5912.61,1.78,Affluent,0,1
99,2025,Sabah,3759600,85552110000.0,1822278.0,1684571.0,137707.0,772593.0,8.0,6497.0,...,12.0,22756.0,1.0,2.0,4.0,3654.12,-3.89,High Risk,0,0


In [32]:
state_data.dtypes

year                                         int32
state                                       object
population_state                             int64
gdp_per_state(RM)                          float64
labour_force_state                         float64
employed_persons_state                     float64
unemployed_persons_state                   float64
outside_labour_force_state                 float64
unemployed_rate_state(%)                   float64
income_mean_state(RM)                      Float64
income_median_state(RM)                    Float64
expenditure_mean_state(RM)                 float64
expenditure_median_state(RM)               float64
poverty_relative_state_median_income(%)    float64
gdp_per_capita_state(RM)                   float64
gdp_state_per_capita_growth_rate(%)        float64
state_income_median_growth_rate(%)         Float64
expenditure_median_state_growth_rate(%)    float64
survival_income_state(RM)                  float64
financial_resilience_score     

## State Data Formatting

In [33]:
# data formatting
# ========================

state_data['income_mean_state(RM)'] = state_data['income_mean_state(RM)'].astype('float64')
state_data['income_median_state(RM)'] = state_data['income_median_state(RM)'].astype('float64')
state_data['state_income_median_growth_rate(%)'] = state_data['state_income_median_growth_rate(%)'].astype('float64')

state_data['labour_force_state'] = state_data['labour_force_state'].astype(int)
state_data['employed_persons_state'] = state_data['employed_persons_state'].astype(int)
state_data['unemployed_persons_state'] = state_data['unemployed_persons_state'].astype(int)
state_data['outside_labour_force_state'] = state_data['outside_labour_force_state'].astype(int)

state_data['unemployed_rate_state(%)'] = state_data['unemployed_rate_state(%)'] / 100
state_data['poverty_relative_state_median_income(%)'] = state_data['poverty_relative_state_median_income(%)'] / 100
state_data['gdp_state_per_capita_growth_rate(%)'] = state_data['gdp_state_per_capita_growth_rate(%)'] / 100
state_data['state_income_median_growth_rate(%)'] = state_data['state_income_median_growth_rate(%)'] / 100
state_data['expenditure_median_state_growth_rate(%)'] = state_data['expenditure_median_state_growth_rate(%)'] / 100
state_data['financial_resilience_score'] = state_data['financial_resilience_score'] / 100


# Replace inf with NaN
state_data.replace([np.inf, -np.inf], np.nan, inplace=True)
state_data = state_data.fillna(0)

state_data


Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),...,poverty_relative_state_median_income(%),gdp_per_capita_state(RM),gdp_state_per_capita_growth_rate(%),state_income_median_growth_rate(%),expenditure_median_state_growth_rate(%),survival_income_state(RM),financial_resilience_score,financial_health_status,income_mean_compare_national,income_median_compare_national
0,2016,Johor,3651800,1.166822e+11,1639100,1580600,58500,820700,0.04,6928.0,...,0.14,31952.0,0.05,0.04,0.00,4284.76,-0.0054,High Risk,0,1
1,2017,Johor,3697000,1.235613e+11,1673800,1616700,57100,824400,0.03,7290.0,...,0.14,33422.0,0.05,0.05,0.04,4442.71,0.0029,Stable,1,1
2,2018,Johor,3749400,1.305859e+11,1745100,1693300,51900,788200,0.03,7651.0,...,0.15,34828.0,0.04,0.04,0.04,4601.84,0.0035,Stable,1,1
3,2019,Johor,3761200,1.342259e+11,1805700,1756100,49600,761600,0.03,8013.0,...,0.15,35687.0,0.02,0.04,0.03,4759.79,0.0039,Stable,1,1
4,2020,Johor,4009700,1.280736e+11,1990900,1920500,70300,826900,0.04,7264.0,...,0.14,31941.0,-0.10,-0.11,0.05,4994.36,-0.0111,High Risk,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2021,Kuala Lumpur,1964000,2.198837e+11,1040700,992500,48300,414000,0.05,12526.0,...,0.11,111957.0,0.02,0.06,0.03,7144.40,0.0166,Affluent,1,1
136,2022,Kuala Lumpur,1961200,2.405175e+11,1060300,1019700,40500,397300,0.04,13325.0,...,0.13,122638.0,0.10,0.06,0.03,7345.97,0.0298,Affluent,1,1
137,2023,Kuala Lumpur,2005700,2.493015e+11,1111700,1077900,33900,387200,0.03,13607.0,...,0.13,124297.0,0.01,0.02,0.03,7544.00,0.0377,Affluent,1,1
138,2024,Kuala Lumpur,2067500,2.578562e+11,1127288,1092883,34507,391617,0.03,13889.0,...,0.13,124719.0,0.00,0.02,0.03,7743.21,0.0374,Affluent,1,1


In [34]:
state_data.dtypes

year                                         int32
state                                       object
population_state                             int64
gdp_per_state(RM)                          float64
labour_force_state                           int64
employed_persons_state                       int64
unemployed_persons_state                     int64
outside_labour_force_state                   int64
unemployed_rate_state(%)                   float64
income_mean_state(RM)                      float64
income_median_state(RM)                    float64
expenditure_mean_state(RM)                 float64
expenditure_median_state(RM)               float64
poverty_relative_state_median_income(%)    float64
gdp_per_capita_state(RM)                   float64
gdp_state_per_capita_growth_rate(%)        float64
state_income_median_growth_rate(%)         float64
expenditure_median_state_growth_rate(%)    float64
survival_income_state(RM)                  float64
financial_resilience_score     

In [35]:
# save data
state_data.to_parquet("./staging/gold/state_data_gold_data.parquet", index=False)