In [291]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [292]:
# Gold-layer, state-level dataset
# ===============================
# NOTE: despite its name, URL_DATA is a relative filesystem path (not a URL),
# so it is resolved against the notebook's working directory.
URL_DATA = './staging/gold/state_data_gold_data.parquet'
national_states = pd.read_parquet(URL_DATA)

In [293]:
# Preview the loaded frame (one row per year and state)
national_states

Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),income_median_state(RM),expenditure_mean_state(RM),expenditure_median_state(RM),poverty_relative_state_mean_income(%)
0,1970,Johor,1325600,0.000000e+00,0.0,0.0,0.0,0.0,0.0,237.0,269.0,0.0,0.0,0.0
1,1971,Johor,1355400,0.000000e+00,0.0,0.0,0.0,0.0,0.0,273.2,269.0,0.0,0.0,0.0
2,1972,Johor,1385300,0.000000e+00,0.0,0.0,0.0,0.0,0.0,309.5,269.0,0.0,0.0,0.0
3,1973,Johor,1414500,0.000000e+00,0.0,0.0,0.0,0.0,0.0,345.8,269.0,0.0,0.0,0.0
4,1974,Johor,1444400,0.000000e+00,0.0,0.0,0.0,0.0,0.0,382.0,269.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,2020,Kuala Lumpur,1982100,2.174470e+11,1049500.0,1006400.0,43100.0,408300.0,4.0,11728.0,9093.0,7239.0,5890.0,9.0
756,2021,Kuala Lumpur,1964000,2.198837e+11,1040700.0,992500.0,48300.0,414000.0,5.0,12526.5,9663.5,7531.0,6061.0,11.0
757,2022,Kuala Lumpur,1961200,2.405175e+11,1060300.0,1019700.0,40500.0,397300.0,4.0,13325.0,10234.0,7823.0,6232.0,13.0
758,2023,Kuala Lumpur,2005700,2.493015e+11,1111700.0,1077900.0,33900.0,387200.0,3.0,0.0,0.0,0.0,0.0,0.0


In [294]:
# Work on a copy so the loaded frame stays untouched
df_all_states = national_states.copy()

# Features to extrapolate
features_to_extend = [
    'population_state',
    'gdp_per_state(RM)',
    "labour_force_state",
    "employed_persons_state",
    "unemployed_persons_state",
    "outside_labour_force_state",
    'unemployed_rate_state(%)',
    "income_mean_state(RM)",
    'income_median_state(RM)',
    "expenditure_mean_state(RM)",
    'expenditure_median_state(RM)',
    'poverty_relative_state_mean_income(%)'
]

# Years to extrapolate
target_years = [2023, 2024, 2025]


def _extrapolate_state(state_df, state, features, years):
    """Linearly extend each feature of a single state's rows to the given years.

    For every feature, a straight line is drawn through its first and last
    valid observation (valid = neither 0 nor NaN) and projected forward from
    the last valid year. Only missing values (0 or NaN) are filled; existing
    data is never overwritten. A row is appended for a target year that is not
    yet present.

    Parameters
    ----------
    state_df : pandas.DataFrame
        Rows of one state; must contain 'year', 'state' and all `features`.
    state : str
        State name written into any appended row.
    features : list of str
        Numeric columns to extrapolate.
    years : list of int
        Target years; only years after a feature's last valid year are filled.

    Returns
    -------
    pandas.DataFrame
        The state's rows sorted by year, with remaining NaN replaced by 0.
    """
    state_df = state_df.sort_values('year').reset_index(drop=True)

    # Forecasts are floats: cast the feature columns up front so that
    # integer-typed columns accept them without relying on pandas' deprecated
    # silent upcasting on assignment.
    state_df = state_df.astype({feature: float for feature in features})

    for feature in features:
        is_valid = state_df[feature].ne(0) & state_df[feature].notna()
        observed = state_df.loc[is_valid, ['year', feature]]

        if observed.empty:
            continue  # No valid data for this feature

        first_year = observed['year'].iloc[0]
        last_year = observed['year'].iloc[-1]
        if last_year == first_year:
            # A single observation gives no slope (0/0 would yield NaN);
            # leave the feature untouched instead of writing NaN forecasts.
            continue

        last_val = observed[feature].iloc[-1]
        slope = (last_val - observed[feature].iloc[0]) / (last_year - first_year)

        for year in years:
            if year <= last_year:
                continue  # Real data already covers this year

            forecast_val = last_val + slope * (year - last_year)
            matching_rows = state_df.index[state_df['year'] == year]

            if len(matching_rows) > 0:
                row_index = matching_rows[0]
                current_val = state_df.loc[row_index, feature]
                # Fill only when the existing value is missing
                if pd.isna(current_val) or current_val == 0:
                    state_df.loc[row_index, feature] = forecast_val
            else:
                # Append a new row; the other features start at 0 and are
                # filled when their own turn in the feature loop comes.
                new_row = {col: 0 for col in state_df.columns}
                new_row['year'] = year
                new_row['state'] = state
                new_row[feature] = forecast_val
                state_df = pd.concat([state_df, pd.DataFrame([new_row])], ignore_index=True)

    return state_df.sort_values('year').reset_index(drop=True).fillna(0)


# One extrapolated frame per state
final_result = [
    _extrapolate_state(
        df_all_states[df_all_states['state'] == state],
        state,
        features_to_extend,
        target_years,
    )
    for state in df_all_states['state'].unique()
]

# Combine all states back
extrapolated_df = pd.concat(final_result, ignore_index=True)

# Round every numeric column to whole units
national_states_x = extrapolated_df.round(0)

In [295]:
# Preview the extrapolated frame (rows added/filled for the target years)
national_states_x

Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),income_median_state(RM),expenditure_mean_state(RM),expenditure_median_state(RM),poverty_relative_state_mean_income(%)
0,1970,Johor,1325600.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,237.0,269.0,0.0,0.0,0.0
1,1971,Johor,1355400.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,273.0,269.0,0.0,0.0,0.0
2,1972,Johor,1385300.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,310.0,269.0,0.0,0.0,0.0
3,1973,Johor,1414500.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,346.0,269.0,0.0,0.0,0.0
4,1974,Johor,1444400.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,382.0,269.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769,2021,Kuala Lumpur,1964000.0,2.198837e+11,1040700.0,992500.0,48300.0,414000.0,5.0,12526.0,9664.0,7531.0,6061.0,11.0
770,2022,Kuala Lumpur,1961200.0,2.405175e+11,1060300.0,1019700.0,40500.0,397300.0,4.0,13325.0,10234.0,7823.0,6232.0,13.0
771,2023,Kuala Lumpur,2005700.0,2.493015e+11,1111700.0,1077900.0,33900.0,387200.0,3.0,13607.0,10456.0,8088.0,6400.0,13.0
772,2024,Kuala Lumpur,2067500.0,2.578562e+11,1127288.0,1092883.0,34507.0,391617.0,3.0,13889.0,10677.0,8353.0,6569.0,13.0


In [296]:
# Spot-check a single state: keep only the Johor rows
is_johor = national_states_x["state"].eq("Johor")
national_states_y = national_states_x.loc[is_johor]
national_states_y

Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),income_median_state(RM),expenditure_mean_state(RM),expenditure_median_state(RM),poverty_relative_state_mean_income(%)
0,1970,Johor,1325600.0,0.0,0.0,0.0,0.0,0.0,0.0,237.0,269.0,0.0,0.0,0.0
1,1971,Johor,1355400.0,0.0,0.0,0.0,0.0,0.0,0.0,273.0,269.0,0.0,0.0,0.0
2,1972,Johor,1385300.0,0.0,0.0,0.0,0.0,0.0,0.0,310.0,269.0,0.0,0.0,0.0
3,1973,Johor,1414500.0,0.0,0.0,0.0,0.0,0.0,0.0,346.0,269.0,0.0,0.0,0.0
4,1974,Johor,1444400.0,0.0,0.0,0.0,0.0,0.0,0.0,382.0,269.0,0.0,0.0,0.0
5,1975,Johor,1477000.0,0.0,0.0,0.0,0.0,0.0,0.0,448.0,320.0,0.0,0.0,0.0
6,1976,Johor,1508400.0,0.0,0.0,0.0,0.0,0.0,0.0,513.0,370.0,0.0,0.0,0.0
7,1977,Johor,1541900.0,0.0,0.0,0.0,0.0,0.0,0.0,586.0,419.0,0.0,0.0,0.0
8,1978,Johor,1573400.0,0.0,0.0,0.0,0.0,0.0,0.0,658.0,469.0,0.0,0.0,0.0
9,1979,Johor,1606400.0,0.0,0.0,0.0,0.0,0.0,0.0,731.0,518.0,0.0,0.0,0.0


In [297]:
import pandas as pd
from scipy.stats import zscore

# Start from the extrapolated frame, working on a copy
df = national_states_x.copy()

# Keep only rows where every input to the score is strictly positive
required_positive = [
    "income_median_state(RM)",
    "expenditure_median_state(RM)",
    "unemployed_rate_state(%)",
    "gdp_per_state(RM)",
    "population_state",
]
df_filtered = df.loc[df[required_positive].gt(0).all(axis=1)].copy()

# --------------------------
# GDP per capita = state GDP / state population
# --------------------------
df_filtered["gdp_per_capita_state(RM)"] = df_filtered["gdp_per_state(RM)"].div(
    df_filtered["population_state"]
)

# --------------------------
# Survival income threshold and financial health analysis
# --------------------------
inflation_rate = 0.025  # 2.5%
buffer_factor = 1.15    # 15% safety buffer

# 1. Survival threshold: median expenditure, inflated, plus the safety buffer
df_filtered["survival_income_state"] = (
    df_filtered["expenditure_median_state(RM)"]
    .mul(1 + inflation_rate)
    .mul(buffer_factor)
)

# 2. Z-scores of the score inputs, computed over all remaining rows
z_income, z_expenditure, z_unemployed, z_gdp_per_capita = (
    zscore(df_filtered[column])
    for column in (
        "income_median_state(RM)",
        "expenditure_median_state(RM)",
        "unemployed_rate_state(%)",
        "gdp_per_capita_state(RM)",
    )
)

# 3. Composite score: income and GDP per capita count for,
#    expenditure and unemployment count against
df_filtered["financial_resilience_score"] = (
    z_income - z_expenditure - z_unemployed + z_gdp_per_capita
)

# 4. Classify financial health based on score
def classify_financial_health(z):
    """Map a composite resilience score to a financial-health label.

    Parameters
    ----------
    z : float
        Composite score to classify.

    Returns
    -------
    str
        "High Risk" when z < -0.5, "Stable" when -0.5 <= z <= 0.5,
        and "Affluent" otherwise.

    Notes
    -----
    A NaN score fails both comparisons and therefore comes back as "Affluent".
    """
    low_cutoff, high_cutoff = -0.5, 0.5

    if z < low_cutoff:
        return "High Risk"
    if z <= high_cutoff:
        return "Stable"
    return "Affluent"

# Label every row from its composite score
resilience_scores = df_filtered["financial_resilience_score"]
df_filtered["financial_health_status"] = resilience_scores.map(classify_financial_health)

# Show the scored frame
df_filtered


Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),income_median_state(RM),expenditure_mean_state(RM),expenditure_median_state(RM),poverty_relative_state_mean_income(%),gdp_per_capita_state(RM),survival_income_state,financial_resilience_score,financial_health_status
46,2016,Johor,3651800.0,1.166822e+11,1639100.0,1580600.0,58500.0,820700.0,4.0,6928.0,5652.0,4167.0,3635.0,14.0,31951.976286,4284.75625,-0.543586,High Risk
47,2017,Johor,3697000.0,1.235613e+11,1673800.0,1616700.0,57100.0,824400.0,3.0,7290.0,5910.0,4387.0,3769.0,14.0,33422.052204,4442.70875,0.286266,Stable
48,2018,Johor,3749400.0,1.305859e+11,1745100.0,1693300.0,51900.0,788200.0,3.0,7651.0,6169.0,4606.0,3904.0,15.0,34828.491225,4601.84000,0.350828,Stable
49,2019,Johor,3761200.0,1.342259e+11,1805700.0,1756100.0,49600.0,761600.0,3.0,8013.0,6427.0,4826.0,4038.0,15.0,35686.998298,4759.79250,0.394229,Stable
50,2020,Johor,4009700.0,1.280736e+11,1990900.0,1920500.0,70300.0,826900.0,4.0,7264.0,5690.0,4998.0,4237.0,14.0,31940.934234,4994.36375,-1.104608,High Risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769,2021,Kuala Lumpur,1964000.0,2.198837e+11,1040700.0,992500.0,48300.0,414000.0,5.0,12526.0,9664.0,7531.0,6061.0,11.0,111957.088086,7144.40375,1.664619,Affluent
770,2022,Kuala Lumpur,1961200.0,2.405175e+11,1060300.0,1019700.0,40500.0,397300.0,4.0,13325.0,10234.0,7823.0,6232.0,13.0,122637.945136,7345.97000,2.989625,Affluent
771,2023,Kuala Lumpur,2005700.0,2.493015e+11,1111700.0,1077900.0,33900.0,387200.0,3.0,13607.0,10456.0,8088.0,6400.0,13.0,124296.526898,7544.00000,3.774703,Affluent
772,2024,Kuala Lumpur,2067500.0,2.578562e+11,1127288.0,1092883.0,34507.0,391617.0,3.0,13889.0,10677.0,8353.0,6569.0,13.0,124718.824365,7743.20875,3.747215,Affluent


In [298]:
# Scored rows for the final extrapolated year only
is_2025 = df_filtered["year"].eq(2025)
df_filtered_X = df_filtered.loc[is_2025]
df_filtered_X

Unnamed: 0,year,state,population_state,gdp_per_state(RM),labour_force_state,employed_persons_state,unemployed_persons_state,outside_labour_force_state,unemployed_rate_state(%),income_mean_state(RM),income_median_state(RM),expenditure_mean_state(RM),expenditure_median_state(RM),poverty_relative_state_mean_income(%),gdp_per_capita_state(RM),survival_income_state,financial_resilience_score,financial_health_status
55,2025,Johor,4239276.0,157755400000.0,2140480.0,2085593.0,54888.0,857429.0,3.0,8995.0,7260.0,5930.0,5136.0,17.0,37212.817059,6054.06,-0.156931,Stable
111,2025,Kedah,2240241.0,54840170000.0,981988.0,953768.0,28215.0,520661.0,3.0,5859.0,4640.0,4112.0,3733.0,10.0,24479.586013,4400.27375,-0.715097,High Risk
167,2025,Kelantan,1910374.0,29208010000.0,703149.0,678505.0,24744.0,525663.0,4.0,5158.0,3807.0,3816.0,3350.0,12.0,15289.155945,3948.8125,-1.91837,High Risk
223,2025,Melaka,1058723.0,49647500000.0,524393.0,516024.0,8473.0,222393.0,2.0,8507.0,6553.0,6364.0,5483.0,14.0,46893.755024,6463.08625,0.271231,Stable
279,2025,Negeri Sembilan,1253798.0,54783630000.0,570051.0,551700.0,18246.0,290963.0,3.0,7163.0,5512.0,5167.0,4240.0,12.0,43694.147303,4997.9,0.022052,Stable
335,2025,Pahang,1689393.0,68814720000.0,757949.0,742885.0,15059.0,406034.0,2.0,6094.0,5012.0,4494.0,3889.0,8.0,40733.398475,4584.15875,0.737334,Affluent
391,2025,Perak,2586972.0,87404540000.0,1173688.0,1128600.0,44983.0,604046.0,4.0,6098.0,4738.0,4229.0,3586.0,14.0,33786.42724,4226.9975,-0.916145,High Risk
447,2025,Perlis,299972.0,6590754000.0,134137.0,128276.0,5861.0,83317.0,4.0,5983.0,4969.0,4206.0,3569.0,14.0,21971.229815,4206.95875,-1.24084,High Risk
503,2025,Pulau Pinang,1818767.0,125409700000.0,952063.0,931127.0,20832.0,380446.0,2.0,8727.0,6862.0,5874.0,5016.0,19.0,68953.138307,5912.61,1.757421,Affluent
559,2025,Sabah,3798572.0,85552110000.0,1822278.0,1684571.0,137707.0,772593.0,8.0,6497.0,4823.0,3712.0,3100.0,12.0,22522.177216,3654.125,-3.894497,High Risk


In [305]:

# df=national_states[["year", "state", "income_median_state(RM)", "expenditure_mean_state(RM)" ]]

# # Pivot the table
# pivot_df = df.pivot(index='year', columns='state', values='income_median_state(RM)')

# pivot_df
