*https://github.com/Aubrey-Bermuda-CA/IFRS9-MODELING-CHINA*

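*3.4 Functional description: Read historical macroeconomic data and external forecast data, smooth each series with a Hodrick-Prescott filter (plus an STL seasonal decomposition of the cycle), and then forecast each macro factor with an unobserved components model.*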

In [1]:
import time
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.statespace.structural import UnobservedComponents

In [2]:
# Quarterly calendar running from the first historical quarter to the last forecast quarter.
start_date, forecast_date = '2010-03-31', '2026-09-30'
date_index = pd.date_range(start_date, forecast_date, freq='Q')


def _load_macro_csv(path):
    """Read one macro-factor CSV, using its first column as a date-parsed index."""
    return pd.read_csv(path, index_col=[0], parse_dates=[0])


# Load the historical observations and the external forecasts.
macro_his_data = _load_macro_csv('./macrofactor_historical_data.csv')
macro_fc_data = _load_macro_csv('./macrofactor_forecast_data.csv')

# Historical values take precedence; the external forecast only fills cells
# that are missing from the historical table.
macro_data = macro_his_data.combine_first(macro_fc_data)

# Optional: missing values can be filled by interpolation. Available methods include
# 'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'polynomial',
# 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima', 'cubicspline'
# macro_data = macro_data.interpolate(method='spline')

macro_data

Unnamed: 0_level_0,GDP_Const_Qtr,Ind_Pro_Mth,CPI_Mth,Core_CPI_Mth,PPI_Mth,FAI_Cum,Retail_Sales_Mth,Imports_Mth,Exports_Mth,M1,...,PPIRM_Mth,CGPI_Mth,70_Cities_Price_Mth,Power_Gen_Mth,Freight_Mth,Urban_Income_Cum,Urban_Exp_Cum,Nat_Housing_Index,Macro_Eco_Index,Housing_Sales_Cum
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-03-31,12.20,18.1,102.400,,5.91,26.4,18.00,66.27,24.21,29.94,...,11.46,105.6,9.5,17.6,15.800000,9.8,11.01,105.89,104.0,57.7
2010-06-30,10.80,13.7,102.900,,6.41,25.5,18.30,33.87,43.87,24.56,...,10.80,106.6,7.7,11.4,12.300000,10.2,9.89,105.06,102.6,25.4
2010-09-30,9.90,13.3,103.600,,4.33,24.5,18.84,24.38,25.08,20.87,...,7.10,106.1,6.2,8.1,11.900000,10.5,9.32,103.52,101.9,15.9
2010-12-31,9.90,13.5,104.600,,5.93,24.5,19.10,25.94,17.87,21.19,...,9.47,107.9,5.0,5.1,0.310000,11.3,9.84,101.79,103.6,18.3
2011-03-31,10.20,14.8,105.383,,7.31,25.0,17.40,27.50,35.76,15.00,...,10.53,109.3,3.7,14.8,14.300000,12.3,10.69,102.98,102.5,27.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-30,4.60,5.4,100.400,0.1,-2.80,3.4,3.20,0.30,2.40,-7.40,...,-2.20,98.3,-9.0,6.0,2.315156,4.5,5.00,92.40,,-22.7
2024-12-31,4.85,,1.800,0.7,-2.20,3.0,6.40,11.00,,-6.50,...,,,,,,,,,,
2025-03-31,4.00,,,,,3.6,,,,,...,,,,,,,,,,
2025-06-30,4.55,,,,,3.9,,,,,...,,,,,,,,,,


In [3]:
# The method used here comes from this article: https://www.ecb.europa.eu/pub/pdf/scpwps/ecbwp499.pdf.
# This is just a simple attempt, not a reproduction of the article.
# As concluded in the article, the HP filter obtains more stable predicted values.

# z-score bound of a two-sided 99% confidence level: component values further
# than this many standard deviations from the component mean are dropped.
OUTLIER_Z = 2.576


def _drop_outliers(component, z=OUTLIER_Z):
    """Return `component` without the points lying more than `z` standard deviations from its mean."""
    return component[(component - component.mean()).abs() <= z * component.std()]


# Use the Hodrick-Prescott filter to separate each series into a trend and a cycle,
# then rebuild a smoothed series as trend + (outlier-trimmed) seasonal part of the cycle.
fitted_columns = {}

for column in macro_data:
    y = macro_data[column].dropna()
    cycle, trend = sm.tsa.filters.hpfilter(y)
    try:
        # NOTE(review): STL is called without an explicit period, so it presumably
        # relies on the index having an inferable frequency - confirm.
        result = STL(cycle).fit()
        seasonal = result.seasonal
        seasonal = seasonal.rename(column)
        # Dropped outliers contribute 0 (fill_value), i.e. the fitted value is the bare trend there.
        fitted = trend.add(_drop_outliers(seasonal), fill_value=0)
    except Exception as exc:
        # Best-effort fallback: when the seasonal decomposition fails, use the
        # outlier-trimmed HP cycle instead. `Exception` (rather than a bare
        # `except`) keeps KeyboardInterrupt/SystemExit working, and the reason
        # is reported instead of being silently swallowed.
        print(f'STL decomposition failed for {column} ({exc}); using the outlier-trimmed HP cycle instead.')
        # Empty placeholder, only needed by the optional plot below.
        seasonal = pd.DataFrame(index=trend.index, columns=[column])
        fitted = trend.add(_drop_outliers(cycle), fill_value=0)
    
    # # Create a graphics object
    # fig, ax1 = plt.subplots(figsize=(10, 5))
    # ax1.plot(y.index, y, label=column, color='g')
    # ax1.plot(y.index, fitted, label='Fitted', color='b')
    # ax1.set_xlabel('Date')
    # ax1.set_ylabel('Value and Fitted')
    # ax1.legend(loc='upper left')

    # # Create a secondary axis
    # ax2 = ax1.twinx() 
    # ax2.plot(y.index, seasonal, label='Seasonal', color='gray', linestyle='--')
    # ax2.set_ylabel('Seasonal', color='k')
    # ax2.legend(loc='upper right')
    # plt.title('Original, Seasonal and Fitted (HP filter)')
    # plt.show()

    fitted_columns[column] = fitted.rename(column)

# Assemble all fitted series in one step (instead of merging inside the loop),
# aligned on the index of the combined macro table.
macro_data_fitted = pd.DataFrame(index=macro_data.index)
if fitted_columns:
    macro_data_fitted = macro_data_fitted.join(pd.DataFrame(fitted_columns), how='left')

macro_data_fitted.to_csv('./macro_data_fitted.csv')

In [4]:
# Silence statsmodels UserWarnings (e.g. maximum-likelihood convergence warnings)
# so that the output of the fitting loop stays readable.
warnings.filterwarnings("ignore", category=UserWarning, module='statsmodels')

# Work on the filtered series, conformed to a quarterly (December year-end) frequency.
df = macro_data_fitted.asfreq('Q-DEC')

# Container for the model results, indexed by the full historical + forecast date range.
predict_data = pd.DataFrame(index=date_index)
print(f'macrofoctor_list：{df.columns.values}')

# Start the timer for the whole prediction process.
start_t = time.perf_counter()

# loss function, You can customize your own loss function
def func_calc_rmse(y, y_pred, window=slice(-16, -4)):
    """Root-mean-square error between `y` and `y_pred` over a validation window.

    Parameters
    ----------
    y, y_pred : sliceable numeric containers (e.g. pandas Series or numpy arrays)
        Observed and predicted values.
    window : slice, optional
        Slice applied to both inputs before comparing them. Defaults to
        ``slice(-16, -4)``, the window that was previously hard-coded.

    Returns
    -------
    float
        The RMSE over the window, or ``np.inf`` when it cannot be computed
        (an error occurs, the window is empty, or the result is NaN/inf), so
        that an invalid candidate can never compare as the best one.
    """
    try:
        diff = y[window] - y_pred[window]
        if len(diff) == 0:
            # Input shorter than the window: nothing to score.
            return np.inf
        res_rmse = np.sqrt(np.mean(diff ** 2))
        # A NaN score (e.g. no observed data in the window) is reported as inf.
        if np.ndim(res_rmse) == 0 and not np.isfinite(res_rmse):
            return np.inf
    except Exception:
        # Best effort: any failure to score means "worst possible" rather than
        # a crash, without swallowing KeyboardInterrupt/SystemExit.
        return np.inf
    return res_rmse

# Unobserved components model
# The method used here is mentioned in this article: https://www.ecb.europa.eu/pub/pdf/scpwps/ecb.wp2832~51322fbd6a.en.pdf
# This is just a simple attempt, not a reproduction of the article.

# Forecast horizon: at least the original 12 quarters, and more when that is
# not enough to reach the end of `date_index` (i.e. `forecast_date`).
MIN_FORECAST_STEPS = 12
forecast_steps = max(MIN_FORECAST_STEPS, int((date_index > df.index[-1]).sum()))

# Predictions per column, assembled into `predict_data` once after the loop.
column_predictions = {}

for column in df.columns:
    y = df[column]
    print(f'\n\nFitting model of: {column}')

    # Reset the selection state for every column, so that a column without any
    # valid candidate can never silently reuse the previous column's model.
    best_rmse = float('inf')
    best_result = None

    # Candidate models differ only in the autoregressive order (0 to 3);
    # level, seasonal and cycle components are fixed.
    for i in range(4):
        model = UnobservedComponents(y,
                                     level='smooth trend',
                                     seasonal=4,
                                     cycle=True,
                                     autoregressive=i)

        # fit model
        result = model.fit()

        # Score the in-sample fitted values with the loss function.
        rmse = func_calc_rmse(y, result.fittedvalues)

        # Keep the candidate with the lowest loss (NaN/inf never wins).
        if rmse < best_rmse:
            best_rmse = rmse
            best_result = result

    if best_result is None:
        print(f'No candidate model produced a valid RMSE for {column}; no prediction is generated for it.')
        continue

    print(best_result.summary())
    # In-sample fitted values followed by the out-of-sample forecast.
    column_predictions[column] = pd.concat([best_result.fittedvalues,
                                            best_result.forecast(steps=forecast_steps)])

# Align all predictions on the target date range in a single step.
if column_predictions:
    predict_data = predict_data.join(pd.DataFrame(column_predictions), how='left')

# Output prediction results
predict_data = macro_data.combine_first(predict_data) # This does not change the value of historical data
# predict_data = macro_data_fitted.combine_first(predict_data) # If you want to use filtered historical values
predict_data.to_csv('./3.4_predict_data.csv')

dur = (time.perf_counter() - start_t) / 60
print(f'\nThe prediction process took a total of {dur:,.2f} minutes.')

macrofoctor_list：['GDP_Const_Qtr' 'Ind_Pro_Mth' 'CPI_Mth' 'Core_CPI_Mth' 'PPI_Mth'
 'FAI_Cum' 'Retail_Sales_Mth' 'Imports_Mth' 'Exports_Mth' 'M1' 'M2'
 'Loans_RMB' 'Social_Financing' 'Manuf_PMI' 'Ind_Revenue_Cum' 'PPIRM_Mth'
 'CGPI_Mth' '70_Cities_Price_Mth' 'Power_Gen_Mth' 'Freight_Mth'
 'Urban_Income_Cum' 'Urban_Exp_Cum' 'Nat_Housing_Index' 'Macro_Eco_Index'
 'Housing_Sales_Cum']


Fitting model of: GDP_Const_Qtr
                           Unobserved Components Results                            
Dep. Variable:                GDP_Const_Qtr   No. Observations:                   63
Model:                         smooth trend   Log Likelihood                 -29.759
                   + stochastic seasonal(4)   AIC                             75.518
                                    + cycle   BIC                             91.721
                                    + AR(3)   HQIC                            81.800
Date:                      Tue, 17 Dec 2024                            

  test_statistic = numer_squared_sum / denom_squared_sum


                           Unobserved Components Results                            
Dep. Variable:              Ind_Revenue_Cum   No. Observations:                   63
Model:                         smooth trend   Log Likelihood                 -89.210
                   + stochastic seasonal(4)   AIC                            192.421
                                    + cycle   BIC                            206.598
                                    + AR(2)   HQIC                           197.917
Date:                      Tue, 17 Dec 2024                                         
Time:                              16:43:50                                         
Sample:                          03-31-2010                                         
                               - 09-30-2025                                         
Covariance Type:                        opg                                         
                       coef    std err          z      P>|z|     