*https://github.com/Aubrey-Bermuda-CA/IFRS9-MODELING-CHINA*

*2.1 Function Description: Iterate all macroeconomic factors and endog*

In [1]:
import time
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.diagnostic as dg

In [2]:
# Load the input tables. In both CSV files the first column is parsed as
# dates and used as the row index.
macro_data = pd.read_csv(
    './macrofactor_historical_data.csv',
    index_col=[0],
    parse_dates=[0],
)
endog_data = pd.read_csv(
    'historical_endog.csv',
    index_col=[0],
    parse_dates=[0],
)

In [3]:
# Custom function: add lagged copies (one per requested lag) of every column in a DataFrame
def calculate_lags(df, lags):
    """Return ``df`` extended with lagged copies of each of its columns.

    For every column ``c`` of ``df`` and every ``lag`` in ``lags`` a column
    named ``f'{c}_lag{lag}'`` holding ``df[c].shift(lag)`` is added. The new
    columns come after the original ones, grouped by source column and, within
    a group, ordered as in ``lags``.

    The input frame is not modified. Calling the function again on the same
    input therefore yields the same result instead of stacking lags on top of
    previously generated lag columns. All new columns are attached with a
    single ``pd.concat`` rather than one insertion at a time, which is what
    makes pandas emit its "DataFrame is highly fragmented" PerformanceWarning
    on wide inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Generated names are assumed not to collide with columns
        already present in ``df``.
    lags : iterable of int
        Number of periods to shift by, passed to ``Series.shift``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the lagged ones.
    """
    lags = list(lags)  # allow one-shot iterables; it is re-read for every column
    lagged = {
        f'{column}_lag{lag}': df[column].shift(lag)
        for column in df.columns
        for lag in lags
    }
    if not lagged:
        # Nothing to add (no columns or no lags): still hand back a new object.
        return df.copy()
    return pd.concat([df, pd.DataFrame(lagged, index=df.index)], axis=1)

# Build lag-1 through lag-4 versions of every macroeconomic factor column
lags = list(range(1, 5))
macro_data_lag = calculate_lags(df=macro_data, lags=lags)

  df[f'{column}_lag{lag}'] = df[column].shift(lag)


In [4]:
df = macro_data_lag

# get macro factor group information
# Each column of the lagged frame is assigned to a base factor: the part of
# its name before '_lag' (an unlagged column is its own base factor).
group_df = pd.DataFrame({'name_lag': df.columns.tolist()})
group_df['name'] = group_df['name_lag'].str.split('_lag').str[0]
# One column per base factor; the non-null cells of a column are the names of
# that factor's variants (the factor itself and its lags).
pivot_df = group_df.pivot(columns='name', values='name_lag')

# prepare data
GDP = 'GDP_Const_Qtr'  # GDP is used here as a mandatory item, but it is not necessary to do so.
gdp_series = pivot_df[GDP].dropna().tolist()
pivot_df = pivot_df.drop(GDP, axis=1)

# calculate the number of equations
# Every equation uses one GDP variant plus one variant from each factor of an
# unordered pair of the remaining factors, so the total is a sum of products
# of variant counts; there is no need to enumerate the triples to count them.
variant_counts = pivot_df.count().to_dict()  # non-null cells per factor
scale = len(gdp_series) * sum(
    int(variant_counts[first]) * int(variant_counts[second])
    for first, second in itertools.combinations(pivot_df.columns, 2)
)

# timer
i = 0
start = time.perf_counter()

# results columns
# V1..V3 are the regressor names, c*/p* the coefficients and p-values in the
# order V1, V2, V3, intercept (c0/p0), followed by the diagnostic statistics.
res_columns = [
    'V1', 'V2', 'V3', 'Intercept', 'Rsqr', 'AdjRsqr', 'F_pval',
    'c1', 'c2', 'c3', 'c0', 'p1', 'p2', 'p3', 'p0',
    'BGtest', 'ADtest', 'BPtest', 'V1_VIF', 'V2_VIF', 'V3_VIF',
]
results = []

# The dependent variable is the same for every equation, so select it once.
Y = endog_data[['endog']]

# iterate equations
# Here we take the OLS model as the basic model for example, and other models can also be customized.
# Hypothesis testing can also be customized as needed.
for factor in itertools.combinations(pivot_df, 2):
    # All variants (base column and lags) of the two non-GDP factors
    x1 = pivot_df[factor[0]].dropna()
    x2 = pivot_df[factor[1]].dropna()
    for cartesian in itertools.product(gdp_series, x1, x2):
        # define the variables
        regressors = list(cartesian)
        X = df[regressors]

        # Get sample interval: rows where all regressors and the target are present
        sample_range = pd.merge(X, Y, left_index=True, right_index=True).dropna()
        factors_range = sample_range[regressors]
        # adding the constant term as the last column; assign() builds a new
        # frame instead of writing into a slice of sample_range, which would
        # raise pandas' SettingWithCopyWarning
        x_range = factors_range.assign(Intercept=1)
        y_range = sample_range[['endog']]

        # performing the regression and fitting the model
        fit = sm.OLS(endog=y_range, exog=x_range).fit()
        resid = fit.resid

        Rsqr = fit.rsquared
        AdjRsqr = fit.rsquared_adj
        F_pVal = fit.f_pvalue
        # Ordered like the columns of x_range: the three regressors, then Intercept
        coef = list(fit.params)
        pVal = list(fit.pvalues)

        # Breusch Godfrey test Autocorrelation p>0.05
        bgtest = dg.acorr_breusch_godfrey(fit)[1]

        # Anderson-Darling test Distribution p>0.05
        adtest = dg.normal_ad(resid)[1]

        # Breusch–Pagan test Heteroscedasticity p>0.05
        bptest = dg.het_breuschpagan(resid, exog_het=fit.model.exog)[1]

        # Variance inflation factor Multicollinearity <10
        # OLS, which is used in the python variance inflation factor calculation, does not add an intercept by default.
        # The VIFs are therefore taken as the diagonal of the inverse of the
        # regressors' correlation matrix (the constant column is excluded).
        VIF = pd.Series(np.linalg.inv(factors_range.corr().to_numpy()).diagonal(), index=regressors)

        # result: one row in the order of res_columns
        result = regressors + ['Intercept'] + [Rsqr, AdjRsqr, F_pVal] + coef + pVal + [bgtest, adtest, bptest] + list(VIF)
        results.append(result)

        # Progress bar, redrawn in place on the same output line
        i += 1
        a = round(i / scale * 50) * "#"
        b = round((1 - i / scale) * 50) * " "
        c = (i / scale) * 100
        dur = time.perf_counter() - start
        print("\r{:^3.0f}%[{}->{}]{:.2f}s, iterating {} equation, a total of {}".format(c, a, b, dur, i, scale), end="")

# Collect one row per fitted equation and save the table as CSV
equation = pd.DataFrame(results, columns=res_columns)
equation.to_csv('equation.csv')

100%[##################################################->]139.01s, iterating 34500 equation, a total of 345000