# Project #2: Time Series Model

In [18]:
# Project #2: Time Series Model by Chris Yu Yan

import numpy as np
import pandas as pd
import datetime as dt
from statsmodels.tsa.stattools import adfuller
import pandas_datareader.data as web
import statsmodels.api as sm
from itertools import combinations   

### Step 1

Load and read the data

In [19]:
# Folder that holds the project workbooks. Edit this single constant to run the notebook
# on another machine instead of touching every read call.
DATA_DIR = '/Users/alukadawn/Desktop/Columbia/2024 Fall/CRA/Projects/Project #2 -Time Series Model'

# pd.read_excel already returns a DataFrame, so the frames are used as returned.
# Later cells read the 'date', 'chargeoffs' and 'loans' columns from both workbooks.
card_data = pd.read_excel(f'{DATA_DIR}/card.xlsx')
cre_data = pd.read_excel(f'{DATA_DIR}/CRE.xlsx')


Convert the card and CRE data into charge-off rates (charge-offs divided by loans, expressed as a fraction)

In [20]:
# Charge-off rate = charge-offs divided by outstanding loans (a fraction, not scaled by 100).
card_data['card_chargeoff_pct'] = card_data['chargeoffs'].div(card_data['loans'])
cre_data['cre_chargeoff_pct'] = cre_data['chargeoffs'].div(cre_data['loans'])

Use ADF to test stationarity

In [21]:
# Augmented Dickey-Fuller test on each charge-off rate series.
# Only the first two entries of the result (test statistic, p-value) are reported.
card_adf_result = adfuller(card_data['card_chargeoff_pct'])
card_adf_statistic, card_adf_p_value = card_adf_result[:2]

cre_adf_result = adfuller(cre_data['cre_chargeoff_pct'])
cre_adf_statistic, cre_adf_p_value = cre_adf_result[:2]

card_adf_output = f"Card Charge-Off ADF Statistic: {card_adf_statistic}, p-value: {card_adf_p_value}"
cre_adf_output = f"CRE Charge-Off ADF Statistic: {cre_adf_statistic}, p-value: {cre_adf_p_value}"
(card_adf_output, cre_adf_output)

('Card Charge-Off ADF Statistic: -2.836318810006879, p-value: 0.053270234034481015',
 'CRE Charge-Off ADF Statistic: -1.58897552846476, p-value: 0.4891290849010973')

For both series the p-value is greater than 0.05, so the ADF test fails to reject the null hypothesis of non-stationarity (a unit root) at the 5% level.
However, the card p-value (0.053) is only slightly above the 5% threshold, so the null hypothesis would be rejected at the 10% level.


Therefore, we treat the card series as stationary and take the first difference of the CRE series to make it stationary.

In [22]:
# Keep the undifferenced rate in its own column so that re-running this cell always
# differences the level exactly once, rather than differencing an already-differenced column.
if 'cre_chargeoff_level' not in cre_data.columns:
    cre_data['cre_chargeoff_level'] = cre_data['cre_chargeoff_pct']
cre_data['cre_chargeoff_pct'] = cre_data['cre_chargeoff_level'].diff()

# diff() leaves a NaN in the first row; drop it before running the ADF test.
cre_adf_result = adfuller(cre_data['cre_chargeoff_pct'].dropna())

cre_adf_statistic = cre_adf_result[0]
cre_adf_p_value = cre_adf_result[1]

cre_adf_output = f"CRE Charge-Off ADF Statistic: {cre_adf_statistic}, p-value: {cre_adf_p_value}"
cre_adf_output

'CRE Charge-Off ADF Statistic: -3.332627529259299, p-value: 0.013481545256871296'

The cre data is stationary after taking the first difference and the p-value from ADF test is less than 0.05. Now we can proceed with further calculations.

### Step 2

Download unemployment data (UNRATE), oil prices (DCOILBRENTEU), US GDP (GDP), 10-year minus 2-year treasury rates (T10Y2Y), and a volatility series of your choice. Here we will go with VIX.

In [23]:
# Download each macro series from FRED, starting in 2000. The order of the series ids
# matches the order of the names on the left-hand side.
unrate, oil_price, gdp, t10y2y, vix = (
    web.DataReader(series_id, "fred", start='2000-01-01')
    for series_id in ("UNRATE", "DCOILBRENTEU", "GDP", "T10Y2Y", "VIXCLS")
)

Pull the economic data into a pandas data frame and find the ADF test statistics, adjust the data accordingly.

In [24]:
# The unrate data is stamped on the first of each month; moving the stamp back one day puts
# it on a month-end so it can be matched to the charge-off dates by (month, year).
# NOTE(review): FRED appears to stamp a monthly observation with the first day of the month
# it refers to, so after this shift the April reading carries a March date - confirm that
# this alignment is intended.
unrate['date'] = (unrate.index - pd.Timedelta(days=1)).date

# .copy() makes the filtered frame independent of the downloaded one, so the column
# assignments below do not write into a slice (pandas' SettingWithCopyWarning).
unrate = unrate[unrate['date'] <= dt.date(2020, 1, 1)].copy()
unrate['year'] = [d.year for d in unrate['date']]
unrate['month'] = [d.month for d in unrate['date']]

# Keep non-missing readings whose shifted date falls in a calendar quarter-end month.
unrate = unrate[unrate['UNRATE'].notna() & unrate['month'].isin([3, 6, 9, 12])]
unrate_adf_p_value = adfuller(unrate['UNRATE'])[1]
unrate_adf_p_value
# It's not stationary, so we will take the first difference to make it stationary.

0.3498738676195984

In [25]:
# Build a new frame rather than assigning into the filtered slice produced by the previous
# cell (avoids SettingWithCopyWarning). The NaN that diff() leaves in the first row is dropped.
# NOTE: this cell is not re-runnable on its own - executing it twice differences the series
# twice. Re-run from the download cell to reset `unrate` first.
unrate = unrate.assign(UNRATE=unrate['UNRATE'].diff()).dropna(subset=['UNRATE'])
unrate_adf_p_value = adfuller(unrate['UNRATE'])[1]
unrate_adf_p_value

0.011544625714323992

Now that we have adjusted the unemployment rate series and confirmed its stationarity, let's proceed to the other series.

In [26]:
# Oil Price
# Shift every daily stamp back one day, the same convention used for the other series.
oil_price['date'] = (oil_price.index - pd.Timedelta(days=1)).date

# .copy() after each boolean filter keeps the later column assignments from writing into a
# slice of the previous frame (pandas' SettingWithCopyWarning).
oil_price = oil_price[oil_price['date'] <= dt.date(2020, 1, 1)].copy()
oil_price['year'] = [d.year for d in oil_price['date']]
oil_price['month'] = [d.month for d in oil_price['date']]
oil_price['day'] = [d.day for d in oil_price['date']]

# Collapse the daily data to one row per calendar month (last available value of each
# column), then keep the quarter-end months that actually have a price.
oil_price = oil_price.resample('ME').last()
oil_price = oil_price[
    oil_price['DCOILBRENTEU'].notna() & oil_price['month'].isin([3, 6, 9, 12])
].copy()

# We tried .diff(), but the result was very messy because the series is observed daily,
# so we use the growth rate relative to the previous quarter-end price instead.
oil_price['oil_price_lag'] = oil_price['DCOILBRENTEU'].shift()
oil_price['oil_price_growth'] = (
    (oil_price['DCOILBRENTEU'] - oil_price['oil_price_lag']) / oil_price['oil_price_lag']
)
oil_price = oil_price[oil_price['oil_price_growth'].notna()]
oil_price_adf_p_value = adfuller(oil_price['oil_price_growth'])[1]
oil_price_adf_p_value

3.4730305410138694e-13

Create a GDP growth variable: $\text{growth}_t = (GDP_t - GDP_{t-1}) / GDP_{t-1}$.

In [27]:
# US GDP
# Move each stamp back one day so it lands on a quarter-end month for the (month, year) merge.
# NOTE(review): FRED appears to stamp a quarterly observation with the first day of the
# quarter it measures, so after this shift the Q2 figure carries a March 31 date, i.e. it is
# matched to the quarter *before* the one it describes. Confirm this alignment is intended.
gdp['date'] = (gdp.index - pd.Timedelta(days=1)).date

# .copy() keeps the assignments below from writing into a slice of the downloaded frame
# (pandas' SettingWithCopyWarning).
gdp = gdp[gdp['date'] <= dt.date(2020, 1, 1)].copy()
gdp['month'] = [d.month for d in gdp['date']]
gdp['year'] = [d.year for d in gdp['date']]

# Quarter-over-quarter growth: (GDP_t - GDP_{t-1}) / GDP_{t-1}. The first row has no lag.
gdp['gdp_lag'] = gdp['GDP'].shift()
gdp['gdp_growth'] = (gdp['GDP'] - gdp['gdp_lag']) / gdp['gdp_lag']
gdp = gdp[gdp['gdp_growth'].notna()]
gdp_adf_p_value = adfuller(gdp['gdp_growth'])[1]
gdp_adf_p_value


5.035206322282689e-06

In [28]:
# 10-year minus 2-year treasury rates
t10y2y['date'] = (t10y2y.index - pd.Timedelta(days=1)).date

# NOTE(review): this cut-off (2020-03-01) is later than the 2020-01-01 used for UNRATE, oil
# and GDP. Any extra rows are not matched by the later left-merge onto the UNRATE dates, but
# they do enter the ADF test below - confirm the later cut-off is intended.
# .copy() after each boolean filter avoids assigning into a slice (SettingWithCopyWarning).
t10y2y = t10y2y[t10y2y['date'] <= dt.date(2020, 3, 1)].copy()
t10y2y['year'] = [d.year for d in t10y2y['date']]
t10y2y['month'] = [d.month for d in t10y2y['date']]

# One row per calendar month (last available value), then quarter-end months with a value.
t10y2y = t10y2y.resample('ME').last()
t10y2y = t10y2y[t10y2y['T10Y2Y'].notna() & t10y2y['month'].isin([3, 6, 9, 12])].copy()

# Growth relative to the previous quarter-end spread. A lagged spread of exactly 0 would give
# +/-inf, which the NaN filter would keep and adfuller cannot handle, so it is mapped to NaN.
# NOTE(review): the spread can be near zero or negative, so percentage growth can be extreme
# (e.g. 74.0 in the merged table); a plain first difference may be a better transformation.
t10y2y['t10y2y_lag'] = t10y2y['T10Y2Y'].shift()
t10y2y['t10y2y_growth'] = (
    (t10y2y['T10Y2Y'] - t10y2y['t10y2y_lag']) / t10y2y['t10y2y_lag']
).replace([np.inf, -np.inf], np.nan)
t10y2y = t10y2y[t10y2y['t10y2y_growth'].notna()]
t10y2y_adf_p_value = adfuller(t10y2y['t10y2y_growth'])[1]
t10y2y_adf_p_value


3.3406876459272497e-08

In [29]:
# Volatility, here we chose VIX
vix['date'] = (vix.index - pd.Timedelta(days=1)).date

# NOTE(review): this cut-off (2020-03-01) is later than the 2020-01-01 used for UNRATE, oil
# and GDP. Any extra rows are not matched by the later left-merge onto the UNRATE dates, but
# they do enter the ADF test below - confirm the later cut-off is intended.
# .copy() after each boolean filter avoids assigning into a slice (SettingWithCopyWarning).
vix = vix[vix['date'] <= dt.date(2020, 3, 1)].copy()
vix['year'] = [d.year for d in vix['date']]
vix['month'] = [d.month for d in vix['date']]

# One row per calendar month (last available value). dropna() already removes every row
# with a missing VIXCLS, so no separate missing-value filter is needed afterwards.
vix = vix.resample('ME').last().dropna()
vix = vix[vix['month'].isin([3, 6, 9, 12])].copy()

# Growth relative to the previous quarter-end level; the first row has no lag.
vix['vix_lag'] = vix['VIXCLS'].shift()
vix['vix_growth'] = (vix['VIXCLS'] - vix['vix_lag']) / vix['vix_lag']
vix = vix[vix['vix_growth'].notna()]
vix_adf_p_value = adfuller(vix['vix_growth'])[1]
vix_adf_p_value


0.008504485806052517

Combine the economic data with charge-off data

In [30]:
# Left-join every macro series onto the unemployment frame by (month, year); quarters with
# no match in a series are left as NaN in that series' column.
economic_data = (
    unrate
    .merge(oil_price[['oil_price_growth', 'month', 'year']], how='left', on=['month', 'year'])
    .merge(gdp[['gdp_growth', 'month', 'year']], how='left', on=['month', 'year'])
    .merge(t10y2y[['t10y2y_growth', 'month', 'year']], how='left', on=['month', 'year'])
    .merge(vix[['vix_growth', 'month', 'year']], how='left', on=['month', 'year'])
)

print(economic_data.head())

   UNRATE        date  year  month  oil_price_growth  gdp_growth  \
0    -0.2  2000-03-31  2000      3               NaN    0.024549   
1     0.2  2000-06-30  2000      6          0.316931    0.006874   
2    -0.1  2000-09-30  2000      9         -0.100063    0.011395   
3     0.3  2000-12-31  2000     12         -0.205489    0.003305   
4     0.2  2001-03-31  2001      3          0.040744    0.012299   

   t10y2y_growth  vix_growth  
0            NaN         NaN  
1      -0.255319   -0.189548  
2      -0.485714    0.052712  
3      -1.055556    0.305299  
4      74.000000    0.066667  


In [31]:
# Derive the (month, year) merge keys from each charge-off table's date column.
card_data['date'] = pd.to_datetime(card_data['date'])
card_data['year'] = card_data['date'].dt.year
card_data['month'] = card_data['date'].dt.month

cre_data['date'] = pd.to_datetime(cre_data['date'])
cre_data['year'] = cre_data['date'].dt.year
cre_data['month'] = cre_data['date'].dt.month

# Drop charge-off columns left behind by an earlier run of this cell. Without this, a re-run
# would merge them in a second time and pandas would rename them with _x / _y suffixes.
economic_data = economic_data.drop(
    columns=['card_chargeoff_pct', 'cre_chargeoff_pct'], errors='ignore'
)

# Join CRE onto card first, then attach both to the macro table by (month, year).
economic_data = pd.merge(economic_data,
                      pd.merge(card_data[['card_chargeoff_pct', 'month', 'year']],
                               cre_data[['cre_chargeoff_pct', 'month', 'year']],
                               how='left', on=['month', 'year']),
                      how='left', on=['month', 'year'])
print(economic_data.head())


   UNRATE        date  year  month  oil_price_growth  gdp_growth  \
0    -0.2  2000-03-31  2000      3               NaN    0.024549   
1     0.2  2000-06-30  2000      6          0.316931    0.006874   
2    -0.1  2000-09-30  2000      9         -0.100063    0.011395   
3     0.3  2000-12-31  2000     12         -0.205489    0.003305   
4     0.2  2001-03-31  2001      3          0.040744    0.012299   

   t10y2y_growth  vix_growth  card_chargeoff_pct  cre_chargeoff_pct  
0            NaN         NaN                 NaN                NaN  
1      -0.255319   -0.189548                 NaN                NaN  
2      -0.485714    0.052712                 NaN                NaN  
3      -1.055556    0.305299                 NaN                NaN  
4      74.000000    0.066667            0.012672                NaN  


Run every possible AR(1) model with three factors (one lag of the dependent variable plus three macroeconomic factors).

In [32]:
# Create lags for the charge-offs respectively
# One-row lag of each charge-off series; this is the AR(1) term of the regressions.
economic_data['card_chargeoff_lag'] = economic_data['card_chargeoff_pct'].shift(1)
economic_data['cre_chargeoff_lag'] = economic_data['cre_chargeoff_pct'].shift(1)

# Candidate macro regressors from which the three-factor combinations are drawn
factors = [
    'UNRATE',
    'oil_price_growth',
    'gdp_growth',
    't10y2y_growth',
    'vix_growth',
]

In [33]:
def ar1_models(data, dependent_var, lag_var, factors, n_factors=3):
    """Fit an OLS model for every factor combination and return the best by R-squared.

    Each model regresses ``dependent_var`` on a constant, ``lag_var`` and one
    combination of ``n_factors`` columns drawn from ``factors``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``dependent_var``, ``lag_var`` and every name in ``factors``.
    dependent_var : str
        Column to explain.
    lag_var : str
        Column holding the lagged dependent variable (the AR(1) term).
    factors : sequence of str
        Candidate regressor columns.
    n_factors : int, default 3
        Number of factors in each model.

    Returns
    -------
    tuple
        ``(best_factors, best_r_squared, best_model)``. If no model could be fitted
        (fewer than ``n_factors`` candidates, or too few usable rows), the result is
        ``(None, -inf, None)``.

    Notes
    -----
    Rows are dropped per combination, so candidate models may be fitted on slightly
    different samples when the factors have different missing-value patterns.
    """
    best_r_squared = -np.inf
    best_model = None
    best_factors = None

    # combinations() is consumed lazily; there is no need to build the full list first.
    for combo in combinations(factors, n_factors):
        X_vars = [lag_var] + list(combo)

        # Treat +/-inf (e.g. a growth rate with a zero denominator) as missing: dropna()
        # alone would keep those rows and OLS cannot be estimated on them.
        model_data = (
            data[[dependent_var] + X_vars]
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
        )

        # The model has len(X_vars) + 1 parameters (regressors plus the constant). With no
        # more rows than parameters the fit is degenerate and would win with a spurious
        # perfect R-squared, so skip it.
        if len(model_data) <= len(X_vars) + 1:
            continue

        X = sm.add_constant(model_data[X_vars])
        y = model_data[dependent_var]
        model = sm.OLS(y, X).fit()

        # Keep the specification with the highest R-squared seen so far.
        if model.rsquared > best_r_squared:
            best_r_squared = model.rsquared
            best_model = model
            best_factors = combo

    return best_factors, best_r_squared, best_model

# Search the factor combinations for the card charge-off rate and report the winner.
best_card_factors, best_card_r_squared, best_card_model = ar1_models(
    data=economic_data,
    dependent_var='card_chargeoff_pct',
    lag_var='card_chargeoff_lag',
    factors=factors,
)

print(f"Best Card Model Factors: {best_card_factors}, R-squared: {best_card_r_squared:.4f}")
print(best_card_model.summary())

Best Card Model Factors: ('UNRATE', 'oil_price_growth', 'gdp_growth'), R-squared: 0.8667
                            OLS Regression Results                            
Dep. Variable:     card_chargeoff_pct   R-squared:                       0.867
Model:                            OLS   Adj. R-squared:                  0.859
Method:                 Least Squares   F-statistic:                     113.8
Date:                Fri, 11 Oct 2024   Prob (F-statistic):           7.28e-30
Time:                        14:37:37   Log-Likelihood:                 373.72
No. Observations:                  75   AIC:                            -737.4
Df Residuals:                      70   BIC:                            -725.8
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------

### Comment on Results
- The Card charge-offs model is quite strong, with an R-squared explaining 86.7% of the variance in charge-off percentages. 
- The model is driven mostly by the lagged dependent variable and the unemployment rate given both the Card charge-off lag and UNRATE have p-values less than 0.001, suggesting that the historical Card charge-off data and unemployment rate data are strong predictors of current charge-off rates.
- However, oil price growth and GDP growth don't add much explanatory power given their p-values are greater than 0.05, even though the model fits the data well overall.

In [34]:
# Search the factor combinations for the (first-differenced) CRE charge-off rate and
# report the winner.
best_cre_factors, best_cre_r_squared, best_cre_model = ar1_models(
    data=economic_data,
    dependent_var='cre_chargeoff_pct',
    lag_var='cre_chargeoff_lag',
    factors=factors,
)

print(f"\nBest CRE Model Factors: {best_cre_factors}, R-squared: {best_cre_r_squared:.4f}")
print(best_cre_model.summary())


Best CRE Model Factors: ('UNRATE', 'gdp_growth', 'vix_growth'), R-squared: 0.3709
                            OLS Regression Results                            
Dep. Variable:      cre_chargeoff_pct   R-squared:                       0.371
Model:                            OLS   Adj. R-squared:                  0.334
Method:                 Least Squares   F-statistic:                     10.17
Date:                Fri, 11 Oct 2024   Prob (F-statistic):           1.57e-06
Time:                        14:37:37   Log-Likelihood:                 456.79
No. Observations:                  74   AIC:                            -903.6
Df Residuals:                      69   BIC:                            -892.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### Comment on Results
- The CRE charge-offs model is weaker, with an R-squared explaining only 37.1% of the variance in charge-offs. 
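- The lagged change in CRE charge-offs, the change in the unemployment rate, and VIX growth are significant. The negative coefficients on the lagged term and on VIX growth indicate that a larger increase in CRE charge-offs in the previous period, or a larger rise in the volatility index, is associated with a smaller change in charge-offs in the current period. However, a rising unemployment rate still slightly increases CRE charge-offs in the current period. (Recall that the CRE series and the unemployment rate were both first-differenced above.)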
- The model doesn't perform as well as the card charge-off model, indicating that other factors likely drive CRE charge-offs.

### Step 3

Comment on what other factors might be useful for an exercise like this. Also, comment on what information you would need to make forecasts in the future using these models.

Some other factors that might improve the explanatory power of the models are:
- Interest Rates: Loan interest rates are key drivers of consumer and commercial borrowing costs. Higher interest rates may increase default risks as debt servicing becomes more expensive.
- Inflation Rates: Inflation affects the purchasing power of consumers and businesses. Rising inflation can lead to higher expenses, reducing the ability of borrowers to service debts.
- Credit Spread: The spread between corporate bond yields and government bond yields is often used as a measure of credit risk. A widening spread may indicate higher default risk.

Additional information needed to better forecast in the future:
- Updated Macroeconomic Data: To make accurate forecasts, the model needs the most up-to-date data on all macroeconomic variables used in the model.
- Scenario Projections and Stress Testing: We can use projected economic conditions, such as expected changes in GDP, interest rates, inflation, and unemployment, taken from the projections provided by central banks and credible financial institutions. Furthermore, stress testing under adverse economic conditions is also important for estimating how charge-offs might behave in worst-case scenarios.
- Historical Data for Model Re-Estimation: Over time, the relationships between variables might change, requiring periodic re-estimation of the models to ensure they remain accurate and relevant. Updated datasets with longer histories will help refine and improve the model's predictive accuracy.