In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Ridge regression of one yield-curve principal component on macro indicators.
# Steps: load the data, hold out 20% of the rows, standardise the predictors,
# fit Ridge(alpha=1.0), report test RMSE / R^2, cross-validate a small alpha
# grid, then print the fitted coefficients and the resulting formula.

TARGET = 'PC1'  # the principal component modelled in this cell (drives y and the printed formula)

# NOTE: machine-specific absolute path; the cell only runs where this file exists.
df = pd.read_excel("C:\\Users\\Bogdan\\OneDrive - University of Warwick\\Desktop\\Projects\\(GOV BONDS) Yield Curve Arbitrage\\Data\\Regress data.xlsx")

X = df.drop(columns=['PC1', 'PC2', 'PC3', 'Date'])  # predictors: every column except the PCs and the date
y = df[TARGET]  # the target variable

# Split X and y in a single call so their rows are partitioned together by
# construction (instead of relying on two separate calls sharing a seed).
# random_state only pins the shuffle so every run gives the same split; 42 has no special meaning.
# NOTE(review): the data has a 'Date' column, so the rows are presumably a time
# series; a shuffled split mixes past and future observations - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features: subtract the mean and divide by the std of each
# column, giving z-scores with mean 0 and std 1 on the training rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit = learn mean/std per column, transform = apply them
X_test_scaled = scaler.transform(X_test)  # no fit here: the test rows reuse the training mean/std

# Fit the ridge regression model. Ridge is used because the predictors are
# highly correlated (multicollinearity). It minimises the sum of squared
# residuals (the OLS cost) plus alpha times the sum of SQUARED coefficients, so
# a larger alpha shrinks the coefficients more; alpha=0 would reduce to OLS.
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train_scaled, y_train)

# Evaluate on the held-out rows.
y_pred = ridge_reg.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # typical miss, in the units of the target
# R^2 on the test rows: 1 is a perfect fit, and a negative value means the model
# does worse than always predicting the mean. The larger the test set, the more
# a good score can be trusted.
# NOTE: this variable would shadow sklearn.metrics.r2_score if that were ever imported.
r2_score = ridge_reg.score(X_test_scaled, y_test)

print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2_score}')

# Tune alpha with 5-fold cross-validation: the training rows are cut into 5
# subsets, the model is trained on 4 and validated on the 5th, rotating the
# validation subset, and the 5 scores are averaged.
# NOTE(review): the features were scaled using ALL training rows before the
# folds are formed, so each validation fold has influenced the scaling; putting
# the scaler and Ridge in a Pipeline would remove that leak.
alpha_values = {'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]}
ridge_cv = GridSearchCV(Ridge(), alpha_values, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

print(f'Best alpha: {ridge_cv.best_params_["alpha"]}')  # alpha with the best mean validation score
print(f'Best cross-validated score: {ridge_cv.best_score_}')  # mean validation R^2 at that alpha
print('')

# The test size and alpha remain modelling choices: the split should reflect
# the forecasting horizon and alpha how much shrinkage is wanted, so the grid
# search is a guide rather than the final word. The coefficients below come
# from ridge_reg (alpha fixed at 1.0), not from ridge_cv.

# Extract the coefficients and intercept
coefficients = ridge_reg.coef_
intercept = ridge_reg.intercept_

# Display the coefficients and intercept
print("Intercept (b0):", intercept)
for feature, coef in zip(X.columns, coefficients):
    print(f"Coefficient for {feature} (b_{feature}):", coef)

# Build the fitted formula. The model was trained on standardised features, so
# each coefficient multiplies the z-score of its feature, not the raw value;
# the terms are written as z(feature) to make that explicit.
formula = f"{TARGET} = " + f"{intercept:.4f}"
for feature, coef in zip(X.columns, coefficients):
    formula += f" + ({coef:.4f}) * z({feature})"

print(f"\nMathematical formula for {TARGET} (z(.) = feature standardised with the training mean/std):")
print(formula)

Root Mean Squared Error: 0.5534157399729221
R^2 Score: 0.7644238294806261
Best alpha: 1.0
Best cross-validated score: 0.8580311433595791

Intercept (b0): -0.16439816142176186
Coefficient for Inflation (b_Inflation): 0.2753833031080488
Coefficient for Real GDP growth (b_Real GDP growth): -0.15004377992505685
Coefficient for Unemployment rate (b_Unemployment rate): -0.15210662580791917
Coefficient for FF rate (b_FF rate): -0.23273985131752115
Coefficient for Debt (b_Debt): -0.16544316489726058
Coefficient for ISM Services PMI (b_ISM Services PMI): -0.04405496912507989
Coefficient for ISM Manufacturing PMI (b_ISM Manufacturing PMI): 0.0873395069905247
Coefficient for Economic Surprise Index (b_Economic Surprise Index): 0.15220443287733537
Coefficient for 10-Year Breakeven Inflation Rate (b_10-Year Breakeven Inflation Rate): -0.12294237931402326
Coefficient for SOFR (b_SOFR): -0.22799545464907667
Coefficient for M2 (b_M2): 0.01861981023512746
Coefficient for VIX (b_VIX): 0.0743578712420833

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Ridge regression of one yield-curve principal component on macro indicators.
# Steps: load the data, hold out 20% of the rows, standardise the predictors,
# fit Ridge(alpha=1.0), report test RMSE / R^2, cross-validate a small alpha
# grid, then print the fitted coefficients and the resulting formula.

TARGET = 'PC2'  # the principal component modelled in this cell (drives y and the printed formula)

# NOTE: machine-specific absolute path; the cell only runs where this file exists.
df = pd.read_excel("C:\\Users\\Bogdan\\OneDrive - University of Warwick\\Desktop\\Projects\\(GOV BONDS) Yield Curve Arbitrage\\Data\\Regress data.xlsx")

X = df.drop(columns=['PC1', 'PC2', 'PC3', 'Date'])  # predictors: every column except the PCs and the date
y = df[TARGET]  # the target variable

# Split X and y in a single call so their rows are partitioned together by
# construction (instead of relying on two separate calls sharing a seed).
# random_state only pins the shuffle so every run gives the same split; 42 has no special meaning.
# NOTE(review): the data has a 'Date' column, so the rows are presumably a time
# series; a shuffled split mixes past and future observations - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features: subtract the mean and divide by the std of each
# column, giving z-scores with mean 0 and std 1 on the training rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit = learn mean/std per column, transform = apply them
X_test_scaled = scaler.transform(X_test)  # no fit here: the test rows reuse the training mean/std

# Fit the ridge regression model. Ridge is used because the predictors are
# highly correlated (multicollinearity). It minimises the sum of squared
# residuals (the OLS cost) plus alpha times the sum of SQUARED coefficients, so
# a larger alpha shrinks the coefficients more; alpha=0 would reduce to OLS.
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train_scaled, y_train)

# Evaluate on the held-out rows.
y_pred = ridge_reg.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # typical miss, in the units of the target
# R^2 on the test rows: 1 is a perfect fit, and a negative value means the model
# does worse than always predicting the mean. The larger the test set, the more
# a good score can be trusted.
# NOTE: this variable would shadow sklearn.metrics.r2_score if that were ever imported.
r2_score = ridge_reg.score(X_test_scaled, y_test)

print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2_score}')

# Tune alpha with 5-fold cross-validation: the training rows are cut into 5
# subsets, the model is trained on 4 and validated on the 5th, rotating the
# validation subset, and the 5 scores are averaged.
# NOTE(review): the features were scaled using ALL training rows before the
# folds are formed, so each validation fold has influenced the scaling; putting
# the scaler and Ridge in a Pipeline would remove that leak.
alpha_values = {'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]}
ridge_cv = GridSearchCV(Ridge(), alpha_values, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

print(f'Best alpha: {ridge_cv.best_params_["alpha"]}')  # alpha with the best mean validation score
print(f'Best cross-validated score: {ridge_cv.best_score_}')  # mean validation R^2 at that alpha
print('')

# The test size and alpha remain modelling choices: the split should reflect
# the forecasting horizon and alpha how much shrinkage is wanted, so the grid
# search is a guide rather than the final word. The coefficients below come
# from ridge_reg (alpha fixed at 1.0), not from ridge_cv.

# Extract the coefficients and intercept
coefficients = ridge_reg.coef_
intercept = ridge_reg.intercept_

# Display the coefficients and intercept
print("Intercept (b0):", intercept)
for feature, coef in zip(X.columns, coefficients):
    print(f"Coefficient for {feature} (b_{feature}):", coef)

# Build the fitted formula. The model was trained on standardised features, so
# each coefficient multiplies the z-score of its feature, not the raw value;
# the terms are written as z(feature) to make that explicit.
formula = f"{TARGET} = " + f"{intercept:.4f}"
for feature, coef in zip(X.columns, coefficients):
    formula += f" + ({coef:.4f}) * z({feature})"

print(f"\nMathematical formula for {TARGET} (z(.) = feature standardised with the training mean/std):")
print(formula)

Root Mean Squared Error: 0.5684402110001177
R^2 Score: 0.5858852817378967
Best alpha: 1.0
Best cross-validated score: 0.5612052348345788

Intercept (b0): -0.07462830646156958
Coefficient for Inflation (b_Inflation): 0.011888895892125717
Coefficient for Real GDP growth (b_Real GDP growth): -0.011342971597812002
Coefficient for Unemployment rate (b_Unemployment rate): 0.01253633421567038
Coefficient for FF rate (b_FF rate): -0.018044066502428402
Coefficient for Debt (b_Debt): 0.1019312697159698
Coefficient for ISM Services PMI (b_ISM Services PMI): 0.00021189928918384504
Coefficient for ISM Manufacturing PMI (b_ISM Manufacturing PMI): 0.0024165747191477183
Coefficient for Economic Surprise Index (b_Economic Surprise Index): -0.024846157736671953
Coefficient for 10-Year Breakeven Inflation Rate (b_10-Year Breakeven Inflation Rate): 0.21003214156842698
Coefficient for SOFR (b_SOFR): -0.011428926884233165
Coefficient for M2 (b_M2): -0.034531249402018044
Coefficient for VIX (b_VIX): -0.00914

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Ridge regression of one yield-curve principal component on macro indicators.
# Steps: load the data, hold out 20% of the rows, standardise the predictors,
# fit Ridge(alpha=1.0), report test RMSE / R^2, cross-validate a small alpha
# grid, then print the fitted coefficients and the resulting formula.

TARGET = 'PC3'  # the principal component modelled in this cell (drives y and the printed formula)

# NOTE: machine-specific absolute path; the cell only runs where this file exists.
df = pd.read_excel("C:\\Users\\Bogdan\\OneDrive - University of Warwick\\Desktop\\Projects\\(GOV BONDS) Yield Curve Arbitrage\\Data\\Regress data.xlsx")

X = df.drop(columns=['PC1', 'PC2', 'PC3', 'Date'])  # predictors: every column except the PCs and the date
y = df[TARGET]  # the target variable

# Split X and y in a single call so their rows are partitioned together by
# construction (instead of relying on two separate calls sharing a seed).
# random_state only pins the shuffle so every run gives the same split; 42 has no special meaning.
# NOTE(review): the data has a 'Date' column, so the rows are presumably a time
# series; a shuffled split mixes past and future observations - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features: subtract the mean and divide by the std of each
# column, giving z-scores with mean 0 and std 1 on the training rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit = learn mean/std per column, transform = apply them
X_test_scaled = scaler.transform(X_test)  # no fit here: the test rows reuse the training mean/std

# Fit the ridge regression model. Ridge is used because the predictors are
# highly correlated (multicollinearity). It minimises the sum of squared
# residuals (the OLS cost) plus alpha times the sum of SQUARED coefficients, so
# a larger alpha shrinks the coefficients more; alpha=0 would reduce to OLS.
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train_scaled, y_train)

# Evaluate on the held-out rows.
y_pred = ridge_reg.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # typical miss, in the units of the target
# R^2 on the test rows: 1 is a perfect fit, and a negative value means the model
# does worse than always predicting the mean. The larger the test set, the more
# a good score can be trusted.
# NOTE: this variable would shadow sklearn.metrics.r2_score if that were ever imported.
r2_score = ridge_reg.score(X_test_scaled, y_test)

print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2_score}')

# Tune alpha with 5-fold cross-validation: the training rows are cut into 5
# subsets, the model is trained on 4 and validated on the 5th, rotating the
# validation subset, and the 5 scores are averaged.
# NOTE(review): the features were scaled using ALL training rows before the
# folds are formed, so each validation fold has influenced the scaling; putting
# the scaler and Ridge in a Pipeline would remove that leak.
alpha_values = {'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]}
ridge_cv = GridSearchCV(Ridge(), alpha_values, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

print(f'Best alpha: {ridge_cv.best_params_["alpha"]}')  # alpha with the best mean validation score
print(f'Best cross-validated score: {ridge_cv.best_score_}')  # mean validation R^2 at that alpha
print('')

# The test size and alpha remain modelling choices: the split should reflect
# the forecasting horizon and alpha how much shrinkage is wanted, so the grid
# search is a guide rather than the final word. The coefficients below come
# from ridge_reg (alpha fixed at 1.0), not from ridge_cv.

# Extract the coefficients and intercept
coefficients = ridge_reg.coef_
intercept = ridge_reg.intercept_

# Display the coefficients and intercept
print("Intercept (b0):", intercept)
for feature, coef in zip(X.columns, coefficients):
    print(f"Coefficient for {feature} (b_{feature}):", coef)

# Build the fitted formula. The model was trained on standardised features, so
# each coefficient multiplies the z-score of its feature, not the raw value;
# the terms are written as z(feature) to make that explicit.
formula = f"{TARGET} = " + f"{intercept:.4f}"
for feature, coef in zip(X.columns, coefficients):
    formula += f" + ({coef:.4f}) * z({feature})"

print(f"\nMathematical formula for {TARGET} (z(.) = feature standardised with the training mean/std):")
print(formula)

Root Mean Squared Error: 0.4384718585376833
R^2 Score: -1.207047606888867
Best alpha: 1.0
Best cross-validated score: 0.3498352668331353

Intercept (b0): 0.022213107834411203
Coefficient for Inflation (b_Inflation): 0.008304541553752293
Coefficient for Real GDP growth (b_Real GDP growth): 0.15013603798048772
Coefficient for Unemployment rate (b_Unemployment rate): -0.03694915781280638
Coefficient for FF rate (b_FF rate): -0.03869033849985782
Coefficient for Debt (b_Debt): -0.025595648858034458
Coefficient for ISM Services PMI (b_ISM Services PMI): 0.0024269723341735355
Coefficient for ISM Manufacturing PMI (b_ISM Manufacturing PMI): 0.021525584649443014
Coefficient for Economic Surprise Index (b_Economic Surprise Index): 0.03343252559673289
Coefficient for 10-Year Breakeven Inflation Rate (b_10-Year Breakeven Inflation Rate): 0.059732587186613784
Coefficient for SOFR (b_SOFR): -0.03950350167857114
Coefficient for M2 (b_M2): 0.036566024354549795
Coefficient for VIX (b_VIX): -0.056708448

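## So PC1 is explained well by the macro indicators I compiled (test R² ≈ 0.76) and PC2 only moderately (test R² ≈ 0.59), but PC3 is not: its test R² is negative (≈ −1.21), i.e. worse than simply predicting the mean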