In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load the regression dataset: predictor columns plus the PC1-PC3 columns and a 'Date' column.
# NOTE(review): this is an absolute path on one user's machine, so the cell only runs there;
# consider a configurable data directory.
data_path = "C:\\Users\\Bogdan\\OneDrive - University of Warwick\\Desktop\\Projects\\Yield Curve & Optimal Fly\\Data\\Regress data.xlsx"
df = pd.read_excel(data_path)

X = df.drop(columns=['PC1', 'PC2', 'PC3', 'Date'])  # predictors: every column except the PCs and the date
y = df['PC1']  # target variable

# Split features and target in a single call so the rows of X and y are partitioned with the
# same shuffled indices by construction (not merely because two calls share a seed).
# random_state pins the shuffle so every run yields the same train/test sets; 42 is arbitrary.
# NOTE(review): the data carries a 'Date' column, so the rows are presumably time-ordered; a
# shuffled split then mixes earlier and later observations. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features to z-scores (subtract the column mean, divide by the column std).
# The scaler is fitted on the training rows only and that same fit is reused for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit = learn per-column mean/std, transform = apply them
X_test_scaled = scaler.transform(X_test)  # transform only: reuse the training mean/std

# Fit a Lasso regression (L1 penalty), not Ridge: alpha multiplies the sum of the ABSOLUTE
# coefficients that is added to the squared-error loss. A larger alpha shrinks coefficients and
# drives some of them exactly to zero, which is why only a few features survive further down.
# The saved output of this cell shows solver convergence warnings
# (`cd_fast.enet_coordinate_descent`), so the iteration cap is raised above the library default.
lasso_max_iter = 10000
lasso_reg = Lasso(alpha=0.1, max_iter=lasso_max_iter)
lasso_reg.fit(X_train_scaled, y_train)

# Evaluate on the held-out test set.
y_pred = lasso_reg.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # typical error size, in the units of the target
# R^2 on the test set: 1 is a perfect fit, 0 matches predicting the mean, negative is worse than that.
# NOTE(review): this name would shadow sklearn.metrics.r2_score if that were ever imported.
r2_score = lasso_reg.score(X_test_scaled, y_test)

print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2_score}')

# Tune alpha with 5-fold cross-validation on the training set: each candidate is trained on four
# folds and scored on the fifth, rotating the validation fold, and the five scores are averaged.
# The variable is still called `ridge_cv` for compatibility, but the estimator is a Lasso.
# NOTE(review): the scaler above was fitted on the whole training set, so every validation fold
# contributed to the scaling statistics; wrapping scaler + Lasso in a Pipeline would avoid that.
alpha_values = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_cv = GridSearchCV(Lasso(max_iter=lasso_max_iter), alpha_values, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

print(f'Best alpha: {ridge_cv.best_params_["alpha"]}')  # alpha with the best mean validation score
print(f'Best cross-validated score: {ridge_cv.best_score_}')  # that mean score (the estimator's default score)
print('')

# The grid search only picks among the alphas listed above (0 is not a candidate) and does not
# choose the test size; the test size remains a modelling decision.

# Extract the coefficients and intercept.
# NOTE(review): these come from the fixed alpha=0.1 model, not from the tuned
# ridge_cv.best_estimator_; confirm that is the model meant to be reported.
coefficients = lasso_reg.coef_
intercept = lasso_reg.intercept_

# One row per feature with its coefficient and the coefficient's magnitude.
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients)
})

# Keep only the features the Lasso did not zero out, largest magnitude first.
coef_df = coef_df[coef_df['Abs_Coefficient'] > 0]
coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False)

# Display the intercept
print("Intercept (b0):", intercept)

# Display the non-zero coefficients
for feature, coef in zip(coef_df['Feature'], coef_df['Coefficient']):
    print(f"Coefficient for {feature} (b_{feature}): {coef}")

# Build the fitted equation from the non-zero coefficients. The model was trained on z-scored
# features, so each term is written as z(feature): the coefficient multiplies the standardised
# value (raw value minus training mean, divided by training std), not the raw value.
formula = f"PC1 = {intercept:.4f}" + "".join(
    f" + ({coef:.4f}) * z({feature})"
    for feature, coef in zip(coef_df['Feature'], coef_df['Coefficient'])
)

print("\nMathematical formula for PC1:")
print(formula)

Root Mean Squared Error: 0.33368488950587677
R^2 Score: 0.9143551160905226
Best alpha: 0.01
Best cross-validated score: 0.9183872185003233

Intercept (b0): -0.1643981614217566
Coefficient for FF rate (b_FF rate): -0.5608306869611592
Coefficient for 5-Year, 5-Year Forward Inflation Expectation Rate (b_5-Year, 5-Year Forward Inflation Expectation Rate): -0.2958926959114346
Coefficient for Moody's Seasoned Aaa Corporate Bond Yield (b_Moody's Seasoned Aaa Corporate Bond Yield): -0.19022807841888137
Coefficient for Liabilities and Capital: Liabilities: Deposits with F.R. Banks, Other Than Reserve Balances: U.S. Treasury, General Account: Week Average (b_Liabilities and Capital: Liabilities: Deposits with F.R. Banks, Other Than Reserve Balances: U.S. Treasury, General Account: Week Average): -0.18330725731546454
Coefficient for 30-Year Fixed Rate Mortgage Average in the US (b_30-Year Fixed Rate Mortgage Average in the US): -0.029207025294657633
Coefficient for Inflation (b_Inflation): 0.0148

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load the regression dataset: predictor columns plus the PC1-PC3 columns and a 'Date' column.
# NOTE(review): this is an absolute path on one user's machine, so the cell only runs there;
# consider a configurable data directory.
data_path = "C:\\Users\\Bogdan\\OneDrive - University of Warwick\\Desktop\\Projects\\Yield Curve & Optimal Fly\\Data\\Regress data.xlsx"
df = pd.read_excel(data_path)

X = df.drop(columns=['PC1', 'PC2', 'PC3', 'Date'])  # predictors: every column except the PCs and the date
y = df['PC2']  # target variable

# Split features and target in a single call so the rows of X and y are partitioned with the
# same shuffled indices by construction (not merely because two calls share a seed).
# random_state pins the shuffle so every run yields the same train/test sets; 42 is arbitrary.
# NOTE(review): the data carries a 'Date' column, so the rows are presumably time-ordered; a
# shuffled split then mixes earlier and later observations. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features to z-scores (subtract the column mean, divide by the column std).
# The scaler is fitted on the training rows only and that same fit is reused for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit = learn per-column mean/std, transform = apply them
X_test_scaled = scaler.transform(X_test)  # transform only: reuse the training mean/std

# Fit a Lasso regression (L1 penalty), not Ridge: alpha multiplies the sum of the ABSOLUTE
# coefficients that is added to the squared-error loss. A larger alpha shrinks coefficients and
# drives some of them exactly to zero, which is why only a few features survive further down.
# The saved output of this cell shows solver convergence warnings
# (`cd_fast.enet_coordinate_descent`), so the iteration cap is raised above the library default.
lasso_max_iter = 10000
lasso_reg = Lasso(alpha=0.1, max_iter=lasso_max_iter)
lasso_reg.fit(X_train_scaled, y_train)

# Evaluate on the held-out test set.
y_pred = lasso_reg.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # typical error size, in the units of the target
# R^2 on the test set: 1 is a perfect fit, 0 matches predicting the mean, negative is worse than that.
# NOTE(review): this name would shadow sklearn.metrics.r2_score if that were ever imported.
r2_score = lasso_reg.score(X_test_scaled, y_test)

print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2_score}')

# Tune alpha with 5-fold cross-validation on the training set: each candidate is trained on four
# folds and scored on the fifth, rotating the validation fold, and the five scores are averaged.
# The variable is still called `ridge_cv` for compatibility, but the estimator is a Lasso.
# NOTE(review): the scaler above was fitted on the whole training set, so every validation fold
# contributed to the scaling statistics; wrapping scaler + Lasso in a Pipeline would avoid that.
alpha_values = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_cv = GridSearchCV(Lasso(max_iter=lasso_max_iter), alpha_values, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

print(f'Best alpha: {ridge_cv.best_params_["alpha"]}')  # alpha with the best mean validation score
print(f'Best cross-validated score: {ridge_cv.best_score_}')  # that mean score (the estimator's default score)
print('')

# The grid search only picks among the alphas listed above (0 is not a candidate) and does not
# choose the test size; the test size remains a modelling decision.

# Extract the coefficients and intercept.
# NOTE(review): these come from the fixed alpha=0.1 model, not from the tuned
# ridge_cv.best_estimator_; confirm that is the model meant to be reported.
coefficients = lasso_reg.coef_
intercept = lasso_reg.intercept_

# One row per feature with its coefficient and the coefficient's magnitude.
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients)
})

# Keep only the features the Lasso did not zero out, largest magnitude first.
coef_df = coef_df[coef_df['Abs_Coefficient'] > 0]
coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False)

# Display the intercept
print("Intercept (b0):", intercept)

# Display the non-zero coefficients
for feature, coef in zip(coef_df['Feature'], coef_df['Coefficient']):
    print(f"Coefficient for {feature} (b_{feature}): {coef}")

# Build the fitted equation from the non-zero coefficients. The model was trained on z-scored
# features, so each term is written as z(feature): the coefficient multiplies the standardised
# value (raw value minus training mean, divided by training std), not the raw value.
formula = f"PC2 = {intercept:.4f}" + "".join(
    f" + ({coef:.4f}) * z({feature})"
    for feature, coef in zip(coef_df['Feature'], coef_df['Coefficient'])
)

print("\nMathematical formula for PC2:")
print(formula)

Root Mean Squared Error: 0.49219778388230434
R^2 Score: 0.6895223159401789


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best alpha: 0.001
Best cross-validated score: 0.5753329522889647

Intercept (b0): -0.07462830646157541
Coefficient for 30-year Breakeven Inflation Rate (%) (b_30-year Breakeven Inflation Rate (%)): 0.19013424623357297
Coefficient for Personal Saving Rate (b_Personal Saving Rate): -0.18948485830345213
Coefficient for 7-year Breakeven Inflation Rate (%) (b_7-year Breakeven Inflation Rate (%)): 0.12998447530541354

Mathematical formula for PC2:
PC2 = -0.0746 + (0.1901) * 30-year Breakeven Inflation Rate (%) + (-0.1895) * Personal Saving Rate + (0.1300) * 7-year Breakeven Inflation Rate (%)


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load the regression dataset: predictor columns plus the PC1-PC3 columns and a 'Date' column.
# NOTE(review): this is an absolute path on one user's machine, so the cell only runs there;
# consider a configurable data directory.
data_path = "C:\\Users\\Bogdan\\OneDrive - University of Warwick\\Desktop\\Projects\\Yield Curve & Optimal Fly\\Data\\Regress data.xlsx"
df = pd.read_excel(data_path)

X = df.drop(columns=['PC1', 'PC2', 'PC3', 'Date'])  # predictors: every column except the PCs and the date
y = df['PC3']  # target variable

# Split features and target in a single call so the rows of X and y are partitioned with the
# same shuffled indices by construction (not merely because two calls share a seed).
# random_state pins the shuffle so every run yields the same train/test sets; 42 is arbitrary.
# NOTE(review): the data carries a 'Date' column, so the rows are presumably time-ordered; a
# shuffled split then mixes earlier and later observations. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features to z-scores (subtract the column mean, divide by the column std).
# The scaler is fitted on the training rows only and that same fit is reused for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit = learn per-column mean/std, transform = apply them
X_test_scaled = scaler.transform(X_test)  # transform only: reuse the training mean/std

# Fit a Lasso regression (L1 penalty), not Ridge: alpha multiplies the sum of the ABSOLUTE
# coefficients that is added to the squared-error loss. A larger alpha shrinks coefficients and
# drives some of them exactly to zero, which is why only a few features survive further down.
# The saved output of this cell shows solver convergence warnings
# (`cd_fast.enet_coordinate_descent`), so the iteration cap is raised above the library default.
lasso_max_iter = 10000
lasso_reg = Lasso(alpha=0.1, max_iter=lasso_max_iter)
lasso_reg.fit(X_train_scaled, y_train)

# Evaluate on the held-out test set.
y_pred = lasso_reg.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # typical error size, in the units of the target
# R^2 on the test set: 1 is a perfect fit, 0 matches predicting the mean, negative is worse than that.
# NOTE(review): this name would shadow sklearn.metrics.r2_score if that were ever imported.
r2_score = lasso_reg.score(X_test_scaled, y_test)

print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2_score}')

# Tune alpha with 5-fold cross-validation on the training set: each candidate is trained on four
# folds and scored on the fifth, rotating the validation fold, and the five scores are averaged.
# The variable is still called `ridge_cv` for compatibility, but the estimator is a Lasso.
# NOTE(review): the scaler above was fitted on the whole training set, so every validation fold
# contributed to the scaling statistics; wrapping scaler + Lasso in a Pipeline would avoid that.
alpha_values = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_cv = GridSearchCV(Lasso(max_iter=lasso_max_iter), alpha_values, cv=5)
ridge_cv.fit(X_train_scaled, y_train)

print(f'Best alpha: {ridge_cv.best_params_["alpha"]}')  # alpha with the best mean validation score
print(f'Best cross-validated score: {ridge_cv.best_score_}')  # that mean score (the estimator's default score)
print('')

# The grid search only picks among the alphas listed above (0 is not a candidate) and does not
# choose the test size; the test size remains a modelling decision.

# Extract the coefficients and intercept.
# NOTE(review): these come from the fixed alpha=0.1 model, not from the tuned
# ridge_cv.best_estimator_; confirm that is the model meant to be reported.
coefficients = lasso_reg.coef_
intercept = lasso_reg.intercept_

# One row per feature with its coefficient and the coefficient's magnitude.
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients)
})

# Keep only the features the Lasso did not zero out, largest magnitude first.
coef_df = coef_df[coef_df['Abs_Coefficient'] > 0]
coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False)

# Display the intercept
print("Intercept (b0):", intercept)

# Display the non-zero coefficients
for feature, coef in zip(coef_df['Feature'], coef_df['Coefficient']):
    print(f"Coefficient for {feature} (b_{feature}): {coef}")

# Build the fitted equation from the non-zero coefficients. The model was trained on z-scored
# features, so each term is written as z(feature): the coefficient multiplies the standardised
# value (raw value minus training mean, divided by training std), not the raw value.
formula = f"PC3 = {intercept:.4f}" + "".join(
    f" + ({coef:.4f}) * z({feature})"
    for feature, coef in zip(coef_df['Feature'], coef_df['Coefficient'])
)

print("\nMathematical formula for PC3:")
print(formula)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Root Mean Squared Error: 0.3208470096235788
R^2 Score: -0.1817462762859885
Best alpha: 0.1
Best cross-validated score: -1.1711233090892499

Intercept (b0): 0.022213107834411133
Coefficient for Median Sales Price of Houses Sold for the US (b_Median Sales Price of Houses Sold for the US): 0.032039300941968635
Coefficient for Reserves of Depository Institutions: Total (b_Reserves of Depository Institutions: Total): -0.009471282309944167

Mathematical formula for PC3:
PC3 = 0.0222 + (0.0320) * Median Sales Price of Houses Sold for the US + (-0.0095) * Reserves of Depository Institutions: Total


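## PC3 is not explained by the macro indicators I compiled: the Lasso's R² is negative both on the test set (−0.18) and in cross-validation (−1.17), i.e. it predicts worse than simply using the mean of PC3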