In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Linear regression
# Data appendix
# TODO: add comments and restructure the project organization

In [9]:
# Load the approval ratings plus every economic indicator CSV.
def _read_indicator(csv_path):
    """Read one indicator CSV, passing index_col=False so no column is used as the index."""
    return pd.read_csv(csv_path, index_col=False)


# The approval file is the only one read with pandas' default index handling.
approval = pd.read_csv('../DATA/approval_rating.csv')

gdp = _read_indicator('../DATA/real_GDP_per_capita.csv')
gdp_change = _read_indicator('../DATA/real_GDP_per_capita_daily_change.csv')
income = _read_indicator('../DATA/median_household_income.csv')
income_change = _read_indicator('../DATA/median_household_income_daily_change.csv')
sp500 = _read_indicator('../DATA/sp500_historical_data.csv')
sp500_change = _read_indicator('../DATA/sp500_daily_change.csv')
unemployment = _read_indicator('../DATA/unemployment_rate.csv')

In [None]:
# Rename the raw series codes to readable column names.
# Each frame is rebound to the renamed copy (no inplace mutation), and the
# no-op 'observation_date' -> 'observation_date' entries are dropped.
gdp = gdp.rename(columns={'A939RX0Q048SBEA': 'GDP'})
# NOTE(review): the merged column listing printed later in this notebook shows
# an unrenamed 'Change' column right after 'GDP_Change'. That suggests this file
# has its own 'Change' column and 'GDP_Change' may hold the level - confirm
# against the CSV.
gdp_change = gdp_change.rename(columns={'A939RX0Q048SBEA': 'GDP_Change'})

income = income.rename(columns={'MEHOINUSA672N': 'Income'})
# NOTE(review): both income frames end up with an 'Income' column, which is why
# the merged frame later shows 'Income_x' / 'Income_y'.
income_change = income_change.rename(
    columns={'MEHOINUSA672N': 'Income', 'Change': 'Income_Change'}
)

unemployment = unemployment.rename(columns={'UNRATE': 'Unemployment'})
sp500 = sp500.rename(columns={'Close': 'SP500_Close'})
sp500_change = sp500_change.rename(columns={'Close_Change': 'SP500_Close_Change'})



In [6]:
# Convert every merge key to datetime.
# The approval key is converted as well: 'Start Date' is the left key of every
# merge_asof below and was loaded without date parsing, so without this line its
# dtype would not match the datetime keys on the right-hand frames.
approval['Start Date'] = pd.to_datetime(approval['Start Date'])

for frame in (gdp, gdp_change, income, income_change, unemployment):
    frame['observation_date'] = pd.to_datetime(frame['observation_date'])

# The S&P 500 dates are parsed as UTC and then made timezone-naive.
for frame in (sp500, sp500_change):
    frame['Date'] = pd.to_datetime(frame['Date'], utc=True).dt.tz_localize(None)



NameError: name 'pd' is not defined

In [None]:
# Order every frame by its date key ahead of the merge_asof() calls.
approval = approval.sort_values('Start Date')

gdp, gdp_change, income, income_change, unemployment = (
    frame.sort_values('observation_date')
    for frame in (gdp, gdp_change, income, income_change, unemployment)
)

sp500, sp500_change = (
    frame.sort_values('Date') for frame in (sp500, sp500_change)
)


In [None]:

# Attach each indicator to the approval rows with a backward as-of join on
# 'Start Date', discarding the indicator's own date column after each join.
# The order of this sequence is the order in which the frames are merged.
indicator_frames = (
    (gdp, 'observation_date'),
    (gdp_change, 'observation_date'),
    (income, 'observation_date'),
    (income_change, 'observation_date'),
    (unemployment, 'observation_date'),
    (sp500, 'Date'),
    (sp500_change, 'Date'),
)

df = approval
for indicator, date_key in indicator_frames:
    df = pd.merge_asof(
        df,
        indicator,
        left_on='Start Date',
        right_on=date_key,
        direction='backward',
    ).drop(columns=[date_key])




In [None]:
# Handle missing values.
# Infinities are turned into NaN *before* filling so the forward/backward fills
# can repair them too; replacing them afterwards would leave fresh NaNs behind.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
# NOTE(review): the fills cover every column, including 'Approving', so the
# dropna on 'Approving' that follows only removes rows where no value could be
# filled - confirm that filling the target is intended.
df = (
    df.replace([np.inf, -np.inf], np.nan)
    .ffill()   # carry the last known value forward
    .bfill()   # back-fill whatever is still missing at the start
)


  df.fillna(method='bfill', inplace=True)  # Backup fill


In [None]:

# Keep only the rows whose 'Approving' value is present.
has_target = df['Approving'].notna()
df = df[has_target].copy()



In [None]:
# Add two product (interaction) columns built from existing indicators.
df = df.assign(
    Unemployment_Stock=df['Unemployment'].mul(df['SP500_Close']),
    GDP_Unemployment=df['GDP'].mul(df['Unemployment']),
)



In [None]:
# Print the merged frame's column labels to check the result of the joins.
merged_columns = df.columns
print(merged_columns)


Index(['Start Date', 'End Date', 'Approving', 'Disapproving', 'Unsure/NoData',
       'Candidate', 'GDP', 'GDP_Change', 'Change', 'Income_x', 'Income_y',
       'Income_Change', 'Unemployment', 'Open_x', 'High_x', 'Low_x',
       'SP500_Close', 'Volume_x', 'Dividends_x', 'Stock Splits_x', 'Open_y',
       'High_y', 'Low_y', 'Close', 'Volume_y', 'Dividends_y', 'Stock Splits_y',
       'SP500_Close_Change', 'Unemployment_Stock', 'GDP_Unemployment'],
      dtype='object')


In [None]:
from sklearn.preprocessing import StandardScaler

# Predictors: GDP, Unemployment and SP500_Close (Income and GDP_Unemployment are left out).
X = df[['GDP', 'Unemployment', 'SP500_Close']]  # focus on the most relevant variables-- but maybe  change??

# Target: the approval share. The fit and evaluation cells below use `y`, which
# no cell defined, so a fresh-kernel run failed with a NameError.
y = df['Approving']

# Standardize the predictors.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Add the constant term so the model estimates an intercept.
X_scaled = sm.add_constant(X_scaled)





In [None]:
# Fit an ordinary least squares regression of y on the scaled design matrix
# and print the statsmodels summary table.
ols_spec = sm.OLS(y, X_scaled)
model = ols_spec.fit()
print(model.summary())



                            OLS Regression Results                            
Dep. Variable:              Approving   R-squared:                       0.119
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     85.53
Date:                Tue, 18 Mar 2025   Prob (F-statistic):           6.51e-52
Time:                        22:28:44   Log-Likelihood:                -7404.6
No. Observations:                1907   AIC:                         1.482e+04
Df Residuals:                    1903   BIC:                         1.484e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         51.5543      0.269    191.393      0.0

In [None]:
# Score the fitted model on the same rows it was fitted on (in-sample metrics).
y_pred = model.predict(X_scaled)

mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)

report_lines = [
    "\nModel Performance:",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R-squared: {r2:.2f}",
]
print("\n".join(report_lines))


Model Performance:
MAE: 8.93
RMSE: 11.75
R-squared: 0.12


Index(['GDP', 'Unemployment'], dtype='object')
