In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

class LinearModel:
    """
    A class to perform linear regression.

    Attributes
    ----------
    df: Dataframe
        Input dataframe containing date, independent variables and dependent variable.
    X_col: List
        The list of independent variables.
    y_col: str
        The name of dependent variable.
    test_size: float
        The proportion of data assigned to test set.

    Methods
    -------
    get_summary():
        Obtain MSE, coefficients, intercept, residuals, equation, and R-Squared.
        Return a dictionary containing all coefficents.
    get_combined_df():
        Return a dataframe with date, value of true and prediction, type of value.
    get_parameters():
        Return a dataframe of variables and coefficients ordered by absolute value.
    """

    def __init__(self, df, X_col, y_col, test_size):
        self.df = df
        self.n = round(len(self.df)*test_size)
        self.X_col = X_col
        self.X = self.df[self.X_col]
        self.y_col = y_col
        self.y = self.df[[self.y_col]]
        self.X_train = self.X.iloc[self.n:,:]
        self.y_train = self.y.iloc[self.n:,:]
        self.X_test = self.X.iloc[:self.n,:]
        self.y_test = self.y.iloc[:self.n,:]
        self.model = LinearRegression().fit(self.X_train, self.y_train)
        self.simple = False
        if len(X_col) == 1:
            self.simple = True

    def get_summary(self):
        self.mse = mean_squared_error(self.y_test, self.model.predict(self.X_test))
        self.coef = self.model.coef_.tolist()
        self.intercept = self.model.intercept_.tolist()
        self.residuals = self.y_test - self.model.predict(self.X_test)
        self.equation = self.y_col + ' = ' + str(self.intercept[0])
        for i in range(len(self.coef)):
            self.equation += ' + ' + str(self.coef[i][0]) + ' * ' + self.X_train.columns[i]
        self.R_Squared = self.model.score(self.X_train, self.y_train)
        return dict({'MSE': self.mse, 'Coefficients': self.coef, 'Intercept': self.intercept, 'Residuals': self.residuals, 
                     'Equation': self.equation, 'R-Squared': self.R_Squared})
    
    def get_combined_df(self, model='LinearModel'):
        if self.simple:
            df_prediction = self.df[['date',self.X_col[0]]]
            df_true = self.df[['date',self.X_col[0], self.y_col]]
        else:
            df_prediction = self.df[['date']]
            df_true = self.df[['date',self.y_col]]
        prediction = []
        for i in self.model.predict(self.X_test):
            prediction.append(i[0])
        for i in self.model.predict(self.X_train):
            prediction.append(i[0])
        df_prediction[self.y_col] = prediction
        df_prediction['type'] = f'{model}_prediction'
        df_true['type'] = 'true value'
        return pd.concat([df_true, df_prediction], ignore_index=True)
    
    
    def get_parameters(self):
        
        variables = pd.DataFrame(self.X_col, 
                                 columns=['variable'])
        
        coefs = pd.DataFrame(self.coef[0], 
                             columns=['coefficient'])
        
        params = pd.concat([variables, coefs], 
                           axis=1)
        
        order = np.argsort(abs(coefs)['coefficient'])
        
        return params.reindex(order).reset_index(drop=True)
    
    
    
    
    
        
class RfModel:
    """
    A class to perform random forest regression.

    Attributes
    ----------
    df: Dataframe
        Input dataframe containing date, independent variables and dependent variable.
    X_col: List
        The list of independent variables.
    y_col: str
        The name of dependent variable.
    test_size: float
        The proportion of data assigned to test set.

    Methods
    -------
    get_summary():
        Obtain MSE, feature importances, residuals, and R-Squared.
        Return a dictionary containing all coefficents.
    get_combined_df():
        Return a dataframe with date, value of true and prediction, type of value.
   get_parameters():
        Return a dataframe of variables and coefficients ordered by absolute value.
    """
    def __init__(self, df, X_col, y_col, test_size):
        self.df = df
        self.n = round(len(self.df)*test_size)
        self.X_col = X_col
        self.X = self.df[self.X_col]
        self.y_col = y_col
        self.y = self.df[[self.y_col]]
        self.X_train = self.X.iloc[self.n:,:]
        self.y_train = self.y.iloc[self.n:,:]
        self.X_test = self.X.iloc[:self.n,:]
        self.y_test = self.y.iloc[:self.n,:]
        self.model = RandomForestRegressor().fit(self.X_train, self.y_train)

    def get_summary(self):
        self.mse = mean_squared_error(self.y_test, self.model.predict(self.X_test))
        self.feature_importances = self.model.feature_importances_
        self.residuals = self.y_test - self.model.predict(self.X_test).reshape(self.n,1)
        self.R_Squared = self.model.score(self.X_train, self.y_train)
        return dict({'MSE': self.mse, 'Feature importances': self.feature_importances, 'Residuals': self.residuals, 
                     'R-Squared': self.R_Squared})
    
    def get_combined_df(self, model='RandomForest'):
        df_prediction = self.df[['date']]
        prediction = []
        for i in self.model.predict(self.X_test).reshape(self.n,1):
            prediction.append(i[0])
        for i in self.model.predict(self.X_train).reshape(len(self.df) - self.n,1):
            prediction.append(i[0])
        df_prediction[self.y_col] = prediction
        df_prediction['type'] = f'{model}_prediction'
        df_true = self.df[['date',self.y_col]]
        df_true['type'] = 'true value'
        return pd.concat([df_true, df_prediction], ignore_index=True)
    
    
    def get_parameters(self):
        
        variables = pd.DataFrame(self.X_col, 
                                 columns=['variable'])
        
        importances = pd.DataFrame(self.feature_importances.tolist(), 
                                   columns=['importance'])
        
        params = pd.concat([variables, importances], 
                           axis=1)
        
        order = np.argsort(abs(importances)['importance'])
        
        return params.reindex(order).reset_index(drop=True)
    
    
    
    
    
    

In [18]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


def drop_unnamed(df):
    # Drop the column 'Unnamed: 0'
    df.drop(['Unnamed: 0'], axis=1, inplace=True)

def format_date(df):
    # Convert the 'date' column to the 'Y-M' format and set as index
    df['date'] = pd.to_datetime(df['date']).dt.to_period('M')
    df.set_index('date', inplace=True)

def df_subset(df, name):
    # Subset the DataFrame based on dates
    if name == 'GDP':
        idx = df['2015-10':].index
    else:
        idx = df['2015-12':].index
    df.drop(idx, inplace=True)
    df.sort_index(ascending=True, inplace=True)

def df_rename(df, name):
    # Rename column
    df.rename(columns={'value': name}, inplace=True)

### Read data
cpi = pd.read_csv('data/cpi.csv') # per month from 1913-01-01 to 2022-12-01, total 1320 observations
unemployment = pd.read_csv('data/unemployment.csv') # per month from 1948-01-01 to 2022-12-01, total 900 observations
gdp = pd.read_csv('data/gdp.csv') # per quarter from 1947-01-01 to 2022-07-01, total 303 observations
fund_rate = pd.read_csv('data/fundrate.csv') # per month from 1954-07-01 to 2022-12-01, total 822 observations
retail = pd.read_csv('data/retail.csv') # per month from 1992-01-01 to 2022-12-01, total 372 observations
durables = pd.read_csv('data/durables.csv') # per month from 1992-02-01 to 2022-11-01, total 370 observations
SP500 = pd.read_csv('data/SP500.csv') # per day from 2016-01-04 to 2023-01-13, total 1771 observations
SP500_whole = pd.read_csv('data/SP500_whole.csv') # per day from 1927-12-30 to 2023-01-27, total 23883 observations
### Data Wrangling
for df, name in zip([cpi, unemployment, gdp, fund_rate, durables, retail],
                    ['CPI', 'Unemployment', 'GDP', 'Fund_rate', 'Durables', 'Retail']):
    drop_unnamed(df)
    format_date(df)
    df_subset(df, name)
    df_rename(df, name)

# SP500 is a special one
SP500 = SP500.rename({'Date':'date'}, axis=1)
format_date(SP500)
# Calculate the average value for each month for column 'Close'
SP500 = SP500.groupby('date').mean()
# Subset the DataFrame with dates between 2016-01 and 2022-12
SP500 = SP500['1992-01':'2022-12']
# Only keep the column 'Close'
SP500 = SP500[['Close']]
# Rename the column 'Close' to 'SP500'
SP500.rename(columns={'Close': 'SP500'}, inplace=True)

df = pd.concat([SP500, cpi, unemployment, fund_rate, retail], axis=1)

X = df.drop(columns='SP500')
Y = df['SP500']


KeyError: "['Unnamed: 0'] not found in axis"

In [17]:
df

Unnamed: 0_level_0,SP500,CPI,Unemployment,Fund_rate,Retail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01,1918.597888,236.916,4.8,0.34,351772
2016-02,1904.418494,237.111,4.9,0.38,363527
2016-03,2021.954090,238.132,5.0,0.36,404996
2016-04,2075.535232,239.261,5.1,0.37,394743
2016-05,2065.550479,240.229,4.8,0.37,413036
...,...,...,...,...,...
2022-08,4158.563084,296.171,3.7,2.33,613416
2022-09,3850.520496,296.808,3.5,2.56,577370
2022-10,3726.050956,298.012,3.7,3.08,597349
2022-11,3917.488595,297.711,3.6,3.78,610784
