<a href="https://colab.research.google.com/github/FrederiKob/Kelly_Replication_Code/blob/main/Rudimentary_Replication_Kelly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
# Read the "Monthly" sheet of the Goyal-Welch workbook, using its yyyymm column as the index.
data = pd.read_excel(
    "/content/Data_Goyal_Welch_2022.xlsx",
    sheet_name="Monthly",
    index_col="yyyymm",
)
# Parse the yyyymm keys as dates and move every date forward by one month.
data.index = (
    pd.to_datetime(data.index, format="%Y%m")
    + pd.DateOffset(months=1)
)

  warn("""Cannot parse header or footer so it will be ignored""")


Shift the index forward by one month, since the data are month-end values

In [None]:
# The index may already have been parsed and shifted when the data were loaded.
# Convert only while it still holds the raw yyyymm keys, so that running the
# notebook top to bottom (or re-running this cell) cannot move the dates
# forward by a second month.
if not isinstance(data.index, pd.DatetimeIndex):
    data.index = pd.to_datetime(data.index, format = "%Y%m") + pd.DateOffset(months = 1)

Create and Select required variables

In [6]:
# Build the derived predictors and the excess return from the raw columns.
# Every new column is computed from the raw sheet only, so one `assign` call suffices.
data = data.assign(
    # i) dfy -- Default Yield Spread: BAA minus AAA-rated corporate bond yields
    dfy=data["BAA"] - data["AAA"],
    # ii) de -- Dividend Payout Ratio: log of dividends minus log of earnings
    de=np.log(data["D12"] / data["E12"]),
    # iii) tms -- Term Spread: long-term government bond yield minus the Treasury-bill rate
    tms=data["lty"] - data["tbl"],
    # iv) dfr -- Default Return Spread: long-term corporate minus long-term government bond returns
    dfr=data["corpr"] - data["ltr"],
    # v) dp -- Dividend Price Ratio: log of dividends minus log of prices
    dp=np.log(data["D12"] / data["Index"]),
    # vi) dy -- Dividend Yield: log of dividends minus log of lagged prices
    dy=np.log(data["D12"] / data["Index"].shift(1)),
    # vii) ep -- Earnings Price Ratio: log of earnings minus log of prices
    ep=np.log(data["E12"] / data["Index"]),
    # Excess Returns (xr): market return minus the risk-free rate
    xr=data["CRSP_SPvw"] - data["Rfree"],
)
# No lagged-market-return column is created here: the selection below would
# discard it anyway. The `lag_1` predictor is derived from the standardized
# excess return in the standardization step.

# Reduce to relevant variables and drop every row with a missing value
data = data.loc[:, ['b/m', 'tbl', 'lty', 'ntis', 'infl', 'ltr', 'svar', 'dfy',
                    'de', 'tms', 'dfr', 'dp', 'dy', 'ep', 'xr']].dropna()


Volatility Standardize predictors and Returns

a) Predictors: standardized by their expanding-window historical standard deviation (computed from at least 36 months of data);
     this is applied to both training and test predictors

b) Returns: standardized by their trailing 12-month return standard deviation (this includes target and lagged market return)

In [None]:
# Split the panel: every column except the excess return "xr" is a predictor.
pred = data.drop(columns="xr")

# (i) returns
ret = data[["xr"]].assign(
    # standard deviation over a rolling 12-row window
    roll_std=lambda frame: frame["xr"].rolling(12).std(ddof=0),
    # scale xr(t) by the rolling standard deviation of the previous row, std(t-1)
    xr_std=lambda frame: frame["xr"] / frame["roll_std"].shift(1),
    # pull the next row's scaled return back so predictors and target share a date
    target=lambda frame: frame["xr_std"].shift(-1),
    # previous row's target, used as an additional predictor
    lag_1=lambda frame: frame["target"].shift(1),
)
ret_std = ret.drop(columns=["xr_std", "roll_std", "xr"])

# (ii) predictors: expanding-window standard deviation, needing at least 36 observations
p_std = pred.expanding(min_periods=36).std(ddof=0)
# Following "Complexity Everywhere": scale the value at t by the standard
# deviation computed through t-1. Open question from the author: why not use t as well?
pred_std = pred.div(p_std.shift(1))

# (iii) join predictors with the return columns, drop incomplete rows, then split off the target (y)
pred_std = pd.concat([pred_std, ret_std], axis="columns").dropna()
ret_std = pred_std["target"].copy()
pred_std = pred_std.drop(columns="target")

# Remove the intermediates so only pred_std and ret_std remain
del p_std, pred, ret


# New Section