In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import skew, kurtosis

The datashare file contains all the time-series stocks data of all the U.S. markets, including NYSE, NASDAQ and AMEX. 
every stock also contains all the corresponding stock characteristics that are used as predictors in the models.

## NOTE

Given the size of the origial dataset is too big to be imported on Github, the importing on the data has to be done locally. The Notebook provided includes all the steps we did in the datapreprocessing, but in order for the notebook to run smoothly it is necessary to dowload the original dataset first (a link is provided).

[Stocks Data Dowlnoad](https://dachxiu.chicagobooth.edu/)

In [2]:
Stocks = pd.read_csv("datashare.csv")

In [3]:
Stocks.index = Stocks["DATE"]
Stocks.index = pd.to_datetime(Stocks.index, format="%Y%m%d")

In [4]:
Stocks = Stocks.drop("DATE", axis=1)

In [5]:
Stocks = Stocks['1957-03':"2016"] 
Stocks['Month'] = Stocks.index
Stocks

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03-29,10006,8.630850e+04,1.117907,1.249717,0.134574,10.296745,0.024863,0.060541,0.002016,0.081150,...,,0.013856,1.557681e-07,0.029167,0.013546,0.914592,0.802461,1.066005e-07,37.0,1957-03-29
1957-03-29,10014,3.903375e+03,0.331304,0.109762,0.006667,7.032404,0.065248,0.060541,-0.086957,-0.080000,...,,0.031389,8.383815e-06,0.071429,0.031165,0.943409,0.716406,1.105263e+00,,1957-03-29
1957-03-29,10022,9.841000e+03,0.942052,0.887461,0.058761,7.294038,0.029338,0.060541,-0.037037,-0.005452,...,,0.010066,5.360386e-06,0.020000,0.016886,0.993558,0.534808,2.387797e-07,,1957-03-29
1957-03-29,10030,5.190100e+04,0.886532,0.785940,-0.032065,9.516942,0.022007,0.060541,0.000000,0.033590,...,,0.009523,1.090908e-07,0.017857,0.009104,0.736000,0.712647,9.202902e-08,,1957-03-29
1957-03-29,10057,3.525000e+04,1.229520,1.511719,-0.102811,8.421013,0.025453,0.060541,-0.030717,0.048778,...,,0.007661,2.386914e-07,0.007194,0.006748,0.889503,0.775954,6.631579e+00,,1957-03-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,93428,1.250976e+06,1.599794,2.559341,-0.137348,14.916210,0.043205,0.016130,-0.001203,-0.047018,...,6.0,0.029302,1.010214e-09,0.027604,0.017847,0.490189,5.347476,8.664791e-09,73.0,2016-12-30
2016-12-30,93429,5.600537e+06,0.299106,0.089465,0.206434,16.319190,0.026198,-0.090970,0.093973,0.000143,...,6.0,0.016001,1.093471e-10,0.040887,0.010437,0.539155,6.252920,8.275139e-09,62.0,2016-12-30
2016-12-30,93433,2.473625e+04,2.331811,5.437343,0.328248,9.975208,0.181381,0.066502,0.279070,-0.358209,...,2.0,0.149511,3.889972e-07,0.375742,0.126628,1.285707,29.078756,5.014198e-09,65.0,2016-12-30
2016-12-30,93434,8.573280e+04,0.630254,0.397221,0.020854,10.862196,0.059796,0.105374,-0.049505,0.109890,...,3.0,0.042877,8.576942e-08,0.031579,0.019838,0.629682,2.431419,3.518593e-08,1.0,2016-12-30


## 1.1 Stocks Selection

In [6]:
top500_df = Stocks.groupby(Stocks.index).apply(lambda x: x.nlargest(500, 'mvel1')).reset_index(drop=True)

In [7]:
top500_df.index = top500_df["Month"]

In [8]:
top500_df

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03-29,10401,1.115565e+07,0.389741,0.151898,0.019737,12.002520,0.006526,0.060541,0.003536,0.031283,...,,0.005598,2.747520e-09,0.009972,0.004336,0.320722,0.034650,1.202819e-06,48.0,1957-03-29
1957-03-29,11850,1.089609e+07,1.123312,1.261831,-0.091272,12.002520,0.020446,0.060541,-0.029437,0.037593,...,,0.012122,6.304279e-09,0.021028,0.009592,0.329003,0.037144,1.171764e-06,29.0,1957-03-29
1957-03-29,12079,1.081930e+07,0.979271,0.958971,-0.220210,12.002520,0.024846,0.060541,-0.021672,-0.133388,...,,0.011116,4.738215e-09,0.012987,0.009500,0.340839,0.044143,9.931459e-07,37.0,1957-03-29
1957-03-29,11703,8.018494e+06,1.186155,1.406963,-0.056089,11.731129,0.020091,0.060541,-0.033967,-0.092576,...,,0.010675,1.193363e-08,0.015299,0.008441,0.424106,0.030334,1.589654e-06,28.0,1957-03-29
1957-03-29,12060,4.802563e+06,1.101270,1.212795,-0.152782,11.948152,0.024809,0.060541,0.030233,-0.109084,...,,0.014534,1.002258e-08,0.028302,0.009733,0.381111,0.061730,7.547341e-07,99.0,1957-03-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,3.0,0.018265,6.410270e-10,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12-30
2016-12-30,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,4.0,0.022101,2.730737e-10,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12-30
2016-12-30,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,4.0,0.022755,8.764646e-11,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12-30
2016-12-30,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,6.0,0.019966,1.167440e-10,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12-30


In [9]:
top500_df= top500_df.rename_axis('Date')

In [10]:
top500_df

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03-29,10401,1.115565e+07,0.389741,0.151898,0.019737,12.002520,0.006526,0.060541,0.003536,0.031283,...,,0.005598,2.747520e-09,0.009972,0.004336,0.320722,0.034650,1.202819e-06,48.0,1957-03-29
1957-03-29,11850,1.089609e+07,1.123312,1.261831,-0.091272,12.002520,0.020446,0.060541,-0.029437,0.037593,...,,0.012122,6.304279e-09,0.021028,0.009592,0.329003,0.037144,1.171764e-06,29.0,1957-03-29
1957-03-29,12079,1.081930e+07,0.979271,0.958971,-0.220210,12.002520,0.024846,0.060541,-0.021672,-0.133388,...,,0.011116,4.738215e-09,0.012987,0.009500,0.340839,0.044143,9.931459e-07,37.0,1957-03-29
1957-03-29,11703,8.018494e+06,1.186155,1.406963,-0.056089,11.731129,0.020091,0.060541,-0.033967,-0.092576,...,,0.010675,1.193363e-08,0.015299,0.008441,0.424106,0.030334,1.589654e-06,28.0,1957-03-29
1957-03-29,12060,4.802563e+06,1.101270,1.212795,-0.152782,11.948152,0.024809,0.060541,0.030233,-0.109084,...,,0.014534,1.002258e-08,0.028302,0.009733,0.381111,0.061730,7.547341e-07,99.0,1957-03-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,3.0,0.018265,6.410270e-10,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12-30
2016-12-30,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,4.0,0.022101,2.730737e-10,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12-30
2016-12-30,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,4.0,0.022755,8.764646e-11,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12-30
2016-12-30,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,6.0,0.019966,1.167440e-10,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12-30


### List with Stock Characteristics

In [11]:
characteristics_l = list(set(top500_df.columns).difference({'permno', 'Month', 'sic2', 'weight', 'total_market_cap'}))
len(characteristics_l)

94

## 1.2 Adding Stock Returns

In this section we add to the dataframe the returns of every stock in the corresponding point in time. The data is retrieved from the CRSP databse (via WRDS) based on the PERMNO number of the stock. 

In [12]:
Rets = pd.read_csv("Permno_date_return.csv")
Rets['date'] = pd.to_datetime(Rets['date'])
Rets.index = Rets['date']
Rets.rename({'date':'Month'}, inplace=True, axis=1)
Rets

Unnamed: 0_level_0,PERMNO,Month,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1957-01-31,10006,1957-01-31,0.064378
1957-02-28,10006,1957-02-28,0.002016
1957-03-29,10006,1957-03-29,0.018405
1957-04-30,10006,1957-04-30,-0.008032
1957-05-31,10006,1957-05-31,0.004049
...,...,...,...
2016-08-31,93436,2016-08-31,-0.097023
2016-09-30,93436,2016-09-30,-0.037640
2016-10-31,93436,2016-10-31,-0.030878
2016-11-30,93436,2016-11-30,-0.042128


In [13]:
Rets.rename({'PERMNO':'permno'}, inplace=True, axis=1)
Rets

Unnamed: 0_level_0,permno,Month,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1957-01-31,10006,1957-01-31,0.064378
1957-02-28,10006,1957-02-28,0.002016
1957-03-29,10006,1957-03-29,0.018405
1957-04-30,10006,1957-04-30,-0.008032
1957-05-31,10006,1957-05-31,0.004049
...,...,...,...
2016-08-31,93436,2016-08-31,-0.097023
2016-09-30,93436,2016-09-30,-0.037640
2016-10-31,93436,2016-10-31,-0.030878
2016-11-30,93436,2016-11-30,-0.042128


In [14]:
merged = pd.merge(top500_df, Rets, on=['permno', 'Month'])
merged.index = merged['Month']
merged = merged.rename_axis('Date')
merged

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month,RET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03-29,10401,1.115565e+07,0.389741,0.151898,0.019737,12.002520,0.006526,0.060541,0.003536,0.031283,...,0.005598,2.747520e-09,0.009972,0.004336,0.320722,0.034650,1.202819e-06,48.0,1957-03-29,0.014799
1957-03-29,11850,1.089609e+07,1.123312,1.261831,-0.091272,12.002520,0.020446,0.060541,-0.029437,0.037593,...,0.012122,6.304279e-09,0.021028,0.009592,0.329003,0.037144,1.171764e-06,29.0,1957-03-29,0.029279
1957-03-29,12079,1.081930e+07,0.979271,0.958971,-0.220210,12.002520,0.024846,0.060541,-0.021672,-0.133388,...,0.011116,4.738215e-09,0.012987,0.009500,0.340839,0.044143,9.931459e-07,37.0,1957-03-29,-0.003205
1957-03-29,11703,8.018494e+06,1.186155,1.406963,-0.056089,11.731129,0.020091,0.060541,-0.033967,-0.092576,...,0.010675,1.193363e-08,0.015299,0.008441,0.424106,0.030334,1.589654e-06,28.0,1957-03-29,0.019858
1957-03-29,12060,4.802563e+06,1.101270,1.212795,-0.152782,11.948152,0.024809,0.060541,0.030233,-0.109084,...,0.014534,1.002258e-08,0.028302,0.009733,0.381111,0.061730,7.547341e-07,99.0,1957-03-29,0.060948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,0.018265,6.410270e-10,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12-30,0.074279
2016-12-30,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,0.022101,2.730737e-10,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12-30,-0.034056
2016-12-30,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,0.022755,8.764646e-11,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12-30,-0.028373
2016-12-30,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,0.019966,1.167440e-10,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12-30,0.040418


## 1.3 Removing stocks without returns data

In [15]:
merged = merged.dropna(subset=['RET'])
merged

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month,RET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03-29,10401,1.115565e+07,0.389741,0.151898,0.019737,12.002520,0.006526,0.060541,0.003536,0.031283,...,0.005598,2.747520e-09,0.009972,0.004336,0.320722,0.034650,1.202819e-06,48.0,1957-03-29,0.014799
1957-03-29,11850,1.089609e+07,1.123312,1.261831,-0.091272,12.002520,0.020446,0.060541,-0.029437,0.037593,...,0.012122,6.304279e-09,0.021028,0.009592,0.329003,0.037144,1.171764e-06,29.0,1957-03-29,0.029279
1957-03-29,12079,1.081930e+07,0.979271,0.958971,-0.220210,12.002520,0.024846,0.060541,-0.021672,-0.133388,...,0.011116,4.738215e-09,0.012987,0.009500,0.340839,0.044143,9.931459e-07,37.0,1957-03-29,-0.003205
1957-03-29,11703,8.018494e+06,1.186155,1.406963,-0.056089,11.731129,0.020091,0.060541,-0.033967,-0.092576,...,0.010675,1.193363e-08,0.015299,0.008441,0.424106,0.030334,1.589654e-06,28.0,1957-03-29,0.019858
1957-03-29,12060,4.802563e+06,1.101270,1.212795,-0.152782,11.948152,0.024809,0.060541,0.030233,-0.109084,...,0.014534,1.002258e-08,0.028302,0.009733,0.381111,0.061730,7.547341e-07,99.0,1957-03-29,0.060948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,0.018265,6.410270e-10,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12-30,0.074279
2016-12-30,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,0.022101,2.730737e-10,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12-30,-0.034056
2016-12-30,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,0.022755,8.764646e-11,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12-30,-0.028373
2016-12-30,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,0.019966,1.167440e-10,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12-30,0.040418


After removing observations without stock returns a total of 643 rows is removed (0.18%).

## 1.4 Stock-Weighting

In this section we use the stock market cap to compute the corresponding weight in that point in time for every company in the replicating portfolio (S&P500). 

In [16]:
merged['Month'] = pd.to_datetime(merged['Month'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['Month'] = pd.to_datetime(merged['Month'])


In [17]:
total_market_cap = merged.groupby(merged['Month'].dt.to_period("M"))['mvel1'].sum()
merged = merged.merge(total_market_cap.rename('total_market_cap'), left_on=merged['Month'].dt.to_period("M"), right_index=True)

In [18]:
merged['weight'] = merged['mvel1'] / merged['total_market_cap']
merged = merged.drop('key_0', axis=1)
merged

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month,RET,total_market_cap,weight
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03-29,10401,1.115565e+07,0.389741,0.151898,0.019737,12.002520,0.006526,0.060541,0.003536,0.031283,...,0.009972,0.004336,0.320722,0.034650,1.202819e-06,48.0,1957-03-29,0.014799,1.921618e+08,0.058053
1957-03-29,11850,1.089609e+07,1.123312,1.261831,-0.091272,12.002520,0.020446,0.060541,-0.029437,0.037593,...,0.021028,0.009592,0.329003,0.037144,1.171764e-06,29.0,1957-03-29,0.029279,1.921618e+08,0.056703
1957-03-29,12079,1.081930e+07,0.979271,0.958971,-0.220210,12.002520,0.024846,0.060541,-0.021672,-0.133388,...,0.012987,0.009500,0.340839,0.044143,9.931459e-07,37.0,1957-03-29,-0.003205,1.921618e+08,0.056303
1957-03-29,11703,8.018494e+06,1.186155,1.406963,-0.056089,11.731129,0.020091,0.060541,-0.033967,-0.092576,...,0.015299,0.008441,0.424106,0.030334,1.589654e-06,28.0,1957-03-29,0.019858,1.921618e+08,0.041728
1957-03-29,12060,4.802563e+06,1.101270,1.212795,-0.152782,11.948152,0.024809,0.060541,0.030233,-0.109084,...,0.028302,0.009733,0.381111,0.061730,7.547341e-07,99.0,1957-03-29,0.060948,1.921618e+08,0.024992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12-30,0.074279,2.159413e+10,0.000473
2016-12-30,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12-30,-0.034056,2.159413e+10,0.000473
2016-12-30,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12-30,-0.028373,2.159413e+10,0.000472
2016-12-30,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12-30,0.040418,2.159413e+10,0.000472


In [19]:
merged['Month'] = merged['Month'].dt.strftime('%Y-%m')
merged.index = merged['Month']

In [20]:
merged = merged.rename_axis('Date')
merged

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month,RET,total_market_cap,weight
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03,10401,1.115565e+07,0.389741,0.151898,0.019737,12.002520,0.006526,0.060541,0.003536,0.031283,...,0.009972,0.004336,0.320722,0.034650,1.202819e-06,48.0,1957-03,0.014799,1.921618e+08,0.058053
1957-03,11850,1.089609e+07,1.123312,1.261831,-0.091272,12.002520,0.020446,0.060541,-0.029437,0.037593,...,0.021028,0.009592,0.329003,0.037144,1.171764e-06,29.0,1957-03,0.029279,1.921618e+08,0.056703
1957-03,12079,1.081930e+07,0.979271,0.958971,-0.220210,12.002520,0.024846,0.060541,-0.021672,-0.133388,...,0.012987,0.009500,0.340839,0.044143,9.931459e-07,37.0,1957-03,-0.003205,1.921618e+08,0.056303
1957-03,11703,8.018494e+06,1.186155,1.406963,-0.056089,11.731129,0.020091,0.060541,-0.033967,-0.092576,...,0.015299,0.008441,0.424106,0.030334,1.589654e-06,28.0,1957-03,0.019858,1.921618e+08,0.041728
1957-03,12060,4.802563e+06,1.101270,1.212795,-0.152782,11.948152,0.024809,0.060541,0.030233,-0.109084,...,0.028302,0.009733,0.381111,0.061730,7.547341e-07,99.0,1957-03,0.060948,1.921618e+08,0.024992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12,0.074279,2.159413e+10,0.000473
2016-12,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12,-0.034056,2.159413e+10,0.000473
2016-12,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12,-0.028373,2.159413e+10,0.000472
2016-12,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12,0.040418,2.159413e+10,0.000472


## 2.1 Creating Dummy variables for SIC2 characteristic

In this section we compiute the dummy variables for the industry code (variable) "SIC2"

In [21]:
dummy_variables = pd.get_dummies(merged['sic2'], prefix='SIC')
merged_dum = pd.concat([merged, dummy_variables], axis=1)

merged_dum

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,SIC_70.0,SIC_72.0,SIC_73.0,SIC_75.0,SIC_78.0,SIC_79.0,SIC_80.0,SIC_82.0,SIC_87.0,SIC_99.0
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03,10401,1.115565e+07,0.389741,0.151898,0.019737,12.002520,0.006526,0.060541,0.003536,0.031283,...,False,False,False,False,False,False,False,False,False,False
1957-03,11850,1.089609e+07,1.123312,1.261831,-0.091272,12.002520,0.020446,0.060541,-0.029437,0.037593,...,False,False,False,False,False,False,False,False,False,False
1957-03,12079,1.081930e+07,0.979271,0.958971,-0.220210,12.002520,0.024846,0.060541,-0.021672,-0.133388,...,False,False,False,False,False,False,False,False,False,False
1957-03,11703,8.018494e+06,1.186155,1.406963,-0.056089,11.731129,0.020091,0.060541,-0.033967,-0.092576,...,False,False,False,False,False,False,False,False,False,False
1957-03,12060,4.802563e+06,1.101270,1.212795,-0.152782,11.948152,0.024809,0.060541,0.030233,-0.109084,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,False,False,False,False,False,False,False,False,False,False
2016-12,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,False,False,False,False,False,False,False,False,False,False
2016-12,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,False,False,False,False,False,False,False,False,False,False
2016-12,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,False,False,False,False,False,False,False,False,False,False


In [22]:
merged_dum['sic2']

Date
1957-03    48.0
1957-03    29.0
1957-03    37.0
1957-03    28.0
1957-03    99.0
           ... 
2016-12    37.0
2016-12    62.0
2016-12    31.0
2016-12    60.0
2016-12    63.0
Name: sic2, Length: 358357, dtype: float64

In the paper 74 dummies are obtained, in our case only 65 since we are reducing the analysis only to the firms contained in the S&P500 in the time period considered, which results in 9 less industry dummies.

## 2. Replicating Portfolio - Returns

In this section we compute the (weighted) monthly return of every stock in order to compute the return of the replicating portfolio in every period. This step will also allow to check the correlation or the replicating portfolio returns with the ones of the index, to check the accuracy of the portfolio with the benchmark. 

In [23]:
merged['RET'] = merged['RET'].astype(float)
merged['weighted_RET'] = merged['weight'] * merged['RET']
replicating_returns = merged.groupby('Date')['weighted_RET'].sum().reset_index()
replicating_returns.index = replicating_returns['Date']
replicating_returns = replicating_returns.drop('Date', axis=1)
replicating_returns

Unnamed: 0_level_0,weighted_RET
Date,Unnamed: 1_level_1
1957-03,0.024682
1957-04,0.045670
1957-05,0.039478
1957-06,-0.002842
1957-07,0.009107
...,...
2016-08,0.002228
2016-09,0.002254
2016-10,-0.017501
2016-11,0.032861


## 3.1 Macro Predictors

The data source is professor's Amid Goyal's personal website, but this dataset has been included in the Github folder under the name "PredictorData2022".

According to the paper, firm characteristics are lagged due to the data being released with a delay. To match this we lag macro predictors by one month.

In [24]:
Macro_pred = pd.read_csv('PredictorData2022.csv', parse_dates=True, index_col=0)
Macro_pred.index = pd.to_datetime(Macro_pred.index, format="%Y%m").to_period('M')
Macro_pred_lag = Macro_pred.shift(1) #lag values by one month
Macro_pred_lag = Macro_pred_lag.rename_axis('Date')
Macro_pred_lag = Macro_pred_lag['1957-03':'2016']
Macro_pred_lag

  Macro_pred = pd.read_csv('PredictorData2022.csv', parse_dates=True, index_col=0)


Unnamed: 0_level_0,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1957-03,43.26,1.7333,3.4033,0.584994,0.0310,0.0367,0.0447,0.0328,0.030173,0.0024,0.003623,0.0025,0.0093,0.001056,0.000827,-0.024919,-0.032823
1957-04,44.11,1.7300,3.4000,0.599819,0.0308,0.0366,0.0443,0.0331,0.026600,0.0023,0.003610,-0.0024,0.0050,0.000330,0.001054,0.023827,0.020752
1957-05,45.74,1.7300,3.4067,0.576098,0.0307,0.0367,0.0444,0.0345,0.027421,0.0025,0.003597,-0.0222,-0.0066,0.000302,0.002142,0.046538,0.045215
1957-06,47.43,1.7300,3.4133,0.564039,0.0306,0.0374,0.0452,0.0348,0.028849,0.0026,0.003584,-0.0023,-0.0075,0.000482,0.002422,0.038734,0.033208
1957-07,47.37,1.7300,3.4200,0.565877,0.0329,0.0391,0.0463,0.0361,0.030528,0.0024,0.003571,-0.0180,-0.0322,0.000579,0.002176,-0.000705,-0.003632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08,2173.60,44.6485,87.6433,0.314661,0.0030,0.0328,0.0422,0.0175,-0.031675,0.0002,-0.001618,0.0081,0.0245,0.000478,,0.036571,0.035324
2016-09,2170.95,44.8371,88.3667,0.315197,0.0030,0.0332,0.0424,0.0186,-0.030782,0.0002,0.000918,-0.0140,0.0016,0.000279,,0.001247,-0.001396
2016-10,2168.27,45.0257,89.0900,0.316794,0.0029,0.0341,0.0431,0.0196,-0.032603,0.0002,0.002404,-0.0124,-0.0119,0.001673,,0.000446,-0.001033
2016-11,2126.15,45.2507,90.9100,0.319688,0.0033,0.0351,0.0438,0.0220,-0.029034,0.0002,0.001247,-0.0314,-0.0263,0.000364,,-0.017958,-0.019206


In [25]:
Macro_pred= Macro_pred.rename_axis('Date')
Macro_pred = Macro_pred['1957-03':'2016']
Macro_pred

Unnamed: 0_level_0,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1957-03,44.11,1.7300,3.4000,0.599819,0.0308,0.0366,0.0443,0.0331,0.026600,0.0023,0.003610,-0.0024,0.0050,0.000330,0.001054,0.023827,0.020752
1957-04,45.74,1.7300,3.4067,0.576098,0.0307,0.0367,0.0444,0.0345,0.027421,0.0025,0.003597,-0.0222,-0.0066,0.000302,0.002142,0.046538,0.045215
1957-05,47.43,1.7300,3.4133,0.564039,0.0306,0.0374,0.0452,0.0348,0.028849,0.0026,0.003584,-0.0023,-0.0075,0.000482,0.002422,0.038734,0.033208
1957-06,47.37,1.7300,3.4200,0.565877,0.0329,0.0391,0.0463,0.0361,0.030528,0.0024,0.003571,-0.0180,-0.0322,0.000579,0.002176,-0.000705,-0.003632
1957-07,47.91,1.7400,3.4367,0.560057,0.0316,0.0399,0.0473,0.0365,0.032346,0.0030,0.007117,-0.0041,-0.0110,0.000554,0.001631,0.009747,0.008217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08,2170.95,44.8371,88.3667,0.315197,0.0030,0.0332,0.0424,0.0186,-0.030782,0.0002,0.000918,-0.0140,0.0016,0.000279,,0.001247,-0.001396
2016-09,2168.27,45.0257,89.0900,0.316794,0.0029,0.0341,0.0431,0.0196,-0.032603,0.0002,0.002404,-0.0124,-0.0119,0.001673,,0.000446,-0.001033
2016-10,2126.15,45.2507,90.9100,0.319688,0.0033,0.0351,0.0438,0.0220,-0.029034,0.0002,0.001247,-0.0314,-0.0263,0.000364,,-0.017958,-0.019206
2016-11,2198.81,45.4756,92.7300,0.303286,0.0045,0.0386,0.0471,0.0267,-0.027452,0.0001,-0.001555,-0.0599,-0.0510,0.000946,,0.035790,0.033007


#### Macroeconomic Factors: 
* including dividend-price ratio (dp): (d/p) is the difference between the log of dividends and the log of prices.
* earnings-price ratio (ep): (e/p) is the difference between the log of earnings and the log of prices.
* book-to-market ratio (bm)
* net equity expansion (ntis)
* Treasury-bill rate (tbl)
* term spread (tms): The Term Spread (tms) is the difference between the long term yield on government bonds (lty) and the Treasury-bill (tbl)
* default spread (dfy): The Default Yield Spread (dfy) is the difference between BAA and AAA-rated corporate bond yields.
* stock variance (svar)

In [26]:
Macro_pred_lag['Index'] = Macro_pred_lag['Index'].str.replace(',', '').astype(float)
Macro_pred_lag["d/p"] = np.log(Macro_pred_lag["D12"]) - np.log(Macro_pred_lag["Index"])
Macro_pred_lag["e/p"] = np.log(Macro_pred_lag["E12"]) - np.log(Macro_pred_lag["Index"])
Macro_pred_lag["tms"] = Macro_pred_lag["lty"] - Macro_pred_lag["tbl"]
Macro_pred_lag['dfy'] = Macro_pred_lag['BAA'] - Macro_pred_lag['AAA']

Only the macroeconomic varibales used in the paper are selected:

In [27]:
# drop the redundent variables - Selecting the needed Macro predictors used in the paper
Macro_pred_sel = Macro_pred_lag.drop(columns=['Index','D12', 'E12', 'AAA', 'BAA', 'CRSP_SPvwx', 'corpr','Rfree', 'CRSP_SPvw', 'lty','infl', 'ltr','csp'])
Macro_pred_sel

Unnamed: 0_level_0,b/m,tbl,ntis,svar,d/p,e/p,tms,dfy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080
1957-04,0.599819,0.0308,0.026600,0.000330,-3.238565,-2.562911,0.0023,0.0077
1957-05,0.576098,0.0307,0.027421,0.000302,-3.274852,-2.597229,0.0038,0.0077
1957-06,0.564039,0.0306,0.028849,0.000482,-3.311134,-2.631575,0.0042,0.0078
1957-07,0.565877,0.0329,0.030528,0.000579,-3.309868,-2.628349,0.0032,0.0072
...,...,...,...,...,...,...,...,...
2016-08,0.314661,0.0030,-0.031675,0.000478,-3.885319,-3.210865,0.0145,0.0094
2016-09,0.315197,0.0030,-0.030782,0.000279,-3.879884,-3.201425,0.0156,0.0092
2016-10,0.316794,0.0029,-0.032603,0.001673,-3.874451,-3.192038,0.0167,0.0090
2016-11,0.319688,0.0033,-0.029034,0.000364,-3.849850,-3.152198,0.0187,0.0087


### Creating list with Macro Predictors

In [28]:
Macro_pred_l = ['b/m', 'tbl', 'ntis', 'svar', 'd/p', 'e/p', 'tms', 'dfy']

In [29]:
# Create the 'months' column from the existing DatetimeIndex
Macro_pred_sel['Month'] = Macro_pred_sel.index.strftime('%Y-%m')

# Display the DataFrame to verify the new column
print(Macro_pred_sel.head())


              b/m     tbl      ntis      svar       d/p       e/p     tms  \
Date                                                                        
1957-03  0.584994  0.0310  0.030173  0.001056 -3.217201 -2.542483  0.0018   
1957-04  0.599819  0.0308  0.026600  0.000330 -3.238565 -2.562911  0.0023   
1957-05  0.576098  0.0307  0.027421  0.000302 -3.274852 -2.597229  0.0038   
1957-06  0.564039  0.0306  0.028849  0.000482 -3.311134 -2.631575  0.0042   
1957-07  0.565877  0.0329  0.030528  0.000579 -3.309868 -2.628349  0.0032   

            dfy    Month  
Date                      
1957-03  0.0080  1957-03  
1957-04  0.0077  1957-04  
1957-05  0.0077  1957-05  
1957-06  0.0078  1957-06  
1957-07  0.0072  1957-07  


## 4.1 Reliability of Replicating Portfolio - S&P500 Sanity Check

**Correlation:**

In [30]:
np.corrcoef(replicating_returns['weighted_RET'], Macro_pred['CRSP_SPvw'])

array([[1.        , 0.99543735],
       [0.99543735, 1.        ]])

**Mean:**

In [31]:
abs(np.mean(replicating_returns['weighted_RET'])-np.mean(Macro_pred['CRSP_SPvw']))

0.00027194334994176846

**Variance:**

In [32]:
abs(np.var(replicating_returns['weighted_RET'])-np.var(Macro_pred['CRSP_SPvw']))

9.990573194619459e-06

**Skewness:**

In [33]:
abs(skew(replicating_returns['weighted_RET'])-skew(Macro_pred['CRSP_SPvw']))

0.0639163545294062

**Kurtosis:**

In [34]:
abs(kurtosis(replicating_returns['weighted_RET'])-kurtosis(Macro_pred['CRSP_SPvw']))

0.057543532982029966

## 5.1 Creating Interaction Terms

In [35]:
merged_macro_char = pd.merge(Macro_pred_sel, merged_dum, on=['Month'])
merged_macro_char.index = merged_macro_char['Month']
merged_macro_char = merged_macro_char.rename_axis('Date')
merged_macro_char

Unnamed: 0_level_0,b/m,tbl,ntis,svar,d/p,e/p,tms,dfy,Month,permno,...,SIC_70.0,SIC_72.0,SIC_73.0,SIC_75.0,SIC_78.0,SIC_79.0,SIC_80.0,SIC_82.0,SIC_87.0,SIC_99.0
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,10401,...,False,False,False,False,False,False,False,False,False,False
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,11850,...,False,False,False,False,False,False,False,False,False,False
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,12079,...,False,False,False,False,False,False,False,False,False,False
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,11703,...,False,False,False,False,False,False,False,False,False,False
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,12060,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,0.303286,0.0045,-0.027452,0.000946,-3.878496,-3.165980,0.0222,0.0085,2016-12,15735,...,False,False,False,False,False,False,False,False,False,False
2016-12,0.303286,0.0045,-0.027452,0.000946,-3.878496,-3.165980,0.0222,0.0085,2016-12,69649,...,False,False,False,False,False,False,False,False,False,False
2016-12,0.303286,0.0045,-0.027452,0.000946,-3.878496,-3.165980,0.0222,0.0085,2016-12,88661,...,False,False,False,False,False,False,False,False,False,False
2016-12,0.303286,0.0045,-0.027452,0.000946,-3.878496,-3.165980,0.0222,0.0085,2016-12,91461,...,False,False,False,False,False,False,False,False,False,False


In [36]:
data = merged_macro_char.copy()
for fc in characteristics_l:
    for mp in Macro_pred_l:
        data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
        
data

  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp

Unnamed: 0_level_0,b/m,tbl,ntis,svar,d/p,e/p,tms,dfy,Month,permno,...,herf*tms,herf*dfy,acc*b/m,acc*tbl,acc*ntis,acc*svar,acc*d/p,acc*e/p,acc*tms,acc*dfy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,10401,...,,,,,,,,,,
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,11850,...,,,,,,,,,,
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,12079,...,,,,,,,,,,
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,11703,...,,,,,,,,,,
1957-03,0.584994,0.0310,0.030173,0.001056,-3.217201,-2.542483,0.0018,0.0080,1957-03,12060,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,0.303286,0.0045,-0.027452,0.000946,-3.878496,-3.165980,0.0222,0.0085,2016-12,15735,...,0.001926,0.000738,,,,,,,,
2016-12,0.303286,0.0045,-0.027452,0.000946,-3.878496,-3.165980,0.0222,0.0085,2016-12,69649,...,0.001305,0.000500,-0.004835,-0.000072,0.000438,-0.000015,0.061837,0.050477,-0.000354,-0.000136
2016-12,0.303286,0.0045,-0.027452,0.000946,-3.878496,-3.165980,0.0222,0.0085,2016-12,88661,...,0.003545,0.001357,-0.038957,-0.000578,0.003526,-0.000122,0.498196,0.406673,-0.002852,-0.001092
2016-12,0.303286,0.0045,-0.027452,0.000946,-3.878496,-3.165980,0.0222,0.0085,2016-12,91461,...,0.000793,0.000304,-0.007314,-0.000109,0.000662,-0.000023,0.093528,0.076346,-0.000535,-0.000205


## 5.2 Computing Excess Returns

In [37]:
data['RET'] = pd.to_numeric(data['RET'], errors='coerce')
data['tbl'] = pd.to_numeric(data['tbl'], errors='coerce')

# Perform subtraction operation
data['RET'] = data['RET'] - data['tbl']

In [38]:
data.rename(columns={'RET': 'Exc_RET'}, inplace=True)

In [39]:
data=data.drop(columns=Macro_pred_l)

In [40]:
data

Unnamed: 0_level_0,Month,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,...,herf*tms,herf*dfy,acc*b/m,acc*tbl,acc*ntis,acc*svar,acc*d/p,acc*e/p,acc*tms,acc*dfy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03,1957-03,10401,1.115565e+07,0.389741,0.151898,0.019737,12.002520,0.006526,0.060541,0.003536,...,,,,,,,,,,
1957-03,1957-03,11850,1.089609e+07,1.123312,1.261831,-0.091272,12.002520,0.020446,0.060541,-0.029437,...,,,,,,,,,,
1957-03,1957-03,12079,1.081930e+07,0.979271,0.958971,-0.220210,12.002520,0.024846,0.060541,-0.021672,...,,,,,,,,,,
1957-03,1957-03,11703,8.018494e+06,1.186155,1.406963,-0.056089,11.731129,0.020091,0.060541,-0.033967,...,,,,,,,,,,
1957-03,1957-03,12060,4.802563e+06,1.101270,1.212795,-0.152782,11.948152,0.024809,0.060541,0.030233,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,2016-12,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,...,0.001926,0.000738,,,,,,,,
2016-12,2016-12,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,...,0.001305,0.000500,-0.004835,-0.000072,0.000438,-0.000015,0.061837,0.050477,-0.000354,-0.000136
2016-12,2016-12,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,...,0.003545,0.001357,-0.038957,-0.000578,0.003526,-0.000122,0.498196,0.406673,-0.002852,-0.001092
2016-12,2016-12,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,...,0.000793,0.000304,-0.007314,-0.000109,0.000662,-0.000023,0.093528,0.076346,-0.000535,-0.000205


## 5.3 Scaling Variables to [-1,1] and Dataset Creation

In [41]:
from sklearn.preprocessing import MinMaxScaler

features = list(set(data.columns).difference({'permno','Month','Exc_RET','total_market_cap','weight','sic2'})) # a list storing all features

X = MinMaxScaler((-1,1)).fit_transform(data[features])
X = pd.DataFrame(X, columns=features)

In [42]:
y = data['Exc_RET']
y

Date
1957-03   -0.016201
1957-03   -0.001721
1957-03   -0.034205
1957-03   -0.011142
1957-03    0.029948
             ...   
2016-12    0.069779
2016-12   -0.038556
2016-12   -0.032873
2016-12    0.035918
2016-12    0.038658
Name: Exc_RET, Length: 358357, dtype: float64

In [43]:
X.index = data.index
X.fillna(0, inplace=True)
X

Unnamed: 0_level_0,acc*e/p,chinv,divo*tms,dolvol*e/p,stdcf*d/p,lgr*ntis,cinvest*tms,SIC_42.0,SIC_1.0,mom6m*tms,...,std_turn*tbl,cfp*tbl,agr*svar,mvel1*tbl,SIC_27.0,pchquick*svar,sgr*d/p,SIC_73.0,nincr*d/p,ill*svar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-03,0.000000,0.000000,0.000000,0.372972,0.000000,0.000000,0.000000,-1.0,-1.0,-0.474411,...,-0.999685,0.000000,0.000000,-0.980481,-1.0,0.000000,0.000000,-1.0,0.000000,-0.999990
1957-03,0.000000,0.000000,0.000000,0.372972,0.000000,0.000000,0.000000,-1.0,-1.0,-0.474318,...,-0.999663,0.000000,0.000000,-0.980935,-1.0,0.000000,0.000000,-1.0,0.000000,-0.999977
1957-03,0.000000,0.000000,0.000000,0.372972,0.000000,0.000000,0.000000,-1.0,-1.0,-0.476834,...,-0.999599,0.000000,0.000000,-0.981070,-1.0,0.000000,0.000000,-1.0,0.000000,-0.999982
1957-03,0.000000,0.000000,0.000000,0.389207,0.000000,0.000000,0.000000,-1.0,-1.0,-0.476234,...,-0.999724,0.000000,0.000000,-0.985977,-1.0,0.000000,0.000000,-1.0,0.000000,-0.999956
1957-03,0.000000,0.000000,0.000000,0.376224,0.000000,0.000000,0.000000,-1.0,-1.0,-0.476477,...,-0.999439,0.000000,0.000000,-0.991611,-1.0,0.000000,0.000000,-1.0,0.000000,-0.999963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,0.000000,0.000000,0.000000,-0.047624,0.000000,0.000000,-0.003315,-1.0,-1.0,-0.430633,...,-0.998285,-0.065949,0.000000,-0.997425,-1.0,0.000000,0.000000,-1.0,0.785654,-0.999998
2016-12,-0.374974,-0.175427,-0.107579,-0.087709,0.000000,0.123080,-0.003614,-1.0,-1.0,-0.460298,...,-0.995703,-0.065298,0.725200,-0.997428,-1.0,-0.742618,0.792912,-1.0,0.785654,-0.999999
2016-12,-0.229317,-0.204771,-0.107579,-0.181701,0.999839,0.071588,-0.003570,-1.0,-1.0,-0.488219,...,-0.992087,-0.065744,0.723928,-0.997430,-1.0,-0.740322,0.837490,-1.0,1.000000,-1.000000
2016-12,-0.364396,-0.174568,-0.107579,-0.136110,0.000000,0.141043,-0.003782,-1.0,-1.0,-0.466070,...,-0.994551,-0.065418,0.726839,-0.997432,-1.0,-0.743760,0.813524,-1.0,1.000000,-1.000000


## 6.1 Exporting Processed Data

If you want to export the processed datasets, delete the '#' symbols from the cell below. Note that it takes a bit of time depending on the computer, generally between 3 and 10 minutes.

In [46]:
X.to_csv('Features_lagged_X.csv')
y.to_csv('Dependent_y.csv')

In [None]:
weights = pd.DataFrame(merged['weight'])
weights.to_csv('Stocks_weights.csv')