In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import skew, kurtosis

The datashare file contains all the time-series stocks data of all the U.S. markets, including NYSE, NASDAQ and AMEX. 
every stock also contains all the corresponding stock characteristics that are used as predictors in the models.

## NOTE

Given the size of the origial dataset is too big to be imported on Github, the importing on the data has to be done locally. The Notebook provided includes all the steps we did in the datapreprocessing, but in order for the notebook to run smoothly it is necessary to dowload the original dataset first (a link is provided).

[Stocks Data Dowlnoad](https://dachxiu.chicagobooth.edu/)

In [185]:
Stocks = pd.read_csv("datashare.csv")

In [186]:
Stocks.index = Stocks["DATE"]
Stocks.index = pd.to_datetime(Stocks.index, format="%Y%m%d")

In [187]:
Stocks = Stocks.drop("DATE", axis=1)

In [188]:
Stocks = Stocks[:"2016"] 
Stocks['Month'] = Stocks.index
Stocks

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01-31,10006,8.224900e+04,1.122846,1.260784,0.047180,9.569953,0.025742,0.046433,0.044843,-0.059517,...,,0.013234,9.411565e-08,0.015453,0.008058,0.355638,0.460420,1.120996e-07,37.0,1957-01-31
1957-01-31,10014,3.903375e+03,0.426734,0.182102,-0.275641,6.237836,0.072103,0.046433,-0.086957,-0.115385,...,,0.033305,6.610609e-06,0.047619,0.033495,1.152126,1.169610,9.229146e-08,,1957-01-31
1957-01-31,10022,9.273250e+03,1.066449,1.137313,-0.025490,7.008844,0.027648,0.046433,-0.060377,-0.039550,...,,0.016023,2.286832e-06,0.020833,0.015589,0.815777,0.679803,1.181757e-07,,1957-01-31
1957-01-31,10030,5.446588e+04,0.926038,0.857547,0.018171,9.825337,0.021700,0.046433,0.044633,0.050470,...,,0.015295,1.464273e-07,0.039326,0.015849,0.739302,1.333656,6.126699e-08,,1957-01-31
1957-01-31,10057,4.025000e+04,1.247748,1.556875,0.025785,7.901007,0.025506,0.046433,0.086667,0.055247,...,,0.005954,1.380375e-06,0.056856,0.019945,0.755510,0.410391,3.315790e+00,,1957-01-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,93428,1.250976e+06,1.599794,2.559341,-0.137348,14.916210,0.043205,0.016130,-0.001203,-0.047018,...,6.0,0.029302,1.010214e-09,0.027604,0.017847,0.490189,5.347476,8.664791e-09,73.0,2016-12-30
2016-12-30,93429,5.600537e+06,0.299106,0.089465,0.206434,16.319190,0.026198,-0.090970,0.093973,0.000143,...,6.0,0.016001,1.093471e-10,0.040887,0.010437,0.539155,6.252920,8.275139e-09,62.0,2016-12-30
2016-12-30,93433,2.473625e+04,2.331811,5.437343,0.328248,9.975208,0.181381,0.066502,0.279070,-0.358209,...,2.0,0.149511,3.889972e-07,0.375742,0.126628,1.285707,29.078756,5.014198e-09,65.0,2016-12-30
2016-12-30,93434,8.573280e+04,0.630254,0.397221,0.020854,10.862196,0.059796,0.105374,-0.049505,0.109890,...,3.0,0.042877,8.576942e-08,0.031579,0.019838,0.629682,2.431419,3.518593e-08,1.0,2016-12-30


## Stocks Selection

In [189]:
top500_df = Stocks.groupby(Stocks.index).apply(lambda x: x.nlargest(500, 'mvel1')).reset_index(drop=True)

In [190]:
top500_df.index = top500_df["Month"]

In [191]:
top500_df

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01-31,12079,1.220639e+07,0.970338,0.941557,0.003975,11.982824,0.023975,0.046433,0.011494,-0.011848,...,,0.015315,1.014458e-08,0.025862,0.012795,0.249423,0.020482,1.512433e-06,37.0,1957-01-31
1957-01-31,11850,1.153415e+07,1.091870,1.192180,-0.087734,11.982824,0.020358,0.046433,0.042129,0.006240,...,,0.010303,5.288339e-09,0.035477,0.010849,0.227785,0.028696,1.056100e-06,29.0,1957-01-31
1957-01-31,10401,1.077829e+07,0.356425,0.127039,-0.007586,11.982824,0.006234,0.046433,0.025092,-0.009652,...,,0.004239,1.746008e-09,0.014053,0.003858,0.374861,0.061228,7.925874e-07,48.0,1957-01-31
1957-01-31,11703,8.769161e+06,1.125589,1.266951,0.004104,11.982824,0.019420,0.046433,0.059066,-0.121581,...,,0.013444,1.142168e-08,0.042781,0.015885,0.327313,0.051603,9.674371e-07,28.0,1957-01-31
1957-01-31,12060,5.225362e+06,1.067303,1.139136,-0.068510,11.787028,0.024554,0.046433,0.025316,-0.020016,...,,0.011841,1.153844e-08,0.016878,0.008290,0.229225,0.023876,1.101224e-06,99.0,1957-01-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,3.0,0.018265,6.410270e-10,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12-30
2016-12-30,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,4.0,0.022101,2.730737e-10,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12-30
2016-12-30,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,4.0,0.022755,8.764646e-11,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12-30
2016-12-30,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,6.0,0.019966,1.167440e-10,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12-30


In [192]:
top500_df= top500_df.rename_axis('Date')

In [193]:
top500_df

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01-31,12079,1.220639e+07,0.970338,0.941557,0.003975,11.982824,0.023975,0.046433,0.011494,-0.011848,...,,0.015315,1.014458e-08,0.025862,0.012795,0.249423,0.020482,1.512433e-06,37.0,1957-01-31
1957-01-31,11850,1.153415e+07,1.091870,1.192180,-0.087734,11.982824,0.020358,0.046433,0.042129,0.006240,...,,0.010303,5.288339e-09,0.035477,0.010849,0.227785,0.028696,1.056100e-06,29.0,1957-01-31
1957-01-31,10401,1.077829e+07,0.356425,0.127039,-0.007586,11.982824,0.006234,0.046433,0.025092,-0.009652,...,,0.004239,1.746008e-09,0.014053,0.003858,0.374861,0.061228,7.925874e-07,48.0,1957-01-31
1957-01-31,11703,8.769161e+06,1.125589,1.266951,0.004104,11.982824,0.019420,0.046433,0.059066,-0.121581,...,,0.013444,1.142168e-08,0.042781,0.015885,0.327313,0.051603,9.674371e-07,28.0,1957-01-31
1957-01-31,12060,5.225362e+06,1.067303,1.139136,-0.068510,11.787028,0.024554,0.046433,0.025316,-0.020016,...,,0.011841,1.153844e-08,0.016878,0.008290,0.229225,0.023876,1.101224e-06,99.0,1957-01-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,3.0,0.018265,6.410270e-10,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12-30
2016-12-30,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,4.0,0.022101,2.730737e-10,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12-30
2016-12-30,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,4.0,0.022755,8.764646e-11,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12-30
2016-12-30,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,6.0,0.019966,1.167440e-10,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12-30


### List with Stock Characteristics

In [None]:
characteristics_l = list(set(top500_df.columns).difference({'permno', 'Month', 'sic2', 'weight', 'total_market_cap'}))
len(characteristics_l)

## Stock-Weighting

In this section we use the stock market cap to compute the corresponding weight in that point in time for every company in the replicating portfolio (S&P500). 

In [194]:
total_market_cap = top500_df.groupby(top500_df['Month'].dt.to_period("M"))['mvel1'].sum()
top500_df = top500_df.merge(total_market_cap.rename('total_market_cap'), left_on=top500_df['Month'].dt.to_period("M"), right_index=True)

In [195]:
top500_df['weight'] = top500_df['mvel1'] / top500_df['total_market_cap']
top500_df = top500_df.drop('key_0', axis=1)
top500_df

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month,total_market_cap,weight
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01-31,12079,1.220639e+07,0.970338,0.941557,0.003975,11.982824,0.023975,0.046433,0.011494,-0.011848,...,1.014458e-08,0.025862,0.012795,0.249423,0.020482,1.512433e-06,37.0,1957-01-31,2.029613e+08,0.060141
1957-01-31,11850,1.153415e+07,1.091870,1.192180,-0.087734,11.982824,0.020358,0.046433,0.042129,0.006240,...,5.288339e-09,0.035477,0.010849,0.227785,0.028696,1.056100e-06,29.0,1957-01-31,2.029613e+08,0.056829
1957-01-31,10401,1.077829e+07,0.356425,0.127039,-0.007586,11.982824,0.006234,0.046433,0.025092,-0.009652,...,1.746008e-09,0.014053,0.003858,0.374861,0.061228,7.925874e-07,48.0,1957-01-31,2.029613e+08,0.053105
1957-01-31,11703,8.769161e+06,1.125589,1.266951,0.004104,11.982824,0.019420,0.046433,0.059066,-0.121581,...,1.142168e-08,0.042781,0.015885,0.327313,0.051603,9.674371e-07,28.0,1957-01-31,2.029613e+08,0.043206
1957-01-31,12060,5.225362e+06,1.067303,1.139136,-0.068510,11.787028,0.024554,0.046433,0.025316,-0.020016,...,1.153844e-08,0.016878,0.008290,0.229225,0.023876,1.101224e-06,99.0,1957-01-31,2.029613e+08,0.025746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-30,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,6.410270e-10,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12-30,2.161774e+10,0.000473
2016-12-30,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,2.730737e-10,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12-30,2.161774e+10,0.000472
2016-12-30,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,8.764646e-11,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12-30,2.161774e+10,0.000472
2016-12-30,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,1.167440e-10,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12-30,2.161774e+10,0.000472


In [196]:
top500_df['Month'] = top500_df['Month'].dt.strftime('%Y-%m')
top500_df.index = top500_df['Month']

In [197]:
top500_df = top500_df.rename_axis('Date')
top500_df

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month,total_market_cap,weight
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01,12079,1.220639e+07,0.970338,0.941557,0.003975,11.982824,0.023975,0.046433,0.011494,-0.011848,...,1.014458e-08,0.025862,0.012795,0.249423,0.020482,1.512433e-06,37.0,1957-01,2.029613e+08,0.060141
1957-01,11850,1.153415e+07,1.091870,1.192180,-0.087734,11.982824,0.020358,0.046433,0.042129,0.006240,...,5.288339e-09,0.035477,0.010849,0.227785,0.028696,1.056100e-06,29.0,1957-01,2.029613e+08,0.056829
1957-01,10401,1.077829e+07,0.356425,0.127039,-0.007586,11.982824,0.006234,0.046433,0.025092,-0.009652,...,1.746008e-09,0.014053,0.003858,0.374861,0.061228,7.925874e-07,48.0,1957-01,2.029613e+08,0.053105
1957-01,11703,8.769161e+06,1.125589,1.266951,0.004104,11.982824,0.019420,0.046433,0.059066,-0.121581,...,1.142168e-08,0.042781,0.015885,0.327313,0.051603,9.674371e-07,28.0,1957-01,2.029613e+08,0.043206
1957-01,12060,5.225362e+06,1.067303,1.139136,-0.068510,11.787028,0.024554,0.046433,0.025316,-0.020016,...,1.153844e-08,0.016878,0.008290,0.229225,0.023876,1.101224e-06,99.0,1957-01,2.029613e+08,0.025746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,6.410270e-10,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12,2.161774e+10,0.000473
2016-12,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,2.730737e-10,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12,2.161774e+10,0.000472
2016-12,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,8.764646e-11,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12,2.161774e+10,0.000472
2016-12,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,1.167440e-10,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12,2.161774e+10,0.000472


## Adding Stock Returns

In this section we add to the dataframe the returns of every stock in the corresponding point in time. The data is retrieved from the CRSP databse (via WRDS) based on the PERMNO number of the stock. 

In [198]:
Rets = pd.read_csv("Permno_date_return.csv")
Rets['date'] = pd.to_datetime(Rets['date'])
Rets['date'] = Rets['date'].dt.strftime('%Y-%m')
Rets.index = Rets['date']
Rets.rename({'date':'Month'}, inplace=True, axis=1)
Rets

Unnamed: 0_level_0,PERMNO,Month,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1957-01,10006,1957-01,0.064378
1957-02,10006,1957-02,0.002016
1957-03,10006,1957-03,0.018405
1957-04,10006,1957-04,-0.008032
1957-05,10006,1957-05,0.004049
...,...,...,...
2016-08,93436,2016-08,-0.097023
2016-09,93436,2016-09,-0.037640
2016-10,93436,2016-10,-0.030878
2016-11,93436,2016-11,-0.042128


In [199]:
Rets.rename({'PERMNO':'permno'}, inplace=True, axis=1)
Rets

Unnamed: 0_level_0,permno,Month,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1957-01,10006,1957-01,0.064378
1957-02,10006,1957-02,0.002016
1957-03,10006,1957-03,0.018405
1957-04,10006,1957-04,-0.008032
1957-05,10006,1957-05,0.004049
...,...,...,...
2016-08,93436,2016-08,-0.097023
2016-09,93436,2016-09,-0.037640
2016-10,93436,2016-10,-0.030878
2016-11,93436,2016-11,-0.042128


In [200]:
merged = pd.merge(top500_df, Rets, on=['permno', 'Month'])
merged.index = merged['Month']
merged = merged.rename_axis('Date')
merged

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,Month,total_market_cap,weight,RET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01,12079,1.220639e+07,0.970338,0.941557,0.003975,11.982824,0.023975,0.046433,0.011494,-0.011848,...,0.025862,0.012795,0.249423,0.020482,1.512433e-06,37.0,1957-01,2.029613e+08,0.060141,-0.082386
1957-01,11850,1.153415e+07,1.091870,1.192180,-0.087734,11.982824,0.020358,0.046433,0.042129,0.006240,...,0.035477,0.010849,0.227785,0.028696,1.056100e-06,29.0,1957-01,2.029613e+08,0.056829,-0.017021
1957-01,10401,1.077829e+07,0.356425,0.127039,-0.007586,11.982824,0.006234,0.046433,0.025092,-0.009652,...,0.014053,0.003858,0.374861,0.061228,7.925874e-07,48.0,1957-01,2.029613e+08,0.053105,0.031364
1957-01,11703,8.769161e+06,1.125589,1.266951,0.004104,11.982824,0.019420,0.046433,0.059066,-0.121581,...,0.042781,0.015885,0.327313,0.051603,9.674371e-07,28.0,1957-01,2.029613e+08,0.043206,-0.045396
1957-01,12060,5.225362e+06,1.067303,1.139136,-0.068510,11.787028,0.024554,0.046433,0.025316,-0.020016,...,0.016878,0.008290,0.229225,0.023876,1.101224e-06,99.0,1957-01,2.029613e+08,0.025746,-0.107884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,0.243695,...,0.074799,0.022341,0.480976,1.300660,3.575367e-08,37.0,2016-12,2.161774e+10,0.000473,0.074279
2016-12,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,0.080280,...,0.064590,0.020551,0.498526,3.258613,1.525324e-08,62.0,2016-12,2.161774e+10,0.000472,-0.034056
2016-12,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,-0.073532,...,0.026764,0.015357,0.442136,6.000719,7.869013e-09,31.0,2016-12,2.161774e+10,0.000472,-0.028373
2016-12,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,0.048481,...,0.037188,0.014731,0.430538,4.131828,1.110965e-08,60.0,2016-12,2.161774e+10,0.000472,0.040418


In [None]:
#Rets['RET'] = pd.to_numeric(Rets['RET'], errors='coerce')
#Rets['RET'] = Rets['RET'].fillna(0)


## Creating Dummy variables for SIC2 characteristic

In this section we compiute the dummy variables for the industry code (variable) "SIC2"

In [None]:
dummy_variables = pd.get_dummies(merged['sic2'], prefix='SIC')
merged_dum = pd.concat([merged, dummy_variables], axis=1)

merged_dum

In the paper 74 dummies are obtained, in our case only 66 since we are reducing the analysis only to the firms contained in the S&P500 in the time period considered, which results in 8 less industry dummies.

## Replicating Portfolio - Returns

In this section we compute the (weighted) monthly return of every stock in order to compute the return of the replicating portfolio in every period. This step will also allow to check the correlation or the replicating portfolio returns with the ones of the index, to check the accuracy of the portfolio with the benchmark. 

In [None]:
merged['RET'] = merged['RET'].astype(float)
merged['weighted_RET'] = merged['weight'] * merged['RET']
replicating_returns = merged.groupby('Date')['weighted_RET'].sum().reset_index()
replicating_returns.index = replicating_returns['Date']
replicating_returns = replicating_returns.drop('Date', axis=1)
replicating_returns

## Macro Predictors

The data source is professor's Amid Goyal's personal website, but this dataset has been included in the Github folder under the name "PredictorData2022".

In [201]:
Macro_pred = pd.read_csv('PredictorData2022.csv', parse_dates=True, index_col=0)
Macro_pred.index = pd.to_datetime(Macro_pred.index, format="%Y%m").to_period('M')
Macro_pred = Macro_pred.rename_axis('Date')
Macro_pred = Macro_pred['1957':'2016']
#Macro_pred = Macro_pred['']
Macro_pred

Unnamed: 0_level_0,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1957-01,44.72,1.7367,3.4067,0.567243,0.0311,0.0377,0.0449,0.0328,0.027992,0.0027,0.000000,0.0346,0.0197,0.000902,0.000739,-0.040238,-0.041569
1957-02,43.26,1.7333,3.4033,0.584994,0.0310,0.0367,0.0447,0.0328,0.030173,0.0024,0.003623,0.0025,0.0093,0.001056,0.000827,-0.024919,-0.032823
1957-03,44.11,1.7300,3.4000,0.599819,0.0308,0.0366,0.0443,0.0331,0.026600,0.0023,0.003610,-0.0024,0.0050,0.000330,0.001054,0.023827,0.020752
1957-04,45.74,1.7300,3.4067,0.576098,0.0307,0.0367,0.0444,0.0345,0.027421,0.0025,0.003597,-0.0222,-0.0066,0.000302,0.002142,0.046538,0.045215
1957-05,47.43,1.7300,3.4133,0.564039,0.0306,0.0374,0.0452,0.0348,0.028849,0.0026,0.003584,-0.0023,-0.0075,0.000482,0.002422,0.038734,0.033208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08,2170.95,44.8371,88.3667,0.315197,0.0030,0.0332,0.0424,0.0186,-0.030782,0.0002,0.000918,-0.0140,0.0016,0.000279,,0.001247,-0.001396
2016-09,2168.27,45.0257,89.0900,0.316794,0.0029,0.0341,0.0431,0.0196,-0.032603,0.0002,0.002404,-0.0124,-0.0119,0.001673,,0.000446,-0.001033
2016-10,2126.15,45.2507,90.9100,0.319688,0.0033,0.0351,0.0438,0.0220,-0.029034,0.0002,0.001247,-0.0314,-0.0263,0.000364,,-0.017958,-0.019206
2016-11,2198.81,45.4756,92.7300,0.303286,0.0045,0.0386,0.0471,0.0267,-0.027452,0.0001,-0.001555,-0.0599,-0.0510,0.000946,,0.035790,0.033007


#### Macroeconomic Factors: 
* including dividend-price ratio (dp): (d/p) is the difference between the log of dividends and the log of prices.
* earnings-price ratio (ep): (e/p) is the difference between the log of earnings and the log of prices.
* book-to-market ratio (bm)
* net equity expansion (ntis)
* Treasury-bill rate (tbl)
* term spread (tms): The Term Spread (tms) is the difference between the long term yield on government bonds (lty) and the Treasury-bill (tbl)
* default spread (dfy): The Default Yield Spread (dfy) is the difference between BAA and AAA-rated corporate bond yields.
* stock variance (svar)

In [None]:
Macro_pred['Index'] = Macro_pred['Index'].str.replace(',', '').astype(float)
Macro_pred["d/p"] = np.log(Macro_pred["D12"]) - np.log(Macro_pred["Index"])
Macro_pred["e/p"] = np.log(Macro_pred["E12"]) - np.log(Macro_pred["Index"])
Macro_pred["tms"] = Macro_pred["lty"] - Macro_pred["tbl"]
Macro_pred['dfy'] = Macro_pred['BAA'] - Macro_pred['AAA']

Only the macroeconomic varibales used in the paper are selected:

In [None]:
# drop the redundent variables - Selecting the needed Macro predictors used in the paper
Macro_pred_sel = Macro_pred.drop(columns=['Index','D12', 'E12', 'AAA', 'BAA', 'CRSP_SPvwx', 'corpr','Rfree', 'CRSP_SPvw', 'lty','infl', 'ltr','csp'])
Macro_pred_sel

### Creating list with Macro Predictors

In [None]:
Macro_pred_l = ['b/m', 'tbl', 'ntis', 'svar', 'd/p', 'e/p', 'tms', 'dfy']

In [None]:
# Create the 'months' column from the existing DatetimeIndex
Macro_pred_sel['Month'] = Macro_pred_sel.index.strftime('%Y-%m')

# Display the DataFrame to verify the new column
print(Macro_pred_sel.head())


              b/m     tbl      ntis      svar       d/p       e/p     tms  \
Date                                                                        
1957-01  0.567243  0.0311  0.027992  0.000902 -3.248434 -2.574677  0.0017   
1957-02  0.584994  0.0310  0.030173  0.001056 -3.217201 -2.542483  0.0018   
1957-03  0.599819  0.0308  0.026600  0.000330 -3.238565 -2.562911  0.0023   
1957-04  0.576098  0.0307  0.027421  0.000302 -3.274852 -2.597229  0.0038   
1957-05  0.564039  0.0306  0.028849  0.000482 -3.311134 -2.631575  0.0042   

            dfy    Month  
Date                      
1957-01  0.0072  1957-01  
1957-02  0.0080  1957-02  
1957-03  0.0077  1957-03  
1957-04  0.0077  1957-04  
1957-05  0.0078  1957-05  


## Reliability of Replicating Portfolio - S&P500 Sanity Check

**Correlation:**

In [None]:
np.corrcoef(replicating_returns['weighted_RET'], Macro_pred['CRSP_SPvw'])

**Mean:**

In [None]:
abs(np.mean(replicating_returns['weighted_RET'])-np.mean(Macro_pred['CRSP_SPvw']))

**Variance:**

In [None]:
abs(np.var(replicating_returns['weighted_RET'])-np.var(Macro_pred['CRSP_SPvw']))

**Skewness:**

In [None]:
abs(skew(replicating_returns['weighted_RET'])-skew(Macro_pred['CRSP_SPvw']))

**Kurtosis:**

In [None]:
abs(kurtosis(replicating_returns['weighted_RET'])-kurtosis(Macro_pred['CRSP_SPvw']))

## Creating Interaction Terms

In [None]:
merged_macro_char = pd.merge(Macro_pred_sel, merged_dum, on=['Month'])
merged_macro_char.index = merged_macro_char['Month']
merged_macro_char = merged_macro_char.rename_axis('Date')
merged_macro_char

In [None]:
data = merged_macro_char.copy()
for fc in characteristics_l:
    for mp in Macro_pred_l:
        data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
        
data

  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp] = merged_macro_char[fc] * merged_macro_char[mp]
  data[fc + '*' + mp

Unnamed: 0_level_0,b/m,tbl,ntis,svar,d/p,e/p,tms,dfy,Month,permno,...,std_turn*tms,std_turn*dfy,zerotrade*b/m,zerotrade*tbl,zerotrade*ntis,zerotrade*svar,zerotrade*d/p,zerotrade*e/p,zerotrade*tms,zerotrade*dfy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01,0.567243,0.0311,0.027992,0.000902,-3.248434,-2.574677,0.0017,0.0072,1957-01,12079,...,0.000035,0.000147,8.579164e-07,4.703666e-08,4.233601e-08,1.364127e-09,-4.913038e-06,-3.894025e-06,2.571136e-09,1.088952e-08
1957-01,0.567243,0.0311,0.027992,0.000902,-3.248434,-2.574677,0.0017,0.0072,1957-01,11850,...,0.000049,0.000207,5.990650e-07,3.284471e-08,2.956234e-08,9.525415e-10,-3.430671e-06,-2.719116e-06,1.795370e-09,7.603920e-09
1957-01,0.567243,0.0311,0.027992,0.000902,-3.248434,-2.574677,0.0017,0.0072,1957-01,10401,...,0.000104,0.000441,4.495894e-07,2.464947e-08,2.218610e-08,7.148683e-10,-2.574668e-06,-2.040656e-06,1.347399e-09,5.706629e-09
1957-01,0.567243,0.0311,0.027992,0.000902,-3.248434,-2.574677,0.0017,0.0072,1957-01,11703,...,0.000088,0.000372,5.487716e-07,3.008729e-08,2.708049e-08,8.725726e-10,-3.142656e-06,-2.490838e-06,1.644643e-09,6.965547e-09
1957-01,0.567243,0.0311,0.027992,0.000902,-3.248434,-2.574677,0.0017,0.0072,1957-01,12060,...,0.000041,0.000172,6.246612e-07,3.424806e-08,3.082545e-08,9.932406e-10,-3.577253e-06,-2.835296e-06,1.872081e-09,7.928812e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,0.293479,0.0051,-0.025104,0.000524,-3.891597,-3.164580,0.0221,0.0077,2016-12,15735,...,0.028745,0.010015,1.049295e-08,1.823437e-10,-8.975448e-10,1.873570e-11,-1.391389e-07,-1.131453e-07,7.901560e-10,2.753032e-10
2016-12,0.293479,0.0051,-0.025104,0.000524,-3.891597,-3.164580,0.0221,0.0077,2016-12,69649,...,0.072015,0.025091,4.476508e-09,7.779153e-11,-3.829109e-10,7.993033e-12,-5.935947e-08,-4.827010e-08,3.370966e-10,1.174500e-10
2016-12,0.293479,0.0051,-0.025104,0.000524,-3.891597,-3.164580,0.0221,0.0077,2016-12,88661,...,0.132616,0.046206,2.309391e-09,4.013197e-11,-1.975404e-10,4.123535e-12,-3.062303e-08,-2.490212e-08,1.739052e-10,6.059140e-11
2016-12,0.293479,0.0051,-0.025104,0.000524,-3.891597,-3.164580,0.0221,0.0077,2016-12,91461,...,0.091313,0.031815,3.260451e-09,5.665923e-11,-2.788920e-10,5.821701e-12,-4.323430e-08,-3.515738e-08,2.455233e-10,8.554433e-11


In [None]:
data=data.drop(columns=Macro_pred_l)

In [None]:
data

Unnamed: 0_level_0,Month,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,...,std_turn*tms,std_turn*dfy,zerotrade*b/m,zerotrade*tbl,zerotrade*ntis,zerotrade*svar,zerotrade*d/p,zerotrade*e/p,zerotrade*tms,zerotrade*dfy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01,1957-01,12079,1.220639e+07,0.970338,0.941557,0.003975,11.982824,0.023975,0.046433,0.011494,...,0.000035,0.000147,8.579164e-07,4.703666e-08,4.233601e-08,1.364127e-09,-4.913038e-06,-3.894025e-06,2.571136e-09,1.088952e-08
1957-01,1957-01,11850,1.153415e+07,1.091870,1.192180,-0.087734,11.982824,0.020358,0.046433,0.042129,...,0.000049,0.000207,5.990650e-07,3.284471e-08,2.956234e-08,9.525415e-10,-3.430671e-06,-2.719116e-06,1.795370e-09,7.603920e-09
1957-01,1957-01,10401,1.077829e+07,0.356425,0.127039,-0.007586,11.982824,0.006234,0.046433,0.025092,...,0.000104,0.000441,4.495894e-07,2.464947e-08,2.218610e-08,7.148683e-10,-2.574668e-06,-2.040656e-06,1.347399e-09,5.706629e-09
1957-01,1957-01,11703,8.769161e+06,1.125589,1.266951,0.004104,11.982824,0.019420,0.046433,0.059066,...,0.000088,0.000372,5.487716e-07,3.008729e-08,2.708049e-08,8.725726e-10,-3.142656e-06,-2.490838e-06,1.644643e-09,6.965547e-09
1957-01,1957-01,12060,5.225362e+06,1.067303,1.139136,-0.068510,11.787028,0.024554,0.046433,0.025316,...,0.000041,0.000172,6.246612e-07,3.424806e-08,3.082545e-08,9.932406e-10,-3.577253e-06,-2.835296e-06,1.872081e-09,7.928812e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12,2016-12,15735,1.022451e+07,1.153902,1.331490,0.348974,15.285029,0.037444,0.030660,0.025583,...,0.028745,0.010015,1.049295e-08,1.823437e-10,-8.975448e-10,1.873570e-11,-1.391389e-07,-1.131453e-07,7.901560e-10,2.753032e-10
2016-12,2016-12,69649,1.021339e+07,1.235921,1.527500,0.330644,15.823137,0.023783,-0.090970,0.196607,...,0.072015,0.025091,4.476508e-09,7.779153e-11,-3.829109e-10,7.993033e-12,-5.935947e-08,-4.827010e-08,3.370966e-10,1.174500e-10
2016-12,2016-12,88661,1.020248e+07,0.891193,0.794224,-0.324766,17.084926,0.032959,-0.010422,0.013931,...,0.132616,0.046206,2.309391e-09,4.013197e-11,-1.975404e-10,4.123535e-12,-3.062303e-08,-2.490212e-08,1.739052e-10,6.059140e-11
2016-12,2016-12,91461,1.019650e+07,1.013885,1.027964,0.049796,16.472893,0.022217,0.076096,0.047833,...,0.091313,0.031815,3.260451e-09,5.665923e-11,-2.788920e-10,5.821701e-12,-4.323430e-08,-3.515738e-08,2.455233e-10,8.554433e-11


## Scaling Variables to [-1,1] and Dataset Creation

In [None]:
from sklearn.preprocessing import MinMaxScaler

features = list(set(data.columns).difference({'permno','Month','RET'})) # a list storing all features

X = MinMaxScaler((-1,1)).fit_transform(data[features])
X = pd.DataFrame(X, columns=features)

In [None]:
y = replicating_returns
y

In [None]:
X.index = data.index
X.fillna(0, inplace=True)
X

## Exporting Processed Data

If you want to export the processed datasets, delete the '#' symbols from the cell below. Note that it takes a bit of time depending on the computer, generally between 3 and 10 minutes.

In [None]:
X.to_csv('Features_X.csv', index=False)
y.to_csv('Dependet_y.csv', index=False)