In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('data.csv')

df = df.drop(columns = ['Unnamed: 0'])

# According to note 30: "Therefore, to predict returns at month t+1, we use most recent monthly characteristics at the end of month t." <br>
# Hence, **shift return t+1 to serve as response: r(t+1)**.

df['r(t+1)'] = df.groupby('permno')['return'].shift(-1)

### handle missing data

# According to note 30 (bottom of p 2248): "Another issue is missing characteristics, which we replace with the cross-sectional median at each month for each stock, respectively." <br>
# Hence, calculate monthly cross-sectional median for features: **'mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr'**.

df_filled = df.copy()
for feature in ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']:
    df_filled[feature] = df_filled.groupby('Date')[feature].transform(lambda x: x.fillna(x.median()))

df_filled.isna().sum()

df.loc[:, ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']] = df_filled.loc[:,['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']]

df['Date'] = pd.to_datetime(df['Date'])

# Set the datetime column as index
df.set_index('Date', inplace=True, drop = True)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df_scaled = scaler.fit_transform(df)

df_scaled = pd.DataFrame(df_scaled, columns=df.columns)


permno = df['permno'].reset_index(drop = True)

df_scaled['permno'] = permno

df_scaled.index = df.index

use dataframe: df_scaled<br>
try not to reset the index

In [2]:
df_scaled

Unnamed: 0_level_0,permno,return,mom1m,mom12m,chmom,indmom,mom36m,turn,mvel1,dolvol,...,retvol,idiovol,beta,betasq,ep,sp,agr,nincr,return(t-1),r(t+1)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-01-31,13610,-1.577601,2.067047,-0.423509,1.027879,0.298817,-0.641896,-0.461379,-0.241530,-0.183589,...,0.343957,0.054602,-1.125798,-0.818182,0.109832,-0.005714,1.352752,3.271570,,1.068007
2001-01-31,13856,-0.928752,0.647658,0.318155,-0.292685,-0.154773,-0.359650,-0.551531,2.804602,1.483815,...,-0.213394,-0.403658,-1.604035,-0.904739,0.164941,-0.435813,1.039952,0.044234,,0.234089
2001-01-31,13901,-0.034564,1.197333,1.209452,0.999410,1.010986,-0.840862,-0.486887,3.941007,1.573948,...,-0.155772,-0.148161,-1.633817,-0.906819,0.435438,-0.135575,0.262103,0.851068,,0.580304
2001-01-31,13928,0.128928,-0.042072,0.194538,0.079124,1.010986,-0.332444,-0.465040,0.342760,0.965806,...,-0.132290,-0.546369,-1.516602,-0.896391,0.194226,-0.139399,0.122298,0.044234,,-0.722710
2001-01-31,13936,-0.007235,-0.018197,0.849310,0.445356,-0.666015,-0.793118,-0.498117,-0.268943,-0.348039,...,0.208855,0.418791,-0.042420,-0.251914,0.300888,2.164993,0.340622,-0.762600,,0.983025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31,89279,0.385862,0.079381,-0.722938,0.613486,-0.339810,-0.276255,-0.820610,-0.277718,-1.194736,...,0.022378,-0.641827,-0.058042,-0.263729,0.146997,-0.516974,0.089618,0.044234,0.098223,
2019-12-31,89317,0.990307,0.621034,-0.508996,0.971117,-0.534816,2.596698,-0.905457,-0.271309,-1.430611,...,-0.155995,0.934011,0.968611,0.739869,0.984190,-0.371310,1.407156,-0.762600,0.566403,
2019-12-31,89456,0.712041,0.340663,0.171882,1.465368,0.014624,-0.928936,-0.612435,-0.278154,-0.860719,...,-0.248385,0.071144,0.153347,-0.094796,-0.316093,0.404040,0.667478,0.044234,0.318867,
2019-12-31,89790,-0.065003,-0.756850,-0.065572,-0.229284,-1.688349,-0.114813,-0.430683,-0.208020,0.288665,...,0.054204,1.307453,1.181609,1.005859,-0.138276,-0.592114,-1.143443,-0.762600,-0.703883,


In [7]:
companies = pd.read_csv('Companies.csv')
companies.columns

merged_data = pd.merge(df_scaled, companies, on='permno', how='inner')

merged_data.to_csv('merged_data.csv', index=False)

In [9]:
merged_data

Unnamed: 0,permno,return,mom1m,mom12m,chmom,indmom,mom36m,turn,mvel1,dolvol,...,ep,sp,agr,nincr,return(t-1),r(t+1),SICCD,NCUSIP,TICKER,COMNAM
0,13610,-1.577601,2.067047,-0.423509,1.027879,0.298817,-0.641896,-0.461379,-0.241530,-0.183589,...,0.109832,-0.005714,1.352752,3.271570,,1.068007,2810,,,OLIN MATHIESON CHEM CORP
1,13610,1.058516,-1.574474,0.371761,0.494096,-0.336094,-0.671895,-0.346706,-0.249674,0.184594,...,0.109832,-0.005714,1.352752,3.271570,-1.577213,-0.175120,2810,,,OLIN MATHIESON CHEM CORP
2,13610,-0.177782,1.225698,0.120184,0.382901,-0.927110,-0.799050,-0.282267,-0.244187,0.150266,...,0.109832,-0.005714,1.352752,3.271570,1.059436,-0.422289,2810,,,OLIN MATHIESON CHEM CORP
3,13610,-0.423593,-0.228519,0.170046,0.697031,-0.828601,-0.722422,-0.303494,-0.245688,0.029208,...,0.109832,-0.005714,1.352752,4.078404,-0.177111,-0.052509,2810,,,OLIN MATHIESON CHEM CORP
4,13610,-0.055845,-0.642610,0.128872,0.129373,-0.787751,-0.720479,-0.223918,-0.248413,0.118913,...,0.109832,-0.005714,1.352752,4.078404,-0.422972,-0.817485,2810,,,OLIN MATHIESON CHEM CORP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115819,89915,-0.538794,0.698640,-0.134446,1.005318,-0.285345,0.689510,-0.348149,-0.212936,0.273253,...,0.227653,-0.434535,0.149612,-0.762600,0.607219,0.512273,,,,
115820,89915,0.505834,-0.559286,0.130665,0.073562,-0.464183,0.633251,-0.436287,-0.217209,0.274621,...,0.227653,-0.434535,0.149612,-0.762600,-0.538197,0.667431,,,,
115821,89915,0.660140,0.581295,0.133048,0.380996,-0.490072,0.522500,-0.473676,-0.211290,0.155966,...,0.227653,-0.434535,0.149612,-0.762600,0.506644,-0.385375,,,,
115822,89915,-0.386882,0.698823,0.370886,0.133777,-0.223460,0.582557,-0.455938,-0.204242,0.232663,...,0.227653,-0.434535,0.149612,-0.762600,0.660980,0.249742,,,,
