In [1]:
# Import libraries and functions
import pandas as pd
import numpy as np
from scipy import stats

import gc

# Risk disclosure

In [2]:
# Load the risk-factor (RF) level data; both date columns are parsed as datetimes
topics_df = pd.read_csv("Data/RDdf_T2V5.csv", parse_dates=['report_dt', 'filing_dt'])

# 'NERs' is stored as text: strip all spaces, then extract every single-quoted token into a list
topics_df['NERs'] = topics_df['NERs'].str.replace(pat=" ", repl="").str.findall(pat=r"'(.*?)'")

# Entity labels that count towards specificity.
# Every label needs its own comma: adjacent literals such as 'NORP' 'ORG' are implicitly
# concatenated by Python into the single label 'NORPORG', which silently drops both labels.
NE_labels = ['PERSON', 'NORP', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'LAW', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY']
# Specificity = number of extracted entity labels of a risk factor that are in NE_labels
topics_df['Specificity'] = topics_df['NERs'].apply(lambda NERs: len([ne for ne in NERs if ne in NE_labels]))

# Three-digit SIC: zero-pad the code to four digits and keep the first three
topics_df['SIC3'] = topics_df['SIC'].map(lambda x: f"{int(x):04d}"[:3])


print(topics_df.shape)
topics_df.columns

(3245397, 25)


Index(['Topic', 'Score', 'Topic_H', 'Score_H', 'CIK', 'report_dt', 'filing_dt',
       'rf_seq', 'ticker', 'filerCIK', 'rf_length', 'NERs', 'Pa', 'Pr', 'Fu',
       'Sentiment', 'FOG', 'clean_len', 'SIC', 'FF', 'ryear', 'fyear',
       'rf_seq_count', 'Specificity', 'SIC3'],
      dtype='object')

In [3]:
# Use the cleaned-text length as the RF length: pop() removes 'clean_len' from the frame and
# its values overwrite the existing 'rf_length' column in place
topics_df["rf_length"] = topics_df.pop("clean_len")

### Firm level

In [4]:
# Number of distinct hierarchical topics
N = topics_df['Topic_H'].nunique()

# Wide 0/1 matrix: one row per (CIK, filing_dt, report_dt, FF), one column per topic;
# 1 when the report has at least one scored risk factor on the topic, else 0
disc_df = (
    topics_df
    .pivot_table(
        index=["CIK", "filing_dt", "report_dt", "FF"],
        columns='Topic_H',
        values='Score',
    )
    .notna()
    .astype(int)
    .reset_index()
)

# Long format (one row per report x topic), ordered by firm, topic and date
disc_long = (
    disc_df
    .melt(id_vars=["CIK", "filing_dt", "report_dt", "FF"], value_name='Disclosed')
    .sort_values(["CIK", 'Topic_H', "filing_dt", "report_dt"])
    .astype({'Topic_H': int})
)

disc_long = disc_long.assign(
    # Running number of reports of the firm that disclosed the topic so far
    DiscSum=lambda d: d.groupby(["CIK", 'Topic_H'])['Disclosed'].cumsum(),
    # Number of topics disclosed in the report
    TotalRFs=lambda d: d.groupby(["CIK", "filing_dt", "report_dt"])['Disclosed'].transform('sum'),
)

In [5]:

# Change in each topic's 0/1 disclosure flag relative to the firm's previous row in disc_df
# (+1 newly disclosed, -1 dropped, 0 unchanged; NaN on a firm's first row)
disc_diff = disc_df.filter(range(N)) - disc_df.groupby("CIK")[disc_df.filter(range(N)).columns].shift(1)

# Repeated risk factors: 1 when the topic is flagged both in this row and in the firm's previous row
# (flag + lagged flag == 2)
disc_repeat = (
    disc_df.filter(range(N))
    + disc_df.groupby("CIK")[disc_df.filter(range(N)).columns].shift(1) 
    == 2
).astype(int)

# Whether the risk factor was disclosed in the firm's previous filing.
# The lag is taken over rows de-duplicated on (CIK, Topic_H, filing_dt); rows removed by the
# de-duplication get NaN through index alignment and are then forward-filled within (CIK, Topic_H).
disc_long['LstYrDisc'] = disc_long.drop_duplicates(subset=['CIK', 'Topic_H', 'filing_dt']).groupby(['CIK', 'Topic_H'])['Disclosed'].shift(1)
disc_long['LstYrDisc'] = disc_long.groupby(['CIK', 'Topic_H'])['LstYrDisc'].ffill()

# Added / removed / repeated dummies; all three are 0 when LstYrDisc is NaN (no previous filing)
disc_long['New'] = ((disc_long['LstYrDisc']==0)&(disc_long['Disclosed']==1)).astype(int)
disc_long['Removed'] = ((disc_long['LstYrDisc']==1)&(disc_long['Disclosed']==0)).astype(int)
disc_long['Repeated'] = ((disc_long['LstYrDisc']==1)&(disc_long['Disclosed']==1)).astype(int)

# Whether the risk factor was newly disclosed in the previous row of the same (CIK, Topic_H)
disc_long['LstYrNew'] = disc_long.groupby(['CIK', 'Topic_H'])['New'].shift(1)

In [6]:
# Keep a row only when New + Repeated equals Disclosed: a disclosed RF must be classified as
# added or repeated (undisclosed rows always pass because both dummies are 0 for them)
disc_long = (
    disc_long
    .loc[lambda d: d[['New', 'Repeated']].sum(axis=1).eq(d['Disclosed'])]
    .reset_index(drop=True)
)

disc_long = disc_long.assign(
    # Number of days between fiscal period end (report_dt) and the filing date
    rfGap=lambda d: (d['filing_dt'] - d['report_dt']).dt.days,
    # Calendar years of the filing and of the reporting period
    fyear=lambda d: d["filing_dt"].dt.year,
    ryear=lambda d: d["report_dt"].dt.year,
)

In [7]:
# RF attributes aggregated to report x topic level:
# total length and specificity, mean sentiment, FOG and position (rf_seq)
agg_tops = (
    topics_df
    .groupby(["CIK", "report_dt", "filing_dt", "Topic_H"])[
        ['rf_length', 'Specificity', 'Sentiment', 'FOG', 'rf_seq']
    ]
    .agg({
        'rf_length': 'sum',
        'Specificity': 'sum',
        'Sentiment': 'mean',
        'FOG': 'mean',
        'rf_seq': 'mean',
    })
    .reset_index()
    .drop_duplicates(subset=["CIK", "filing_dt", "report_dt", "Topic_H"])
    .sort_values(["CIK", "filing_dt", "report_dt", "Topic_H"])
)

# Topic length in the firm's previous filing: lag over rows unique on (CIK, Topic_H, filing_dt),
# then forward-fill the rows that the de-duplication left without a value
agg_tops['rf_length_1'] = (
    agg_tops
    .drop_duplicates(subset=['CIK', 'Topic_H', 'filing_dt'])
    .groupby(["CIK", "Topic_H"])['rf_length']
    .shift(1)
)
agg_tops['rf_length_1'] = agg_tops.groupby(['CIK', 'Topic_H'])['rf_length_1'].ffill()

# Attach the topic attributes to the report x topic panel
disc_long = disc_long.merge(
    agg_tops[['CIK', 'report_dt', 'filing_dt', 'Topic_H', 'rf_length', 'Specificity',
              'Sentiment', 'FOG', 'rf_seq', 'rf_length_1']],
    on=['CIK', 'filing_dt', 'report_dt', 'Topic_H'],
    how='left',
)

# Topics without a match get 0 for these attributes (rf_seq is left as NaN)
disc_long = disc_long.fillna(
    {'rf_length': 0, 'rf_length_1': 0, 'Specificity': 0, 'Sentiment': 0, 'FOG': 0}
)

### Industry level

In [8]:
# 10-K filings list: unique (cik, periodOfReport) pairs, sorted
files = (
    pd.read_csv(
        filepath_or_buffer="Data/10Kurls.csv",
        usecols=['cik', 'periodOfReport'],
        parse_dates=['periodOfReport'],
    )
    .dropna(subset=['cik', 'periodOfReport'])
    .drop_duplicates()
    .sort_values(['cik', 'periodOfReport'])
)

# SIC code per cik and the SIC -> Fama-French industry mapping
SIC_df = (
    pd.read_csv("Data/SIC_df.csv", usecols=['cik', 'sic'])
    .drop_duplicates()
    .dropna()
    .astype(int)
    .rename(columns={'sic': "SIC"})
)
SIC_FF = pd.read_csv("Data/SIC_FF.csv")

# Add SIC, then the Fama-French industry; SIC codes without a mapping get FF = 49,
# and rows still containing missing values (e.g. no SIC) are dropped
filings_df = (
    files
    .merge(SIC_df, on=["cik"], how="left")
    .merge(SIC_FF, on="SIC", how='left')
    .fillna({"FF": 49})
    .dropna()
)

In [9]:
# Restrict filings to the reporting-period range covered by the disclosure data (bounds inclusive)
filings_df = filings_df.loc[
    lambda d: (d["periodOfReport"] >= disc_df['report_dt'].min())
    & (d["periodOfReport"] <= disc_df['report_dt'].max())
]
filings_df['ryear'] = filings_df["periodOfReport"].dt.year

# Number of distinct reporting firms per Fama-French industry and report year
# (the count ends up in a column named 'cik')
cnt_firms = filings_df.groupby(['FF', 'ryear'])['cik'].nunique().reset_index()

In [10]:
# Give the industry key the same integer dtype on both sides of the upcoming merge
disc_long = disc_long.astype({'FF': int})
cnt_firms = cnt_firms.astype({'FF': int})

In [11]:
# Number of reporting firms in the firm's industry and report year.
# The merged frame has the same row order and a fresh 0..n-1 index, so the column is
# assigned back by index alignment.
disc_long['#firms'] = disc_long.merge(cnt_firms, on=['FF', 'ryear'], how='left')['cik']

In [12]:
# Lower bound of each report's trailing 52-week window (its filing date minus 52 weeks);
# count_func / other_count_func select filings dated after this bound and up to the filing date
disc_df["filing_dt-1"] = disc_df["filing_dt"] - pd.Timedelta(weeks=52)

def count_func(x):
    """
    Share of same-industry filings that disclose each risk topic, excluding the firm itself.

    `x` is one row of disc_df. The peer set is every disc_df row of the same FF industry whose
    filing date lies in (x["filing_dt-1"], x["filing_dt"]]. For each topic column 0..N-1 the
    disclosure flags of peers with a different CIK are summed and divided by the number of
    filings in the peer set (the firm's own filings stay in the denominator).

    Returns a Series indexed by topic column.
    """
    after_start = disc_df["filing_dt"].gt(x["filing_dt-1"])
    up_to_filing = disc_df["filing_dt"].le(x["filing_dt"])
    same_industry = disc_df['FF'].eq(x['FF'])
    peers = disc_df[after_start & up_to_filing & same_industry]

    n_filings = peers["filing_dt"].count()
    disclosed_by_others = peers[peers['CIK'].ne(x['CIK'])].filter(range(N)).sum()

    return disclosed_by_others / n_filings

# Apply count_func to every row of disc_df (row-wise apply; each call scans disc_df again).
# Start from the identifier columns only, then add the topic columns 0..N-1 holding the
# industry disclosure shares returned by count_func.
IndDisc_df = disc_df.drop(columns=range(N)).copy()
IndDisc_df.loc[:, range(N)] = disc_df.apply(count_func, axis=1)
# The helper window-bound column is not needed in the output
IndDisc_df.drop(columns=['filing_dt-1'], inplace=True)

# Long format: one row per report x topic, value column 'IndDisc'
Inddisc_long = pd.melt(IndDisc_df, id_vars=["CIK", "filing_dt", "report_dt", "FF"], value_name='IndDisc')

In [13]:
def other_count_func(x):
    """
    Share of filings from other industries that disclose each risk topic.

    `x` is one row of disc_df. The comparison set is every disc_df row whose FF industry
    differs from x['FF'] and whose filing date lies in (x["filing_dt-1"], x["filing_dt"]].
    For each topic column 0..N-1 the disclosure flags are summed and divided by the number
    of filings in the comparison set.

    Returns a Series indexed by topic column.
    """
    after_start = disc_df["filing_dt"].gt(x["filing_dt-1"])
    up_to_filing = disc_df["filing_dt"].le(x["filing_dt"])
    other_industry = disc_df['FF'].ne(x['FF'])
    outsiders = disc_df[after_start & up_to_filing & other_industry]

    n_filings = outsiders["filing_dt"].count()
    disclosed = outsiders.filter(range(N)).sum()

    return disclosed / n_filings

# Apply other_count_func to every row of disc_df (row-wise apply; each call scans disc_df again).
# Start from the identifier columns only, then add the topic columns 0..N-1 holding the
# other-industry disclosure shares returned by other_count_func.
OthrIndDisc_df = disc_df.drop(columns=range(N)).copy()
OthrIndDisc_df.loc[:, range(N)] = disc_df.apply(other_count_func, axis=1)
# The helper window-bound column is not needed in the output
OthrIndDisc_df.drop(columns=['filing_dt-1'], inplace=True)

# Long format: one row per report x topic, value column 'OtherIndDisc'
OthrIndDisc_long = pd.melt(OthrIndDisc_df, id_vars=["CIK", "filing_dt", "report_dt", "FF"], value_name='OtherIndDisc')

In [14]:
# Keep only (industry, topic) pairs whose industry disclosure share sums to more than zero,
# i.e. drop topics never disclosed by peers in that industry
Inddisc_long = Inddisc_long.loc[
    lambda d: d.groupby(['FF', 'Topic_H'])['IndDisc'].transform('sum').gt(0)
]

In [15]:
# Topic ids come out of melt as column labels; cast them to int so they match disc_long['Topic_H']
OthrIndDisc_long = OthrIndDisc_long.astype({'Topic_H': int})
Inddisc_long = Inddisc_long.astype({'Topic_H': int})

In [16]:
# Industry disclosure share in percent. Only the value column of the merged frame is used,
# assigned back by index alignment.
disc_long['IndDisc'] = (
    disc_long
    .merge(Inddisc_long, on=["CIK", "filing_dt", "report_dt", "Topic_H"], how='left')['IndDisc']
    .mul(100)
)

# Other-industry disclosure share in percent
disc_long['OtherIndDisc'] = (
    disc_long
    .merge(OthrIndDisc_long, on=["CIK", "filing_dt", "report_dt", "Topic_H"], how='left')['OtherIndDisc']
    .mul(100)
)

In [17]:
# Drop rows without a previous-filing flag (a firm's first observation) and rows without an
# industry disclosure share
disc_long = disc_long.dropna(subset=['LstYrDisc', 'IndDisc'])

In [18]:
# Restore a contiguous 0..n-1 index after the row drop
disc_long = disc_long.reset_index(drop=True)

Industry specific

In [19]:
def ttest_func(x):
    """
    Welch two-sample t-test of industry vs. other-industry disclosure shares.

    Parameters
    ----------
    x : DataFrame
        Must contain the columns 'IndDisc' and 'OtherIndDisc'.

    Returns
    -------
    tuple
        (t statistic, p-value) of scipy.stats.ttest_ind with unequal variances.
    """
    # Run the test once and read both fields from the same result
    # (the test was previously computed twice, once per field)
    result = stats.ttest_ind(
        a=x['IndDisc'].values,
        b=x['OtherIndDisc'].values,
        equal_var=False,
    )

    return (result.statistic, result.pvalue)

In [20]:
# Mean industry / other-industry disclosure share per (industry, topic)
ttest_df = disc_long.groupby(['FF', 'Topic_H'])[['IndDisc', 'OtherIndDisc']].mean()

# One (tstat, pvalue) tuple per (industry, topic), in the same group order as ttest_df
ttest = disc_long.groupby(['FF', 'Topic_H'])[['IndDisc', 'OtherIndDisc']].apply(ttest_func)

ttest_df['tstat'] = [pair[0] for pair in ttest]
ttest_df['pvalue'] = [pair[1] for pair in ttest]

# Industry-specific topic: IndDisc mean above OtherIndDisc mean and p-value below 5%
ttest_df['IndSpecific'] = (
    ttest_df['pvalue'].lt(0.05) & ttest_df['IndDisc'].gt(ttest_df['OtherIndDisc'])
).astype(int)

In [21]:
# Attach the industry-specific flag of each (FF, Topic_H) pair to the panel.
# Only the key columns of disc_long and the flag column of ttest_df go into the merge:
# - the cell can be re-run (merging the full frames suffixes 'IndSpecific' once the column
#   exists on both sides, and the lookup then raises KeyError);
# - the large panel is not copied in full just to read one column.
# ttest_df is indexed by unique (FF, Topic_H) pairs, so the left merge keeps row count and
# order and the values can be assigned positionally.
disc_long['IndSpecific'] = pd.merge(
    left=disc_long[['FF', 'Topic_H']],
    right=ttest_df[['IndSpecific']],
    how='left',
    left_on=['FF', 'Topic_H'],
    right_index=True
)['IndSpecific'].values

In [22]:
# Industry disclosure bins per topic, labelled 1-4.
# pd.cut splits the observed value range into 4 equal-width bins; these are not
# quantile-based quartiles with equal counts.
disc_long.loc[:, 'Qcut'] = (
    disc_long
    .groupby('Topic_H')['IndDisc']
    .transform(lambda x: pd.cut(x, 4, labels=False))
    .astype(int)
    + 1
)

## DVs

### Volatility and BA Spread

In [27]:
# Daily price data. Forward slashes keep the path portable and avoid the invalid "\P"
# escape that the backslash form put into the string literal.
prices_df = pd.read_csv("Data/Prices2.csv", parse_dates=['datadate'])

# Align key names with the disclosure data; the observation date is named 'filing_dt'
# so it can be matched against filing dates later
prices_df.rename(columns={'cik': 'CIK', 'datadate': 'filing_dt'}, inplace=True)
prices_df = prices_df.sort_values(["CIK", "filing_dt"]).set_index('filing_dt')
prices_df.columns

Index(['CIK', 'VOLUME', 'TTLCMNSHARESOUT', 'CLOSEPRICE'], dtype='object')

In [29]:
# Daily share turnover: traded volume over shares outstanding
prices_df["SHRTURN"] = prices_df["VOLUME"].div(prices_df["TTLCMNSHARESOUT"])

# One-row percentage change of the close price within each firm, without filling gaps
prices_df["Return"] = prices_df.groupby("CIK")["CLOSEPRICE"].pct_change(periods=1, fill_method=None)

In [30]:
# Union of (CIK, date) pairs from the price data and from the filings, so every filing date
# has a row in the price panel even when no price was recorded on that date
CIKdates = (
    pd.concat(
        [prices_df.reset_index()[["CIK", "filing_dt"]], disc_df[["CIK", "filing_dt"]]],
        axis=0,
    )
    .drop_duplicates()
    .sort_values(["CIK", "filing_dt"])
)

# Re-index the price panel on that date grid; dates without prices carry NaN
prices_df = (
    CIKdates
    .merge(prices_df.reset_index(), on=["CIK", "filing_dt"], how='left')
    .set_index('filing_dt')
)

In [34]:
# Rolling-window length in rows for the short-horizon volatility measures.
# It has its own name so this cell does not overwrite the global N (number of topics),
# which count_func and other_count_func read whenever they are run.
VOL_WINDOW = 30

# Rolling std of daily returns per firm; the value at row t covers rows [t-VOL_WINDOW+1, t]
std_returns = (
    prices_df.groupby("CIK")["Return"]
    .rolling(VOL_WINDOW, min_periods=VOL_WINDOW//2).std().to_frame()
)
# Post-filing: the value VOL_WINDOW+2 rows ahead, i.e. rows [t+3, t+VOL_WINDOW+2]
std_returns[f"Volatility+{VOL_WINDOW}"] = std_returns.groupby("CIK")["Return"].shift(-VOL_WINDOW-2)
# Pre-filing: the value 2 rows back, i.e. rows [t-VOL_WINDOW-1, t-2]
std_returns[f"Volatility_{VOL_WINDOW}"] = std_returns.groupby("CIK")["Return"].shift(2)

# 65-row std read 60 rows ahead, i.e. rows [t-4, t+60]
std_returns["Volatility+60"] = prices_df.groupby("CIK")["Return"].rolling(65, min_periods=30).std().groupby("CIK").shift(-60)

# Trailing 120-row std of daily returns, rows [t-119, t]
std_returns["Volatility_120"] = prices_df.groupby("CIK")["Return"].rolling(120, min_periods=60).std()

# Only the derived measures are kept
std_returns.drop(columns="Return", inplace=True)

In [35]:
# Rolling-window length in rows for the turnover average; named separately so the global N
# (number of topics, read by count_func / other_count_func) is not overwritten here
TURNOVER_WINDOW = 10

# Rolling mean of daily share turnover per firm, read TURNOVER_WINDOW//2 rows back
# NOTE(review): the positive shift makes this the mean over rows [t-14, t-5]; confirm this is
# the intended "during event" window rather than one centred on the filing date.
MA_vol = prices_df.groupby("CIK")["SHRTURN"].rolling(TURNOVER_WINDOW, min_periods=6).mean().to_frame()
MA_vol["SHRTURN"] = MA_vol.groupby("CIK")["SHRTURN"].shift(TURNOVER_WINDOW//2)

# The shifts below are applied per firm. Shifting the concatenated series without grouping
# moved values across firm boundaries, so rows at the start/end of one firm's history
# received another firm's turnover.
# Post-filing 5-row mean, rows [t+3, t+7]
MA_vol["SHRTURN+5"] = (
    prices_df.groupby("CIK")["SHRTURN"].rolling(5, min_periods=3).mean().groupby("CIK").shift(-7)
)
# Pre-filing 5-row mean, rows [t-6, t-2]
MA_vol["SHRTURN_5"] = (
    prices_df.groupby("CIK")["SHRTURN"].rolling(5, min_periods=3).mean().groupby("CIK").shift(2)
)

In [None]:
# Daily bid/ask data with key names aligned to the disclosure data, restricted to
# observations dated after 2005-01-01
BidAsk_df = (
    pd.read_csv("Data/BidAsk2.csv", parse_dates=['date'])
    .rename(columns={"cik": "CIK", "date": "filing_dt"})
    .loc[lambda d: d["filing_dt"] > '2005-01-01']
)
BidAsk_df.columns

In [42]:
# Relative spread: (ASKHI - BIDLO) scaled by PRC
BidAsk_df = BidAsk_df.assign(
    BAspread=lambda d: (d["ASKHI"] - d['BIDLO']) / d["PRC"]
)

# Put the spread data on the CIKdates grid (rows without spread data are NaN), indexed by date
BidAsk_df = (
    CIKdates
    .merge(BidAsk_df, on=["CIK", "filing_dt"], how='left')
    .set_index('filing_dt')
)

In [43]:
# Rolling-window length in rows for the short-horizon spread measures; named separately so
# the global N (number of topics, read by count_func / other_count_func) is not overwritten
SPREAD_WINDOW = 30

# Rolling mean of the bid-ask spread per firm; the value at row t covers rows [t-SPREAD_WINDOW+1, t]
MA_BA = BidAsk_df.groupby('CIK')['BAspread'].rolling(SPREAD_WINDOW, min_periods=SPREAD_WINDOW//2).mean().to_frame()
# Post-filing: the value SPREAD_WINDOW+2 rows ahead, i.e. rows [t+3, t+SPREAD_WINDOW+2]
MA_BA[f"Spread+{SPREAD_WINDOW}"] = MA_BA.groupby("CIK")["BAspread"].shift(-SPREAD_WINDOW-2)
# Pre-filing: the value 2 rows back, i.e. rows [t-SPREAD_WINDOW-1, t-2]
MA_BA[f"Spread_{SPREAD_WINDOW}"] = MA_BA.groupby("CIK")["BAspread"].shift(2)

# 65-row mean read 60 rows ahead, i.e. rows [t-4, t+60]
MA_BA["Spread+60"] = BidAsk_df.groupby('CIK')['BAspread'].rolling(65, min_periods=30).mean().groupby("CIK").shift(-60)

# Trailing 120-row mean spread, rows [t-119, t]
MA_BA["Spread_120"] = BidAsk_df.groupby('CIK')['BAspread'].rolling(120, min_periods=60).mean()

# Only the derived measures are kept
MA_BA.drop(columns='BAspread', inplace=True)

In [54]:
# Beta / idiosyncratic-volatility file. Forward slashes keep the path portable and avoid
# the invalid "\W" escape that the backslash form put into the string literal.
IVol = pd.read_csv("Data/WRDS_Beta.csv", parse_dates=['DATE'])
IVol.columns

Index(['PERMNO', 'DATE', 'n', 'b_mkt', 'ivol', 'tvol', 'TICKER'], dtype='object')

In [55]:
# Force a garbage-collection pass before the next merges; the cell output is gc.collect()'s return value
gc.collect()

0

In [56]:
# PERMNO / ticker -> CIK link table (forward slashes: portable, no invalid "\C" escape)
CIK_PERMNO_TIC = pd.read_csv("Data/CIK_PERMNO_TIC.csv")

# Primary link: CIK via PERMNO
IVol = pd.merge(
    left=IVol,
    right=CIK_PERMNO_TIC[['PERMNO', 'cik']],
    on='PERMNO',
    how='left'
)

# Fallback link: CIK via ticker. The result is assigned back by index, which is only valid
# when the merge keeps exactly one output row per IVol row. The link table is therefore
# reduced to one row per ticker first (first CIK wins); a ticker listed more than once
# would otherwise expand the merge and shift the fallback values onto the wrong rows.
IVol['cik2'] = pd.merge(
    left=IVol,
    right=CIK_PERMNO_TIC[['tic', 'cik']].drop_duplicates(subset='tic'),
    left_on='TICKER',
    right_on='tic',
    how='left'
)['cik_y']

# Use the ticker-based CIK only where the PERMNO-based one is missing
IVol.fillna({'cik': IVol['cik2']}, inplace=True)

In [57]:
# Drop the helper column and unidentified rows, cast the CIK to int, align key names with
# the disclosure data and order by firm and date
IVol = (
    IVol
    .drop(columns=['cik2'])
    .dropna(subset=['cik', 'DATE'])
    .astype({'cik': int})
    .rename(columns={"cik": "CIK", "DATE": "filing_dt"})
    .sort_values(by=["CIK", "filing_dt"])
    .reset_index(drop=True)
)

In [59]:
IVol = IVol.assign(
    # 'ivol' is stored as text with a percent sign; strip it and convert to float
    ivol=lambda d: d['ivol'].str.replace('%', '').astype(float),
    # Forward-looking value: the firm's ivol 60 rows ahead
    Fwrd_ivol=lambda d: d.groupby("CIK")["ivol"].shift(-60),
)

In [60]:
# Union of (CIK, date) pairs from the IVol data and from the filings.
# Note: this rebinds CIKdates, which so far held the price-based date grid.
CIKdates = (
    pd.concat(
        [IVol[["CIK", "filing_dt"]], disc_df[["CIK", "filing_dt"]]],
        axis=0,
    )
    .drop_duplicates()
    .sort_values(["CIK", "filing_dt"])
)

# Put IVol on that grid, then carry the last known values forward for at most 3 rows per firm
IVol = CIKdates.merge(IVol, on=["CIK", "filing_dt"], how='left')
IVol[['b_mkt', 'ivol', 'Fwrd_ivol']] = (
    IVol.groupby('CIK')[['b_mkt', 'ivol', 'Fwrd_ivol']].ffill(limit=3)
)

# Identifier / helper columns are no longer needed
IVol = IVol.drop(columns=['PERMNO', 'n', 'TICKER'])

### Earnings Estimates

In [None]:
# Analyst EPS forecasts and realised EPS.
# Forward slashes keep the paths portable and avoid the invalid "\E" escapes that the
# backslash form put into the string literals.
EPSEst = pd.read_csv("Data/EIKON_EPSforecast.csv", parse_dates=['TR.EPSESTVALUE().DATE', 'TR.EPSESTVALUE().periodenddate'])
EPSAct = pd.read_csv("Data/EIKON_EPSActual.csv", parse_dates=['Date', 'Report Date', 'Period End Date'])

# Give the forecast columns readable names that line up with the actuals file
EPSEst.rename(columns={'TR.EPSESTVALUE().DATE': 'Date', 
               'TR.EPSESTVALUE().periodenddate': 'Period End Date',
               'TR.EPSESTVALUE()': 'Earnings Per Share - Broker Estimate',
               'TR.EPSESTVALUE().analystcode': 'Analyst Code'}, inplace=True)

In [72]:
# Attach the realised EPS of the same instrument and fiscal period to every forecast
EPSEst = EPSEst.merge(
    EPSAct.drop(columns=['Date']),
    on=['Instrument', 'Period End Date'],
    how='left',
)

# Signed forecast error: actual minus broker estimate (no absolute value is taken here)
EPSEst["Act-Est"] = EPSEst['Earnings Per Share - Actual'].sub(
    EPSEst['Earnings Per Share - Broker Estimate']
)

EPSEst = (
    EPSEst
    .sort_values(['Instrument', 'Period End Date', 'Date'])
    .reset_index(drop=True)
)

In [73]:
# Previous fiscal period end per instrument: lag over the distinct (Instrument, Period End Date)
# pairs, in the sort order EPSEst already has
FYEnd = (
    EPSEst[['Instrument', 'Period End Date']]
    .drop_duplicates()
    .reset_index(drop=True)
)
FYEnd = FYEnd.assign(**{
    'FYEnd-1': FYEnd.groupby(['Instrument'])['Period End Date'].shift(1)
})

# Forecasts for an instrument's first period have no previous period end and are dropped
EPSEst = (
    EPSEst
    .merge(FYEnd, on=['Instrument', 'Period End Date'], how='left')
    .dropna(subset=['FYEnd-1'])
)

# Calendar year of the previous fiscal period end
EPSEst["FY-1"] = EPSEst['FYEnd-1'].dt.year

In [74]:
# One filing per firm and report year: 'FY-1' is the calendar year of report_dt,
# and the first row per (CIK, FY-1) is kept
disc_dts = (
    disc_df[["CIK", "filing_dt", "report_dt"]]
    .assign(**{'FY-1': lambda d: d['report_dt'].dt.year})
    .drop_duplicates(subset=['CIK', 'FY-1'])
)

# Match each forecast to the filing whose report year equals the forecast's previous fiscal
# year; forecasts without such a filing are dropped
EPSEst = (
    EPSEst
    .merge(
        disc_dts,
        left_on=['Instrument', 'FY-1'],
        right_on=["CIK", 'FY-1'],
        how='left',
    )
    .dropna(subset=['CIK'])
    .astype({'CIK': int})
)

In [75]:
# Flag estimates issued from one week before the filing date up to (not including) 90 days
# after it.
# The estimate date is truncated to its calendar day and converted back to datetime64 before
# comparing: a Series of datetime.date objects cannot be ordered against datetime64 values
# on pandas >= 2.0 (TypeError), while day-level datetime64 values compare the same way the
# date objects previously did.
EPSEst['After'] = (
    (pd.to_datetime(EPSEst['Date'].dt.date) >= EPSEst['filing_dt']-pd.Timedelta(weeks=1))
    & (pd.to_datetime(EPSEst['Date'].dt.date) < EPSEst['filing_dt']+pd.Timedelta(days=90))
).astype(int)

In [None]:

# Per firm and fiscal year, over estimates in the "After" window:
# forecast dispersion (std of broker estimates), mean forecast error and mean actual EPS
EPSEst_After = (
    EPSEst[EPSEst['After']==1]
    .groupby(['CIK', 'FY-1'])[['Earnings Per Share - Broker Estimate', 'Earnings Per Share - Actual', 'Act-Est']]
    .agg({'Earnings Per Share - Broker Estimate': 'std', 'Act-Est': 'mean', 'Earnings Per Share - Actual': 'mean'})
)
# Number of distinct analysts behind those estimates (aligned on the (CIK, FY-1) index)
EPSEst_After['nEsts'] = EPSEst[EPSEst['After']==1].groupby(['CIK', 'FY-1'])['Analyst Code'].nunique()
EPSEst_After.rename(columns={'Earnings Per Share - Broker Estimate': 'EPSEst'}, inplace=True)

EPSEst_After.reset_index(inplace=True)

# Flag estimates issued from 90 days before the filing date up to (not including) one week
# before it.
# As for the "After" flag, the estimate date is truncated to its calendar day and converted
# back to datetime64: ordering datetime.date objects against datetime64 values raises
# TypeError on pandas >= 2.0.
EPSEst['Before'] = (
    (pd.to_datetime(EPSEst['Date'].dt.date) < EPSEst['filing_dt']-pd.Timedelta(weeks=1))
    & (pd.to_datetime(EPSEst['Date'].dt.date) >= EPSEst['filing_dt']-pd.Timedelta(days=90))
).astype(int)


# Same dispersion and mean-error measures over estimates in the "Before" window
EPSEst_Before = (
    EPSEst[EPSEst['Before']==1]
    .groupby(['CIK', 'FY-1'])[['Earnings Per Share - Broker Estimate', 'Act-Est']]
    .agg({'Earnings Per Share - Broker Estimate': 'std', 'Act-Est': 'mean'})
)

EPSEst_Before.rename(columns={'Earnings Per Share - Broker Estimate': 'EPSEst'}, inplace=True)

EPSEst_Before.reset_index(inplace=True)

In [78]:
# Combine the post- and pre-filing measures per (CIK, FY-1); firm-years present in only one
# window are kept, and pre-filing columns get the '_before' suffix
EPSEst_After = EPSEst_After.merge(
    EPSEst_Before,
    on=['CIK', 'FY-1'],
    how='outer',
    suffixes=['', '_before'],
)

## Control variables

In [80]:
# Daily beta / abnormal-return file. Forward slashes keep the path portable and avoid the
# invalid "\B" escape that the backslash form put into the string literal.
Beta = pd.read_csv("Data/Beta_AR.csv", parse_dates=['datadate'])
Beta.drop_duplicates(inplace=True)
Beta.columns

Index(['datadate', 'CIK', 'Beta_63', 'Beta_126', 'Beta_252', 'CAR_2', 'CAR_5',
       'CAR_10'],
      dtype='object')

In [82]:
# Align key names, put Beta on the CIKdates grid and index it by (CIK, filing_dt).
# NOTE(review): CIKdates is whatever grid was built most recently (the IVol-based one if the
# cells ran top to bottom), not the price-based grid - confirm this is intended.
Beta = (
    CIKdates
    .merge(
        Beta.rename(columns={"Instrument": "CIK", "datadate": "filing_dt"}),
        on=["CIK", "filing_dt"],
        how='left',
    )
    .set_index(['CIK', 'filing_dt'])
)

In [83]:
# Forward-looking betas: the firm's Beta_126 read 120 rows ahead and Beta_63 read 60 rows ahead
Beta = Beta.assign(**{
    "BETA+126": Beta.groupby('CIK')["Beta_126"].shift(-120),
    "BETA+63": Beta.groupby('CIK')["Beta_63"].shift(-60),
})

Analysts

In [90]:
# Analyst coverage, with the instrument id cast to int so it can be matched to CIK
Analysts = (
    pd.read_csv("Data/Analysts.csv", parse_dates=['Date'])
    .astype({'Instrument': int})
)

# Outer-join coverage onto the (date, CIK) grid of the price panel; the right-hand key columns
# are dropped and rows are ordered by firm, date and analyst count
Analyst_df = (
    prices_df.reset_index()[["filing_dt", "CIK"]]
    .merge(
        Analysts,
        left_on=["filing_dt", "CIK"],
        right_on=["Date", "Instrument"],
        how="outer",
    )
    .drop(columns=["Date", "Instrument"])
    .sort_values(['CIK', 'filing_dt', 'NUMBEROFANALYSTS'])
)



In [91]:
# Carry the last known analyst count forward within each firm
Analyst_df["NUMBEROFANALYSTS"] = Analyst_df.groupby("CIK")["NUMBEROFANALYSTS"].ffill()

# Keep the last row per (date, firm) and drop rows that are entirely empty
Analyst_df = (
    Analyst_df
    .drop_duplicates(subset=["filing_dt", "CIK"], keep='last')
    .dropna(how='all')
)

Fama-French industry portfolio volatility

In [None]:
# Daily Fama-French industry portfolio returns (in %), indexed by date, one column per industry
FF_rtrn = pd.read_csv(
    "FF_Industry_Portfolios_daily/IndustryDaily.csv",
    parse_dates=["Date"],
    index_col=['Date'],
)

# Trailing 126-row std of industry returns, reshaped to one row per (industry, date).
# Values are in % already - do not multiply by 100 again in Stata.
FF_vol = (
    FF_rtrn
    .rolling(126).std()
    .dropna()
    .unstack()
    .reset_index()
    .rename(columns={"level_0": "FF", "level_1": "Date", 0: "IndVol_"})
    .astype({"FF": int})
)

# Forward-looking industry volatility: the value 126 rows ahead within the industry
FF_vol["IndVol+"] = FF_vol.groupby("FF")['IndVol_'].shift(-126)

Free float

In [94]:
# Free-float data without incomplete rows; the three columns are renamed by position
FreeFloat = pd.read_csv("Data/FreeFloat.csv").dropna()
FreeFloat.columns = ['CIK', 'filing_dt', 'FREEFLOAT']

# Parse the dates (unparseable values become NaT) and strip any timezone information
FreeFloat['filing_dt'] = (
    pd.to_datetime(FreeFloat['filing_dt'], errors='coerce').dt.tz_localize(None)
)

FreeFloat = (
    FreeFloat
    .astype({'CIK': int})
    .sort_values(by=['CIK', 'filing_dt'])
)

In [96]:
# Union of (CIK, date) pairs from the free-float data and from the filings.
# Note: this rebinds CIKdates again.
CIKdates = (
    pd.concat(
        [FreeFloat[["CIK", "filing_dt"]], disc_df[["CIK", "filing_dt"]]],
        axis=0,
    )
    .drop_duplicates()
    .sort_values(["CIK", "filing_dt"])
)

# Put free float on that grid and carry the last known value forward for at most 3 rows per firm
FreeFloat = CIKdates.merge(FreeFloat, on=["CIK", "filing_dt"], how='left')
FreeFloat['FREEFLOAT'] = FreeFloat.groupby('CIK')['FREEFLOAT'].ffill(limit=3)

Financial data

In [97]:
# Annual financial statement data, exact duplicate rows removed. Forward slashes keep the
# path portable and avoid the invalid "\F" escape of the backslash form.
financials = pd.read_csv("Data/Financials3.csv", parse_dates=['datadate']).drop_duplicates()

In [98]:
# All ratios replace a zero denominator with NaN so the result is NaN rather than +/-inf.

# Leverage
financials["DtA"] = financials["dt"] / financials["at"].replace(0, np.nan)

# Profitability
financials["ROE"] = financials["ni"] / financials["seq"].replace(0, np.nan)
financials["NPM"] = financials["ni"] / financials["revt"].replace(0, np.nan) # net profit margin
financials["ROA"] = financials["ni"] / financials["at"].replace(0, np.nan) 

# Firm size
financials["logMC"] = np.log(financials["mkvalt"].replace(0, np.nan))
financials["logTA"] = np.log(financials["at"].replace(0, np.nan))

# Intangible assets
financials["INTtA"] = financials["intan"] / financials["at"].replace(0, np.nan) 

# Liquidity
financials["Current"] = financials["act"] / financials["lct"].replace(0, np.nan)

# Other
financials["TobinQ"] = financials["mkvalt"] / financials["at"].replace(0, np.nan)
financials["BtM"] = financials["seq"] / financials["mkvalt"].replace(0, np.nan)

# R&D intensity (missing R&D expense is treated as zero)
financials["RDxopr"] = financials["xrd"].fillna(0) / financials["xopr"].replace(0, np.nan)
# R&D over lagged total assets. The lag is taken on a copy ordered by (cik, datadate) so that
# "previous" means the firm's earlier statement regardless of the row order of the file;
# the result is aligned back by index, so the order of `financials` itself is unchanged.
financials["ProprietaryCost"] = (
    financials["xrd"].fillna(0)
    / financials.sort_values(["cik", "datadate"]).groupby('cik')["at"].shift(1).replace(0, np.nan)
)

In [99]:
# Report year and month of each statement
financials = financials.assign(
    ryear=lambda d: d['datadate'].dt.year,
    rmonth=lambda d: d['datadate'].dt.month,
)

# Number of missing fields per row, used to pick the most complete duplicate
financials["NAs"] = financials.isna().sum(axis=1)

# One row per (cik, ryear, rmonth): the one with the fewest missing fields
financials = (
    financials
    .sort_values(["cik", "ryear", "rmonth", 'NAs'])
    .drop_duplicates(subset=["cik", "ryear", "rmonth"], keep='first')
)

In [100]:
# Display the available columns (raw fields plus the derived ratios and helper columns)
financials.columns

Index(['cik', 'datadate', 'act', 'at', 'dt', 'ebit', 'ebitda', 'intan', 'lct',
       'lt', 'ni', 'revt', 'seq', 'teq', 'xopr', 'xrd', 'xt', 'naicsh', 'sich',
       'mkvalt', 'naics', 'sic', 'DtA', 'ROE', 'NPM', 'ROA', 'logMC', 'logTA',
       'INTtA', 'Current', 'TobinQ', 'BtM', 'RDxopr', 'ProprietaryCost',
       'ryear', 'rmonth', 'NAs'],
      dtype='object')

Shares held

In [101]:
# Ownership data. Forward slashes keep the path portable and avoid the invalid "\E"
# escape that the backslash form put into the string literal.
Owner = pd.read_csv("Data/EIKON_Ownership.csv", parse_dates=['Date'])

# Owner categories whose holdings are summed into institutional ownership
Invetors = [
    'Bank and Trust', 'Corporation', 'Hedge Fund', 'Insurance Company',
    'Investment Advisor/Hedge Fund', 'Pension Fund', 'Research Firm',
    'Sovereign Wealth Fund', 'Venture Capital', 'Foundation',
    'Endowment Fund', 'Holding Company', 'Independent Research Firm',
    'Private Equity', 'Mutual Fund', 'Institution', 'Hedge Fund Portfolio',
    'Government Agency', 'Exchange-Traded Fund', 'Brokerage Firms']

# Total percentage of traded shares held by those categories, per instrument and date
Owner = Owner[Owner['Category Value'].isin(Invetors)].groupby(['Instrument', 'Date'])['Percent Of Traded Share'].sum().reset_index()

Owner.rename(columns={'Percent Of Traded Share': 'InstOwnership'}, inplace=True)

Internal control

In [103]:
# Internal-control data ordered by company and filing date, with the calendar years of the
# FYE_IC_OP date and of the filing date
ICW = (
    pd.read_csv("Data/ICW.csv", parse_dates=['FYE_IC_OP', 'FILE_DATE'])
    .sort_values(['COMPANY_FKEY', 'FILE_DATE'])
    .assign(
        OPyr=lambda d: d['FYE_IC_OP'].dt.year,
        fyear=lambda d: d['FILE_DATE'].dt.year,
    )
)

In [104]:
# Big-4 auditor dummy from the auditor name (case-insensitive match).
# na=False treats a missing auditor name as "not Big 4"; without it str.contains returns
# NaN for those rows and the cast to int fails.
ICW['Big4'] = ICW['OP_AUD_NAME'].str.contains(r'Deloitte|KPMG|Ernst|Pricewaterhouse', case=False, na=False).astype(int)

# Maximum weakness count and Big-4 flag per company and filing date ...
ICW_gr = ICW.groupby(['COMPANY_FKEY', 'FILE_DATE'])[['COUNT_WEAK', 'Big4']].max().reset_index()
# ... and per company and filing year (used as a fallback match)
ICW_gr2 = ICW.groupby(['COMPANY_FKEY', 'fyear'])[['COUNT_WEAK', 'Big4']].max().reset_index()

In [105]:
# Exact match of internal-control data on company and filing date
disc_long = (
    disc_long
    .merge(
        ICW_gr,
        left_on=['CIK', 'filing_dt'],
        right_on=['COMPANY_FKEY', 'FILE_DATE'],
        how="left",
    )
    .drop(columns=['COMPANY_FKEY', 'FILE_DATE'])
)

In [106]:
# Fallback match of internal-control data on company and filing year.
# disc_long already has COUNT_WEAK / Big4 from the exact-date match, so the merge suffixes the
# overlapping columns; the '_y' versions are the year-level values from ICW_gr2. They are
# assigned to the two new columns by position (first to first, second to second) and by index.
disc_long[['COUNT_WEAK_2', 'Big4_2']] = pd.merge(
    left=disc_long,
    right=ICW_gr2,
    left_on=['CIK', 'fyear'],
    right_on=['COMPANY_FKEY', 'fyear'],
    how="left"
)[['COUNT_WEAK_y', 'Big4_y']]

In [107]:
# Use the year-level values only where the exact-date match found nothing
disc_long['COUNT_WEAK'] = disc_long['COUNT_WEAK'].fillna(disc_long['COUNT_WEAK_2'])
disc_long['Big4'] = disc_long['Big4'].fillna(disc_long['Big4_2'])

# The helper columns are no longer needed
disc_long = disc_long.drop(columns=['COUNT_WEAK_2', 'Big4_2'])

## Merge data

In [109]:
# Start the final data set: attach the analyst-forecast measures by firm and report year
Study1_data = (
    disc_long
    .merge(
        EPSEst_After,
        left_on=["CIK", "ryear"],
        right_on=["CIK", 'FY-1'],
        how="left",
    )
    .drop(columns=['FY-1'])
)

In [110]:
# Attach beta / idiosyncratic-volatility values observed on the filing date
Study1_data = Study1_data.merge(IVol, on=["CIK", "filing_dt"], how="left")

In [120]:
# Total risk-factor length per report, stored as '1A_len'
rprt_length = (
    topics_df
    .groupby(["CIK", "report_dt", "filing_dt"])['rf_length']
    .sum()
    .reset_index()
    .drop_duplicates(subset=["CIK", "filing_dt", "report_dt"])
    .rename(columns={'rf_length': '1A_len'})
)

Study1_data = Study1_data.merge(
    rprt_length,
    on=["CIK", "filing_dt", "report_dt"],
    how='left',
)

In [121]:
# Attach volatility, bid-ask spread and turnover measures. Each right-hand frame is indexed
# by (CIK, filing_dt), so the panel's key columns are matched against that index.
Study1_data = (
    Study1_data
    .merge(std_returns, left_on=["CIK", "filing_dt"], right_index=True, how="left")
    .merge(MA_BA, left_on=["CIK", "filing_dt"], right_index=True, how="left")
    .merge(MA_vol, left_on=["CIK", "filing_dt"], right_index=True, how="left")
)

In [123]:
# Attach beta / abnormal-return controls and analyst coverage on the filing date;
# firms without coverage data get 0 analysts
Study1_data = (
    Study1_data
    .merge(
        Beta.reset_index()[[
            'CIK', 'filing_dt',
            'BETA+126', 'Beta_126',
            'CAR_5', 'CAR_2',
        ]],
        on=["CIK", "filing_dt"],
        how="left",
    )
    .merge(
        Analyst_df[["CIK", "filing_dt", 'NUMBEROFANALYSTS']],
        on=["CIK", "filing_dt"],
        how="left",
    )
    .fillna({"NUMBEROFANALYSTS": 0})
)

In [124]:
# Industry portfolio volatility on the filing date
Study1_data = (
    Study1_data
    .merge(FF_vol, left_on=["FF", "filing_dt"], right_on=["FF", "Date"], how="left")
    .drop(columns="Date")
)

# Institutional ownership is matched on filing year and month
Study1_data["fmonth"] = Study1_data['filing_dt'].dt.month

Owner = Owner.assign(
    fyear=lambda d: d['Date'].dt.year,
    fmonth=lambda d: d['Date'].dt.month,
)

Study1_data = (
    Study1_data
    .merge(
        Owner,
        left_on=["CIK", "fyear", "fmonth"],
        right_on=["Instrument", "fyear", "fmonth"],
        how="left",
    )
    .drop(columns=['Instrument', 'Date', 'fmonth'])
)

# Fill a missing ownership value from the firm's next row, at most one row back
Study1_data['InstOwnership'] = Study1_data.groupby('CIK')['InstOwnership'].bfill(limit=1)

# Free float on the filing date
Study1_data = Study1_data.merge(FreeFloat, on=["CIK", "filing_dt"], how="left")

In [125]:
# Report year and month, used as keys for the financial-statement match
Study1_data["ryear"] = Study1_data['report_dt'].dt.year
Study1_data["rmonth"] = Study1_data['report_dt'].dt.month

# Financial variables carried into the final data set
fin_cols = ['DtA', 'ROE', 'NPM', 'mkvalt', 'logMC', 'at', 'logTA', 'INTtA', 
            'Current', 'TobinQ', 'BtM', 'RDxopr', 'ProprietaryCost', 'ROA']

# First pass: exact match on firm, report year and report month
Study1_data = pd.merge(
    left=Study1_data,
    right=financials[['cik', 'ryear', 'rmonth', *fin_cols]],
    left_on=["CIK", "ryear", "rmonth"],
    right_on=["cik", "ryear", "rmonth"],
    how="left"
)

# Second pass: reduce financials to one row per (cik, ryear), keeping the row with the fewest
# missing fields. This modifies `financials` in place, so the cell is not re-runnable as is.
financials.sort_values(["cik", "ryear", 'NAs'], inplace=True)
financials.drop_duplicates(subset=["cik", "ryear"], keep='first', inplace=True)

# Year-level match; columns that already exist in Study1_data come out with the '_2' suffix
df = pd.merge(
    left=Study1_data,
    right=financials,
    left_on=["CIK", "ryear"],
    right_on=["cik", "ryear"],
    how="left",
    suffixes=['', '_2']
)

# Fill values still missing after the month-level match with the year-level ones
# (aligned by index between Study1_data and df)
Study1_data.fillna(dict([(col, df[f"{col}_2"]) for col in fin_cols]), inplace=True)

In [126]:
# Display (rows, columns) of the final data set
Study1_data.shape

(7946420, 76)

In [127]:
# Display the column names of the final data set
Study1_data.columns

Index(['CIK', 'filing_dt', 'report_dt', 'FF', 'Topic_H', 'Disclosed',
       'DiscSum', 'TotalRFs', 'LstYrDisc', 'New', 'Removed', 'Repeated',
       'LstYrNew', 'rfGap', 'fyear', 'ryear', 'rf_length', 'Specificity',
       'Sentiment', 'FOG', 'rf_seq', 'rf_length_1', '#firms', 'IndDisc',
       'OtherIndDisc', 'IndSpecific', 'Qcut', 'COUNT_WEAK', 'Big4', 'EPSEst',
       'Act-Est', 'Earnings Per Share - Actual', 'nEsts', 'EPSEst_before',
       'Act-Est_before', 'b_mkt', 'ivol', 'tvol', 'Fwrd_ivol', '1A_len',
       'Volatility+30', 'Volatility_30', 'Volatility+60', 'Volatility_120',
       'Spread+30', 'Spread_30', 'Spread+60', 'Spread_120', 'SHRTURN',
       'SHRTURN+5', 'SHRTURN_5', 'BETA+126', 'Beta_126', 'CAR_5', 'CAR_2',
       'NUMBEROFANALYSTS', 'IndVol_', 'IndVol+', 'InstOwnership', 'FREEFLOAT',
       'rmonth', 'cik', 'DtA', 'ROE', 'NPM', 'mkvalt', 'logMC', 'at', 'logTA',
       'INTtA', 'Current', 'TobinQ', 'BtM', 'RDxopr', 'ProprietaryCost',
       'ROA'],
      dtype='object')

In [130]:
# Write the final data set without the index column. Forward slashes keep the path portable
# and avoid the invalid "\s" escape that the backslash form put into the string literal.
Study1_data.to_csv('Data/study1_data2_V5.csv', index=False)