In [1]:
# Import libraries and functions
import pandas as pd
import numpy as np
import re

# import matplotlib.pyplot as plt
# %matplotlib inline
# plt.style.use('seaborn')
# plt.rc('figure', autolayout=True)

In [2]:
# Load the cleaned documents sample.
# A forward slash is used as the path separator: it works on every platform
# and avoids the invalid "\c" escape sequence of the Windows-style literal.
RF_df = pd.read_csv("Data/clean_docs_3.csv")

In [3]:
# Show (n_rows, n_columns) of the loaded sample as a quick sanity check.
RF_df.shape

(1245475, 8)

# Top2Vec model

In [4]:
# Load the Top2Vec topics table.
# - The path uses a forward slash (portable; no invalid "\T" escape sequence).
# - The CSV column "index" becomes the row index and the 'Docs' column is dropped.
# - The two date columns are renamed to lower-case names.
t2v_df = (
    pd.read_csv("Top2Vec/T2V_df_H95.csv")
    .set_index("index")
    .drop(columns=['Docs'])
    .rename(columns={'Report_dt': 'report_dt', 'Filing_dt': 'filing_dt'})
)

In [5]:
# Put the topic assignments next to the documents (column-wise concat, aligned
# on the row index of both frames).
topic_cols = ['Topic', 'Score', 'Topic_H', 'Score_H']
topics_df = pd.concat([RF_df, t2v_df[topic_cols]], axis=1)

# Parse both date columns into datetimes.
for date_col in ("filing_dt", "report_dt"):
    topics_df[date_col] = pd.to_datetime(topics_df[date_col])

# Length of each cleaned text, counted in whitespace-separated tokens.
topics_df['rprt_length'] = topics_df['cleaned_txt'].map(lambda txt: len(txt.split()))

# Stats

In [6]:
# # Removing observations with duplicated filing_dt and ryear
# sample = (
#     topics_df[["CIK", "filing_dt", "report_dt"]]
#     .drop_duplicates()
#     .sort_values(["CIK", "filing_dt", "report_dt"])
# )

# sample.drop_duplicates(["CIK", "filing_dt"], keep='last', inplace=True)

# sample['ryear'] = sample['report_dt'].dt.year
# sample.drop_duplicates(["CIK", "ryear"], keep='first', inplace=True)

# sample.drop(columns='ryear', inplace=True)

# topics_df = (
#     topics_df.set_index(["CIK", "filing_dt", "report_dt"])
#     .loc[sample.apply(lambda x: (x[0], x[1], x[2]), axis=1).values]
#     .reset_index()
# )

In [7]:
# Use historical SIC data (sich) for industry analysis.
# Only the three needed columns are read; exact duplicate rows are removed.
sich = pd.read_csv(
    filepath_or_buffer="Data/Financials.csv",  # forward slash: portable, no invalid "\F" escape
    decimal=".",
    thousands=',',
    usecols=['cik', 'datadate', 'sich'],
).drop_duplicates()

# Parse the dates before sorting so the order is chronological rather than a
# lexicographic ordering of "dd/mm/YYYY" strings.
sich["datadate"] = pd.to_datetime(sich["datadate"], format="%d/%m/%Y")
sich = sich.sort_values(['cik', 'datadate'])

# Attach the historical code whose (cik, datadate) matches the document's
# (CIK, report_dt); documents without a match keep NaN (left join).
topics_df = pd.merge(
    left=topics_df,
    right=sich[['cik', 'datadate', 'sich']],
    left_on=["CIK", "report_dt"],
    right_on=["cik", "datadate"],
    how="left"
).drop(columns=['datadate', 'cik'])

# Fall back to the 'SIC' column where no historical code was matched.
topics_df['sich'] = topics_df['sich'].fillna(topics_df['SIC']).astype(int)

# First 2 digits of the zero-padded 4-digit code -> major sector group.
# Fix: the group is derived from 'sich'. The original mapped 'SIC' here, which
# overwrote the historical codes merged and filled above.
topics_df['sich'] = topics_df['sich'].map(lambda code: f"{int(code):04d}"[:2])

In [8]:
# Collapse the document-level rows to one row per (CIK, report_dt, filing_dt):
#   - topics are collected into sets,
#   - text lengths are summed,
#   - classification columns keep their distinct values (arrays).
report_keys = ["CIK", "report_dt", "filing_dt"]
agg_spec = {
    'Topic': lambda values: set(values),
    'Topic_H': lambda values: set(values),
    'rprt_length': 'sum',
    'SIC': 'unique',
    'sich': 'unique',
    'Industry': 'unique',
    'category': 'unique',
}

agg_tops = (
    topics_df.groupby(report_keys)[list(agg_spec)]
    .agg(agg_spec)
    .reset_index()
    .drop_duplicates(subset=["CIK", "filing_dt", "report_dt"])
    .sort_values(report_keys)
)


In [9]:
# The 'unique' aggregation produced arrays; keep the first element of each.
for code_col in ("SIC", "sich"):
    agg_tops[code_col] = agg_tops[code_col].map(lambda codes: codes[0])

## Variables for H1

In [10]:
# Topic set of the previous row of the same CIK (relies on the current row
# order of agg_tops), so each report can be compared with the one before it.
agg_tops["shifted"] = agg_tops.groupby("CIK")['Topic'].shift(1)

# Drop rows with any missing value, e.g. the first row of every CIK, which has
# no previous topic set.
agg_tops = agg_tops.dropna()

# Repeated, added and removed risk factors via set algebra:
#   repeated = current AND previous, added = current - previous,
#   removed = previous - current.
topic_pairs = list(zip(agg_tops['Topic'], agg_tops['shifted']))
agg_tops["repeated"] = [current & previous for current, previous in topic_pairs]
agg_tops["added"] = [current - previous for current, previous in topic_pairs]
agg_tops["removed"] = [previous - current for current, previous in topic_pairs]

In [11]:
# Start the statistics table from the identifying/descriptive columns.
base_cols = ['CIK', 'filing_dt', 'report_dt', 'rprt_length', 'SIC', 'Industry', 'category']
stat_data = agg_tops[base_cols].copy()

# For individual topics: store the size of every topic set.
# Mapping is {output column: set-valued column in agg_tops}.
set_sizes = {
    "reported_crnt": 'Topic',
    "reported_last": 'shifted',
    "repeated": "repeated",
    "added": "added",
    "removed": "removed",
}
for out_col, set_col in set_sizes.items():
    stat_data[out_col] = agg_tops[set_col].map(len)

In [12]:
# Days between the report date and the filing date, plus the calendar year of
# each of the two dates.
filing_dates = stat_data['filing_dt']
report_dates = stat_data['report_dt']

stat_data = stat_data.assign(
    rfGap=(filing_dates - report_dates).dt.days,
    fyear=filing_dates.dt.year,
    ryear=report_dates.dt.year,
)

## Variables for H2

In [50]:
# Risk topics disclosed (1) and not disclosed (0) per report.
# One row per (CIK, filing_dt, report_dt, sich), one column per Topic_H value;
# a cell is 1 when the pivot holds a Score_H for that report/topic pair.
report_index = ["CIK", "filing_dt", "report_dt", "sich"]
score_pivot = pd.pivot_table(
    topics_df, index=report_index, columns='Topic_H', values='Score_H'
)

disc_df = (
    score_pivot.notna()
    .astype(int)
    .reset_index()
    .sort_values(['CIK', 'report_dt', 'filing_dt'])
)

In [19]:
# Topic indicator columns (labels 0..99) and the same indicators taken from
# the previous row of the same CIK.
topic_flags = disc_df.filter(range(0,100))
previous_flags = disc_df.groupby("CIK")[topic_flags.columns].shift(1)

# Change versus the previous row: +1 newly disclosed, -1 no longer disclosed,
# 0 unchanged (NaN where a CIK has no previous row).
disc_diff = topic_flags - previous_flags

# 1 where the topic is flagged in both the current and the previous row.
disc_repeat = ((topic_flags + previous_flags) == 2).astype(int)

# Complement of the indicators: 1 where the topic is NOT disclosed.
no_disc = 1 - topic_flags

In [20]:
# To what extent the risk factors disclosed by the firm
# are already disclosed by other firms over the past 52 weeks.
def count_func(x):
    """Per-topic share of earlier peer filings that disclose each topic.

    Parameters
    ----------
    x : row of ``disc_df`` (needs ``filing_dt`` and ``CIK``).

    Returns
    -------
    pandas.Series indexed by topic label: for the rows of ``disc_df`` filed
    strictly within the 52 weeks before ``x['filing_dt']`` by a different CIK,
    ``(number of rows flagging the topic + 1) / (number of rows + 1)``.

    NOTE(review): the denominator counts rows (filings), not distinct CIKs.
    """
    window_start = x["filing_dt"] - pd.Timedelta(weeks=52)

    filed_in_window = disc_df["filing_dt"].gt(window_start) & disc_df["filing_dt"].lt(x["filing_dt"])
    other_firm = disc_df['CIK'].ne(x['CIK'])
    peers = disc_df[filed_in_window & other_firm]

    # The +1 in numerator and denominator keeps the ratio defined (and
    # non-zero) when there are no peer rows.
    return (peers.filter(range(0,100)).sum() + 1) / (peers["CIK"].count() + 1)

count_disc = disc_df.apply(count_func, axis=1)

In [21]:
# Same-industry version: share of earlier peer filings in the focal firm's
# industry group that disclose each topic.
def count_func_ind(x):
    """Per-topic share of earlier same-industry peer filings disclosing a topic.

    Parameters
    ----------
    x : row of ``disc_df`` (needs ``filing_dt``, ``sich`` and ``CIK``).

    Returns
    -------
    pandas.Series indexed by topic label: for the rows of ``disc_df`` filed
    strictly within the 52 weeks before ``x['filing_dt']`` by a different CIK
    with the same ``sich`` value,
    ``(number of rows flagging the topic + 1) / (number of rows + 1)``.

    NOTE(review): the denominator counts rows (filings), not distinct CIKs.
    """
    window_start = x["filing_dt"] - pd.Timedelta(weeks=52)

    filed_in_window = disc_df["filing_dt"].gt(window_start) & disc_df["filing_dt"].lt(x["filing_dt"])
    same_industry = disc_df['sich'].eq(x['sich'])
    other_firm = disc_df['CIK'].ne(x['CIK'])
    peers = disc_df[filed_in_window & same_industry & other_firm]

    # The +1 in numerator and denominator keeps the ratio defined (and
    # non-zero) when there are no peer rows.
    return (peers.filter(range(0,100)).sum() + 1) / (peers["CIK"].count() + 1)

count_disc_ind = disc_df.apply(count_func_ind, axis=1)

In [22]:
# Weighted disclosures.
# Every topic indicator is multiplied element-wise by the matching
# peer-disclosure ratio. Passing `.values` makes the multiplication positional,
# so all frames must hold the same topic columns in the same order. The topic
# columns are therefore selected with range(0, 100), the selection used for
# disc_diff, disc_repeat, no_disc and inside the count functions (the original
# used range(0, 101) for these two frames only).
topic_flags = disc_df.filter(range(0, 100))

# Weights: share of peer filings (all industries) disclosing each topic.
disc_diff_w = disc_diff.multiply(count_disc.values)
disc_w = topic_flags.multiply(count_disc.values)
disc_repeat_w = disc_repeat.multiply(count_disc.values)
no_disc_w = no_disc.multiply(count_disc.values)

# Weights: share of same-industry peer filings disclosing each topic.
disc_diff_w_ind = disc_diff.multiply(count_disc_ind.values)
disc_w_ind = topic_flags.multiply(count_disc_ind.values)
disc_repeat_w_ind = disc_repeat.multiply(count_disc_ind.values)
no_disc_w_ind = no_disc.multiply(count_disc_ind.values)

In [51]:
# Row-wise averages of the weighted indicators (NaN cells are skipped).
# Frames filtered with `.where` keep only the cells of interest:
#   repeated / added -> strictly positive cells, removed -> strictly negative.
weighted_frames = {
    "avg_all": disc_w,
    "avg_repeated": disc_repeat_w.where(disc_repeat_w > 0),
    "avg_added": disc_diff_w.where(disc_diff_w > 0),
    "avg_removed": disc_diff_w.where(disc_diff_w < 0),
    "avg_nodisc": no_disc_w,
}
for avg_col, weighted in weighted_frames.items():
    disc_df[avg_col] = weighted.mean(axis=1)

In [52]:
# Row-wise averages of the industry-weighted indicators (NaN cells are skipped).
# Frames filtered with `.where` keep only the cells of interest:
#   repeated / added -> strictly positive cells, removed -> strictly negative.
weighted_frames_ind = {
    "avg_all_ind": disc_w_ind,
    "avg_repeated_ind": disc_repeat_w_ind.where(disc_repeat_w_ind > 0),
    "avg_added_ind": disc_diff_w_ind.where(disc_diff_w_ind > 0),
    "avg_removed_ind": disc_diff_w_ind.where(disc_diff_w_ind < 0),
    "avg_nodisc_ind": no_disc_w_ind,
}
for avg_col, weighted in weighted_frames_ind.items():
    disc_df[avg_col] = weighted.mean(axis=1)

## Daily data

EIKON prices

In [25]:
# Load daily prices. The path uses a forward slash (portable; no invalid "\P"
# escape sequence).
prices_df = pd.read_csv("Data/Prices.csv")
prices_df["Date"] = pd.to_datetime(prices_df["Date"])

# Order rows chronologically within each instrument: the per-instrument
# pct_change / rolling / shift computations on this frame depend on row order.
prices_df = prices_df.sort_values(["Instrument", "Date"])

# Keep observations after 2005-01-01 and index the frame by date.
prices_df = prices_df[prices_df["Date"] > '2005-01-01'].set_index('Date')
prices_df.columns

Index(['Instrument', 'CLOSEPRICE', 'VOLUME', 'COMPANYMARKETCAP',
       'TTLCMNSHARESOUT'],
      dtype='object')

In [26]:
# Daily share turnover: VOLUME divided by TTLCMNSHARESOUT.
prices_df["SHRTURN"] = prices_df["VOLUME"].div(prices_df["TTLCMNSHARESOUT"])

# Simple one-row return of the close price, computed within each instrument.
close_by_instrument = prices_df.groupby("Instrument")["CLOSEPRICE"]
prices_df["Return"] = close_by_instrument.pct_change(1)

# Log return: log(1 + simple return).
prices_df["log_Return"] = np.log(1 + prices_df["Return"])

In [27]:
# Window length (rows, i.e. observations per instrument)
N = 20

# Rolling N-row standard deviation of daily returns per instrument; at least
# N//2 observations are required. The result is indexed by (Instrument, Date).
# NOTE(review): the "Return" column of std_returns therefore holds the rolling
# std (not the raw return) and is kept in the frame.
std_returns = (
    prices_df.groupby("Instrument")["Return"]
    .rolling(N, min_periods=N//2)
    .std()
    .to_frame()
)

rolling_by_instrument = std_returns.groupby("Instrument")["Return"]
# "+N": window ending N+2 rows after the current row (entirely after it).
std_after = rolling_by_instrument.shift(-(N + 2))
# "_N": window ending 2 rows before the current row.
std_before = rolling_by_instrument.shift(2)

std_returns[f"stdReturn+{N}"] = std_after
std_returns[f"stdReturn_{N}"] = std_before

In [28]:
# Rolling N-row averages of trade volume and share turnover per instrument
# (at least 3 observations required), indexed by (Instrument, Date).
N = 5
lag = N // 2

prices_by_instrument = prices_df.groupby("Instrument")
MA_vol = prices_by_instrument["VOLUME"].rolling(N, min_periods=3).mean().to_frame()
MA_vol["SHRTURN"] = prices_by_instrument["SHRTURN"].rolling(N, min_periods=3).mean()

# Positive shift: each row receives the average of the window that ended
# `lag` rows earlier for the same instrument.
for avg_col in ("VOLUME", "SHRTURN"):
    MA_vol[avg_col] = MA_vol.groupby("Instrument")[avg_col].shift(lag)

In [29]:
# Window length (rows, i.e. observations per instrument)
N = 30

# Rolling N-row standard deviation of daily returns per instrument
# (at least N//2 observations required).
rolling_std = (
    prices_df.groupby("Instrument")["Return"]
    .rolling(N, min_periods=N//2)
    .std()
)

# "+N": window ending N+2 rows after the current row;
# "_N": window ending 2 rows before the current row.
std_returns[f"stdReturn+{N}"] = rolling_std.groupby("Instrument").shift(-(N + 2))
std_returns[f"stdReturn_{N}"] = rolling_std.groupby("Instrument").shift(2)

Bid-Ask spread

In [30]:
# Load bid/ask quotes. The path uses a forward slash (portable; no invalid
# "\B" escape sequence); exact duplicate rows are removed.
BidAsk_df = pd.read_csv("Data/BidAsk.csv").drop_duplicates()
BidAsk_df["Date"] = pd.to_datetime(BidAsk_df["Date"])

# Order rows chronologically within each instrument before indexing by date:
# the per-instrument rolling means and shifts computed from this frame depend
# on row order, and nothing else sorts it (prices_df is sorted the same way).
BidAsk_df = BidAsk_df.sort_values(["Instrument", "Date"]).set_index('Date')
BidAsk_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8687523 entries, 2006-01-17 to 2022-06-14
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Instrument  int64  
 1   HIGHPRICE   float64
 2   LOWPRICE    float64
 3   BIDPRICE    float64
 4   ASKPRICE    float64
dtypes: float64(4), int64(1)
memory usage: 397.7 MB


In [31]:
# Bid-ask spread expressed as a fraction of the ask price.
ask_price = BidAsk_df["ASKPRICE"]
bid_price = BidAsk_df['BIDPRICE']
BidAsk_df["BAspread"] = (ask_price - bid_price) / ask_price

In [32]:
# Window length (rows, i.e. observations per instrument)
N = 20

# Rolling N-row mean of the bid-ask spread per instrument (at least N//2
# observations required), indexed by (Instrument, Date).
MA_BA = (
    BidAsk_df.groupby('Instrument')['BAspread']
    .rolling(N, min_periods=N//2)
    .mean()
    .to_frame()
)

spread_by_instrument = MA_BA.groupby("Instrument")["BAspread"]
# "+N": window ending N+2 rows after the current row;
# "_N": window ending 2 rows before the current row.
spread_after = spread_by_instrument.shift(-(N + 2))
spread_before = spread_by_instrument.shift(2)

MA_BA[f"avgBA+{N}"] = spread_after
MA_BA[f"avgBA_{N}"] = spread_before

# Only the shifted averages are kept.
MA_BA = MA_BA.drop(columns='BAspread')

In [33]:
# Window length (rows, i.e. observations per instrument)
N = 30

# Rolling N-row mean of the bid-ask spread per instrument
# (at least N//2 observations required).
rolling_spread = (
    BidAsk_df.groupby('Instrument')['BAspread']
    .rolling(N, min_periods=N//2)
    .mean()
)

# "+N": window ending N+2 rows after the current row;
# "_N": window ending 2 rows before the current row.
MA_BA[f"avgBA+{N}"] = rolling_spread.groupby("Instrument").shift(-(N + 2))
MA_BA[f"avgBA_{N}"] = rolling_spread.groupby("Instrument").shift(2)

Beta

In [34]:
# Load the beta / abnormal-return table. The path uses a forward slash
# (portable; no invalid "\B" escape sequence); exact duplicate rows are removed.
Beta = pd.read_csv("Data/Beta_AR.csv").drop_duplicates()
Beta.columns

Index(['Date', 'Instrument', 'Beta_30', 'Beta_90', 'Beta_250', 'AR', 'CAR_10',
       'CAR_20', 'CAR_30'],
      dtype='object')

In [35]:
Beta['Date'] = pd.to_datetime(Beta['Date'])

# Order rows chronologically within each instrument before indexing by date:
# the per-instrument shifts applied to this frame depend on row order, and
# nothing else sorts it (prices_df is sorted the same way).
Beta = Beta.sort_values(['Instrument', 'Date']).set_index('Date')

In [36]:
# Daily Beta, shifted within each instrument:
#   "BETA+h": the Beta_h value found h+2 rows after the current row,
#   "BETA_h": the Beta_h value found 2 rows before the current row.
for horizon in (30, 90):
    beta_by_instrument = Beta.groupby('Instrument')[f"Beta_{horizon}"]
    beta_after = beta_by_instrument.shift(-(horizon + 2))
    beta_before = beta_by_instrument.shift(2)

    Beta[f"BETA+{horizon}"] = beta_after
    Beta[f"BETA_{horizon}"] = beta_before

Analysts

In [37]:
# Load analyst coverage and drop rows with any missing value. The path uses a
# forward slash (portable; no invalid "\A" escape sequence).
Analysts = pd.read_csv("Data/Analysts.csv").dropna()

# Rename the three columns positionally.
Analysts.columns = ['Instrument', 'NUMBEROFANALYSTS', 'Date']

# Parse dates (unparseable values become NaT) and strip the timezone so the
# values are timezone-naive.
Analysts['Date'] = pd.to_datetime(Analysts['Date'], errors='coerce').dt.tz_localize(None)

In [38]:
# Outer-join the (Date, Instrument) pairs of the price data with the analyst
# counts; "Date" is the index of prices_df and a column of Analysts.
Analyst_df = pd.merge(
    prices_df["Instrument"],
    Analysts,
    how="outer",
    on=["Date", "Instrument"],
)

# Keep dates strictly between 2005-01-01 and 2023-01-01, then order the rows by
# instrument, date and analyst count.
after_start = Analyst_df["Date"] > '2005-01-01'
before_end = Analyst_df["Date"] < '2023-01-01'
Analyst_df = (
    Analyst_df[after_start & before_end]
    .sort_values(['Instrument', 'Date', 'NUMBEROFANALYSTS'])
)

In [39]:
# Carry the last known analyst count forward within each instrument.
# `.ffill()` replaces the deprecated `groupby(...).fillna(method='ffill')`.
Analyst_df["NUMBEROFANALYSTS"] = Analyst_df.groupby("Instrument")["NUMBEROFANALYSTS"].ffill()

# One row per (Instrument, Date): keep the last one in the current row order.
Analyst_df = Analyst_df.drop_duplicates(subset=['Instrument', 'Date'], keep='last')

# Remaining missing values (no earlier observation to carry forward) become 0.
Analyst_df = Analyst_df.fillna(0)

## Financial data

In [40]:
# Load annual financial data and drop exact duplicate rows.
# The path uses a forward slash (portable; no invalid "\F" escape sequence).
financials = pd.read_csv(
    filepath_or_buffer="Data/Financials.csv",
    decimal=".",
    thousands=',',
).drop_duplicates()

# NOTE: 'datadate' is still a "dd/mm/YYYY" string at this point, so this
# ordering is lexicographic on the date text, not chronological.
financials = financials.sort_values(['cik', 'datadate'])

In [41]:
financials["datadate"] = pd.to_datetime(financials["datadate"], format="%d/%m/%Y")

# Sort chronologically within each firm now that the dates are parsed: the
# per-cik shift / pct_change computations on this frame depend on row order,
# and the earlier sort ran on the unparsed "dd/mm/YYYY" strings.
financials = financials.sort_values(['cik', 'datadate'])

# Fill missing 'seq' with 'teq'. Assigning the result back replaces the
# chained in-place fillna on a column selection, which does not update the
# frame under pandas copy-on-write.
financials["seq"] = financials["seq"].fillna(financials["teq"])

In [42]:
# Denominators with zeros turned into NaN, so a zero denominator yields a
# missing ratio instead of +/-inf.
at_nz = financials["at"].replace(0, np.nan)
mkvalt_nz = financials["mkvalt"].replace(0, np.nan)
seq_nz = financials["seq"].replace(0, np.nan)
revt_nz = financials["revt"].replace(0, np.nan)
intan_nz = financials["intan"].replace(0, np.nan)

# Missing R&D expense is treated as zero.
xrd_filled = financials["xrd"].fillna(0)

# Leverage
financials["DtA"] = financials["dt"] / at_nz
financials["DtEBITDA"] = financials["dt"] / financials["ebitda"].replace(0, np.nan)

# Profitability
financials["ROE"] = financials["ni"] / seq_nz
financials["NPM"] = financials["ni"] / revt_nz  # net profit margin

# Firm size
financials["logMC"] = np.log(mkvalt_nz)
financials["logTA"] = np.log(at_nz)

# Intangible assets
financials["RtINT"] = financials["revt"] / intan_nz
financials["INTtA"] = financials["intan"] / at_nz

# Liquidity
financials["current"] = financials["act"] / financials["lct"].replace(0, np.nan)

# Other
financials["TobinQ"] = financials["mkvalt"] / at_nz
financials["BtM"] = financials["seq"] / mkvalt_nz

# R&D intensity; ProprietaryCost scales R&D by the previous row's 'at' of the
# same cik (relies on the row order of `financials`).
financials["RDxopr"] = xrd_filled / financials["xopr"].replace(0, np.nan)
financials["ProprietaryCost"] = xrd_filled / financials.groupby('cik')["at"].shift(1).replace(0, np.nan)

# %change in earnings versus the previous row of the same cik; infinite
# changes (previous value of zero) become NaN.
# `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0, and
# the result is assigned once instead of using a chained in-place replace.
financials["DEarnings"] = (
    financials.groupby("cik")["ni"]
    .pct_change(1)
    .replace([np.inf, -np.inf], np.nan)
)

In [43]:
# Prefer the historical classification codes; fall back to 'naics' / 'sic'
# where the historical value is missing.
financials['naicsh'] = financials['naicsh'].fillna(financials['naics']).astype(int)
financials['sich'] = financials['sich'].fillna(financials['sic']).astype(int)

# First 3 digits of historical NAICS > Subsector
financials['naicsh'] = financials['naicsh'].map(lambda x: str(x)[:3])

# First 2 digits of historical SIC > Major sector group.
# The code is zero-padded to 4 digits first: after the int cast a code such as
# 100 would otherwise yield "10" instead of "01". This matches how the sector
# group is derived for topics_df.
financials['sich'] = financials['sich'].map(lambda x: f"{int(x):04d}"[:2])

In [44]:
# List all columns of the financials table, including the derived ratios.
financials.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'curcd', 'act', 'at', 'dt', 'ebit', 'ebitda', 'intan', 'lct', 'lt',
       'ni', 'revt', 'seq', 'teq', 'xopr', 'xrd', 'xt', 'cik', 'costat',
       'naicsh', 'sich', 'mkvalt', 'naics', 'sic', 'DtA', 'DtEBITDA', 'ROE',
       'NPM', 'logMC', 'logTA', 'RtINT', 'INTtA', 'current', 'TobinQ', 'BtM',
       'RDxopr', 'ProprietaryCost', 'DEarnings'],
      dtype='object')

## Merge data

In [45]:
# Attach the daily market measures observed on the filing date.
# Each right-hand frame is indexed by (Instrument, Date); it is matched against
# (CIK, filing_dt) with a left join, so reports without a match keep NaN.
for daily_frame in (std_returns, MA_vol, MA_BA):
    stat_data = stat_data.merge(
        daily_frame,
        how="left",
        left_on=["CIK", "filing_dt"],
        right_index=True,
    )

In [46]:
# Beta and cumulative abnormal returns, matched on (CIK, filing_dt).
beta_cols = [
    'Instrument', 'Date',
    'BETA+30', 'BETA_30', 'BETA+90', 'BETA_90',
    'CAR_10', 'CAR_20', 'CAR_30'
]
stat_data = stat_data.merge(
    Beta.reset_index()[beta_cols],
    how="left",
    left_on=["CIK", "filing_dt"],
    right_on=['Instrument', 'Date'],
).drop(columns=['Instrument', 'Date'])

# Analyst count, matched on (CIK, filing_dt).
analyst_cols = ['Date', 'Instrument', 'NUMBEROFANALYSTS']
stat_data = stat_data.merge(
    Analyst_df[analyst_cols],
    how="left",
    left_on=["CIK", "filing_dt"],
    right_on=['Instrument', 'Date'],
).drop(columns=['Instrument', 'Date'])

# Annual financial variables, matched on (CIK, report_dt) = (cik, datadate).
financial_cols = [
    'cik', 'datadate', 'naicsh', 'DtA', 'DtEBITDA', 'ROE', 'NPM', 'mkvalt', 'logMC',
    'at', 'logTA', 'RtINT', 'INTtA', 'current', 'TobinQ', 'BtM', 'RDxopr', 'ProprietaryCost', 'DEarnings'
]
stat_data = stat_data.merge(
    financials[financial_cols],
    how="left",
    left_on=["CIK", "report_dt"],
    right_on=["cik", "datadate"],
).drop(columns=['datadate', 'cik'])

In [53]:
# Attach the weighted disclosure averages of each report (left join on the
# report keys).
report_keys = ["CIK", "filing_dt", "report_dt"]
avg_cols = [
    'avg_all', 'avg_repeated', 'avg_added', 'avg_removed', 'avg_nodisc',
    'avg_all_ind', 'avg_repeated_ind', 'avg_added_ind', 'avg_removed_ind', 'avg_nodisc_ind'
]
stat_data = stat_data.merge(
    disc_df[['CIK', 'filing_dt', 'report_dt'] + avg_cols],
    how="left",
    on=report_keys,
)

In [54]:
# 'Industry' and 'category' hold the arrays produced by the 'unique'
# aggregation: keep the first element, and strip the "Office of " prefix from
# the industry label.
stat_data['Industry'] = stat_data['Industry'].map(
    lambda labels: re.sub('Office of ', '', labels[0])
)
stat_data["category"] = stat_data["category"].map(lambda labels: labels[0])

# The raw dates are no longer needed in the exported table.
stat_data = stat_data.drop(columns=['filing_dt', 'report_dt'])
stat_data.columns

Index(['CIK', 'rprt_length', 'SIC', 'Industry', 'category', 'reported_crnt',
       'reported_last', 'repeated', 'added', 'removed', 'rfGap', 'fyear',
       'ryear', 'Return', 'stdReturn+20', 'stdReturn_20', 'stdReturn+30',
       'stdReturn_30', 'VOLUME', 'SHRTURN', 'avgBA+20', 'avgBA_20', 'avgBA+30',
       'avgBA_30', 'BETA+30', 'BETA_30', 'BETA+90', 'BETA_90', 'CAR_10',
       'CAR_20', 'CAR_30', 'NUMBEROFANALYSTS', 'naicsh', 'DtA', 'DtEBITDA',
       'ROE', 'NPM', 'mkvalt', 'logMC', 'at', 'logTA', 'RtINT', 'INTtA',
       'current', 'TobinQ', 'BtM', 'RDxopr', 'ProprietaryCost', 'DEarnings',
       'avg_all', 'avg_repeated', 'avg_added', 'avg_removed', 'avg_nodisc',
       'avg_all_ind', 'avg_repeated_ind', 'avg_added_ind', 'avg_removed_ind',
       'avg_nodisc_ind'],
      dtype='object')

In [55]:
# Show (n_rows, n_columns) of the final statistics table before exporting it.
stat_data.shape

(28194, 59)

In [56]:
# Export the final table without the row index. A plain string with a forward
# slash replaces the placeholder-less f-string containing the invalid "\s"
# escape sequence.
stat_data.to_csv('Data/stats_data_T2V_H_V3.csv', index=False)