In [1]:
# Import libraries and functions
import pandas as pd
import numpy as np
import pickle
import re

# import matplotlib.pyplot as plt
# %matplotlib inline
# plt.style.use('seaborn')
# plt.rc('figure', autolayout=True)

In [2]:
# Load data sample
# Forward slashes keep the relative path valid on every OS and avoid the
# invalid "\c" escape sequence that the backslash form contained.
RF_df = pd.read_csv("Data/clean_docs_3.csv")

In [3]:
# Display the (rows, columns) shape of the loaded sample
RF_df.shape

(1245475, 8)

# ETM model

In [49]:
# Model tag; it is interpolated into the name of the exported stats CSV
model = 'ETM'

In [50]:
# Load the document-topic distribution and sort every row in descending order.
# The sorted object is later read through its `.indices` / `.values` attributes
# (presumably a torch tensor sort result - TODO confirm).
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# Forward slashes keep the path portable and avoid the invalid "\d" escape.
with open('ETM/doc_topic_dist_3.pkl', 'rb') as f:
    document_topic_dist = pickle.load(f).sort(descending=True)

In [51]:
# Load and match the training indexes and the sample
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# Forward slashes keep the path portable and avoid the invalid "\E" escape.
with open("ETM/ETM_idx_train_100123.pkl", 'rb') as f:
    idx_train = pickle.load(f)

RF_df.reset_index(inplace=True)

# Position of every training document, aligned to RF_df through the index
# labels in idx_train; rows whose label is not in idx_train receive NaN.
RF_df["doc_idx"] = pd.Series(data=range(len(idx_train)), index = idx_train)

topics_df = RF_df.set_index('doc_idx').sort_index()

topics_df["filing_dt"] = pd.to_datetime(topics_df["filing_dt"])
topics_df["report_dt"] = pd.to_datetime(topics_df["report_dt"])

In [52]:
# Number of whitespace-separated tokens in every cleaned text
topics_df['rprt_length'] = [len(doc.split()) for doc in topics_df['cleaned_txt']]

# Get topics per RF: first column of the descending sort, i.e. the
# highest-scoring topic of each document, together with its score
ranked_topics = document_topic_dist
topics_df["topic"] = ranked_topics.indices[:, 0]
topics_df["Score"] = ranked_topics.values[:, 0]

In [8]:
# Distinct CIKs per report year, summed over all years
report_year = topics_df["report_dt"].dt.year
topics_df.groupby(report_year)["CIK"].nunique().sum()

35468

# Top2Vec model

In [4]:
# Model tag; it is interpolated into the name of the exported stats CSV
model = 'T2V_H'

In [5]:
# Load Top2Vec topics df
# Forward slashes keep the path portable and avoid the invalid "\T" escape.
t2v_df = (
    pd.read_csv("Top2Vec/T2V_df_H95.csv")
    .set_index("index").drop(columns=['Docs'])
)

# The original mapping listed the key 'Report_dt' twice, so Python kept only
# the last entry and the report-date column ended up named 'filing_dt'.
# NOTE(review): 'Filing_dt' is the assumed name of the filing-date column -
# confirm against the CSV header (rename silently ignores missing keys).
t2v_df.rename(columns={'Report_dt': 'report_dt', 'Filing_dt': 'filing_dt'}, inplace=True)

In [6]:
# Put the Top2Vec assignments next to the documents (column-wise concat, aligned on the index)
t2v_columns = ['Topic', 'Score', 'Topic_H', 'Score_H']
topics_df = pd.concat([RF_df, t2v_df[t2v_columns]], axis=1)

for date_column in ("filing_dt", "report_dt"):
    topics_df[date_column] = pd.to_datetime(topics_df[date_column])

# Number of whitespace-separated tokens in every cleaned text
topics_df['rprt_length'] = [len(doc.split()) for doc in topics_df['cleaned_txt']]

# Stats

In [7]:
# Removing observations with duplicated filing_dt and ryear
filing_keys = ["CIK", "filing_dt", "report_dt"]

sample = (
    topics_df[filing_keys]
    .drop_duplicates()
    .sort_values(filing_keys)
)

# One row per (CIK, filing_dt): after the sort, 'last' is the latest report_dt
sample.drop_duplicates(["CIK", "filing_dt"], keep='last', inplace=True)

# One row per (CIK, report year): after the sort, 'first' is the earliest filing_dt
sample['ryear'] = sample['report_dt'].dt.year
sample.drop_duplicates(["CIK", "ryear"], keep='first', inplace=True)

sample.drop(columns='ryear', inplace=True)

# Keep only the rows of the retained filings.
# The keys are built with itertuples instead of `x[0], x[1], x[2]` on a
# label-indexed row: positional integer access on such a Series is deprecated.
topics_df = (
    topics_df.set_index(filing_keys)
    .loc[list(sample.itertuples(index=False, name=None))]
    .reset_index()
)

In [15]:
# Use historical SIC data for industry analysis
# - 'naicsh' is now read as well: it is selected for the merge below and its
#   absence from usecols raised a KeyError.
# - Forward slashes keep the path portable and avoid the invalid "\F" escape.
sich = pd.read_csv(
    filepath_or_buffer="Data/Financials.csv",
    decimal=".",
    thousands=',',
    usecols=[
        'cik', 'datadate', 'naicsh', 'sich', 'sic'
    ]
).drop_duplicates()

# Parse the dates first so the sort is chronological instead of a
# lexicographic sort of dd/mm/YYYY strings
sich["datadate"] = pd.to_datetime(sich["datadate"], format="%d/%m/%Y")
sich.sort_values(['cik', 'datadate'], inplace=True)

# Replace missing historical data with current data
sich['sich'] = sich['sich'].fillna(sich['sic']).astype(int)

topics_df = pd.merge(
    left=topics_df,
    right=sich[[
        'cik', 'datadate', 'naicsh', 'sich']],
    left_on=["CIK", "report_dt"],
    right_on=["cik", "datadate"],
    how="left"
).drop(columns=['datadate', 'cik'])

# Reports without a matching financial record fall back to the SIC already in topics_df
topics_df['sich'] = topics_df['sich'].fillna(topics_df['SIC']).astype(int)

# First 2 digits of SIC -> Major sector group
# SIC codes have four digits; the integer cast drops a leading zero, so pad
# back to four characters before slicing (e.g. 100 -> "0100" -> "01").
topics_df['SIC'] = topics_df['sich'].map(lambda x: str(x).zfill(4)[:2])

In [8]:
# Collapse the risk-factor rows into one record per firm and filing date
# (after the de-duplication step this is one record per firm and report year)
filing_aggregations = {
    'report_dt': 'max',
    'Topic': lambda topics: set(topics),
    'Topic_H': lambda topics: set(topics),
    'rprt_length': 'sum',
    'SIC' : 'unique',
    'Industry' : 'unique',
    'category' : 'unique'
}

agg_tops = (
    topics_df.groupby(["CIK", "filing_dt"])[list(filing_aggregations)]
    .agg(filing_aggregations)
    .reset_index()
)

In [9]:
# 'unique' produced an array per filing; keep its first element as the SIC group
agg_tops["SIC"] = [codes[0] for codes in agg_tops["SIC"]]

## Variables for H1

In [10]:
# Hypothesis tag; a later cell checks it to decide whether the disclosure averages are merged
H = 'H1'

In [11]:
# "" -> micro-topics (Topic), "_H" -> macro-topics (Topic_H)
topic_levels = ("", "_H")

# Shift records to compare every filing with the firm's previous one
for suffix in topic_levels:
    agg_tops[f"shifted{suffix}"] = agg_tops.groupby("CIK")[f"Topic{suffix}"].shift(1)

# The first filing of a firm has nothing to compare with
agg_tops.dropna(inplace=True)

# Generate repeated, added and removed RFs for both topic levels
for suffix in topic_levels:
    current, previous = f"Topic{suffix}", f"shifted{suffix}"
    agg_tops[f"repeated{suffix}"] = agg_tops.apply(
        lambda row: row[current].intersection(row[previous]), axis=1
    )
    agg_tops[f"added{suffix}"] = agg_tops.apply(
        lambda row: row[current].difference(row[previous]), axis=1
    )
    agg_tops[f"removed{suffix}"] = agg_tops.apply(
        lambda row: row[previous].difference(row[current]), axis=1
    )

In [12]:
stat_data = agg_tops[[
    'CIK', 'filing_dt', 'report_dt', 'rprt_length', 'SIC', 'Industry', 'category'
]].copy()

# Output column -> topic-set column whose size it stores
set_size_columns = {
    "reported_crnt": "Topic",
    "reported_last": "shifted",
    "repeated": "repeated",
    "added": "added",
    "removed": "removed",
}

# "" -> micro-topics, "_H" -> macro-topics
for suffix in ("", "_H"):
    for target, source in set_size_columns.items():
        stat_data[f"{target}{suffix}"] = agg_tops[f"{source}{suffix}"].map(len)

In [13]:
# Days between the report date and the filing date, plus the calendar year of each
stat_data = stat_data.assign(
    rfGap=lambda d: d['filing_dt'].sub(d['report_dt']).dt.days,
    fyear=lambda d: d['filing_dt'].dt.year,
    ryear=lambda d: d['report_dt'].dt.year,
)

# stat_data.drop_duplicates(["CIK", "ryear"], keep='first', inplace=True)

## Variables for H2

In [14]:
# Hypothesis tag; a later cell checks it to decide whether the disclosure averages are merged
H = 'H2'

In [16]:
# One row per report and one 0/1 column per macro-topic:
# 1 when the report has a score for the topic, 0 otherwise
disc_df = (
    topics_df.pivot_table(
        index=["CIK", "filing_dt", "report_dt", "SIC"],
        columns='Topic_H',
        values='Score_H',
    )
    .notna()
    .astype(int)
    .reset_index()
)

disc_df["ryear"] = disc_df["report_dt"].dt.year

disc_df = disc_df.sort_values(['CIK', 'filing_dt'])

In [17]:
# Change in the 0/1 disclosure indicators between a firm's consecutive reports
# (+1 = topic added, -1 = topic removed, NaN for the firm's first report)
topic_indicators = disc_df.filter(range(0,100))
previous_indicators = disc_df.groupby("CIK")[topic_indicators.columns].shift(1)
disc_diff = topic_indicators - previous_indicators

# Number of distinct reporting firms in the same report year and SIC group
firms_by_year_sic = disc_df.groupby(['ryear', 'SIC'])["CIK"]
rprt_firms = firms_by_year_sic.transform('nunique')

In [18]:
# To what extent the risk factors disclosed by the firm
# are already disclosed by other firms over the past 52 weeks
def count_func(x):
    """Smoothed share of other firms' recent filings that disclose each topic.

    Reads the module-level ``disc_df``. Filings of firms other than ``x['CIK']``
    filed strictly inside the 52 weeks before ``x['filing_dt']`` are selected;
    per topic column (labels 0-99) the result is
    ``(number of disclosing filings + 1) / (number of selected filings + 1)``.

    Parameters
    ----------
    x : row of ``disc_df`` providing ``'filing_dt'`` and ``'CIK'``.

    Returns
    -------
    pandas.Series indexed by the topic column labels.
    """
    filed_on = disc_df["filing_dt"]
    window_start = x["filing_dt"] - pd.Timedelta(weeks=52)

    inside_window = (filed_on > window_start) & (filed_on < x["filing_dt"])
    other_firm = disc_df['CIK'] != x['CIK']
    peers = disc_df[inside_window & other_firm]

    disclosing = peers.filter(range(0,100)).sum()
    filings = peers["CIK"].count()

    return (disclosing + 1) / (filings + 1)

# Evaluate count_func for every report (one call per row of disc_df);
# the result has one column per topic and shares disc_df's index.
# Every call filters the whole of disc_df again, so the cost grows with the
# square of the number of reports.
count_disc = disc_df.apply(count_func, axis=1)

# count_disc.to_csv("count_disc.csv")
# count_disc = pd.read_csv("count_disc.csv", index_col=0)

In [19]:
# No. disclosing firms in the same industry divided by no. filings of that industry
def count_func_ind(x):
    """Same-industry version of the smoothed disclosure share.

    Reads the module-level ``disc_df``. Filings of firms other than ``x['CIK']``
    with the same ``'SIC'`` value, filed strictly inside the 52 weeks before
    ``x['filing_dt']``, are selected; per topic column (labels 0-99) the result is
    ``(number of disclosing filings + 1) / (number of selected filings + 1)``.

    Parameters
    ----------
    x : row of ``disc_df`` providing ``'filing_dt'``, ``'SIC'`` and ``'CIK'``.

    Returns
    -------
    pandas.Series indexed by the topic column labels.
    """
    filed_on = disc_df["filing_dt"]
    window_start = x["filing_dt"] - pd.Timedelta(weeks=52)

    inside_window = (filed_on > window_start) & (filed_on < x["filing_dt"])
    same_industry = disc_df['SIC'] == x['SIC']
    other_firm = disc_df['CIK'] != x['CIK']
    peers = disc_df[inside_window & same_industry & other_firm]

    disclosing = peers.filter(range(0,100)).sum()
    filings = peers["CIK"].count()

    return (disclosing + 1) / (filings + 1)

# Evaluate count_func_ind for every report (one call per row of disc_df)
count_disc_ind = disc_df.apply(count_func_ind, axis=1)

In [20]:
# Weighted disclosures
# The weights (count_disc / count_disc_ind) and disc_diff were built from the
# columns selected by range(0, 100). The indicator frames must use the same
# selection: the previous range(0, 101) would add a column labelled 100 when
# one exists and make the element-wise multiplication fail on mismatched shapes.
topic_labels = range(0, 100)

# .values drops the labels, so the products are positional (same rows, same column order)
disc_diff_w = disc_diff.multiply(count_disc.values)
disc_w = disc_df.filter(topic_labels).multiply(count_disc.values)

disc_diff_w_ind = disc_diff.multiply(count_disc_ind.values)
disc_w_ind = disc_df.filter(topic_labels).multiply(count_disc_ind.values)

In [21]:
disc_df = disc_df.drop(columns=['report_dt'])

# Row-wise mean of the weighted indicators over all topics
disc_df["avg_topic"] = disc_w.mean(axis=1)
# disc_df["std_topic"] = disc_w.std(axis=1, skipna=True)

# Mean weight of the added topics (positive changes only; NaN when nothing was added)
disc_df["avg_added"] = disc_diff_w.where(disc_diff_w > 0).mean(axis=1)
# disc_df["std_added"] = disc_diff_w[disc_diff_w>0].std(axis=1, skipna=True)

# Mean weight of the removed topics (negative changes only; NaN when nothing was removed)
disc_df["avg_removed"] = disc_diff_w.where(disc_diff_w < 0).mean(axis=1)
# disc_df["std_removed"] = disc_diff_w[disc_diff_w<0].std(axis=1, skipna=True)

In [22]:
# Same three averages, using the same-industry weights
disc_df["avg_topic_ind"] = disc_w_ind.mean(axis=1)

added_ind = disc_diff_w_ind.where(disc_diff_w_ind > 0)
disc_df["avg_added_ind"] = added_ind.mean(axis=1)

removed_ind = disc_diff_w_ind.where(disc_diff_w_ind < 0)
disc_df["avg_removed_ind"] = removed_ind.mean(axis=1)

In [60]:
# disc_df["std_topic_inv"] = (1/disc_w[disc_w>0]).std(axis=1, skipna=True)
# disc_df["avg_topic_inv"] = (1/disc_w[disc_w>0]).mean(axis=1, skipna=True)

# disc_df["std_added_inv"] = (1/disc_diff_w[disc_diff_w>0]).std(axis=1, skipna=True)
# disc_df["avg_added_inv"] = (1/disc_diff_w[disc_diff_w>0]).mean(axis=1, skipna=True)

# disc_df["std_removed_inv"] = (1/disc_diff_w[disc_diff_w<0]).std(axis=1, skipna=True)
# disc_df["avg_removed_inv"] = (1/disc_diff_w[disc_diff_w<0]).mean(axis=1, skipna=True)

In [61]:
# disc_cnt = topics_df.groupby('topic')["CIK"].nunique()

# g_25 = disc_cnt[(disc_cnt<=disc_cnt.quantile(0.25))].index.to_list()

# g_75 = disc_cnt[(disc_cnt>=disc_cnt.quantile(0.75))].index.to_list()

# disc_df["avg_topic_25"] = disc_w.filter(g_25, axis=1).mean(axis=1)
# disc_df["avg_topic_75"] = disc_w.filter(g_75, axis=1).mean(axis=1)

# disc_df["std_topic_25"] = disc_w.filter(g_25, axis=1).std(axis=1)
# disc_df["std_topic_75"] = disc_w.filter(g_75, axis=1).std(axis=1)

In [64]:
# disc_df[disc_w.columns] = disc_w
# disc_df.rename(columns=dict([(x, f"topic{x}") for x in disc_w.columns]), inplace=True)

In [23]:
# Summary statistics of the six weighted-disclosure measures
weighted_measures = [
    "avg_topic", "avg_added", "avg_removed",
    "avg_topic_ind", "avg_added_ind", "avg_removed_ind",
]
disc_df[weighted_measures].describe()

Topic_H,avg_topic,avg_added,avg_removed,avg_topic_ind,avg_added_ind,avg_removed_ind
count,35408.0,24412.0,23231.0,35408.0,24412.0,23231.0
mean,0.064165,0.269367,-0.257724,0.091576,0.328394,-0.317904
std,0.030513,0.108579,0.098264,0.042991,0.153171,0.153935
min,0.000443,0.01254,-0.939983,0.002011,0.005137,-1.0
25%,0.041666,0.197876,-0.312541,0.059712,0.223674,-0.403911
50%,0.059608,0.256215,-0.24896,0.085206,0.314815,-0.305043
75%,0.082505,0.322588,-0.18875,0.116932,0.415385,-0.212108
max,0.183031,0.943875,-0.01254,0.442105,1.0,-0.004673


## Daily data

EIKON prices

In [24]:
# Daily price data, indexed by date.
# Forward slashes keep the path portable and avoid the invalid "\P" escape.
prices_df = pd.read_csv("Data/Prices.csv")
prices_df["Date"] = pd.to_datetime(prices_df["Date"])
prices_df.set_index('Date', inplace=True)
prices_df.columns

Index(['Instrument', 'CLOSEPRICE', 'VOLUME', 'COMPANYMARKETCAP',
       'TTLCMNSHARESOUT'],
      dtype='object')

In [25]:
# Daily shares turnover: traded volume relative to common shares outstanding
prices_df["SHRTURN"] = prices_df["VOLUME"].div(prices_df["TTLCMNSHARESOUT"])

# Simple return per instrument between consecutive rows
# (assumes the rows of each instrument are in date order - TODO confirm)
close_by_instrument = prices_df.groupby("Instrument")["CLOSEPRICE"]
prices_df["Return"] = close_by_instrument.pct_change(1)

# Log return
prices_df["log_Return"] = np.log(1 + prices_df["Return"])

In [26]:
# Time window (rows per instrument)
N = 10

# Rolling N-row std of the daily log returns per instrument;
# at least N//2 observations are required for a value
rolling_std = (
    prices_df.groupby("Instrument")["log_Return"]
    .rolling(N, min_periods=N//2).std()
)
std_by_instrument = rolling_std.groupby("Instrument")

# "+N": the window value found N+2 rows later, "_N": the value found 2 rows earlier
std_returns = std_by_instrument.shift(-N-2).to_frame(f"stdReturn+{N}")
std_returns[f"stdReturn_{N}"] = std_by_instrument.shift(2)

# Rolling N-row mean of the trade volume, taken 2 rows earlier
rolling_volume = (
    prices_df.groupby("Instrument")["VOLUME"]
    .rolling(N, min_periods=N//2).mean()
)
MA_vol = rolling_volume.groupby("Instrument").shift(2).to_frame(f"VOLUME_{N}")

# Rolling N-row mean of the share turnover (not shifted)
MA_vol["SHRTURN"] = (
    prices_df.groupby("Instrument")["SHRTURN"]
    .rolling(N, min_periods=N//2).mean()
)

In [27]:
# Time window (rows per instrument)
N = 20

# Rolling N-row std of the daily log returns per instrument
rolling_std = (
    prices_df.groupby("Instrument")["log_Return"]
    .rolling(N, min_periods=N//2).std()
)
std_by_instrument = rolling_std.groupby("Instrument")

# "+N": the window value found N+2 rows later, "_N": the value found 2 rows earlier
std_returns[f"stdReturn+{N}"] = std_by_instrument.shift(-N-2)
std_returns[f"stdReturn_{N}"] = std_by_instrument.shift(2)

# Average of daily trade volumes
# MA_vol[f"VOLUME_{N}"] = grouped_prices["VOLUME"].rolling(N).mean().groupby("Instrument").shift(2)

In [28]:
# Time window (rows per instrument)
N = 30

# Rolling N-row (30-row) std of the daily log returns per instrument
rolling_std = (
    prices_df.groupby("Instrument")["log_Return"]
    .rolling(N, min_periods=N//2).std()
)
std_by_instrument = rolling_std.groupby("Instrument")

# "+N": the window value found N+2 rows later, "_N": the value found 2 rows earlier
std_returns[f"stdReturn+{N}"] = std_by_instrument.shift(-N-2)
std_returns[f"stdReturn_{N}"] = std_by_instrument.shift(2)

Bid-Ask spread

In [29]:
# Daily bid/ask quotes, de-duplicated and indexed by date.
# Forward slashes keep the path portable and avoid the invalid "\B" escape.
BidAsk_df = pd.read_csv("Data/BidAsk.csv").drop_duplicates()
BidAsk_df["Date"] = pd.to_datetime(BidAsk_df["Date"])
BidAsk_df.set_index('Date', inplace=True)
BidAsk_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8687523 entries, 2006-01-17 to 2022-06-14
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Instrument  int64  
 1   HIGHPRICE   float64
 2   LOWPRICE    float64
 3   BIDPRICE    float64
 4   ASKPRICE    float64
dtypes: float64(4), int64(1)
memory usage: 397.7 MB


In [30]:
# Bid-ask spread relative to the ask price
quoted_spread = BidAsk_df["ASKPRICE"].sub(BidAsk_df['BIDPRICE'])
BidAsk_df["BAspread"] = quoted_spread.div(BidAsk_df["ASKPRICE"])

In [31]:
N = 10

# N-row moving average of the spread per instrument (trading days only);
# at least N//2 observations are required for a value
rolling_spread = (
    BidAsk_df.groupby('Instrument')['BAspread']
    .rolling(N, min_periods=N//2).mean()
)
spread_by_instrument = rolling_spread.groupby("Instrument")

# "+N": the window value found N+2 rows later, "_N": the value found 2 rows earlier
MA_BA = spread_by_instrument.shift(-N-2).to_frame(f"avgBA+{N}")
MA_BA[f"avgBA_{N}"] = spread_by_instrument.shift(2)

# MA_BA[f"stdBA+{N}"] = grouped_BA['BAspread'].rolling(N).std().groupby("Instrument").shift(-N-2)
# MA_BA[f"stdBA_{N}"] = grouped_BA['BAspread'].rolling(N).std().groupby("Instrument").shift(2)

In [32]:
N = 20

# N-row moving average of the spread per instrument (trading days only)
rolling_spread = (
    BidAsk_df.groupby('Instrument')['BAspread']
    .rolling(N, min_periods=N//2).mean()
)
spread_by_instrument = rolling_spread.groupby("Instrument")

MA_BA[f"avgBA+{N}"] = spread_by_instrument.shift(-N-2)
MA_BA[f"avgBA_{N}"] = spread_by_instrument.shift(2)

In [33]:
N = 30

# N-row moving average of the spread per instrument (trading days only)
rolling_spread = (
    BidAsk_df.groupby('Instrument')['BAspread']
    .rolling(N, min_periods=N//2).mean()
)
spread_by_instrument = rolling_spread.groupby("Instrument")

MA_BA[f"avgBA+{N}"] = spread_by_instrument.shift(-N-2)
MA_BA[f"avgBA_{N}"] = spread_by_instrument.shift(2)

Beta

In [34]:
# Daily beta / abnormal-return file, de-duplicated.
# Forward slashes keep the path portable and avoid the invalid "\B" escape.
Beta = pd.read_csv("Data/Beta_AR.csv")
Beta.drop_duplicates(inplace=True)
Beta.columns

Index(['Date', 'Instrument', 'Beta_30', 'Beta_90', 'Beta_250', 'AR', 'CAR_10',
       'CAR_20', 'CAR_30'],
      dtype='object')

In [35]:
# Parse the dates and use them as the index
Beta = Beta.assign(Date=pd.to_datetime(Beta['Date'])).set_index('Date')

In [36]:
# Daily Beta
# For every window w: "BETA+w" is the Beta_w value found w+2 rows later and
# "BETA_w" the value found 2 rows earlier, within the same instrument
# (assumes the rows of each instrument are in date order - TODO confirm)
for window in (30, 90, 250):
    source = f"Beta_{window}"
    Beta[f"BETA+{window}"] = Beta.groupby('Instrument')[source].shift(-window-2)
    Beta[f"BETA_{window}"] = Beta.groupby('Instrument')[source].shift(2)

# Daily Cost of Capital
# Beta["WACC"] = Beta.groupby('Instrument')["TR.WACC"].shift(-2)
# Beta["WACC_"] = Beta.groupby('Instrument')["TR.WACC"].shift(2)

Analysts

In [37]:
# Analyst-coverage file; rows with any missing value are dropped.
# Forward slashes keep the path portable and avoid the invalid "\A" escape.
Analysts = pd.read_csv("Data/Analysts.csv")
Analysts.dropna(inplace=True)
Analysts.columns

Index(['Instrument', 'TR.NUMBEROFANALYSTS(PERIOD=FY1,METHODOLOGY=INTERIMSUM)',
       'TR.NUMBEROFANALYSTS(PERIOD=FY1,METHODOLOGY=INTERIMSUM).DATE'],
      dtype='object')

In [38]:
# Short column names, then timezone-naive timestamps (unparseable dates become NaT)
Analysts.columns = ['Instrument', 'NUMBEROFANALYSTS', 'Date']
parsed_dates = pd.to_datetime(Analysts['Date'], errors='coerce')
Analysts['Date'] = parsed_dates.dt.tz_localize(None)

In [39]:
# Outer-join the analyst counts onto every (Date, Instrument) pair of the price
# data; 'Date' is matched against the date index of prices_df
Analyst_df = pd.merge(
    left=prices_df["Instrument"],
    right=Analysts,
    on=["Date", "Instrument"],
    how="outer"
).sort_values(['Date', 'Instrument', 'NUMBEROFANALYSTS'])

In [40]:
# Carry the last known analyst count forward within every instrument.
# GroupBy.ffill() replaces the deprecated fillna(method='ffill') and gives the same result.
Analyst_df["NUMBEROFANALYSTS"] = Analyst_df.groupby("Instrument")["NUMBEROFANALYSTS"].ffill()

# One row per instrument and date; 'last' keeps the final row in the current sort order
Analyst_df.drop_duplicates(subset=['Instrument', 'Date'], keep='last', inplace=True)

## Financial data

In [41]:
# Load annual financial data and drop exact duplicate rows
# Forward slashes keep the path portable and avoid the invalid "\F" escape.
financials = pd.read_csv(
    filepath_or_buffer="Data/Financials.csv",
    decimal=".",
    thousands=',',
    # usecols=[
    #     'cik', 'datadate', 'act', 'at', 'dt', 'ebit', 'ebitda', 'intan', 'lct',
    #     'lt', 'ni', 'revt', 'seq', 'teq', 'xrd', 'xt', 'mkvalt'
    # ]
).drop_duplicates()

# Parse the dd/mm/YYYY dates before sorting: sorting the raw strings orders by
# day-of-month first, which is not chronological, and the per-firm shift /
# pct_change computed later rely on chronological row order.
financials["datadate"] = pd.to_datetime(financials["datadate"], format="%d/%m/%Y")
financials.sort_values(['cik', 'datadate'], inplace=True)

In [42]:
financials["datadate"] = pd.to_datetime(financials["datadate"], format="%d/%m/%Y")

# Fall back to teq where seq is missing.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update the
# frame when pandas copy-on-write is active.
financials["seq"] = financials["seq"].fillna(financials["teq"])

In [43]:
# Every denominator has its zeros replaced by NaN so the ratio is NaN instead of +/-inf.

# Leverage
financials["DtA"] = financials["dt"] / financials["at"].replace(0, np.nan)
financials["DtEBITDA"] = financials["dt"] / financials["ebitda"].replace(0, np.nan)

# Profitability
financials["ROE"] = financials["ni"] / financials["seq"].replace(0, np.nan)
financials["NPM"] = financials["ni"] / financials["revt"].replace(0, np.nan) # net profit margin

# Firm size
financials["logMC"] = np.log(financials["mkvalt"].replace(0, np.nan))
financials["logTA"] = np.log(financials["at"].replace(0, np.nan))

# Intangible assets
financials["RtINT"] = financials["revt"] / financials["intan"].replace(0, np.nan)
financials["INTtA"] = financials["intan"] / financials["at"].replace(0, np.nan)

# Liquidity
financials["current"] = financials["act"] / financials["lct"].replace(0, np.nan)

# Other
financials["TobinQ"] = financials["mkvalt"] / financials["at"].replace(0, np.nan)
financials["BtM"] = financials["seq"] / financials["mkvalt"].replace(0, np.nan)

# R&D intensity (missing R&D expense counts as 0)
financials["RDxopr"] = financials["xrd"].fillna(0) / financials["xopr"].replace(0, np.nan)
# R&D relative to the total assets of the firm's previous row
financials["ProprietaryCost"] = financials["xrd"].fillna(0) / financials.groupby('cik')["at"].shift(1).replace(0, np.nan)

# %change in earnings versus the firm's previous row; infinite changes become NaN.
# np.inf replaces the np.Inf alias (removed in NumPy 2.0), and both signs are
# handled in one assignment instead of a chained in-place replace on the
# column selection.
financials["DEarnings"] = (
    financials.groupby("cik")["ni"].pct_change(1)
    .replace([np.inf, -np.inf], np.nan)
)

In [44]:
# Replace missing historical codes with the current ones
financials['naicsh'] = financials['naicsh'].fillna(financials['naics']).astype(int)
financials['sich'] = financials['sich'].fillna(financials['sic']).astype(int)

# First 3 digits of historical NAICS > Subsector
financials['naicsh'] = financials['naicsh'].map(lambda x: str(x)[:3])

# First 2 digits of historical SIC > Major sector group
# SIC codes have four digits; the integer cast drops a leading zero, so pad
# back to four characters before slicing (e.g. 100 -> "0100" -> "01").
financials['sich'] = financials['sich'].map(lambda x: str(x).zfill(4)[:2])

In [45]:
# Two-digit NAICS prefix, with the prefixes 32/33, 45 and 49 folded into 31, 44 and 48
merged_prefixes = {'32': '31', '33': '31', '45': '44', '49': '48'}
naics_prefix = financials['naicsh'].map(lambda code: str(code)[:2])
ind = naics_prefix.replace(merged_prefixes)

In [None]:
# Display the distinct sector prefixes present in the data
set(ind.unique())

In [46]:
# Display the raw and derived columns of the financial data
financials.columns

Index(['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'curcd', 'act', 'at', 'dt', 'ebit', 'ebitda', 'intan', 'lct', 'lt',
       'ni', 'revt', 'seq', 'teq', 'xopr', 'xrd', 'xt', 'cik', 'costat',
       'naicsh', 'sich', 'mkvalt', 'naics', 'sic', 'DtA', 'DtEBITDA', 'ROE',
       'NPM', 'logMC', 'logTA', 'RtINT', 'INTtA', 'current', 'TobinQ', 'BtM',
       'RDxopr', 'ProprietaryCost', 'DEarnings'],
      dtype='object')

## Merge data

In [47]:
# Attach the daily market measures to every filing. The (Instrument, Date)
# index of each frame is matched against (CIK, filing_dt), so the instrument
# identifier is presumably the firm's CIK - verify against the data source.
for daily_measures in (std_returns, MA_vol, MA_BA):
    stat_data = stat_data.merge(
        daily_measures,
        left_on=["CIK", "filing_dt"],
        right_index=True,
        how="left",
    )

In [48]:
# Betas and cumulative abnormal returns on the filing date
stat_data = pd.merge(
    left=stat_data,
    right=Beta.reset_index()[[
        'Instrument', 'Date',
        'BETA+30', 'BETA_30', 'BETA+90', 'BETA_90', 'BETA+250', 'BETA_250',
        'CAR_10', 'CAR_20', 'CAR_30'
    ]],
    left_on=["CIK", "filing_dt"],
    right_on=['Instrument', 'Date'],
    how="left"
).drop(columns=['Instrument', 'Date'])

# Analyst coverage on the filing date
stat_data = pd.merge(
    left=stat_data,
    right=Analyst_df[['Date', 'Instrument', 'NUMBEROFANALYSTS']],
    left_on=["CIK", "filing_dt"],
    right_on=['Instrument', 'Date'],
    how="left"
).drop(columns=['Instrument', 'Date'])

# No matching analyst record is treated as zero analysts.
# Assign the result back instead of fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update the
# frame when pandas copy-on-write is active.
stat_data['NUMBEROFANALYSTS'] = stat_data['NUMBEROFANALYSTS'].fillna(0)

# Annual financials whose datadate equals the report date
stat_data = pd.merge(
    left=stat_data,
    right=financials[[
        'cik', 'datadate', 'naicsh', 'sich', 'DtA', 'DtEBITDA', 'ROE', 'NPM', 'mkvalt', 'logMC',
        'at', 'logTA', 'RtINT', 'INTtA', 'current', 'TobinQ', 'BtM', 'RDxopr', 'ProprietaryCost', 'DEarnings'
    ]],
    left_on=["CIK", "report_dt"],
    right_on=["cik", "datadate"],
    how="left"
).drop(columns=['datadate', 'cik'])

In [49]:
# For H2 and H3: add the weighted-disclosure averages of every filing
if H in ['H2', 'H3']:
    disclosure_keys = ["CIK", "filing_dt", "ryear"]
    disclosure_measures = [
        'avg_topic', 'avg_added', 'avg_removed',
        'avg_topic_ind', 'avg_added_ind', 'avg_removed_ind',
        # 'std_topic', 'std_added', 'std_removed',
        # 'std_topic_inv', 'std_added_inv', 'std_removed_inv',
        # 'avg_topic_25', 'avg_topic_75', 'std_topic_25', 'std_topic_75'
    ]
    stat_data = stat_data.merge(
        disc_df[disclosure_keys + disclosure_measures],
        on=disclosure_keys,
        how="left",
    )

In [50]:
# # NA for added/removed means no new addition or removal of RFs => fill with 0
# stat_data[['avg_added', 'avg_removed', 'avg_added_ind', 'avg_removed_ind']] = (
#     stat_data[['avg_added', 'avg_removed', 'avg_added_ind', 'avg_removed_ind']].fillna(0)
# )

# Both columns hold arrays: keep the first entry, and strip the
# 'Office of ' prefix from the industry label
stat_data['Industry'] = [
    re.sub('Office of ', '', offices[0]) for offices in stat_data['Industry']
]
stat_data["category"] = [categories[0] for categories in stat_data["category"]]

# The date columns are not exported
stat_data = stat_data.drop(columns=['filing_dt', 'report_dt'])
stat_data.columns

Index(['CIK', 'rprt_length', 'SIC', 'Industry', 'category', 'reported_crnt',
       'reported_last', 'repeated', 'added', 'removed', 'reported_crnt_H',
       'reported_last_H', 'repeated_H', 'added_H', 'removed_H', 'rfGap',
       'fyear', 'ryear', 'stdReturn+10', 'stdReturn_10', 'stdReturn+20',
       'stdReturn_20', 'stdReturn+30', 'stdReturn_30', 'VOLUME_10', 'SHRTURN',
       'avgBA+10', 'avgBA_10', 'avgBA+20', 'avgBA_20', 'avgBA+30', 'avgBA_30',
       'BETA+30', 'BETA_30', 'BETA+90', 'BETA_90', 'BETA+250', 'BETA_250',
       'CAR_10', 'CAR_20', 'CAR_30', 'NUMBEROFANALYSTS', 'naicsh', 'sich',
       'DtA', 'DtEBITDA', 'ROE', 'NPM', 'mkvalt', 'logMC', 'at', 'logTA',
       'RtINT', 'INTtA', 'current', 'TobinQ', 'BtM', 'RDxopr',
       'ProprietaryCost', 'DEarnings', 'avg_topic', 'avg_added', 'avg_removed',
       'avg_topic_ind', 'avg_added_ind', 'avg_removed_ind'],
      dtype='object')

In [41]:
# For H2 and H3: keep only the filings that have a disclosure average
if H in ('H2', 'H3'):
    stat_data = stat_data.dropna(subset=['avg_topic'])
#     stat_data = stat_data[stat_data.groupby("CIK")['ryear'].transform('count')>1]

In [55]:
# Display the (rows, columns) shape of the final dataset
stat_data.shape

(27849, 66)

In [56]:
# Export the final dataset; the model tag is part of the file name.
# Forward slashes keep the path portable and avoid the invalid "\s" escape.
stat_data.to_csv(f'Data/stats_data_{model}.csv', index=False)

datadate    >>> reporting date\
at          >>> Total assets (m)\
teq (seq)   >>> Total equity\
lt          >>> Total liabilities\
dt          >>> Total debt\
act         >>> Total current assets\
lct         >>> Total current liabilities\
ebit        >>> Earnings Before Interest and Taxes\
ebitda      >>> Earnings Before Interest, Taxes, Depreciation and Amortization\
ni          >>> Net Income\
revt        >>> Total revenue\
mkvalt      >>> Total market value\
intan       >>> Intangible assets\
xrd         >>> R&D expenses\
xt          >>> Total expenses\
xopr        >>> Total operating expenses

In [None]:
# Get CIKs per ticker and SIC
firm_info_df = pd.read_csv("firm_info.csv")
firm_info_df["CIK"] = firm_info_df["CIK"].astype(int)

# The 'tickers' column holds a list written as text: strip the brackets and
# quotes, turn blank entries into NaN, then split on commas
raw_tickers = firm_info_df.set_index(["CIK", "SIC"])['tickers']
ticker_text = raw_tickers.str.strip('[]').str.replace("'", "")
ticker_text = ticker_text.replace(r'^\s*$', np.nan, regex=True)
ticker_lists = ticker_text.str.split(",").dropna()

# One row per (CIK, SIC, ticker)
tickers = ticker_lists.explode().reset_index()