In [1]:
# Import libraries and functions
import pandas as pd
import numpy as np
import re

# Risk disclosure

In [2]:
# Read the risk-factor (RF) table; both date columns are parsed as datetimes
topics_df = pd.read_csv(
    "Data/RDdf_T2V5.csv",
    parse_dates=['report_dt', 'filing_dt'],
)

# NERs is stored as text: remove the spaces, then collect every single-quoted
# token into a list (one list per row)
topics_df['NERs'] = (
    topics_df['NERs']
    .str.replace(pat=" ", repl="")
    .str.findall(pat=r"'(.*?)'")
)

# Entity labels that count towards the Specificity score.
# Every label needs its own comma: Python concatenates adjacent string literals,
# so a missing comma silently merges two labels into one that never matches.
NE_labels = ['PERSON', 'NORP', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'LAW', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY']
# Specificity = number of a row's extracted entity tags that appear in NE_labels
topics_df['Specificity'] = topics_df['NERs'].apply(
    lambda tags: sum(1 for tag in tags if tag in NE_labels)
)

# SIC3 = first three characters of the SIC code after zero-padding it to 4 digits
topics_df['SIC3'] = topics_df['SIC'].map(
    lambda code: "{:04d}".format(int(code))[:3]
)

print(topics_df.shape)
topics_df.columns

(3245397, 24)


Index(['Topic', 'Score', 'Topic_H', 'Score_H', 'CIK', 'report_dt', 'filing_dt',
       'rf_seq', 'ticker', 'filerCIK', 'rf_length', 'NERs', 'Pa', 'Pr', 'Fu',
       'Sentiment', 'FOG', 'SIC', 'FF', 'ryear', 'fyear', 'rf_seq_count',
       'Specificity', 'SIC3'],
      dtype='object')

In [3]:
# Number of distinct hierarchical topics
N = topics_df['Topic_H'].nunique()

# Wide 0/1 table: one row per report, one column per topic; a cell is 1 when the
# report has a (mean) Score for that topic and 0 otherwise
disc_df = (
    topics_df
    .pivot_table(
        index=["CIK", "filing_dt", "report_dt", "FF"],
        columns='Topic_H',
        values='Score',
    )
    .notna()
    .astype(int)
    .reset_index()
)

# Long format: one row per report and topic, ordered within firm/topic by date
disc_long = disc_df.melt(
    id_vars=["CIK", "filing_dt", "report_dt", "FF"],
    value_name='Disclosed',
)
disc_long = disc_long.sort_values(["CIK", 'Topic_H', "filing_dt", "report_dt"])

# Running count of disclosures of each topic by each firm
disc_long['DiscSum'] = (
    disc_long
    .groupby(["CIK", 'Topic_H'])['Disclosed']
    .cumsum()
)

# Total number of risk topics disclosed in the report
disc_long['TotalRFs'] = (
    disc_long
    .groupby(["CIK", "filing_dt", "report_dt"])['Disclosed']
    .transform('sum')
)

# Change of each topic flag versus the firm's previous row in disc_df
# (NaN where the firm has no previous row)
disc_diff = disc_df.filter(range(N)).sub(
    disc_df.groupby("CIK")[disc_df.filter(range(N)).columns].shift(1)
)

# Repeated risk factors: current flag + previous flag equals 2
disc_repeat = (
    disc_df.filter(range(N))
    .add(disc_df.groupby("CIK")[disc_df.filter(range(N)).columns].shift(1))
    .eq(2)
    .astype(int)
)

# Whether the risk factor was disclosed in the previous year's report.
# The shift runs on one row per (CIK, Topic_H, filing_dt); the result is assigned
# back by index label, so rows removed as duplicates start out as NaN ...
disc_long['LstYrDisc'] = disc_long.drop_duplicates(subset=['CIK', 'Topic_H', 'filing_dt']).groupby(['CIK', 'Topic_H'])['Disclosed'].shift(1)
# ... and are filled here from the preceding row of the same (CIK, Topic_H) group.
# The first row of each group has nothing before it and stays NaN.
disc_long['LstYrDisc'] = disc_long.groupby(['CIK', 'Topic_H'])['LstYrDisc'].ffill()

# Generate added, repeated and removed dummies.
# Comparisons against a NaN LstYrDisc are False, so all three dummies are 0 there.
disc_long['New'] = ((disc_long['LstYrDisc']==0)&(disc_long['Disclosed']==1)).astype(int)
disc_long['Removed'] = ((disc_long['LstYrDisc']==1)&(disc_long['Disclosed']==0)).astype(int)
disc_long['Repeated'] = ((disc_long['LstYrDisc']==1)&(disc_long['Disclosed']==1)).astype(int)

# Whether the risk factor was newly disclosed in the previous year's report.
# NOTE(review): unlike LstYrDisc, this shift is taken over all rows of the group
# without de-duplicating on filing_dt first - confirm that is intended.
disc_long['LstYrNew'] = disc_long.groupby(['CIK', 'Topic_H'])['New'].shift(1)

In [4]:
# Keep a row only when New + Repeated equals Disclosed: a disclosed RF has to be
# flagged as either added or repeated, a non-disclosed RF as neither
disc_long = (
    disc_long
    .loc[lambda d: d[['New', 'Repeated']].sum(axis=1) == d['Disclosed']]
    .reset_index(drop=True)
)

# Days between the report date (fiscal year end) and the actual filing date
disc_long['rfGap'] = disc_long['filing_dt'].sub(disc_long['report_dt']).dt.days

# Calendar years of the filing and of the report
disc_long['fyear'] = disc_long["filing_dt"].dt.year
disc_long['ryear'] = disc_long["report_dt"].dt.year

In [5]:
# Summed rf_length per report and topic (per the original note: total number of
# words); a topic that is absent from a report gets length 0
length_df = (
    topics_df
    .pivot_table(
        index=["CIK", "filing_dt", "report_dt"],
        columns='Topic_H',
        values='rf_length',
        aggfunc='sum',
        fill_value=0,  # NA length means not disclosed so equal to 0
    )
    .reset_index()
)

# Long format, ordered within firm/topic by date
length_long = length_df.melt(
    id_vars=["CIK", "filing_dt", "report_dt"],
    value_name='RF_length',
)
length_long = length_long.sort_values(["CIK", "Topic_H", "filing_dt", "report_dt"])

# Length of the topic at the previous distinct filing date of the same firm;
# rows dropped as duplicates are filled from the preceding row of their group
length_long['length_1'] = (
    length_long
    .drop_duplicates(subset=['CIK', 'Topic_H', 'filing_dt'])
    .groupby(["CIK", "Topic_H"])['RF_length']
    .shift(1)
)
length_long['length_1'] = (
    length_long
    .groupby(['CIK', 'Topic_H'])['length_1']
    .ffill()
)

# Bring the length columns into the main long table
disc_long = disc_long.merge(
    length_long,
    on=['CIK', 'filing_dt', 'report_dt', 'Topic_H'],
    how='left',
)

### Industry disclosure

In [6]:
# Exclusive lower bound of the look-back window read by count_func:
# 52 weeks (364 days) before each filing date
disc_df["filing_dt-1"] = disc_df["filing_dt"] - pd.Timedelta(weeks=52)

def count_func(x):
    """
    Share of same-industry filings in the trailing window that disclose each RF.

    The window holds every ``disc_df`` row with the same ``FF`` as ``x`` whose
    ``filing_dt`` lies in ``(x["filing_dt-1"], x["filing_dt"]]``. Per topic
    column, the flags of the rows that belong to other CIKs are summed and
    divided by the number of rows in the whole window, so the focal firm's own
    rows are counted in the denominator only.

    Parameters
    ----------
    x : pd.Series
        One row of ``disc_df``; ``CIK``, ``FF``, ``filing_dt`` and
        ``filing_dt-1`` are read.

    Returns
    -------
    pd.Series
        One value per topic column selected by ``filter(range(N))``.
    """
    after_start = disc_df["filing_dt"].gt(x["filing_dt-1"])
    up_to_filing = disc_df["filing_dt"].le(x["filing_dt"])
    same_industry = disc_df['FF'].eq(x['FF'])
    window = disc_df[after_start & up_to_filing & same_industry]

    # Numerator: disclosures by every other firm inside the window
    peers = window[window['CIK'].ne(x['CIK'])]
    # Denominator: all filings inside the window, the focal firm's included
    n_filings = window["filing_dt"].count()

    return peers.filter(range(N)).sum() / n_filings

# Evaluate count_func for every report row and store the per-topic shares next
# to the identifying columns of disc_df
IndDisc_df = disc_df.drop(columns=range(N)).copy()
IndDisc_df.loc[:, range(N)] = disc_df.apply(count_func, axis=1)
IndDisc_df = IndDisc_df.drop(columns=['filing_dt-1'])

# Long format: one row per report and topic
Inddisc_long = IndDisc_df.melt(
    id_vars=["CIK", "filing_dt", "report_dt", "FF"],
    value_name='IndDisc',
)

# Keep only the (FF, Topic_H) pairs whose IndDisc values sum to more than zero,
# i.e. drop RFs that have never been disclosed in the industry
Inddisc_long = Inddisc_long[
    Inddisc_long.groupby(['FF', 'Topic_H'])['IndDisc'].transform('sum').gt(0)
]

In [7]:
# Look up the industry disclosure share of every report/topic row; only the
# IndDisc column of the merged frame is kept and assigned back by index label
disc_long['IndDisc'] = disc_long.merge(
    Inddisc_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left',
)['IndDisc']

# Rows without a match get an industry share of 0
disc_long = disc_long.fillna({'IndDisc': 0}).reset_index(drop=True)

In [8]:
# Industry disclosure change
# IndDisc_1 is the IndDisc value at the previous distinct filing_dt of the same
# (CIK, Topic_H) group. The shift runs on rows de-duplicated by
# (CIK, Topic_H, filing_dt) and is assigned back by index label, so rows removed
# as duplicates start out as NaN ...
disc_long['IndDisc_1'] = disc_long.drop_duplicates(subset=['CIK', 'Topic_H', 'filing_dt']).groupby(['CIK', 'Topic_H'])['IndDisc'].shift(1)
# ... and are filled here from the preceding row of their group
disc_long['IndDisc_1'] = disc_long.groupby(['CIK', 'Topic_H'])['IndDisc_1'].ffill()

# Linked firm disclosures

The independent variable shows how many of the firms linked to the focal firm in a specific year have disclosed the Added, Repeated or Removed risk factors in their corresponding annual reports.
For firm CIK in reporting year ryear, we take the firms it is linked to in the BoardEX year (Year) equal to ryear and count their disclosures in reporting year ryear. Within a specific reporting year, all firms are assumed to be exposed to similar risks, so it does not matter whether a firm files sooner or later.

In [9]:
# Load BoardEX board composition (Ticker column discarded, exact duplicates removed)
compo = (
    pd.read_csv("Data/Board-Composition.csv", parse_dates=['AnnualReportDate'])
    .drop(columns='Ticker')
    .drop_duplicates()
)

compo['Year'] = pd.to_datetime(compo['AnnualReportDate']).dt.year

# Pair every board seat with every other seat held by the same director in the
# same year, drop the pairs that point to the same board, and keep only pairs
# where both boards have a CIK code
link_df = (
    compo[['BoardID', 'DirectorID', 'Year', 'CIKCode']]
    .merge(
        compo[['DirectorID', 'Year', 'BoardID', 'CIKCode']],
        on=['DirectorID', 'Year'],
        how='outer',
        suffixes=["", "_lnkd"],
    )
    .loc[lambda pairs: pairs['BoardID'] != pairs['BoardID_lnkd']]
    .dropna(subset=['CIKCode', 'CIKCode_lnkd'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Link age: running count (starting at 1) of the distinct years in which a pair
# of boards is linked; additional rows of the same pair/year inherit the value
# of the row above through the forward fill
link_df = (
    link_df
    .sort_values(['BoardID', 'BoardID_lnkd', 'Year'])
    .reset_index(drop=True)
)
link_df['LinkTime'] = (
    link_df
    .drop_duplicates(subset=['BoardID', 'BoardID_lnkd', 'Year'])
    .groupby(['BoardID', 'BoardID_lnkd'])['Year']
    .cumcount()
    .add(1)
)
link_df['LinkTime'] = link_df['LinkTime'].ffill()

# List of linked firms per (CIKCode, Year)
lnkdCIKs = (
    link_df
    .groupby(['CIKCode', 'Year'])['CIKCode_lnkd']
    .agg(lambda linked: list(linked))
)

# Reporting year, used to match reports to BoardEX years
disc_df['ryear'] = disc_df['report_dt'].dt.year

LnkDisc: Number of linked firms that disclose each RT in the same ryear\
OldLnkNewDisc: Number of firms linked for more than 1 year that disclose new RT in the same ryear\
NewLnkLstyrDisc: Number of new linked firms that disclosed each RT in the last ryear\
NewLnkRepDisc: Number of new linked firms that repeat each RT in ryear (disclosed last year and this year)

In [10]:
def lnkd_disc_count(x):
    """
    Counts the number of linked firms that disclose a specific RF in year ryear.

    Parameters
    ----------
    x : pd.Series
        One report row; ``CIK`` and ``ryear`` are read.

    Returns
    -------
    pd.Series
        Per topic column (``filter(range(N))``), the sum of the disclosure flags
        over the ``disc_df`` rows of year ``x['ryear']`` whose CIK is listed in
        ``lnkdCIKs`` for ``(CIK, ryear)``. When ``lnkdCIKs`` has no entry for
        that firm-year, an all-NaN Series labelled ``0..N-1`` is returned.
    """
    try:
        linked = lnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]
    except KeyError:
        # No link information for this firm-year. Only the missing-key case is
        # treated as "unknown"; any other error is a real problem and must
        # surface. A NaN Series (not a scalar) keeps every row of
        # DataFrame.apply the same shape, like the sibling helpers.
        return pd.Series(np.nan, index=range(N))

    same_year = disc_df['ryear'] == x['ryear']
    return disc_df[disc_df['CIK'].isin(linked) & same_year].filter(range(N)).sum()

# Evaluate lnkd_disc_count for every report row of disc_df
LnkdDisc_df = disc_df.drop(columns=range(N)).copy()
LnkdDisc_df.loc[:, range(N)] = disc_df.apply(lnkd_disc_count, axis=1)
LnkdDisc_df = LnkdDisc_df.drop(columns=['filing_dt-1', 'ryear', 'FF'])

# Long format: one row per report and topic
LnkdDisc_long = LnkdDisc_df.melt(
    id_vars=["CIK", "filing_dt", "report_dt"],
    value_name='LnkDisc',
)


H: Connected firms A and B add new RF at the same time

In [11]:
# Linked CIKs per (CIKCode, Year), restricted to links with LinkTime above 1
OldlnkdCIKs = (
    link_df.loc[link_df['LinkTime'].gt(1)]
    .groupby(['CIKCode', 'Year'])['CIKCode_lnkd']
    .agg(lambda linked: list(linked))
)

# Report identifiers next to a 0/1 flag per topic that is 1 where disc_diff is
# positive (topic newly disclosed versus the firm's previous row)
disc_new_df = pd.concat(
    [
        disc_df[['CIK', 'filing_dt', 'report_dt', 'ryear']],
        disc_diff.gt(0).astype(int),
    ],
    axis=1,
)


In [12]:
def lnkd_new_count(x):
    """
    Counts the number of old linked firms that disclose a new RF in year ryear.

    Parameters
    ----------
    x : pd.Series
        One report row; ``CIK`` and ``ryear`` are read.

    Returns
    -------
    pd.Series
        Per topic column (``filter(range(N))``), the sum of the "new" flags over
        the ``disc_new_df`` rows of year ``x['ryear']`` whose CIK is listed in
        ``OldlnkdCIKs`` for ``(CIK, ryear)``. When ``OldlnkdCIKs`` has no entry
        for that firm-year, an all-NaN Series labelled ``0..N-1`` is returned.
    """
    try:
        old_links = OldlnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]
    except KeyError:
        # No old links recorded for this firm-year. Only the missing-key case is
        # treated as "unknown"; any other error must surface. A NaN Series (not
        # a scalar) keeps every row of DataFrame.apply the same shape.
        return pd.Series(np.nan, index=range(N))

    same_year = disc_new_df['ryear'] == x['ryear']
    return disc_new_df[disc_new_df['CIK'].isin(old_links) & same_year].filter(range(N)).sum()

# Evaluate lnkd_new_count for every report row of disc_df
LnkdRep_df = disc_df.drop(columns=range(N)).copy()
LnkdRep_df.loc[:, range(N)] = disc_df.apply(lnkd_new_count, axis=1)
LnkdRep_df = LnkdRep_df.drop(columns=['filing_dt-1', 'ryear', 'FF'])

# Long format: one row per report and topic
LnkdRep_long = LnkdRep_df.melt(
    id_vars=["CIK", "filing_dt", "report_dt"],
    var_name='Topic_H',
    value_name='OldLnkNewDisc',
)

H: Firm A connects with B in year t, it adds RFs disclosed by B in year t-1

In [13]:
# Linked CIKs per (CIKCode, Year), restricted to links in their first year
# (LinkTime equal to 1)
NewlnkdCIKs = (
    link_df.loc[link_df['LinkTime'].eq(1)]
    .groupby(['CIKCode', 'Year'])['CIKCode_lnkd']
    .agg(lambda linked: list(linked))
)

# Report identifiers next to each firm's topic flags from its previous row
disc_lstyr_df = pd.concat(
    [
        disc_df[['CIK', 'filing_dt', 'report_dt', 'ryear']],
        disc_df.groupby("CIK")[list(range(N))].shift(1),
    ],
    axis=1,
)


In [14]:
def New_lnkd_LstYr_count(x):
    """
    Counts the number of New linked firms that disclose a specific RF in year ryear-1.

    Parameters
    ----------
    x : pd.Series
        One report row; ``CIK`` and ``ryear`` are read.

    Returns
    -------
    pd.Series
        Per topic column (``filter(range(N))``), the sum of the lagged
        disclosure flags over the ``disc_lstyr_df`` rows of year ``x['ryear']``
        whose CIK is listed in ``NewlnkdCIKs`` for ``(CIK, ryear)``. When
        ``NewlnkdCIKs`` has no entry for that firm-year, an all-NaN Series
        labelled ``0..N-1`` is returned.
    """
    try:
        new_links = NewlnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]
    except KeyError:
        # No new links recorded for this firm-year. Only the missing-key case is
        # treated as "unknown"; any other error is a real problem and must
        # surface instead of being turned into NaN.
        return pd.Series(np.nan, index=range(N))

    same_year = disc_lstyr_df['ryear'] == x['ryear']
    return disc_lstyr_df[disc_lstyr_df['CIK'].isin(new_links) & same_year].filter(range(N)).sum()

# Evaluate New_lnkd_LstYr_count for every report row of disc_df
NewLnkdDisc_df = disc_df.drop(columns=range(N)).copy()
NewLnkdDisc_df.loc[:, range(N)] = disc_df.apply(New_lnkd_LstYr_count, axis=1)
NewLnkdDisc_df = NewLnkdDisc_df.drop(columns=['filing_dt-1', 'ryear', 'FF'])

# Long format: one row per report and topic
NewLnkdDisc_long = NewLnkdDisc_df.melt(
    id_vars=["CIK", "filing_dt", "report_dt"],
    value_name='NewLnkLstyrDisc',
)

In [None]:
# For additional analysis to check if a risk topic transferred from one firm to another is less specific
def lnkd_LstYr_count(x):
    """
    Counts the number of linked firms (all links, not only new ones) that
    disclose a specific RF in year ryear-1.

    Parameters
    ----------
    x : pd.Series
        One report row; ``CIK`` and ``ryear`` are read.

    Returns
    -------
    pd.Series
        Per topic column (``filter(range(N))``), the sum of the lagged
        disclosure flags over the ``disc_lstyr_df`` rows of year ``x['ryear']``
        whose CIK is listed in ``lnkdCIKs`` for ``(CIK, ryear)``. When
        ``lnkdCIKs`` has no entry for that firm-year, an all-NaN Series labelled
        ``0..N-1`` is returned.
    """
    try:
        linked = lnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]
    except KeyError:
        # No link information for this firm-year. Only the missing-key case is
        # treated as "unknown"; any other error is a real problem and must
        # surface instead of being turned into NaN.
        return pd.Series(np.nan, index=range(N))

    same_year = disc_lstyr_df['ryear'] == x['ryear']
    return disc_lstyr_df[disc_lstyr_df['CIK'].isin(linked) & same_year].filter(range(N)).sum()

# Evaluate lnkd_LstYr_count for every report row of disc_df
LnkLstyrDisc_df = disc_df.drop(columns=range(N)).copy()
LnkLstyrDisc_df.loc[:, range(N)] = disc_df.apply(lnkd_LstYr_count, axis=1)
LnkLstyrDisc_df = LnkLstyrDisc_df.drop(columns=['filing_dt-1', 'ryear', 'FF'])

# Long format: one row per report and topic
LnkLstyrDisc_long = LnkLstyrDisc_df.melt(
    id_vars=["CIK", "filing_dt", "report_dt"],
    value_name='LnkLstyrDisc',
)

In [15]:
# Report identifiers placed side by side with the per-topic "repeated" flags
# held in disc_repeat
disc_repeat_df = pd.concat([disc_df[['CIK', 'filing_dt', 'report_dt', 'ryear']], disc_repeat], axis=1)

def New_lnkd_repeat_count(x):
    """
    Counts the number of new linked firms that repeat a specific RF in year ryear.

    Parameters
    ----------
    x : pd.Series
        One report row; ``CIK`` and ``ryear`` are read.

    Returns
    -------
    pd.Series
        Per topic column (``filter(range(N))``), the sum of the "repeated" flags
        over the ``disc_repeat_df`` rows of year ``x['ryear']`` whose CIK is
        listed in ``NewlnkdCIKs`` for ``(CIK, ryear)``. When ``NewlnkdCIKs`` has
        no entry for that firm-year, an all-NaN Series labelled ``0..N-1`` is
        returned.
    """
    try:
        new_links = NewlnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]
    except KeyError:
        # No new links recorded for this firm-year. Only the missing-key case is
        # treated as "unknown"; any other error is a real problem and must
        # surface instead of being turned into NaN.
        return pd.Series(np.nan, index=range(N))

    same_year = disc_repeat_df['ryear'] == x['ryear']
    return disc_repeat_df[disc_repeat_df['CIK'].isin(new_links) & same_year].filter(range(N)).sum()

# Evaluate New_lnkd_repeat_count for every row of disc_repeat_df
NewLnkdRep_df = disc_repeat_df.drop(columns=range(N)).copy()
NewLnkdRep_df.loc[:, range(N)] = disc_repeat_df.apply(New_lnkd_repeat_count, axis=1)
NewLnkdRep_df = NewLnkdRep_df.drop(columns=['ryear'])

# Long format: one row per report and topic
NewLnkdRep_long = NewLnkdRep_df.melt(
    id_vars=["CIK", "filing_dt", "report_dt"],
    var_name='Topic_H',
    value_name='NewLnkRepDisc',
)

In [16]:
# Attach the four linked-firm measures to disc_long, one after the other.
# Each measure is looked up with a left merge on the report/topic key; only the
# measure column of the merged frame is kept and assigned back by index label.
for _measure, _measure_long in (
    ('LnkDisc', LnkdDisc_long),
    ('OldLnkNewDisc', LnkdRep_long),
    ('NewLnkLstyrDisc', NewLnkdDisc_long),
    ('NewLnkRepDisc', NewLnkdRep_long),
):
    disc_long[_measure] = pd.merge(
        left=disc_long,
        right=_measure_long,
        on=["CIK", "filing_dt", "report_dt", "Topic_H"],
        how='left'
    )[_measure]

## Merge data

In [17]:
# Firm-level data; the listed columns are dropped right after loading (several
# of them - FF, rfGap, fyear, ryear - already exist in disc_long).
# The path uses a forward slash like the other data files in this notebook: the
# previous 'Data\S...' spelling relied on an invalid escape sequence and only
# resolved on Windows.
FirmData = pd.read_csv("Data/Study2_data1_V2.csv", parse_dates=["filing_dt", "report_dt"])\
    .drop(columns=['FF', 'rf_length', 'SIC3', 'Delta_length', 'reported', 
                   'repeated', 'added', 'removed', 'rfGap', 'fyear', 'ryear', 'cik'])

In [18]:
# Sample restrictions, applied one after the other (each group statistic is
# computed on the rows that survived the previous step):
#   1. drop RTs never disclosed in the industry (IndDisc sums to 0 per FF/topic)
#   2. drop RTs the firm never disclosed (max DiscSum of 0 per CIK/topic)
#   3. keep only CIK/topic groups whose smallest DiscSum is below 2
#      (per the original note: drop RTs that are always disclosed)
#   4. drop rows without a previous-year value (first year of every firm)
disc_long = (
    disc_long
    .loc[lambda d: d.groupby(['FF', 'Topic_H'])['IndDisc'].transform('sum') > 0]
    .reset_index(drop=True)
    .loc[lambda d: d.groupby(['CIK', 'Topic_H'])['DiscSum'].transform('max') > 0]
    .loc[lambda d: d.groupby(['CIK', 'Topic_H'])['DiscSum'].transform('min') < 2]
    .dropna(subset=['LstYrDisc'])
)

In [19]:
# Add the firm-level columns to every report/topic row; rows of disc_long
# without a match in FirmData are kept
Study2_data = disc_long.merge(
    FirmData,
    on=['CIK', 'filing_dt', 'report_dt'],
    how="left",
)

In [20]:
# The exact dates are no longer needed once the merge is done
Study2_data = Study2_data.drop(columns=['filing_dt', 'report_dt'])
Study2_data.columns

Index(['CIK', 'FF', 'Topic_H', 'Disclosed', 'DiscSum', 'TotalRFs', 'LstYrDisc',
       'New', 'Removed', 'Repeated', 'LstYrNew', 'rfGap', 'fyear', 'ryear',
       'RF_length', 'length_1', 'IndDisc', 'IndDisc_1', 'LnkDisc',
       'OldLnkNewDisc', 'NewLnkLstyrDisc', 'NewLnkRepDisc', 'Specificity',
       'Pa', 'Pr', 'Fu', 'Sentiment', 'FOG', 'added+1', 'removed+1',
       'COUNT_WEAK', 'Big4', 'TimeInCo', 'NoQuals', 'GenderRatio',
       'NationalityMix', 'NumberDirectors', 'NetworkSize', 'TotCurrBrd', 'Age',
       'ShrdDir', 'LnkdFirm', 'ShrdED', 'ShrdRiskDir', 'FinLink', 'Degree',
       'LinkTime', 'D_NumberDirectors', 'D_ShrdDir', 'D_LnkdFirm', 'D_ShrdED',
       'D_ShrdRiskDir', 'D_Degree', 'Volatility+30', 'Volatility_30',
       'Volatility+60', 'Volatility_120', 'SHRTURN', 'Beta_126',
       'NUMBEROFANALYSTS', 'rmonth', 'DtA', 'ROE', 'NPM', 'mkvalt', 'logMC',
       'at', 'logTA', 'INTtA', 'Current', 'TobinQ', 'BtM', 'RDxopr',
       'ProprietaryCost', 'IndVol_', 'IndVol+', 'I

In [21]:
# Keep firms observed in more than one reporting year, then drop rows that lack
# Volatility_120 or NumberDirectors
Study2_data = (
    Study2_data
    .loc[lambda d: d.groupby('CIK')['ryear'].transform('nunique') > 1]
    .dropna(subset=['Volatility_120', 'NumberDirectors'])
)

In [22]:
# Size of the final sample as (rows, columns)
Study2_data.shape

(1260878, 77)

In [24]:
# Write the final sample to CSV without the row index
Study2_data.to_csv("Data/Study2_data2_V2.csv", index=False)

In [27]:
# Compare the observations in main sample and linked firms. 
# Fill the linked firms of missing observations in BoardEX with last year links
lnkdCIKs = pd.merge(
    left=disc_long[['CIK', 'ryear']].drop_duplicates(),
    right=lnkdCIKs,
    left_on=['CIK', 'ryear'],
    right_index=True,
    how='outer'
).set_index(['CIK', 'ryear']).groupby('CIK').ffill(limit=1)

In [None]:
# Stack the board-link rows on top of the raw board-composition rows (the latter
# have no BoardID_lnkd, so it is NaN for them after the concat)
df = pd.concat([link_df, compo[['BoardID', 'DirectorID', 'Year', 'CIKCode', 'ED', 'RiskCommittee']]])
df.sort_values(['BoardID', 'DirectorID', 'Year'], inplace=True)
# Keep a row when its (BoardID, DirectorID, Year, CIKCode) key occurs only once,
# or when it carries a linked board (BoardID_lnkd is not NaN)
df = df[(~df.duplicated(subset=['BoardID', 'DirectorID', 'Year', 'CIKCode'], keep=False))|(~df['BoardID_lnkd'].isna())]
# SharedDir = 1 when the row has a linked board, 0 otherwise
df['SharedDir'] = df['BoardID_lnkd'].notna().astype(int)
# Keep only board/director pairs that have at least one row with SharedDir = 1
df = df[df.groupby(['BoardID', 'DirectorID'])['SharedDir'].transform('max')>0]