In [1]:
# Import liberaries and functions
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import gc
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load BoardEX data
compo = pd.read_csv("Data/Board-Composition.csv", parse_dates=['AnnualReportDate']).drop(columns='Ticker').drop_duplicates()

committees = pd.read_csv(
    "Data/BoardEx_Committees.csv", parse_dates=['AnnualReportDate']
).drop_duplicates().reset_index(drop=True)

In [3]:
# If director is in a risk committee
committees['RiskCommittee'] = committees['CommitteeName'].str.contains(r"risk", case=False).astype(int)

# Only directors that are in risk committee (some directors are in multiple committees - to remove duplicates)
Risk_committee = committees.loc[
    committees['RiskCommittee']==1,
    ['AnnualReportDate', 'RiskCommittee', 'BoardID', 'DirectorID']
].drop_duplicates().reset_index(drop=True)

compo = pd.merge(
    left=compo,
    right=Risk_committee,
    on=['AnnualReportDate', 'BoardID', 'DirectorID'],
    how='left'
).fillna({'RiskCommittee': 0})

In [4]:
compo['Year'] = pd.to_datetime(compo['AnnualReportDate']).dt.year

compo['ED'] = (compo['NED']=='No').astype(int)

# Calculate number of DirectorID shared with other BoardID per year
link_df = pd.merge(
    left=compo[['BoardID', 'DirectorID', 'Year', 'CIKCode', 'ED', 'RiskCommittee']],
    right=compo[['DirectorID', 'Year', 'BoardID', 'CIKCode', 'ED', 'RiskCommittee']],
    on=['DirectorID', 'Year'],
    how='outer',
    suffixes=["", "_lnkd"]
)

# if the shared Dir is ED or RiskCommittee in either one of linked firms
link_df['ED'] = link_df[['ED', 'ED_lnkd']].max(axis=1)
link_df['RiskCommittee'] = link_df[['RiskCommittee', 'RiskCommittee_lnkd']].max(axis=1)

link_df.drop(columns=['ED_lnkd', 'RiskCommittee_lnkd'], inplace=True)

link_df = link_df[link_df['BoardID']!=link_df['BoardID_lnkd']]

link_df = link_df.dropna(subset=['CIKCode', 'CIKCode_lnkd']).drop_duplicates().reset_index(drop=True)

# Link age
link_df = link_df.sort_values(['BoardID', 'BoardID_lnkd', 'Year']).reset_index(drop=True)
link_df['LinkTime'] = link_df.drop_duplicates(subset=['BoardID', 'BoardID_lnkd', 'Year']).groupby(['BoardID', 'BoardID_lnkd'])['Year'].cumcount()+1
link_df['LinkTime'] = link_df['LinkTime'].ffill()

# List of linked firms
lnkdCIKs = link_df.groupby(['CIKCode'])['CIKCode_lnkd'].agg(lambda x: list(set(x)))

## Matching

In [5]:
firm_data = pd.read_csv('Data/Study2_data1_V2.csv', parse_dates=['report_dt', 'filing_dt'])
firm_data.columns

Index(['CIK', 'report_dt', 'filing_dt', 'FF', 'rf_length', 'SIC3',
       'Specificity', 'Pa', 'Pr', 'Fu', 'Sentiment', 'FOG', 'Delta_length',
       'reported', 'repeated', 'added', 'removed', 'rfGap', 'fyear', 'ryear',
       'added+1', 'removed+1', 'COUNT_WEAK', 'Big4', 'TimeInCo', 'NoQuals',
       'GenderRatio', 'NationalityMix', 'NumberDirectors', 'NetworkSize',
       'TotCurrBrd', 'Age', 'ShrdDir', 'LnkdFirm', 'ShrdED', 'ShrdRiskDir',
       'FinLink', 'Degree', 'LinkTime', 'D_NumberDirectors', 'D_ShrdDir',
       'D_LnkdFirm', 'D_ShrdED', 'D_ShrdRiskDir', 'D_Degree', 'Volatility+30',
       'Volatility_30', 'Volatility+60', 'Volatility_120', 'SHRTURN',
       'Beta_126', 'NUMBEROFANALYSTS', 'rmonth', 'cik', 'DtA', 'ROE', 'NPM',
       'mkvalt', 'logMC', 'at', 'logTA', 'INTtA', 'Current', 'TobinQ', 'BtM',
       'RDxopr', 'ProprietaryCost', 'IndVol_', 'IndVol+', 'InstOwnership'],
      dtype='object')

In [6]:
firm_data.dropna(subset=[
    'added', 'removed', 'Delta_length', 'Beta_126', 'Volatility_120', 'logTA', 'ROE', 'DtA', 'Current', 'BtM',
    'rfGap', 'GenderRatio', 'NUMBEROFANALYSTS', 'NumberDirectors', 'RDxopr'
], inplace=True)

firm_data = firm_data[firm_data.groupby('CIK')['ryear'].transform('nunique')>1].reset_index(drop=True)

match_cols = [
    'Volatility_120', 'Beta_126', 'logTA', 'ROE', 'DtA', 'Current', 'RDxopr', 'BtM', 'rfGap', 'COUNT_WEAK', 'Big4',
    'GenderRatio', 'NUMBEROFANALYSTS', 'NumberDirectors', 'IndVol_'
]

In [7]:
firm_data.fillna(dict((c,0) for c in match_cols), inplace=True)

# firm_data.fillna({'NPM': firm_data.groupby('CIK')['NPM'].transform('mean')}, inplace=True)

firm_data.shape

(45400, 70)

In [8]:
# Normalize variable for matching
firm_data[match_cols] = (firm_data[match_cols] - firm_data[match_cols].mean())/firm_data[match_cols].std()

# Winsorize outliers
firm_data[match_cols] = firm_data[match_cols].map(lambda x: (3 if x>3 else x) if x>-3 else -3)

In [9]:
# For each year and CIK find the best match
knn = NearestNeighbors(n_neighbors=1)

for yr in tqdm(firm_data['ryear'].unique()):
    # Firms in fiscal year yr
    df1 = firm_data[firm_data['ryear']==yr]
    
    # Find a match for each cik in year yr
    for cik in df1['CIK'].unique():
        try:
            # all firms except cik and those it is connected with in the whole sample
            knn_data = df1.loc[~df1['CIK'].isin([cik, *lnkdCIKs[cik]]), match_cols]
            knn.fit(knn_data)
            
            # Find KNN for cik in the pool of knn_data
            neighs = knn.kneighbors(df1.loc[df1['CIK']==cik, match_cols], return_distance=False)
            matched_cik = df1.loc[knn_data.index[neighs.reshape(1,-1)[0]]]['CIK']

            firm_data.loc[(firm_data["CIK"]==cik)&(firm_data['ryear']==yr), 'matched_cik'] = matched_cik.values

        except:
            continue

  0%|          | 0/18 [00:00<?, ?it/s]

## Shared RFs

H: Firm A discloses (new) RF in year t, it connects with B in t+1 that has disclosed RF in year t

In [10]:
# Load RF data 
topics_df = pd.read_csv("Data/RDdf_T2V5.csv", parse_dates=['report_dt', 'filing_dt'])

topics_df['NERs'] = topics_df['NERs'].str.replace(pat=" ", repl="").str.findall(pat=r"'(.*?)'")

NE_labels = ['PERSON', 'NORP' 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'LAW', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY']
topics_df['Specificity'] = topics_df['NERs'].apply(lambda NERs: len([ne for ne in NERs if ne in NE_labels]))

topics_df['SIC3'] = topics_df['SIC'].map(lambda x: f"{int(x):04d}"[:3])

print(topics_df.shape)
topics_df.columns

(3245397, 24)


Index(['Topic', 'Score', 'Topic_H', 'Score_H', 'CIK', 'report_dt', 'filing_dt',
       'rf_seq', 'ticker', 'filerCIK', 'rf_length', 'NERs', 'Pa', 'Pr', 'Fu',
       'Sentiment', 'FOG', 'SIC', 'FF', 'ryear', 'fyear', 'rf_seq_count',
       'Specificity', 'SIC3'],
      dtype='object')

In [11]:
# Risk topics disclosed and not disclosed per report 
disc_df = pd.pivot_table(
    topics_df, index = ["CIK", "filing_dt", "report_dt", "FF"], 
    columns='Topic_H', values='Score'
).notna().astype(int).reset_index()

disc_df.sort_values(['CIK', 'filing_dt', 'report_dt'], inplace=True)

disc_df['ryear'] = disc_df["report_dt"].dt.year

# Drop firm-year observations with more than 1 report in one fiscal year
disc_df.drop_duplicates(subset=disc_df.columns.difference(["filing_dt", "report_dt"]), keep="first", inplace=True)

In [12]:
# Firms with at least 4 years of observation for DID [-2, 2]
disc_df = disc_df[disc_df.groupby('CIK')['ryear'].transform('nunique')>3]

# Drop disclosure data with missing control variables
disc_df = disc_df[disc_df['CIK'].isin(firm_data['CIK'].unique())].reset_index(drop=True)

# # Firms that have links with other firms
# disc_df = disc_df[disc_df['CIK'].isin(lnkdCIKs.index)]

In [13]:
# duplicated ryears with report at the begining of the year
disc_df['ryear_dupd'] = disc_df.duplicated(subset=['CIK', 'ryear'], keep='last')

disc_df["ryear-1"] = disc_df.groupby('CIK')['ryear'].shift(1)

# change ryear if duplicated and there is a gap between two report years 
disc_df['ryear'] = disc_df[['ryear_dupd', 'ryear', 'ryear-1']].apply(
    lambda x: x['ryear']-1 if x['ryear_dupd'] and x['ryear']-1>x['ryear-1'] else x['ryear'],
    axis=1
)

disc_df = disc_df\
    .drop_duplicates(subset=['CIK', 'ryear'], keep='first')\
        .reset_index(drop=True).drop(columns=['ryear_dupd', 'ryear-1'])

In [14]:
# Number of shared RFs
Years = disc_df['ryear'].unique()

shared_disc_list = []

for yr in tqdm(Years):
    df1 = disc_df[disc_df['ryear']==yr].copy()

    # Caculate the number of shared RFs as the matrix multiplication of disclosure matrix by itself
    # Only keep the upper triangle of product
    # Add 1 to product values so only elements below diagonal is zero
    a = df1.loc[:, range(105)].to_numpy()
    df2 = pd.DataFrame(
        np.triu(np.matmul(a, a.T)+1, k=1), index=df1['CIK'], columns=df1[['CIK', "report_dt"]]
    ).reset_index()

    df2["report_dt"] = df1["report_dt"].values

    df3 = pd.melt(df2, id_vars=['CIK', "report_dt"], value_name='SharedRF')

    df3 = df3[df3['SharedRF']>0]

    df3['ryear'] = yr

    shared_disc_list.append(df3)


  0%|          | 0/18 [00:00<?, ?it/s]

In [15]:
shared_disc_df = pd.concat(shared_disc_list)

# Subtract the 1 added to np.matmul
shared_disc_df['SharedRF'] = shared_disc_df['SharedRF'] - 1

shared_disc_df['CIK_pair'] = shared_disc_df['variable'].apply(lambda x: x[0])
shared_disc_df['report_dt_pair'] = shared_disc_df['variable'].apply(lambda x: x[1])

shared_disc_df.drop(columns='variable', inplace=True)

In [16]:
del df1
del df2
del df3
del shared_disc_list

gc.collect()

20

In [17]:
# Number of shared Dir between 2 individual firms in a specific year
shared_disc_df['NoSharedDir'] = pd.merge(
    left=shared_disc_df,
    right=link_df.groupby(['CIKCode', 'Year', 'CIKCode_lnkd'])['DirectorID'].nunique(),
    left_on=['CIK', 'ryear', 'CIK_pair'],
    right_index=True,
    how='left'
)['DirectorID']

# Check if CIK pairs are linked
shared_disc_df['Linked'] = shared_disc_df['NoSharedDir'].notna().astype(int)

shared_disc_df = shared_disc_df[shared_disc_df['CIK']!=shared_disc_df['CIK_pair']]

shared_disc_df['Treatment'] = (shared_disc_df.groupby(['CIK', 'CIK_pair'])['Linked'].transform('max')>0)

In [18]:
# For each pair of CIKs in the treatment group, find the mathched CIK
df1 = shared_disc_df.loc[shared_disc_df['Treatment'], ['CIK', 'CIK_pair', 'ryear']].copy()
df2 = firm_data[['CIK', 'ryear', 'matched_cik']].dropna().copy()

# matched CIK for CIK
df3 = pd.merge(
    left=df1,
    right=df2,
    left_on=['CIK', 'ryear'],
    right_on=['CIK', 'ryear'],
    how='left'
)

# matched CIK for CIK_pair
df3 = pd.merge(
    left=df3,
    right=df2,
    left_on=['CIK_pair', 'ryear'],
    right_on=['CIK', 'ryear'],
    how='left',
    suffixes=['', '_2']
).drop(columns='CIK_2')


# Check if the pair of matched CIKs are actually linked in the sample
df3['match_linked'] = pd.merge(
    left=df3,
    right=link_df.groupby(['CIKCode', 'Year', 'CIKCode_lnkd'])['DirectorID'].nunique(),
    left_on=['matched_cik', 'ryear', 'matched_cik_2'],
    right_on=['CIKCode', 'Year', 'CIKCode_lnkd'],
    how='left'
)['DirectorID'].notna()

# Keep matched CIK pairs that are not linked in the sample
df3 = df3[~df3['match_linked']].dropna()

# Drop pairs that cannot be matched (to reduce size of shared_disc_df)
shared_disc_df = shared_disc_df[
    (shared_disc_df['Treatment'])
    |((shared_disc_df['CIK'].isin(df3['matched_cik']))&(shared_disc_df['CIK_pair'].isin(df3['matched_cik_2'])))
    |((shared_disc_df['CIK'].isin(df3['matched_cik_2']))&(shared_disc_df['CIK_pair'].isin(df3['matched_cik'])))]

gc.collect()

0

In [19]:
# From shared_disc_df, only keep obs that are either in treatment or matched pairs
# If pair of CIK and CIK_pair is a matched CIK pair 
# df3['match_linked'] is all False. notna()=True means matched but not linked
shared_disc_df['match_linked'] = pd.merge(
    left=shared_disc_df,
    right=df3[['ryear', 'matched_cik', 'matched_cik_2', 'match_linked']].astype(int),
    left_on=['CIK', 'CIK_pair', 'ryear'],
    right_on=['matched_cik', 'matched_cik_2', 'ryear'],
    how='left'
)['match_linked'].notna()

shared_disc_df['match_linked_2'] = pd.merge(
    left=shared_disc_df,
    right=df3[['ryear', 'matched_cik', 'matched_cik_2', 'match_linked']],
    left_on=['CIK', 'CIK_pair', 'ryear'],
    right_on=['matched_cik_2', 'matched_cik', 'ryear'],
    how='left',
    suffixes=['', '_2']
)['match_linked_2'].notna()

refined_shared_disc_df = shared_disc_df[
    (shared_disc_df['Treatment'])
    |(shared_disc_df['match_linked'])
    |(shared_disc_df['match_linked_2'])
].reset_index(drop=True)

In [20]:
# If the shared director is ED or Risk Committee in any of the linked firms
refined_shared_disc_df = pd.merge(
    left=refined_shared_disc_df,
    right=link_df.groupby(['CIKCode', 'CIKCode_lnkd', 'Year'])[['ED', 'RiskCommittee', 'LinkTime']].max(),
    left_on=['CIK', 'CIK_pair', 'ryear'],
    right_index=True,
    how='left'
)

refined_shared_disc_df.drop(columns=['match_linked', 'match_linked_2'], inplace=True)

In [21]:
refined_shared_disc_df.shape

(336081, 12)

In [22]:
refined_shared_disc_df['Linked'].value_counts()

Linked
0    258555
1     77526
Name: count, dtype: int64

In [23]:
refined_shared_disc_df['Treatment'].value_counts()

Treatment
True     201118
False    134963
Name: count, dtype: int64

In [24]:
del shared_disc_df
del firm_data

gc.collect()

0

In [25]:
refined_shared_disc_df.sort_values(['CIK', 'CIK_pair', 'ryear'], inplace=True)

# Year when the event happens for treated pairs
refined_shared_disc_df['eventX'] = refined_shared_disc_df[refined_shared_disc_df['LinkTime']==1]['ryear']
refined_shared_disc_df['eventX'] = refined_shared_disc_df.groupby(['CIK', 'CIK_pair'])['eventX'].bfill()

# If firms are disconected after a couple of years, treat as control group
refined_shared_disc_df['eventX'] = refined_shared_disc_df['eventX'].fillna(refined_shared_disc_df[refined_shared_disc_df['Linked']==1].groupby(['CIK', 'CIK_pair'])['eventX'].ffill())

# Merge data

In [26]:
firm_data = pd.read_csv('Data/Study2_data1_V2.csv', parse_dates=['report_dt', 'filing_dt'])
firm_data.dropna(subset=[
    'added', 'removed', 'Delta_length', 'Beta_126', 'Volatility_120', 'logTA', 'ROE', 'DtA', 'Current', 'BtM',
    'rfGap', 'GenderRatio', 'NUMBEROFANALYSTS', 'NumberDirectors', 'RDxopr'
], inplace=True)

firm_data = firm_data[firm_data.groupby('CIK')['ryear'].transform('nunique')>1].reset_index(drop=True)

firm_data.drop(
    columns=['ryear', 'LinkTime', 'Degree', 'D_Degree', 'rmonth', 'cik', 'mkvalt', 'logMC', 'ProprietaryCost', 'IndVol+'],
    inplace=True
)

firm_data.shape

(45400, 60)

In [27]:
Study2_data3 = pd.merge(
    left=refined_shared_disc_df,
    right=firm_data,
    on=["CIK", "report_dt"],
    how="left"
)

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=firm_data,
    left_on=["CIK_pair", "report_dt_pair"],
    right_on=["CIK", "report_dt"],
    how="left",
    suffixes=['', '_2']
).drop(columns=['CIK_2', 'report_dt_2'])

In [28]:
cols = ['added', 'removed', 'Delta_length', 'Beta_126', 'Volatility_120', 'logTA', 'ROE', 'DtA', 'Current', 'BtM',
        'rfGap', 'GenderRatio', 'NUMBEROFANALYSTS', 'NumberDirectors', 'RDxopr']

Study2_data3.dropna(subset=cols, inplace=True)
Study2_data3.dropna(subset=[f"{c}_2" for c in cols], inplace=True)

In [29]:
Study2_data3['CIKpair_ID'] = Study2_data3[['CIK', 'CIK_pair']].astype(str).apply(lambda x: '-'.join(x), axis=1)
Study2_data3.columns

Index(['CIK', 'report_dt', 'SharedRF', 'ryear', 'CIK_pair', 'report_dt_pair',
       'NoSharedDir', 'Linked', 'Treatment', 'ED',
       ...
       'at_2', 'logTA_2', 'INTtA_2', 'Current_2', 'TobinQ_2', 'BtM_2',
       'RDxopr_2', 'IndVol__2', 'InstOwnership_2', 'CIKpair_ID'],
      dtype='object', length=130)

In [30]:
Study2_data3.shape

(267235, 130)

In [31]:
Study2_data3.to_csv('Data/Study2_data3_V2.csv', index=False)

In [None]:
Study2_data3 = pd.merge(
    left=shared_disc_df,
    right=agg_tops,
    on=["CIK", "report_dt"],
    how="left"
)

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=agg_tops,
    left_on=["CIK_pair", "report_dt_pair"],
    right_on=["CIK", "report_dt"],
    how="left",
    suffixes=['', '_2']
).drop(columns=['CIK_2', 'report_dt_2'])

In [None]:
Study2_data3 = pd.merge(
    left=shared_disc_df,
    right=agg_tops,
    on=["CIK", "report_dt"],
    how="left"
)

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=agg_tops,
    left_on=["CIK_pair", "report_dt_pair"],
    right_on=["CIK", "report_dt"],
    how="left",
    suffixes=['', '_2']
).drop(columns=['CIK_2', 'report_dt_2'])

In [None]:
cols = ['Age', 'GenderRatio', 'NationalityMix']

# Fill missing values with previous year (if any)
compo_sum[cols] = compo_sum.groupby('CIKCode')[cols].ffill(limit=1)

# Fill missing values with next year (if any)
compo_sum[cols] = compo_sum.groupby('CIKCode')[cols].bfill(limit=1)

In [None]:
Study2_data3 = pd.merge(
    left=Study2_data3,
    right=compo_sum,
    left_on=['CIK', 'ryear'],
    right_on=['CIKCode', 'Year'],
    how='left'
).drop(columns=['CIKCode', 'Year'])

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=compo_sum,
    left_on=['CIK', 'ryear'],
    right_on=['CIKCode', 'Year'],
    how='left',
    suffixes=['', '_2']
).drop(columns=['CIKCode', 'Year'])

In [None]:
Study2_data3 = pd.merge(
    left=Study2_data3,
    right=std_returns['Volatility_120'],
    left_on=["CIK", "filing_dt"],
    right_index=True,
    how="left"
)

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=MA_BA['Spread_120'],
    left_on=["CIK", "filing_dt"],
    right_index=True,
    how="left"
)

In [None]:
Study2_data3 = pd.merge(
    left=Study2_data3,
    right=std_returns['Volatility_120'],
    left_on=["CIK_pair", "filing_dt_2"],
    right_index=True,
    how="left",
    suffixes=['', '_2']
)

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=MA_BA['Spread_120'],
    left_on=["CIK_pair", "filing_dt_2"],
    right_index=True,
    how="left",
    suffixes=['', '_2']
)

In [None]:
Study2_data3 = pd.merge(
    left=Study2_data3,
    right=Beta['Beta_120'],
    left_on=["CIK", "filing_dt"],
    right_index=True,
    how="left"
)

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=Beta['Beta_120'],
    left_on=["CIK_pair", "filing_dt_2"],
    right_index=True,
    how="left",
    suffixes=['', '_2']
)

In [None]:
Study2_data3["ryear"] = Study2_data3['report_dt'].dt.year
Study2_data3["rmonth"] = Study2_data3['report_dt'].dt.month
Study2_data3["rmonth_pair"] = Study2_data3['report_dt_pair'].dt.month

financials = financials[[
    'cik', 'ryear', 'rmonth', 'DtA', 'ROE', 'NPM', 'mkvalt',
    'at', 'INTtA', 'Current', 'BtM', 'RDxopr'
]]

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=financials,
    left_on=["CIK", "ryear", "rmonth"],
    right_on=["cik", "ryear", "rmonth"],
    how="left"
).drop(columns=['cik'])

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=financials,
    left_on=["CIK_pair", "ryear", "rmonth_pair"],
    right_on=["cik", "ryear", "rmonth"],
    how="left",
    suffixes=['', '_2']
).drop(columns=['cik'])

In [None]:
Owner['ryear'] = Owner['Date'].dt.year
Owner['rmonth'] = Owner['Date'].dt.month

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=Owner,
    left_on=["CIK", "ryear", "rmonth"],
    right_on=["Instrument", "ryear", "rmonth"],
    how="left"
).drop(columns=['Instrument', 'Date'])

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=Owner,
    left_on=["CIK", "ryear", "rmonth_pair"],
    right_on=["Instrument", "ryear", "rmonth"],
    how="left",
    suffixes=['', '_2']
).drop(columns=['Instrument', 'Date'])

In [None]:
Study2_data3 = pd.merge(
    left=Study2_data3,
    right=Analyst_df[["CIK", "filing_dt", 'NUMBEROFANALYSTS']],
    on=["CIK", "filing_dt"],
    how="left"
)

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=Analyst_df[["CIK", "filing_dt", 'NUMBEROFANALYSTS']],
    left_on=["CIK_pair", "filing_dt_2"],
    right_on=["CIK", "filing_dt"],
    how="left",
    suffixes=['', '_2']
)

In [None]:
Study2_data3.drop(columns=['rmonth_2', 'rmonth', 'rmonth_pair', 'CIK_2',
                           'report_dt', 'report_dt_pair', 
                           'filing_dt', 'filing_dt_2'], inplace=True)

# Study2_data3 = Study2_data3.loc[:, ~Study2_data3.columns.duplicated()]
Study2_data3 = Study2_data3.sort_values(['CIK', 'CIK_pair', 'ryear']).reset_index(drop=True)

In [None]:
shared_disc_df[shared_disc_df.groupby(['CIK', 'CIK_pair'])['NoSharedDir'].transform('sum')>2]

In [None]:
Study2_data3[Study2_data3.groupby(['CIK', 'CIK_pair'])['Linked'].transform('sum')>1]

In [None]:
Study2_data3.groupby('CIKpair_ID')['Linked'].sum()

In [None]:
# Determine the risk topics disclosed by every CIK pairs in avery year
Years = disc_df['ryear'].unique()

# Every element belongs to a year
shrd_disc_list = []

for yr in tqdm(Years):
    df1 = disc_df[disc_df['ryear']==yr]

    a = df1.loc[:, range(0,95)].to_numpy()

    idx = df1["CIK"].nunique()

    df2 = pd.DataFrame(
        [[np.where((a[i,:]==1)&(a[j,:]==1))[0] for j in range(idx)[:i+1]] for i in range(idx)], 
        columns=df1["CIK"], index=df1["CIK"]
    )
    
    df2.columns.name = 'CIK_pair'
    df2["report_dt"] = df1["report_dt"].values

    shrd_disc_list.append(df2)
    
shared_disc_df = pd.concat([pd.melt(df.reset_index(), id_vars=["CIK", "report_dt"], value_name='SharedRF').dropna() for df in shrd_disc_list])