In [1]:
# Import libraries and functions
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import gc
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
# Load BoardEX data
compo = pd.read_csv("Data/Board-Composition.csv", parse_dates=['AnnualReportDate']).drop(columns='Ticker').drop_duplicates()

committees = pd.read_csv(
    "Data/BoardEx_Committees.csv", parse_dates=['AnnualReportDate']
).drop_duplicates().reset_index(drop=True)

In [3]:
# Flag committee rows whose name mentions "risk" (case-insensitive).
# na=False maps a missing CommitteeName to False; without it str.contains
# returns NaN for those rows and the cast to int raises.
committees['RiskCommittee'] = (
    committees['CommitteeName'].str.contains(r"risk", case=False, na=False).astype(int)
)

# Only directors that are in risk committee (some directors are in multiple committees - to remove duplicates)
Risk_committee = (
    committees.loc[
        committees['RiskCommittee'] == 1,
        ['AnnualReportDate', 'RiskCommittee', 'BoardID', 'DirectorID']
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Left join keeps every board-composition row; rows without a matching
# risk-committee record get RiskCommittee = 0.
compo = pd.merge(
    left=compo,
    right=Risk_committee,
    on=['AnnualReportDate', 'BoardID', 'DirectorID'],
    how='left'
).fillna({'RiskCommittee': 0})

In [4]:
# Flag committee rows whose name mentions "audit" (case-insensitive).
# na=False maps a missing CommitteeName to False; without it str.contains
# returns NaN for those rows and the cast to int raises.
committees['AuditCommittee'] = (
    committees['CommitteeName'].str.contains(r"audit", case=False, na=False).astype(int)
)

# Only directors that are in audit committee (some directors are in multiple committees - to remove duplicates)
Audit_committee = (
    committees.loc[
        committees['AuditCommittee'] == 1,
        ['AnnualReportDate', 'AuditCommittee', 'BoardID', 'DirectorID']
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Left join keeps every board-composition row; rows without a matching
# audit-committee record get AuditCommittee = 0.
compo = pd.merge(
    left=compo,
    right=Audit_committee,
    on=['AnnualReportDate', 'BoardID', 'DirectorID'],
    how='left'
).fillna({'AuditCommittee': 0})

In [5]:
# Calendar year of the annual report; used as the time key for all director links
compo['Year'] = pd.to_datetime(compo['AnnualReportDate']).dt.year

# ED = 1 when the NED column equals 'No'
# (presumably NED = non-executive director, so ED = executive director - TODO confirm)
compo['ED'] = (compo['NED']=='No').astype(int)

# Row-wise sum of the two board-count columns (NaN is skipped by sum)
compo['TotCurrBrd'] = compo[['TotCurrNoLstdBrd', 'TotCurrNoUnLstdBrd']].sum(axis=1)

# Build director links: self-join the composition table on (DirectorID, Year) so
# that each board a director sits on is paired with every board the same director
# sits on in that year. The "_lnkd" suffix marks the columns of the linked board.
# The join also pairs each board with itself; those rows are removed below.
link_df = pd.merge(
    left=compo[['BoardID', 'DirectorID', 'Year', 'CIKCode', 'ED', 'RiskCommittee', 'AuditCommittee']],
    right=compo[['DirectorID', 'Year', 'BoardID', 'CIKCode', 'ED', 'RiskCommittee', 'AuditCommittee']],
    on=['DirectorID', 'Year'],
    how='outer',
    suffixes=["", "_lnkd"]
)

# Role flags are 1 if the shared director holds the role (ED, risk committee,
# audit committee) in either one of the two linked firms
link_df['ED'] = link_df[['ED', 'ED_lnkd']].max(axis=1)
link_df['RiskCommittee'] = link_df[['RiskCommittee', 'RiskCommittee_lnkd']].max(axis=1)
link_df['AuditCommittee'] = link_df[['AuditCommittee', 'AuditCommittee_lnkd']].max(axis=1)

link_df.drop(columns=['ED_lnkd', 'RiskCommittee_lnkd', 'AuditCommittee_lnkd'], inplace=True)

# Remove the self-pairs produced by the self-join
link_df = link_df[link_df['BoardID']!=link_df['BoardID_lnkd']]

# Keep only links where both boards have a CIK code
link_df = link_df.dropna(subset=['CIKCode', 'CIKCode_lnkd']).drop_duplicates().reset_index(drop=True)

# Link age: for each (BoardID, BoardID_lnkd) pair, 1 in the first year the pair
# appears, 2 in the second year it appears, and so on. This counts the years in
# which the pair is observed, not calendar years elapsed since the first link.
link_df = link_df.sort_values(['BoardID', 'BoardID_lnkd', 'Year']).reset_index(drop=True)
# cumcount is computed on one row per (pair, Year); the assignment aligns on the
# index, so any further rows of the same pair-year (e.g. several shared directors)
# are left as NaN here ...
link_df['LinkTime'] = link_df.drop_duplicates(subset=['BoardID', 'BoardID_lnkd', 'Year']).groupby(['BoardID', 'BoardID_lnkd'])['Year'].cumcount()+1
# ... and are filled from the preceding row, which after the sort above belongs
# to the same pair-year
link_df['LinkTime'] = link_df['LinkTime'].ffill()

# List of linked firms: for every CIKCode, the distinct CIK codes it is linked to.
# Built before the integer cast below, so it holds the codes in their pre-cast dtype.
lnkdCIKs = link_df.groupby(['CIKCode'])['CIKCode_lnkd'].agg(lambda x: list(set(x)))

# Safe to cast: rows with a missing CIK on either side were dropped above
link_df[['CIKCode', 'CIKCode_lnkd']] = link_df[['CIKCode', 'CIKCode_lnkd']].astype(int)

## Firm data

In [2]:
firm_data = pd.read_csv('Data/Study2_data1_V3.csv', parse_dates=['report_dt', 'filing_dt'])
firm_data.columns

Index(['CIK', 'report_dt', 'filing_dt', 'FF', 'rf_length', 'SIC3',
       'Specificity', 'Pa', 'Pr', 'Fu', 'Sentiment', 'Delta_length',
       'reported', 'repeated', 'added', 'removed', 'rfGap', 'fyear', 'ryear',
       'Item1AFOG', 'added+1', 'removed+1', 'Specificity+1', 'Sentiment+1',
       'Item1AFOG+1', 'added_1', 'removed_1', 'Specificity_1', 'Sentiment_1',
       'Item1AFOG_1', 'COUNT_WEAK', 'Big4', 'AUDITOR_FKEY', 'GenderRatio',
       'NationalityMix', 'NumberDirectors', 'TotCurrBrd', 'Age', 'ShrdDir',
       'LnkdFirm', 'ShrdED', 'ShrdRC', 'ShrdAC', 'FinLink', 'LinkTime',
       'Degree', 'Independent', 'Volatility+30', 'Volatility_30',
       'Volatility+60', 'Volatility_120', 'SHRTURN', 'Beta_126',
       'NUMBEROFANALYSTS', 'rmonth', 'cik', 'DtA', 'ROE', 'NPM', 'mkvalt',
       'logMC', 'at', 'logTA', 'INTtA', 'Current', 'TobinQ', 'BtM', 'RDxopr',
       'ProprietaryCost', 'ROA', 'IndVol_', 'IndVol+', 'InstOwnership',
       'NumberDirectors_1', 'ShrdDir_1', 'LnkdFirm_1'

In [3]:
firm_data.dropna(subset=[
    'Volatility_120', 'Beta_126', 'logTA', 'ROE', 'DtA', 'Current', 'RDxopr', 'BtM',
    'GenderRatio', 'NUMBEROFANALYSTS', 'Age', 'Independent', 'TotCurrBrd'
], inplace=True)

firm_data = firm_data[firm_data.groupby('CIK')['ryear'].transform('nunique')>1].reset_index(drop=True)

match_cols = [
    'Volatility_120', 'Beta_126', 'logTA', 'ROE', 'DtA', 'Current', 'RDxopr', 'BtM', 'COUNT_WEAK',
    'GenderRatio', 'NUMBEROFANALYSTS', 'Age', 'Independent', 'TotCurrBrd'
]

In [4]:
firm_data.fillna(dict((c,0) for c in match_cols), inplace=True)

firm_data.shape

(45319, 96)

## Shared RFs

H: Firm A discloses a (new) RF in year t and, in year t+1, connects with firm B, which had disclosed the RF in year t.

In [5]:
# Load RF data 
topics_df = pd.read_csv("Data/RDdf_T2V5.csv", parse_dates=['report_dt', 'filing_dt'])

# Turn the NERs text column into a list of labels: remove spaces, then collect
# every single-quoted token (presumably the column is the string form of a list
# of entity labels - TODO confirm)
topics_df['NERs'] = topics_df['NERs'].str.replace(pat=" ", repl="").str.findall(pat=r"'(.*?)'")

# Entity labels that count towards specificity.
# The comma after 'NORP' matters: without it Python concatenates the adjacent
# literals 'NORP' 'ORG' into the single label 'NORPORG', so neither NORP nor ORG
# entities are ever counted.
NE_labels = ['PERSON', 'NORP', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'LAW', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY']
# Specificity = number of entities in the risk factor whose label is in NE_labels
topics_df['Specificity'] = topics_df['NERs'].apply(lambda NERs: len([ne for ne in NERs if ne in NE_labels]))

# First three digits of the SIC code, after zero-padding the code to four digits
topics_df['SIC3'] = topics_df['SIC'].map(lambda x: f"{int(x):04d}"[:3])

print(topics_df.shape)
topics_df.columns

(3245397, 25)


Index(['Topic', 'Score', 'Topic_H', 'Score_H', 'CIK', 'report_dt', 'filing_dt',
       'rf_seq', 'ticker', 'filerCIK', 'rf_length', 'NERs', 'Pa', 'Pr', 'Fu',
       'Sentiment', 'FOG', 'clean_len', 'SIC', 'FF', 'ryear', 'fyear',
       'rf_seq_count', 'Specificity', 'SIC3'],
      dtype='object')

In [10]:
# Risk topics disclosed and not disclosed per report 
disc_df = pd.pivot_table(
    topics_df, index = ["CIK", "filing_dt", "report_dt"], 
    columns='Topic_H', values='Score'
).notna().astype(int).reset_index()

disc_df.sort_values(['CIK', 'filing_dt', 'report_dt'], inplace=True)

disc_df['ryear'] = disc_df["report_dt"].dt.year

In [11]:
# Drop disclosure data with missing control variables
disc_df = pd.merge(
    left=disc_df,
    right=firm_data[['CIK', 'report_dt', 'filing_dt', 'logTA']],
    on=['CIK', 'report_dt', 'filing_dt'],
    how='inner' # keeps only firm-year observations with control variables
).drop(columns='logTA')

In [12]:
# Drop repeated reports: rows that agree on every column except filing_dt and
# report_dt (same CIK, same ryear, same disclosed topics) are collapsed to the first one
disc_df.drop_duplicates(subset=disc_df.columns.difference(["filing_dt", "report_dt"]), keep="first", inplace=True)

# Flag rows whose (CIK, ryear) occurs again later in the frame, i.e. every report
# of a duplicated report year except the last one
disc_df['ryear_dupd'] = disc_df.duplicated(subset=['CIK', 'ryear'], keep='last')

# ryear of the previous row of the same CIK (NaN for a firm's first row).
# NOTE(review): "previous" follows the current row order - assumes the frame is
# still ordered by CIK and filing date from the earlier sort; confirm.
disc_df["ryear-1"] = disc_df.groupby('CIK')['ryear'].shift(1)

# If a flagged report is preceded by a gap (the year before it is not already
# taken by the firm's previous report), move it back one year so it fills the
# gap instead of colliding with the later report of the same ryear.
# Comparisons with a NaN "ryear-1" are False, so a firm's first row is never moved.
disc_df['ryear'] = disc_df[['ryear_dupd', 'ryear', 'ryear-1']].apply(
    lambda x: x['ryear']-1 if x['ryear_dupd'] and x['ryear']-1>x['ryear-1'] else x['ryear'],
    axis=1
)

# Any (CIK, ryear) still duplicated keeps only its first row; helper columns are removed
disc_df = disc_df\
    .drop_duplicates(subset=['CIK', 'ryear'], keep='first')\
        .reset_index(drop=True).drop(columns=['ryear_dupd', 'ryear-1'])

In [42]:
# Number of shared RFs: for every report year, build one row per firm pair with
# the count of risk topics that both firms disclosed in that year
Years = disc_df['ryear'].unique()

# One long-format frame per year; concatenated after the loop
shared_disc_list = []

for yr in tqdm(Years):
    # Firms reporting in this year
    df1 = disc_df[disc_df['ryear']==yr].copy()

    # Calculate the number of shared RFs as the matrix product of the disclosure
    # matrix with its transpose: entry (i, j) is the number of topics flagged 1
    # for both firm i and firm j (the topic columns are 0/1 indicators).
    # Only the strict upper triangle is kept (k=1), so each unordered pair appears
    # once and the diagonal (a firm with itself) is zeroed.
    # 1 is added before np.triu so that kept entries are >= 1 even when the pair
    # shares no topic; the only zeros left are the discarded diagonal and lower triangle.
    # range(105) selects the columns labelled 0..104
    # (presumably all topic columns - TODO confirm the topic count is 105).
    a = df1.loc[:, range(105)].to_numpy()
    # Rows are labelled by the first firm's CIK; column labels carry the second
    # firm's (CIK, report_dt), which later cells read back as x[0] and x[1]
    df2 = pd.DataFrame(
        np.triu(np.matmul(a, a.T)+1, k=1), index=df1['CIK'], columns=df1[['CIK', "report_dt"]]
    ).reset_index()

    # report_dt of the first (row) firm, in the same row order as df1
    df2["report_dt"] = df1["report_dt"].values

    # Wide -> long: one row per (row firm, column firm); the column label goes to 'variable'
    df3 = pd.melt(df2, id_vars=['CIK', "report_dt"], value_name='SharedRF')

    # Drop the zeroed diagonal / lower-triangle entries
    df3 = df3[df3['SharedRF']>0]

    df3['ryear'] = yr

    shared_disc_list.append(df3)

  0%|          | 0/18 [00:00<?, ?it/s]

In [43]:
# Stack the per-year pair tables into one frame.
# ignore_index=True gives every row a unique label. Without it each yearly frame
# keeps its own labels, so the stacked index can contain duplicate labels, and
# pandas can only align columns assigned from another Series/merge result on a
# duplicated index when the two indexes happen to be identical.
shared_disc_df = pd.concat(shared_disc_list, ignore_index=True)

# Subtract the 1 added to np.matmul
shared_disc_df['SharedRF'] = shared_disc_df['SharedRF'] - 1

# 'variable' holds the (CIK, report_dt) label of the second firm of the pair
shared_disc_df['CIK_pair'] = shared_disc_df['variable'].apply(lambda x: x[0])
shared_disc_df['report_dt_pair'] = shared_disc_df['variable'].apply(lambda x: x[1])

shared_disc_df.drop(columns='variable', inplace=True)

# Guard against a firm being paired with itself
shared_disc_df = shared_disc_df[shared_disc_df['CIK']!=shared_disc_df['CIK_pair']]

# Firm pairs with at least 4 years of observation for DID
shared_disc_df = shared_disc_df[shared_disc_df.groupby(['CIK', 'CIK_pair'])['ryear'].transform('nunique')>=4]

In [44]:
del df1
del df2
del df3
del shared_disc_list

gc.collect()

195

In [45]:
# Count of distinct directors shared by two firms in a given year,
# indexed by (CIKCode, Year, CIKCode_lnkd)
shared_dir_counts = link_df.groupby(['CIKCode', 'Year', 'CIKCode_lnkd'])['DirectorID'].nunique()

# Attach that count to every firm-pair-year; pairs without a link get NaN
pairs_with_counts = pd.merge(
    left=shared_disc_df,
    right=shared_dir_counts,
    left_on=['CIK', 'ryear', 'CIK_pair'],
    right_index=True,
    how='left'
)
shared_disc_df['NoSharedDir'] = pairs_with_counts['DirectorID']

# A pair-year is linked when at least one shared director was found
shared_disc_df['Linked'] = shared_disc_df['NoSharedDir'].notna().astype(int)

# Per-pair extremes of the Linked flag, broadcast back to every row of the pair
linked_by_pair = shared_disc_df.groupby(['CIK', 'CIK_pair'])['Linked']
ever_linked = linked_by_pair.transform('max')
always_linked = linked_by_pair.transform('min')

# Treatment group: pairs that are linked in at least one observed year
shared_disc_df['Treatment'] = (ever_linked > 0)

# Drop always-treated pairs (linked in every observed year): they have no pre-link period
shared_disc_df = shared_disc_df[always_linked == 0]

## Matching

**Matching procedure:**\
For every firm-pair AB in the treatment group, we first find the firm most similar to A (call it A') in the last year before firms A and B share a director. Then we find the firm closest to B in that year which is never linked to firm A'.

In [17]:
Treat_df = shared_disc_df[shared_disc_df['Linked']==1]\
    .sort_values(['CIK', 'CIK_pair', 'ryear'])\
        .reset_index(drop=True).copy()

# First year 2 firms share a director
Treat_df['eventX'] = Treat_df.groupby(['CIK', 'CIK_pair'])['ryear'].transform('min')

# treated firm pairs at the first year of treatement
treated_pairs = Treat_df[
    (Treat_df['ryear']==Treat_df['eventX'])
    &(Treat_df['eventX']<=2022) # and drop firms that are linked in 2023 for the first time
][['CIK', 'report_dt', 'CIK_pair', 'report_dt_pair', 'eventX']]

### Identify All Valid Control Firm-Pairs at T−1

In [18]:
# Function to get top-k neighbors for a single firm
def get_top_k_neighbors(firm_id, year, k=10):
    """Return up to ``k`` CIKs of the firms most similar to ``firm_id`` in ``year``.

    Similarity is the Euclidean distance between the standardized ``match_cols``
    of all ``firm_data`` rows whose ``ryear`` equals ``year``. Both ``firm_data``
    and ``match_cols`` are read from the notebook's global namespace.

    Parameters
    ----------
    firm_id
        CIK of the focal firm.
    year
        Value of ``ryear`` in which neighbours are searched.
    k : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Distinct neighbour CIKs, nearest first, never containing ``firm_id``.
        Empty when the firm has no row in ``year``. Shorter than ``k`` when the
        year does not contain enough other firms.
    """
    year_firms = firm_data[firm_data['ryear'] == year]
    is_self = (year_firms['CIK'] == firm_id).to_numpy()

    if not is_self.any():
        return []

    # Standardize on the whole year cross-section so no covariate dominates the distance
    X_scaled = StandardScaler().fit_transform(year_firms[match_cols].values)
    # Query with the firm's first row of that year
    X_firm_scaled = X_scaled[is_self][:1]

    # Ask for k neighbours plus one slot per row belonging to the firm itself
    # (those rows are discarded below). Cap at the number of available rows:
    # kneighbors raises when n_neighbors exceeds the number of fitted samples.
    n_neighbors = min(k + int(is_self.sum()), len(year_firms))
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(X_scaled)
    _, indices = knn.kneighbors(X_firm_scaled)

    neighbor_ciks = year_firms['CIK'].to_numpy()[indices[0]].tolist()
    # Exclude self, and keep each neighbouring firm once while preserving distance order
    distinct_neighbors = dict.fromkeys(cik for cik in neighbor_ciks if cik != firm_id)
    return list(distinct_neighbors)[:k]


In [19]:
# Step 1: long format - one (CIK, eventX) row for each side of every treated pair
firm_a = treated_pairs[['CIK', 'eventX']].copy()
firm_b = treated_pairs[['CIK_pair', 'eventX']].rename(columns={'CIK_pair': 'CIK'})
firms_for_knn = pd.concat([firm_a, firm_b], ignore_index=True)

# Step 2: replace the event year by the pre-treatment year (T-1) and keep each
# (firm, year) combination once
firms_for_knn = (
    firms_for_knn
    .assign(year=firms_for_knn['eventX'] - 1)
    .drop(columns='eventX')
    .drop_duplicates()
)

# Step 3: nearest neighbours per (firm, year), keyed by the year actually used
knn_dict = {}

firm_years = firms_for_knn[['CIK', 'year']].itertuples(index=False, name=None)
for cik, year in tqdm(firm_years, total=len(firms_for_knn)):
    # Try T-1 first; when that year yields no neighbours, fall back to the following year
    for candidate_year in (year, year + 1):
        neighbors = get_top_k_neighbors(cik, candidate_year, k=10)
        if neighbors:
            knn_dict[(cik, candidate_year)] = neighbors
            break

  0%|          | 0/14883 [00:00<?, ?it/s]

In [20]:
initial_control_pairs = shared_disc_df[~shared_disc_df["Treatment"]][['CIK', 'CIK_pair', 'ryear']]\
    .reset_index(drop=True)

In [21]:
# Save treated_pairs DataFrame
treated_pairs.to_pickle("treated_pairs3.pkl")

# Save initial_control_pairs DataFrame
initial_control_pairs.to_pickle("initial_control_pairs3.pkl")

# Save knn_dict
with open("knn_dict3.pkl", "wb") as f:
    pickle.dump(knn_dict, f)

### Compute Covariates for Control Pairs

In [18]:
with open("valid_control_pairs3.pkl", "rb") as f:
    valid_control_pairs = pickle.load(f)

# Convert to DataFrame
control_df = pd.DataFrame(valid_control_pairs)

In [19]:
# Flatten control pairs: one record per matched control pair (A', B'), tagged
# with the treated pair (A, B) it was matched to and the matching year
control_rows = (
    (rec.treated_pair, rec.control_pair, rec.match_year)
    for rec in control_df.itertuples(index=False)
)
flattened_control = [
    {
        'treated_firm1': A,
        'treated_firm2': B,
        'firm1': A_,
        'firm2': B_,
        'year': year,
        'treated': 0
    }
    for (A, B), (A_, B_), year in control_rows
]
control_df_flat = pd.DataFrame(flattened_control)

# Create the Treated Sample (at T-1): each treated pair is its own "firm1/firm2",
# observed in the year before the first link, but not earlier than 2006
treated_rows = treated_pairs[['CIK', 'CIK_pair', 'eventX']].itertuples(index=False, name=None)
treated_records = [
    {
        'treated_firm1': A,
        'treated_firm2': B,
        'firm1': A,
        'firm2': B,
        'year': max(T - 1, 2006),
        'treated': 1
    }
    for A, B, T in treated_rows
]
treated_df = pd.DataFrame(treated_records)

# Combine Treated + Control Samples
pair_df = pd.concat([treated_df, control_df_flat], ignore_index=True)

In [20]:
# Merge firm1 covariates
pair_df = pair_df.merge(
    firm_data[['CIK', 'ryear', 'FF']+match_cols].rename(columns=lambda x: f"{x}_1" if x not in ['CIK', 'ryear'] else x),
    left_on=['firm1', 'year'],
    right_on=['CIK', 'ryear'],
    how='left'
)

# Merge firm2 covariates
pair_df = pair_df.merge(
    firm_data[['CIK', 'ryear', 'FF']+match_cols].rename(columns=lambda x: f"{x}_2" if x not in ['CIK', 'ryear'] else x),
    left_on=['firm2', 'year'],
    right_on=['CIK', 'ryear'],
    how='left'
)

# Drop redundant CIK_x columns
pair_df.drop(columns=['CIK_x', 'CIK_y', 'ryear_x', 'ryear_y'], inplace=True)

# Compute Pair-Level Covariates
for col in match_cols:
    pair_df[f'{col}_diff'] = abs(pair_df[f'{col}_1'] - pair_df[f'{col}_2'])

pair_df.dropna(subset=[f"{x}_diff" for x in match_cols], inplace=True)

pair_df['SameInd'] = (pair_df['FF_1']==pair_df['FF_2']).astype(int)

In [21]:
# Propensity Score Estimation + Matching
X = pair_df[[f"{x}_1" for x in match_cols] + [f"{x}_2" for x in match_cols] + ['SameInd']]
X_scaled = StandardScaler().fit_transform(X)
y = pair_df['treated']

model = LogisticRegression(max_iter=1000)
pair_df['propensity_score'] = model.fit(X_scaled, y).predict_proba(X_scaled)[:,1]

In [None]:
# Match Treated to Control Pairs Using Propensity Score
matched_pairs = []

# Every treated pair and all of its candidate control pairs share one pair_id
pair_df["pair_id"] = pair_df['treated_firm1'].astype(str) + "_" + pair_df['treated_firm2'].astype(str)

# A single groupby pass replaces re-filtering the whole frame for every pair_id;
# sort=False keeps the groups in order of first appearance.
for _, group in pair_df.groupby('pair_id', sort=False):
    treated = group[group['treated'] == 1]
    controls = group[group['treated'] == 0]

    # A match needs the treated pair and at least one candidate control
    if treated.empty or controls.empty:
        continue

    treated_score = treated['propensity_score'].iloc[0]

    # Find control pair with closest propensity score.
    # The distance is kept in its own Series rather than written into `controls`,
    # which is a slice of pair_df (chained assignment / SettingWithCopyWarning).
    score_diff = (controls['propensity_score'] - treated_score).abs()
    best_idx = score_diff.idxmin()
    best_match = controls.loc[best_idx].copy()
    best_match['score_diff'] = score_diff.loc[best_idx]

    # Add both treated and matched control to final list
    matched_pairs.append(treated.iloc[0].to_dict())
    matched_pairs.append(best_match.to_dict())

# Convert to Final Matched Sample (score_diff is NaN on the treated rows)
matched_df = pd.DataFrame(matched_pairs).reset_index(drop=True)

In [46]:
# To remove duplicate matched firms and keep matched year
trunc_matched_df = matched_df[['firm1', 'firm2', 'treated', 'year']].groupby(['firm1', 'firm2', 'treated'])['year'].max().reset_index()

# Ensure matching columns are of the same type (critical)
shared_disc_df[['CIK', 'CIK_pair']] = shared_disc_df[['CIK', 'CIK_pair']].astype(int)
trunc_matched_df[['firm1', 'firm2']] = trunc_matched_df[['firm1', 'firm2']].astype(int)

# Create panel data
shared_disc_df = pd.merge(
    left=shared_disc_df,
    right=trunc_matched_df,
    left_on=['CIK', 'CIK_pair'],
    right_on=['firm1', 'firm2'],
    how='left'
).drop(columns=['firm1', 'firm2'])

shared_disc_df = pd.merge(
    left=shared_disc_df,
    right=trunc_matched_df,
    left_on=['CIK', 'CIK_pair'],
    right_on=['firm2', 'firm1'],
    how='left'
).drop(columns=['firm1', 'firm2'])

In [47]:
# Keep only firm-pairs that are either in treatment group or are matched with linked firm-pairs
refined_shared_disc_df = shared_disc_df.dropna(subset=["treated_x", "treated_y"], how='all')\
    .sort_values(['CIK', 'CIK_pair', 'ryear']).reset_index(drop=True)

refined_shared_disc_df.fillna({'year_x': refined_shared_disc_df['year_y']}, inplace=True)
refined_shared_disc_df['match_year'] = refined_shared_disc_df['year_x'].astype(int)

refined_shared_disc_df.drop(columns=["treated_x", "treated_y", "year_x", "year_y"], inplace=True)

# Check if the pair of matched CIKs are actually linked in the sample
refined_shared_disc_df[~refined_shared_disc_df['Treatment']]['Linked'].sum()

0

In [48]:
# If the shared director is ED or Risk Committee in any of the linked firms
refined_shared_disc_df = pd.merge(
    left=refined_shared_disc_df,
    right=link_df.groupby(['CIKCode', 'CIKCode_lnkd', 'Year'])[['ED', 'RiskCommittee', 'AuditCommittee', 'LinkTime']].max(),
    left_on=['CIK', 'CIK_pair', 'ryear'],
    right_index=True,
    how='left'
)

In [49]:
refined_shared_disc_df.shape

(153506, 14)

In [50]:
refined_shared_disc_df['Linked'].value_counts()

Linked
0    128803
1     24703
Name: count, dtype: int64

In [51]:
refined_shared_disc_df['Treatment'].value_counts()

Treatment
True     77134
False    76372
Name: count, dtype: int64

In [52]:
del shared_disc_df
del firm_data

gc.collect()

0

In [53]:
refined_shared_disc_df.sort_values(['CIK', 'CIK_pair', 'ryear'], inplace=True)

# Year when the treated pairs are linked for the first time.
# ryear is masked to the linked rows, and the per-pair minimum is broadcast to
# EVERY row of the pair. A backward fill from the linked rows would leave eventX
# missing on rows that come after the pair's last linked year (the link ended).
# Pairs that are never linked keep NaN.
first_link_year = (
    refined_shared_disc_df['ryear']
    .where(refined_shared_disc_df['Linked'] == 1)
    .groupby([refined_shared_disc_df['CIK'], refined_shared_disc_df['CIK_pair']])
    .transform('min')
)
refined_shared_disc_df['eventX'] = first_link_year

In [62]:
# Keep observations in the same range used for DiD 
refined_shared_disc_df['Time_to_match'] = refined_shared_disc_df['match_year'] - refined_shared_disc_df['ryear']

refined_shared_disc_df = refined_shared_disc_df[
    (refined_shared_disc_df['Treatment'])
    |(refined_shared_disc_df['Time_to_match'].isin(range(-5,10)))
]

In [63]:
refined_shared_disc_df.columns

Index(['CIK', 'report_dt', 'SharedRF', 'ryear', 'CIK_pair', 'report_dt_pair',
       'NoSharedDir', 'Linked', 'Treatment', 'match_year', 'ED',
       'RiskCommittee', 'AuditCommittee', 'LinkTime', 'eventX',
       'Time_to_match'],
      dtype='object')

# Merge data

In [64]:
firm_data = pd.read_csv('Data/Study2_data1_V3.csv', parse_dates=['report_dt', 'filing_dt'])
firm_data.dropna(subset=[
    'added', 'Volatility_120', 'Beta_126', 'logTA', 'ROE', 'DtA', 'Current', 'RDxopr', 'BtM',
    'GenderRatio', 'NUMBEROFANALYSTS', 'Age', 'Independent', 'TotCurrBrd'
], inplace=True)

firm_data = firm_data[firm_data.groupby('CIK')['ryear'].transform('nunique')>1].reset_index(drop=True)

firm_data.drop(
    columns=[
        'SharedRF', 'ryear', 'LinkTime', 'Degree', 'rmonth', 'cik', 'mkvalt', 'logMC', 'ProprietaryCost', 'IndVol+',
        'NumberDirectors_1', 'ShrdDir_1', 'LnkdFirm_1', 'ShrdED_1', 'ShrdRC_1', 'ShrdAC_1', 
        'Independent_1', 'Volatility_120_1', 'Beta_126_1', 'IndVol__1', 'logTA_1', 'ROE_1', 'DtA_1', 'Current_1',
        'RDxopr_1', 'BtM_1', 'rfGap_1', 'Big4_1', 'COUNT_WEAK_1', 'NUMBEROFANALYSTS_1', 'reported_1',
        'COUNT_WEAK', 'Big4', 'AUDITOR_FKEY'],
    inplace=True
)

firm_data.shape

(45319, 62)

In [65]:
ICW = pd.read_csv("Data/ICW2.csv", parse_dates=['FYE_IC_OP', 'FILE_DATE'])

ICW.sort_values(['COMPANY_FKEY', 'FILE_DATE'], inplace=True)

ICW["OPyr"] = ICW['FYE_IC_OP'].dt.year
ICW["fyear"] = ICW['FILE_DATE'].dt.year

ICW.dropna(subset=['LAST_AUD_NAME'], inplace=True)

# Replace unknown auditors with last known auditors
ICW.loc[ICW['OP_AUD_NAME']=='unknown', 'OP_AUD_NAME'] = ICW.loc[ICW['OP_AUD_NAME']=='unknown', 'LAST_AUD_NAME']
ICW.loc[ICW['AUDITOR_FKEY']==216, 'AUDITOR_FKEY'] = ICW.loc[ICW['AUDITOR_FKEY']==216, 'LAST_AUD_FKEY']

ICW['Big4'] = ICW['OP_AUD_NAME'].str.contains(r'Deloitte|KPMG|Ernst|Pricewaterhouse', case=False).astype(int)

ICW_gr = ICW.groupby(['COMPANY_FKEY', 'FILE_DATE'])[['COUNT_WEAK', 'Big4', 'AUDITOR_FKEY']].max().reset_index()
ICW_gr2 = ICW.groupby(['COMPANY_FKEY', 'fyear'])[['COUNT_WEAK', 'Big4', 'AUDITOR_FKEY']].max().reset_index()

firm_data = pd.merge(
    left=firm_data,
    right=ICW_gr,
    left_on=['CIK', 'filing_dt'],
    right_on=['COMPANY_FKEY', 'FILE_DATE'],
    how="left"
).drop(columns=['COMPANY_FKEY', 'FILE_DATE'])

firm_data[['COUNT_WEAK_2', 'Big4_2', 'AUDITOR_FKEY_2']] = pd.merge(
    left=firm_data,
    right=ICW_gr2,
    left_on=['CIK', 'fyear'],
    right_on=['COMPANY_FKEY', 'fyear'],
    how="left"
)[['COUNT_WEAK_y', 'Big4_y', 'AUDITOR_FKEY_y']]

firm_data.fillna({'COUNT_WEAK': firm_data['COUNT_WEAK_2']}, inplace=True)
firm_data.fillna({'Big4': firm_data['Big4_2']}, inplace=True)
firm_data.fillna({'AUDITOR_FKEY': firm_data['AUDITOR_FKEY_2']}, inplace=True)

firm_data.drop(columns=['COUNT_WEAK_2', 'Big4_2', 'AUDITOR_FKEY_2'], inplace=True)

In [66]:
Study2_data3 = pd.merge(
    left=refined_shared_disc_df,
    right=firm_data,
    on=["CIK", "report_dt"],
    how="left"
)

Study2_data3 = pd.merge(
    left=Study2_data3,
    right=firm_data,
    left_on=["CIK_pair", "report_dt_pair"],
    right_on=["CIK", "report_dt"],
    how="left",
    suffixes=['', '_2']
).drop(columns=['CIK_2', 'report_dt_2'])

In [67]:
cols = ['Volatility_120', 'Beta_126', 'logTA', 'ROE', 'DtA', 'Current', 'RDxopr', 'BtM',
    'GenderRatio', 'NUMBEROFANALYSTS', 'Age', 'Independent', 'TotCurrBrd']

Study2_data3.dropna(subset=cols, inplace=True)
Study2_data3.dropna(subset=[f"{c}_2" for c in cols], inplace=True)

In [7]:
# One row per distinct (CIK, SIC, ryear, fyear) combination found in the RF data
df = topics_df[['CIK', 'SIC', 'ryear', 'fyear']].drop_duplicates().reset_index(drop=True)

# SIC code of high-tech and regulated industries (Kothari, 2009, Kim 2012)
# NOTE(review): range() excludes its stop value, so 3577, 3674, 7374, 8734 and
# 5961 are NOT flagged, while 2837 is and range(4811, 4900) covers 4811-4899.
# Confirm the intended inclusive bounds against the cited papers.
high_tech = [
    *range(2833, 2838),
    *range(3570, 3577),
    *range(3600, 3674),
    *range(7370, 7374),
    *range(8731, 8734),
]

retail_firms = list(range(5200, 5961))

regulated = [
    4812, 4813, 4833, 4841, 4922, 4923, 4924, 4931, 4941,
    *range(4811, 4900),
]

# 1 when the firm's SIC code falls in any of the three industry groups
litigation_sic_codes = high_tech + retail_firms + regulated
df['high_litigation'] = df['SIC'].isin(litigation_sic_codes).astype(int)

In [14]:
# Lookup with exactly one row per (CIK, fyear). If a firm-year carries several
# SIC codes, it is flagged when any of them is in a high-litigation industry.
# dropna=False keeps rows with a missing key, as a plain drop_duplicates would.
litigation_lookup = (
    df.groupby(['CIK', 'fyear'], as_index=False, dropna=False)['high_litigation'].max()
)

# Flag for the first firm of the pair. validate asserts the lookup is unique on
# its keys, so this merge can never multiply rows of Study2_data3.
Study2_data3 = pd.merge(
    left=Study2_data3,
    right=litigation_lookup,
    on=['CIK', 'fyear'],
    how='left',
    validate='many_to_one'
)

# Flag for the second firm of the pair. The lookup columns are renamed so the
# result arrives as a proper merged column, rather than being copied from a
# separate merge result by index alignment (which silently misassigns values
# if that merge returns a different number of rows).
Study2_data3 = pd.merge(
    left=Study2_data3,
    right=litigation_lookup.rename(columns={
        'CIK': 'CIK_pair',
        'fyear': 'fyear_2',
        'high_litigation': 'high_litigation_2'
    }),
    on=['CIK_pair', 'fyear_2'],
    how='left',
    validate='many_to_one'
)

In [10]:
Study2_data3['CIKpair_ID'] = Study2_data3[['CIK', 'CIK_pair']].astype(str).apply(lambda x: '-'.join(x), axis=1)
Study2_data3.columns

Index(['CIK', 'report_dt', 'SharedRF', 'ryear', 'CIK_pair', 'report_dt_pair',
       'NoSharedDir', 'Linked', 'Treatment', 'match_year',
       ...
       'ROA_2', 'IndVol__2', 'InstOwnership_2', 'GenderRatio_1_2',
       'COUNT_WEAK_2', 'Big4_2', 'AUDITOR_FKEY_2', 'CIKpair_ID',
       'high_litigation', 'high_litigation_2'],
      dtype='object', length=145)

In [15]:
Study2_data3.shape

(139118, 145)

In [16]:
Study2_data3.to_csv('Data/Study2_data3_V6.csv', index=False) 

In [40]:
Study2_data3.groupby("Treatment")['CIKpair_ID'].nunique()

Treatment
False    7279
True     7302
Name: CIKpair_ID, dtype: int64

In [67]:
Study2_data3['SumLinked'] = Study2_data3.groupby('CIKpair_ID')['Linked'].transform('sum')

## END

In [12]:
Study2_data3 = pd.read_csv('Data/Study2_data3_V6.csv')

In [3]:
Study2_data3['Treatment'].value_counts()

Treatment
True     80073
False    79205
Name: count, dtype: int64

In [10]:
Study2_data3[Study2_data3["Treatment"]][['CIKpair_ID', 'Linked', 'ryear', 'eventX']].sort_values(['CIKpair_ID', 'ryear'])

Unnamed: 0,CIKpair_ID,Linked,ryear,eventX
33724,1000180-1010552,0,2007,2013.0
33725,1000180-1010552,0,2008,2013.0
33726,1000180-1010552,0,2010,2013.0
33727,1000180-1010552,0,2011,2013.0
33728,1000180-1010552,0,2012,2013.0
...,...,...,...,...
2501,9984-89089,0,2019,2021.0
2502,9984-89089,0,2020,2021.0
2503,9984-89089,1,2021,2021.0
2504,9984-89089,1,2022,2021.0


In [3]:
Study2_data3.groupby("Treatment")['CIKpair_ID'].nunique()

Treatment
False    104940
True      20727
Name: CIKpair_ID, dtype: int64

In [33]:
Study2_data3 = pd.read_csv('Data/Study2_data3_V3.csv')

In [None]:
Study2_data3[Study2_data3['Linked']==1].sort_values(['at', 'at_2', 'SharedRF'], ascending=False).head(20)

In [11]:
Study2_data3[(Study2_data3['CIK']==789019)&(Study2_data3['CIK_pair']==1403161)]

Unnamed: 0,CIK,report_dt,SharedRF,ryear,CIK_pair,report_dt_pair,NoSharedDir,Linked,Treatment,ED,...,Current_2,TobinQ_2,BtM_2,RDxopr_2,ROA_2,IndVol__2,InstOwnership_2,GenderRatio_1_2,IndDisc_2,CIKpair_ID
134411,789019,2013-06-30,7,2013,1403161,2013-09-30,,0,True,,...,1.804383,3.391111,0.220371,0.0,0.138503,0.790331,49.8214,0.7,0.235201,789019-1403161
134412,789019,2014-06-30,3,2014,1403161,2014-09-30,,0,True,,...,1.592075,3.161367,0.224824,0.0,0.140994,0.779506,50.9105,0.7,0.241938,789019-1403161
134413,789019,2018-06-30,2,2018,1403161,2018-09-30,,0,True,,...,1.611322,4.812926,0.102067,0.0,0.148805,1.103598,52.9171,0.8,0.274508,789019-1403161
134414,789019,2019-06-30,2,2019,1403161,2019-09-30,,0,True,,...,1.563176,5.133312,0.0931,0.0,0.166451,1.073451,54.2717,0.7,0.281343,789019-1403161
134415,789019,2020-06-30,3,2020,1403161,2020-09-30,,0,True,,...,1.905238,4.603925,0.097196,0.0,0.134282,1.469331,55.4155,0.7,0.299615,789019-1403161
134416,789019,2022-06-30,3,2022,1403161,2022-09-30,1.0,1,True,0.0,...,1.448473,4.319304,0.096346,0.0,0.174934,1.959102,58.464,0.667,0.321826,789019-1403161
134417,789019,2023-06-30,4,2023,1403161,2023-09-30,1.0,1,True,0.0,...,1.451727,4.080761,0.104881,0.0,0.190864,1.070053,55.5927,0.636,0.33533,789019-1403161


In [3]:
from scipy import stats

In [None]:
def ttest_func(x):
    """Welch two-sample t-test of ``IndDisc`` against ``OtherIndDisc``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (or group) with the columns ``'IndDisc'`` and ``'OtherIndDisc'``.

    Returns
    -------
    tuple
        ``(t statistic, p-value)`` of ``scipy.stats.ttest_ind`` with
        ``equal_var=False`` (unequal variances).
    """
    A = x['IndDisc'].values
    B = x['OtherIndDisc'].values

    # Run the test once and read both fields from the same result
    result = stats.ttest_ind(a=A, b=B, equal_var=False)

    return (result.statistic, result.pvalue)

In [7]:
match_cols = [
    'Volatility_120', 'Beta_126', 'logTA', 'ROE', 'DtA', 'Current', 'RDxopr', 'BtM', 'rfGap', 'COUNT_WEAK', 'Big4',
    'GenderRatio', 'NUMBEROFANALYSTS', 'NumberDirectors', 'IndVol_', 'Age', 'Independent'
]

In [11]:
# .copy() makes df an independent frame: the pair-level columns added to it
# afterwards would otherwise be written to a slice of Study2_data3
# (SettingWithCopyWarning / chained assignment).
df = Study2_data3[['CIK', 'ryear', "CIK_pair", 'reported', 'reported_2', 'SharedRF', 'Linked', 'Treatment']].copy()

In [None]:
for col in match_cols:
    df[f"pair{col}"] = (Study2_data3[col] + Study2_data3[f"{col}_2"])/2

In [15]:
pair_cols = [f"pair{col}" for col in match_cols]

In [32]:
des_stats = df.groupby('Treatment').describe()[pair_cols]

In [33]:
df2 = des_stats.loc(axis=1)[:, ['mean', 'std']].T

Unnamed: 0,Treatment,False,True
pairVolatility_120,mean,0.235911,56.442966
pairVolatility_120,std,7.504195,12136.653986
pairBeta_126,mean,1.460063,88.239086
pairBeta_126,std,66.848763,128242.300487
pairlogTA,mean,6.315588,7.480883
pairlogTA,std,1.580427,1.863061
pairROE,mean,0.372844,0.324211
pairROE,std,34.666789,36.645229
pairDtA,mean,0.271379,0.280118
pairDtA,std,0.775053,0.299748
