In [1]:
# Import liberaries and functions
import pandas as pd
import numpy as np
import re

# Risk disclosure

In [2]:
# Load RF data 
topics_df = pd.read_csv("Data/RDdf_T2V5.csv", parse_dates=['report_dt', 'filing_dt'])

topics_df['NERs'] = topics_df['NERs'].str.replace(pat=" ", repl="").str.findall(pat=r"'(.*?)'")

NE_labels = ['PERSON', 'NORP' 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'LAW', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY']
topics_df['Specificity'] = topics_df['NERs'].apply(lambda NERs: len([ne for ne in NERs if ne in NE_labels]))

topics_df['SIC3'] = topics_df['SIC'].map(lambda x: f"{int(x):04d}"[:3])

print(topics_df.shape)
topics_df.columns

(3245397, 25)


Index(['Topic', 'Score', 'Topic_H', 'Score_H', 'CIK', 'report_dt', 'filing_dt',
       'rf_seq', 'ticker', 'filerCIK', 'rf_length', 'NERs', 'Pa', 'Pr', 'Fu',
       'Sentiment', 'FOG', 'clean_len', 'SIC', 'FF', 'ryear', 'fyear',
       'rf_seq_count', 'Specificity', 'SIC3'],
      dtype='object')

In [3]:
# replace length of raw RF with cleaned RF
topics_df["rf_length"] = topics_df["clean_len"]
topics_df.drop(columns="clean_len", inplace=True)

In [4]:
N = topics_df['Topic_H'].nunique()
# Risk topics disclosed and not disclosed per report 
disc_df = pd.pivot_table(
    topics_df, index = ["CIK", "filing_dt", "report_dt", "FF"], 
    columns='Topic_H', values='Score'
).notna().astype(int).reset_index()

# Long format
disc_long = pd.melt(disc_df, id_vars=["CIK", "filing_dt", "report_dt", "FF"], value_name='Disclosed')
disc_long.sort_values(["CIK", 'Topic_H', "filing_dt", "report_dt"], inplace=True)

disc_long['DiscSum'] = disc_long.groupby(["CIK", 'Topic_H'])['Disclosed'].cumsum()

# Total number of risk topics
disc_long['TotalRFs'] = disc_long.groupby(["CIK", "filing_dt", "report_dt"])['Disclosed'].transform('sum')

# Difference between disclosed risk topics in 2 consecutive years
disc_diff = disc_df.filter(range(N)) - disc_df.groupby("CIK")[disc_df.filter(range(N)).columns].shift(1)

# Repeated risk factors
disc_repeat = (
    disc_df.filter(range(N))
    + disc_df.groupby("CIK")[disc_df.filter(range(N)).columns].shift(1) 
    == 2
).astype(int)

# Whether risk factor was disclosed in the previouse year's report
disc_long['LstYrDisc'] = disc_long.drop_duplicates(subset=['CIK', 'Topic_H', 'filing_dt']).groupby(['CIK', 'Topic_H'])['Disclosed'].shift(1)
disc_long['LstYrDisc'] = disc_long.groupby(['CIK', 'Topic_H'])['LstYrDisc'].ffill()

# Generate added, repeated and removed dummies
disc_long['New'] = ((disc_long['LstYrDisc']==0)&(disc_long['Disclosed']==1)).astype(int)
disc_long['Removed'] = ((disc_long['LstYrDisc']==1)&(disc_long['Disclosed']==0)).astype(int)
disc_long['Repeated'] = ((disc_long['LstYrDisc']==1)&(disc_long['Disclosed']==1)).astype(int)

# Whether risk factor was newly disclosed in the previouse year's report
disc_long['LstYrNew'] = disc_long.groupby(['CIK', 'Topic_H'])['New'].shift(1)

In [5]:
# Keep RFs that if disclosed, they are either added or repeated
disc_long = (
    disc_long[disc_long[['New', 'Repeated']]
              .sum(axis=1)==disc_long['Disclosed']]
              .reset_index(drop=True)
)

# Number of days from fiscal year end and actual filing date
disc_long['rfGap'] = (disc_long['filing_dt'] - disc_long['report_dt']).dt.days

disc_long['fyear'] = disc_long["filing_dt"].dt.year
disc_long['ryear'] = disc_long["report_dt"].dt.year

In [6]:
# Length of disclosed RFs as the total number of words 
length_df = pd.pivot_table(
    topics_df, index = ["CIK", "filing_dt", "report_dt", "ryear"], 
    columns='Topic_H', values='rf_length', aggfunc='sum',
    fill_value=0 # NA length means not disclosed so equal to 0
).reset_index()

# Long format
length_long = pd.melt(length_df, id_vars=["CIK", "filing_dt", "report_dt"], value_name='RF_length')
length_long.sort_values(["CIK", "Topic_H", "filing_dt", "report_dt"], inplace=True)

# Length of the topics last year
length_long['length_1'] = length_long.drop_duplicates(subset=['CIK', 'Topic_H', 'filing_dt']).groupby(["CIK", "Topic_H"])['RF_length'].shift(1)
length_long['length_1'] = length_long.groupby(['CIK', 'Topic_H'])['length_1'].ffill()

disc_long = pd.merge(
    left=disc_long,
    right=length_long,
    on=['CIK', 'filing_dt', 'report_dt', 'Topic_H'],
    how='left'
)

In [7]:
disc_long['FOG'] = pd.merge(
    left=disc_long,
    right=topics_df.groupby(['CIK', 'report_dt', 'filing_dt', 'Topic_H'])['FOG'].mean().reset_index(),
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['FOG']

disc_long['Specificity'] = pd.merge(
    left=disc_long,
    right=topics_df.groupby(['CIK', 'report_dt', 'filing_dt', 'Topic_H'])['Specificity'].sum().reset_index(),
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['Specificity']

# specificity relative to the RF length
topics_df["Specificity"] = topics_df["Specificity"]/topics_df["rf_length"]

### Industry disclosure

In [8]:
disc_df["filing_dt-1"] = disc_df["filing_dt"] - pd.Timedelta(weeks=52)

def count_func(x):
    """
    Counts the number of firms in the industry that disclose a specific RF.
    """
    df_slice = disc_df[
        (disc_df["filing_dt"]>x["filing_dt-1"])
        &(disc_df["filing_dt"]<=x["filing_dt"])
        &(disc_df['FF']==x['FF'])
    ]
    output = (
        df_slice[df_slice['CIK']!=x['CIK']].filter(range(N)).sum() / df_slice["filing_dt"].count()
    )
    
    return output

# Running the function on disc_df
IndDisc_df = disc_df.drop(columns=range(N)).copy()
IndDisc_df.loc[:, range(N)] = disc_df.apply(count_func, axis=1)
IndDisc_df.drop(columns=['filing_dt-1'], inplace=True)

# Create the data in long format
Inddisc_long = pd.melt(IndDisc_df, id_vars=["CIK", "filing_dt", "report_dt", "FF"], value_name='IndDisc')

# Drop RFs that have never been disclosed per industry 
Inddisc_long = Inddisc_long[Inddisc_long.groupby(['FF', 'Topic_H'])['IndDisc'].transform('sum')>0]

In [9]:
disc_long['IndDisc'] = pd.merge(
    left=disc_long,
    right=Inddisc_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['IndDisc']

disc_long.fillna({'IndDisc': 0}, inplace=True)
disc_long.reset_index(drop=True, inplace=True)

In [10]:
# Industry disclosure change
disc_long['IndDisc_1'] = disc_long.drop_duplicates(subset=['CIK', 'Topic_H', 'filing_dt']).groupby(['CIK', 'Topic_H'])['IndDisc'].shift(1)
disc_long['IndDisc_1'] = disc_long.groupby(['CIK', 'Topic_H'])['IndDisc_1'].ffill()

# Linked firm disclosures

The independent variable shows how many of the firms linked to the focal firm at a specific year have disclosed the Added, Repeated or Removed risk factors in their corresponding annual report. 
At year ryear for firm CIK, we look at the linked firms in Year and count the number of disclosures in reporting year ryear. At a specfic reporting year, all firms are assumed to be exposed to similar risks. It is not important if one is reporting sooner or later.\

In [None]:
# Load BoardEX data
compo = pd.read_csv("Data/Board-Composition.csv", parse_dates=['AnnualReportDate']).drop(columns='Ticker').drop_duplicates()

compo['Year'] = pd.to_datetime(compo['AnnualReportDate']).dt.year

# Calculate number of DirectorID shared with other BoardID per year
link_df = pd.merge(
    left=compo[['BoardID', 'DirectorID', 'Year', 'CIKCode']],
    right=compo[['DirectorID', 'Year', 'BoardID', 'CIKCode']],
    on=['DirectorID', 'Year'],
    how='outer',
    suffixes=["", "_lnkd"]
)

link_df = link_df[link_df['BoardID']!=link_df['BoardID_lnkd']]

link_df = link_df.dropna(subset=['CIKCode', 'CIKCode_lnkd']).drop_duplicates().reset_index(drop=True)

# Link age
link_df = link_df.sort_values(['BoardID', 'BoardID_lnkd', 'Year']).reset_index(drop=True)
link_df['LinkTime'] = link_df.drop_duplicates(subset=['BoardID', 'BoardID_lnkd', 'Year']).groupby(['BoardID', 'BoardID_lnkd'])['Year'].cumcount()+1
link_df['LinkTime'] = link_df['LinkTime'].ffill()

# List of linked firms
lnkdCIKs = link_df.groupby(['CIKCode', 'Year'])['CIKCode_lnkd'].agg(lambda x: list(x))

disc_df['ryear'] = disc_df['report_dt'].dt.year

LnkDisc: Number of linked firms that disclose each RT in the same ryear\
OldLnkNewDisc: Number of firms linked for more than 1 year that disclose new RT in the same ryear\
NewLnkLstyrDisc: Number of new linked firms that disclosed each RT in the last ryear\
NewLnkRepDisc: Number of new linked firms that repeat each RT in ryear (disclosed last year and this year)\
LnkLstyrDisc: Number of linked firms that disclosed each RT in the last ryear

In [12]:
def lnkd_disc_count(x):
    """
    Counts the number of linked firms that disclose a specific RF in year ryear.
    """
    try:
        output = disc_df[
            (disc_df['CIK'].isin(lnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]))
            &(disc_df['ryear']==x['ryear'])
        ].filter(range(N)).sum()
    except:
        output = np.nan

    return output

# Running the function on disc_df
LnkdDisc_df = disc_df.drop(columns=range(N)).copy()
LnkdDisc_df.loc[:, range(N)] = disc_df.apply(lnkd_disc_count, axis=1)
LnkdDisc_df.drop(columns=['filing_dt-1', 'ryear', 'FF'], inplace=True)

# Create the data in long format - 
LnkdDisc_long = pd.melt(LnkdDisc_df, id_vars=["CIK", "filing_dt", "report_dt"], value_name='LnkDisc')


H: Connected firms A and B add new RF at the same time

In [13]:
# List of linked CIKs of more than 1 year old
OldlnkdCIKs = link_df[link_df['LinkTime']>1].groupby(['CIKCode', 'Year'])['CIKCode_lnkd'].agg(lambda x: list(x))

# Count the number of old linked firms disclosing new RF
disc_new_df = pd.concat([disc_df[['CIK', 'filing_dt', 'report_dt', 'ryear']], (disc_diff>0).astype(int)], axis=1)


In [14]:
def lnkd_new_count(x):
    """
    Counts the number of old linked firms that disclose a new RF in year ryear.
    """
    try:
        output = disc_new_df[
            (disc_new_df['CIK'].isin(OldlnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]))
            &(disc_new_df['ryear']==x['ryear'])
        ].filter(range(N)).sum()
    except:
        output = np.nan

    return output

# Running the function on disc_df
LnkdRep_df = disc_df.drop(columns=range(N)).copy()
LnkdRep_df.loc[:, range(N)] = disc_df.apply(lnkd_new_count, axis=1)
LnkdRep_df.drop(columns=['filing_dt-1', 'ryear', 'FF'], inplace=True)

# Create the data in long format
LnkdRep_long = pd.melt(LnkdRep_df, id_vars=["CIK", "filing_dt", "report_dt"], 
                       var_name='Topic_H', value_name='OldLnkNewDisc')

H: Firm A connects with B in year t, it adds RFs disclosed by B in year t-1

In [15]:
# List of new linked CIKs
NewlnkdCIKs = link_df[link_df['LinkTime']==1].groupby(['CIKCode', 'Year'])['CIKCode_lnkd'].agg(lambda x: list(x))

disc_lstyr_df = pd.concat([disc_df[['CIK', 'filing_dt', 'report_dt', 'ryear']], 
                           disc_df.groupby("CIK")[list(range(N))].shift(1)], axis=1)


In [16]:
def New_lnkd_LstYr_count(x):
    """
    Counts the number of New linked firms that disclose a specific RF in year ryear-1.
    """
    try:
        output = disc_lstyr_df[
            (disc_lstyr_df['CIK'].isin(NewlnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]))
            &(disc_lstyr_df['ryear']==x['ryear'])
        ].filter(range(N)).sum()
    except:
        output = pd.Series(np.ones((N,))*np.nan)

    return output

# Running the function on disc_df
NewLnkdDisc_df = disc_df.drop(columns=range(N)).copy()
NewLnkdDisc_df.loc[:, range(N)] = disc_df.apply(New_lnkd_LstYr_count, axis=1)
NewLnkdDisc_df.drop(columns=['filing_dt-1', 'ryear', 'FF'], inplace=True)

# Create the data in long format
NewLnkdDisc_long = pd.melt(NewLnkdDisc_df, id_vars=["CIK", "filing_dt", "report_dt"], value_name='NewLnkLstyrDisc')

In [17]:
# For additional analysis to check if a risk topic transfered from one firm to another is less specific
def lnkd_LstYr_count(x):
    """
    Counts the number of linked firms that disclose a specific RF in year ryear-1.
    """
    try:
        output = disc_lstyr_df[
            (disc_lstyr_df['CIK'].isin(lnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]))
            &(disc_lstyr_df['ryear']==x['ryear'])
        ].filter(range(N)).sum()
    except:
        output = pd.Series(np.ones((N,))*np.nan)

    return output

# Running the function on disc_df
LnkLstyrDisc_df = disc_df.drop(columns=range(N)).copy()
LnkLstyrDisc_df.loc[:, range(N)] = disc_df.apply(lnkd_LstYr_count, axis=1)
LnkLstyrDisc_df.drop(columns=['filing_dt-1', 'ryear', 'FF'], inplace=True)

# Create the data in long format
LnkLstyrDisc_long = pd.melt(LnkLstyrDisc_df, id_vars=["CIK", "filing_dt", "report_dt"], value_name='LnkLstyrDisc')

In [18]:
# disc_repeat_df = pd.concat([disc_df[['CIK', 'filing_dt', 'report_dt', 'ryear']], disc_repeat], axis=1)

# def New_lnkd_repeat_count(x):
#     """
#     Counts the number of linked firms that repeat a specific RF in year ryear.
#     """
#     try:
#         output = disc_repeat_df[
#             (disc_repeat_df['CIK'].isin(NewlnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]))
#             &(disc_repeat_df['ryear']==x['ryear'])
#         ].filter(range(N)).sum()
#     except:
#         output = pd.Series(np.ones((N,))*np.nan)

#     return output

# # Running the function on disc_df
# NewLnkdRep_df = disc_repeat_df.drop(columns=range(N)).copy()
# NewLnkdRep_df.loc[:, range(N)] = disc_repeat_df.apply(New_lnkd_repeat_count, axis=1)
# NewLnkdRep_df.drop(columns=['ryear'], inplace=True)

# # Create the data in long format
# NewLnkdRep_long = pd.melt(NewLnkdRep_df, id_vars=["CIK", "filing_dt", "report_dt"], 
#                        var_name='Topic_H', value_name='NewLnkRepDisc')

In [19]:
# Average specificity of RT disclosed by linked firms
Specificity_df = pd.pivot_table(
    topics_df, index = ["CIK", "filing_dt", "report_dt", 'ryear'], 
    columns='Topic_H', values='Specificity', aggfunc='sum'
).reset_index()

def lnkd_disc_Spec(x):
    """
    Average Specificity of RFs disclosed by linked firms in ryear.
    """
    try:
        output = Specificity_df[
            (Specificity_df['CIK'].isin(lnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]))
            &(Specificity_df['ryear'].isin([x['ryear'], x['ryear']-1]))
        ].filter(range(N)).mean()
    except:
        output = pd.Series(np.ones((N,))*np.nan)

    return output

# Running the function on disc_df
LnkdSpec_df = disc_df.drop(columns=range(N)).copy()
LnkdSpec_df.loc[:, range(N)] = Specificity_df.apply(lnkd_disc_Spec, axis=1)
LnkdSpec_df.drop(columns=['filing_dt-1', 'FF'], inplace=True)

# Create the data in long format
LnkdSpec_long = pd.melt(LnkdSpec_df, id_vars=["CIK", "filing_dt", "report_dt"], 
                       var_name='Topic_H', value_name='meanSpec')

In [None]:
# Average FOG of RT disclosed by linked firms
FOG_df = pd.pivot_table(
    topics_df, index = ["CIK", "filing_dt", "report_dt", 'ryear'], 
    columns='Topic_H', values='FOG'
).reset_index()

def lnkd_disc_FOG(x):
    """
    Counts the number of linked firms that disclose a specific RF in year ryear.
    """
    try:
        output = FOG_df[
            (FOG_df['CIK'].isin(lnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]))
            &(FOG_df['ryear'].isin([x['ryear'], x['ryear']-1]))
        ].filter(range(N)).mean()
    except:
        output = pd.Series(np.ones((N,))*np.nan)

    return output

# Running the function on disc_df
LnkdFOG_df = disc_df.drop(columns=range(N)).copy()
LnkdFOG_df.loc[:, range(N)] = FOG_df.apply(lnkd_disc_FOG, axis=1)
LnkdFOG_df.drop(columns=['filing_dt-1', 'ryear', 'FF'], inplace=True)

# Create the data in long format
LnkdFOG_long = pd.melt(LnkdFOG_df, id_vars=["CIK", "filing_dt", "report_dt"], 
                       var_name='Topic_H', value_name='meanFOG')

In [None]:
# Average length of RT disclosed by linked firms
length_df = pd.pivot_table(
    topics_df, index = ["CIK", "filing_dt", "report_dt", "ryear"], 
    columns='Topic_H', values='rf_length', aggfunc='sum',
).reset_index()

def lnkd_disc_Len(x):
    """
    Counts the number of linked firms that disclose a specific RF in year ryear.
    """
    try:
        output = length_df[
            (length_df['CIK'].isin(lnkdCIKs.loc[tuple(x[['CIK', 'ryear']])]))
            &(length_df['ryear'].isin([x['ryear'], x['ryear']-1]))
        ].filter(range(N)).mean()
    except:
        output = pd.Series(np.ones((N,))*np.nan)

    return output

# Running the function on disc_df
LnkdLen_df = disc_df.drop(columns=range(N)).copy()
LnkdLen_df.loc[:, range(N)] = length_df.apply(lnkd_disc_Len, axis=1)
LnkdLen_df.drop(columns=['filing_dt-1', 'ryear', 'FF'], inplace=True)

# Create the data in long format
LnkdLen_long = pd.melt(LnkdLen_df, id_vars=["CIK", "filing_dt", "report_dt"], 
                       var_name='Topic_H', value_name='meanLen')

In [22]:
disc_long['LnkDisc'] = pd.merge(
    left=disc_long,
    right=LnkdDisc_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['LnkDisc']

disc_long['OldLnkNewDisc'] = pd.merge(
    left=disc_long,
    right=LnkdRep_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['OldLnkNewDisc']

disc_long['NewLnkLstyrDisc'] = pd.merge(
    left=disc_long,
    right=NewLnkdDisc_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['NewLnkLstyrDisc']

# disc_long['NewLnkRepDisc'] = pd.merge(
#     left=disc_long,
#     right=NewLnkdRep_long,
#     on=["CIK", "filing_dt", "report_dt", "Topic_H"],
#     how='left'
# )['NewLnkRepDisc']

disc_long['LnkLstyrDisc'] = pd.merge(
    left=disc_long,
    right=LnkLstyrDisc_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['LnkLstyrDisc']

In [23]:
disc_long['meanSpec'] = pd.merge(
    left=disc_long,
    right=LnkdSpec_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['meanSpec']

disc_long['meanFOG'] = pd.merge(
    left=disc_long,
    right=LnkdFOG_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['meanFOG']

disc_long['meanLen'] = pd.merge(
    left=disc_long,
    right=LnkdLen_long,
    on=["CIK", "filing_dt", "report_dt", "Topic_H"],
    how='left'
)['meanLen']

## Merge data

In [24]:
FirmData = pd.read_csv('Data\Study2_data1_V3.csv', parse_dates=["filing_dt", "report_dt"])\
    .drop(columns=['FF', 'SIC3', 'reported', 'Specificity', 'Pa', 'Pr', 'Fu',
                   'Sentiment', 'Item1AFOG', 'repeated', 'added', 'removed', 'rfGap', 'fyear', 'ryear', 'cik', 
                   'added+1', 'removed+1', 'Specificity+1', 'Sentiment+1', 'Item1AFOG+1',
                    'added_1', 'removed_1', 
                   'Independent_1', 'Volatility_120_1', 'Beta_126_1', 
                   'IndVol__1', 'logTA_1', 'ROE_1', 'DtA_1', 'Current_1', 'RDxopr_1', 'BtM_1', 
                   'rfGap_1', 'Big4_1', 'COUNT_WEAK_1', 'NUMBEROFANALYSTS_1', 'reported_1',
                   'LnkdFirm_1', 'ShrdED_1', 'ShrdRC_1', 'ShrdAC_1']
                   )

In [25]:
FirmData["RR_len_1"] = FirmData["rf_length"] - FirmData["Delta_length"]
FirmData.drop(columns=['rf_length', 'Delta_length'], inplace=True)

In [26]:
# Drop RTs never disclosed in an industry
disc_long = disc_long[disc_long.groupby(['FF', 'Topic_H'])['IndDisc'].transform('sum')>0].reset_index(drop=True)

# Drop RTs never disclosed and always disclosed
disc_long = disc_long[disc_long.groupby(['CIK', 'Topic_H'])['DiscSum'].transform('max')>0]
disc_long = disc_long[disc_long.groupby(['CIK', 'Topic_H'])['DiscSum'].transform('min')<=1]

# Drop the first year of every firm observation
disc_long.dropna(subset=['LstYrDisc'], inplace=True)

In [27]:
Study2_data = pd.merge(
    left=disc_long,
    right=FirmData,
    on=['CIK', 'filing_dt', 'report_dt'],
    how="left"
)

In [28]:
Study2_data.drop(columns=['filing_dt', 'report_dt'], inplace=True)
Study2_data.columns

Index(['CIK', 'FF', 'Topic_H', 'Disclosed', 'DiscSum', 'TotalRFs', 'LstYrDisc',
       'New', 'Removed', 'Repeated', 'LstYrNew', 'rfGap', 'fyear', 'ryear',
       'RF_length', 'length_1', 'FOG', 'Specificity', 'IndDisc', 'IndDisc_1',
       'LnkDisc', 'OldLnkNewDisc', 'NewLnkLstyrDisc', 'LnkLstyrDisc',
       'meanSpec', 'meanFOG', 'meanLen', 'Specificity_1', 'Sentiment_1',
       'Item1AFOG_1', 'COUNT_WEAK', 'Big4', 'GenderRatio', 'NationalityMix',
       'NumberDirectors', 'TotCurrBrd', 'Age', 'ShrdDir', 'LnkdFirm', 'ShrdED',
       'ShrdRC', 'ShrdAC', 'FinLink', 'LinkTime', 'Degree', 'Independent',
       'Volatility+30', 'Volatility_30', 'Volatility+60', 'Volatility_120',
       'SHRTURN', 'Beta_126', 'NUMBEROFANALYSTS', 'rmonth', 'DtA', 'ROE',
       'NPM', 'mkvalt', 'logMC', 'at', 'logTA', 'INTtA', 'Current', 'TobinQ',
       'BtM', 'RDxopr', 'ProprietaryCost', 'ROA', 'IndVol_', 'IndVol+',
       'InstOwnership', 'NumberDirectors_1', 'ShrdDir_1', 'GenderRatio_1',
       'SharedRF

In [29]:
Study2_data = Study2_data[Study2_data.groupby('CIK')['ryear'].transform('nunique')>1]
Study2_data.dropna(subset=['Volatility_120', 'NumberDirectors'], inplace=True)

In [22]:
Study2_data.shape

(1255849, 78)

In [None]:
df = topics_df[['CIK', 'SIC', 'ryear', 'fyear']].drop_duplicates().reset_index(drop=True)
# SIC code of high-tech and regulated industries (Kothari, 2009, Kim 2012)
high_tech = []
high_tech.extend(range(2833, 2838))
high_tech.extend(range(3570, 3577))
high_tech.extend(range(3600, 3674))
high_tech.extend(range(7370, 7374))
high_tech.extend(range(8731, 8734))

retail_firms =[] 
retail_firms.extend(range(5200, 5961))

regulated = [4812, 4813, 4833, 4841, 4922, 4923, 4924, 4931, 4941]
regulated.extend(range(4811, 4900))

df['high_litigation'] = df['SIC'].isin(high_tech + retail_firms + regulated).astype(int)

Study2_data = pd.merge(
    left=Study2_data,
    right=df,
    on=['CIK', 'ryear', 'fyear'],
    how='left'
)

In [23]:
Study2_data.to_csv("Data/Study2_data2_V3.csv", index=False)

End

In [12]:
Study2_data = pd.read_csv("Data/Study2_data2_V3.csv")

In [13]:
df = Study2_data[['CIK', 'Topic_H', 'New', 'fyear', 'ryear', 'RF_length', 'FOG', 'Specificity', 'meanSpec', 'meanFOG', 'meanLen']]

In [14]:
df['Specificity'] = df['Specificity'] / df['RF_length']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Specificity'] = df['Specificity'] / df['RF_length']


In [15]:
df2 = df[df['New']==1]

In [16]:
df2 = df2[(df2['Specificity']<0.027)&(df2['meanSpec']>0.049)&(df2['RF_length']<140)&(df2['meanLen']>215)].dropna()

In [17]:
df2[df2['ryear']>2015]

Unnamed: 0,CIK,Topic_H,New,fyear,ryear,RF_length,FOG,Specificity,meanSpec,meanFOG,meanLen
2256,2488,41,1,2021,2020,65.0,55.667227,0.000000,0.053909,29.999752,333.800000
4360,3499,49,1,2023,2022,125.0,19.298039,0.008000,0.053850,24.129148,285.333333
4852,3545,76,1,2021,2021,60.0,24.433333,0.000000,0.076167,28.126522,216.000000
5090,3570,7,1,2022,2021,75.0,23.503030,0.013333,0.171446,27.005997,633.142857
5712,4127,45,1,2020,2020,80.0,27.419745,0.000000,0.131796,30.027719,569.333333
...,...,...,...,...,...,...,...,...,...,...,...
1249409,1828723,18,1,2022,2021,87.0,22.638095,0.022989,0.139771,30.371187,349.000000
1250844,1834045,6,1,2022,2022,127.0,25.179511,0.015748,0.052475,29.158199,280.200000
1252873,1846069,3,1,2023,2022,122.0,24.333806,0.000000,0.055363,22.371126,437.000000
1253551,1850902,66,1,2023,2022,116.0,23.060000,0.008621,0.086293,27.869900,760.666667


In [4]:
cik = 3545
year = 2021
T = 76

In [20]:
topics = pd.read_csv("Top2Vec\T2V_tw_5.csv")

In [None]:
topics.iloc[T]

In [9]:
link_df[(link_df['CIKCode']==cik)&(link_df['CIKCode_lnkd']==23795)]

Unnamed: 0,BoardID,DirectorID,Year,CIKCode,BoardID_lnkd,CIKCode_lnkd,LinkTime
10432,1383,509595,2018,3545.0,7720,23795.0,1.0
10433,1383,509595,2019,3545.0,7720,23795.0,2.0
10434,1383,509595,2020,3545.0,7720,23795.0,3.0
10435,1383,509595,2021,3545.0,7720,23795.0,4.0
10436,1383,509595,2022,3545.0,7720,23795.0,5.0


In [7]:
lnkdCIKs.loc[cik, 2019]

[23795.0, 1001082.0, 1045309.0]

In [31]:
topics_df[(topics_df['CIK'].isin(lnkdCIKs.loc[cik, year]+[cik]))&(topics_df['Topic_H']==T)&((topics_df['ryear']>=2020))][['Topic_H', 'Score_H', 'CIK', 'report_dt',
       'rf_seq', 'rf_length', 'NERs', 'FOG', 'Specificity']]

Unnamed: 0,Topic_H,Score_H,CIK,report_dt,rf_seq,rf_length,NERs,FOG,Specificity
8356,76,0.567291,3545,2021-09-30,33,60.0,[],24.433333,0.0
8398,76,0.573099,3545,2022-09-30,32,238.0,"[DATE, DATE, DATE, DATE, DATE, DATE, DATE, ORG]",27.397768,0.029412
8399,76,0.328958,3545,2022-09-30,33,70.0,"[ORG, GPE]",22.059259,0.014286
8439,76,0.57957,3545,2023-09-30,27,189.0,"[DATE, DATE, DATE, DATE, DATE, DATE, DATE, ORG]",42.197672,0.037037
8440,76,0.378078,3545,2023-09-30,28,103.0,"[DATE, DATE, DATE, DATE, DATE, DATE, ORG, GPE]",27.437703,0.067961
62197,76,0.70513,23795,2020-12-31,95,117.0,"[LAW, LAW, ORG, DATE, ORG, ORG, ORG, ORG, ORG,...",27.637522,0.025641
62198,76,0.698854,23795,2020-12-31,96,97.0,"[LAW, LAW, LAW, LAW, GPE, ORG]",30.199476,0.051546
62293,76,0.685778,23795,2021-12-31,88,117.0,"[LAW, LAW, ORG, DATE, ORG, ORG, ORG, ORG, ORG,...",27.92,0.025641
62294,76,0.706878,23795,2021-12-31,89,101.0,"[LAW, LAW, LAW, LAW, GPE, ORG]",26.749091,0.049505
62385,76,0.67421,23795,2022-12-31,85,117.0,"[LAW, LAW, ORG, DATE, ORG, ORG, ORG, ORG, ORG,...",27.92,0.025641


In [44]:
topics_df[(topics_df['Topic_H']==T)&(topics_df['CIK']==cik)]

Unnamed: 0,Topic,Score,Topic_H,Score_H,CIK,report_dt,filing_dt,rf_seq,ticker,filerCIK,...,Fu,Sentiment,FOG,SIC,FF,ryear,fyear,rf_seq_count,Specificity,SIC3
7277,2312,0.292648,49,0.284377,3499,2006-12-31,2007-02-26,8,ALX,3499,...,5,-0.989,23.711111,6798,48.0,2006,2007,34.0,0.063063,679
7407,480,0.4443,49,0.392172,3499,2010-12-31,2011-02-22,7,ALX,3499,...,3,-0.9906,,6798,48.0,2010,2011,35.0,0.0,679
7443,480,0.461554,49,0.410177,3499,2011-12-31,2012-02-27,8,ALX,3499,...,3,-0.9906,,6798,48.0,2011,2012,37.0,0.0,679
7481,480,0.458469,49,0.431677,3499,2012-12-31,2013-02-26,9,ALX,3499,...,3,-0.9906,,6798,48.0,2012,2013,40.0,0.0,679
7520,480,0.435195,49,0.387315,3499,2013-12-31,2014-02-24,8,ALX,3499,...,3,-0.9906,,6798,48.0,2013,2014,37.0,0.0,679
7558,480,0.46241,49,0.430639,3499,2014-12-31,2015-02-17,9,ALX,3499,...,3,-0.9906,,6798,48.0,2014,2015,38.0,0.0,679
7597,480,0.4805,49,0.419409,3499,2015-12-31,2016-02-16,10,ALX,3499,...,3,-0.9906,,6798,48.0,2015,2016,40.0,0.0,679
7637,480,0.458174,49,0.418712,3499,2016-12-31,2017-02-13,10,ALX,3499,...,3,-0.9906,,6798,48.0,2016,2017,40.0,0.0,679
7683,480,0.437239,49,0.425558,3499,2017-12-31,2018-02-12,16,ALX,3499,...,3,-0.9906,,6798,48.0,2017,2018,43.0,0.0,679
7728,480,0.46017,49,0.429638,3499,2018-12-31,2019-02-11,18,ALX,3499,...,3,-0.9906,,6798,48.0,2018,2019,44.0,0.0,679


In [45]:
topics_df[(topics_df['Topic_H']==T)&(topics_df['CIK'].isin(lnkdCIKs.loc[cik, year]))&(topics_df['report_dt']<="2022-12-31")]

Unnamed: 0,Topic,Score,Topic_H,Score_H,CIK,report_dt,filing_dt,rf_seq,ticker,filerCIK,...,Fu,Sentiment,FOG,SIC,FF,ryear,fyear,rf_seq_count,Specificity,SIC3
725675,3326,0.366292,49,0.452584,899689,2006-12-31,2007-02-27,7,VNO,899689,...,1,-0.9638,,6798,48.0,2006,2007,48.0,0.045455,679
725676,2761,0.34192,49,0.32821,899689,2006-12-31,2007-02-27,8,VNO,899689,...,4,-0.9747,20.949758,6798,48.0,2006,2007,48.0,0.110092,679
725722,2761,0.325157,49,0.307566,899689,2007-12-31,2008-02-26,6,VNO,899689,...,5,-0.9819,21.141085,6798,48.0,2007,2008,48.0,0.072072,679
725771,2761,0.313247,49,0.319381,899689,2008-12-31,2009-02-24,7,VNO,899689,...,5,-0.9698,21.058128,6798,48.0,2008,2009,49.0,0.066986,679
725874,331,0.390124,49,0.460868,899689,2010-12-31,2011-02-23,11,VNO,899689,...,3,-0.9906,,6798,48.0,2010,2011,50.0,0.0,679
725925,4649,0.348026,49,0.454251,899689,2012-12-31,2013-02-26,12,VNO,899689,...,3,-0.9906,,6798,48.0,2012,2013,56.0,0.0,679
725976,5133,0.376422,49,0.493642,899689,2013-12-31,2014-02-24,7,VNO,899689,...,3,-0.9906,,6798,48.0,2013,2014,49.0,0.0,679
725978,3505,0.359458,49,0.319407,899689,2013-12-31,2014-02-24,9,VNO,899689,...,5,-0.9612,21.060729,6798,48.0,2013,2014,49.0,0.068783,679
726025,5133,0.368575,49,0.443823,899689,2014-12-31,2015-02-17,7,VNO,899689,...,3,-0.9906,,6798,48.0,2014,2015,48.0,0.0,679
726076,2002,0.366161,49,0.508605,899689,2015-12-31,2016-02-16,10,VNO,899689,...,3,-0.9906,,6798,48.0,2015,2016,40.0,0.0,679


In [27]:
# Compare the observations in main sample and linked firms. 
# Fill the linked firms of missing observations in BoardEX with last year links
lnkdCIKs = pd.merge(
    left=disc_long[['CIK', 'ryear']].drop_duplicates(),
    right=lnkdCIKs,
    left_on=['CIK', 'ryear'],
    right_index=True,
    how='outer'
).set_index(['CIK', 'ryear']).groupby('CIK').ffill(limit=1)

In [None]:
df = pd.concat([link_df, compo[['BoardID', 'DirectorID', 'Year', 'CIKCode', 'ED', 'RiskCommittee']]])
df.sort_values(['BoardID', 'DirectorID', 'Year'], inplace=True)
df = df[(~df.duplicated(subset=['BoardID', 'DirectorID', 'Year', 'CIKCode'], keep=False))|(~df['BoardID_lnkd'].isna())]
df['SharedDir'] = df['BoardID_lnkd'].notna().astype(int)
df = df[df.groupby(['BoardID', 'DirectorID'])['SharedDir'].transform('max')>0]

In [None]:
df = compo.groupby(['CIKCode', 'Year'])['DirectorID'].agg(lambda x: set(x)).to_frame()
df['SharedDir'] = link_df.groupby(['CIKCode', 'Year'])['DirectorID'].agg(lambda x: set(x))
df[['DirectorID+1', 'SharedDir+1']] = df.groupby(['CIKCode'])[['DirectorID', 'SharedDir']].shift(-1)