In [2]:
# Import libraries and functions
import glob
from pathlib import PureWindowsPath
from random import random

import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Get all raw html/text file paths
all_files = glob.glob('Item1As/*')

all_files_df = pd.DataFrame(all_files, columns=["path"])
all_files_df["name"] = all_files_df["path"].apply(lambda x: x.split("/")[-1])
all_files_df["CIK"] = all_files_df["name"].apply(lambda x: x.split("_")[0]).astype(int)
all_files_df["ticker"] = all_files_df["name"].apply(lambda x: x.split("_")[1])
all_files_df["filing_dt"] = pd.to_datetime(all_files_df["name"].apply(lambda x: x.split("_")[-4]))
all_files_df["report_dt"] = pd.to_datetime(all_files_df["name"].apply(lambda x: x.split("_")[-3]))
all_files_df["formType"] = all_files_df["name"].apply(lambda x: x.split("_")[-2])
all_files_df["filerCIK"] = all_files_df["name"].apply(lambda x: x.split("_")[-1].split('.')[0])
all_files_df["extension"] = all_files_df["name"].apply(lambda x: x.split(".")[-1])

# Filter files for the time period and drop duplicates
files = all_files_df.copy()[all_files_df["filing_dt"].dt.year > 2005]

files.sort_values(by=["CIK", "report_dt", "filing_dt"], inplace=True)
files = files[
    ~(files.duplicated(subset=["CIK", "filing_dt", "report_dt", "formType"], keep=False))|
    ~(files['extension']=='txt')
]

# # Save all files to train the W2V model
# files.to_csv("Data/All_1Afiles.csv", index=False)

In [3]:
# Load the cleaned documents and discard rows whose cleaned_txt is missing.
RF_df = (
    pd.read_csv('clean_docs_4.csv')
    .dropna(subset=['cleaned_txt'])
)

In [None]:
# Firm category table; the first CSV column is used as the index.
cat_df = pd.read_csv("Data/category.csv", index_col=0)

# Flag smaller reporting companies: the category becomes "SRC" where the text
# contains the phrase below, and missing everywhere else.
regex = r"(Smaller reporting company)"
cat_df["category"] = (
    cat_df["category"]
    .str.extract(regex, expand=False)
    .str.replace(regex, 'SRC', regex=True)
)

# Firm industry info
all_co = pd.read_excel("Data/all_companies.xlsx")

In [None]:
# Add firm categories to the files df
RF_df_extend = pd.merge(RF_df, cat_df, on="CIK", how="left")

# Add firm industry to the Risk Factors df
RF_df_extend = pd.merge(RF_df_extend, all_co[["CIK", "SIC", "Industry"]], on="CIK", how='left')

In [None]:
# Shape of the distinct (CIK, report_dt, filing_dt) combinations, followed by
# the number of distinct CIKs, one per line.
print(
    RF_df_extend[["CIK", "report_dt", "filing_dt"]].drop_duplicates().shape,
    RF_df_extend["CIK"].nunique(),
    sep="\n",
)

In [None]:
# Drop SRC firms and finance industry
# Exclude rows whose Industry is one of the two finance offices and renumber
# the index. The SRC filter (RF_df_extend["category"] != "SRC") is not applied.
RF_no_Fin = (
    RF_df_extend[
        ~RF_df_extend["Industry"].isin(["Office of Finance", "Office of Structured Finance"])
    ]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Whitespace-separated word count of each cleaned text (values cast to str first).
word_cnt = RF_no_Fin['cleaned_txt'].astype('str').str.split().str.len()
word_cnt.describe()

In [None]:
# Keep rows whose word count is at or above the 10th percentile.
min_words = word_cnt.quantile(0.1)
RF_no_Fin = RF_no_Fin[word_cnt.ge(min_words)]

In [None]:
# For every row, the number of non-null cleaned texts sharing its (CIK, report_dt).
by_report = RF_no_Fin.groupby(['CIK', 'report_dt'])
rf_count = by_report['cleaned_txt'].transform('count')
rf_count.describe()

In [None]:
# Trim both tails of the per-report count distribution: keep rows strictly
# above the 0.5% quantile and strictly below the 99.9% quantile.
lower_cut = rf_count.quantile(0.005)
upper_cut = rf_count.quantile(0.999)
RF_no_Fin = RF_no_Fin[rf_count.gt(lower_cut) & rf_count.lt(upper_cut)]

In [None]:
# Same summary after filtering: shape of the distinct (CIK, report_dt,
# filing_dt) combinations, then the number of distinct CIKs.
remaining = (
    RF_no_Fin[["CIK", "report_dt", "filing_dt"]].drop_duplicates().shape,
    RF_no_Fin["CIK"].nunique(),
)
print(*remaining, sep="\n")

In [None]:
# Rename the 'Unnamed: 0.1' column to 'rf_seq'
# (presumably an index column carried over from earlier CSV round-trips — TODO confirm).
RF_no_Fin = RF_no_Fin.rename(columns={'Unnamed: 0.1': 'rf_seq'})

In [None]:
# Display the (rows, columns) shape of the filtered frame.
RF_no_Fin.shape

In [None]:
# Save the filtered risk factors without the index.
# A forward slash is used on purpose: "\c" inside a normal string literal is an
# invalid escape sequence (a warning in recent Python versions), and outside
# Windows the backslash would become part of the file name instead of
# separating the Data directory from the file.
RF_no_Fin.to_csv("Data/clean_docs_3.csv", index=False)

In [None]:
# Reload the saved risk factors and the Item 1A table (first CSV column as index).
# Forward slashes replace "Data\..." because "\c" and "\I" are invalid escape
# sequences in a normal string literal and the backslash is not a path
# separator outside Windows.
RF_no_Fin = pd.read_csv("Data/clean_docs_3.csv")
Item1A = pd.read_csv("Data/Item1As.csv", index_col=0)

In [None]:
# Keep only the Item 1A rows whose CIK is still present in the filtered risk factors.
Item1A = Item1A[Item1A["CIK"].isin(RF_no_Fin["CIK"].unique())]

# Overwrite the source file. The path uses a forward slash because "\I" in a
# normal string literal is an invalid escape sequence and is not a path
# separator outside Windows.
# NOTE(review): this writes without the index, while the same file is read
# elsewhere with index_col=0; re-reading the rewritten file that way would turn
# its first data column into the index. Confirm which layout is intended.
Item1A.to_csv("Data/Item1As.csv", index=False)