# Unstructured Analysis


To explore the EDGAR database of filings, follow the link below. You may need to log in using the credentials below:

[Accessing Edgar DB Examples](https://drive.google.com/file/d/13w75E5wanU46zSZR1cA0I2CRTshi7wzx/view?usp=sharing)

#### MSBA Account Login to Gmail Info:
* Email: financemsba@gmail.com
* Password: qbH2jHq8sGN2KgE


In [None]:
# We need these packages to be installed, and they are not installed in collab by default.

%%capture
!pip install -i https://test.pypi.org/simple/ ut-msba-edgar-scraper
!pip install faker
!pip install pysentiment2

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

from ut_msba_edgar_scraper import Downloader
import ut_msba_edgar_scraper

# Import all of the required stuff necessary for text analysis.
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
stop_words = set( stopwords.words('english'))

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the S&P 500 membership table (gvkey, ticker, company name) from GitHub.
sp_500_members_url = 'https://github.com/Brandt-moreThan4/Data/blob/main/sp_500_members.csv?raw=true'
sp_500_df = pd.read_csv(sp_500_members_url)
sp_500_df.head(2)

Unnamed: 0,gvkey,tic,conm
0,61739,HIG,HARTFORD FINANCIAL SERVICES
1,1380,HES,HESS CORP


In [None]:
# Read the scraping log and drop its first column, which is only a saved index.
df_log = pd.read_csv('/content/drive/MyDrive/master_log.csv').iloc[:, 1:]

# Parse both date columns, then order the log by company and reporting period.
for date_col in ('period_end_date', 'file_date'):
    df_log[date_col] = pd.to_datetime(df_log[date_col])
df_log = df_log.sort_values(['gvkey', 'period_end_date'])

# Merges are done on year and month rather than the exact date, because the
# data date in EDGAR is sometimes a day off from WRDS.
period_end = df_log['period_end_date']
df_log['dd_year'] = period_end.dt.year
df_log['dd_month'] = period_end.dt.month

# Only successfully scraped filings exist in the drive folder, so keep just
# those rows, restricted to the columns that are used later on.
kept_columns = [
    'period_end_date', 'cik', 'cik_lookup', 'edgar_name', 'file_name_txt',
    'report_type', 'gvkey', 'file_date', 'dd_year', 'dd_month',
]
df_successes = df_log.loc[df_log['success'] == True, kept_columns].copy()

print(df_successes.shape)

df_successes.head(2)

(105066, 10)


Unnamed: 0,period_end_date,cik,cik_lookup,edgar_name,file_name_txt,report_type,gvkey,file_date,dd_year,dd_month
20,2000-08-31,1750,1750,AAR CORP (AIR) (CIK 0000001750),0000001004_2000-08-31_10-Q_0000912057-00-04471...,10-Q,1004,2000-10-13,2000,8
21,2000-11-30,1750,1750,AAR CORP (AIR) (CIK 0000001750),0000001004_2000-11-30_10-Q_0000912057-01-00131...,10-Q,1004,2001-01-12,2000,11


In [None]:
# Restrict the successful-filings log to companies in the S&P 500 member list.
sp_500_gvkeys = sp_500_df['gvkey'].unique()
df_log_500 = df_successes.loc[df_successes['gvkey'].isin(sp_500_gvkeys)]

Create our helper functions here:

In [None]:
def get_filing_text(file_name: str, *,
                    root: str = '/content/drive/MyDrive/edgar_filings/',
                    encoding: str = 'utf-8') -> str:
    """Return the full text of a filing stored in the filings folder.

    Args:
        file_name: Name of the filing file, relative to ``root``.
        root: Folder that holds the filings. Defaults to the shared Google
            Drive folder, so existing one-argument calls behave as before.
        encoding: Text encoding used to decode the file. It is stated
            explicitly so the result does not depend on the machine's locale.

    Returns:
        The complete contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        UnicodeDecodeError: If the file is not valid text in ``encoding``.
    """
    with open(Path(root) / file_name, 'r', encoding=encoding) as f:
        return f.read()

In [None]:
# Smoke test that the mounted drive is readable. This sometimes has to be run
# a few times before the drive responds.
apple_file_names = df_successes.loc[df_successes['gvkey'] == 1690, 'file_name_txt']  # 1690 = Apple
report_text = get_filing_text(apple_file_names.iloc[1])
report_text

The regex-based helper below splits a report's text into individual sentences.

In [None]:

import re

# Regex fragments used by split_into_sentences. They are raw strings so that
# escapes such as \s reach the regex engine unchanged and Python does not warn
# about invalid string escape sequences.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text: str) -> list:
    """Split a block of text into a list of lower-cased sentence strings.

    Periods that do not end a sentence (titles, initials, acronyms, company
    suffixes, web domains) are temporarily replaced by the marker ``<prd>``.
    Every remaining ``.``, ``?`` or ``!`` is followed by a ``<stop>`` marker and
    the text is split on that marker.

    Behaviour worth knowing:
        * Newlines are treated as spaces and the result is lower-cased.
        * Text after the last sentence terminator is discarded, so input
          without any terminator yields an empty list.
        * When a suffix such as "Inc." is directly followed by a sentence
          starter ("He ", "We ", ...), the sentence is split there and the
          suffix's period is dropped.

    Args:
        text: Raw text. Original casing should be kept because several of the
            patterns rely on upper-case letters.

    Returns:
        The sentences, stripped of surrounding whitespace, in document order.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that belong to titles, domains, degrees and initials.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)

    # An acronym followed by a sentence starter ends the sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)

    # A company suffix followed by a sentence starter ends the sentence;
    # any other suffix period is protected.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminators outside closing quotes so the quote stays with its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    text = text.lower()

    # The last piece is whatever follows the final terminator; it is dropped.
    sentences = text.split("<stop>")[:-1]
    return [s.strip() for s in sentences]

In [None]:
# Load the phrase dictionary from GitHub; each column holds the phrases of one category.
words_dict_url = 'https://github.com/Brandt-moreThan4/Data/blob/main/words_dict.csv?raw=True'
df_dict = pd.read_csv(words_dict_url)
df_dict.head()

Unnamed: 0,countries,regions,international,revenue,expense,growth,caution
0,Afghanistan,asia,exchange rate,revenue,expense,increase,reduc
1,Albania,africa,u.s dollar,sales,cost,higher,decrease
2,Algeria,U.K.,foreign,profit,fees,double,decline
3,Andorra,americas,yuan,margin,loss,grow,lower
4,Angola,middle east,pound,income,tax,,


In [None]:
# Turn each category of the phrase table into a list of lower-cased phrases,
# skipping empty cells. The "international" list pools three columns, in the
# order countries, regions, international.
dict_international = [
    phrase
    for column in ('countries', 'regions', 'international')
    for phrase in df_dict[column].dropna().str.lower().tolist()
]
dict_revenue = df_dict['revenue'].dropna().str.lower().tolist()
dict_expense = df_dict['expense'].dropna().str.lower().tolist()
dict_growth = df_dict['growth'].dropna().str.lower().tolist()
dict_caution = df_dict['caution'].dropna().str.lower().tolist()

# Swap the last line for any of these to inspect another list.
# dict_international
# dict_caution
# dict_expense
# dict_growth
dict_revenue

['revenue', 'sales', 'profit', 'margin', 'income', 'earnings', 'demand']

## Now for the fun:

In [None]:
# Run this line if you want to load in half completed results df
# df_results = pd.read_csv('/content/drive/MyDrive/Bull_Stearns/sp_word_counts_master.csv')
# df_results.head()

In [None]:
# Distinct company identifiers among the S&P 500 filings available in the drive.
gvkeys = df_log_500['gvkey'].unique()
print(gvkeys.size)
gvkeys[:5]

282


array([1045, 1075, 1078, 1161, 1209])

### Google Drive Text Extraction

In [None]:
# from tqdm import tqdm
# from nltk.tokenize import sent_tokenize

# Columns that hold integer counts.
# NOTE(review): ' expense_slow_count' carries a leading space. It is kept as is
# because the JIT extraction cell further down indexes df_results with the same key.
count_columns = ['international_count', 'rev_growth_count', 'rev_slow_count',
                 'expense_growth_count', ' expense_slow_count']

company_count = 0
df_results = pd.DataFrame(columns=['gvkey', 'data_date', 'file_date', 'type'] + count_columns)

for gv in gvkeys:
# for gv in [1690]: # Apple

    print('\nGVKEY:', gv)
    df_temp = df_successes[df_successes['gvkey'] == gv]
    for filing in df_temp.itertuples(index=False):
        # Bound before the try so the error message always describes this filing.
        data_date = filing.period_end_date  # Period end date (used for the merge); file date is kept as its own column.
        rep_type = filing.report_type
        file_date = filing.file_date

        try:
            report_text = get_filing_text(filing.file_name_txt)

            # International phrases are counted over the whole document, case-insensitively.
            text_lowered = report_text.lower()
            count_international = sum(text_lowered.count(phrase) for phrase in dict_international)

            # The raw text is passed in because the splitter's patterns rely on
            # upper-case letters; the returned sentences are already lower-cased.
            sentences = split_into_sentences(report_text)

            # Count how many sentences fall into each bucket.
            rev_growth_count, rev_slow_count, expense_growth_count, expense_slow_count = 0, 0, 0, 0
            for s in sentences:
                rev_count = sum(s.count(phrase) for phrase in dict_revenue)
                exp_count = sum(s.count(phrase) for phrase in dict_expense)
                gro_count = sum(s.count(phrase) for phrase in dict_growth)
                cau_count = sum(s.count(phrase) for phrase in dict_caution)

                # Positive: more revenue than expense phrases / more growth than caution phrases.
                net_rev_count = rev_count - exp_count
                net_growth_count = gro_count - cau_count

                # Sentences with a zero net on either axis are not counted.
                if net_rev_count > 0 and net_growth_count > 0:
                    rev_growth_count += 1
                elif net_rev_count > 0 and net_growth_count < 0:
                    rev_slow_count += 1
                elif net_rev_count < 0 and net_growth_count > 0:
                    expense_growth_count += 1
                elif net_rev_count < 0 and net_growth_count < 0:
                    expense_slow_count += 1

            df_results.loc[len(df_results)] = [gv, data_date, file_date, rep_type, count_international,
                                               rev_growth_count, rev_slow_count,
                                               expense_growth_count, expense_slow_count]

        except Exception as e:
            # Best effort: report the failing filing and its cause, then move on
            # so that one unreadable file does not abort the whole run.
            print(f'Error for {gv}, {data_date}, {rep_type}')
            print(str(e))

    # Checkpoint the results every five companies, both to the drive and locally.
    company_count += 1
    if company_count % 5 == 0:
        df_results.to_csv(f'/content/drive/MyDrive/Bull_Stearns/sp_word_counts_{company_count}.csv', index=False)
        df_results.to_csv(f'sp_word_counts_{company_count}.csv', index=False)


df_results[count_columns] = df_results[count_columns].astype(int)
df_results.to_csv(f'sp_word_counts_{company_count}.csv', index=False)

df_results.to_csv('/content/drive/MyDrive/Bull_Stearns/sp_word_counts_master.csv', index=False)

In [None]:
# Display the per-filing counts accumulated so far.
df_results

Unnamed: 0,gvkey,data_date,file_date,type,international_count,rev_growth_count,rev_slow_count,expense_growth_count,expense_slow_count
0,1045,2000-06-30,2000-07-28,10-Q,14,17,3,14,0
1,1045,2001-03-31,2001-04-24,10-Q,15,4,6,11,1
2,1045,2001-06-30,2001-08-13,10-Q,14,9,10,20,3
3,1045,2001-09-30,2001-10-25,10-Q,17,1,14,24,9
4,1045,2002-03-31,2002-04-18,10-Q,8,0,9,11,2
...,...,...,...,...,...,...,...,...,...
21496,189491,2020-12-31,2021-02-26,10-K,302,26,37,34,28
21497,189491,2021-03-31,2021-05-04,10-Q,114,37,2,12,9
21498,189491,2021-06-30,2021-08-03,10-Q,147,58,0,15,10
21499,189491,2021-09-30,2021-11-02,10-Q,153,55,6,18,12


### JIT Text Extraction

Unfortunately, not all filings can be found in the drive, so we ran the JIT scraper to grab the rest. Running the cell below will probably take around 10 hours.

In [None]:
# S&P 500 members that do not have any rows in df_results yet.
df_500_no_results = sp_500_df.loc[~sp_500_df['gvkey'].isin(df_results['gvkey'].unique())]
gvkeys = df_500_no_results['gvkey'].unique()

print(df_500_no_results.shape)

# Retry every S&P 500 company, not just the missing ones (this replaces the list above).
gvkeys = sorted(sp_500_df['gvkey'].unique())
print(len(gvkeys))
# gvkeys

(2, 3)
497


In [None]:
# The Downloader drives the JIT scraper; it is the fallback for filings that
# are not available in the drive.
downloader = Downloader()
company_count = df_results.gvkey.nunique()  # How many companies are currently in the results

# Columns that hold integer counts.
# NOTE(review): ' expense_slow_count' carries a leading space; it matches the
# column name df_results was created with.
count_columns = ['international_count', 'rev_growth_count', 'rev_slow_count',
                 'expense_growth_count', ' expense_slow_count']

# for gv in [1380]:
for gv in gvkeys:
    print('\nGVKEY:', gv)

    try:
        df_gv = df_results[df_results.gvkey == gv]  # Reports already counted for this gvkey
        filings_10_ks = downloader.get_filings('10-K', str(gv), before='2021-01-01')
        filings_10_qs = downloader.get_filings('10-Q', str(gv), before='2021-01-01')
        all_filings = filings_10_ks + filings_10_qs

        for f in all_filings:
            # Reset per filing so the error message never shows a previous filing's values.
            data_date = rep_type = None
            try:
                data_date = f.data_date  # Period end date (used for the merge); file date is kept as its own column.
                # NOTE(review): this is an exact-value match - confirm that f.data_date
                # has the same type and value as the dates stored in df_results.
                if data_date in df_gv.data_date.values:
                    continue  # Don't re-download this report

                rep_type = f.report_type
                file_date = f.file_date

                report_text = f.get_report(type='text')
                if report_text is None:
                    continue

                # International phrases are counted over the whole document, case-insensitively.
                text_lowered = report_text.lower()
                count_international = sum(text_lowered.count(phrase) for phrase in dict_international)

                # The raw text is passed in because the splitter's patterns rely on
                # upper-case letters; the returned sentences are already lower-cased.
                sentences = split_into_sentences(report_text)

                # Count how many sentences fall into each bucket.
                rev_growth_count, rev_slow_count, expense_growth_count, expense_slow_count = 0, 0, 0, 0
                for s in sentences:
                    rev_count = sum(s.count(phrase) for phrase in dict_revenue)
                    exp_count = sum(s.count(phrase) for phrase in dict_expense)
                    gro_count = sum(s.count(phrase) for phrase in dict_growth)
                    cau_count = sum(s.count(phrase) for phrase in dict_caution)

                    # Positive: more revenue than expense phrases / more growth than caution phrases.
                    net_rev_count = rev_count - exp_count
                    net_growth_count = gro_count - cau_count

                    # Sentences with a zero net on either axis are not counted.
                    if net_rev_count > 0 and net_growth_count > 0:
                        rev_growth_count += 1
                    elif net_rev_count > 0 and net_growth_count < 0:
                        rev_slow_count += 1
                    elif net_rev_count < 0 and net_growth_count > 0:
                        expense_growth_count += 1
                    elif net_rev_count < 0 and net_growth_count < 0:
                        expense_slow_count += 1

                df_results.loc[len(df_results)] = [gv, data_date, file_date, rep_type, count_international,
                                                   rev_growth_count, rev_slow_count,
                                                   expense_growth_count, expense_slow_count]

            except Exception as e:
                # Best effort: report the failing filing and keep going.
                print(f'Error for {gv},{data_date} {rep_type}')
                print(str(e))

    except Exception as e:
        # Company-level failure (e.g. the filing list could not be fetched). No
        # report type is printed here: none may have been read yet, and formatting
        # an unbound name would raise inside the handler and stop the run.
        print(f'Error somewhere for {gv}')
        print(str(e))

    # Checkpoint the results every five companies, both to the drive and locally.
    company_count += 1
    if company_count % 5 == 0:
        df_results.to_csv(f'/content/drive/MyDrive/Bull_Stearns/sp_word_counts_{company_count}.csv', index=False)
        df_results.to_csv(f'sp_word_counts_{company_count}.csv', index=False)


df_results[count_columns] = df_results[count_columns].astype(int)
df_results.to_csv(f'sp_word_counts_{company_count}.csv', index=False)

df_results.to_csv('/content/drive/MyDrive/Bull_Stearns/sp_word_counts_master.csv', index=False)

In [None]:
# Summary statistics of how many rows (non-null data_date) each company has in the results.
filings_per_gvkey = df_results.groupby('gvkey')['data_date'].count()
filings_per_gvkey.describe()

count    495.000000
mean      71.666667
std       20.387541
min        4.000000
25%       75.000000
50%       80.000000
75%       83.000000
max       87.000000
Name: data_date, dtype: float64

## Merging in External Dictionary

Below is code that you can use to merge in the results from above onto the external dictionary that gives sentiment scores. We have saved the merged file on github so that we can reference it in our other notebook.

In [None]:
# # Scraper Code Results
# df_sp = pd.read_csv('https://github.com/Brandt-moreThan4/Data/blob/main/sp_word_counts_master.csv?raw=true') # Unstructured Variables
# df_sp['file_date'] = pd.to_datetime(df_sp['file_date'])
# df_sp['data_date'] = pd.to_datetime(df_sp['data_date'])
# df_sp['file_year_month'] = df_sp['file_date'].dt.strftime('%Y-%m')
# df_sp['data_year_month'] = df_sp['data_date'].dt.strftime('%Y-%m')
# df_sp.head(2)

# # Loughran-McDonald results, does not have GVKEY. See Data sources, for the link to this file.
# df_lm = pd.read_csv('/content/drive/MyDrive/Finance_Ratings/Loughran-McDonald_10X_Summaries_1993-2021.csv')

# # Get CIK - GVKEY Mapping and merge
# df_map = pd.read_csv('/content/drive/MyDrive/Finance_Ratings/GVKEY_CIK_Mapping.csv')[['gvkey','cik']].drop_duplicates()

# # Merge
# df_lm2 = df_lm.merge(df_map, left_on = 'CIK', right_on = 'cik')
# df_lm2['FILING_DATE'] = pd.to_datetime(df_lm2['FILING_DATE'].astype(str), format='%Y%m%d')
# # df_lm2['file_year_month'] = df_lm2['FILING_DATE'].dt.strftime('%Y-%m')
# df_lm2.rename(columns = {'FILING_DATE':'file_date'}, inplace = True)

# df_lm2.head(2)

# # Merged Unstructured Data

# df_fil = df_sp.merge(df_lm2, on = ['gvkey','file_date'])
# df_fil['lm_positive_count'] = df_fil['N_Positive'] - df_fil['N_Negation']
# df_fil['data_date'] = pd.to_datetime(df_fil['data_year_month'])
# df_fil = df_fil.iloc[:,[0,1,10,9,3,4,5,6,7,8,36,20,22,23]]
# df_fil.rename(columns={'N_Negative':'lm_negative_count','N_Uncertainty':'lm_uncertainty_count','N_Litigious':'lm_litigious_count'},inplace=True)
# df_fil.head(2)