In [None]:
"""
preprocess text extracted from 10K reports
ran one year at a time 2011-2020 to check output at each step
"""

In [128]:
import pandas as pd
import numpy as np
import pickle
import datetime

import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [222]:
year = 2011

In [223]:
with open('text_risk_data_' + str(year) + '.pickle', 'rb') as read_file:
    text_dict = pickle.load(read_file)
    

In [224]:
#check for errors and remove
issues = {}
for key, value in text_dict.items():
    if value == "error":
        issues[key] = value

for key in issues.keys():
    del text_dict[key]

In [225]:
#into df
text_df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in text_dict.items()])).melt().dropna()

  text_df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in text_dict.items()])).melt().dropna()


In [226]:
text_df[["ticker", "linkToTxt"]] = text_df["variable"].str.split(';', expand=True)
del text_df["variable"]

In [228]:
text_df.head()

Unnamed: 0,value,ticker,linkToTxt
0,RISK FACTORS,SBAC,https://www.sec.gov/Archives/edgar/data/103405...
1,QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...
2,ITEM 1A. RISK FACTORS,SBAC,https://www.sec.gov/Archives/edgar/data/103405...
3,Risks Related to Our Business,SBAC,https://www.sec.gov/Archives/edgar/data/103405...
4,Our foreign operations are subject to economic...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...


In [229]:
# Remove duplicates
text_df2 = text_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
text_df2.shape

(34584, 3)

In [None]:
#### CLEAN TEXT

In [230]:
# first couple years of data pulls didn't filter to word boundaries
text_df2.loc[:, 'keep'] = np.where(text_df2.loc[:, 'value'].str.contains( r"(\brisk\b|\brisks\b)", case=False, na=False), 1, 0)

  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [231]:
text_df2.keep.value_counts()

1    34584
Name: keep, dtype: int64

In [232]:
the_keeps = text_df2[text_df2.keep == 1]
the_keeps.shape

(34584, 4)

In [233]:
# Text preprocessing steps - remove HTML tags, numbers, and punctuation

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

the_keeps.loc[:,'cleaned' ] = the_keeps.loc[:,'value'].apply(remove_tags)

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_remove = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x)

the_keeps.loc[:,'preproc_text'] = the_keeps.loc[:,'cleaned'].map(alphanumeric).map(punc_remove)


In [234]:
the_keeps.head()

Unnamed: 0,value,ticker,linkToTxt,keep,cleaned,preproc_text
0,RISK FACTORS,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,RISK FACTORS,RISK FACTORS
1,QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT...,QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT...
2,ITEM 1A. RISK FACTORS,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,ITEM 1A. RISK FACTORS,ITEM RISK FACTORS
3,Risks Related to Our Business,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,Risks Related to Our Business,Risks Related to Our Business
4,Our foreign operations are subject to economic...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,Our foreign operations are subject to economic...,Our foreign operations are subject to economic...


In [235]:
# add text length and will check for very long strings, each df row should be ~ 1-2 sentences
the_keeps.loc[:,'value_len'] = the_keeps.loc[:,'value'].apply(lambda x : len(x))


In [236]:
# check for any very short text strings
the_keeps[the_keeps.value_len < 100].value.value_counts()

Risk Factors                                                              329
Risk-free interest rate                                                   205
Quantitative and Qualitative Disclosures About Market Risk                179
Interest Rate Risk                                                         64
Quantitative and Qualitative Disclosures about Market Risk                 64
                                                                         ... 
ITEM 7A.  QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET\nRISK       1
Geographic Risk.                                                            1
Item 1A — Risk Factors Related to NRG Energy,\n Inc.                        1
QUANTITATIVE   AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK.               1
Board Oversight\n    of Risk                                                1
Name: value, Length: 4432, dtype: int64

In [237]:
## some of the shorter strings are the section headers
## will remove the section headers
      # Risk Factors 
      # Quantitative and Qualitative Disclosures About Market Risk  
      # Quantitative and Qualitative Disclosures About Market Risks 
      # ITEM 1A. RISK FACTORS
      # ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK
        
the_keeps['value2'] = the_keeps['value'].apply(lambda x: " ".join(x.split()))

headers = ['risk factors', 'quantitative and qualitative disclosures about market risk',
          'quantitative and qualitative disclosures about market risk',
          'item 1a. risk factors', 'item 7a. quantitative and qualitative disclosures about market risk']    


f1_df = (the_keeps[~the_keeps['value2'].str.lower().isin(headers)])
f1_df.shape

(33105, 8)

In [238]:
# check for any very long text strings
f1_df[f1_df.value_len > 10_000].head()


Unnamed: 0,value,ticker,linkToTxt,keep,cleaned,preproc_text,value_len,value2
1209,WZ'!93.\nM[K._\!UJ]+C<=B89JZ:6NB?2-)C;W7NA(VU_...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,WZ'!93.\nM[K._\!UJ]+C6\nM&VCN'K:/:F.W7N:CR&&W[...,WZ \nM K UJ \nM VCN K F CR W NC ...,3157845,"WZ'!93. M[K._\!UJ]+C<=B89JZ:6NB?2-)C;W7NA(VU_,..."
26370661,"G\/C]UJA%,7M\nML;%(P3YVX?&@@T=VP+W$78S7KHM!0'A...",MXIM,https://www.sec.gov/Archives/edgar/data/743316...,1,"G\/C]UJA%,7M\nML;%(P3YVX?&@@T=VP+W$78S7KHM!0'A...",G C UJA \nML T VP W AG M KH...,14666579,"G\/C]UJA%,7M ML;%(P3YVX?&@@T=VP+W$78S7KHM!0'AG..."
32184859,(6) Notice of the intention of the C...,NEE,https://www.sec.gov/Archives/edgar/data/753308...,1,(6) Notice of the intention of the C...,Notice of the intention of the C...,39000285,(6) Notice of the intention of the Company to ...
37583383,"R*^GR2?$VGW;44KK)#%O_`+2[?WCC6=&#*:G""\nMY;?<N&...",AAP,https://www.sec.gov/Archives/edgar/data/115844...,1,"R*^GR2?$VGW;44KK)#%O_`+2[?WCC6=&#*:G""\nMY;?6,C...",R VGW O G \nMY CZJ M ...,7406783,"R*^GR2?$VGW;44KK)#%O_`+2[?WCC6=&#*:G"" MY;?<N&K..."
47135017,Entergy’s Utility operating companies’ rate sc...,ETR,https://www.sec.gov/Archives/edgar/data/65984/...,1,Entergy’s Utility operating companies’ rate sc...,Entergy’s Utility operating companies’ rate sc...,48716691,Entergy’s Utility operating companies’ rate sc...


In [239]:
len(f1_df[f1_df.value_len > 10_000].head())

5

In [240]:
# remove the strings over 10K that are some code and not text content
f2_df = f1_df[f1_df.value_len < 10_000]
f2_df.shape

(33086, 8)

In [None]:
#f2_df.to_excel('f2_df_review.xlsx', index=False)

In [None]:
################


In [241]:
f2_df.head()

Unnamed: 0,value,ticker,linkToTxt,keep,cleaned,preproc_text,value_len,value2
3,Risks Related to Our Business,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,Risks Related to Our Business,Risks Related to Our Business,30,Risks Related to Our Business
4,Our foreign operations are subject to economic...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,Our foreign operations are subject to economic...,Our foreign operations are subject to economic...,218,Our foreign operations are subject to economic...
5,"Our current business operations in Canada, Cos...",SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,"Our current business operations in Canada, Cos...",Our current business operations in Canada Cos...,623,"Our current business operations in Canada, Cos..."
6,The majority of our international operations a...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,The majority of our international operations a...,The majority of our international operations a...,494,The majority of our international operations a...
7,"Due to these risks, it may take\nlonger to com...",SBAC,https://www.sec.gov/Archives/edgar/data/103405...,1,"Due to these risks, it may take\nlonger to com...",Due to these risks it may take\nlonger to com...,380,"Due to these risks, it may take longer to comp..."


In [242]:
f3_df = f2_df.drop(['value_len', 'value2', 'cleaned', 'keep'], axis=1, inplace=False)

In [243]:
f3_df.head()

Unnamed: 0,value,ticker,linkToTxt,preproc_text
3,Risks Related to Our Business,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,Risks Related to Our Business
4,Our foreign operations are subject to economic...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,Our foreign operations are subject to economic...
5,"Our current business operations in Canada, Cos...",SBAC,https://www.sec.gov/Archives/edgar/data/103405...,Our current business operations in Canada Cos...
6,The majority of our international operations a...,SBAC,https://www.sec.gov/Archives/edgar/data/103405...,The majority of our international operations a...
7,"Due to these risks, it may take\nlonger to com...",SBAC,https://www.sec.gov/Archives/edgar/data/103405...,Due to these risks it may take\nlonger to com...


In [244]:
with open('../data/preproc/preproc_text_' + str(year) + '.pickle', 'wb') as to_write:
    pickle.dump(f3_df, to_write)