Download SEC filing data

In [1]:
from sec_edgar_downloader import Downloader
import os
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Initialize a downloader instance. If no argument is passed
# to the constructor, the package will download filings to
# the current working directory.

# dl = Downloader("/path/to/valid/save/location")

currentDirectory = os.getcwd()
print(currentDirectory)
dl = Downloader(currentDirectory + '/data/download/')

Download dow30 10-K 10-Q

In [None]:
dow30 = ['AAPL','AMGN','AXP','BA','CAT','CRM','CSCO','CVX','DIS','DOW','GS','HD','HON','IBM','INTC','JNJ','JPM','KO','MCD','MMM','MRK','MSFT','NKE','PG','TRV','UNH','V','VZ','WBA','WMT']
for x in dow30:
    dl.get("10-Q",x,amount=6)
    dl.get("10-K",x,amount=5)

Functions to parse report

In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')] ## change to 3 
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)



In [439]:
def parse_10k(company, filepath): 
    f = open(filepath)
    raw_10k = f.read()
    
    ## find start end range to avoid xml parsing problems ##
    import re
    # Regex to find <DOCUMENT> tags
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    # Regex to find <TYPE> tag prceeding any characters, terminating at new line
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
    
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
    
    document = {}

    # Create a loop to go through each section type and save only the 10-K section in the dictionary
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            #print(doc_start)
            #print(doc_end)
            doc_10k = raw_10k[doc_start:doc_end].lower()
            break
            
    doc_10k_bs = BeautifulSoup(doc_10k, 'lxml')
    doc_10k_str = str(doc_10k_bs)
    
    #i7 = "management’s discussion and analysis of financial condition and results of operations"
    i7 = "s discussion and analysis of financial condition and results of operations"
    i7a = 'quantitative and qualitative disclosures about market risk' 
    if company == 'GS':
        i7 = 's discussion and analysis of financial condition and results of operations'
        i7a = 'quantitative and qualitative disclosure about market risk' 
    if company == 'INTC':
        i7 = 's discussion and analysis \(md&amp;a\)'
        i7a = 'font-weight:bold;">other key information'
    if company == 'JNJ':
        i7 = "s discussion and analysis of results of operations and financial condition"
    if company == 'MSFT':
        i7a = 'ive disclosures about market risk'
    if company == 'IBM':
        i7 = 'item\xa07. management’s discussion and analysis of financial condition and results of operations.<'
    if company == 'MCD':
        i7 = 'item\xa07. management’s discussion and analysis of financial condition and results of operations'
    
    
    sec7_find = re.finditer(i7, doc_10k_str)
    sec7a_find = re.finditer(i7a, doc_10k_str)
    
    
    # moving keyword range to dataframe
    sec7_start_range = []
    sec7_end_range = []
    for x in sec7_find: 
        #print(x.start())
        sec7_start_range.append(x.start())
        sec7_end_range.append(x.end())
        sec7_df = pd.DataFrame({'start': sec7_start_range,'end': sec7_end_range})
    if not sec7_start_range: 
        return sec7_start_range
    sec7a_start_range = []
    sec7a_end_range = []    
    for x in sec7a_find: 
        sec7a_start_range.append(x.start())
        sec7a_end_range.append(x.end())
        sec7a_df = pd.DataFrame({'start': sec7a_start_range,'end': sec7a_end_range})
    if not sec7a_start_range: 
        return sec7a_start_range
    
    #qc check
    for index, row in sec7_df.iterrows():
        print('7' , str(row['start']) , str(row['end']))
    for index, row in sec7a_df.iterrows():
        print('7a', str(row['start']), str(row['end']))
        
    # get possible ranges 
    possible_start_range = []
    possible_end_range = []
    for index, row7a in sec7a_df.iterrows():
        #print(row7a['start'])
        filtered = []
        for i, row7 in sec7_df.iterrows():
            #print(row7['end'])
            if row7['end'] < row7a['start']:
                filtered.append(row7['end'])
        #print(filtered)
        if not filtered:
            continue # nothing before item 7a
        max_y = max(filtered)
        possible_start_range.append(max_y)
        possible_end_range.append(row7a['start']-12)

        # qc 
        #for x in possible_start_range:
        #    print(x)

        # get the largest possible range
    possible_df = pd.DataFrame({'start': possible_start_range,'end': possible_end_range})
    possible_df['delta'] = possible_df['end'] - possible_df['start']
    print(possible_df.loc[possible_df['delta'].idxmax()])

    item7_df = possible_df.loc[possible_df['delta'].idxmax()]
    
    item_7_content = doc_10k[item7_df['start']:item7_df['end']]
    item_7_cleaned = preprocess(item_7_content)
    bagOfWordsA = item_7_cleaned.split(' ')
    
    return bagOfWordsA


In [None]:
def parse_10q(company, filepath): 
    f = open(filepath)
    raw_10k = f.read()
    
    ## find start end range to avoid xml parsing problems ##
    import re
    # Regex to find <DOCUMENT> tags
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    # Regex to find <TYPE> tag prceeding any characters, terminating at new line
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
    
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
    
    document = {}

    # Create a loop to go through each section type and save only the 10-K section in the dictionary
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            #print(doc_start)
            #print(doc_end)
            doc_10k = raw_10k[doc_start:doc_end].lower()
            break
            
    doc_10k_bs = BeautifulSoup(doc_10k, 'lxml')
    doc_10k_str = str(doc_10k_bs)
    
    #i7 = "management’s discussion and analysis of financial condition and results of operations"
    i7 = "s discussion and analysis of financial condition and results of operations"
    i7a = 'quantitative and qualitative disclosures about market risk' 
    if company == 'GS':
        i7 = 's discussion and analysis of financial condition and results of operations'
        i7a = 'quantitative and qualitative disclosure about market risk' 
    if company == 'INTC':
        i7 = 's discussion and analysis \(md&amp;a\)'
        i7a = 'font-weight:bold;">other key information'
    if company == 'JNJ':
        i7 = "s discussion and analysis of results of operations and financial condition"
    if company == 'MSFT':
        i7a = 'ive disclosures about market risk'
    if company == 'IBM':
        i7 = 'item\xa07. management’s discussion and analysis of financial condition and results of operations.<'
    if company == 'MCD':
        i7 = 'item\xa07. management’s discussion and analysis of financial condition and results of operations'
    
    
    sec7_find = re.finditer(i7, doc_10k_str)
    sec7a_find = re.finditer(i7a, doc_10k_str)
    
    
    # moving keyword range to dataframe
    sec7_start_range = []
    sec7_end_range = []
    for x in sec7_find: 
        #print(x.start())
        sec7_start_range.append(x.start())
        sec7_end_range.append(x.end())
        sec7_df = pd.DataFrame({'start': sec7_start_range,'end': sec7_end_range})
    if not sec7_start_range: 
        return sec7_start_range
    sec7a_start_range = []
    sec7a_end_range = []    
    for x in sec7a_find: 
        sec7a_start_range.append(x.start())
        sec7a_end_range.append(x.end())
        sec7a_df = pd.DataFrame({'start': sec7a_start_range,'end': sec7a_end_range})
    if not sec7a_start_range: 
        return sec7a_start_range
    
    #qc check
    for index, row in sec7_df.iterrows():
        print('7' , str(row['start']) , str(row['end']))
    for index, row in sec7a_df.iterrows():
        print('7a', str(row['start']), str(row['end']))
        
    # get possible ranges 
    possible_start_range = []
    possible_end_range = []
    for index, row7a in sec7a_df.iterrows():
        #print(row7a['start'])
        filtered = []
        for i, row7 in sec7_df.iterrows():
            #print(row7['end'])
            if row7['end'] < row7a['start']:
                filtered.append(row7['end'])
        #print(filtered)
        if not filtered:
            continue # nothing before item 7a
        max_y = max(filtered)
        possible_start_range.append(max_y)
        possible_end_range.append(row7a['start']-12)

        # qc 
        #for x in possible_start_range:
        #    print(x)

        # get the largest possible range
    possible_df = pd.DataFrame({'start': possible_start_range,'end': possible_end_range})
    possible_df['delta'] = possible_df['end'] - possible_df['start']
    print(possible_df.loc[possible_df['delta'].idxmax()])

    item7_df = possible_df.loc[possible_df['delta'].idxmax()]
    
    item_7_content = doc_10k[item7_df['start']:item7_df['end']]
    item_7_cleaned = preprocess(item_7_content)
    bagOfWordsA = item_7_cleaned.split(' ')
    
    return bagOfWordsA



In [3]:
def read_file_date(filepath):
    # Regex to find <DOCUMENT> tags
    date_start_pattern = re.compile(r'FILED AS OF DATE:')
    date_end_pattern = re.compile(r'DATE AS OF CHANGE:')
    # Regex to find <TYPE> tag prceeding any characters, terminating at new line
    date_type_pattern = re.compile(r'<TYPE>[^\n]+')

    f = open(filepath, "r")

    raw_10k = f.read()
    document = {}

    date_start_is = [x.end() for x in date_start_pattern.finditer(raw_10k)]
    date_end_is = [x.start() for x in date_end_pattern.finditer(raw_10k)]
    date_types = [x[len('<TYPE>'):] for x in date_type_pattern.findall(raw_10k)]

    # Create a loop to go through each section type and save only the 10-K section in the dictionary
    for date_type, date_start, date_end in zip(date_types, date_start_is, date_end_is):
        if date_type == '10-K':
            ddd = raw_10k[date_start:date_end]
            ddd = ddd.strip()
        if date_type == '10-Q':
            ddd = raw_10k[date_start:date_end]
            ddd = ddd.strip()
    return ddd

Parse 10K 

In [440]:
##### Keywords Problem SOVLED
## GS: one report omitted item 7, 
##     "management\x92s discussion and analysis of financial condition and results of operations" 
##.    'quantitative and qualitative disclosures about market risk'  <<-- disclosure no s in report
## MSFT: 7a keyword disconnected with html in between qualitat <some html> ive
    # solution: try 'ive disclosures about market risk' for item 7a
## JNJ: switch  " analysis of financial condition and results of operations" => results of operations and financial condidtion
    #solution: switch and 
## INTC: keyword format different, Management's Discussion and analysis (md&A) - results of operations
    # some report have repeat 
    #solution: use special keyword, 'font-weight:bold;">other key information'
## IBM: one report omitted, keyword repeated each page break; Operations – (continued)
    #solution: custom keyword put period at the end of keyword
## MCD: keyword in between 2 keywords *** not all reports in same format
    # solution: use exact: ITEM 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
## HD: one ok, the other t’s -> t's 
    
###----
## CVX: under the FS-1 section, repeated keyword when page break 
    # solution: XOM
## BA: report omitted item 7 
    # solution download more reports, read more reports 
## JPM: keyword = no item 7 
    # maybe use C (citi bank) 10K?

# 10K -> 5 + 10Q -> 15 

## Can't parse: BA, GS,'IBM','INTC','JNJ','JPM','MCD','MSFT','WBA'
## can't parse 1/2 time: ('V'),('VZ')
dow30 = ['AAPL','AMGN','AXP','BA','CAT','CRM','CSCO','CVX','DIS','DOW','GS','HD','HON','IBM','INTC','JNJ','JPM','KO','MCD','MMM','MRK','MSFT','NKE','PG','TRV','UNH','V','VZ','WBA','WMT']
#
dow30 = ['IBM','INTC','JNJ','JPM','MCD','MSFT','V','VZ','WBA']
dow30 = ['MCD']
companyk = []
file_namek = []
doc_typek = []
file_datek = []
contentk = []
for x in dow30: # company loop
    i = 0
    currentDirectory = os.getcwd()
    file_dir = currentDirectory + '/data/download/'
    file_dir = file_dir + 'sec-edgar-filings/' + x + '/10-K/'
    for name in os.listdir(file_dir): 
        if not name.startswith('.') and os.path.isfile(os.path.join(file_dir,name)+'/full-submission.txt'):
            
            if i >=5: # only get 2 10K files from each company
                break
            print(os.path.join(file_dir,name)+'/full-submission.txt')
            item7 = parse_10k(x, os.path.join(file_dir,name)+'/full-submission.txt')
            print(len(item7))
            contentk.append(item7)
            companyk.append(x)
            file_namek.append(name)
            doc_typek.append('10-K')
            file_d = read_file_date(os.path.join(file_dir,name)+'/full-submission.txt')
            file_datek.append(file_d)
            i = i+1
            print(i)

/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/MCD/10-K/0000063908-17-000017/full-submission.txt
7 377439 377532
7a 41425 41483
7a 1152278 1152336
7a 1152758 1152816
start     377532
end      1152746
delta     775214
Name: 1, dtype: int64
6398
1
/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/MCD/10-K/0000063908-20-000022/full-submission.txt
0
2
/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/MCD/10-K/0000063908-19-000010/full-submission.txt
7 506519 506612
7a 205465 205523
7a 1300944 1301002
7a 1301424 1301482
start     506612
end      1301412
delta     794800
Name: 1, dtype: int64
5935
3
/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/MCD/10-K/0000063908-21-000013/full-submission.txt
0
4
/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/MCD/10-K/000

In [None]:
Management&#8217;s Discussion and Analysis of Financial Condition and Results of Operations.

In [None]:
ITEM&#160;7. Management&#8217;s Discussion and Analysis of Financial Condition and Results of Operations

step by step testing

In [114]:
import sys
print(sys.getrecursionlimit())
sys.setrecursionlimit(10000)

3000


In [426]:
filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/CAT/10-K/0000018230-21-000063/full-submission.txt'
filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/GS/10-K/0001193125-21-102511/full-submission.txt'
filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/IBM/10-K/0001558370-20-001786/full-submission.txt'
filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/JNJ/10-K/0000200406-21-000008/full-submission.txt'

filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/MCD/10-K/0000063908-17-000017/full-submission.txt'
filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/MSFT/10-K/0001564590-21-039151/full-submission.txt'
filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/INTC/10-K/0000050863-19-000007/full-submission.txt'
filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/IBM/10-K/0001558370-20-001786/full-submission.txt'
filepath = '/Users/wailunchung/Documents/GitHub/DS6120_Capstone_A_dey_chung/data/download/sec-edgar-filings/MCD/10-K/0000063908-17-000017/full-submission.txt'

In [427]:

f = open(filepath)
raw_10k = f.read()

In [428]:
import re
# Regex to find <DOCUMENT> tags
doc_start_pattern = re.compile(r'<DOCUMENT>')
doc_end_pattern = re.compile(r'</DOCUMENT>')
# Regex to find <TYPE> tag prceeding any characters, terminating at new line
type_pattern = re.compile(r'<TYPE>[^\n]+')
doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

In [429]:
doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

In [430]:
document = {}

    # Create a loop to go through each section type and save only the 10-K section in the dictionary
for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
    if doc_type == '10-K':
        print(doc_start)
        print(doc_end)
        #doc_10k = raw_10k[doc_start:doc_end].lower()
        document[doc_type] = raw_10k[doc_start:doc_end].lower()
        doc_10k = raw_10k[doc_start:doc_end].lower()
        break

955
2697627


In [431]:
len(document['10-K'])

2696672

In [432]:
doc_10k = BeautifulSoup(doc_10k, 'lxml')

In [433]:
doc_10k_str = str(doc_10k)

In [298]:
#doc_10k_str[901657:901715] #7a
doc_10k_str[3859171:3859496]
doc_10k_str[371150:371375]

'ze:18pt;"><font style="font-family:arial;font-size:18pt;color:#ffffff;font-weight:bold;">management\'s discussion and analysis (md&amp;a)</font></div></td></tr></table></div></div><div style="line-height:120%;font-size:9pt;"><'

In [None]:
INTC
     start      end
0    37502    37560
1   901657   901715
2  3865798  3865856

0    32210    32235
1   371250   371275
2  3859271  3859296


In [231]:
V
7 23016 23090
7 149192 149266
7a 23532 23590

0  363612  363670
1  412134  412192

SyntaxError: invalid syntax (<ipython-input-231-424a6fb0b984>, line 2)

In [None]:
Microsoft 
7 360718 360792
7 411342 411416
7 412052 412126
7 446022 446096
7 853795 853869
7 854089 854163 <-- this item 7 starts 
7 854776 854850 <-- not exactly but this also okay
7a 363612 363670
7a 412134 412192

854177	854268 item 7 content start 
1394710	1394763 after jolie
1394444	1394458 jolie
1392078	1392259 before jolie
1396791	1396849 after item 7a we are exposed

In [177]:
doc_10k_str[1394710:1394763]

'corporate vice president and chief accounting officer'

In [435]:
doc_10k_str[377258:377532]

'al-align:bottom;padding-left:2px;padding-top:2px;padding-bottom:2px;padding-right:2px;"><div style="text-align:left;font-size:12pt;"><font style="font-family:arial;font-size:12pt;">item\xa07. management’s discussion and analysis of financial condition and results of operations'

In [437]:
i7 = "management’s discussion and analysis of financial condition and results of operations"
i7 = "management's discussion and analysis of financial condition and results of operations"
i7 = "s discussion and analysis of financial condition and results of operations"
i7 = 's discussion and analysis \(md&amp;a\)'
i7a = "trategies and predictions of our future activities or other future events or conditions"
#i7 = "results of operations"
sec7_find = re.finditer(i7, doc_10k_str)
i7a = 'management&#8217;s discussion and analysis of financial condition and results of operations.'
i7a = 's discussion and analysis of financial condition and results of operations.'
i7a = 'item\xa07. management’s discussion and analysis of financial condition and results of operations'
#Item&#160;7. Management&#8217;s Discussion and Analysis of Financial Condition and Results of Operations.
#i7a = 'other key information'
       #QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK
#i7a = 'corporate vice president and chief accounting officer'
#i7a = 'alice l. jolla'
#i7a = 'and our independent registered public accounting firm to ensure that each is meeting its responsibilities and to discuss matters concerning internal controls and financial reporting'
#i7a = 'e are exposed to economic risk from foreign exchange rates'
#i7a = 'is intended to help the reader understand the results of operations and financial condition'
#i7a = 'market risk' 
#i7a = "trategies and predictions of our future activities or other future events or conditions"
sec7a_find = re.finditer(i7a, doc_10k_str)

#for x in sec7a_find: 
#    print(x)
del sec7a_df

    # moving keyword range to dataframe
sec7_start_range = []
sec7_end_range = []
for x in sec7_find: 
    #print(x.start())
    sec7_start_range.append(x.start())
    sec7_end_range.append(x.end())
    sec7_df = pd.DataFrame({'start': sec7_start_range,'end': sec7_end_range})
sec7a_start_range = []
sec7a_end_range = []    
for x in sec7a_find: 
    sec7a_start_range.append(x.start())
    sec7a_end_range.append(x.end())
    sec7a_df = pd.DataFrame({'start': sec7a_start_range,'end': sec7a_end_range})

    
    #qc check
#for index, row in sec7_df.iterrows():
#    print('7' , str(row['start']) , str(row['end']))
#for index, row in sec7a_df.iterrows():
#    print('7a', str(row['start']), str(row['end']))

In [438]:
print(sec7_df)
print(sec7a_df)

    start     end
0   32210   32246
1  371250  371286
    start     end
0  377439  377532


In [320]:
doc_10k_str[51029:51087]

'quantitative and qualitative disclosures about market risk'

In [50]:
i7 = "discussion and analysis of financial condition and results of operations"
#i7 = "financial condition and results of operations"
sec7_find = re.finditer(i7, doc_10k)

In [51]:
for x in sec7_find: 
    print(x.start())

30319
518530
716604


In [145]:
doc_10k[1087900:1132112]

'uding successfully integrating acquired businesses, could have an adverse effect on our business, financial condition and results of operations.&#160; Furthermore, we make strategic divestitures from time to time. In the case of divestitures, we may agree to indemnify acquiring parties for certain liabilities arising from our former businesses. These divestitures may also result in continued financial involvement in the divested businesses following the transaction, including through guarantees or other financial arrangements.&#160; Lower performance by those divested businesses could affect our future financial results.</span></div><div><span><br/></span></div><div><span style="color:#000000;font-family:\'Times New Roman\',sans-serif;font-size:10pt;font-weight:700;line-height:120%">Union disputes or other labor matters could adversely affect our operations and financial results.</span></div><div><span style="color:#000000;font-family:\'Times New Roman\',sans-serif;font-size:10pt;font

In [96]:
#i7 = 'management&#8217;s discussion and analysis of financial condition and results of operations'
i7 = "s discussion and analysis of financial condition and results of operations"
sec7_find = re.finditer(i7, doc_10k)
i7a = 'quantitative and qualitative disclosures about market risk' 
sec7a_find = re.finditer(i7a, doc_10k)
    
# moving keyword range to dataframe
sec7_start_range = []
sec7_end_range = []
for x in sec7_find: 
    print(x.start())
    print(x.end())
    sec7_start_range.append(x.start())
    sec7_end_range.append(x.end())
    sec7_df = pd.DataFrame({'start': sec7_start_range,'end': sec7_end_range})
sec7a_start_range = []
sec7a_end_range = []    
for x in sec7a_find: 
    print(x.start())
    print(x.end())
    sec7a_start_range.append(x.start())
    sec7a_end_range.append(x.end())
    sec7a_df = pd.DataFrame({'start': sec7a_start_range,'end': sec7a_end_range})
    
#qc check
for index, row in sec7_df.iterrows():
    print(row['start'], row['end'])
    print('7aaaa')
for index, row in sec7a_df.iterrows():
    print(row['start'], row['end'])


TypeError: expected string or bytes-like object

In [114]:
    
# get possible ranges 
possible_start_range = []
possible_end_range = []
for index, row7a in sec7a_df.iterrows():
    filtered = []
    for i, row7 in sec7_df.iterrows():
        if row7['end'] < row7a['start']:
            filtered.append(row7['end'])
    #print(filtered)
    max_y = max(filtered)
    possible_start_range.append(max_y)
    possible_end_range.append(row7a['start']-1)
    
    # qc 
    #for x in possible_start_range:
    #    print(x)
    
    # get the largest possible range
possible_df = pd.DataFrame({'start': possible_start_range,'end': possible_end_range})
possible_df['delta'] = possible_df['end'] - possible_df['start']
    
possible_df = possible_df.loc[possible_df['delta'].idxmax()]


In [115]:
possible_df

start    45884
end      46460
delta      576
Name: 0, dtype: int64

In [116]:
item_7_content = doc_10k[possible_df['start']:possible_df['end']]
sss = preprocess(item_7_content)
bagOfWordsA = sss.split(' ')

In [117]:
bagOfWordsA

['amp',
 'vgp',
 'amp',
 'jmo',
 'xouqw',
 'cqgi',
 'amp',
 'hwb',
 'amp',
 'ebedj',
 'hly',
 'mfdh',
 'amp',
 'ipftsh',
 'yck',
 'ynn',
 'yphbtxx',
 'tksj',
 'lgf',
 'gwonu',
 'amp',
 'mwekkacmz',
 'amp',
 'naibt',
 'kkf',
 'bizxh',
 'amp']