In [1]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import re
import html
from tqdm import tqdm
import time
from requests import HTTPError

In [2]:
# Arabic item number -> the Roman-numeral spelling some filings use in their
# headings (e.g. "Item I A" instead of "Item 1A"). Lettered sub-items keep a
# single space between the numeral and the letter.
roman_dict = dict([
    ('1', "I"),
    ('1A', "I A"),
    ('1B', "I B"),
    ('1C', "I C"),
    ('2', "II"),
    ('3', "III"),
    ('4', "IV"),
    ('5', "V"),
    ('6', "VI"),
    ('7', "VII"),
    ('7A', "VII A"),
])

In [3]:
def find_all_indices(file_text, substring):
    """Return the start offsets of every heading line for one item.

    The item number is taken from the second whitespace-separated token of
    ``substring`` (``'ITEM 1A.'`` -> ``'1A'``). ``file_text`` is scanned for
    lines that begin with "Item"/"Items" (letters may be separated by
    whitespace, case-insensitive) followed by that number. If the Arabic
    number is not found anywhere, the Roman spelling from ``roman_dict`` is
    tried instead.

    Args:
        file_text: text to scan.
        substring: item key such as ``'ITEM 1.'``.

    Returns:
        List of ``match.start()`` offsets. A heading on the very first line
        starts at its "I"; every other heading is reported at the newline
        that precedes it. Returns ``[]`` when nothing matches or when the
        inputs are unusable (no item number in ``substring``, no Roman
        mapping, non-string arguments).
    """
    heading_template = (
        r'(?i)(?:^|\n)(I[\s\n]*t[\s\n]*e[\s\n]*m|I[\s\n]*t[\s\n]*e[\s\n]*m[\s\n]*s[\s\n]*)'
        r'\s*{}\b\s*[.:\\-]*'
    )

    def heading_offsets(number):
        # Escape the number so it is always matched literally, never
        # interpreted as regex syntax.
        pattern = heading_template.format(re.escape(number))
        return [found.start() for found in re.finditer(pattern, file_text, re.MULTILINE)]

    try:
        item_no = substring.split()[1].strip('.')
        matches = heading_offsets(item_no)
        if not matches:
            # No Arabic-numbered heading: retry with the Roman spelling.
            matches = heading_offsets(roman_dict[item_no])
    except (AttributeError, IndexError, KeyError, TypeError):
        # Best effort: malformed input or an item without a Roman mapping
        # means "not found". Unlike a bare ``except`` this does not swallow
        # KeyboardInterrupt/SystemExit during long batch runs.
        return []
    return matches

def find_item_text(file_text, substring, index):
    """Extract the text of one item section from a filing.

    The section starts at the first heading for ``substring`` located at or
    after ``index`` whose line looks like a real heading, and ends at the
    next "Item <number>" heading whose line also looks like a real heading.
    A line is rejected as a heading when it contains a double quote (straight
    or curly), an ellipsis ``...`` or a comma, which screens out
    cross-references and table-of-contents rows. If every candidate start is
    rejected, the last candidate is used.

    Args:
        file_text: full filing text.
        substring: item key such as ``'ITEM 1.'`` (passed to ``find_all_indices``).
        index: offset from which headings are accepted; candidates before it
            are ignored.

    Returns:
        ``(item_text, stop_index)`` where ``stop_index`` is the offset of the
        heading that ends the section, or ``('', -1)`` when no start heading
        exists, no acceptable end heading is found, or 15 end candidates were
        rejected.
    """
    from bisect import bisect_right

    # The original special-cased 'ITEM 1.' with index == 0, but filtering
    # with ``>= 0`` keeps every offset, so a single filter is equivalent.
    candidates = [pos for pos in find_all_indices(file_text, substring) if pos >= index]
    if not candidates:
        return '', -1

    # Build the line table once. keepends=True keeps each line's real length,
    # so offsets stay exact even for two-character "\r\n" endings; assuming
    # one character per separator made the lookup drift on CRLF text.
    lines = file_text.splitlines(keepends=True)
    line_starts = []
    text_length = 0
    for text_line in lines:
        line_starts.append(text_length)
        text_length += len(text_line)

    def line_at(position):
        """Return the line containing ``position`` or None if out of range."""
        if not 0 <= position < text_length:
            return None
        return lines[bisect_right(line_starts, position) - 1]

    def heading_line(match_start):
        """Return the heading's own line (matches may start on the preceding newline)."""
        if file_text[match_start] == '\n':
            return line_at(match_start + 1)
        return line_at(match_start)

    def looks_like_heading(line):
        return not any(marker in line for marker in ('"', '“', '”', '...', ','))

    # Start of the section: first acceptable candidate, else the last one.
    item_start_index = candidates[-1]
    for start in candidates:
        line = heading_line(start)
        if line is not None and looks_like_heading(line):
            item_start_index = start
            break

    next_item_pattern = re.compile(r'(?i)(?:^|\n)(I[\s\n]*t[\s\n]*e[\s\n]*m|I[\s\n]*t[\s\n]*e[\s\n]*m[\s\n]*s[\s\n]*)\s*[0-9A-C]+\s*[.:\\-]*', re.MULTILINE)

    search_from = item_start_index
    for _ in range(15):  # give up after 15 rejected end candidates
        # Search 10 characters past the previous hit, presumably so the
        # heading just examined is not matched again.
        match = next_item_pattern.search(file_text, pos=search_from + 10)
        if match is None:
            break
        item_stop_index = match.start()
        line = heading_line(item_stop_index)
        if line is not None and looks_like_heading(line):
            return file_text[item_start_index:item_stop_index], item_stop_index
        search_from = item_stop_index
    return '', -1

def clean_data(text):
    """Collapse every run of newlines into one and trim surrounding whitespace.

    Tag stripping (``<.*?>``) and ``-<digits>-`` page-marker removal are
    deliberately not performed here.
    """
    single_spaced = re.sub(r'\n+', '\n', text)
    return single_spaced.strip()

In [4]:
# Item key searched for in the filing -> name of the output column that
# receives the extracted text. Insertion order is the order in which the
# items are looked up.
items = dict((
    ('ITEM 1.', "Item 1. Business"),
    ('ITEM 1A.', "Item 1A. Risk Factors"),
    ('ITEM 1B.', "Item 1B. Unresolved Staff Comments"),
    ('ITEM 1C.', "Item 1C. Cybersecurity"),
    ('ITEM 2.', "Item 2. Properties"),
    ('ITEM 3.', "Item 3. Legal Proceedings"),
    ('ITEM 4.', "Item 4. Mine Safety Disclosures"),
    ('ITEM 5.', "Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities"),
    ('ITEM 7.', "Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations"),
    ('ITEM 7A.', "Item 7A. Quantitative and Qualitative Disclosures About Market Risk"),
))

In [None]:
#run this only once before starting the manual cleaning
def extract_info_from_url(row):
    """Download the filing at ``row['URL']`` and store each item's text in ``row``.

    Rows whose 'Item 1. Business' value is already non-empty are returned
    untouched, so the function can be re-applied to a partially filled table.
    Otherwise the filing is downloaded (up to 3 attempts, retrying only on
    HTTP 503), markup and the table of contents are stripped, and the text of
    every entry in ``items`` is written to the matching column of ``row``.
    Download and parsing errors are printed, not raised.

    NOTE: a later cell defines another function with this same name that
    takes a URL string; whichever definition was executed last is the one in
    effect.

    Args:
        row: mapping-like record with a 'URL' entry, e.g. a pandas Series
            received from ``DataFrame.apply(..., axis=1)``.

    Returns:
        The same ``row`` object, with the item columns filled in when the
        download and extraction succeeded.
    """
    # Skip rows that were already extracted. ``.get`` tolerates a table that
    # does not have the item column yet.
    existing = row.get('Item 1. Business')
    if pd.notna(existing) and existing != '':
        return row
    url = row['URL']
    try:
        max_retries = 3
        retry_delay = 3
        request_timeout = 60  # seconds; without it a stalled connection hangs the whole run
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = None
        for _ in range(max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=request_timeout)
                response.raise_for_status()
                # Success: stop here. Without this break the loop re-downloaded
                # the same filing on every remaining iteration.
                break
            except HTTPError as e:
                if e.response is not None and e.response.status_code == 503:
                    print(f"Failed to retrieve content. Status code: {e.response.status_code}")
                    time.sleep(retry_delay)
                    continue
                else:
                    print(f"HTTP error: {e}")
                    break
            except Exception as e:
                print(f"An error occurred: {e}")
                break

        if response is None:
            # No response was ever received; the error was printed above.
            return row

        if response.status_code == 200:
            text = response.text
            text = html.unescape(text)
            text = text.replace('&nbsp;', '')
            text = text.replace('&#151;', '-')
            text = text.replace('&#160;', '-')
            text = text.replace('\xa0', ' ')

            # Drop standalone "Part I" / "Part II" markers. The word
            # boundaries keep the pattern from eating the start of words such
            # as "participants", "parties" or "partial".
            pattern_Part = re.compile(r'\bpart\s*(II|I)\b\s*([.,\s]?)\s*', re.IGNORECASE)
            text = re.sub(pattern_Part, '', text)

            # Spelled-out numbers ("Item One") -> digits. Only whole words in
            # Title or UPPER case are converted, so words that merely contain
            # a number word ("MONEY", "NETWORK") are left intact.
            number_words = {
                'One': '1', 'Two': '2', 'Three': '3', 'Four': '4', 'Five': '5', 'Seven': '7',
                'ONE': '1', 'TWO': '2', 'THREE': '3', 'FOUR': '4', 'FIVE': '5', 'SEVEN': '7',
            }
            number_pattern = re.compile(r'\b(' + '|'.join(number_words) + r')\b')
            text = number_pattern.sub(lambda found: number_words[found.group(1)], text)

            # Filings whose leading "PART I ... PART I" span is dropped even
            # when it is longer than the 2500-character limit used below.
            part_url = [
                'https://www.sec.gov/Archives/edgar/data/23197/0001169232-02-002122.txt',
                'https://www.sec.gov/Archives/edgar/data/70145/0000070145-03-000112.txt',
                'https://www.sec.gov/Archives/edgar/data/70793/0000910647-03-000404.txt',
                'https://www.sec.gov/Archives/edgar/data/1014052/0000096313-03-000216.txt',
                'https://www.sec.gov/Archives/edgar/data/38777/0000038777-03-000693.txt',
                'https://www.sec.gov/Archives/edgar/data/1014052/0001015402-04-004698.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-04-000463.txt',
                'https://www.sec.gov/Archives/edgar/data/1283858/0000939057-04-000485.txt',
                'https://www.sec.gov/Archives/edgar/data/717724/0000717724-04-000008.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-05-000360.txt',
                'https://www.sec.gov/Archives/edgar/data/809933/0000809933-05-000019.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-07-000444.txt',
                'https://www.sec.gov/Archives/edgar/data/1305014/0001305014-07-000308.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-08-000478.txt',
                'https://www.sec.gov/Archives/edgar/data/1305014/0001305014-08-000364.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-09-000370.txt',
                'https://www.sec.gov/Archives/edgar/data/1305014/0001305014-09-000097.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-10-000384.txt',
                'https://www.sec.gov/Archives/edgar/data/1305014/0001305014-10-000079.txt',
            ]

            # Replace comments and tags with newlines, blank out
            # whitespace-only lines and left-trim every line so headings
            # start at column 0.
            text = re.sub(r'<!--[\s\S]*?-->|<[^>]*>', '\n', text)
            emptypattern = re.compile(r'^\s*$', re.MULTILINE)
            text = re.sub(emptypattern, '', text)
            lines = [line.lstrip() for line in text.split('\n')]
            text = '\n'.join(lines)

            # Locate the table of contents: from a TOC marker up to the next
            # "Signatures", provided some line in between mentions an item.
            table_stop_index = 0
            table_stop_pattern = re.compile(r'Signatures', re.IGNORECASE)
            lowered = text.lower()  # text is not modified inside the loop
            table_start_index = lowered.find("table of contents")
            if table_start_index != -1:
                toc_markers = ("table of contents", "index to annual report on form 10-k", "index", "contents")
                start_index = 0
                while True:
                    # Use the first marker (in priority order) found at or
                    # after start_index.
                    toc_index = -1
                    for marker in toc_markers:
                        toc_index = lowered.find(marker, start_index)
                        if toc_index != -1:
                            break
                    if toc_index == -1:
                        break
                    match = table_stop_pattern.search(text, pos=toc_index + 1)
                    if not match:
                        break
                    table_stop_index = match.start()
                    content_lines = text[toc_index:table_stop_index].split('\n')
                    if any(re.search(r'(?i)Item\s*[0-9A-C]+\s*[.:\\-]*', line) for line in content_lines):
                        table_start_index = toc_index
                        break
                    start_index = toc_index + 10

                # Only cut spans of at most 3500 characters.
                if len(text[table_start_index:table_stop_index]) <= 3500 and table_stop_index != 0:
                    text = text[:table_start_index] + text[table_stop_index:]

            # If two "PART I"/"PART 1" markers are close together (or the URL
            # is whitelisted) and the span between them mentions "item" more
            # than 10 times, start the text at the second marker.
            part_pattern = r'\bPART [I1]\b'
            part_start = [found.start() for found in re.finditer(part_pattern, text, re.IGNORECASE)]
            if len(part_start) >= 2:
                if part_start[1] - part_start[0] <= 2500 or url in part_url:
                    if text[part_start[0]:part_start[1]].lower().count('item') > 10:
                        text = text[part_start[1]:]
            # Repair a lowercase "l" typed in place of the digit one.
            text = text.replace('Item l.', 'Item 1.')
            text = text.replace('ITEM l.', 'Item 1.')

            # A short Item 1 is accepted only when it contains one of these
            # words; otherwise the hit is presumably a stray reference and
            # the next occurrence is tried.
            short_item_ok = r'(?i)\b(omitted|not applicable|not|none|pursuant|relief)\b[.,;!?]*'
            index_save = 0
            for item in items:
                item_text, index = find_item_text(text, item, index_save)
                if index != -1:
                    index_save = index
                if item_text is None:
                    row[items[item]] = ' '
                    continue
                item_text = clean_data(item_text)
                if item == 'ITEM 1.':
                    attempts = 0
                    while attempts < 7 and len(item_text) < 500 and not re.search(short_item_ok, item_text):
                        attempts += 1
                        item_text, index = find_item_text(text, item, index_save)
                        # Clean retried text too; previously only the first
                        # hit went through clean_data before being stored.
                        item_text = clean_data(item_text)
                        if index != -1:
                            index_save = index
                row[items[item]] = item_text

        else:
            print(f"Failed to retrieve content. Status code: {response.status_code}")
    except Exception as e:
        print(url, e)
    return row

# Enable tqdm's pandas integration; the disabled batch run below relies on
# `progress_apply`.
tqdm.pandas()
# Batch run (kept commented out): reads the CSV, applies extract_info_from_url
# to every row, and writes the result back to the same CSV plus an Excel copy.
# The paths are machine-specific absolute paths and must be changed before use.
# filtered_df = pd.read_csv(r'D:\Inter Mnaggi\Mengge_Li\10-K_2008_Q.csv') #change the path
# filtered_df= filtered_df.progress_apply(extract_info_from_url, axis=1)
# filtered_df.to_csv(r'D:\Inter Mnaggi\Mengge_Li\10-K_2008_Q.csv',index=False)
# filtered_df.to_excel(r'D:\Inter Mnaggi\Mengge_Li\10-K_2008_Q.xlsx', index=False)
# print('New CSV and Excel Generated')

In [47]:
'''
This cell downloads the filing at the given URL and writes its cleaned full text to correct.csv. The generated CSV is used for manually copying and pasting the required items.
The required items are:
Item 1. Business,	
Item 1A. Risk Factors,	
Item 1B. Unresolved Staff Comments, 
Item 1C. Cybersecurity,	
Item 2. Properties,	
Item 3. Legal Proceedings, 
Item 4. Mine Safety Disclosures,	
Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities	
Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations,	
Item 7A. Quantitative and Qualitative Disclosures About Market Risk  or 
7A. Critical Accounting Policies and New Accounting Pronouncements

Item 6 SELECTED FINANCIAL DATA

Item 8 FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA
'''
def extract_info_from_url(url):
    """Download one filing and write its cleaned full text to ``correct.csv``.

    The filing is downloaded (up to 3 attempts, retrying only on HTTP 503),
    markup and the table of contents are stripped, and the remaining text is
    saved as a one-row CSV with a single 'Cleaned_Text' column in the current
    working directory. Download and parsing errors are printed, not raised.

    NOTE: this definition replaces the row-based function of the same name
    defined in an earlier cell.

    Args:
        url: address of the filing to download.

    Returns:
        None. The result is the ``correct.csv`` file.
    """
    try:
        max_retries = 3
        retry_delay = 3
        request_timeout = 60  # seconds; without it a stalled connection hangs forever
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = None
        for _ in range(max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=request_timeout)
                response.raise_for_status()
                # Success: stop here. Without this break the loop re-downloaded
                # the same filing on every remaining iteration.
                break
            except HTTPError as e:
                if e.response is not None and e.response.status_code == 503:
                    print(f"Failed to retrieve content. Status code: {e.response.status_code}")
                    time.sleep(retry_delay)
                    continue
                else:
                    print(f"HTTP error: {e}")
                    break
            except Exception as e:
                print(f"An error occurred: {e}")
                break

        if response is None:
            # No response was ever received; the error was printed above.
            return

        if response.status_code == 200:
            text = response.text
            text = html.unescape(text)
            text = text.replace('&nbsp;', '')
            text = text.replace('&#151;', '-')
            text = text.replace('&#160;', '-')
            text = text.replace('\xa0', ' ')

            # Drop standalone "Part I" / "Part II" markers. The word
            # boundaries keep the pattern from eating the start of words such
            # as "participants", "parties" or "partial".
            pattern_Part = re.compile(r'\bpart\s*(II|I)\b\s*([.,\s]?)\s*', re.IGNORECASE)
            text = re.sub(pattern_Part, '', text)
            # Spelled-out numbers ("One", "TWO", ...) are intentionally not
            # converted to digits in this function.

            # Filings whose leading "PART I ... PART I" span is dropped even
            # when it is longer than the 2500-character limit used below.
            part_url = [
                'https://www.sec.gov/Archives/edgar/data/23197/0001169232-02-002122.txt',
                'https://www.sec.gov/Archives/edgar/data/70145/0000070145-03-000112.txt',
                'https://www.sec.gov/Archives/edgar/data/70793/0000910647-03-000404.txt',
                'https://www.sec.gov/Archives/edgar/data/1014052/0000096313-03-000216.txt',
                'https://www.sec.gov/Archives/edgar/data/38777/0000038777-03-000693.txt',
                'https://www.sec.gov/Archives/edgar/data/1014052/0001015402-04-004698.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-04-000463.txt',
                'https://www.sec.gov/Archives/edgar/data/1283858/0000939057-04-000485.txt',
                'https://www.sec.gov/Archives/edgar/data/717724/0000717724-04-000008.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-05-000360.txt',
                'https://www.sec.gov/Archives/edgar/data/809933/0000809933-05-000019.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-07-000444.txt',
                'https://www.sec.gov/Archives/edgar/data/1305014/0001305014-07-000308.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-08-000478.txt',
                'https://www.sec.gov/Archives/edgar/data/1305014/0001305014-08-000364.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-09-000370.txt',
                'https://www.sec.gov/Archives/edgar/data/1305014/0001305014-09-000097.txt',
                'https://www.sec.gov/Archives/edgar/data/1046050/0000939057-10-000384.txt',
                'https://www.sec.gov/Archives/edgar/data/1305014/0001305014-10-000079.txt',
            ]

            # Replace comments and tags with newlines, blank out
            # whitespace-only lines and left-trim every line so headings
            # start at column 0.
            text = re.sub(r'<!--[\s\S]*?-->|<[^>]*>', '\n', text)
            emptypattern = re.compile(r'^\s*$', re.MULTILINE)
            text = re.sub(emptypattern, '', text)
            lines = [line.lstrip() for line in text.split('\n')]
            text = '\n'.join(lines)

            # Locate the table of contents: from a TOC marker up to the next
            # "Signatures", provided some line in between mentions an item.
            table_stop_index = 0
            table_stop_pattern = re.compile(r'Signatures', re.IGNORECASE)
            lowered = text.lower()  # text is not modified inside the loop
            table_start_index = lowered.find("table of contents")
            if table_start_index != -1:
                start_index = 0
                while True:
                    toc_index = lowered.find("table of contents", start_index)
                    if toc_index == -1:
                        toc_index = lowered.find("index to annual report on form 10-k", start_index)
                    if toc_index == -1:
                        break
                    match = table_stop_pattern.search(text, pos=toc_index + 1)
                    if not match:
                        break
                    table_stop_index = match.start()
                    content_lines = text[toc_index:table_stop_index].split('\n')
                    if any(re.search(r'(?i)Item\s*[0-9A-C]+\s*[.:\\-]*', line) for line in content_lines):
                        table_start_index = toc_index
                        break
                    start_index = toc_index + 10

                # Only cut spans of at most 3500 characters.
                if len(text[table_start_index:table_stop_index]) <= 3500 and table_stop_index != 0:
                    text = text[:table_start_index] + text[table_stop_index:]

            # If two "PART I"/"PART 1" markers are close together (or the URL
            # is whitelisted) and the span between them mentions "item" more
            # than 10 times, start the text at the second marker.
            part_pattern = r'\bPART [I1]\b'
            part_start = [found.start() for found in re.finditer(part_pattern, text, re.IGNORECASE)]
            if len(part_start) >= 2:
                if part_start[1] - part_start[0] <= 2500 or url in part_url:
                    if text[part_start[0]:part_start[1]].lower().count('item') > 10:
                        text = text[part_start[1]:]
            # Repair a lowercase "l" typed in place of the digit one.
            text = text.replace('Item l.', 'Item 1.')
            text = text.replace('ITEM l.', 'Item 1.')

            ctext = clean_data(text)
            correct_df = pd.DataFrame({'Cleaned_Text': [ctext]})
            correct_df.to_csv("correct.csv", index=False)
        else:
            print(f"Failed to retrieve content. Status code: {response.status_code}")
    except Exception as e:
        print(url, e)
   

# Enable tqdm's pandas integration.
tqdm.pandas()

# Change only the URL below: set it to the filing whose automatic extraction
# was incorrect or missing. Running the cell downloads that filing and
# overwrites correct.csv with its cleaned text.
url = 'https://www.sec.gov/Archives/edgar/data/71180/0000741508-08-000011.txt'
extract_info_from_url(url)