In [1]:
import numpy as np
import pandas as pd
import re

from bs4 import BeautifulSoup as bs

from path import Path, getcwdu

import glob
import os
from pathlib import PurePath
import copy

import random
import gzip
import shutil

In [2]:
# Absolute, POSIX-style path of every gzipped filing under ../employee_filings.
full_path_list = [
    (PurePath(os.getcwd()) / gz_path).as_posix()
    for gz_path in glob.iglob('../employee_filings/*.gz')
]
# Bare file names of the same archives.
full_file_list = [
    PurePath(gz_path).name
    for gz_path in glob.iglob('../employee_filings/*.gz')
]
# Accession ID = file name without its ".gz" suffix and without ".html".
full_accession_ids = [
    PurePath(file_name).stem.replace('.html', '')
    for file_name in full_file_list
]
# Leading dash-separated field of each accession ID.
full_cik_nbrs = [acc_id.split('-')[0] for acc_id in full_accession_ids]

Read in accession ID lists created from initial data splitting

In [4]:
# Each CSV is read with an explicit column name, so its first row is treated as data.
train_accession_ids = (
    pd.read_csv('../data/train_accession_ids.csv', names=['acc_id'])
    .acc_id
    .tolist()
)
val_accession_ids = (
    pd.read_csv('../data/val_accession_ids.csv', names=['acc_id'])
    .acc_id
    .tolist()
)

In [5]:
# Load the labelled employee-count paragraph workbooks from ../data.
# Later cells read these subset_df columns: ticker, accession_number,
# data_key_friendly_name, text and paragraph_text.
labeled_df = pd.read_excel('../data/train_val_employee_count_paragraphs.xlsx')
subset_df = pd.read_excel('../data/subset_employee_count_paragraphs.xlsx')

In [6]:
def print_row_detail(df=None, nrow=10, header_list=None,
                     detail_list=None,
                     sortby=None, ascending=True):
    """Print selected columns of the first ``nrow`` rows of a sorted DataFrame.

    For every printed row, each column in ``header_list`` is shown as a
    dash-framed banner line, followed by one ``name  :value`` line (plus a
    blank line) for each column in ``detail_list``.

    Parameters
    ----------
    df : DataFrame, optional
        Frame to print. Defaults to the module-level ``subset_df``, looked up
        when the function is called rather than when it is defined.
    nrow : int
        Maximum number of rows to print.
    header_list : list of str, optional
        Columns printed as banners. Defaults to
        ``['ticker', 'accession_number']``.
    detail_list : list of str, optional
        Columns printed as detail lines. Defaults to
        ``['data_key_friendly_name', 'text', 'paragraph_text']``.
    sortby : list of str, optional
        Columns to sort by. Defaults to
        ``['accession_number', 'data_key_friendly_name']``.
    ascending : bool or list of bool
        Passed through to ``DataFrame.sort_values``.
    """
    # Resolve defaults at call time: avoids a stale DataFrame captured at
    # definition time and avoids shared mutable default lists.
    if df is None:
        df = subset_df
    if header_list is None:
        header_list = ['ticker', 'accession_number']
    if detail_list is None:
        detail_list = ['data_key_friendly_name', 'text', 'paragraph_text']
    if sortby is None:
        sortby = ['accession_number', 'data_key_friendly_name']

    df_sorted = df.sort_values(sortby, ascending=ascending).reset_index()
    rule = '-' * 35
    for i in range(min(len(df_sorted), nrow)):
        for h in header_list:
            print(rule + ' ' + str(df_sorted[h].iloc[i]) + ' ' + rule)
        for d in detail_list:
            print(d + '  :' + str(df_sorted[d].iloc[i]))
            print('')

In [155]:
print_row_detail(df=subset_df, nrow=20)

----------------------------------- AAN -----------------------------------
----------------------------------- 0000706688-17-000030 -----------------------------------
data_key_friendly_name  :Other Employees

text  :employees

paragraph_text  :Employees   At December 31, 2016, the Company had approximately 11,500 employees. None of our employees are covered by a   collective bargaining agreement and we believe that our relations with employees are good

----------------------------------- AAON -----------------------------------
----------------------------------- 0000824142-17-000034 -----------------------------------
data_key_friendly_name  :Other Employees

text  :employees

paragraph_text  :Employees   As of February 12, 2017, we employed 1,619 permanent employees. Our employees are not represented by  unions

----------------------------------- AAXN -----------------------------------
----------------------------------- 0001069183-17-000042 -----------------------------------
d

In [153]:
subset_df.accession_number

0     0001090872-16-000082
1     0001193125-17-083862
2     0001193125-17-065791
3     0001564590-17-003590
4     0000706688-17-000030
5     0001558370-17-001556
6     0000824142-17-000034
7     0001158449-17-000034
8     0001158449-17-000034
9     0001193125-17-053796
10    0001069183-17-000042
11    0001069183-17-000042
12    0001104659-17-015892
13    0001551152-17-000004
14    0001140859-16-000022
15    0001140859-16-000022
16    0001466815-17-000007
17    0001466815-17-000007
18    0001466815-17-000007
19    0001628280-16-022122
Name: accession_number, dtype: object

In [151]:
subset_df.groupby('data_key_friendly_name').text.value_counts()

data_key_friendly_name  text                  
Full-Time Employees     full-time                 5
                        full-time Team            1
Other Employees         employees                 5
                        employed                  4
                        Total                     1
                        temporary                 1
Part-Time Employees     part-time                 2
                        part-time Team Members    1
Name: text, dtype: int64

In [7]:
# Build the set of accession numbers once instead of recomputing
# unique().tolist() for every file in the comprehension.
subset_accession_ids = set(subset_df.accession_number.unique())
# Paths (under ../employee_filings) of the archives whose accession ID appears in subset_df.
subset_file_list = [
    PurePath(os.getcwd()).joinpath('../employee_filings/').joinpath(file)
    for file in full_file_list
    if PurePath(file).stem.replace('.html', '') in subset_accession_ids
]

In [289]:
# Regexes used to flag text nodes that may state an employee count.
# First entry: a string that is exactly "Employees" or "Team Members".
emp_pat_list = [r"^(Employees|Team Members)$",
# A comma-grouped number, optional qualifiers (permanent, full-time, ...), then a workforce noun.
r"([0-9]{1,3},)*[0-9]{1,3}( |\n)((permanent|full-time|part-time|temporary|total)( |\n))*(employees|people|team members|members)",
# "employ"/"employed"/"employs", optional "approximately", a number, an optional qualifier, then a workforce noun.
r"employ((ed|s)?)?( |\n)(approximately( |\n))?([0-9]{1,3},)*[0-9]{1,3}( |\n)((permanent|full-time|part-time|temporary)( |\n))?(employees|people|team members|members|persons|associates)", 
# A string ending in "employed" or "employed approximately"
# (presumably the number sits in the next text node - TODO confirm).
r"employed(( |\n)approximately)?$", 
# The literal phrase "Total workforce".
r"Total workforce",
# One or more qualifiers followed by a workforce noun, anywhere in the string.
r"((permanent|full|part|time|full-time|part-time|temporary)( |\n))+(employees|team members|associates)",
# A string starting with zero or more qualifiers followed by a workforce noun.
r"^((permanent|full|part|time|full-time|part-time|temporary|total)( |\n))*(employees|team members|associates)"]
# Compiled case-insensitively (re.I).
emp_pats = [re.compile(x, re.I) for x in emp_pat_list]

In [9]:
subset_file_list[5]

PureWindowsPath('c:/projects/DSBC/capstone/sec_employee_information_extraction/../employee_filings/0001140859-16-000022.html.gz')

In [92]:
# Read the gzipped filing at index 2 of subset_file_list as UTF-8 text.
with gzip.open(subset_file_list[2], mode='rt', encoding="utf8") as file:
    file1_html = file.read()
soup1 = bs(file1_html, 'lxml')
# Pass the compiled patterns as a flat list: a list nested inside the filter
# list is not a documented filter form and is not reliably supported across
# Beautiful Soup versions.
soup1_emp_count = soup1.find_all(string=emp_pats)

In [12]:
# Read the gzipped filing at index 12 of subset_file_list as UTF-8 text.
with gzip.open(subset_file_list[12], mode='rt', encoding="utf8") as file:
    file2_html = file.read()
soup2 = bs(file2_html, 'lxml')
# Pass the compiled patterns as a flat list: a list nested inside the filter
# list is not a documented filter form and is not reliably supported across
# Beautiful Soup versions.
soup2_emp_count = soup2.find_all(string=emp_pats)

In [139]:
# Read the gzipped filing at index 11 of subset_file_list as UTF-8 text.
with gzip.open(subset_file_list[11], mode='rt', encoding="utf8") as file:
    file3_html = file.read()
soup3 = bs(file3_html, 'lxml')
# Pass the compiled patterns as a flat list: a list nested inside the filter
# list is not a documented filter form and is not reliably supported across
# Beautiful Soup versions.
soup3_emp_count = soup3.find_all(string=emp_pats)

In [216]:
# Parse the uncompressed HTML filing at index 4 of html_path_list.
# NOTE: html_path_list is only assigned in a later cell, so this cell fails
# on a fresh top-to-bottom run.
with open(html_path_list[4], encoding="utf8") as file:
    file4_html = file.read()
soup4 = bs(file4_html, 'lxml')
# Pass the compiled patterns as a flat list: a list nested inside the filter
# list is not a documented filter form and is not reliably supported across
# Beautiful Soup versions.
soup4_emp_count = soup4.find_all(string=emp_pats)

In [217]:
soup4_emp_count[0]

['At AIG, we\nbelieve that a major strength of ours is the quality and dedication of our people. At December 31, 2016 and\n2015, we had approximately 56,400 and 66,400 employees, respectively. We believe that our relations with our\nemployees are satisfactory.',
 'employees',
 'Employed']

In [184]:
full_path_list[0]

'c:/projects/DSBC/capstone/sec_employee_information_extraction/../employee_filings/0000003570-17-000052.html.gz'

In [244]:
# Parse the uncompressed HTML filing at index 24 of html_path_list.
# NOTE: html_path_list is only assigned in a later cell, so this cell fails
# on a fresh top-to-bottom run.
with open(html_path_list[24], encoding="utf8") as file:
    file0_html = file.read()
soup0 = bs(file0_html, 'lxml')
# Pass the compiled patterns as a flat list: a list nested inside the filter
# list is not a documented filter form and is not reliably supported across
# Beautiful Soup versions.
soup0_emp_count = soup0.find_all(string=emp_pats)

In [275]:
soup0.find_all(string=emp_head)[1].find_parent(block_re).find_next_siblings(block_re, limit=5)#[1].find(string=emp_pats)
#.next_sibling.next_sibling#.next_sibling.next_sibling


[<div class="c106"></div>,
 <div class="c71"><span class="c73">We have approximately 25,000 employees as of December 31,</span>
 <span class="c73">2016</span><span class="c73">.</span></div>,
 <div><a id="sD00DD6DDD08DA54E10480FFA5CAA1F4F" name="sD00DD6DDD08DA54E10480FFA5CAA1F4F"></a></div>,
 <div class="c1"></div>,
 <div class="c70">Foreign Operations</div>]

In [263]:
soup0_emp_count[2].find_parent(block_re).next_sibling.next_sibling.next_sibling.next_sibling

<div class="c70">Employees</div>

In [256]:
soup0_emp_count[2].find_parent(block_re).find_next_siblings(block_re, limit=2)[1].find(string=emp_pats)

'We have approximately 25,000 employees as of December 31,'

In [140]:
len(soup3_emp_count)

3

In [28]:
emp_tag4 = `

In [141]:
soup3_emp_count#[0].find_parent(block_re).next_sibling.next_sibling

['Employees',
 "AbbVie employed approximately 30,000 persons as of January 31, 2017. Outside the United\nStates, some of AbbVie's employees are represented by unions or works councils. AbbVie believes that it has\ngood relations with its employees.",
 "AbbVie's products are generally sold worldwide directly to wholesalers, distributors,\ngovernment agencies, health care facilities, specialty pharmacies and independent retailers from AbbVie-owned\ndistribution centers and public warehouses. In the United States, AbbVie distributes pharmaceutical products\nprincipally through independent wholesale distributors, with some sales directly to pharmacies and patients.\nOutside the United States, sales are made either directly to customers or through distributors, depending on\nthe market served. Certain products are co-marketed or co-promoted with other companies. AbbVie has\napproximately 30,000 employees. AbbVie operates in one business segment-pharmaceutical products."]

In [42]:
emp_tag2 = soup2_emp_count[1]

In [58]:
emp_tag2.parent

<span class="c58">Employees</span>

In [51]:
soup2_emp_count

['As of December 31, 2016, we had a total of 276 employees working in the\nR&D department, including 16 with Ph.D. degrees. We continue to recruit talented engineers to further\nenhance our research and development capabilities. We have research and development departments in our\nfacilities in Texas, Georgia, China and Taiwan. Our research and development teams collaborate on joint\nprojects, and by co-locating with our manufacturing operations enable us to achieve an efficient cost structure\nand improve our time to market.',
 'Employees',
 'As of December 31, 2016, we employed 2,776 full-time employees, of which 31\nheld Ph.D. degrees in a science or engineering field. Of our employees, 287 are located in the U.S., 1,218 are\nlocated in Taiwan and 1,271 are located in China. None of our employees are represented by any collective\nbargaining agreement, but certain employees of our China subsidiary are members of a trade union. We have never\nsuffered any work stoppage as a result of

In [107]:
soup4_emp_count[12].find_parent(name=block_re).next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling

<div class="c101">
<table border="0" cellpadding="0" cellspacing="0" class="c23">
<tr class="c89" height="16">
<td class="c88" height="16" nowrap="nowrap" valign="bottom" width="60%">
<p class="c80"></p>
</td>
<td class="c87" colspan="5" height="16" nowrap="nowrap" valign="bottom" width="40%">
<p class="c86"><span class="c83">December 31,</span></p>
</td>
</tr>
<tr class="c89" height="16">
<td class="c88" height="16" valign="bottom" width="60%">
<p class="c80"></p>
</td>
<td class="c87" height="16" valign="bottom" width="12%">
<p class="c86"><span class="c83">2016</span></p>
</td>
<td class="c218" height="16" valign="bottom" width="2%">
<p class="c39"></p>
</td>
<td class="c219" height="16" valign="bottom" width="12%">
<p class="c86"><span class="c83">2015</span></p>
</td>
<td class="c218" height="16" valign="bottom" width="2%">
<p class="c39"></p>
</td>
<td class="c219" height="16" valign="bottom" width="12%">
<p class="c86"><span class="c83">2014</span></p>
</td>
</tr>
<tr class="c89

In [115]:
# Same search as soup2_emp_count, restricted to <body>. The patterns are passed
# as a flat list; a list nested inside the filter list is not a documented
# Beautiful Soup filter form.
soup2_emp_count_body = soup2.body.find_all(string=emp_pats)

In [52]:
emp2_tag = soup2_emp_count[1].parent

In [None]:
emp_tag.parent.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.table

In [187]:
emp_tag.parent.name

'p'

In [236]:
soup3_emp_count[0].parent.next_sibling.next_sibling

<span class="c65">EMPLOYEES</span>

In [235]:
# Tag-name filter for block-level elements. Beautiful Soup applies regex filters
# with search(), so the pattern is anchored; unanchored, "p" would also match
# names such as "span" or "caption". This matches the anchored definition used
# further down in the notebook.
block_re = re.compile(r"^(p|div|table)$")
# Dump the first six block-level siblings that follow the parent of emp_tag.
for i, v in enumerate(emp_tag.parent.find_next_siblings(block_re, limit=6)):
    print(i)
    print(v)

0
<p class="c68"><span class="c67">A breakdown of our employees by geographic region is as follows:</span></p>
1
<p class="c102"></p>
2
<div class="c101">
<table border="0" cellpadding="0" cellspacing="0" class="c23">
<tr class="c89" height="16">
<td class="c88" height="16" nowrap="nowrap" valign="bottom" width="60%">
<p class="c80"></p>
</td>
<td class="c87" colspan="5" height="16" nowrap="nowrap" valign="bottom" width="40%">
<p class="c86"><span class="c83">December 31,</span></p>
</td>
</tr>
<tr class="c89" height="16">
<td class="c88" height="16" valign="bottom" width="60%">
<p class="c80"></p>
</td>
<td class="c87" height="16" valign="bottom" width="12%">
<p class="c86"><span class="c83">2016</span></p>
</td>
<td class="c218" height="16" valign="bottom" width="2%">
<p class="c39"></p>
</td>
<td class="c219" height="16" valign="bottom" width="12%">
<p class="c86"><span class="c83">2015</span></p>
</td>
<td class="c218" height="16" valign="bottom" width="2%">
<p class="c39"></p>
</t

In [241]:
# The original cell ended in an unfinished `if v.find()` with no suite, which is
# a SyntaxError. Printing the sibling's type reproduces the recorded cell output.
# NOTE(review): the intended condition is unknown - confirm before building on this.
for i, v in enumerate(emp2_tag.find_next_siblings(block_re, limit = 1, string=False)):
    print(type(v))

#.find(string=[emp_pats]).parent

<class 'bs4.element.Tag'>


Examples of employee count data in HTML

view-source:file:///C:/projects/DSBC/capstone/subset_filings/0001090872-16-000082.html

In [136]:
def show_str_context(search_str: str, in_string: str, char_before: int=500, char_after: int=2000, match_num: int=0) -> str:
    """Return the text surrounding an occurrence of ``search_str`` in ``in_string``.

    Parameters
    ----------
    search_str : str
        Substring to locate.
    in_string : str
        Text to search.
    char_before : int
        Number of characters of context to include before the match.
    char_after : int
        Number of characters of context to include after the match.
    match_num : int
        Zero-based occurrence to show; later occurrences are found by searching
        from the end of the previous one, so they never overlap.

    Returns
    -------
    str
        The slice of ``in_string`` around the requested occurrence, or an empty
        string when that occurrence does not exist.
    """
    len_offset = len(search_str)
    sub_idx = in_string.find(search_str)
    # Advance to the requested occurrence, stopping as soon as one is missing.
    for _ in range(match_num):
        if sub_idx == -1:
            break
        sub_idx = in_string.find(search_str, sub_idx + len_offset)
    # str.find reports "not found" as -1; using that as an index would return
    # an unrelated slice from the start of the text.
    if sub_idx == -1:
        return ''
    context_beg = max(sub_idx - char_before, 0)
    context_end = sub_idx + len_offset + char_after
    return in_string[context_beg:context_end]

In [125]:
soup2.body.text.find('bargaining agreements. We believe that our relationship with our employees is good.') 

34173

In [200]:
 with gzip.open(subset_file_list[0], mode='rt', encoding="utf8") as file: 
            file3_html = file.read()

In [198]:
re.search(emp_head, file2_html)

<_sre.SRE_Match object; span=(86339, 86350), match='>Employees<'>

In [201]:
emp2_tag.

<div class="c78">Employees</div>

In [196]:
file2_html.find('>Employees<')

86339

In [199]:
file2_html.count('\n',0,86339)

1406

In [137]:

search_str = 'bargaining agreements. We believe that our relationship with our employees is good.'
show_str_context(search_str=search_str, in_string= file2_html)

'ngement\nclaims against us from time to time.</div>\n<div class="c78">Employees</div>\n<div class="c81"><span class="c80">As of</span> <span class="c80">September 30, 2016</span><span class="c80">,\nwe had approximately</span> <span class="c80">19,000</span> <span class="c80">employees, of which\napproximately</span> <span class="c80">18,000</span> <span class="c80">were full-time employees.\nApproximately</span> <span class="c80">2%</span> <span class="c80">of our employees are covered by collective\nbargaining agreements. We believe that our relationship with our employees is good. If any of our employees in\nlocations that are unionized should engage in strikes or other such bargaining tactics in connection with the\nnegotiation of new collective bargaining agreements upon the expiration of any existing collective bargaining\nagreements, such tactics could be disruptive to our operations and adversely affect our results of operations,\nbut we believe we have adequate contingency pl

In [124]:
len(soup2.body.text)

424918

## Write functions to flag paragraphs

Notes:
- If a heading matches r"^Employees$", search the block elements that follow it for one that matches another employee-count pattern.
- Treat tables as matches when they are siblings of an "Employees" heading div.

In [73]:
soup3_emp_count[0].find_parent(name=block_re)#.parent#.find_next_siblings(block_re, limit=6)

<div class="c78">Employees</div>

In [473]:
# Raw-HTML probe for an "Employees" / "Our Employees" heading (any case) sitting
# directly between ">" and "<". This name is re-bound to a different pattern in
# a later cell.
emp_head_raw = re.compile(r"[>](Our )?Employees[<]", re.I)

In [472]:
# Anchored tag-name filters: block_re matches exactly p, div or table;
# head_block_re matches exactly p, div or h1-h6.
block_re = re.compile(r"^(p|div|table)$")
head_block_re = re.compile(r"^(p|div|h[1-6])$")

In [85]:
soup2.body.find(string=emp_head).find_parent(name=block_re).next_sibling.next_sibling.next_sibling.next_sibling


<p class="c77"><span class="c32">As of December 31, 2016, we employed 2,776 full-time employees, of which 31
held Ph.D. degrees in a science or engineering field. Of our employees, 287 are located in the U.S., 1,218 are
located in Taiwan and 1,271 are located in China. None of our employees are represented by any collective
bargaining agreement, but certain employees of our China subsidiary are members of a trade union. We have never
suffered any work stoppage as a result of an employment related strike or any employee related dispute and
believe that we have satisfactory relations with our employees.</span></p>

In [116]:
print(subset_file_list[4])

c:\projects\DSBC\capstone\sec_employee_information_extraction\..\employee_filings\0001104659-17-015892.html.gz


In [148]:
full_accession_ids[0]

'0000003570-17-000052'

In [169]:
# Paths of the filings without the gzip extension. Only a trailing ".gz" is
# removed; str.replace would also strip ".gz" occurring elsewhere in the path
# (for example in a directory name).
html_path_list = [x[:-len('.gz')] if x.endswith('.gz') else x for x in full_path_list]

In [225]:
# Raw-HTML probe: "Employees" or "Team Members" (any case) directly between ">" and "<".
emp_head_raw = re.compile(r"[>](Employees|Team Members)[<]", re.I)
# Text-node filter: the whole string is "Employees" or "Team Members",
# optionally prefixed by "Our " or "Number of " (any case).
emp_head = re.compile(r"^(Our |Number of )?(Employees|Team Members)$", re.I)

In [384]:
# Accumulators filled by the collection loop; one entry per candidate paragraph.
acc_id_list = []
para_list_orig = []
tag_list = []
emp_head_list = []
emp_head_first_list = []
# Accumulators for candidate tables; one entry per table block.
tbl_acc_id_list = []
tbl_tag_list = []

In [408]:
tbl_acc_id_list[-1]

'0001047469-17-002477'

In [409]:
tbl_acc_id_list.index(tbl_acc_id_list[-1])

58

In [411]:
acc_id_list[-1]

'0001047469-17-001782'

In [401]:
acc_id_list.index(acc_id_list[-1])

2554

In [406]:
full_accession_ids.index(acc_id_list[-1])

769

In [410]:
# Truncate the paragraph-level lists to their first 2554 entries and the
# table-level lists to their first 58 entries. Those literals equal the values
# shown by the neighbouring cells for acc_id_list.index(acc_id_list[-1]) and
# tbl_acc_id_list.index(tbl_acc_id_list[-1]), i.e. the first position of the
# most recently appended accession ID, so all entries from that ID onward are dropped.
# NOTE(review): presumably this rolls back a partially processed file before the
# loop is resumed - confirm. The hard-coded indices keep the cell idempotent on re-run.
acc_id_list = acc_id_list[:2554]
para_list_orig = para_list_orig[:2554]
tag_list = tag_list[:2554]
emp_head_list = emp_head_list[:2554]
emp_head_first_list = emp_head_first_list[:2554]
tbl_acc_id_list = tbl_acc_id_list[:58]
tbl_tag_list = tbl_tag_list[:58]

In [451]:
# Collect candidate employee-count paragraphs and tables from a slice of the
# uncompressed filings, appending to the module-level accumulator lists.
# Per file there are two passes:
#   1. blocks that follow an "Employees"-style heading (emp_head_list = True);
#   2. any block anywhere in <body> whose text matches a pattern in emp_pats
#      (emp_head_list = False).
# tag_set de-duplicates blocks within a single file across both passes.
for fl in html_path_list[1200:1300]:
    acc_id = PurePath(fl).stem.replace('.html', '')
    tag_set = set()
    with open(fl, encoding="utf8") as file:
        file_html = file.read()
    soup = bs(file_html, 'lxml')

    # Pass 1: cheap raw-text probe first, then walk at most four heading matches.
    if re.search(emp_head_raw, file_html):
        for hblock in soup.body.find_all(string=emp_head, limit=4):
            try:
                emp_head_tag = hblock.find_parent(name=head_block_re)
                # Skip headings that live inside a table.
                if emp_head_tag.name != 'table' and emp_head_tag.find_parent('table') is None:
                    emp_head_matched = False
                    for block in emp_head_tag.find_next_siblings(block_re, limit=6):
                        # emp_pats is passed as a flat list; a nested list is
                        # not a documented Beautiful Soup filter form.
                        if block.find(string=emp_pats) is not None and block.name != 'table':
                            block_tag = copy.copy(block)
                            if block_tag not in tag_set:
                                acc_id_list.append(acc_id)
                                tag_list.append(block_tag)
                                para_list_orig.append(block_tag.get_text())
                                tag_set.add(block_tag)
                                emp_head_list.append(True)
                                # True only for the first matching block under this heading.
                                emp_head_first_list.append(not emp_head_matched)
                                emp_head_matched = True
                        if block.find('table') is not None:
                            tbl_block_tag = copy.copy(block)
                            if tbl_block_tag not in tag_set:
                                tbl_acc_id_list.append(acc_id)
                                tbl_tag_list.append(tbl_block_tag)
                                tag_set.add(tbl_block_tag)
            except Exception:
                # Best effort: skip a heading that cannot be processed (for
                # example one with no p/div/h* ancestor). Unlike a bare
                # `except:`, this still lets KeyboardInterrupt stop the loop.
                continue

    # Pass 2: every text node in <body> matching a pattern, mapped to its
    # nearest block-level ancestor.
    soup_emp_count = soup.body.find_all(string=emp_pats)
    soup_emp_paras = [x.find_parent(name=block_re) for x in soup_emp_count]
    soup_emp_paras = [x for x in soup_emp_paras if x is not None]
    for block in soup_emp_paras:
        block_tag = copy.copy(block)
        if block_tag not in tag_set:
            tag_set.add(block_tag)
            if block.find('table') is not None:
                tbl_acc_id_list.append(acc_id)
                tbl_tag_list.append(block_tag)
            else:
                acc_id_list.append(acc_id)
                tag_list.append(block_tag)
                para_list_orig.append(block_tag.get_text())
                emp_head_list.append(False)
                emp_head_first_list.append(False)

In [453]:
# Number of distinct filings with at least one candidate paragraph.
print(len(set(acc_id_list)))
# Lengths of the parallel result lists.
for collected in (acc_id_list, para_list_orig, tag_list, emp_head_list,
                  emp_head_first_list, tbl_acc_id_list, tbl_tag_list):
    print(len(collected))

1275
4817
4817
4817
4817
4817
94
94


In [494]:
#### Make dataframe
# One row per candidate table; every row starts in the 'train' split.
tbl_columns = {
    'acc_id': tbl_acc_id_list,
    'tbl_html': tbl_tag_list,
    'split': 'train',
}
tbl_html_df = pd.DataFrame(data=tbl_columns)
# Rows whose accession ID is in the validation list are re-labelled 'val'.
is_val_table = tbl_html_df['acc_id'].isin(val_accession_ids)
tbl_html_df.loc[is_val_table, 'split'] = 'val'

In [495]:
#### Write to csv for later use
# Written to the current working directory with default to_csv options,
# so the DataFrame index is saved as the first column.
tbl_html_df.to_csv('tbl_html_df.csv')

In [496]:
# Build the paragraph-level table: one row per candidate paragraph, with a
# newline-free copy of the text, its length, the heading flags, the original
# text/tag, a train/val split and a placeholder label of 0.
para_text_list = [p.replace('\n', ' ') for p in para_list_orig]
paragraph_input_dict = {'acc_id' : acc_id_list,
                        'para_text' : para_text_list,
                        # Lengths come from the cleaned text; the previous code
                        # read `para_list`, which is never defined in this notebook.
                        'len' : [len(p) for p in para_text_list],
                        'emp_header' : emp_head_list,
                        'first_emp_head_block' : emp_head_first_list,
                        'para_text_orig' : para_list_orig,
                        'para_tag' : tag_list,
                        'split' : 'train',
                        'label' : 0 }
p_columns = ['acc_id', 'para_text', 'len', 'emp_header', 'first_emp_head_block', 'para_text_orig',
              'para_tag', 'split', 'label']

paragraph_input_df = pd.DataFrame(paragraph_input_dict, columns=p_columns)

# Rows whose accession ID is in the validation list are re-labelled 'val'.
paragraph_input_df.loc[paragraph_input_df.acc_id.isin(val_accession_ids),'split'] = 'val'

train_df = paragraph_input_df[paragraph_input_df.split == 'train']

In [497]:
# Written to the current working directory with default to_csv options,
# so the DataFrame index is saved as the first column.
paragraph_input_df.to_csv('paragraph_input_df.csv')

Code for printing results from paragraph search for a file

In [483]:
def print_parse_objects(file: str, use_acc_id: bool=True, acc_id_list=None,
                        file_path_list=None):
    """Print what the paragraph/table search finds in a single filing.

    Parameters
    ----------
    file : str
        An accession ID when ``use_acc_id`` is true, otherwise the path of an
        HTML file to open.
    use_acc_id : bool
        When true, ``file`` is looked up in ``acc_id_list`` and the path at the
        same position of ``file_path_list`` is opened.
    acc_id_list : list of str, optional
        Accession IDs, parallel to ``file_path_list``. Defaults to the
        module-level ``full_accession_ids``.
    file_path_list : list of str, optional
        HTML file paths. Defaults to the module-level ``html_path_list``.

    Raises
    ------
    ValueError
        If ``use_acc_id`` is true and ``file`` is not in ``acc_id_list``.
    """
    from io import StringIO  # local import: wraps HTML text for pd.read_html

    # Resolve defaults at call time and honour the caller's arguments
    # (they were previously ignored in favour of the globals).
    if acc_id_list is None:
        acc_id_list = full_accession_ids
    if file_path_list is None:
        file_path_list = html_path_list

    fl = file
    if use_acc_id:
        file_idx = acc_id_list.index(file)
        fl = file_path_list[file_idx]
        print('Index number: ')
        print(file_idx)
    print('File path: ')
    print(fl)

    with open(fl, encoding="utf8") as html_file:
        file_html = html_file.read()
    soup = bs(file_html, 'lxml')

    # Part 1: blocks following an "Employees"-style heading.
    header_found = False
    if re.search(emp_head_raw, file_html):
        for ihead, hblock in enumerate(soup.body.find_all(string=emp_head)):
            emp_head_tag = hblock.find_parent(name=head_block_re)
            if emp_head_tag is None:
                # Heading text with no p/div/h* ancestor: nothing to walk from.
                continue
            if emp_head_tag.name != 'table' and emp_head_tag.find_parent('table') is None:
                header_found = True
                print(ihead); print(emp_head_tag.name); print(emp_head_tag)
                for i2, block in enumerate(emp_head_tag.find_next_siblings(block_re, limit=6)):
                    print('Block sibling number: ' + str(i2))
                    if block.find(string=emp_pats) is not None:
                        print('Found match!'); print(block); print(block.get_text())
                    table = block.find('table')
                    if table is not None:
                        print('Found table match!')
                        # read_html accepts a file-like object; passing a raw
                        # HTML string is deprecated in recent pandas.
                        tbl_df = pd.read_html(StringIO(table.prettify()))[0].dropna(axis=1, how='all')
                        print(tbl_df)
                        tbl_df.info()  # prints its own report and returns None
                        print(block)
    if not header_found:
        print('No Employees header')

    # Part 2: every text node in <body> matching a pattern, kept together with
    # its enclosing block so the two stay aligned after None parents are dropped.
    soup_emp_count = soup.body.find_all(string=emp_pats)
    matched_pairs = [(s, s.find_parent(name=head_block_re)) for s in soup_emp_count]
    matched_pairs = [(s, parent) for s, parent in matched_pairs if parent is not None]
    for i2, (match_str, block) in enumerate(matched_pairs):
        print('Para number: ' + str(i2))
        match_name = getattr(match_str, 'name', None)
        if match_name is None:
            print('Original match has no name')
        else:
            print('Original match name:  ' + match_name)
        print('Original match:  ' + match_str)
        print('Block match name:  ' + block.name)
        print('Block:  ')
        print(block)
        print(type(block))
        print('Block Text:  ')
        print(block.get_text().replace('\n', ' '))

    # Matches that are themselves <p>/<div> elements.
    # NOTE(review): string= searches yield text nodes, so this list is
    # presumably always empty - confirm whether it is still needed.
    soup_emp_blocks = [x for x in soup_emp_count if getattr(x, 'name', None) in ['p', 'div']]
    for i2, block in enumerate(soup_emp_blocks):
        print('Block number: ' + str(i2)); print(block)

In [492]:
print('Paragraphs: ' + str(len(acc_id_list)))
print('Files with paragraphs: ' + str(len(set(acc_id_list))))
print('Tables: ' + str(len(tbl_acc_id_list)))

Paragraphs: 4817
Files with paragraphs: 1275
Tables: 94


Code for investigating files for which no candidate paragraphs were identified

In [312]:
# Accession IDs among the first 100 filings with no candidate paragraph.
# The lookup set is built once rather than once per element.
found_ids = set(acc_id_list)
missed_ids = [x for x in full_accession_ids[:100] if x not in found_ids]
# For investigating files with no candidate paragraphs if they are part of the training set
#[x for x in missed_ids if x in train_accession_ids]  

In [350]:
#find index of file for specific accession id
#[i for i, x in enumerate(html_path_list[:300]) if re.search(r"0000042316-17-000014", x)]

[89]

In [460]:
train_df.head(2)

Unnamed: 0,acc_id,para_text,len,emp_header,first_emp_head_block,para_text_orig,para_tag,split,label
2,0000004127-16-000068,"As of September 30, 2016, we employed approxim...",245,True,True,"As of September 30, 2016,\nwe employed approxi...","<div class=""c80""><span class=""c32"">As of</span...",train,0
3,0000004127-16-000068,EMPLOYEES,9,False,False,EMPLOYEES,"<div class=""c90"">EMPLOYEES</div>",train,0


In [461]:
train_df.describe()

Unnamed: 0,len,label
count,3597.0,3597.0
mean,408.237142,0.0
std,483.178275,0.0
min,8.0,0.0
25%,29.0,0.0
50%,281.0,0.0
75%,601.0,0.0
max,5054.0,0.0


In [490]:
train_df.len.value_counts().to_frame('len_counts').sort_index(ascending=False)

Unnamed: 0,len_counts
5054,1
4944,1
4868,1
4213,1
3972,1
3904,1
3878,1
3862,1
3795,1
3791,1


### Test and refine regex patterns for flagging likely relevant documents

In [None]:
def check_regex_match(pattern, text_list):
    """Print, for each string in ``text_list``, whether ``pattern`` is found in it.

    On a match, prints the string length and the end offset of the match, the
    text up to the end of the match, a blank line, the remaining text, and the
    match object. Otherwise prints a NO MATCH banner followed by the string.
    """
    for position, candidate in enumerate(text_list):
        found = re.search(pattern, candidate)
        if found is None:
            print("------    " + str(position) + "  NO MATCH    -----")
            print(candidate)
            continue
        match_end = found.end()
        print("------    " + str(position) + "   Matched!    -----")
        print('str length  :' + str(len(candidate)) + '    match span  :' + str(match_end))
        print(candidate[:match_end])
        print('')
        print(candidate[match_end:])
        print(found)

In [458]:
# Regexes for flagging paragraphs likely to state an employee count.
# First entry: a comma-grouped number, then any run of words, then a
# workforce term (employ*, headcount, member(s), person(s), people, staff, team, workforce).
# NOTE(review): the nested quantifier ([\w']+[, ]?)* looks prone to heavy
# backtracking on long non-matching text - confirm on large paragraphs.
flag_pat_list = [r"([0-9]{1,3},)*[0-9]{1,3} ([\w']+[, ]?)*((employ\w*|head([ -])?count|member(s)?|person(s)?|people|staff|team|workforce))",
# The reverse order: one or more workforce terms, any run of words, then a number.
r"((employ\w*|head([ -])?count|member(s)?|person(s)?|people|staff|team|workforce) )+([\w']+[, ]?)*([0-9]{1,3},)*[0-9]{1,3}",
# A number, optional qualifiers (permanent, full-time, ...), then a workforce noun.
r"([0-9]{1,3},)*[0-9]{1,3} ((permanent|full-time|part-time|temporary|total) )*(employees|people|team members|members)",
# "employ"/"employed"/"employs", optional "approximately", a number, an optional qualifier, then a workforce noun.
r"employ((ed|s)?)? (approximately )?([0-9]{1,3},)*[0-9]{1,3} ((permanent|full-time|part-time|temporary) )?(employees|people|team members|members|persons|associates)"
# The patterns below are disabled.
#r"employed( approximately)?$", 
#r"Total workforce",
#r"((permanent|full|part|time|full-time|part-time|temporary) )+(employees|team members|associates)",
#r"^((permanent|full|part|time|full-time|part-time|temporary|total) )*(employees|team members|associates)"
]                 
# Compiled case-insensitively (re.I).
flag_pats = [re.compile(x, re.I) for x in flag_pat_list]

In [484]:
print_row_detail(df=train_df, nrow=10, header_list = ['acc_id'  ],
                    detail_list = ['len', 'emp_header', 'first_emp_head_block', 'para_text'],
                    sortby=['len', 'acc_id'], ascending=False)

----------------------------------- 0001193125-17-106055 -----------------------------------
len  :5054

emp_header  :False

first_emp_head_block  :False

para_text  :                         Current format     Previous format                             Millions of Euros       Millions of Euros    ASSETS   December  2015     December  2014      ASSETS   December  2015     December  2014              CASH, CASH BALANCES AT CENTRAL BANKS AND OTHER DEMAND DEPOSITS (1)   29,282    27,719     CASH AND BALANCES WITH CENTRAL BANKS   43,467    31,430    FINANCIAL ASSETS HELD FOR TRADING   78,326    83,258     FINANCIAL ASSETS HELD FOR TRADING   78,326    83,258    Derivatives   40,902    44,229     Loans and advances to credit institutions   -    -    Equity instruments   4,534    5,017     Loans and advances to customers   65    128    Debt securities   32,825    33,883     Debt securities   32,825    33,883    Loans and advances to central banks   -    -     Equity instruments   4,534    5,

In [491]:
print_parse_objects(file='0001193125-17-106055')

Index number: 
1298
File path: 
c:/projects/DSBC/capstone/sec_employee_information_extraction/../employee_filings/0001193125-17-106055.html
1
p
<p class="c44">D. <a id="tx368256_32" name="tx368256_32"></a>Employees</p>
Block sibling number: 0
Found match!
<p class="c43">As of December 31, 2016, we, through our various subsidiaries, had 134,792 employees.
Approximately 88% of our employees in Spain held technical, managerial and executive positions, while the
remainder were clerical and support staff. The table below sets forth the number of BBVA employees by
geographic area.</p>
As of December 31, 2016, we, through our various subsidiaries, had 134,792 employees.
Approximately 88% of our employees in Spain held technical, managerial and executive positions, while the
remainder were clerical and support staff. The table below sets forth the number of BBVA employees by
geographic area.
Block sibling number: 1
Block sibling number: 2
Block sibling number: 3
Block sibling number: 4
Block s