# Digital Transformation Advisory

## 02.0 - Document Processing TCs

In [None]:
#'''
# **************************************************************************************************************** #
#*****************************************  IDB - AUG Data Analytics  ******************************************** #
# **************************************************************************************************************** #
#
#-- Notebook Number: 02 - Document Processing
#-- Title: Digital Transformation Advisory
#-- Audit Segment: 
#-- Continuous Auditing: Yes
#-- System(s): pdf files
#-- Description:  
#                - Approval Documents of TCs
#                
#                
#                
#
#-- @author:  Emiliano Colina <emilianoco@iadb.org>
#-- Version:  1.3
#-- Last Update: 01/12/2021
#-- Last Revision Date: 08/23/2020 - Emiliano Colina <emilianoco@iadb.org> 
#                                    

# **************************************************************************************************************** #
#'''

In [None]:
# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

#### Environment Setup

In [None]:
import os
import pandas as pd
import re

###### Required Libraries:

In [None]:
from io import StringIO
from bs4 import BeautifulSoup
from tika import parser

In [None]:
# Set working directory
main_dir = "C:\\Users\\emilianoco\\Desktop\\2020"
data_dir = "/Digital_Transformation"


os.chdir(main_dir + data_dir) # working directory set
print('Working folder set to: ' + os.getcwd()) # working directory check

# ****

In [None]:
import joblib

## Version 1.3 - data from November and December 2020 (4th Iteration)

In [None]:
data_pre = joblib.load('./output/Approval_Documents_Collection_2021-01-12_v10_.joblib.bz2')

In [None]:
data_pre.head()

## Reading

In [None]:
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Approvals_cont"
print(file_dir)

In [None]:
data = data_pre[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name', 'Document_Status']].copy()
data['Document_Content'] = ''
#data.head()
print(data.Document_Status.value_counts())

In [None]:
%%time

# Extract the text of every page of each document with Tika and store the
# resulting list (one string per page) in `Document_Content`.
doc_count = 0
indexes_to_remove = []
for index, document_name in data['Document_Name'].items():
    print("## Processing item:", index)
    filename = file_dir + '\\' + document_name

    # Names ending in 'found' / 'downloaded' are treated as placeholders rather
    # than real files (presumably "not found" / "not downloaded" - confirm upstream).
    if str(filename).endswith(('found', 'downloaded')):
        print("Document not available")
        data.at[index, 'Document_Content'] = 'not available'
        print('------')
        print()
        indexes_to_remove.append(int(index))
        continue

    # First pass: XHTML output, split into <div class="page"> elements.
    parsed_doc = parser.from_file(filename, xmlContent=True)
    soup = BeautifulSoup(parsed_doc['content'])
    pages_txt = []
    for page_div in soup.find_all('div', attrs={'class': 'page'}):
        # Second pass: turn the XHTML fragment of one page into plain text.
        page_text = parser.from_buffer(str(page_div))['content']
        # Tika returns None for a page without text; store it as ''.
        pages_txt.append('' if page_text is None else page_text.strip())

    # save results and report status:
    data.at[index, 'Document_Content'] = pages_txt
    doc_count += 1
    print()
    print("Completed doc index:", index, "Document number:", doc_count)
    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', indexes_to_remove)

In [None]:
# Percentage of blank pages per document, stored as a string with 4 significant digits.
data['blank_pages'] = ''

for index, pages in data['Document_Content'].items():
    print('## Processing index', index)
    if isinstance(pages, list) and pages:
        count = sum(1 for page_text in pages if page_text == '')
        blank_pct = count / len(pages) * 100
    else:
        # No extracted pages (empty list) or the 'not available' sentinel string:
        # avoid dividing by zero / iterating characters, and report the document
        # as fully blank so the "< 60% blank" filter excludes it.
        count = 0
        blank_pct = 100.0

    data.at[index, 'blank_pages'] = format(blank_pct, '.4g')
    print(count)
    print('')

In [None]:
data.blank_pages.value_counts()

In [None]:
# Number of extracted pages per document. Unavailable documents hold the
# 'not available' sentinel string, which must count as 0 pages instead of
# len('not available') == 13.
data['page_count'] = data['Document_Content'].apply(lambda pages: len(pages) if isinstance(pages, list) else 0)
data.shape

In [None]:
data[(~data['Document_Name'].str.contains('Approval Document - GA-274-1')) & (data.page_count > 5) & (data.blank_pages.astype(float) < 60)].copy()

In [None]:
# no additional filtering required

In [None]:
# stores the type of document: tc's, other
data['doc_type'] = ''
# stores the matching title that defines the type and its page
data['doc_identifier'] = ''

In [None]:
# Titles (Spanish / English, including two OCR-garbled spellings) that identify a TC document.
tc_title_re = re.compile(r'DOCUMENTO DE COOPERACIÓN TÉCNICA|INFORMACIÓN BÁSICA DE LA CT|Información Básica de la (CT|Cooperación Técnica)|lnformaci6n Basica de la Cooperaci6n Tecnica|Nombre de la CT|TC DOCUMENT|TC Name|TECHNICAL COOPERATION', re.IGNORECASE)

to_review = []
tc_count = 0
for index, pages in data['Document_Content'].items():
    # Stop at the first page that contains one of the titles.
    hit = None
    for page, page_text in enumerate(pages):
        hit = tc_title_re.search(page_text)
        if hit:
            break

    if hit:
        print('index', index)
        print('TC header found at page:', page)
        tc_count += 1
        data.at[index, 'doc_type'] = 'tc'
        # (matched title, page position) of the first hit
        data.at[index, 'doc_identifier'] = (hit.group(), page)
    else:
        print('check regex on:', index)
        data.at[index, 'doc_type'] = 'other'
        data.at[index, 'doc_identifier'] = ('na', 'na')
        to_review.append(index)

print('TCs identified:', tc_count)

#### Language detection

In [None]:
from langdetect import detect

In [None]:
# stores language identified on the doc_identifier page
data['language'] = ''

In [None]:
%%time
# Detect the language of every TC from the concatenated text of its first five
# pages; documents that are not TCs get 'na'.
for index, row in data.iterrows():
    if row['doc_type'] == 'tc':
        data.at[index, 'language'] = detect(''.join(row['Document_Content'][:5]))
    else:
        # was `datae.at[...]`, a NameError on the first non-TC row
        data.at[index, 'language'] = 'na'

In [None]:
data.language.value_counts()

In [None]:
data

In [None]:
#v1.3: TC documents, both languages - November and December 2020
df_data_novdec = 'df_pre_tcs_2021-01-12_v12.joblib'
joblib.dump(data, './output/' + df_data_novdec + '.bz2', compress=('bz2', 3))#

In [None]:
# **************************************************************************************************************** #

## Version 1.2 - data from October 2020 (3rd Iteration)

In [None]:
data_pre = joblib.load('./output/Approval_Documents_Collection_2020-11-04_v09_.joblib.bz2')

In [None]:
data_pre.head()

## Reading

In [None]:
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Approvals_cont"
print(file_dir)

In [None]:
data = data_pre[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name', 'Document_Status']].copy()
data['Document_Content'] = ''
#data.head()
print(data.Document_Status.value_counts())

In [None]:
%%time

# Extract the text of every page of each document with Tika and store the
# resulting list (one string per page) in `Document_Content`.
doc_count = 0
indexes_to_remove = []
for index, document_name in data['Document_Name'].items():
    print("## Processing item:", index)
    filename = file_dir + '\\' + document_name

    # Names ending in 'found' / 'downloaded' are treated as placeholders rather
    # than real files (presumably "not found" / "not downloaded" - confirm upstream).
    if str(filename).endswith(('found', 'downloaded')):
        print("Document not available")
        data.at[index, 'Document_Content'] = 'not available'
        print('------')
        print()
        indexes_to_remove.append(int(index))
        continue

    # First pass: XHTML output, split into <div class="page"> elements.
    parsed_doc = parser.from_file(filename, xmlContent=True)
    soup = BeautifulSoup(parsed_doc['content'])
    pages_txt = []
    for page_div in soup.find_all('div', attrs={'class': 'page'}):
        # Second pass: turn the XHTML fragment of one page into plain text.
        page_text = parser.from_buffer(str(page_div))['content']
        # Tika returns None for a page without text; store it as ''.
        pages_txt.append('' if page_text is None else page_text.strip())

    # save results and report status:
    data.at[index, 'Document_Content'] = pages_txt
    doc_count += 1
    print()
    print("Completed doc index:", index, "Document number:", doc_count)
    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', indexes_to_remove)

In [None]:
# **************************************************************************************************************** #

In [None]:
# Percentage of blank pages per document, stored as a string with 4 significant digits.
data['blank_pages'] = ''

for index, pages in data['Document_Content'].items():
    print('## Processing index', index)
    if isinstance(pages, list) and pages:
        count = sum(1 for page_text in pages if page_text == '')
        blank_pct = count / len(pages) * 100
    else:
        # No extracted pages (empty list) or the 'not available' sentinel string:
        # avoid dividing by zero / iterating characters, and report the document
        # as fully blank so the "< 60% blank" filter excludes it.
        count = 0
        blank_pct = 100.0

    data.at[index, 'blank_pages'] = format(blank_pct, '.4g')
    print(count)
    print('')

In [None]:
data.blank_pages.value_counts()

In [None]:
# Number of extracted pages per document. Unavailable documents hold the
# 'not available' sentinel string, which must count as 0 pages instead of
# len('not available') == 13.
data['page_count'] = data['Document_Content'].apply(lambda pages: len(pages) if isinstance(pages, list) else 0)
data.shape

In [None]:
data[(~data['Document_Name'].str.contains('Approval Document - GA-274-1')) & (data.page_count > 5) & (data.blank_pages.astype(float) < 60)].copy()

In [None]:
# no additional filtering required

In [None]:
# stores the type of document: tc's, other
data['doc_type'] = ''
# stores the matching title that defines the type and its page
data['doc_identifier'] = ''

In [None]:
# Titles (Spanish / English, including two OCR-garbled spellings) that identify a TC document.
tc_title_re = re.compile(r'DOCUMENTO DE COOPERACIÓN TÉCNICA|INFORMACIÓN BÁSICA DE LA CT|Información Básica de la (CT|Cooperación Técnica)|lnformaci6n Basica de la Cooperaci6n Tecnica|Nombre de la CT|TC DOCUMENT|TC Name|TECHNICAL COOPERATION', re.IGNORECASE)

to_review = []
tc_count = 0
for index, pages in data['Document_Content'].items():
    # Stop at the first page that contains one of the titles.
    hit = None
    for page, page_text in enumerate(pages):
        hit = tc_title_re.search(page_text)
        if hit:
            break

    if hit:
        print('index', index)
        print('TC header found at page:', page)
        tc_count += 1
        data.at[index, 'doc_type'] = 'tc'
        # (matched title, page position) of the first hit
        data.at[index, 'doc_identifier'] = (hit.group(), page)
    else:
        print('check regex on:', index)
        data.at[index, 'doc_type'] = 'other'
        data.at[index, 'doc_identifier'] = ('na', 'na')
        to_review.append(index)

print('TCs identified:', tc_count)

In [None]:
### Language detection

In [None]:
from langdetect import detect

In [None]:
# stores language identified on the doc_identifier page
data['language'] = ''

In [None]:
%%time
# Detect the language of every TC from the concatenated text of its first five
# pages; documents that are not TCs get 'na'.
for index, row in data.iterrows():
    if row['doc_type'] == 'tc':
        data.at[index, 'language'] = detect(''.join(row['Document_Content'][:5]))
    else:
        # was `datae.at[...]`, a NameError on the first non-TC row
        data.at[index, 'language'] = 'na'

In [None]:
data.language.value_counts()

## Version 1.1 - Storing (data from July to September 2020)

In [None]:
#v1.1: TC documents, both languages - July to September 2020
# NOTE(review): this cell closes the "Version 1.2 - data from October 2020" section, yet its
# comment, variable name and output file name are identical to the July-September storage
# cell further down. Running it writes the October `data` to the July-September file name;
# confirm the intended output name for the October iteration.
# Writes `data` to ./output as a bz2-compressed joblib file (compression level 3).
df_data_july2sept = 'df_pre_july2sept_2020-10-16_v11.joblib'
joblib.dump(data, './output/' + df_data_july2sept + '.bz2', compress=('bz2', 3))#

# ****

## Version 1.1 - data from July to September 2020

In [None]:
data_pre = joblib.load('./output/Approval_Documents_Collection_2020-10-16_v08_.joblib.bz2')

In [None]:
data_pre.head()

## Reading

In [None]:
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Approvals_cont"
print(file_dir)

In [None]:
data = data_pre[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name', 'Document_Status']].copy()
data['Document_Content'] = ''
#data.head()
print(data.Document_Status.value_counts())

In [None]:
%%time

# Extract the text of every page of each document with Tika and store the
# resulting list (one string per page) in `Document_Content`.
doc_count = 0
indexes_to_remove = []
for index, document_name in data['Document_Name'].items():
    print("## Processing item:", index)
    filename = file_dir + '\\' + document_name

    # Names ending in 'found' / 'downloaded' are treated as placeholders rather
    # than real files (presumably "not found" / "not downloaded" - confirm upstream).
    if str(filename).endswith(('found', 'downloaded')):
        print("Document not available")
        data.at[index, 'Document_Content'] = 'not available'
        print('------')
        print()
        indexes_to_remove.append(int(index))
        continue

    # First pass: XHTML output, split into <div class="page"> elements.
    parsed_doc = parser.from_file(filename, xmlContent=True)
    soup = BeautifulSoup(parsed_doc['content'])
    pages_txt = []
    for page_div in soup.find_all('div', attrs={'class': 'page'}):
        # Second pass: turn the XHTML fragment of one page into plain text.
        page_text = parser.from_buffer(str(page_div))['content']
        # Tika returns None for a page without text; store it as ''.
        pages_txt.append('' if page_text is None else page_text.strip())

    # save results and report status:
    data.at[index, 'Document_Content'] = pages_txt
    doc_count += 1
    print()
    print("Completed doc index:", index, "Document number:", doc_count)
    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', indexes_to_remove)

In [None]:
# **************************************************************************************************************** #

In [None]:
# Percentage of blank pages per document, stored as a string with 4 significant digits.
data['blank_pages'] = ''

for index, pages in data['Document_Content'].items():
    print('## Processing index', index)
    if isinstance(pages, list) and pages:
        count = sum(1 for page_text in pages if page_text == '')
        blank_pct = count / len(pages) * 100
    else:
        # No extracted pages (empty list) or the 'not available' sentinel string:
        # avoid dividing by zero / iterating characters, and report the document
        # as fully blank so the "< 60% blank" filter excludes it.
        count = 0
        blank_pct = 100.0

    data.at[index, 'blank_pages'] = format(blank_pct, '.4g')
    print(count)
    print('')

In [None]:
data.blank_pages.value_counts()

In [None]:
# Number of extracted pages per document. Unavailable documents hold the
# 'not available' sentinel string, which must count as 0 pages instead of
# len('not available') == 13.
data['page_count'] = data['Document_Content'].apply(lambda pages: len(pages) if isinstance(pages, list) else 0)
data.shape

In [None]:
data[(~data['Document_Name'].str.contains('Approval Document - GA-274-1')) & (data.page_count > 5) & (data.blank_pages.astype(float) < 60)].copy()

In [None]:
# no additional filtering required

In [None]:
# stores the type of document: tc's, other
data['doc_type'] = ''
# stores the matching title that defines the type and its page
data['doc_identifier'] = ''

In [None]:
# Titles (Spanish / English, including two OCR-garbled spellings) that identify a TC document.
tc_title_re = re.compile(r'DOCUMENTO DE COOPERACIÓN TÉCNICA|INFORMACIÓN BÁSICA DE LA CT|Información Básica de la (CT|Cooperación Técnica)|lnformaci6n Basica de la Cooperaci6n Tecnica|Nombre de la CT|TC DOCUMENT|TC Name|TECHNICAL COOPERATION', re.IGNORECASE)

to_review = []
tc_count = 0
for index, pages in data['Document_Content'].items():
    # Stop at the first page that contains one of the titles.
    hit = None
    for page, page_text in enumerate(pages):
        hit = tc_title_re.search(page_text)
        if hit:
            break

    if hit:
        print('index', index)
        print('TC header found at page:', page)
        tc_count += 1
        data.at[index, 'doc_type'] = 'tc'
        # (matched title, page position) of the first hit
        data.at[index, 'doc_identifier'] = (hit.group(), page)
    else:
        print('check regex on:', index)
        data.at[index, 'doc_type'] = 'other'
        data.at[index, 'doc_identifier'] = ('na', 'na')
        to_review.append(index)

print('TCs identified:', tc_count)

In [None]:
### Language detection

In [None]:
from langdetect import detect

In [None]:
# stores language identified on the doc_identifier page
data['language'] = ''

In [None]:
%%time
# Detect the language of every TC from the concatenated text of its first five
# pages; documents that are not TCs get 'na'.
for index, row in data.iterrows():
    if row['doc_type'] == 'tc':
        data.at[index, 'language'] = detect(''.join(row['Document_Content'][:5]))
    else:
        # was `datae.at[...]`, a NameError on the first non-TC row
        data.at[index, 'language'] = 'na'

In [None]:
data.language.value_counts()

## Version 1.1 - Storing (data from July to September 2020)

In [None]:
#v1.1: TC documents, both languages - July to September 2020
df_data_july2sept = 'df_pre_july2sept_2020-10-16_v11.joblib'
joblib.dump(data, './output/' + df_data_july2sept + '.bz2', compress=('bz2', 3))#

# ****

#### Load dataframe from `Digital Transformation Advisory - 01 - Document Collection` notebook

In [None]:
# Load source file (output of the `01 - Document Collection` notebook).
# Bound to `df_base_pre`, the name every following cell reads; the previous
# binding to `df_pre` left `df_base_pre` undefined.
df_base_pre = joblib.load('./output/Approval_Documents_Collection_2020-07-08_v04_.joblib.bz2')

In [None]:
df_base_pre.head()

#### `Approval Document` Reading

###### Document Location:

In [None]:
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Approvals"

print(file_dir)

In [None]:
#Test:
file_dir + '\\' + df_base_pre.Document_Name[15]

###### Dataframe for text processing:

In [None]:
df_base = df_base_pre[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name', 'Document_Status']].copy()

In [None]:
df_base['Document_Content'] = ''

In [None]:
df_base.head()

In [None]:
df_base.Document_Status.value_counts()

###### Read the documents and store the content in the dataframe:

In [None]:
%%time

# Extract the text of every page of each document with Tika and store the
# resulting list (one string per page) in `Document_Content`.
doc_count = 0
indexes_to_remove = []
for index, document_name in df_base['Document_Name'].items():
    print("## Processing item:", index)
    filename = file_dir + '\\' + document_name

    # Names ending in 'found' / 'downloaded' are treated as placeholders rather
    # than real files (presumably "not found" / "not downloaded" - confirm upstream).
    if str(filename).endswith(('found', 'downloaded')):
        print("Document not available")
        df_base.at[index, 'Document_Content'] = 'not available'
        print('------')
        print()
        indexes_to_remove.append(int(index))
        continue

    # First pass: XHTML output, split into <div class="page"> elements.
    parsed_doc = parser.from_file(filename, xmlContent=True)
    soup = BeautifulSoup(parsed_doc['content'])
    pages_txt = []
    for page_div in soup.find_all('div', attrs={'class': 'page'}):
        # Second pass: turn the XHTML fragment of one page into plain text.
        page_text = parser.from_buffer(str(page_div))['content']
        # Tika returns None for a page without text; store it as ''.
        pages_txt.append('' if page_text is None else page_text.strip())

    # save results and report status:
    df_base.at[index, 'Document_Content'] = pages_txt
    doc_count += 1
    print()
    print("Completed doc index:", index, "Document number:", doc_count)
    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', indexes_to_remove)

In [None]:
df_base.head(10)

In [None]:
df_base.Document_Content[6]

In [None]:
df_base.Document_Status.value_counts()

In [None]:
#v0.3 - Store content
f_base = 'TC_Approval_Documents-full_content_v03_2020-07-08.joblib'
joblib.dump(df_base, './output/' + f_base + '.bz2', compress=('bz2', 3))

In [None]:
# **************************************************************************************************************** #

In [None]:
df_base['blank_pages'] = ''

In [None]:
# **************************************************************************************************************** #

In [None]:
df_base[0:10]

In [None]:
# **************************************************************************************************************** #

##### Converted and manually adjusted in the dataframe: 

- 'RG-T3485': `RG-T3485_TC Document - RG-T3485.pdf`
- 'RG-T3539': `RG-T3539_Documento CT - RG-T3539.pdf`
<br>

In [None]:
# Docx files converted to pdf - using MS-Word:
df_base[df_base['Document_Name'].str.endswith('docx')]

In [None]:
# Re-read the two documents that were delivered as .docx and converted to PDF
# with MS-Word: point Document_Name at the PDF and extract its pages.
for index in [1187, 1223]:
    # Write through .at: the chained form df_base['Document_Name'][index] = ...
    # may assign to a temporary copy and leave df_base unchanged.
    df_base.at[index, 'Document_Name'] = df_base.at[index, 'Document_Name'].replace('docx', 'pdf')

    print("## Processing item:", str(index))
    filename = file_dir + '\\' + df_base.at[index, 'Document_Name']

    # First pass: XHTML output, split into <div class="page"> elements.
    parsed_doc = parser.from_file(filename, xmlContent=True)
    soup = BeautifulSoup(parsed_doc['content'])
    pages_txt = []
    for page_div in soup.find_all('div', attrs={'class': 'page'}):
        # Second pass: turn the XHTML fragment of one page into plain text.
        page_text = parser.from_buffer(str(page_div))['content']
        # Tika returns None for a page without text; store it as ''.
        pages_txt.append('' if page_text is None else page_text.strip())

    # save results and report status (doc_count continues from the main reading cell):
    df_base.at[index, 'Document_Content'] = pages_txt
    doc_count += 1
    print()
    print("Completed doc index:", str(index), "Document number:", str(doc_count))
    print('------')
    print()
    

In [None]:
df_base[1187:1188]

In [None]:
# **************************************************************************************************************** #

In [None]:

# Percentage of blank pages per document, stored as a string with 4 significant digits.
for index, pages in df_base['Document_Content'].items():
    print('## Processing index', index)
    if isinstance(pages, list) and pages:
        count = sum(1 for page_text in pages if page_text == '')
        blank_pct = count / len(pages) * 100
    else:
        # No extracted pages (empty list) or the 'not available' sentinel string:
        # avoid dividing by zero / iterating characters, and report the document
        # as fully blank so the "< 60% blank" filter excludes it.
        count = 0
        blank_pct = 100.0

    df_base.at[index, 'blank_pages'] = format(blank_pct, '.4g')
    print(count)
    print('')

In [None]:
df_base['Document_Content'][6]

In [None]:
df_base[0:10]

In [None]:
df_base.blank_pages.value_counts()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# blank_pages holds formatted strings (e.g. '33.33'); convert to float so the
# histograms below bin the values numerically.
blank_pages_per_document = df_base.blank_pages.astype(float).tolist()

In [None]:
blank_pages_per_document

In [None]:
%matplotlib inline
# Histogram of the percentage of blank pages per document, in 5-point bins from 0 to 100.
plt.rcParams["figure.figsize"] = 10, 10
plt.hist(
    [float(pct) for pct in blank_pages_per_document],  # values may still be formatted strings
    bins=list(range(0, 101, 5)),
    alpha=0.75,
    histtype='bar',
    ec='black',
)
# The x axis is the blank-page percentage, not a page count.
plt.xlabel('Blank pages per document (%)')
plt.ylabel('Frequency')
plt.title(r'blank pages per document (%)')
plt.show()

In [None]:
import pylab as P

P.figure()

bins = [0, 10,20,30,40,50,60,70,80,90,100]
# the histogram of the data with histtype='step'
n, bins, patches = P.hist(blank_pages_per_document, bins, histtype='bar', rwidth=0.8)

In [None]:
# **************************************************************************************************************** #

In [None]:
# Number of extracted pages per document. Unavailable documents hold the
# 'not available' sentinel string, which must count as 0 pages instead of
# len('not available') == 13.
df_base['page_count'] = df_base['Document_Content'].apply(lambda pages: len(pages) if isinstance(pages, list) else 0)

In [None]:
df_base.head()

In [None]:
# Plotting the document length distribution:

P.figure()
# the histogram of the data with histtype='step'
n, bins, patches = P.hist(df_base.page_count.to_list(), bins, histtype='bar', rwidth=0.8)

In [None]:
# **************************************************************************************************************** #

In [None]:
# Specifics - Check document: 'Approval Document-BR-T1408_18Nov2019.pdf'

In [None]:
df_base[df_base['Document_Name'].str.contains('Approval Document-BR-T1408_18Nov2019')]

In [None]:
df_base.Document_Content[170]

In [None]:
df_base[df_base['Document_Name'].str.contains('Approval Document - AR-T1206_12Dec2018-171614')]

In [None]:
df_base[df_base['Document_Name'].str.contains('Approval Document - BO-T1306_12Apr2018-15374')]

In [None]:
df_base.Document_Content[102][1:5]

In [None]:
df_base[df_base['Document_Name'].str.contains('Approval Document - PE-T1424_01Jul2019-17134')]

In [None]:
df_base.Document_Content[707][0:5]

In [None]:
df_base[df_base['Document_Name'].str.contains('Approval Document - RG-T3394 _30Apr2019-103759')]

In [None]:
df_base[df_base['Document_Name'].str.contains('Approval Document - SU-T1102_21Jun2018-141515')]

In [None]:
# **************************************************************************************************************** #

## Filtering 

#### Step_1

As of 07/10, <b>filter out</b> files under the following conditions:
* documents containing `Approval Document - GA-274-1-` as Document_Name
* documents with page_count < 6
* documents with 60% or more blank pages

In [None]:
# Step_1 filter: drop the GA-274-1 approval documents, keep documents with
# more than 5 pages and with less than 60% blank pages.
is_ga_274 = df_base['Document_Name'].str.contains('Approval Document - GA-274-1')
has_enough_pages = df_base.page_count > 5
is_mostly_text = df_base.blank_pages.astype(float) < 60
df_filtered = df_base[~is_ga_274 & has_enough_pages & is_mostly_text].copy()
df_filtered

In [None]:
# Plotting the new results:
P.figure()
# the histogram of the data with histtype='step'
n, bins, patches = P.hist(df_filtered.page_count.to_list(), bins, histtype='bar', rwidth=0.8, color='g')

In [None]:
df_base[df_base.blank_pages.astype(float) > 60].shape

In [None]:
# **************************************************************************************************************** #

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [None]:
plt.figure(figsize=(12.8,6))
sns.distplot(df_filtered.page_count, color='g').set_title('Document length distribution (in pages)');

In [None]:
plt.figure(figsize=(12.8,6))
sns.distplot(df_filtered.blank_pages).set_title('Blank Pages distribution (% of blank pages)');

In [None]:
# **************************************************************************************************************** #

In [None]:
df_base.shape[0], df_filtered.shape[0]

In [None]:
# **************************************************************************************************************** #

#### Step_2 

######  classifier

After the above filtering decisions, a second filtering level is performed where TC's components -such as common titles- are reviewed

In [None]:
# Copy of the previous result to work with:
df_filtered_2 = df_filtered.copy()

In [None]:
# stores the type of document: tc's, other
df_filtered_2['doc_type'] = ''
# stores the matching title that defines the type and its page
df_filtered_2['doc_identifier'] = ''

In [None]:
# Titles (Spanish / English, including one OCR-garbled spelling) that identify a TC document.
tc_title_re = re.compile(r'DOCUMENTO DE COOPERACIÓN TÉCNICA|INFORMACIÓN BÁSICA DE LA CT|Información Básica de la (CT|Cooperación Técnica)|lnformaci6n Basica de la Cooperaci6n Tecnica|TC DOCUMENT|TC Name', re.IGNORECASE)

to_review = []
tc_count = 0
for index, pages in df_filtered_2['Document_Content'].items():
    # Stop at the first page that contains one of the titles.
    hit = None
    for page, page_text in enumerate(pages):
        hit = tc_title_re.search(page_text)
        if hit:
            break

    if hit:
        print('index', index)
        print('TC header found at page:', page)
        tc_count += 1
        df_filtered_2.at[index, 'doc_type'] = 'tc'
        # (matched title, page position) of the first hit
        df_filtered_2.at[index, 'doc_identifier'] = (hit.group(), page)
    else:
        print('check regex on:', index)
        df_filtered_2.at[index, 'doc_type'] = 'other'
        df_filtered_2.at[index, 'doc_identifier'] = ('na', 'na')
        to_review.append(index)

print('TCs identified:', tc_count)

In [None]:
# indexes of documents to review:
#len(to_review)

df_filtered_2.loc[to_review]

In [None]:
df_filtered_2.Document_Content[1357][0:5]

In [None]:
for element in to_review:
    print(str(element),df_filtered_2.Document_Name[element])

In [None]:
df_filtered_2.loc[~df_filtered_2.index.isin(to_review)]

In [None]:
#step_2
df_filtered_2_step_2 = df_filtered_2.loc[~df_filtered_2.index.isin(to_review)].copy()

# **************************************************************************************************************** #
<br>
<br>
<br>

## Storing -intermediate- Results

#### Save results:

In [None]:
#v1.01: NULL_URL TC documents, both languages
f_df_pre_null_v101 = 'df_pre_null_2020-08-25_v101.joblib'
joblib.dump(df_filtered_null_2, './output/' + f_df_pre_null_v101 + '.bz2', compress=('bz2', 3))#

In [None]:
#v1.0: NULL_URL TC documents, both languages
f_df_pre_null_v10 = 'df_pre_null_2020-08-24_v10.joblib'
joblib.dump(df_filtered_null_2, './output/' + f_df_pre_null_v10 + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.8: English TC documents, content extracted and cleaned - pending supraindexes removal
f_df_pre_en_v08 = 'df_pre_en_2020-08-23_v08.joblib'
joblib.dump(df_pre_en, './output/' + f_df_pre_en_v08 + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.7: Spanish documents, content extracted and cleaned - pending conversion of 6 to ó
f_df_pre_es_v07 = 'df_pre_es_2020-07-14_v07.joblib'
joblib.dump(df_pre_es, './output/' + f_df_pre_es_v07 + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.6: Spanish documents, content extracted
f_df_pre_es_v06 = 'df_pre_es_2020-07-14_v06.joblib'
joblib.dump(df_pre_es, './output/' + f_df_pre_es_v06 + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.5: Added document language (es, en, na), selected spanish, filtered, identified interest sections by title and their respective location
f_df_filtered_es_v05 = 'df_filtered_es_2020-07-11_v05.joblib'
joblib.dump(df_pre_es, './output/' + f_df_filtered_es_v05 + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.4: Added document type (tc, other)
f_df_filtered_2_v04 = 'df_filtered_2_step_2_2020-07-11_v04.joblib'
joblib.dump(df_filtered_2_step_2, './output/' + f_df_filtered_2_v04 + '.bz2', compress=('bz2', 3))#

In [None]:
#######################

In [None]:
df_pre = df_filtered_2_step_2.copy()

In [None]:
df_pre.shape

In [None]:
## Recover df_pre_es: 07/13 v0.5
df_pre_es = joblib.load('./output/df_filtered_es_2020-07-11_v05.joblib.bz2')

In [None]:
df_pre_es.shape

In [None]:
df_pre_es.head()

# **************************************************************************************************************** #
<br>
<br>
<br>

### Language detection over the 'doc_identifier'

In [None]:
from langdetect import detect

In [None]:
# Column that will store the language identified for each document
# (initialised to '' and filled in by the detection loops below).
df_pre['language'] = ''

In [None]:
%%time
# First pass: for TC documents, run language detection on the single page whose
# number is stored in doc_identifier; every other document type is tagged 'na'.
for index in df_pre.index:
    if df_pre['doc_type'][index] != 'tc':
        df_pre.at[index, 'language'] = 'na'
        continue
    identifier_page = df_pre.doc_identifier[index][1]  # page number kept in doc_identifier
    page_text = df_pre['Document_Content'][index][identifier_page]
    df_pre.at[index, 'language'] = detect(page_text)

In [None]:
df_pre.language.value_counts()

In [None]:
#### 2nd run on the first 3 pages:

In [None]:
%%time
# Second pass: for TC documents, run language detection on the first three pages
# joined together; every other document type is tagged 'na'.
# langdetect is non-deterministic by default, so the seed is fixed to make repeated
# runs assign the same language to short or mixed-language documents.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

for index in df_pre.index:
    if df_pre['doc_type'][index] == 'tc':
        first_pages = ''.join(df_pre['Document_Content'][index][:3])
        df_pre.at[index, 'language'] = detect(first_pages)
    else:
        df_pre.at[index, 'language'] = 'na'

In [None]:
df_pre.language.value_counts()

In [None]:
df_pre.head(5)

#### Notes:
After inspection and the titles search, the language of some documents was found to differ from the detected one.
The following indexes were manually changed to 'en':
909, 920, 1045 and 1105
<br>
Additionally, the documents `CO-T1580_Approval Document-CO-T1580 .pdf` and `PE-T1408_Approval Document - PE-T1408_15Oct2018-143108.pdf` had their intermediate titles also changed on page 4 (indexes 293, 698)
<br>
Finally, some documents had missing pages (see the list further below).

In [None]:
# Manual override: these indexes were set to English after inspection.
for idx_en in (909, 920, 1045, 1105):
    df_pre.at[idx_en, 'language'] = 'en'

In [None]:
# Split by language so that titles can be processed per language.
# Spanish subset (independent copy of the selected rows):
is_spanish = df_pre['language'] == 'es'
df_pre_es = df_pre.loc[is_spanish].copy()

In [None]:
# Manual change of the intermediate title on page index 4 of
# `CO-T1580_Approval Document-CO-T1580 .pdf` (index 293).
lista_aux = df_pre_es['Document_Content'][293]
original_page = lista_aux[4]
lista_aux[4] = original_page.replace('Objetivos y Justificación de la CT', 'Descripción de las actividades y resultados')
df_pre_es.at[293, 'Document_Content'] = lista_aux

In [None]:
# Same manual change of the intermediate title on page index 4, this time for
# `PE-T1408_Approval Document - PE-T1408_15Oct2018-143108.pdf` (index 698).
lista_aux = df_pre_es['Document_Content'][698]
original_page = lista_aux[4]
lista_aux[4] = original_page.replace('Ill. Actividades', 'Ill. Descripción de las actividades y resultados')
df_pre_es.at[698, 'Document_Content'] = lista_aux

###
Documents with missing pages: 
    - `CH-T1197_Approval Document -CH-T1197.pdf` (index 202)  
    - `CO-T1439_Approval Document - CO-T1439 .pdf` (index 230)
#### Pending decision - meanwhile they are removed from df_pre_es


In [None]:
# Remove index 202 (`CH-T1197_Approval Document -CH-T1197.pdf`): document with missing pages.
df_pre_es.drop([202], inplace=True)

In [None]:
# Remove index 230 (`CO-T1439_Approval Document - CO-T1439 .pdf`): document with missing pages.
df_pre_es.drop([230], inplace=True)

In [None]:
df_pre_es.shape

In [None]:
df_pre_es.head()

In [None]:
# **************************************************************************************************************** #

### Titles search

##### 'Objetivos y justificación' 

In [None]:
# Regex for the opening section title ('Objetivos y Justificación' and the variants listed).
# Layout: optional leading newlines/blanks, a section-number token built from the
# characters in [2IV31l] (presumably Roman numerals plus digits/'l' produced by
# imperfect text extraction -- TODO confirm), an optional dot, one of the title
# variants, then optional trailing blanks and newline.
# The 'Justificaci6n' variant presumably covers titles where 'ó' was extracted as '6'.
pattern_es_1 = r'\n?\s?\n?\s?[2IV31l]+\.?\s{0,}(Objetivos? y Justificación((\s?de (la\s)?(CT\:?|Cooperación Técnica|TC)\.?)|(\sdel Proyecto)|\:| de la Cooperación Técnica \(CT\))?|Justificación y Objetivos de la CT|Justificación y Objetivo|Problema\, Objetivos y Justificación de la CT\.?|OBJETIVOS Y JUSTIFICACIÓN DE LA OPERACIÓN DE COOPERACIÓN TÉCNICA|Justificaci6n y objetivo|Objetivos y Justificación de la CT \(estimado\: 1 página\)|DESCRIPCIÓN DEL PRÉSTAMO\/GARANTÍA ASOCIADO)\s{0,}\n?'

In [None]:
df_pre_es['Document_Content'][301][1]

In [None]:
## Replace a specific title so that the opening-title pattern can match:
# index 301, page index 1: 'II. Objetivo' -> 'II. Objetivo y justificación'
lista_aux = df_pre_es['Document_Content'][301]
original_page = lista_aux[1]
lista_aux[1] = original_page.replace('II. Objetivo', 'II. Objetivo y justificación')
df_pre_es.at[301, 'Document_Content'] = lista_aux

In [None]:
re.search(pattern_es_1, df_pre_es['Document_Content'][301][1], re.IGNORECASE)

In [None]:
# Dry run for the opening title: for each Spanish document, report the first page
# (from the doc_identifier page onwards) where pattern_es_1 matches, ignoring case.
# Documents without any match are collected in index_es_to_check.
index_es_to_check = []

for index in df_pre_es.index:
    print()
    print('Processsing index:', str(index))

    page_base = df_pre_es['doc_identifier'][index][1]  # starting page
    pages = df_pre_es['Document_Content'][index]

    for i in range(page_base, len(pages)):
        if re.search(pattern_es_1, pages[i], re.IGNORECASE) is None:
            continue
        print('* pattern found at document page:', str(i))
        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)

##### 'Descripción de las actividades y resultados'

In [None]:
### pattern_2

In [None]:
# Regex for the intermediate section title ('Descripción de las actividades y resultados'
# and the variants listed).
# NOTE(review): the alternatives after the first parenthesised group are joined with a
# top-level '|'. As written, the numeral prefix ('[IV\.\,ll]+\s+') applies only to the
# first alternative and the trailing '\s{0,}\n' only to the last one; the alternatives in
# between match their text anywhere on the page. pattern_es_1 and pattern_es_3 wrap all
# of their alternatives in a single group -- confirm whether the same was intended here.
pattern_es_2 = r'\n?\s?\n?\s?[IV\.\,ll]+\s+(Descripción (de )((las)?\s?actividades\s?|los\s)?((\/|\,\s)?Componentes y (Resultados|Actividades)|\/?\s?componentes3? y presupuesto(\:|\.)?|\sy resultados|\sdel proyecto|\sy presupuesto|\, componentes y presupuesto\.?|componentes\s?(\,|\/)\s?actividades y (productos|presupuesto|resultados)|actividades y productos|\, componentes\, resultados y presupuesto| y presupuesto\.?|y Resultados|\, resultados y presupuesto|\, los componentes y el presupuesto))|(Actividades\/componentes y presupuesto)|(Actividades y Componentes)|(Descripción de componentes\/actividades y presupuesto)|(Descripción de las actividades)|(Descripción de los objetivos actividades y presupuesto)|(Descripción de componentes y productos)|(Descripción Actividades y Resultados)\s{0,}\n'

In [None]:
# Dry run for the intermediate title: for each Spanish document, report the first page
# (from the doc_identifier page onwards) where pattern_es_2 matches, ignoring case.
# Unmatched documents go to index_es_to_check; matches on page 0 go to results_in_page_0.
index_es_to_check = []
results_in_page_0 = []

for index in df_pre_es.index:
    print()
    print('Processsing index:', str(index))

    page_base = df_pre_es['doc_identifier'][index][1]  # starting page
    pages = df_pre_es['Document_Content'][index]

    for i in range(page_base, len(pages)):
        if re.search(pattern_es_2, pages[i], re.IGNORECASE) is None:
            continue
        print('* pattern found at document page:', str(i))
        if i == 0:
            results_in_page_0.append(index)
        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)
print('Results in page 0:', results_in_page_0)

In [None]:
df_pre_es['Document_Content'][884][4]

####

##### 'Agencia Ejecutora y estructura de ejecución'

In [None]:
# Regex for the closing section title ('Agencia Ejecutora y estructura de ejecución'
# and the variants listed).
# Layout: optional leading newlines/blanks, a section-number token built from the
# characters in [IV\.5], optional blanks, one of the title variants, then optional
# trailing blanks and newline. The short variants near the end ('Organismo Ejecutor',
# 'Estructura de ejecución', 'Agencia Ejecutora', 'Mecanismo de Ejecución') match
# without requiring the longer 'y estructura de ejecución' wording.
pattern_es_3 = r'\n?\s?\n?\s?[IV\.5]+\s{0,}(((4\.1\s+)?Agencia Ejecutora(\s+\(AE\))?|Organismo Ejecutor|Unidad Ejecutora|Entidad Ejecutora) y estructura de ejecución|Estructura del Organismo Ejecutor\s?\(?O?E?\)?|Organismo de Ejecución y Estructura de Implementación|Agencia ejecutora y justificación de la estructura de ejecución|Organismo Ejecutor|Estructura de ejecución|Agencia Ejecutora|Mecanismo de Ejecución)\s{0,}\n?'

In [None]:
# Dry run for the closing title: for each Spanish document, report the first page
# (from the doc_identifier page onwards) where pattern_es_3 matches, ignoring case.
# Documents without any match are collected in index_es_to_check.
index_es_to_check = []

for index in df_pre_es.index:
    print()
    print('Processsing index:', str(index))

    page_base = df_pre_es['doc_identifier'][index][1]  # starting page
    pages = df_pre_es['Document_Content'][index]

    for i in range(page_base, len(pages)):
        if re.search(pattern_es_3, pages[i], re.IGNORECASE) is None:
            continue
        print('* pattern found at document page:', str(i))
        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)

### Titles results

In [None]:
# Result columns, initialised to ''; the cells below store a
# (matched_title, page_number) tuple in each of them.
for title_col in ('title_inicial', 'title_medio', 'title_final'):
    df_pre_es[title_col] = ''

###### title inicial

In [None]:
# Locate the opening title in every Spanish document and store it in 'title_inicial'
# as a (matched_text, page_number) tuple. All pages are scanned, starting at page 0.
# Documents without a match keep '' and are listed in index_es_to_check.
index_es_to_check = []
for index in df_pre_es.index:
    print()
    print('Processsing index:', str(index))

    pages = df_pre_es['Document_Content'][index]
    for i, page_text in enumerate(pages):
        match = re.search(pattern_es_1, page_text, re.IGNORECASE)  # searched once, reused below
        if match is None:
            continue

        print('* pattern found at document page:', str(i))

        ## storing:
        inicial_match_title = match.group()
        inicial_match_page = i
        df_pre_es.at[index, 'title_inicial'] = (inicial_match_title, inicial_match_page)

        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)


In [None]:
df_pre_es.head()

In [None]:
df_pre_es.title_inicial[2]

###### title medio

In [None]:
# Locate the intermediate title in every Spanish document, searching from the page of
# the opening title onwards, and store it in 'title_medio' as a
# (matched_text, page_number) tuple. Documents without a match are listed in
# index_es_to_check.
index_es_to_check = []

for index in df_pre_es.index:
    print()
    print('Processsing index:', str(index))

    previous_title = df_pre_es['title_inicial'][index]
    if not previous_title:
        # 'title_inicial' still holds the '' placeholder: indexing it would raise
        # IndexError and abort the whole loop, so flag the document and move on.
        print('check regex on:', str(index), '(title_inicial missing)')
        index_es_to_check.append(index)
        continue
    page_base = previous_title[1]  # starting page

    pages = df_pre_es['Document_Content'][index]
    for i in range(page_base, len(pages)):
        match = re.search(pattern_es_2, pages[i], re.IGNORECASE)  # searched once, reused below
        if match is None:
            continue
        print('* pattern found at document page:', str(i))

        ## storing:
        medio_match_title = match.group()
        medio_match_page = i
        df_pre_es.at[index, 'title_medio'] = (medio_match_title, medio_match_page)

        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)

###### title final

In [None]:
# Locate the closing title in every Spanish document, searching from the page of the
# intermediate title onwards, and store it in 'title_final' as a
# (matched_text, page_number) tuple. Documents without a match are listed in
# index_es_to_check.
index_es_to_check = []

for index in df_pre_es.index:
    print()
    print('Processsing index:', str(index))

    previous_title = df_pre_es['title_medio'][index]
    if not previous_title:
        # 'title_medio' still holds the '' placeholder: indexing it would raise
        # IndexError and abort the whole loop, so flag the document and move on.
        print('check regex on:', str(index), '(title_medio missing)')
        index_es_to_check.append(index)
        continue
    page_base = previous_title[1]  # starting page

    pages = df_pre_es['Document_Content'][index]
    for i in range(page_base, len(pages)):
        match = re.search(pattern_es_3, pages[i], re.IGNORECASE)  # searched once, reused below
        if match is None:
            continue
        print('* pattern found at document page:', str(i))

        ## storing:
        final_match_title = match.group()
        final_match_page = i
        df_pre_es.at[index, 'title_final'] = (final_match_title, final_match_page)

        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)

In [None]:
df_pre_es.head()

#### check for crossed titles

In [None]:
# Sanity check on the page order of the three titles of each Spanish document.
# Strictly increasing pages are reported as OK; a middle title located before the
# first one is reported separately; anything else (including titles sharing a page)
# is collected in other_case.
other_case = []
for index in df_pre_es.index:
    t_ini = df_pre_es.title_inicial[index]
    t_mid = df_pre_es.title_medio[index]
    t_end = df_pre_es.title_final[index]
    p_ini, p_mid, p_end = t_ini[1], t_mid[1], t_end[1]

    if p_ini < p_mid < p_end:
        print('Sequence OK for index:', str(index))
    elif p_end > p_ini > p_mid:
        print('middle title before the first title on index:', str(index))
    else:
        print('other case on:', str(index))
        other_case.append(index)

    # alert on cases where the extension between titles is greater than 10 pages
    if (p_end - p_ini) > 10:
        print('File to check due to extension between titles:', df_pre_es['Document_Name'][index])
        print((t_ini[0], t_ini[1]), (t_mid[0], t_mid[1]), (t_end[0], t_end[1]))
        print()

In [None]:
other_case

In [None]:
# Manual inspection of index 170: print the document name and the
# (title, page) pairs found for the opening, intermediate and closing titles.
index = 170
print(df_pre_es['Document_Name'][index])
#max([df_pre_es.title_inicial[index][1],df_pre_es.title_medio[index][1],df_pre_es.title_final[index][1]])
print((df_pre_es.title_inicial[index][0], df_pre_es.title_inicial[index][1]), (df_pre_es.title_medio[index][0], df_pre_es.title_medio[index][1]), (df_pre_es.title_final[index][0], df_pre_es.title_final[index][1]))

### Removed from df_pre_es: 
- `BR-T1408_Approval Document-BR-T1408_18Nov2019.pdf`

In [None]:
# Remove index 170 from the Spanish set
# (presumably `BR-T1408_Approval Document-BR-T1408_18Nov2019.pdf`, per the note above -- confirm).
df_pre_es.drop([170], inplace=True)

In [None]:
df_pre_es.shape

In [None]:
# Manual inspection of index 321: print the document name and the
# (title, page) pairs found for the opening, intermediate and closing titles.
index = 321
print(df_pre_es['Document_Name'][index])
#max([df_pre_es.title_inicial[index][1],df_pre_es.title_medio[index][1],df_pre_es.title_final[index][1]])
print((df_pre_es.title_inicial[index][0], df_pre_es.title_inicial[index][1]), (df_pre_es.title_medio[index][0], df_pre_es.title_medio[index][1]), (df_pre_es.title_final[index][0], df_pre_es.title_final[index][1]))

In [None]:
# Manual inspection of index 1377: (title, page) pairs for the three titles.
index = 1377
#max([df_pre_es.title_inicial[index][1],df_pre_es.title_medio[index][1],df_pre_es.title_final[index][1]])
print((df_pre_es.title_inicial[index][0], df_pre_es.title_inicial[index][1]), (df_pre_es.title_medio[index][0], df_pre_es.title_medio[index][1]), (df_pre_es.title_final[index][0], df_pre_es.title_final[index][1]))

In [None]:
# Manual inspection of index 1338: (title, page) pairs for the three titles.
index = 1338
#max([df_pre_es.title_inicial[index][1],df_pre_es.title_medio[index][1],df_pre_es.title_final[index][1]])
print((df_pre_es.title_inicial[index][0], df_pre_es.title_inicial[index][1]), (df_pre_es.title_medio[index][0], df_pre_es.title_medio[index][1]), (df_pre_es.title_final[index][0], df_pre_es.title_final[index][1]))

In [None]:
# Manual inspection of index 1328: (title, page) pairs for the three titles.
index = 1328
#max([df_pre_es.title_inicial[index][1],df_pre_es.title_medio[index][1],df_pre_es.title_final[index][1]])
print((df_pre_es.title_inicial[index][0], df_pre_es.title_inicial[index][1]), (df_pre_es.title_medio[index][0], df_pre_es.title_medio[index][1]), (df_pre_es.title_final[index][0], df_pre_es.title_final[index][1]))

In [None]:
# Manual inspection of index 984: (title, page) pairs for the three titles.
index = 984
#max([df_pre_es.title_inicial[index][1],df_pre_es.title_medio[index][1],df_pre_es.title_final[index][1]])
print((df_pre_es.title_inicial[index][0], df_pre_es.title_inicial[index][1]), (df_pre_es.title_medio[index][0], df_pre_es.title_medio[index][1]), (df_pre_es.title_final[index][0], df_pre_es.title_final[index][1]))

In [None]:
# Manual inspection of index 777: (title, page) pairs for the three titles.
index = 777
#max([df_pre_es.title_inicial[index][1],df_pre_es.title_medio[index][1],df_pre_es.title_final[index][1]])
print((df_pre_es.title_inicial[index][0], df_pre_es.title_inicial[index][1]), (df_pre_es.title_medio[index][0], df_pre_es.title_medio[index][1]), (df_pre_es.title_final[index][0], df_pre_es.title_final[index][1]))

#### footer and header clean-up

In [None]:
df_pre_es.shape

In [None]:
# Column that will store the extracted section text of each Spanish document
# (initialised to '' and filled in by the clean-up routine).
df_pre_es['extracted'] = ''

In [None]:
# Clean-up routine (v1.0)
# For every Spanish document: join the pages between the opening and the closing
# title, remove the page-number header, the footnote area and URLs, and keep the text
# running from the opening title up to 'Presupuesto Indicativo' (or, when that is
# absent, up to the closing title). The result is stored in the 'extracted' column.
# Documents whose start or end marker cannot be located in the cleaned text are
# listed in extraction_to_check instead of aborting the loop.

# page-number header at the very start of a page, e.g. '- 3 -'
header_re = re.compile(r'^\s?\-\s{0,3}\d\d?\s{0,3}\-')

# footnote detectors, tried in this order; the first one that matches cuts the page
footnote_checks = [
    ("* Footnote pattern 1: '30+ blanks + digit' at:",
     re.compile(r'\s{30,}\d{1,2}\s+[A-Z]')),
    ("* Footnote 2: '2 or 3 blanks + 1 or 2 digits' at:",
     re.compile(r'\n\n\n\d\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|May|Jun|Jul|Ago|Sep|Set|Oct|Nov|Dic)([A-Z\¿]|http)')),
    ("* Footnote 3: 'xa0 type' at:",
     re.compile(r'\n+\xa0+\n\d')),
]

url_re = re.compile(r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;]+')
start_title_re = re.compile(pattern_es_1, re.IGNORECASE)
budget_re = re.compile(r'Presupuesto (I|i)ndicativo')
end_title_re = re.compile(pattern_es_3, re.IGNORECASE)

extraction_to_check = []

#for index in [30]:
for index in df_pre_es.index:
    page_ini = df_pre_es.title_inicial[index][1]
    page_fin = df_pre_es.title_final[index][1]

    print('### Processing index: ', str(index), ' - page range:', str(page_ini),str(page_fin))
    cleaned_pages = []
    for j in range(page_ini, page_fin+1):

        # header cleanup:
        page = header_re.sub('', df_pre_es['Document_Content'][index][j])

        # check for footnote and remove everything from its start onwards:
        for message, footnote_re in footnote_checks:
            footnote = footnote_re.search(page)
            if footnote is not None:
                print(message, str(j))
                page = page[:footnote.start()]
                break

        cleaned_pages.append(page)

    texto = url_re.sub(' ', ''.join(cleaned_pages))

    # cutting sections based on titles
    needs_review = False
    start_match = start_title_re.search(texto)
    if start_match is None:
        # the opening title did not survive the clean-up: keep the text from its start
        print('check extraction start on:', str(index))
        needs_review = True
        ini = 0
    else:
        ini = start_match.start()

    # The end marker is searched only AFTER the opening title: an earlier occurrence
    # would give fin < ini and silently produce an empty extraction.
    end_match = budget_re.search(texto, ini)  # search for 'Presupuesto Indicativo'
    if end_match is None:
        end_match = end_title_re.search(texto, ini)  # pattern_3, as border condition
    if end_match is None:
        print('check extraction end on:', str(index))
        needs_review = True
        fin = len(texto)
    else:
        fin = end_match.start()

    if needs_review:
        extraction_to_check.append(index)

    texto = texto[ini:fin].strip()
    #print(texto)

    # store extracted content in dataframe
    df_pre_es.at[index, 'extracted'] = texto

    print()
    print()
    print('#-#-#-#')
    print()
print('Extractions to check', extraction_to_check)

In [None]:
# (store results as v0.6)

#### supra-indexes removal

In [None]:
print(df_pre_es['extracted'][1359])

In [None]:
# Column that will store the extracted text after supra-index removal
# (initialised to '' and filled in by the loop below).
df_pre_es['extracted_cleaned'] = ''

In [None]:
# Supra-index removal: strip the 1-2 digit footnote marker glued to the end of a word
# (optionally followed by a final dot), e.g. 'resultados3.' -> 'resultados.'.
# Only the trailing marker is removed; digits elsewhere in the word are preserved.
# The text is split on whitespace and re-joined with single spaces.
supra_index_re = re.compile(r'(?<=[A-Za-záéíóú\-\)])\d{1,2}(?=\.?$)')

for index in df_pre_es.index:
    words = df_pre_es['extracted'][index].split()
    res_clean = ' '.join(supra_index_re.sub('', word) for word in words)
    df_pre_es.at[index, 'extracted_cleaned'] = res_clean

In [None]:
df_pre_es.head()

In [None]:
df_pre_es.extracted_cleaned[0]

In [None]:
#(store as v0.7 - pending convert 6 to ó)

In [None]:
# Export the Spanish processing results (v0.7) to Excel in the working directory.
# Only the columns listed below are written.
export_columns = [
    'FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
    'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name',
    'Document_Status', 'blank_pages', 'page_count',
    'doc_identifier', 'title_inicial',
    'title_medio', 'title_final', 'extracted', 'extracted_cleaned',
]
df_pre_es[export_columns].to_excel('TCs_Approval-Docs_ES_Processing_2020-07-14_v07.xlsx')

In [None]:
df_pre_es.Document_Content[171]

In [None]:
df_pre_es.Document_Name[171]

In [None]:
df_pre_es.loc[[171]]

In [None]:
df_pre_es.title_inicial[171], df_pre_es.title_medio[171], df_pre_es.title_final[171]

In [None]:
# Print the three stored titles for every index flagged in other_case.
for index in other_case:
    print(str(index))
    print(*(df_pre_es[title_col][index] for title_col in ('title_inicial', 'title_medio', 'title_final')))
    print('~~~')
    print()

In [None]:
print(df_pre_es.Document_Content[673][4])

In [None]:
#######################

## v0.8 - English TCs processing:

In [None]:
## Starting point for the English processing: reload the v0.4 checkpoint
# (bz2-compressed joblib file written by the v0.4 dump cell).
df_pre_full = joblib.load('./output/df_filtered_2_step_2_2020-07-11_v04.joblib.bz2')

In [None]:
df_pre_full.shape

In [None]:
#######################

### v0.8 - Language detection

In [None]:
from langdetect import detect

In [None]:
# Column that will store the language identified for each document
# (initialised to '' and filled in by the detection loop below).
df_pre_full['language'] = ''

In [None]:
#### Detection performed on the first 3 pages:

In [None]:
%%time
# For TC documents, run language detection on the first three pages joined together;
# every other document type is tagged 'na'.
# langdetect is non-deterministic by default, so the seed is fixed to make repeated
# runs assign the same language to short or mixed-language documents.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

for index in df_pre_full.index:
    if df_pre_full['doc_type'][index] == 'tc':
        first_pages = ''.join(df_pre_full['Document_Content'][index][:3])
        df_pre_full.at[index, 'language'] = detect(first_pages)
    else:
        df_pre_full.at[index, 'language'] = 'na'

In [None]:
df_pre_full.language.value_counts()

In [None]:
df_pre_full.head(20)

#### Notes:
After inspection and the titles search, the language of some documents was found to differ from the detected one.
The following indexes were manually changed to 'en':
909, 920, 1045 and 1105
<br>
<br>

In [None]:
# Manual override: these indexes were set to English after inspection.
for idx_en in (909, 920, 1045, 1105):
    df_pre_full.at[idx_en, 'language'] = 'en'

In [None]:
# English subset (independent copy of the selected rows).
is_english = df_pre_full['language'] == 'en'
df_pre_en = df_pre_full.loc[is_english].copy()

In [None]:
df_pre_en.shape

####  Manually removed:
<br>index 56 - 'BH-T1059_Approval Document - BH-T1059.pdf'
<br>index 65 - 'BH-T1074_Approval Document-BH-T1074 _05Sep2019-104741.pdf'
<br> and indexes: [947, 948, 949, 1311] related to hurricane assistance

In [None]:
# Manually remove index 56 ('BH-T1059_Approval Document - BH-T1059.pdf').
df_pre_en.drop([56], inplace=True)

In [None]:
# Manually remove index 65 ('BH-T1074_Approval Document-BH-T1074 _05Sep2019-104741.pdf').
df_pre_en.drop([65], inplace=True)

In [None]:
# Manually remove the documents related to hurricane assistance, one label at a time.
for i in (947, 948, 949, 1311):
    df_pre_en.drop(index=[i], inplace=True)

In [None]:
df_pre_en.shape

### Titles search

##### 'Objectives and Justification'

In [None]:
# Regex for the opening section title ('Objectives and Justification' and the variants
# listed). Layout: optional leading newlines/blanks, a section-number token built from
# the characters in [2IV31l], an optional dot, one of the title variants, then optional
# trailing blanks and newline. Upper- and lower-case variants are spelled out
# explicitly; the English search cells call re.search without re.IGNORECASE.
pattern_en_1 = r'\n?\s?\n?\s?[2IV31l]+\.?\s{0,}(Objectives?\s+and\s+(J|j)ustification( of the TC)?|OBJECTIVES? AND JUSTIFICATION( OF THE TC)?|JUSTIFICATION AND OBJECTIVE|(TC|TECHNICAL COOPERATION) OBJECTIVES AND RATIONALE|Justification and Objectives of the TC|Description of the Associated Loan|JUSTIFICATION|Background\, Objectives and Justification of the TC)\s{0,}\n?'

In [None]:
# Dry run for the opening title: for each English document, report the first page
# (from the doc_identifier page onwards) where pattern_en_1 matches (case-sensitive).
# Documents without any match are collected in index_en_to_check.
index_en_to_check = []

for index in df_pre_en.index:
    print()
    print('Processsing index:', str(index))

    page_base = df_pre_en['doc_identifier'][index][1]  # starting page
    pages = df_pre_en['Document_Content'][index]

    for i in range(page_base, len(pages)):
        if re.search(pattern_en_1, pages[i]) is None:
            continue
        print('* pattern found at document page:', str(i))
        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)

In [None]:
########

##### 'Description of activities/components and budget | Description of Activities and Outputs' 

In [None]:
### pattern_2

In [None]:
# Regex for the intermediate section title ('Description of activities/components and
# budget', 'Description of Activities and Outputs' and the variants listed).
# Layout: optional leading newlines/blanks, a section-number token built from the
# characters in [IV\.\·\,ll3] followed by at least one blank, one of the title variants
# (all enclosed in a single group), then optional trailing blanks and newline.
pattern_en_2 = r'\n?\s?\n?\s?[IV\.\·\,ll3]+\s+(Description of (the )?[aA]ctivities\/[cC]omponents?( and [bB]udget)?|Description of components and budget|DESCRIPTION OF COMPONENTS AND BUDGET|DESCRIPTION OF ACTIVITIES\/COMPONENTS AND BUDGET|Description of Activities and Budget|Description of activities\, components and budget|Description of activity/component and budget|DESCRIPTION OF ACTIVITIES( AND OUTPUTS)?|Description of (A|a)ctivities and (O|o)utputs|Description of components and activities|Description of activities \/ components and budget|Description of activities\/components|Description of components\/activities and budget|Description of Activities\/ Components and Budget|Activity and Results Description|Description of Components and Activities|Description of activities and results|Description of activities\, outputs and budget|Description of Activities \/ component and budget|Description of Components\, Activities and Budget|Description of Activities \/ Components and Budget|Description of activities\/ components and budget|Description of Activities\/Outputs and Budget)\s{0,}\n?'

In [None]:
# Dry run for the intermediate title: for each English document, report the first page
# (from the doc_identifier page onwards) where pattern_en_2 matches (case-sensitive).
# Unmatched documents go to index_en_to_check; matches on page 0 go to results_in_page_0.
index_en_to_check = []
results_in_page_0 = []

for index in df_pre_en.index:
    print()
    print('Processsing index:', str(index))

    page_base = df_pre_en['doc_identifier'][index][1]  # starting page
    pages = df_pre_en['Document_Content'][index]

    for i in range(page_base, len(pages)):
        if re.search(pattern_en_2, pages[i]) is None:
            continue
        print('* pattern found at document page:', str(i))
        if i == 0:
            results_in_page_0.append(index)
        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)
print('Results in page 0:', results_in_page_0)

####

##### 'Executing agency and execution structure'

In [None]:
## Replace a specific title so that the closing-title pattern can match:
# index 121, page index 6
lista_aux = df_pre_en['Document_Content'][121]
original_page = lista_aux[6]
lista_aux[6] = original_page.replace('V. DESCRIPTION OF ACTIVITIES/COMPONENTS AND BUDGET', 'V. EXECUTING AGENCY')
df_pre_en.at[121, 'Document_Content'] = lista_aux

In [None]:
## Replace a specific title so that the closing-title pattern can match:
# index 1164, page index 6
lista_aux = df_pre_en['Document_Content'][1164]
original_page = lista_aux[6]
lista_aux[6] = original_page.replace('IV. Budget', 'IV. EXECUTING AGENCY')
df_pre_en.at[1164, 'Document_Content'] = lista_aux

In [None]:
# Regex for the closing section title ('Executing agency and execution structure' and
# the variants listed). Layout: optional leading newlines/blanks, a section-number
# token built from the characters in [IV\.54], optional blanks, an optional '4.1'
# sub-section number, one of the title variants, then optional trailing blanks and
# newline. 'EXECUTING AGENCY' alone is accepted as well.
pattern_en_3 = r'\n?\s?\n?\s?[IV\.54]+\s{0,}(4\.1\s+)?(Executing [Aa]gency and [Ee]xecution [Ss]tructure|EXECUTING AGENCY( AND EXECUTION STRUCTURE)?|Executing agency and execution|Executing Agency \(EA\) and execution structure|Executing Agency and Executing Structure|Executing agency and execution structure|EA AND EXECUTION STRUCTURE)\s{0,}\n?'

In [None]:
# Dry run for the closing title: for each English document, report the first page
# (from the doc_identifier page onwards) where pattern_en_3 matches (case-sensitive).
# Documents without any match are collected in index_en_to_check.
index_en_to_check = []

for index in df_pre_en.index:
    print()
    print('Processsing index:', str(index))

    page_base = df_pre_en['doc_identifier'][index][1]  # starting page
    pages = df_pre_en['Document_Content'][index]

    for i in range(page_base, len(pages)):
        if re.search(pattern_en_3, pages[i]) is None:
            continue
        print('* pattern found at document page:', str(i))
        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)

### Titles results

In [None]:
# Result columns, initialised to ''; the cells below store a
# (matched_title, page_number) tuple in each of them.
for title_col in ('title_inicial', 'title_medio', 'title_final'):
    df_pre_en[title_col] = ''

###### title inicial

In [None]:
# Locate the opening title in every English document and store it in 'title_inicial'
# as a (matched_text, page_number) tuple. All pages are scanned, starting at page 0;
# matching is case-sensitive. Documents without a match keep '' and are listed in
# index_en_to_check.
index_en_to_check = []
for index in df_pre_en.index:
    print()
    print('Processsing index:', str(index))

    pages = df_pre_en['Document_Content'][index]
    for i, page_text in enumerate(pages):
        match = re.search(pattern_en_1, page_text)  # searched once, reused below
        if match is None:
            continue

        print('* pattern found at document page:', str(i))

        ## storing:
        inicial_match_title = match.group()
        inicial_match_page = i
        df_pre_en.at[index, 'title_inicial'] = (inicial_match_title, inicial_match_page)

        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)


In [None]:
df_pre_en.head()

In [None]:
df_pre_en.title_inicial[22]

###### title medio

In [None]:
# Locate the intermediate title in every English document, searching from the page of
# the opening title onwards (case-sensitive), and store it in 'title_medio' as a
# (matched_text, page_number) tuple. Documents without a match are listed in
# index_en_to_check.
index_en_to_check = []

for index in df_pre_en.index:
    print()
    print('Processsing index:', str(index))

    previous_title = df_pre_en['title_inicial'][index]
    if not previous_title:
        # 'title_inicial' still holds the '' placeholder: indexing it would raise
        # IndexError and abort the whole loop, so flag the document and move on.
        print('check regex on:', str(index), '(title_inicial missing)')
        index_en_to_check.append(index)
        continue
    page_base = previous_title[1]  # starting page

    pages = df_pre_en['Document_Content'][index]
    for i in range(page_base, len(pages)):
        match = re.search(pattern_en_2, pages[i])  # searched once, reused below
        if match is None:
            continue
        print('* pattern found at document page:', str(i))

        ## storing:
        medio_match_title = match.group()
        medio_match_page = i
        df_pre_en.at[index, 'title_medio'] = (medio_match_title, medio_match_page)

        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)

###### title final

In [None]:
# Locate the closing title in every English document, searching from the page of the
# intermediate title onwards (case-sensitive), and store it in 'title_final' as a
# (matched_text, page_number) tuple. Documents without a match are listed in
# index_en_to_check.
index_en_to_check = []

for index in df_pre_en.index:
    print()
    print('Processsing index:', str(index))

    previous_title = df_pre_en['title_medio'][index]
    if not previous_title:
        # 'title_medio' still holds the '' placeholder: indexing it would raise
        # IndexError and abort the whole loop, so flag the document and move on.
        print('check regex on:', str(index), '(title_medio missing)')
        index_en_to_check.append(index)
        continue
    page_base = previous_title[1]  # starting page

    pages = df_pre_en['Document_Content'][index]
    for i in range(page_base, len(pages)):
        match = re.search(pattern_en_3, pages[i])  # searched once, reused below
        if match is None:
            continue
        print('* pattern found at document page:', str(i))

        ## storing:
        final_match_title = match.group()
        final_match_page = i
        df_pre_en.at[index, 'title_final'] = (final_match_title, final_match_page)

        print('-----------------    -----------------')
        break
    else:
        # the page loop ended without a break: no page matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)

In [None]:
df_pre_en.head()

#### check for crossed titles

In [None]:
# Sanity check on the page order of the three titles of each English document.
# Strictly increasing pages are reported as OK; a middle title located before the
# first one is reported separately; anything else (including titles sharing a page)
# is collected in other_case.
other_case = []
for index in df_pre_en.index:
    t_ini = df_pre_en.title_inicial[index]
    t_mid = df_pre_en.title_medio[index]
    t_end = df_pre_en.title_final[index]
    p_ini, p_mid, p_end = t_ini[1], t_mid[1], t_end[1]

    if p_ini < p_mid < p_end:
        print('Sequence OK for index:', str(index))
    elif p_end > p_ini > p_mid:
        print('middle title before the first title on index:', str(index))
    else:
        print('other case on:', str(index))
        other_case.append(index)

    # alert on cases where the extension between titles is greater than 10 pages
    if (p_end - p_ini) > 10:
        print('File to check due to extension between titles:', df_pre_en['Document_Name'][index])
        print((t_ini[0], t_ini[1]), (t_mid[0], t_mid[1]), (t_end[0], t_end[1]))
        print()

In [None]:
other_case

In [None]:
# Manual inspection of index 81: (title, page) pairs for the three titles.
index = 81
#max([df_pre_en.title_inicial[index][1],df_pre_en.title_medio[index][1],df_pre_en.title_final[index][1]])
print((df_pre_en.title_inicial[index][0], df_pre_en.title_inicial[index][1]), (df_pre_en.title_medio[index][0], df_pre_en.title_medio[index][1]), (df_pre_en.title_final[index][0], df_pre_en.title_final[index][1]))

In [None]:
# Manual inspection of index 1294: (title, page) pairs for the three titles.
index = 1294
#max([df_pre_en.title_inicial[index][1],df_pre_en.title_medio[index][1],df_pre_en.title_final[index][1]])
print((df_pre_en.title_inicial[index][0], df_pre_en.title_inicial[index][1]), (df_pre_en.title_medio[index][0], df_pre_en.title_medio[index][1]), (df_pre_en.title_final[index][0], df_pre_en.title_final[index][1]))

#### footer and header clean-up

In [None]:
df_pre_en.shape

In [None]:
# Column that will store the extracted section text of each English document
# (initialised to '' and filled in by the clean-up routine).
df_pre_en['extracted'] = ''

In [None]:
# Clean-up routine (v1.0)
# For each English document: join the pages located between the initial and
# the final title (page header and footnote area removed), blank out URLs and
# keep only the text from the opening section title up to the
# 'Indicative Budget' heading (or, failing that, the pattern_en_3 title).
# The result is written to the 'extracted' column.

# Footnote detectors, tried in this order; a page is cut at the position where
# the first one matches. Each entry is (regex, message printed on a match).
footnote_rules_en = (
    (r'\s{30,}\d{1,2}\s+[A-Z]',
     '* Footnote pattern 1: \'30+ blanks + digit\' at:'),
    (r'\n\n\n\d\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|May|Jun|Jul|Ago|Aug|Sep|Set|Oct|Nov|Dic|IDB|months|Budget|Development)([A-Z\¿]|http)',
     '* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:'),
    (r'\n+\xa0+\n\d',
     '* Footnote 3: \'xa0 type\' at:'),
)

for index, row in df_pre_en.iterrows():
    page_ini = df_pre_en.title_inicial[index][1]
    page_fin = df_pre_en.title_final[index][1]

    print('### Processing index: ', str(index), ' - page range:', str(page_ini),str(page_fin))
    texto = ''
    for j in range(page_ini, page_fin + 1):
        page = df_pre_en['Document_Content'][index][j]

        # header cleanup: drop a leading '- N -' page-number marker
        page = re.sub(r'^\s?\-\s{0,3}\d\d?\s{0,3}\-', '', page)

        # check for footnote and remove: keep only the text before the first
        # footnote marker that is detected (each regex is searched once)
        for footnote_regex, footnote_msg in footnote_rules_en:
            footnote = re.search(footnote_regex, page)
            if footnote is not None:
                print(footnote_msg, str(j))
                page = page[:footnote.span()[0]]
                break

        texto = texto + page

    # blank out URLs
    texto = re.sub(r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;]+', ' ', texto)

    # cutting sections based on titles
    # re.search returns None when nothing matches, so the match object itself
    # is tested; calling .span() on it first would raise before the fallback
    # below could ever run.
    start_match = re.search(pattern_en_1, texto)
    if start_match is None:
        # alternatively: look for the stored title (minus its last character)
        # as literal text, hence re.escape
        start_match = re.search(re.escape(df_pre_en['title_inicial'][index][0][:-1]), texto)
    ini = start_match.span()[0]

    # the section ends at 'Indicative Budget' or, as border condition, at the
    # pattern_en_3 title
    end_match = re.search(r'Indicative (B|b)udget', texto)
    if end_match is None:
        end_match = re.search(pattern_en_3, texto)
    fin = end_match.span()[0]

    texto = texto[ini:fin].strip()

    # store extracted content in dataframe
    df_pre_en.at[index, 'extracted'] = texto

    del texto

    print()
    print()
    print('#-#-#-#')
    print()

In [None]:
# (store results as v0.8)

#### supra-indexes removal

In [None]:
#print(df_pre_en['extracted'][177])

In [None]:
# for cleaned content storing:
df_pre_en['extracted_cleaned'] = ''

In [None]:
# Superscript (footnote reference) removal: a word that ends in letters or
# closing punctuation followed by one or two digits (and an optional dot) has
# all of its digits stripped; every other word is kept unchanged.
for index, row in df_pre_en.iterrows():
    texto = df_pre_en['extracted'][index].split()
    resultado = []
    for word in texto:
        if re.search(r'[A-Za-záéíóú\-\)\”]+\d{1,2}\.?$', word):
            word = "".join(ch for ch in word if not ch.isdigit())
        resultado.append(word)
    res_clean = ' '.join(resultado)
    df_pre_en.at[index, 'extracted_cleaned'] = res_clean

In [None]:
df_pre_en.head()

In [None]:
df_pre_en.extracted_cleaned[29]

In [None]:
df_pre_en[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name',
       'Document_Status', 'blank_pages', 'page_count',
       'doc_identifier', 'title_inicial',
       'title_medio', 'title_final', 'extracted', 'extracted_cleaned']].to_excel('TCs_Approval-Docs_EN_Processing_2020-08-23_v09.xlsx')

In [None]:
#######################

## v2.0: Spanish language TCs processing

In [None]:
import joblib

In [None]:
## Load joblib from v0.7
# Load source file:
df_pre_es = joblib.load('./output/df_pre_es_2020-07-14_v07.joblib.bz2')

In [None]:
df_pre_es.head(30)

In [None]:
df_pre_es['Document_Content'][171][3]

In [None]:
# Manual adjustment of the page list of the document at index 171:
aux = df_pre_es['Document_Content'][171]
# Removal of the index page.
# NOTE(review): the slice below keeps elements 0-1 and 4 onwards, i.e. it
# drops TWO elements (positions 2 and 3), while the original note mentioned
# only the 3rd element - confirm that both pages are meant to be removed.
aux = aux[:2] + aux[4:]
# write the shortened page list back into the dataframe
df_pre_es.at[171, 'Document_Content'] = aux

In [None]:
# Manually change of initial title for document on 171: 
lista_aux = df_pre_es['Document_Content'][171]
lista_aux[3] = df_pre_es['Document_Content'][171][3].replace('II. OBJETIVOS Y JUSTIFICACIÓN DE LA OPERACIÓN DE COOPERACIÓN TÉCNICA  ', 'II. Objetivos y justificación de la Cooperación Técnica')
df_pre_es.at[171, 'Document_Content'] = lista_aux

In [None]:
df_pre_es['Document_Content'][171][2:5]

In [None]:
df_pre_es.at[171, 'doc_identifier'] = ('DOCUMENTO DE COOPERACIÓN TÉCNICA', 2)

##### 'Objetivos y justificación' - v2.0

In [None]:
#v2.0
# Regular expressions for the opening section title ('Objectives and
# Justification' and its variants). Both have the same shape: optional
# newlines/blanks, a section number built from the characters in the
# bracketed class, an optional dot, one of the listed title variants, and
# optional trailing blanks/newline.
# Variants such as 'Justificaci6n' are presumably text-extraction artefacts
# found in the source documents - TODO confirm.
# Previous version of the Spanish pattern, kept for reference:
#pattern_es_1 = r'\n?\s?\n?\s?[2IV31lI]+\.?\s{0,}(Objetivos? y Justificación((\s?de (la\s)?(CT\:?|Cooperación Técnica|TC)\.?)|(\sdel Proyecto)|\:| de la Cooperación Técnica \(CT\))?|Objetivos y justificación de la CT|Objetivos y justificación de la Cooperación Técnica|Justificación y Objetivos de la CT|Justificación y Objetivo|Problema\, Objetivos y Justificación de la CT\.?|OBJETIVOS Y JUSTIFICACIÓN DE LA OPERACIÓN DE COOPERACIÓN TÉCNICA|OBJETIVOS Y JUSTIFICACIÓN|Justificaci6n y objetivo|Objetivos y Justificación de la CT \(estimado\: 1 página\)|DESCRIPCIÓN DEL PRÉSTAMO\/GARANTÍA ASOCIADO)\s{0,}\n?'
# Spanish titles:
pattern_es_1 = r'\n?\s?\n?\s?[2IV31lI]+\.?\s{0,}(Objetivos? y Justificación((\s?de (la\s)?(CT\:?|Cooperación Técnica|TC)\.?)|(\sdel Proyecto)|\:| de la Cooperación Técnica \(CT\))?|Objetivos y justificación de la CT|Objetivos y justificación de la Cooperación Técnica|Justificación y Objetivos de la CT|Justificación y Objetivo|Problema\, Objetivos y Justificación de la CT\.?|OBJETIVO Y JUSTIFICACIÓN DE LA CT|OBJETIVOS Y JUSTIFICACIÓN DE LA OPERACIÓN DE COOPERACIÓN TÉCNICA|JUSTIFICACIÓN Y OBJETIVOS DE LA CT|OBJETIVOS Y JUSTIFICACIÓN|Justificaci6n y objetivo|Objetivos y Justificación de la CT \(estimado\: 1 página\)|Objetivo y justificación|Objetivos y justificación|DESCRIPCIÓN DEL PRÉSTAMO\/GARANTÍA ASOCIADO|Descripción del préstamo\/garantía asociado)\s{0,}\n?'
# English titles:
pattern_en_1 = r'\n?\s?\n?\s?[2IV31lI]+\.?\s{0,}(Objectives?\s+and\s+(J|j)ustification( of the TC)?|OBJECTIVES? AND JUSTIFICATION( OF THE TC)?|JUSTIFICATION AND OBJECTIVE|(TC|TECHNICAL COOPERATION) OBJECTIVES AND RATIONALE|Justification and Objectives of the TC|Description of the Associated Loan|JUSTIFICATION|Background\, Objectives and Justification of the TC|OBJECTIVE AND RATIONALE OF THE TC|OBJECTIVES AND RATIONALE OF THE TECHNICAL COOPERATION OPERATION)\s{0,}\n?'

In [None]:
# Spanish documents: report, per document, the first page (from the
# doc_identifier page onwards) on which pattern_es_1 matches; documents
# without any match are collected in index_es_to_check.
index_es_to_check = []

for index, row in df_pre_es.iterrows():
    print()
    print('Processsing index:', str(index))

    page_base = df_pre_es['doc_identifier'][index][1]  # starting page
    doc_pages = df_pre_es['Document_Content'][index]

    for i in range(page_base, len(doc_pages)):
        if re.search(pattern_es_1, doc_pages[i]) is None:
            continue
        # pattern found: report and stop scanning this document
        print('* pattern found at document page:', str(i))
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)

In [None]:
index_es_to_check

##### 'Agencia Ejecutora y estructura de ejecución' - v2.0

In [None]:
# Section title searched for below: 'Agencia Ejecutora y Estructura de Ejecución'
# (executing agency and execution structure). Kept as a comment: as bare text
# this cell is not valid Python and fails with a SyntaxError when run.

In [None]:
#v2.0
# Regular expressions for the closing section title ('Executing Agency and
# Execution Structure' and its variants). Both start with optional
# newlines/blanks and a section number built from the characters in the
# bracketed class, followed by one of the listed title variants and optional
# trailing blanks/newline.
# Spanish titles:
pattern_es_3 = r'\n?\s?\n?\s?[IV\.5]+\s{0,}(((4\.1\s+)?Agencia Ejecutora(\s+\(AE\))?|Organismo [Ee]jecutor|Unidad Ejecutora|Entidad Ejecutora) y [Ee]structura de [Ee]jecución|Estructura del Organismo Ejecutor\s?\(?O?E?\)?|Organismo de Ejecución y Estructura de Implementación|(Agencia|Unidad) [Ee]jecutora y [Ee]structura de [Ee]jecución|ORGANISMO EJECUTOR Y ESTRUCTURA DE IMPLEMENTACIÓN|AGENCIA EJECUTORA Y ESTRUCTURA DE EJECUCIÓN|ORGANISMO EJECUTOR Y ESTRUCTURA DE EJECUCIÓN|Agencia ejecutora y justificación de la estructura de ejecución|Organismo Ejecutor|Estructura de ejecución|Agencia Ejecutora|Mecanismo de Ejecución)\s{0,}\n?'
# English titles (an optional '4.1' sub-section number may precede the title):
pattern_en_3 = r'\n?\s?\n?\s?[IV\.54]+\s{0,}(4\.1\s+)?(Executing [Aa]gency and [Ee]xecution [Ss]tructure|EXECUTING AGENCY( AND EXECUTION STRUCTURE)?|Executing agency and execution|Executing Agency \(EA\) and execution structure|Executing Agency and Executing Structure|Executing agency and execution structure|EA AND EXECUTION STRUCTURE)\s{0,}\n?'

In [None]:
# Rows flagged as Spanish only: report the first page (from the
# doc_identifier page onwards) on which pattern_es_3 matches; documents
# without any match are collected in index_es_to_check.
index_es_to_check = []

spanish_rows = df_pre_es[df_pre_es.language == 'es']
for index, row in spanish_rows.iterrows():
    print()
    print('Processsing index:', str(index))

    page_base = df_pre_es['doc_identifier'][index][1]  # starting page
    doc_pages = df_pre_es['Document_Content'][index]

    pattern_found = False
    for i in range(page_base, len(doc_pages)):
        if re.search(pattern_es_3, doc_pages[i]) is not None:
            print('* pattern found at document page:', str(i))
            print('-----------------    -----------------')
            pattern_found = True
            break

    if pattern_found:
        continue
    print('check regex on:', str(index))
    index_es_to_check.append(index)
print('Index to check', index_es_to_check)

### Re-doing title_inicial and title_final for the Spanish docs

###### title inicial - v2.0

In [None]:
# Spanish:
# Locate the first page of every document on which pattern_es_1 matches and
# store (matched title text, page number) in the 'title_inicial' column.
# Documents without a match are collected in index_es_to_check.
index_es_to_check = []
for index, row in df_pre_es.iterrows():
    print()
    print('Processsing index:', str(index))

    # NOTE(review): page_base is read but not used - the scan below starts at
    # page 0, not at the doc_identifier page. Confirm this is intended.
    page_base = df_pre_es['doc_identifier'][index][1]

    for i, page_text in enumerate(df_pre_es['Document_Content'][index]):
        title_match = re.search(pattern_es_1, page_text)
        if title_match is None:
            continue

        print('* pattern found at document page:', str(i))

        # storing the matched text together with its page number
        inicial_match_title = title_match.group()
        inicial_match_page = i
        df_pre_es.at[index, 'title_inicial'] = (inicial_match_title, inicial_match_page)

        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)


###### title final - v2.0

In [None]:
# Spanish:
# For the rows flagged as Spanish, locate the first page after the initial
# title on which pattern_es_3 matches and store (matched title text, page
# number) in the 'title_final' column. Documents without a match are
# collected in index_es_to_check.
index_es_to_check = []

for index, row in df_pre_es[df_pre_es.language == 'es'].iterrows():
    print()
    print('Processsing index:', str(index))

    # the scan starts on the page right after the one holding the first title
    page_base = df_pre_es['title_inicial'][index][1] + 1
    doc_pages = df_pre_es['Document_Content'][index]

    for i in range(page_base, len(doc_pages)):
        title_match = re.search(pattern_es_3, doc_pages[i])
        if title_match is None:
            continue

        print('* pattern found at document page:', str(i))

        # storing the matched text together with its page number
        final_match_title = title_match.group()
        final_match_page = i
        df_pre_es.at[index, 'title_final'] = (final_match_title, final_match_page)

        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)

In [None]:
# Report the documents whose initial and final titles are less than two
# pages apart.
for index, row in df_pre_es.iterrows():
    page_ini = df_pre_es.title_inicial[index][1]
    page_fin = df_pre_es.title_final[index][1]
    page_gap = page_fin - page_ini
    if page_gap >= 2:
        continue
    print('Alert index', str(index), "---", "Lenght:", page_gap, df_pre_es['Document_Name'][index])
    print("        ", str(index), page_gap, df_pre_es['title_inicial'][index], df_pre_es['title_final'][index])
    print()

In [None]:
df_pre_es['Document_Name'][96]

In [None]:
df_pre_es['Document_Name'][1]

In [None]:
# test: build, for a single document, the list of page texts delimited by the
# initial and the final title, and print it.
index = 223
lista_pages = []
page_ini = df_pre_es.title_inicial[index][1]
page_fin = df_pre_es.title_final[index][1]
print(page_fin - page_ini)

doc_pages = df_pre_es['Document_Content'][index]
title_ini_text = df_pre_es['title_inicial'][index][0]
title_fin_text = df_pre_es['title_final'][index][0]

# First page: keep the text from the initial title onwards. The stored title
# is literal text taken from the page, so it is escaped before being used as
# a regex (a title containing e.g. parentheses would otherwise not match).
first_page = doc_pages[page_ini]
lista_pages.append(first_page[re.search(re.escape(title_ini_text), first_page).span()[0]:])

# Pages strictly between the two titles. The upper bound is page_fin (not
# page_fin-1) so that the page right before the final title is included;
# the range is empty when the titles are fewer than two pages apart.
for j in range(page_ini + 1, page_fin):
    lista_pages.append(doc_pages[j])

# Last page: keep the text up to and including the final title.
last_page = doc_pages[page_fin]
lista_pages.append(last_page[:last_page.find(title_fin_text) + len(title_fin_text)])

print('length: ', str(len(lista_pages)))
for page_text in lista_pages:
    print(page_text)

In [None]:
re.compile(df_pre_es['title_final'][index][0])

In [None]:
re.search(re.compile(df_pre_es['title_final'][index][0]), df_pre_es['Document_Content'][index][6])

In [None]:
df_pre_es['Document_Content'][index][6].find(df_pre_es['title_final'][index][0])+len(df_pre_es['title_final'][index][0])

In [None]:
df_pre_es['Document_Content'][index][6][:df_pre_es['Document_Content'][index][6].find(df_pre_es['title_final'][index][0])+len(df_pre_es['title_final'][index][0])]

In [None]:
df_pre_es['Document_Content'][index][6][:351]

In [None]:
print("        ", str(index), page_fin - page_ini, df_pre_es['title_inicial'][index], df_pre_es['title_final'][index])

#### Generate the list of pages, delimited by title_inicial and title_final

In [None]:
df_pre_es['lista_paginas'] = ''

In [None]:
# Build, for every document, the list of page texts delimited by the initial
# and the final title and store it in the 'lista_paginas' column:
#   - first element: the initial-title page, from the title onwards
#   - middle elements: the full pages strictly between both titles
#   - last element: the final-title page, up to and including the title
for index, row in df_pre_es.iterrows():
    print('processing index', str(index))
    lista_pages = []
    page_ini = df_pre_es.title_inicial[index][1]
    page_fin = df_pre_es.title_final[index][1]

    doc_pages = df_pre_es['Document_Content'][index]
    title_ini_text = df_pre_es['title_inicial'][index][0]
    title_fin_text = df_pre_es['title_final'][index][0]

    # The stored title is literal text taken from the page, so it is escaped
    # before being used as a regex (a title containing e.g. parentheses would
    # otherwise not match and the lookup would fail).
    first_page = doc_pages[page_ini]
    lista_pages.append(first_page[re.search(re.escape(title_ini_text), first_page).span()[0]:])

    # Empty range when the titles are fewer than two pages apart, so no
    # separate branch is needed for that case.
    # NOTE(review): when both titles are on the same page, that page
    # contributes two overlapping slices - confirm this is acceptable.
    for j in range(page_ini + 1, page_fin):
        lista_pages.append(doc_pages[j])

    last_page = doc_pages[page_fin]
    lista_pages.append(last_page[:last_page.find(title_fin_text) + len(title_fin_text)])

    df_pre_es.at[index, 'lista_paginas'] = lista_pages
    del lista_pages
    del page_ini
    del page_fin


In [None]:
#print(df_pre_es.title_inicial[index], df_pre_es.title_final[index])
#print(df_pre_es['lista_paginas'][index])
#page_ini = df_pre_es.title_inicial[index][1]
#page_fin = df_pre_es.title_final[index][1]
#print(str(page_fin - page_ini))

In [None]:
#df_pre_es['Document_Content'][index][3:9]

In [None]:
# Debug run of the page clean-up on a single document: prints the stored page
# list, strips header and footnote area from every page, joins the pages,
# blanks out URLs and prints the resulting text. Nothing is stored.
index = 1355
#index = 1
for index in [index]:
    print(df_pre_es['lista_paginas'][index])
    longitud = len(df_pre_es['lista_paginas'][index])
    print('### Processing index: ', str(index), ' - page range:', str(longitud))
    texto = ''
    for j in range(0,longitud):

        page = df_pre_es['lista_paginas'][index][j]
        
        # header cleanup: a leading '- N -' page marker (or a dash followed by
        # 5 to 9 blanks) is replaced by ' \n '
        page = re.sub(r'(^\s?\-\s{0,3}[1-9]\d?\s{0,3}\-|^\-\s{5,9})', ' \n ', page)
        
        # check for footnote and remove: the page is cut where the first
        # detected footnote marker starts, and ' \n ' is appended
        if re.search(r'\s{30,}\d{1,2}\s+([A-Z]|http)', page) != None:    # 1st type of footnote found!
            print('* Footnote pattern 1: \'30+ blanks + digit\' at:', str(j))
            # cut footnote area:
            page_clean = page[:re.search(r'\s{30,}\d{1,2}\s+([A-Z]|http)', page).span()[0]]
            texto = texto + ''.join(page_clean) + ' \n '
                       
        # 2nd type: three newlines + 1-2 digit number + capitalised word or
        # URL; the negative lookahead lists words that must not follow the
        # number for it to count as a footnote
        elif re.search(r'\n\n\n[1-9]\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|Mar|May|Jun|Jul|Ago|Aug|Sep|Set|Oct|Nov|Dic|IDB|months|Budget|Development)([A-Z\¿]|http)', page) != None: #  2nd type of footnote found!
            print('* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:', str(j))
            # cut footnote area:
            page_clean = page[:re.search(r'\n\n\n[1-9]\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|Mar|May|Jun|Jul|Ago|Aug|Sep|Set|Oct|Nov|Dic|IDB|months|Budget|Development)([A-Z\¿]|http)', page).span()[0]]
            texto = texto + ''.join(page_clean) + ' \n '

        # 3rd type: newline(s), a run of non-breaking spaces, newline, digit
        elif re.search(r'\n+\xa0+\n\d', page) != None: # 3rd type of footnote found!
            print('* Footnote 3: \'xa0 type\' at:', str(j))
            #  cut footnote area:
            page_clean = page[:re.search(r'\n+\xa0+\n\d', page).span()[0]]
            texto = texto + ''.join(page_clean) + ' \n '

        # no footnote detected: the whole page is kept, followed by one blank
        else: 
            texto = texto + ''.join(page) + ' '
            
    # blank out URLs
    texto = re.sub(r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;\,]+', ' ', texto)
    
    
    print(texto)

### text_extraction and clean-up routine

In [None]:
df_pre_es['extracted_v2'] = ''

In [None]:
# New text_extraction and clean-up routine (v2.2 - 08/30/2020)
# Works on the pre-cut page list in 'lista_paginas': header and footnote area
# are removed from every page, the pages are joined, URLs are blanked out and
# the stripped text is stored in 'extracted_v2'.

# Footnote detectors, tried in this order; a page is cut where the first one
# matches. Each entry is (regex, message printed on a match).
footnote_rules_v2 = (
    (r'\s{30,}\d{1,2}\s+([A-Z]|http)',
     '* Footnote pattern 1: \'30+ blanks + digit\' at:'),
    (r'\n\n\n[1-9]\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|Mar|May|Jun|Jul|Ago|Aug|Sep|Set|Oct|Nov|Dic|IDB|months|Budget|Development)([A-Z\¿]|http)',
     '* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:'),
    (r'\n+\xa0+\n\d',
     '* Footnote 3: \'xa0 type\' at:'),
)

for index, row in df_pre_es.iterrows():
    section_pages = df_pre_es['lista_paginas'][index]
    longitud = len(section_pages)
    print('### Processing index: ', str(index), ' - page range:', str(longitud))

    cleaned_parts = []
    for j in range(0, longitud):
        # header cleanup:
        page = re.sub(r'(^\s?\-\s{0,3}[1-9]\d?\s{0,3}\-|^\-\s{5,9})', ' \n ', section_pages[j])

        # check for footnote and remove:
        for footnote_regex, footnote_msg in footnote_rules_v2:
            footnote = re.search(footnote_regex, page)
            if footnote is not None:
                print(footnote_msg, str(j))
                # cut footnote area:
                cleaned_parts.append(page[:footnote.span()[0]] + ' \n ')
                break
        else:
            # no footnote on this page: keep it whole
            cleaned_parts.append(page + ' ')

    texto = ''.join(cleaned_parts)
    texto = re.sub(r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;\,]+', ' ', texto)

    df_pre_es.at[index, 'extracted_v2'] = texto.strip()

    del texto

    print()
    print()
    print('#-#-#-#')
    print()

In [None]:
df_pre_es.head()

In [None]:
# Compare both extraction versions: alert on (and count) every document whose
# 'extracted' text is not shorter than its 'extracted_v2' text.
count = 0
for index, row in df_pre_es.iterrows():
    len_v1 = len(df_pre_es['extracted'][index])
    len_v2 = len(df_pre_es['extracted_v2'][index])
    if len_v1 >= len_v2:
        print("!!!! Alert on", str(index), str(len_v1 - len_v2))
        count += 1
print(str(count))

In [None]:
len('''II. Descripción del Préstamo/Garantía Asociado 
2.1 La cooperación técnica dará apoyo operativo a los Programas de Innovación 


Empresarial y Emprendimiento I (UR-L1142) y II (UR-L1158), los cuales son 
ejecutados por la Agencia Nacional de Investigación e Innovación (ANII) y financiados 
por el Banco Interamericano de Desarrollo (BID). Estos programas forman parte de la 
Línea de Crédito Condicional para Proyectos de Inversión (CCLIP) aprobada en 2017 
por un monto de US$100 millones (UR-O1153). El objetivo de la CCLIP es 
incrementar la productividad de las empresas mediante una mayor inversión en 
conocimiento, recursos humanos, innovación y emprendimiento. En particular, la 
cooperación técnica, apoyará los procesos de monitoreo y evaluación en los 
mencionados programas, así como también contribuirá a mejorar sus niveles de 
trasparencia, mediante una difusión automática, a través del desarrollo de un portal 
que pondrá a disposición de los ciudadanos datos abiertos sobre beneficiarios 
atendidos y los apoyos realizados.''')

In [None]:
print(df_pre_es['extracted'][1354])

In [None]:
print(df_pre_es['extracted_v2'][1354])

In [None]:
df_pre_es['extracted_final'] = ''

In [None]:
# Clean-up routine (v2.01 - spanish - 2020-08-27)
# For each document: join the pages located between the initial and the final
# title (page header and footnote area removed), blank out URLs and keep only
# the text from the opening section title up to the 'Indicative Budget' /
# 'Presupuesto Indicativo' heading (or, failing that, the executing-agency
# title). The title patterns are chosen by the row's 'language' value.
# The result is written to the 'extracted_final' column.

# Footnote detectors, tried in this order; a page is cut at the position where
# the first one matches. Each entry is (regex, message printed on a match).
footnote_rules_es = (
    (r'\s{30,}\d{1,2}\s+[A-Z]',
     '* Footnote pattern 1: \'30+ blanks + digit\' at:'),
    (r'\n\n\n[1-9]\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|Mar|May|Jun|Jul|Ago|Aug|Sep|Set|Oct|Nov|Dic|IDB|months|Budget|Development)([A-Z\¿]|http)',
     '* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:'),
    (r'\n+\xa0+\n\d',
     '* Footnote 3: \'xa0 type\' at:'),
)

for index, row in df_pre_es.iterrows():
    page_ini = df_pre_es.title_inicial[index][1]
    page_fin = df_pre_es.title_final[index][1]

    print('### Processing index: ', str(index), ' - page range:', str(page_ini),str(page_fin))
    texto = ''
    for j in range(page_ini, page_fin + 1):
        page = df_pre_es['Document_Content'][index][j]

        # header cleanup: drop a leading '- N -' page-number marker
        page = re.sub(r'^\s?\-\s{0,3}[1-9]\d?\s{0,3}\-', '', page)

        # check for footnote and remove: keep only the text before the first
        # footnote marker that is detected (each regex is searched once)
        for footnote_regex, footnote_msg in footnote_rules_es:
            footnote = re.search(footnote_regex, page)
            if footnote is not None:
                print(footnote_msg, str(j))
                page = page[:footnote.span()[0]]
                break

        texto = texto + page

    # blank out URLs
    texto = re.sub(r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;]+', ' ', texto)

    # title patterns by language ('en' -> English, anything else -> Spanish)
    if df_pre_es['language'][index] == 'en':
        start_pattern, end_pattern = pattern_en_1, pattern_en_3
    else:
        start_pattern, end_pattern = pattern_es_1, pattern_es_3

    # cutting sections based on titles
    # re.search returns None when nothing matches, so the match object itself
    # is tested; calling .span() on it first would raise before the fallback
    # below could ever run.
    start_match = re.search(start_pattern, texto)
    if start_match is None:
        # alternatively: look for the stored title (minus its last character)
        # as literal text, hence re.escape
        start_match = re.search(re.escape(df_pre_es['title_inicial'][index][0][:-1]), texto)
    ini = start_match.span()[0]

    # the section ends at the indicative-budget heading or, as border
    # condition, at the executing-agency title
    end_match = re.search(r'(Indicative [Bb]udget)|(Presupuesto [Ii]ndicativo)', texto)
    if end_match is None:
        end_match = re.search(end_pattern, texto)
    fin = end_match.span()[0]

    texto = texto[ini:fin].strip()

    # store extracted content in dataframe
    df_pre_es.at[index, 'extracted_final'] = texto

    del texto

    print()
    print()
    print('#-#-#-#')
    print()

## v1.0 - NULL_URL TCs processing:

In [None]:
## Load joblib from v0.5
# Load source file:
df_pre_null = joblib.load('./output/TCs_Approval-NULL_URL-Doc_Collection_2020-07-14_v05_.joblib.bz2')

In [None]:
df_pre_null.shape

In [None]:
df_pre_null.head()

In [None]:
df_pre_null = df_pre_null[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'Document_Name',
       'Document_Status']].copy()

In [None]:
#######################

In [None]:
df_pre_null['Document_Content'] = ''

In [None]:
df_pre_null.head()

In [None]:
df_pre_null.Document_Status.value_counts()

In [None]:
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Approvals_NULLs"

print(file_dir)

###### Read the documents and store the content in the dataframe:

In [None]:
%%time

# Read every document listed in df_pre_null with Tika and store its text, one
# string per page, in the 'Document_Content' column. Rows whose file name ends
# in 'found' or 'downloaded' are marked as 'not available' and their indexes
# are collected in indexes_to_remove.
doc_count = 0
indexes_to_remove = []
for index, row in df_pre_null.iterrows():
    print("## Processing item:", str(index))
    filename = file_dir + '\\' + df_pre_null.Document_Name[index]
    pages_txt = []

    # presumably these endings come from '... not found' / '... not
    # downloaded' status placeholders - verify against the download step
    if str(filename).endswith(('found', 'downloaded')):
        print("Document not available")
        df_pre_null.at[index, 'Document_Content'] = 'not available'
        del pages_txt
        del filename
        print('------')
        print()
        indexes_to_remove.append(int(index))
        continue

    # Read PDF file as XHTML so that it can be split into pages
    data = parser.from_file(filename, xmlContent=True)
    xhtml_data = BeautifulSoup(data['content'])
    for content in xhtml_data.find_all('div', attrs={'class': 'page'}):
        # Parse each page's markup with TIKA again to obtain its plain text
        parsed_content = parser.from_buffer(str(content))

        # Add pages; a page without content is stored as an empty string
        raw_text = parsed_content['content']
        text = '' if raw_text is None else raw_text.strip()
        pages_txt.append(text)

    # save results and report status:
    df_pre_null.at[index, 'Document_Content'] = pages_txt
    doc_count += 1
    print()
    print("Completed doc index:", str(index), "Document number:", str(doc_count))
    del pages_txt
    del filename
    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', str(indexes_to_remove))

In [None]:
#######################

In [None]:
# Remove the documents that were not downloaded. Their indexes are collected
# in indexes_to_remove while the files are read, so that list is used instead
# of hard-coded index labels (which go stale whenever the input changes).
df_pre_null.drop(indexes_to_remove, inplace=True)

In [None]:
#df_pre_null.drop(["language"], axis=1, inplace=True)

In [None]:
df_pre_null['blank_pages'] = ''

In [None]:
# Percentage of blank pages per document: pages stored as '' are counted and
# the ratio over the total number of pages is saved, formatted with 4
# significant digits, in the 'blank_pages' column.
for index, row in df_pre_null.iterrows():
    print('## Processing index', str(index))
    lista = df_pre_null['Document_Content'][index]
    count = sum(1 for page_text in lista if page_text == '')

    df_pre_null.at[index, 'blank_pages'] = format(count/len(lista)*100, '.4g')
    print(str(count))
    print('')

In [None]:
# number of pages (elements of the Document_Content list) per document
df_pre_null['page_count'] = df_pre_null['Document_Content'].apply(len)

In [None]:
df_pre_null.head(10)

In [None]:
#######################

#### Step_1

######  filtering

In [None]:
# Keep the documents with more than 6 pages and less than 60% of blank pages,
# excluding those whose name contains 'Approval Document - GA-274-1'.
# blank_pages is cast with astype(float) before the numeric comparison.
df_filtered_null = df_pre_null[(~df_pre_null['Document_Name'].str.contains('Approval Document - GA-274-1')) & (df_pre_null.page_count > 6) & (df_pre_null.blank_pages.astype(float) < 60)].copy()
df_filtered_null.head(10)

In [None]:
df_filtered_null

#### Step_2 

######  classifier

After the filtering above, a second filtering level is applied, in which typical TC components (such as common titles) are reviewed.

In [None]:
# Copy of the previous result to work with:
df_filtered_null_2 = df_filtered_null.copy()

In [None]:
# stores the type of document: tc's, other
df_filtered_null_2['doc_type'] = ''
# stores the matching title that defines the type and its page
df_filtered_null_2['doc_identifier'] = ''

In [None]:
# Classify every document as 'tc' or 'other': a document is a TC when one of
# the known TC header titles is found (case-insensitively) on any page. The
# matched title and its page go to 'doc_identifier'; documents without a
# match are marked ('na', 'na') and listed in to_review.
tc_header_regex = r'DOCUMENTO DE COOPERACIÓN TÉCNICA|INFORMACIÓN BÁSICA DE LA CT|Información Básica de la (CT|Cooperación Técnica)|lnformaci6n Basica de la Cooperaci6n Tecnica|TC DOCUMENT|TECHNICAL COOPERATION DOCUMENT|TC Name'

to_review = []
tc_count = 0
for index, row in df_filtered_null_2.iterrows():
    for page, page_text in enumerate(df_filtered_null_2.Document_Content[index]):
        header_match = re.search(tc_header_regex, page_text, re.IGNORECASE)
        if header_match is None:
            continue

        print('index', str(index))
        print('TC header found at page:', str(page))
        tc_count += 1
        df_filtered_null_2.at[index, 'doc_type'] = 'tc'
        match_title_type = header_match.group()
        df_filtered_null_2.at[index, 'doc_identifier'] = (match_title_type, page)
        break
    else:
        # no TC header on any page
        print('check regex on:', str(index))
        df_filtered_null_2.at[index, 'doc_type'] = 'other'
        df_filtered_null_2.at[index, 'doc_identifier'] = ('na', 'na')
        to_review.append(index)

print('TCs identified:', str(tc_count))

In [None]:
to_review

In [None]:
df_filtered_null_2

In [None]:
#######################

### v1.0 - Language detection

In [None]:
from langdetect import detect

In [None]:
# stores the language detected for each document (filled by the detection loop
# from the first pages of the document; '' until that loop runs)
df_filtered_null_2['language'] = ''

In [None]:
#### Detection performed on the first 3 pages:

In [None]:
%%time
for index, row in df_filtered_null_2.iterrows():
    if df_filtered_null_2['doc_type'][index] == 'tc':
        df_filtered_null_2.at[index, 'language'] = detect(''.join(df_filtered_null_2['Document_Content'][index][:3])) # run language detection on Document_Content[page]
    else:
        df_filtered_null_2.at[index, 'language'] = 'na'

In [None]:
# Number of documents per detected language (including the 'na' placeholder).
df_filtered_null_2.language.value_counts()

In [None]:
# Quick look at the first three rows after language detection.
df_filtered_null_2.head(3)

In [None]:
# (rows, columns) of the subset detected as Spanish.
df_filtered_null_2[df_filtered_null_2.language == 'es'].shape

In [None]:
############

### NULL_URL - Titles search

##### 'Objetivos y justificación'

In [None]:
# Heading patterns for the first section of a TC ("Objectives and
# Justification"), one for Spanish and one for English documents.
# Shape of both patterns: optional leading newlines/blanks, a run of characters
# from [2IV31l] (presumably the section numeral, including digit/letter
# look-alikes produced by text extraction - TODO confirm), an optional period,
# one of the known title wordings, then optional trailing whitespace/newline.
pattern_es_1 = r'\n?\s?\n?\s?[2IV31l]+\.?\s{0,}(Objetivos? y Justificación((\s?de (la\s)?(CT\:?|Cooperación Técnica|TC)\.?)|(\sdel Proyecto)|\:| de la Cooperación Técnica \(CT\))?|Objetivos y justificación de la CT|Justificación y Objetivos de la CT|Justificación y Objetivo|Problema\, Objetivos y Justificación de la CT\.?|OBJETIVOS Y JUSTIFICACIÓN DE LA OPERACIÓN DE COOPERACIÓN TÉCNICA|OBJETIVOS Y JUSTIFICACIÓN|Justificaci6n y objetivo|Objetivos y Justificación de la CT \(estimado\: 1 página\)|DESCRIPCIÓN DEL PRÉSTAMO\/GARANTÍA ASOCIADO)\s{0,}\n?'
pattern_en_1 = r'\n?\s?\n?\s?[2IV31l]+\.?\s{0,}(Objectives?\s+and\s+(J|j)ustification( of the TC)?|OBJECTIVES? AND JUSTIFICATION( OF THE TC)?|JUSTIFICATION AND OBJECTIVE|(TC|TECHNICAL COOPERATION) OBJECTIVES AND RATIONALE|Justification and Objectives of the TC|Description of the Associated Loan|JUSTIFICATION|Background\, Objectives and Justification of the TC|OBJECTIVE AND RATIONALE OF THE TC|OBJECTIVES AND RATIONALE OF THE TECHNICAL COOPERATION OPERATION)\s{0,}\n?'

In [None]:
# Spanish documents: check that the first section title can be located,
# scanning from the page where the TC header was found. Documents without a
# match are collected for regex review.
index_es_to_check = []
spanish_docs = df_filtered_null_2[df_filtered_null_2.language == 'es']

for index, row in spanish_docs.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = df_filtered_null_2['Document_Content'][index]
    first_page = df_filtered_null_2['doc_identifier'][index][1]  # page of the TC header

    for page_number in range(first_page, len(pages)):
        if re.search(pattern_es_1, pages[page_number]) is not None:
            print('* pattern found at document page:', str(page_number))
            print('-----------------    -----------------')
            break
    else:
        # the scan reached the last page without a hit
        print('check regex on:', str(index))
        index_es_to_check.append(index)

print('Index to check', index_es_to_check)

In [None]:
# English documents: check that the first section title can be located,
# scanning from the page where the TC header was found. Documents without a
# match are collected for regex review.
index_en_to_check = []
english_docs = df_filtered_null_2[df_filtered_null_2.language == 'en']

for index, row in english_docs.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = df_filtered_null_2['Document_Content'][index]
    first_page = df_filtered_null_2['doc_identifier'][index][1]  # page of the TC header

    for page_number in range(first_page, len(pages)):
        if re.search(pattern_en_1, pages[page_number]) is not None:
            print('* pattern found at document page:', str(page_number))
            print('-----------------    -----------------')
            break
    else:
        # the scan reached the last page without a hit
        print('check regex on:', str(index))
        index_en_to_check.append(index)

print('Index to check', index_en_to_check)

##### 'Descripción de las actividades y resultados'

In [None]:
### pattern_2

In [None]:
# Heading patterns for the middle section of a TC ("Description of activities /
# components and budget"), one for Spanish and one for English documents.
# NOTE(review): in pattern_es_2 only the first alternative is preceded by the
# numeral prefix; the alternatives that follow its closing parenthesis are
# top-level, so they can match anywhere in a page, and the final '\s{0,}\n'
# binds only to the last alternative. pattern_en_2 keeps every wording inside
# the prefixed group - confirm whether the Spanish pattern should do the same.
pattern_es_2 = r'\n?\s?\n?\s?[IV\.\,ll]+\s+(Descripción (de )((las)?\s?actividades\s?|los\s)?((\/|\,\s)?Componentes y (Resultados|Actividades)|\/?\s?componentes3? y presupuesto(\:|\.)?|\sy resultados|\sdel proyecto|\sy presupuesto|\, componentes y presupuesto\.?|componentes\s?(\,|\/)\s?actividades y (productos|presupuesto|resultados)|actividades y productos|\, componentes\, resultados y presupuesto| y presupuesto\.?|y Resultados|\, resultados y presupuesto|\, los componentes y el presupuesto))|Descripción de las Actividades\/Componentes y Presupuesto|(Actividades\/componentes y presupuesto)|(Actividades y Componentes)|DESCRIPCIÓN DE LAS ACTIVIDADES, LOS COMPONENTES Y EL PRESUPUESTO|(Descripción de componentes\/actividades y presupuesto)|(Descripción de las actividades)|(Descripción de los objetivos actividades y presupuesto)|Descripción de las Actividades\, Componentes y Presupuesto|(Descripción de componentes y productos)|(Descripción Actividades y Resultados)|DESCRIPCIÓN DE LAS ACTIVIDADES[\,\/]\s?COMPONENTES Y PRESUPUESTO|DESCRIPCIÓN DE ACTIVIDADES\/COMPONENTES Y PRESUPUESTO6\s{0,}\n'
pattern_en_2 = r'\n?\s?\n?\s?[IV\.\·\,ll3]+\s+(Description of (the )?[aA]ctivities\/[cC]omponents?( and [bB]udget)?|Description of components and budget|DESCRIPTION OF COMPONENTS AND BUDGET|DESCRIPTION OF ACTIVITIES\/COMPONENTS AND BUDGET|Description of Activities and Budget|Description of activities\, components and budget|Description of activity/component and budget|DESCRIPTION OF ACTIVITIES( AND OUTPUTS)?|Description of (A|a)ctivities and (O|o)utputs|Description of components and activities|Description of activities \/ components and budget|Description of activities\/components|Description of components\/activities and budget|Description of Activities\/ Components and Budget|Activity and Results Description|Description of Components and Activities|Description of activities and results|Description of activities\, outputs and budget|Description of Activities \/ component and budget|Description of Components\, Activities and Budget|Description of Activities \/ Components and Budget|Description of activities\/ components and budget|Description of Activities\/Outputs and Budget|ACTIVITY\/COMPONENT DESCRIPTION AND BUDGET)\s{0,}\n?'

In [None]:
# Spanish documents: check that the middle section title can be located,
# scanning from the TC-header page. Hits on page 0 are listed separately;
# documents without any hit are collected for regex review.
index_es_to_check = []
results_in_page_0 = []
spanish_docs = df_filtered_null_2[df_filtered_null_2.language == 'es']

for index, row in spanish_docs.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = df_filtered_null_2['Document_Content'][index]
    first_page = df_filtered_null_2['doc_identifier'][index][1]  # page of the TC header

    for page_number in range(first_page, len(pages)):
        if re.search(pattern_es_2, pages[page_number]) is None:
            continue
        print('* pattern found at document page:', str(page_number))
        if page_number == 0:
            results_in_page_0.append(index)
        print('-----------------    -----------------')
        break
    else:
        # the scan reached the last page without a hit
        print('check regex on:', str(index))
        index_es_to_check.append(index)

print('Index to check', index_es_to_check)
print('Results in page 0:', results_in_page_0)

In [None]:
# English documents: check that the middle section title can be located,
# scanning from the TC-header page. Hits on page 0 are listed separately;
# documents without any hit are collected for regex review.
index_en_to_check = []
results_in_page_0 = []
english_docs = df_filtered_null_2[df_filtered_null_2.language == 'en']

for index, row in english_docs.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = df_filtered_null_2['Document_Content'][index]
    first_page = df_filtered_null_2['doc_identifier'][index][1]  # page of the TC header

    for page_number in range(first_page, len(pages)):
        if re.search(pattern_en_2, pages[page_number]) is None:
            continue
        print('* pattern found at document page:', str(page_number))
        if page_number == 0:
            results_in_page_0.append(index)
        print('-----------------    -----------------')
        break
    else:
        # the scan reached the last page without a hit
        print('check regex on:', str(index))
        index_en_to_check.append(index)

print('Index to check', index_en_to_check)
print('Results in page 0:', results_in_page_0)

##### 'Agencia Ejecutora y estructura de ejecución'

In [None]:
# Heading patterns for the closing section of a TC ("Executing Agency and
# execution structure"), one for Spanish and one for English documents.
# Shape of both patterns: optional leading newlines/blanks, a run of characters
# from the bracketed set (presumably the section numeral - TODO confirm), one
# of the known title wordings, then optional trailing whitespace/newline.
pattern_es_3 = r'\n?\s?\n?\s?[IV\.5]+\s{0,}(((4\.1\s+)?Agencia Ejecutora(\s+\(AE\))?|Organismo Ejecutor|Unidad Ejecutora|Entidad Ejecutora) y estructura de ejecución|Estructura del Organismo Ejecutor\s?\(?O?E?\)?|Organismo de Ejecución y Estructura de Implementación|Agencia ejecutora y estructura de ejecución|ORGANISMO EJECUTOR Y ESTRUCTURA DE IMPLEMENTACIÓN|AGENCIA EJECUTORA Y ESTRUCTURA DE EJECUCIÓN|ORGANISMO EJECUTOR Y ESTRUCTURA DE EJECUCIÓN|Agencia ejecutora y justificación de la estructura de ejecución|Organismo Ejecutor|Estructura de ejecución|Agencia Ejecutora|Mecanismo de Ejecución)\s{0,}\n?'
pattern_en_3 = r'\n?\s?\n?\s?[IV\.54]+\s{0,}(4\.1\s+)?(Executing [Aa]gency and [Ee]xecution [Ss]tructure|EXECUTING AGENCY( AND EXECUTION STRUCTURE)?|Executing agency and execution|Executing Agency \(EA\) and execution structure|Executing Agency and Executing Structure|Executing agency and execution structure|EA AND EXECUTION STRUCTURE)\s{0,}\n?'

In [None]:
# Spanish documents: check that the closing section title can be located,
# scanning from the TC-header page. Documents without a match are collected
# for regex review.
index_es_to_check = []
spanish_docs = df_filtered_null_2[df_filtered_null_2.language == 'es']

for index, row in spanish_docs.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = df_filtered_null_2['Document_Content'][index]
    first_page = df_filtered_null_2['doc_identifier'][index][1]  # page of the TC header

    for page_number in range(first_page, len(pages)):
        if re.search(pattern_es_3, pages[page_number]) is not None:
            print('* pattern found at document page:', str(page_number))
            print('-----------------    -----------------')
            break
    else:
        # the scan reached the last page without a hit
        print('check regex on:', str(index))
        index_es_to_check.append(index)

print('Index to check', index_es_to_check)

In [None]:
# English documents: check that the closing section title can be located,
# scanning from the TC-header page. Documents without a match are collected
# for regex review.
index_en_to_check = []
english_docs = df_filtered_null_2[df_filtered_null_2.language == 'en']

for index, row in english_docs.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = df_filtered_null_2['Document_Content'][index]
    first_page = df_filtered_null_2['doc_identifier'][index][1]  # page of the TC header

    for page_number in range(first_page, len(pages)):
        if re.search(pattern_en_3, pages[page_number]) is not None:
            print('* pattern found at document page:', str(page_number))
            print('-----------------    -----------------')
            break
    else:
        # the scan reached the last page without a hit
        print('check regex on:', str(index))
        index_en_to_check.append(index)

print('Index to check', index_en_to_check)

### NULL_URL - Titles results

In [None]:
# Empty columns that will receive the (matched title, page) result for the
# first, middle and closing section titles:
for title_column in ('title_inicial', 'title_medio', 'title_final'):
    df_filtered_null_2[title_column] = ''

In [None]:
# First rows, to confirm the three title columns were added.
df_filtered_null_2.head()

###### title inicial

In [None]:
# Spanish: locate the first section title of every document and store the
# matched text together with its page in 'title_inicial'. Documents without a
# match are collected for regex review.
index_es_to_check = []

for index, row in df_filtered_null_2[df_filtered_null_2.language == 'es'].iterrows():
    print()
    print('Processsing index:', str(index))

    # NOTE(review): the scan covers the document from page 0; the TC-header page
    # kept in 'doc_identifier' is not used as a starting point here - confirm
    # that this is intended.
    for i, page_text in enumerate(df_filtered_null_2['Document_Content'][index]):
        title_match = re.search(pattern_es_1, page_text)  # searched once, reused for storing
        if title_match is None:
            continue

        print('* pattern found at document page:', str(i))
        df_filtered_null_2.at[index, 'title_inicial'] = (title_match.group(), i)
        print('-----------------    -----------------')
        break
    else:
        # no page of the document matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)


In [None]:
# English: locate the first section title of every document and store the
# matched text together with its page in 'title_inicial'. Documents without a
# match are collected for regex review.
index_en_to_check = []

for index, row in df_filtered_null_2[df_filtered_null_2.language == 'en'].iterrows():
    print()
    print('Processsing index:', str(index))

    # NOTE(review): the scan covers the document from page 0; the TC-header page
    # kept in 'doc_identifier' is not used as a starting point here - confirm
    # that this is intended.
    for i, page_text in enumerate(df_filtered_null_2['Document_Content'][index]):
        title_match = re.search(pattern_en_1, page_text)  # searched once, reused for storing
        if title_match is None:
            continue

        print('* pattern found at document page:', str(i))
        df_filtered_null_2.at[index, 'title_inicial'] = (title_match.group(), i)
        print('-----------------    -----------------')
        break
    else:
        # no page of the document matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)


In [None]:
# First rows, to inspect the stored 'title_inicial' values.
df_filtered_null_2.head()

###### title medio

In [None]:
# Spanish: locate the middle section title, scanning from the page of the
# first title, and store the matched text with its page in 'title_medio'.
# Documents without a match are collected for regex review.
index_es_to_check = []

for index, row in df_filtered_null_2[df_filtered_null_2.language == 'es'].iterrows():
    print()
    print('Processsing index:', str(index))

    first_title = df_filtered_null_2['title_inicial'][index]
    if isinstance(first_title, tuple):
        page_base = first_title[1]  # starting page
    else:
        # 'title_inicial' still holds '' (first title never located): indexing
        # it would raise IndexError, so scan the whole document instead
        print('* first title not located, searching from page 0')
        page_base = 0

    pages = df_filtered_null_2['Document_Content'][index]
    for i in range(page_base, len(pages)):
        title_match = re.search(pattern_es_2, pages[i])  # searched once, reused for storing
        if title_match is None:
            continue

        print('* pattern found at document page:', str(i))
        df_filtered_null_2.at[index, 'title_medio'] = (title_match.group(), i)
        print('-----------------    -----------------')
        break
    else:
        # no page in the scanned range matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)

In [None]:
# English: locate the middle section title, scanning from the page of the
# first title, and store the matched text with its page in 'title_medio'.
# Documents without a match are collected for regex review.
index_en_to_check = []

for index, row in df_filtered_null_2[df_filtered_null_2.language == 'en'].iterrows():
    print()
    print('Processsing index:', str(index))

    first_title = df_filtered_null_2['title_inicial'][index]
    if isinstance(first_title, tuple):
        page_base = first_title[1]  # starting page
    else:
        # 'title_inicial' still holds '' (first title never located): indexing
        # it would raise IndexError, so scan the whole document instead
        print('* first title not located, searching from page 0')
        page_base = 0

    pages = df_filtered_null_2['Document_Content'][index]
    for i in range(page_base, len(pages)):
        title_match = re.search(pattern_en_2, pages[i])  # searched once, reused for storing
        if title_match is None:
            continue

        print('* pattern found at document page:', str(i))
        df_filtered_null_2.at[index, 'title_medio'] = (title_match.group(), i)
        print('-----------------    -----------------')
        break
    else:
        # no page in the scanned range matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)

In [None]:
# First ten rows, to inspect the stored 'title_medio' values.
df_filtered_null_2.head(10)

###### title final

In [None]:
# Spanish: locate the closing section title, scanning from the page of the
# middle title, and store the matched text with its page in 'title_final'.
# Documents without a match are collected for regex review.
index_es_to_check = []

for index, row in df_filtered_null_2[df_filtered_null_2.language == 'es'].iterrows():
    print()
    print('Processsing index:', str(index))

    middle_title = df_filtered_null_2['title_medio'][index]
    if isinstance(middle_title, tuple):
        page_base = middle_title[1]  # starting page
    else:
        # 'title_medio' still holds '' (middle title never located): indexing
        # it would raise IndexError, so scan the whole document instead
        print('* middle title not located, searching from page 0')
        page_base = 0

    pages = df_filtered_null_2['Document_Content'][index]
    for i in range(page_base, len(pages)):
        title_match = re.search(pattern_es_3, pages[i])  # searched once, reused for storing
        if title_match is None:
            continue

        print('* pattern found at document page:', str(i))
        df_filtered_null_2.at[index, 'title_final'] = (title_match.group(), i)
        print('-----------------    -----------------')
        break
    else:
        # no page in the scanned range matched
        print('check regex on:', str(index))
        index_es_to_check.append(index)
print('Index to check', index_es_to_check)

In [None]:
# English: locate the closing section title, scanning from the page of the
# middle title, and store the matched text with its page in 'title_final'.
# Documents without a match are collected for regex review.
index_en_to_check = []

for index, row in df_filtered_null_2[df_filtered_null_2.language == 'en'].iterrows():
    print()
    print('Processsing index:', str(index))

    middle_title = df_filtered_null_2['title_medio'][index]
    if isinstance(middle_title, tuple):
        page_base = middle_title[1]  # starting page
    else:
        # 'title_medio' still holds '' (middle title never located): indexing
        # it would raise IndexError, so scan the whole document instead
        print('* middle title not located, searching from page 0')
        page_base = 0

    pages = df_filtered_null_2['Document_Content'][index]
    for i in range(page_base, len(pages)):
        title_match = re.search(pattern_en_3, pages[i])  # searched once, reused for storing
        if title_match is None:
            continue

        print('* pattern found at document page:', str(i))
        df_filtered_null_2.at[index, 'title_final'] = (title_match.group(), i)
        print('-----------------    -----------------')
        break
    else:
        # no page in the scanned range matched
        print('check regex on:', str(index))
        index_en_to_check.append(index)
print('Index to check', index_en_to_check)

In [None]:
# First ten rows, to inspect the stored 'title_final' values.
df_filtered_null_2.head(10)

#### check for crossed titles

In [None]:
# Sanity check on the page order of the three stored titles
# (first < middle < last) and on the distance between first and last title.
# Anything that is not a clean sequence is collected in other_case.
other_case = []
for index, row in df_filtered_null_2.iterrows():
    titles = (row['title_inicial'], row['title_medio'], row['title_final'])

    # Title columns keep '' for documents where no title was stored; indexing
    # '' would raise IndexError, so those rows are flagged instead of compared.
    if not all(isinstance(title, tuple) for title in titles):
        print('missing title(s) on:', str(index))
        other_case.append(index)
        continue

    (ini_text, ini_page), (mid_text, mid_page), (fin_text, fin_page) = titles

    if ini_page < mid_page < fin_page:
        print('Sequence OK for index:', str(index))

    elif fin_page > ini_page > mid_page:
        print('middle title before the first title on index:', str(index))

    else:
        print('other case on:', str(index))
        other_case.append(index)

    if (fin_page - ini_page) > 10: # alert on cases where extension between titles is greater than 10
        print('File to check due to extension between titles:', row['Document_Name'])
        print((ini_text, ini_page), (mid_text, mid_page), (fin_text, fin_page))
        print()

In [None]:
# Indexes flagged by the title-sequence check for manual review.
other_case

In [None]:
# Print the stored (title text, page) pairs of document 5: first, middle, last.
index = 5
first_title = df_filtered_null_2.title_inicial[index]
middle_title = df_filtered_null_2.title_medio[index]
last_title = df_filtered_null_2.title_final[index]
print((first_title[0], first_title[1]), (middle_title[0], middle_title[1]), (last_title[0], last_title[1]))

In [None]:
# Print the stored (title text, page) pairs of document 18: first, middle, last.
index = 18
first_title = df_filtered_null_2.title_inicial[index]
middle_title = df_filtered_null_2.title_medio[index]
last_title = df_filtered_null_2.title_final[index]
print((first_title[0], first_title[1]), (middle_title[0], middle_title[1]), (last_title[0], last_title[1]))

In [None]:
#df_filtered_null_2['Document_Content'][18][3]
# Show the file name of document 18.
df_filtered_null_2['Document_Name'][18]

In [None]:
# Print the stored (title text, page) pairs of document 25: first, middle, last.
index = 25
first_title = df_filtered_null_2.title_inicial[index]
middle_title = df_filtered_null_2.title_medio[index]
last_title = df_filtered_null_2.title_final[index]
print((first_title[0], first_title[1]), (middle_title[0], middle_title[1]), (last_title[0], last_title[1]))

In [None]:
# Show the file name of document 25.
df_filtered_null_2['Document_Name'][25]

In [None]:
# Substitute page 3 of the documents at index 18 and 25 with a blank page:
for doc_index in (18, 25):
    df_filtered_null_2['Document_Content'][doc_index][3] = ''

#### footer and header clean-up

In [None]:
# (rows, columns) of the working frame before the clean-up routine.
df_filtered_null_2.shape

In [None]:
# to store the extracted content (text between the first title and the end
# marker, written by the clean-up routine):
df_filtered_null_2['extracted'] = ''

In [None]:
# Clean-up routine (v2.0 - multilingual approach - 2020-08-25)
#
# For every document with located titles:
#   1. join the pages from the first-title page to the last-title page, after
#      removing the '- N -' page header and cutting each page at the first
#      footnote area it contains;
#   2. blank out URLs;
#   3. keep the text between the first section title and the indicative-budget
#      heading (or, failing that, the closing section title) in 'extracted'.
# Documents that cannot be processed are listed in index_extract_to_check.

header_regex = r'^\s?\-\s{0,3}[1-9]\d?\s{0,3}\-'
# (regex, message) pairs, tried in this order; the first one that matches wins
footnote_rules = [
    (r'\s{30,}\d{1,2}\s+[A-Z]',
     '* Footnote pattern 1: \'30+ blanks + digit\' at:'),
    (r'\n\n\n[1-9]\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|Mar|May|Jun|Jul|Ago|Aug|Sep|Set|Oct|Nov|Dic|IDB|months|Budget|Development)([A-Z\¿]|http)',
     '* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:'),
    (r'\n+\xa0+\n\d',
     '* Footnote 3: \'xa0 type\' at:'),
]
url_regex = r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;]+'
budget_regex = r'(Indicative [Bb]udget)|(Presupuesto [Ii]ndicativo)'

index_extract_to_check = []

for index, row in df_filtered_null_2.iterrows():
    first_title = df_filtered_null_2.title_inicial[index]
    last_title = df_filtered_null_2.title_final[index]

    # title columns keep '' when nothing was located; such rows have no page
    # range to extract from
    if not (isinstance(first_title, tuple) and isinstance(last_title, tuple)):
        print('### Skipping index: ', str(index), ' - titles not located')
        index_extract_to_check.append(index)
        continue

    page_ini = first_title[1]
    page_fin = last_title[1]

    print('### Processing index: ', str(index), ' - page range:', str(page_ini),str(page_fin))
    pages = df_filtered_null_2['Document_Content'][index]
    cleaned_pages = []
    for j in range(page_ini, page_fin + 1):
        # header cleanup:
        page = re.sub(header_regex, '', pages[j])

        # check for footnote and remove everything from its start onwards:
        for footnote_regex, footnote_message in footnote_rules:
            footnote = re.search(footnote_regex, page)
            if footnote is not None:
                print(footnote_message, str(j))
                page = page[:footnote.start()]
                break

        cleaned_pages.append(page)

    texto = re.sub(url_regex, ' ', ''.join(cleaned_pages))

    # cutting sections based on titles; any language other than 'en' uses the
    # Spanish patterns
    if df_filtered_null_2['language'][index] == 'en':
        start_pattern, end_pattern = pattern_en_1, pattern_en_3
    else:
        start_pattern, end_pattern = pattern_es_1, pattern_es_3

    start_match = re.search(start_pattern, texto)
    if start_match is not None:
        ini = start_match.start()
    else:
        # alternatively: look for the stored title text literally. The last
        # character of the stored title is dropped (presumably a trailing
        # newline that the clean-up may have altered - TODO confirm).
        ini = texto.find(first_title[0][:-1])

    # end marker: the indicative-budget heading, or the closing title as
    # border condition
    end_match = re.search(budget_regex, texto) or re.search(end_pattern, texto)

    if ini < 0 or end_match is None:
        # a boundary is missing in the cleaned text: flag the document instead
        # of failing on a None match
        print('check regex on:', str(index))
        index_extract_to_check.append(index)
        del texto
        continue

    fin = end_match.start()
    texto = texto[ini:fin].strip()

    # store extracted content in dataframe
    df_filtered_null_2.at[index, 'extracted'] = texto

    del texto

    print()
    print()
    print('#-#-#-#')
    print()

print('Index to check', index_extract_to_check)

In [None]:
# (store results as v0.8)

In [None]:
# Debug check for document 32: search the stored first title (minus its last
# character) inside `texto`.
# NOTE(review): the clean-up loop deletes `texto` at the end of every iteration,
# so this presumably only works right after that loop stopped part-way - confirm.
re.search(df_filtered_null_2['title_inicial'][32][0][:-1], texto)

In [None]:
# Show the extracted text stored for document 32.
df_filtered_null_2['extracted'][32]

In [None]:
# Print the working text left in `texto`.
# NOTE(review): `texto` is deleted at the end of each clean-up iteration, so it
# presumably exists here only if that loop was interrupted - confirm.
print(texto)

In [None]:
# Show the file name of document 43.
df_filtered_null_2['Document_Name'][43]

In [None]:
# List the column names of the working frame.
df_filtered_null_2.columns

In [None]:
# Show the stored first title of document 43.
df_filtered_null_2['title_inicial'][43]

In [None]:
# Scratch check: position of 'TC' inside a sample title string.
'\n\n\nII. Objectives and Justification of the TC  \n\n\n'.find('TC')

In [None]:
# Text of document 43 from its stored first title onwards (title page only).
# The stored title is literal text, so it is escaped before being used as a
# regex; otherwise characters such as '(' or ')' in a title would be read as
# regex syntax and the search could fail.
title_text_43, title_page_43 = df_filtered_null_2['title_inicial'][43]
page_text_43 = df_filtered_null_2['Document_Content'][43][title_page_43]
page_text_43[re.search(re.escape(title_text_43), page_text_43).span()[0]:]

In [None]:
# Initial Title: for every document, print its title page from the stored
# first title onwards.
for index, row in df_filtered_null_2.iterrows():
    print("processing index", str(index))
    stored_title = df_filtered_null_2['title_inicial'][index]
    if not isinstance(stored_title, tuple):
        # column still holds '': no first title was stored for this document
        print('no initial title stored')
    else:
        title_text, title_page = stored_title
        page_text = df_filtered_null_2['Document_Content'][index][title_page]
        # the stored title is literal text: locate it with a plain substring
        # search, so '(' / ')' etc. in a title are not read as regex syntax
        title_start = page_text.find(title_text)
        if title_start < 0:
            print('stored title no longer present on page', str(title_page))
        else:
            print(page_text[title_start:])
    print('***')
    print()

In [None]:
# End Title: for every document, print its last-title page up to (not
# including) the stored closing title.
for index, row in df_filtered_null_2.iterrows():
    print("processing index", str(index))
    stored_title = df_filtered_null_2['title_final'][index]
    if not isinstance(stored_title, tuple):
        # column still holds '': no closing title was stored for this document
        print('no end title stored')
    else:
        title_text, title_page = stored_title
        page_text = df_filtered_null_2['Document_Content'][index][title_page]
        # the stored title is literal text: locate it with a plain substring
        # search, so '(' / ')' etc. in a title are not read as regex syntax
        title_start = page_text.find(title_text)
        if title_start < 0:
            print('stored title no longer present on page', str(title_page))
        else:
            print(page_text[:title_start])
    print('***')
    print()

#### supra-indexes removal

In [None]:
#print(df_pre_en['extracted'][177])

In [None]:
# for cleaned content storing (extracted text without footnote markers):
df_pre_en['extracted_cleaned'] = ''

In [None]:
# Remove footnote markers ("supra-indexes"): one or two digits glued to the end
# of a word, optionally followed by a final period (e.g. 'countries3.' ->
# 'countries.'). Only that trailing marker is dropped; digits elsewhere in the
# word are left untouched. Words are re-joined with single spaces.
footnote_marker = re.compile(r'(?<=[A-Za-záéíóú\-\)\”])\d{1,2}(?=\.?$)')

for index, row in df_pre_en.iterrows():
    words = df_pre_en['extracted'][index].split()
    df_pre_en.at[index, 'extracted_cleaned'] = ' '.join(
        footnote_marker.sub('', word) for word in words
    )

In [None]:
# First rows, to inspect the new 'extracted_cleaned' column.
df_pre_en.head()

In [None]:
# Show the cleaned text of document 29 as a spot check.
df_pre_en.extracted_cleaned[29]

In [None]:
# Write the selected columns of the English set to an Excel workbook in the
# current working directory.
export_columns = [
    'FK_OPERATION_ID',
    'OPERATION_NUMBER',
    'DOCUMENT_ID',
    'DOCUMENT_REFERENCE',
    'DESCRIPTION',
    'DOCUMENT_NAME',
    'Document_Name',
    'Document_Status',
    'blank_pages',
    'page_count',
    'doc_identifier',
    'title_inicial',
    'title_medio',
    'title_final',
    'extracted',
    'extracted_cleaned',
]
df_pre_en[export_columns].to_excel('TCs_Approval-Docs_EN_Processing_2020-08-23_v09.xlsx')

In [None]:
# Print the stored (title text, page) pairs of document 25: first, middle, last.
index = 25
first_title = df_filtered_null_2.title_inicial[index]
middle_title = df_filtered_null_2.title_medio[index]
last_title = df_filtered_null_2.title_final[index]
print((first_title[0], first_title[1]), (middle_title[0], middle_title[1]), (last_title[0], last_title[1]))

In [None]:
# For the above cases, the index page identified is removed


#### footer and header clean-up

In [None]:
# (rows, columns) of the English set before the clean-up routine.
df_pre_en.shape

In [None]:
# to store the extracted content (overwrites any previous 'extracted' values
# with empty strings before the clean-up routine runs):
df_pre_en['extracted'] = ''

In [None]:
# Clean-up routine (v1.0) - English documents only
#
# For every document with located titles:
#   1. join the pages from the first-title page to the last-title page, after
#      removing the '- N -' page header and cutting each page at the first
#      footnote area it contains;
#   2. blank out URLs;
#   3. keep the text between the first section title and the 'Indicative
#      budget' heading (or, failing that, the closing section title) in
#      'extracted'.
# Documents that cannot be processed are listed in index_extract_to_check.

header_regex = r'^\s?\-\s{0,3}\d\d?\s{0,3}\-'
# (regex, message) pairs, tried in this order; the first one that matches wins
footnote_rules = [
    (r'\s{30,}\d{1,2}\s+[A-Z]',
     '* Footnote pattern 1: \'30+ blanks + digit\' at:'),
    (r'\n\n\n\d\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|May|Jun|Jul|Ago|Aug|Sep|Set|Oct|Nov|Dic|IDB|months|Budget|Development)([A-Z\¿]|http)',
     '* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:'),
    (r'\n+\xa0+\n\d',
     '* Footnote 3: \'xa0 type\' at:'),
]
url_regex = r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;]+'
budget_regex = r'Indicative (B|b)udget'

index_extract_to_check = []

for index, row in df_pre_en.iterrows():
    first_title = df_pre_en.title_inicial[index]
    last_title = df_pre_en.title_final[index]

    # title columns keep '' when nothing was located; such rows have no page
    # range to extract from
    if not (isinstance(first_title, tuple) and isinstance(last_title, tuple)):
        print('### Skipping index: ', str(index), ' - titles not located')
        index_extract_to_check.append(index)
        continue

    page_ini = first_title[1]
    page_fin = last_title[1]

    print('### Processing index: ', str(index), ' - page range:', str(page_ini),str(page_fin))
    pages = df_pre_en['Document_Content'][index]
    cleaned_pages = []
    for j in range(page_ini, page_fin + 1):
        # header cleanup:
        page = re.sub(header_regex, '', pages[j])

        # check for footnote and remove everything from its start onwards:
        for footnote_regex, footnote_message in footnote_rules:
            footnote = re.search(footnote_regex, page)
            if footnote is not None:
                print(footnote_message, str(j))
                page = page[:footnote.start()]
                break

        cleaned_pages.append(page)

    texto = re.sub(url_regex, ' ', ''.join(cleaned_pages))

    # cutting sections based on titles
    start_match = re.search(pattern_en_1, texto)
    if start_match is not None:
        ini = start_match.start()
    else:
        # alternatively: look for the stored title text literally. The last
        # character of the stored title is dropped (presumably a trailing
        # newline that the clean-up may have altered - TODO confirm).
        ini = texto.find(first_title[0][:-1])

    # end marker: the 'Indicative budget' heading, or the closing title as
    # border condition
    end_match = re.search(budget_regex, texto) or re.search(pattern_en_3, texto)

    if ini < 0 or end_match is None:
        # a boundary is missing in the cleaned text: flag the document instead
        # of failing on a None match
        print('check regex on:', str(index))
        index_extract_to_check.append(index)
        del texto
        continue

    fin = end_match.start()
    texto = texto[ini:fin].strip()

    # store extracted content in dataframe
    df_pre_en.at[index, 'extracted'] = texto

    del texto

    print()
    print()
    print('#-#-#-#')
    print()

print('Index to check', index_extract_to_check)

In [None]:
# (store results as v0.8)

#### supra-indexes removal

In [None]:
#print(df_pre_en['extracted'][177])

In [None]:
# Create the (initially empty) column that receives, for every document, the
# extracted text after the supra-index (footnote marker) clean-up:
df_pre_en['extracted_cleaned'] = ''

In [None]:
# Remove the superscript footnote markers ("supra-indexes") that end up glued
# to the end of a word in the extracted text, e.g. 'sector12.' -> 'sector.'.
#
# A token is considered to carry a marker when it ends in a letter, hyphen,
# ')' or '”' followed by one or two digits and an optional final period.
# Only those trailing digits are dropped: the previous filter deleted every
# digit of a matching token, so '(SDG7)2' became '(SDG)' instead of '(SDG7)'.
#
# Splitting on whitespace and re-joining with single spaces also collapses
# runs of spaces / line breaks.
#
# NOTE(review): tokens that legitimately end in a short number right after a
# letter or hyphen (e.g. 'COVID-19') still match this heuristic and lose it.
marker_re = re.compile(r'(?<=[A-Za-záéíóú\-\)\”])\d{1,2}(?=\.?$)')

for index, raw_text in df_pre_en['extracted'].items():
    tokens = [marker_re.sub('', token) for token in raw_text.split()]
    df_pre_en.at[index, 'extracted_cleaned'] = ' '.join(tokens)

In [None]:
# Preview the first rows of the dataframe (it now includes 'extracted_cleaned'):
df_pre_en.head()

In [None]:
# Manual spot-check: display the cleaned text stored for entry 29.
df_pre_en.extracted_cleaned[29]

In [None]:
# Persist the selected columns -- including the raw ('extracted') and the
# cleaned ('extracted_cleaned') text -- to an Excel workbook in the current
# working directory. The list order is the column order of the spreadsheet.
export_columns = [
    'FK_OPERATION_ID',
    'OPERATION_NUMBER',
    'DOCUMENT_ID',
    'DOCUMENT_REFERENCE',
    'DESCRIPTION',
    'DOCUMENT_NAME',
    'Document_Name',
    'Document_Status',
    'blank_pages',
    'page_count',
    'doc_identifier',
    'title_inicial',
    'title_medio',
    'title_final',
    'extracted',
    'extracted_cleaned',
]
export_view = df_pre_en[export_columns]
export_view.to_excel('TCs_Approval-Docs_EN_Processing_2020-08-23_v09.xlsx')

In [None]:
############
# 1) remove headers
# 2) remove footers (different types)
# 3) remove supraindexes

### Testing Area -- NOT TO BE USED

#### Copy all files to another folder:

In [None]:
import os.path
from os import path
#path.exists()

In [None]:
import shutil

In [None]:
# Folder with the PCR documents: a 'PCR' sub-folder of the user's Desktop.
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = "".join([desktop_dir, "\\PCR"])

# Echo the resolved folder so it can be checked before it is used.
print(file_dir)

In [None]:
# Column that records, per document, the outcome of the copy attempt made in
# the copy loop below ('Copied', 'Not copied' or 'File not exists'):
df['Status'] = ''

In [None]:
# Copy each document named in df.Document_Name from file_dir to the
# destination folder and record the outcome in df['Status']:
#   'Copied'          - the file was copied
#   'Not copied'      - the file exists but the copy failed (reason is printed)
#   'File not exists' - no file with that name was found in file_dir
dest_dir = "C:\\Users\\emilianoco\\Desktop\\PCR_"

for index, file in df['Document_Name'].items():
    filename = file_dir + '\\' + file

    if not path.exists(filename):
        df.at[index, 'Status'] = 'File not exists'
        continue

    try:
        shutil.copyfile(filename, dest_dir + '\\' + file)
    except OSError as e:
        # shutil.copyfile reports its failures (missing destination folder,
        # permissions, source and destination being the same file, ...) as
        # OSError subclasses; show the reason instead of discarding it.
        print('* Not copied:', filename, '-', e)
        df.at[index, 'Status'] = 'Not copied'
    else:
        df.at[index, 'Status'] = 'Copied'

In [None]:
# Tally how many documents ended up with each copy status:
df.Status.value_counts()

In [None]:
## FIN

In [None]:
#'''
# **************************************************************************************************************** #
# ********************************************  Version Control  ************************************************* #
# **************************************************************************************************************** #
  
#   Version:            Date:                User:                   Change:                                       

#   - 1.2           01/12/2021        Emiliano Colina    - New TCs from November to December 2020
#
#   - 1.1           10/16/2020        Emiliano Colina    - New TCs from July to September 2020 are processed
#
#   - 1.0           08/24/2020        Emiliano Colina    - TCs with NULL_URL processing
#
#   - 0.9           08/23/2020        Emiliano Colina    - Documents extracted and cleaned
#
#   - 0.8           08/22/2020        Emiliano Colina    - English documents processing
#
#   - 0.7           07/14/2020        Emiliano Colina    - Supra-indexes removed from content extracted
#                                                        - Multiple spacing also cleaned in the process

#   - 0.6           07/14/2020        Emiliano Colina    - Content extracted
#                                                        - Headers and footnotes cleaned up on selected Spanish 
#                                                        documents

#   - 0.5           07/12/2020        Emiliano Colina    - Language detection implemented, worked w/Spanish docs
#                                                        - Filtering and processing logic defined and implemented
#                                                        using sections' titles and their respective locations

#   - 0.4           07/09/2020        Emiliano Colina    - Added docx documents as pdf (converted w/MS-Word)
#                                                        - Count blank pages as %
#                                                        - Distribution analysis and filtering
                                                        
#   - 0.3           07/08/2020        Emiliano Colina    - Using new document sources

#   - 0.2           06/29/2020        Emiliano Colina    - Filtered TC and other types of documents using regex
                                                                                                                  
#   - 0.1           06/23/2020        Emiliano Colina    - Initial version, starting with 'Approval Registry'    
#                                                        - Description type of documents                         


#
# **************************************************************************************************************** #
#'''
