# Digital Transformation Advisory

## 02.1 - Document Processing - Loan Proposals

In [None]:
#'''
# **************************************************************************************************************** #
#*****************************************  IDB - AUG Data Analytics  ******************************************** #
# **************************************************************************************************************** #
#
#-- Notebook Number: 02.1 - Document Processing - Loan Proposals
#-- Title: Digital Transformation Advisory
#-- Audit Segment: 
#-- Continuous Auditing: Yes
#-- System(s): pdf files
#-- Description:  
#                - Loan Proposals processing
#                
#                
#                
#
#-- @author:  Emiliano Colina <emilianoco@iadb.org>
#-- Version:  0.7
#-- Last Update: 01/14/2021
#-- Last Revision Date: 07/19/2020 - Emiliano Colina <emilianoco@iadb.org> 
#                                    

# **************************************************************************************************************** #
#'''

In [None]:
# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

<br>

#### Environment Setup

In [None]:
import os
import pandas as pd
import re

###### Required Libraries:

In [None]:
from io import StringIO
from bs4 import BeautifulSoup
from tika import parser

In [None]:
# Set working directory
# The two parts are joined by plain string concatenation below, so `data_dir`
# must keep its leading separator.
main_dir = "C:\\Users\\emilianoco\\Desktop\\2020"
data_dir = "/Digital_Transformation"


# Later cells load files through relative './output/...' paths, which resolve
# against this directory.
os.chdir(main_dir + data_dir) # working directory set
print('Working folder set to: ' + os.getcwd()) # working directory check

In [None]:
# **************************************************************************************************************** #

In [None]:
import joblib

## Loans from Nov and Dec 2020 - v0.7

In [None]:
# Load the stored document collection (path is relative to the working directory).
# NOTE(review): joblib.load unpickles the file -- only load files from a trusted source.
data_pre = joblib.load('./output/Loans-Doc_Collection_2021-01-12_v10_.joblib.bz2')
print(data_pre.shape)
data_pre.head()

### Document Reading - v0.7

In [None]:
# Folder the reading loop takes the files from (file_dir + '\\' + Document_Name).
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Approvals_cont"
print(file_dir)

In [None]:
# Work on a copy restricted to the identification columns, so `data_pre` stays untouched.
data = data_pre[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name', 'Document_Status']].copy()
# Placeholder column; the reading loop fills it per document.
data['Document_Content'] = ''
#data.head()
print(data.Document_Status.value_counts())

In [None]:
%%time

# Read every available PDF with Tika and store its text page by page in
# data['Document_Content'] (a list with one string per page). Rows whose file
# is not available get the string 'not available' and their index is collected
# in `indexes_to_remove`.
doc_count = 0
indexes_to_remove = []

for index, row in data.iterrows():
    print("## Processing item:", str(index))
    filename = file_dir + '\\' + row['Document_Name']

    # A name ending in 'found' or 'downloaded' marks a document that is not on
    # disk (presumably '... not found' / '... not downloaded' -- confirm against
    # the collection step). The tuple form avoids relying on `|` binding
    # tighter than `not`, as the previous condition did.
    is_placeholder = str(filename).endswith(('found', 'downloaded'))

    xhtml = None
    if not is_placeholder:
        # Ask Tika for XHTML output so the page <div>s can be separated below.
        xhtml = parser.from_file(filename, xmlContent=True)['content']

    if xhtml is not None:
        pages_txt = []
        # NOTE(review): no parser is named, so bs4 uses whichever one is installed.
        for page_div in BeautifulSoup(xhtml).find_all('div', attrs={'class': 'page'}):
            # Run the markup of a single page through Tika again to get its plain text.
            page_content = parser.from_buffer(str(page_div))['content']
            # A page without content comes back as None; store '' so that the
            # list positions keep matching the page positions.
            pages_txt.append(page_content.strip() if page_content is not None else '')

        # save results and report status:
        data.at[index, 'Document_Content'] = pages_txt
        doc_count += 1
        print()
        print("Completed doc index:", str(index), "Document number:", str(doc_count))
    else:
        # Either a placeholder name or Tika extracted nothing; previously the
        # latter case crashed inside BeautifulSoup(None).
        print("Document not available")
        data.at[index, 'Document_Content'] = 'not available'
        indexes_to_remove.append(int(index))

    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', str(indexes_to_remove))

In [None]:
# **************************************************************************************************************** #

In [None]:
# v0.7: Blank pages statistics
# For each document, store the percentage of pages whose text is '' as a
# string formatted with 4 significant digits.
data['blank_pages'] = ''

for index, row in data.iterrows():
    print('## Processing index', str(index))
    pages = row['Document_Content']
    blank_count = sum(1 for page in pages if page == '')

    # A document with no pages at all used to raise ZeroDivisionError here;
    # it is now reported as 0 % blank.
    # NOTE(review): a string such as 'not available' is iterated character by
    # character and therefore also yields '0' (unchanged behaviour).
    percentage = blank_count / len(pages) * 100 if len(pages) else 0
    data.at[index, 'blank_pages'] = format(percentage, '.4g')
    print(str(blank_count))
    print('')
data.blank_pages.value_counts()

In [None]:
# Length of the stored content per document: the number of pages when the
# content is a list of page texts.
data['page_count'] = data['Document_Content'].apply(len)
data.shape

In [None]:
data

### Index page and Language identification - v0.7

In [None]:
# Copy of the previous result to work with:
df_filtered_2 = data.copy()

In [None]:
# identify & get the index page()

In [None]:
# stores the index language
df_filtered_2['language'] = ''
# stores the index page
df_filtered_2['index_page'] = ''

In [None]:
# test:
#CONTENTS \n\n\nPROJECT SUMMARY
#re.search(r'(^(\-(\s+)?i?i\s+\-s+)?CONTENTS?\s+(PROJECT|PROGRAM) SUMMARY|^(\-\s+ii\s+\-s+)?CONTENTS?\s+I\.|^\-(\s+)?ii(\s+)?\-\s+CONTENTS\s+PROJECT\s+SUMMARY\s+|- i - \n\n\nCONTENTS \n\n\nPROJECT SUMMARY)', df_filtered_2.Document_Content[72][3], re.IGNORECASE)

In [None]:
# Find the table-of-contents ("index") page of every document and derive the
# document language from the heading that matched. Documents without a match
# are collected in `to_review`.
#
# Fix: the optional "- ii -" page-number prefix was written `\-s+` (a hyphen
# followed by literal "s" characters); it is now `\-\s+`, so the prefix can
# actually match when followed by whitespace.
english_index_re = re.compile(
    r'(^(\-(\s+)?ii\s+\-\s+)?CONTENTS?\s+(PROJECT|PROGRAM) SUMMARY'
    r'|^(\-\s+ii\s+\-\s+)?CONTENTS?\s+I\.'
    r'|^\-(\s+)?ii(\s+)?\-\s+CONTENTS\s+PROJECT\s+SUMMARY\s+'
    r'|- i - \n\n\nCONTENTS \n\n\nPROJECT SUMMARY)',
    re.IGNORECASE,
)
spanish_index_re = re.compile(
    r'((Í|I)NDICE\s+RESUM\s?EN (DEL? (PROYECTO|PROGRAMA)|EJECUTIVO)(\.?\…+|\s+|\.+)'
    r'|ÍNDICE\s+(\d\s+)?I\.)',
    re.IGNORECASE,
)

to_review = []
loan_count = 0
for index, row in df_filtered_2.iterrows():
    is_loan = False
    for page, page_text in enumerate(row['Document_Content']):
        # English is tried first, Spanish second, page by page.
        match = english_index_re.search(page_text)
        language, label = 'en', 'English'
        if match is None:
            match = spanish_index_re.search(page_text)
            language, label = 'es', 'Spanish'
        if match is None:
            continue

        print('index', str(index))
        print(label + ' - index page found at page:', str(page))
        loan_count += 1
        is_loan = True
        df_filtered_2.at[index, 'language'] = language
        df_filtered_2.at[index, 'index_page'] = page
        print(match.group(), page)
        print('~ ~ ~')
        break   # only the first matching page is kept

    if not is_loan:
        print('check regex on:', str(index))
        #df_filtered_2.at[index, 'doc_type'] = 'other'
        #df_filtered_2.at[index, 'doc_identifier'] = ('na', 'na')
        to_review.append(index)

print('Loans identified:', str(loan_count))

In [None]:
to_review

In [None]:
df_filtered_2.language.value_counts()

In [None]:
len(df_filtered_2.OPERATION_NUMBER.unique())

### Index titles - v0.7

In [None]:
# to store key index titles: 
df_filtered_2['index_titles'] = ''

In [None]:
#to_review = []
# key titles are extracted along with their respective page number:
# each hit is a roman-numeral prefix, an upper-case title, optional leader
# characters and a one- or two-digit page number, all taken from the index page.
key_title_re = re.compile(r'[IV\.]{1,5}\s+[A-ZÁÉÍÓÚ\s\,\n]+[\.\s\-\…]{0,200}\d\d?')

for index, row in df_filtered_2.iterrows():
    print('*Processing index:', str(index))
    index_page = row['index_page']
    if isinstance(index_page, str) and index_page == '':
        # No index page was identified for this document; indexing the page
        # list with '' used to raise TypeError. Treat it as "no titles".
        key_titles = []
    else:
        key_titles = key_title_re.findall(row['Document_Content'][index_page])
    print(key_titles)
    if not key_titles:
        print('Found empty list on:', str(index))
        #to_review.append(index)
    else:
        df_filtered_2.at[index, 'index_titles'] = key_titles

    print("~~~")
    print()
#print(to_review)

<br>
<br>

In [None]:
# for storing the results:
df_filtered_2['index_title_I'] = ''
df_filtered_2['index_title_II'] = ''
df_filtered_2['index_title_III'] = ''

In [None]:
# Split every extracted index entry into (title text, page number) and keep the
# entries recognised as main sections I, II and III.
for index, row in df_filtered_2.iterrows():
    print('* Processing index:', str(index))
    for raw_entry in df_filtered_2.index_titles[index]:
        parts = tuple(re.findall(r'[A-ZÁÉÍÓÚ\.\s\-\…\,\n]+|\d+', raw_entry))
        heading = parts[0]

        # Pick the destination column and the number of leading characters
        # (the numeral prefix) to drop before reading the title text.
        if heading.startswith('I.') or 'DESCRIP' in heading:
            column, skip = 'index_title_I', 2
        elif heading.startswith(('II.', 'II ', '..... ESTRUCTURA')):
            column, skip = 'index_title_II', 3
        elif heading.startswith(('III', '.... PLAN')):
            column, skip = 'index_title_III', 3
        else:
            # entry is not one of the three main sections
            print('nothing')
            continue

        title = re.search(r'[A-ZÁÉÍÓÚ\s\,\n]+', heading[skip:]).group().strip()
        entry = (title, parts[1])
        print(entry)
        df_filtered_2.at[index, column] = entry

    print()
    print('~~~')
    print()

In [None]:
df_filtered_2

##### Test looking for the page of title_I (v0.7):

In [None]:
df_filtered_2.head(50)

In [None]:
df_filtered_2.Document_Content[46][7]

In [None]:
df_filtered_2['index_title_I'][46]

In [None]:
# V0.7: adjustment required on index 46, since an invalid value was found: ('PROJECT DESCRIPTION AND RESULTS MNITORING', '2'):
df_filtered_2.at[46, 'index_title_I'] = ('PROJECT DESCRIPTION AND RESULTS MONITORING', '2')

In [None]:
df_filtered_2['index_title_I'][46]

In [None]:
indexes_to_check = []

# Dry run: report the first page after the index page that contains the
# section-I title (without its last character); nothing is stored.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = int(row['index_page']) + 1   # start right after the index page
    doc_pages = row['Document_Content']

    for page_no in range(first_page, len(doc_pages)):
        if re.search(row['index_title_I'][0][:-1], doc_pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # loop ended without a hit
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)

##### Test looking for the page of title_II (v0.7):

In [None]:
indexes_to_check = []

# Dry run: report the first page after the index page that contains the
# section-II title (without its last character); nothing is stored.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = int(row['index_page']) + 1   # start right after the index page
    doc_pages = row['Document_Content']

    for page_no in range(first_page, len(doc_pages)):
        if re.search(row['index_title_II'][0][:-1], doc_pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # loop ended without a hit
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Indexes to check', indexes_to_check)

##### Test looking for the page of title_III (v0.7):

In [None]:
indexes_to_check = []

# Dry run: report the first page after the index page that contains the
# section-III title (without its last character); nothing is stored.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = int(row['index_page']) + 1   # start right after the index page
    doc_pages = row['Document_Content']

    for page_no in range(first_page, len(doc_pages)):
        if re.search(row['index_title_III'][0][:-1], doc_pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # loop ended without a hit
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Indexes to check', indexes_to_check)

### True Title check - v0.7

In [None]:
# for storing the results:
df_filtered_2['true_title_I'] = ''
df_filtered_2['true_title_II'] = ''
df_filtered_2['true_title_III'] = ''

###### true_title_I

In [None]:
indexes_to_check = []

# identify true_title_I location:
# the first page after the index page whose text contains the section-I title
# (without its last character); stored as (matched text, page number).
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = int(row['index_page']) + 1   # start right after the index page
    doc_pages = row['Document_Content']

    for page_no in range(first_page, len(doc_pages)):
        hit = re.search(row['index_title_I'][0][:-1], doc_pages[page_no])
        if hit is None:
            continue
        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_I'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


###### true_title_II

In [None]:
indexes_to_check = []

# identify true_title_II location:
# the search starts on the page where true_title_I was found (inclusive) and
# stores (matched text, page number) for the first page containing the title.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = row['true_title_I'][1]
    doc_pages = row['Document_Content']

    for page_no in range(first_page, len(doc_pages)):
        hit = re.search(row['index_title_II'][0], doc_pages[page_no])
        if hit is None:
            continue
        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_II'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


###### true_title_III

In [None]:
indexes_to_check = []

# identify true_title_III location:
# the search starts on the page where true_title_II was found (inclusive) and
# stores (matched text, page number) for the first page containing the title.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = row['true_title_II'][1]
    doc_pages = row['Document_Content']

    for page_no in range(first_page, len(doc_pages)):
        hit = re.search(row['index_title_III'][0], doc_pages[page_no])
        if hit is None:
            continue
        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_III'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


In [None]:
df_filtered_2.head()

#### Check for crossed titles - v0.7

In [None]:
# Report, per document, whether the three located titles appear in page order.
for index, row in df_filtered_2.iterrows():
    page_I = row['true_title_I'][1]
    page_II = row['true_title_II'][1]
    page_III = row['true_title_III'][1]

    if page_I < page_II < page_III:
        print('Sequence OK for index:', str(index))
    elif page_III > page_I > page_II:
        print('middle title before the first title on index:', str(index))
    else:
        print('other case on:', str(index))

    #if (df_filtered_2.true_title_III[index][1] - df_filtered_2.true_title_I[index][1]) > 10: # alert on cases where extension between titles is greater than 10
    #    print('File to check due to extension between titles:', df_filtered_2['Document_Name'][index])
    #    print((df_filtered_2.true_title_I[index][0], df_filtered_2.true_title_I[index][1]), (df_filtered_2.true_title_II[index][0], df_filtered_2.true_title_II[index][1]), (df_filtered_2.true_title_III[index][0], df_filtered_2.true_title_III[index][1]))
    #    print()

#### Generate the list of pages, delimited by true_title_I and true_title_II - v0.7

In [None]:
# Build, per document, the list of page texts that spans from the start of
# true_title_I up to and including true_title_II.
df_filtered_2['lista_paginas'] = ''

for index, row in df_filtered_2.iterrows():
    print('processing index', str(index))
    doc_pages = row['Document_Content']
    title_ini, page_ini = row['true_title_I']    # (matched text, page number)
    title_fin, page_fin = row['true_title_II']

    # Offset where the first title starts on its page ...
    start = re.search(title_ini, doc_pages[page_ini]).span()[0]
    # ... and offset just past the closing title on its page.
    end = doc_pages[page_fin].find(title_fin) + len(title_fin)

    if page_ini == page_fin and start < end:
        # Both titles are on the same page: keep the single span between them.
        # Previously the page was emitted twice (title I to end of page, then
        # start of page to title II), duplicating the text in between.
        lista_pages = [doc_pages[page_ini][start:end]]
    else:
        lista_pages = [doc_pages[page_ini][start:]]
        # whole pages strictly between the two title pages (may be none)
        lista_pages.extend(doc_pages[page_ini + 1:page_fin])
        lista_pages.append(doc_pages[page_fin][:end])

    df_filtered_2.at[index, 'lista_paginas'] = lista_pages

In [None]:
df_filtered_2.head()

In [None]:
df_filtered_2.columns

In [None]:
# Adjustments (v0.7)
# Every row is tagged with the same document type.
df_filtered_2['doc_type'] = 'loan'

# Rename the two boundary-title columns in place; the new names are the ones
# selected for df_filtered_1 below.
df_filtered_2.rename(columns={'true_title_I':'title_inicial', 'true_title_II': 'title_final'}, inplace=True)

# Independent copy holding only the listed columns (true_title_III and the
# intermediate index_* columns are not carried over).
df_filtered_1 = df_filtered_2[['doc_type','language', 'FK_OPERATION_ID', 'OPERATION_NUMBER',
       'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION', 'Document_Name',
       'Document_Content', 'title_inicial', 'title_final', 'lista_paginas']].copy()

In [None]:
df_filtered_1

#### Version 0.7 - Load Previous Results and Generate the list of pages, delimited by true_title_I and true_title_II

In [None]:
## Load previous results (from v0.6), since an adjusted extraction process is performed
# NOTE(review): joblib.load unpickles the file -- only load files from a trusted source.
df_previous = joblib.load('./output/df_resultado_loans_2020-11-04_v06.joblib.bz2')
df_previous.head()

In [None]:
# drop these columns:
df_previous.drop(['extracted_v2', 'extracted_cleaned_v2'], axis=1, inplace=True)


In [None]:
df_previous.shape

#### Version 0.7 - Merge Loans from current version with previous results

In [None]:
# Append the loans processed above (df_filtered_1) to the previously stored
# results (df_previous); ignore_index=True renumbers the rows from 0.
df_loans = pd.concat([df_previous, df_filtered_1], ignore_index=True)

df_loans.tail()

#### Version 0.7 - Text extraction and clean-up routine

In [None]:
df_loans.shape

In [None]:
# to store the extracted content:
df_loans['extracted_v2'] = ''

In [None]:
df_loans.head()

In [None]:
# New text_extraction and clean-up routine (v2.3 - 10/17/2020)
# For every document: drop the page-number header of each page, cut the page at
# the first footnote marker found (three patterns, tried in order), glue the
# pages together, remove URLs and store the result in 'extracted_v2'.

page_header_re = re.compile(r'(^\s?\-\s{0,3}[1-9]\d?\s{0,3}\-|^\-\s{5,9})')
url_re = re.compile(r'https?://\S+')

# (pattern, message printed when it matches) -- order matters, first hit wins.
footnote_rules = (
    (re.compile(r'\s{30,}\d{1,2}\s+([A-Z]|http)'),
     "* Footnote pattern 1: '30+ blanks + digit' at:"),
    (re.compile(r'\n?\n?\n\d\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|Mar|May|Jun|Jul|Ago|Sep|Set|Oct|Nov|Dic|PMRep|IDB|months|Budget|Development)([A-Z\¿\“]|http)'),
     "* Footnote 2: '2 or 3 blanks + 1 or 2 digits' at:"),
    (re.compile(r'\n+\xa0+\n\d'),
     "* Footnote 3: 'xa0 type' at:"),
)

for index, row in df_loans.iterrows():
    page_list = df_loans['lista_paginas'][index]
    print('### Processing index: ', str(index), ' - page range:', str(len(page_list)))

    pieces = []
    for j, page in enumerate(page_list):
        # header cleanup:
        page = page_header_re.sub(' \n ', page)

        for footnote_re, message in footnote_rules:
            hit = footnote_re.search(page)
            if hit is not None:
                print(message, str(j))
                # keep only the text before the footnote area
                pieces.append(page[:hit.span()[0]] + ' \n ')
                break
        else:
            # no footnote marker on this page: keep it whole
            pieces.append(page + ' ')

    # Additional clean-up
    # - remove urls:
    texto = url_re.sub('', ''.join(pieces))

    df_loans.at[index, 'extracted_v2'] = texto.strip()

    print()
    print()
    print('#-#-#-#')
    print()
In [None]:
print(df_loans['extracted_v2'][155])

#### Version 0.7 - supra-indexes removal

In [None]:
# for cleaned content storing:
df_loans['extracted_cleaned_v2'] = ''

In [None]:
# Remove footnote markers ("supra-indexes") glued to the end of a word: 1-3
# digits or a run of superscript digits that directly follow a letter, a
# closing parenthesis or a quote, optionally followed by one of . , ; :
#
# Fix: only the trailing marker is removed. The previous version dropped EVERY
# digit of a matching token, so e.g. "(2019)3" became "()" instead of "(2019)".
footnote_marker_re = re.compile(
    r'(?<=[A-Za-záéíóú\)\”\"])(?:\d{1,3}|[\¹\²\³\⁴\⁵\⁶\⁷\⁸\⁹\⁰]+)(?=[\.\,\;\:]?$)'
)

for index, row in df_loans.iterrows():
    words = row['extracted_v2'].split()
    cleaned_words = [footnote_marker_re.sub('', word) for word in words]
    # Tokens are re-joined with single spaces, so original line breaks are not kept.
    df_loans.at[index, 'extracted_cleaned_v2'] = ' '.join(cleaned_words)

In [None]:
df_loans.columns

In [None]:
df_loans.shape

In [None]:
df_loans.language.value_counts()

In [None]:
# **************************************************************************************************************** #

## Version 0.6 - Loans from October 2020

In [None]:
# Load the stored document collection (path is relative to the working directory).
# NOTE(review): joblib.load unpickles the file -- only load files from a trusted source.
data_pre = joblib.load('./output/Loans-Doc_Collection_2020-11-04_v09_.joblib.bz2')
print(data_pre.shape)
data_pre.head()

## Version 0.6 - Reading

In [None]:
# Folder the reading loop takes the files from (file_dir + '\\' + Document_Name).
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Approvals_cont"
print(file_dir)

In [None]:
# Work on a copy restricted to the identification columns, so `data_pre` stays untouched.
data = data_pre[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name', 'Document_Status']].copy()
# Placeholder column; the reading loop fills it per document.
data['Document_Content'] = ''
#data.head()
print(data.Document_Status.value_counts())

In [None]:
%%time

# Read every available PDF with Tika and store its text page by page in
# data['Document_Content'] (a list with one string per page). Rows whose file
# is not available get the string 'not available' and their index is collected
# in `indexes_to_remove`.
doc_count = 0
indexes_to_remove = []

for index, row in data.iterrows():
    print("## Processing item:", str(index))
    filename = file_dir + '\\' + row['Document_Name']

    # A name ending in 'found' or 'downloaded' marks a document that is not on
    # disk (presumably '... not found' / '... not downloaded' -- confirm against
    # the collection step). The tuple form avoids relying on `|` binding
    # tighter than `not`, as the previous condition did.
    is_placeholder = str(filename).endswith(('found', 'downloaded'))

    xhtml = None
    if not is_placeholder:
        # Ask Tika for XHTML output so the page <div>s can be separated below.
        xhtml = parser.from_file(filename, xmlContent=True)['content']

    if xhtml is not None:
        pages_txt = []
        # NOTE(review): no parser is named, so bs4 uses whichever one is installed.
        for page_div in BeautifulSoup(xhtml).find_all('div', attrs={'class': 'page'}):
            # Run the markup of a single page through Tika again to get its plain text.
            page_content = parser.from_buffer(str(page_div))['content']
            # A page without content comes back as None; store '' so that the
            # list positions keep matching the page positions.
            pages_txt.append(page_content.strip() if page_content is not None else '')

        # save results and report status:
        data.at[index, 'Document_Content'] = pages_txt
        doc_count += 1
        print()
        print("Completed doc index:", str(index), "Document number:", str(doc_count))
    else:
        # Either a placeholder name or Tika extracted nothing; previously the
        # latter case crashed inside BeautifulSoup(None).
        print("Document not available")
        data.at[index, 'Document_Content'] = 'not available'
        indexes_to_remove.append(int(index))

    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', str(indexes_to_remove))

In [None]:
# **************************************************************************************************************** #

In [None]:
# Blank pages statistics: for each document, store the percentage of pages
# whose text is '' as a string formatted with 4 significant digits.
data['blank_pages'] = ''

for index, row in data.iterrows():
    print('## Processing index', str(index))
    pages = row['Document_Content']
    blank_count = sum(1 for page in pages if page == '')

    # A document with no pages at all used to raise ZeroDivisionError here;
    # it is now reported as 0 % blank.
    # NOTE(review): a string such as 'not available' is iterated character by
    # character and therefore also yields '0' (unchanged behaviour).
    percentage = blank_count / len(pages) * 100 if len(pages) else 0
    data.at[index, 'blank_pages'] = format(percentage, '.4g')
    print(str(blank_count))
    print('')
data.blank_pages.value_counts()

In [None]:
# Length of the stored content per document: the number of pages when the
# content is a list of page texts.
data['page_count'] = data['Document_Content'].apply(len)
data.shape

In [None]:
data

## Version 0.6 -  Index page and Language identification

In [None]:
# Copy of the previous result to work with:
df_filtered_2 = data.copy()

In [None]:
# identify & get the index page()

In [None]:
# stores the index language
df_filtered_2['language'] = ''
# stores the index page
df_filtered_2['index_page'] = ''

In [None]:
# Find the table-of-contents ("index") page of every document and derive the
# document language from the heading that matched. Documents without a match
# are collected in `to_review`.
#
# Fix: the optional "- ii -" page-number prefix was written `\-s+` (a hyphen
# followed by literal "s" characters); it is now `\-\s+`, so the prefix can
# actually match when followed by whitespace.
english_index_re = re.compile(
    r'(^(\-(\s+)?ii\s+\-\s+)?CONTENTS?\s+(PROJECT|PROGRAM) SUMMARY'
    r'|^(\-\s+ii\s+\-\s+)?CONTENTS?\s+I\.'
    r'|^\-(\s+)?ii(\s+)?\-\s+CONTENTS\s+PROJECT\s+SUMMARY\s+)',
    re.IGNORECASE,
)
spanish_index_re = re.compile(
    r'((Í|I)NDICE\s+RESUM\s?EN (DEL? (PROYECTO|PROGRAMA)|EJECUTIVO)(\.?\…+|\s+|\.+)'
    r'|ÍNDICE\s+(\d\s+)?I\.)',
    re.IGNORECASE,
)

to_review = []
loan_count = 0
for index, row in df_filtered_2.iterrows():
    is_loan = False
    for page, page_text in enumerate(row['Document_Content']):
        # English is tried first, Spanish second, page by page.
        match = english_index_re.search(page_text)
        language, label = 'en', 'English'
        if match is None:
            match = spanish_index_re.search(page_text)
            language, label = 'es', 'Spanish'
        if match is None:
            continue

        print('index', str(index))
        print(label + ' - index page found at page:', str(page))
        loan_count += 1
        is_loan = True
        df_filtered_2.at[index, 'language'] = language
        df_filtered_2.at[index, 'index_page'] = page
        print(match.group(), page)
        print('~ ~ ~')
        break   # only the first matching page is kept

    if not is_loan:
        print('check regex on:', str(index))
        #df_filtered_2.at[index, 'doc_type'] = 'other'
        #df_filtered_2.at[index, 'doc_identifier'] = ('na', 'na')
        to_review.append(index)

print('Loans identified:', str(loan_count))

In [None]:
to_review

In [None]:
df_filtered_2.head(15)

In [None]:
df_filtered_2.language.value_counts()

In [None]:
len(df_filtered_2.OPERATION_NUMBER.unique())

## Version 0.6 - Index titles

In [None]:
# to store key index titles: 
df_filtered_2['index_titles'] = ''

In [None]:
#to_review = []
# key titles are extracted along with their respective page number:
# each hit is a roman-numeral prefix, an upper-case title, optional leader
# characters and a one- or two-digit page number, all taken from the index page.
key_title_re = re.compile(r'[IV\.]{1,5}\s+[A-ZÁÉÍÓÚ\s\,\n]+[\.\s\-\…]{0,200}\d\d?')

for index, row in df_filtered_2.iterrows():
    print('*Processing index:', str(index))
    index_page = row['index_page']
    if isinstance(index_page, str) and index_page == '':
        # No index page was identified for this document; indexing the page
        # list with '' used to raise TypeError. Treat it as "no titles".
        key_titles = []
    else:
        key_titles = key_title_re.findall(row['Document_Content'][index_page])
    print(key_titles)
    if not key_titles:
        print('Found empty list on:', str(index))
        #to_review.append(index)
    else:
        df_filtered_2.at[index, 'index_titles'] = key_titles

    print("~~~")
    print()
#print(to_review)

<br>
<br>

In [None]:
# for storing the results:
df_filtered_2['index_title_I'] = ''
df_filtered_2['index_title_II'] = ''
df_filtered_2['index_title_III'] = ''

In [None]:
# iterate over the index titles and get main titles and pages:
# Splits an index entry into its leading text run and its digit runs.
entry_parts_re = re.compile(r'[A-ZÁÉÍÓÚ\.\s\-\…\,\n]+|\d+')
# First run of upper-case letters / blanks / commas, i.e. the bare title.
title_text_re = re.compile(r'[A-ZÁÉÍÓÚ\s\,\n]+')

for index, row in df_filtered_2.iterrows():
    print('* Processing index:', str(index))
    for entry in row['index_titles']:
        parts = entry_parts_re.findall(entry)
        head = parts[0]

        # Decide which main section the entry belongs to and how many
        # leading characters to drop before extracting the title.
        if head.startswith('I.') or 'DESCRIP' in head:
            target_column, skip = 'index_title_I', 2
        elif head.startswith(('II.', 'II ', '..... ESTRUCTURA')):
            target_column, skip = 'index_title_II', 3
        elif head.startswith(('III', '.... PLAN')):
            target_column, skip = 'index_title_III', 3
        else:
            # any other entry is ignored
            print('nothing')
            continue

        # Stored value is (title text, page number as printed in the index).
        title_and_page = (title_text_re.search(head[skip:]).group().strip(), parts[1])
        print(title_and_page)
        df_filtered_2.at[index, target_column] = title_and_page

    print()
    print('~~~')
    print()

In [None]:
df_filtered_2

##### Test looking for the page of title_I:

In [None]:
indexes_to_check = []

# Dry run: look for title I on the pages after the index page and report
# the rows where it cannot be found. Nothing is stored in the DataFrame.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    first_body_page = int(row['index_page']) + 1  # starting page

    for page_no in range(first_body_page, len(pages)):
        # The title, minus its last character, is used as the regex pattern.
        if re.search(row['index_title_I'][0][:-1], pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)

##### Test looking for the page of title_II:

In [None]:
indexes_to_check = []

# Dry run: look for title II on the pages after the index page and report
# the rows where it cannot be found. Nothing is stored in the DataFrame.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    first_body_page = int(row['index_page']) + 1  # starting page

    for page_no in range(first_body_page, len(pages)):
        # The title, minus its last character, is used as the regex pattern.
        if re.search(row['index_title_II'][0][:-1], pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Indexes to check', indexes_to_check)

##### Test looking for the page of title_III:

In [None]:
indexes_to_check = []

# Dry run: look for title III on the pages after the index page and report
# the rows where it cannot be found. Nothing is stored in the DataFrame.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    first_body_page = int(row['index_page']) + 1  # starting page

    for page_no in range(first_body_page, len(pages)):
        # The title, minus its last character, is used as the regex pattern.
        if re.search(row['index_title_III'][0][:-1], pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Indexes to check', indexes_to_check)

### Version 0.6 - True Title check

In [None]:
# for storing the results:
df_filtered_2['true_title_I'] = ''
df_filtered_2['true_title_II'] = ''
df_filtered_2['true_title_III'] = ''

###### true_title_I

In [None]:
indexes_to_check = []

# Locate title I in the document body and store (matched text, page number)
# in 'true_title_I'. The scan starts on the page after the index page.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    first_body_page = int(row['index_page']) + 1  # starting page

    for page_no in range(first_body_page, len(pages)):
        # The title, minus its last character, is used as the regex pattern.
        hit = re.search(row['index_title_I'][0][:-1], pages[page_no])
        if hit is None:
            continue

        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_I'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


###### true_title_II

In [None]:
indexes_to_check = []

# Locate title II in the document body and store (matched text, page number)
# in 'true_title_II'. The scan starts on the page where title I was found.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    start_page = row['true_title_I'][1]  # starting page

    for page_no in range(start_page, len(pages)):
        # The full index title is used as the regex pattern.
        hit = re.search(row['index_title_II'][0], pages[page_no])
        if hit is None:
            continue

        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_II'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


###### true_title_III

In [None]:
indexes_to_check = []

# Locate title III in the document body and store (matched text, page number)
# in 'true_title_III'. The scan starts on the page where title II was found.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    start_page = row['true_title_II'][1]  # starting page

    for page_no in range(start_page, len(pages)):
        # The full index title is used as the regex pattern.
        hit = re.search(row['index_title_III'][0], pages[page_no])
        if hit is None:
            continue

        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_III'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


In [None]:
df_filtered_2.head()

#### Version 0.6 - Check for crossed titles

In [None]:
# Report whether the three located titles appear in increasing page order.
for index, row in df_filtered_2.iterrows():
    page_I, page_II, page_III = (
        row[column][1] for column in ('true_title_I', 'true_title_II', 'true_title_III')
    )

    if page_I < page_II < page_III:
        print('Sequence OK for index:', str(index))
    elif page_III > page_I > page_II:
        # title II was located on an earlier page than title I
        print('middle title before the first title on index:', str(index))
    else:
        print('other case on:', str(index))
        
    #if (df_filtered_2.true_title_III[index][1] - df_filtered_2.true_title_I[index][1]) > 10: # alert on cases where extension between titles is greater than 10
    #    print('File to check due to extension between titles:', df_filtered_2['Document_Name'][index])
    #    print((df_filtered_2.true_title_I[index][0], df_filtered_2.true_title_I[index][1]), (df_filtered_2.true_title_II[index][0], df_filtered_2.true_title_II[index][1]), (df_filtered_2.true_title_III[index][0], df_filtered_2.true_title_III[index][1]))
    #    print()

#### Version 0.6 - Generate the list of pages, delimited by true_title_I and true_title_II

In [None]:
df_filtered_2['lista_paginas'] = ''

# For every document, collect the text that runs from title I up to and
# including title II, one list element per page touched.
for index, row in df_filtered_2.iterrows():
    print('processing index', str(index))
    pages = row['Document_Content']
    page_ini = row['true_title_I'][1]
    page_fin = row['true_title_II'][1]

    # First page: keep everything from the start of title I onwards.
    first_page = pages[page_ini]
    opening = first_page[re.search(row['true_title_I'][0], first_page).start():]

    # Whole pages strictly between the two titles (none when they are on the
    # same or on adjacent pages).
    middle = [pages[j] for j in range(page_ini + 1, page_fin)]

    # Last page: keep everything up to the end of title II.
    last_page = pages[page_fin]
    closing_title = row['true_title_II'][0]
    closing = last_page[:last_page.find(closing_title) + len(closing_title)]

    # NOTE(review): when both titles are on the same page, 'opening' and
    # 'closing' are two overlapping slices of that page - confirm this is intended.
    df_filtered_2.at[index, 'lista_paginas'] = [opening] + middle + [closing]

In [None]:
df_filtered_2.head()

In [None]:
df_filtered_2.columns

In [None]:
# Adjustments (v0.6)
# Every row in this frame is tagged as a loan document.
df_filtered_2['doc_type'] = 'loan'

# The located titles I and II become the start / end markers of the extract.
title_renames = {'true_title_I': 'title_inicial', 'true_title_II': 'title_final'}
df_filtered_2.rename(columns=title_renames, inplace=True)

# Columns carried over to the result frame, in output order.
result_columns = [
    'doc_type', 'language', 'FK_OPERATION_ID', 'OPERATION_NUMBER',
    'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION', 'Document_Name',
    'Document_Content', 'title_inicial', 'title_final', 'lista_paginas',
]
df_filtered_1 = df_filtered_2[result_columns].copy()

In [None]:
df_filtered_1

#### Version 0.6 - Load Previous Results and Generate the list of pages, delimited by true_title_I and true_title_II

In [None]:
## Load the previously saved results; the extraction columns are dropped
## because an adjusted extraction process is performed below.
## NOTE(review): the file name says v05 while the original note said v0.4 - confirm.
previous_results_path = './output/df_resultado_loans_2020-10-19_v05.joblib.bz2'
df_previous = joblib.load(previous_results_path)
df_previous.drop(columns=['extracted_v2', 'extracted_cleaned_v2'], inplace=True)
df_previous.head()

#### Version 0.6 - Merge Loans from current version with previous results

In [None]:
# Merge Loans from version 0.5 with previous results:
df_loans = pd.concat([df_previous, df_filtered_1], ignore_index=True)

df_loans.tail()

#### Version 0.6 - Text extraction and clean-up routine

In [None]:
df_loans.shape

In [None]:
# to store the extracted content:
df_loans['extracted_v2'] = ''

In [None]:
df_loans.head()

In [None]:
# New text_extraction and clean-up routine (v2.3 - 10/17/2020)

# Page-number header at the very start of a page (e.g. "- 3 -").
page_header_re = re.compile(r'(^\s?\-\s{0,3}[1-9]\d?\s{0,3}\-|^\-\s{5,9})')

# Footnote detectors, tried in this order; the first one that matches decides
# where the page is cut. Each entry is (pattern, message printed on a match).
footnote_rules = (
    (re.compile(r'\s{30,}\d{1,2}\s+([A-Z]|http)'),
     '* Footnote pattern 1: \'30+ blanks + digit\' at:'),
    (re.compile(r'\n?\n?\n\d\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|Mar|May|Jun|Jul|Ago|Sep|Set|Oct|Nov|Dic|PMRep|IDB|months|Budget|Development)([A-Z\¿\“]|http)'),
     '* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:'),
    (re.compile(r'\n+\xa0+\n\d'),
     '* Footnote 3: \'xa0 type\' at:'),
)

for index, row in df_loans.iterrows():
    section_pages = row['lista_paginas']
    longitud = len(section_pages)
    print('### Processing index: ', str(index), ' - page range:', str(longitud))

    pieces = []
    for j, raw_page in enumerate(section_pages):
        # header cleanup:
        page = page_header_re.sub(' \n ', raw_page)

        for footnote_re, message in footnote_rules:
            footnote = footnote_re.search(page)
            if footnote is not None:
                print(message, str(j))
                # keep only the text that precedes the footnote area
                pieces.append(page[:footnote.start()] + ' \n ')
                break
        else:
            # no footnote detected: keep the whole page
            pieces.append(page + ' ')

    # Additional clean-up
    # - remove urls:
    texto = re.sub(r'https?://\S+', '', ''.join(pieces))

    df_loans.at[index, 'extracted_v2'] = texto.strip()

    print()
    print()
    print('#-#-#-#')
    print()

In [None]:
print(df_loans['extracted_v2'][155])

#### Version 0.6 - supra-indexes removal

In [None]:
# for cleaned content storing:
df_loans['extracted_cleaned_v2'] = ''

In [None]:
# A word that ends in letters (or a closing bracket / quote) followed by a
# 1-3 digit number or a run of superscript digits, with optional trailing
# punctuation.
footnote_mark_re = re.compile(r'[A-Za-záéíóú\)\”\"]+(\d{1,3}|[\¹\²\³\⁴\⁵\⁶\⁷\⁸\⁹\⁰]+)[\.\,\;\:]?$')

for index, row in df_loans.iterrows():
    cleaned_words = []
    for word in row['extracted_v2'].split():
        if footnote_mark_re.search(word):
            # Every digit character in the word is removed (str.isdigit is
            # also true for superscript digits), not only the trailing ones.
            word = ''.join(ch for ch in word if not ch.isdigit())
        cleaned_words.append(word)
    # Words are re-joined with single blanks, so original whitespace is lost.
    df_loans.at[index, 'extracted_cleaned_v2'] = ' '.join(cleaned_words)

In [None]:
df_loans.columns

# **************************************************************************************************************** #
<br>
<br>
<br>

## Version 0.5 - Loans from July to September 2020

In [None]:
data_pre = joblib.load('./output/Loans-Doc_Collection_2020-10-16_v08_.joblib.bz2')
print(data_pre.shape)
data_pre.head()

## Version 0.5 - Reading

In [None]:
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Approvals_cont"
print(file_dir)

In [None]:
data = data_pre[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name', 'Document_Status']].copy()
data['Document_Content'] = ''
#data.head()
print(data.Document_Status.value_counts())

In [None]:
%%time

doc_count = 0
indexes_to_remove = []

# A Document_Name ending in one of these words marks a file that could not be
# read; such rows get the placeholder content 'not available'.
unavailable_suffixes = ('found', 'downloaded')

for index, row in data.iterrows():
    print("## Processing item:", str(index))
    filename = file_dir + '\\' + row['Document_Name']

    if filename.endswith(unavailable_suffixes):
        print("Document not available")
        data.at[index, 'Document_Content'] = 'not available'
        print('------')
        print()
        indexes_to_remove.append(int(index))
        continue

    # Read PDF file: Tika returns XHTML with one <div class="page"> per page.
    tika_result = parser.from_file(filename, xmlContent=True)
    xhtml_data = BeautifulSoup(tika_result['content'])

    pages_txt = []
    for page_div in xhtml_data.find_all('div', attrs={'class': 'page'}):
        # Each page's markup is parsed again by Tika to obtain plain text.
        page_content = parser.from_buffer(str(page_div))['content']
        # A page without content is stored as an empty string.
        pages_txt.append('' if page_content is None else page_content.strip())

    # save results and report status:
    data.at[index, 'Document_Content'] = pages_txt
    doc_count += 1
    print()
    print("Completed doc index:", str(index), "Document number:", str(doc_count))
    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', str(indexes_to_remove))

In [None]:
# **************************************************************************************************************** #

In [None]:
data['blank_pages'] = ''

# Store, per document, the percentage of pages whose extracted text is empty,
# formatted with 4 significant digits.
for index, row in data.iterrows():
    print('## Processing index', str(index))
    lista = row['Document_Content']
    count = sum(1 for page_text in lista if page_text == '')

    if len(lista) == 0:
        # A document with no extracted pages has no defined percentage; it is
        # stored as 'nan' instead of raising ZeroDivisionError.
        blank_share = float('nan')
    else:
        blank_share = count / len(lista) * 100

    data.at[index, 'blank_pages'] = format(blank_share, '.4g')
    print(str(count))
    print('')
data.blank_pages.value_counts()

In [None]:
data['page_count'] = data['Document_Content'].apply(lambda x: len(x))
data.shape

In [None]:
data

## Version 0.5 -  Index page and Language identification

In [None]:
# Copy of the previous result to work with:
df_filtered_2 = data.copy()

In [None]:
# identify & get the index page()

In [None]:
# stores the index language
df_filtered_2['language'] = ''
# stores the index page
df_filtered_2['index_page'] = ''

In [None]:
# Index-page detectors, tried in this order on every page:
# (language code, message printed on a match, compiled pattern).
# Fix: the optional "- ii -" page-number prefix of the English pattern was
# written as '\-s+' (hyphen followed by literal "s"); it is now '\-\s+'.
index_page_rules = (
    ('en', 'English - index page found at page:',
     re.compile(r'(^(\-(\s+)?ii\s+\-\s+)?CONTENTS?\s+(PROJECT|PROGRAM) SUMMARY|^(\-\s+ii\s+\-\s+)?CONTENTS?\s+I\.|^\-(\s+)?ii(\s+)?\-\s+CONTENTS\s+PROJECT\s+SUMMARY\s+)', re.IGNORECASE)),
    ('es', 'Spanish - index page found at page:',
     re.compile(r'((Í|I)NDICE\s+RESUM\s?EN (DEL? (PROYECTO|PROGRAMA)|EJECUTIVO)(\.?\…+|\s+|\.+)|ÍNDICE\s+(\d\s+)?I\.)', re.IGNORECASE)),
)

to_review = []   # rows where no index page was recognised
loan_count = 0   # rows where an index page was recognised

for index, row in df_filtered_2.iterrows():
    is_loan = False
    for page, page_text in enumerate(row['Document_Content']):
        for language, message, pattern in index_page_rules:
            match = pattern.search(page_text)
            if match is None:
                continue

            print('index', str(index))
            print(message, str(page))
            loan_count += 1
            is_loan = True
            df_filtered_2.at[index, 'language'] = language
            match_title_type = match.group()
            df_filtered_2.at[index, 'index_page'] = page
            print(match_title_type, page)
            print('~ ~ ~')
            break

        if is_loan:
            # the first matching page wins; later pages are not inspected
            break

    if not is_loan:
        # 'language' and 'index_page' keep their initial values for this row
        print('check regex on:', str(index))
        to_review.append(index)

print('Loans identified:', str(loan_count))

In [None]:
df_filtered_2.head(15)

In [None]:
df_filtered_2.language.value_counts()

In [None]:
len(df_filtered_2.OPERATION_NUMBER.unique())

## Version 0.5 - Index titles

In [None]:
# to store key index titles: 
df_filtered_2['index_titles'] = ''

In [None]:
#to_review = []
# key titles are extracted along with their respective page number: 

# One table-of-contents entry: a roman numeral made of I/V/'.' characters,
# an upper-case title, optional filler (dots, blanks, dashes, ellipsis) and
# a one- or two-digit page number.
toc_entry_re = re.compile(r'[IV\.]{1,5}\s+[A-ZÁÉÍÓÚ\s\,\n]+[\.\s\-\…]{0,200}\d\d?')

for index, row in df_filtered_2.iterrows():
    print('*Processing index:', str(index))
    # Only the page previously flagged as the index page is scanned.
    index_page_text = row['Document_Content'][row['index_page']]
    key_titles = toc_entry_re.findall(index_page_text)
    print(key_titles)
    if key_titles:
        df_filtered_2.at[index, 'index_titles'] = key_titles
    else:
        # 'index_titles' keeps its initial value for this row.
        print('Found empty list on:', str(index))

    print("~~~")
    print()
#print(to_review)

In [None]:
## Manually adjusted index 127

In [None]:
print(df_filtered_2.Document_Content[127][3])

In [None]:
df_filtered_2.at[127, 'index_titles'] = ['I. DESCRIPCIÓN DEL PROYECTO Y MONITOREO DE RESULTADOS .......................................... 2', \
                                         'II. ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS .......................................... 12', 'III. PLAN DE IMPLEMENTACIÓN Y GESTIÓN .......................................... 14']

<br>
<br>

In [None]:
# for storing the results:
df_filtered_2['index_title_I'] = ''
df_filtered_2['index_title_II'] = ''
df_filtered_2['index_title_III'] = ''

In [None]:
# iterate over the index titles and get main titles and pages:
# Splits an index entry into its leading text run and its digit runs.
entry_parts_re = re.compile(r'[A-ZÁÉÍÓÚ\.\s\-\…\,\n]+|\d+')
# First run of upper-case letters / blanks / commas, i.e. the bare title.
title_text_re = re.compile(r'[A-ZÁÉÍÓÚ\s\,\n]+')

for index, row in df_filtered_2.iterrows():
    print('* Processing index:', str(index))
    for entry in row['index_titles']:
        parts = entry_parts_re.findall(entry)
        head = parts[0]

        # Decide which main section the entry belongs to and how many
        # leading characters to drop before extracting the title.
        if head.startswith('I.') or 'DESCRIP' in head:
            target_column, skip = 'index_title_I', 2
        elif head.startswith(('II.', 'II ', '..... ESTRUCTURA')):
            target_column, skip = 'index_title_II', 3
        elif head.startswith(('III', '.... PLAN')):
            target_column, skip = 'index_title_III', 3
        else:
            # any other entry is ignored
            print('nothing')
            continue

        # Stored value is (title text, page number as printed in the index).
        title_and_page = (title_text_re.search(head[skip:]).group().strip(), parts[1])
        print(title_and_page)
        df_filtered_2.at[index, target_column] = title_and_page

    print()
    print('~~~')
    print()

In [None]:
df_filtered_2

##### Test looking for the page of title_I:

In [None]:
indexes_to_check = []

# Dry run: look for title I on the pages after the index page and report
# the rows where it cannot be found. Nothing is stored in the DataFrame.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    first_body_page = int(row['index_page']) + 1  # starting page

    for page_no in range(first_body_page, len(pages)):
        # The title, minus its last character, is used as the regex pattern.
        if re.search(row['index_title_I'][0][:-1], pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)

##### Test looking for the page of title_II:

In [None]:
indexes_to_check = []

# Dry run: look for title II on the pages after the index page and report
# the rows where it cannot be found. Nothing is stored in the DataFrame.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    first_body_page = int(row['index_page']) + 1  # starting page

    for page_no in range(first_body_page, len(pages)):
        # The title, minus its last character, is used as the regex pattern.
        if re.search(row['index_title_II'][0][:-1], pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Indexes to check', indexes_to_check)

##### Test looking for the page of title_III:

In [None]:
indexes_to_check = []

# Dry run: look for title III on the pages after the index page and report
# the rows where it cannot be found. Nothing is stored in the DataFrame.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    first_body_page = int(row['index_page']) + 1  # starting page

    for page_no in range(first_body_page, len(pages)):
        # The title, minus its last character, is used as the regex pattern.
        if re.search(row['index_title_III'][0][:-1], pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Indexes to check', indexes_to_check)

### Version 0.5 - True Title check

In [None]:
# for storing the results:
df_filtered_2['true_title_I'] = ''
df_filtered_2['true_title_II'] = ''
df_filtered_2['true_title_III'] = ''

###### true_title_I

In [None]:
indexes_to_check = []

# Locate title I in the document body and store (matched text, page number)
# in 'true_title_I'. The scan starts on the page after the index page.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    first_body_page = int(row['index_page']) + 1  # starting page

    for page_no in range(first_body_page, len(pages)):
        # The title, minus its last character, is used as the regex pattern.
        hit = re.search(row['index_title_I'][0][:-1], pages[page_no])
        if hit is None:
            continue

        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_I'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


###### true_title_II

In [None]:
indexes_to_check = []

# Locate title II in the document body and store (matched text, page number)
# in 'true_title_II'. The scan starts on the page where title I was found.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    start_page = row['true_title_I'][1]  # starting page

    for page_no in range(start_page, len(pages)):
        # The full index title is used as the regex pattern.
        hit = re.search(row['index_title_II'][0], pages[page_no])
        if hit is None:
            continue

        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_II'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


###### true_title_III

In [None]:
indexes_to_check = []

# Locate title III in the document body and store (matched text, page number)
# in 'true_title_III'. The scan starts on the page where title II was found.
for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    pages = row['Document_Content']
    start_page = row['true_title_II'][1]  # starting page

    for page_no in range(start_page, len(pages)):
        # The full index title is used as the regex pattern.
        hit = re.search(row['index_title_III'][0], pages[page_no])
        if hit is None:
            continue

        print('* pattern found at document page:', str(page_no))
        df_filtered_2.at[index, 'true_title_III'] = (hit.group(), page_no)
        print('-----------------    -----------------')
        break
    else:
        # no page matched
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


In [None]:
df_filtered_2.head()

#### Version 0.5 - Check for crossed titles

In [None]:
# Report whether the three located titles appear in increasing page order.
for index, row in df_filtered_2.iterrows():
    page_I, page_II, page_III = (
        row[column][1] for column in ('true_title_I', 'true_title_II', 'true_title_III')
    )

    if page_I < page_II < page_III:
        print('Sequence OK for index:', str(index))
    elif page_III > page_I > page_II:
        # title II was located on an earlier page than title I
        print('middle title before the first title on index:', str(index))
    else:
        print('other case on:', str(index))
        
    #if (df_filtered_2.true_title_III[index][1] - df_filtered_2.true_title_I[index][1]) > 10: # alert on cases where extension between titles is greater than 10
    #    print('File to check due to extension between titles:', df_filtered_2['Document_Name'][index])
    #    print((df_filtered_2.true_title_I[index][0], df_filtered_2.true_title_I[index][1]), (df_filtered_2.true_title_II[index][0], df_filtered_2.true_title_II[index][1]), (df_filtered_2.true_title_III[index][0], df_filtered_2.true_title_III[index][1]))
    #    print()

#### Version 0.5 - Generate the list of pages, delimited by true_title_I and true_title_II

In [None]:
df_filtered_2['lista_paginas'] = ''

# For every document, collect the text that runs from title I up to and
# including title II, one list element per page touched.
for index, row in df_filtered_2.iterrows():
    print('processing index', str(index))
    pages = row['Document_Content']
    page_ini = row['true_title_I'][1]
    page_fin = row['true_title_II'][1]

    # First page: keep everything from the start of title I onwards.
    first_page = pages[page_ini]
    opening = first_page[re.search(row['true_title_I'][0], first_page).start():]

    # Whole pages strictly between the two titles (none when they are on the
    # same or on adjacent pages).
    middle = [pages[j] for j in range(page_ini + 1, page_fin)]

    # Last page: keep everything up to the end of title II.
    last_page = pages[page_fin]
    closing_title = row['true_title_II'][0]
    closing = last_page[:last_page.find(closing_title) + len(closing_title)]

    # NOTE(review): when both titles are on the same page, 'opening' and
    # 'closing' are two overlapping slices of that page - confirm this is intended.
    df_filtered_2.at[index, 'lista_paginas'] = [opening] + middle + [closing]

In [None]:
df_filtered_2.head()

#### Version 0.5 - Load Previous Results and Generate the list of pages, delimited by true_title_I and true_title_II

In [None]:
## Load previous results (from v0.4), since an adjusted extraction process is performed
df_previous = joblib.load('./output/df_filtered_2_loans_2020-07-21_v04_Content_extracted_cleaned.joblib.bz2')
df_previous.head()

In [None]:
df_previous.drop(['extracted', 'extracted_cleaned'], axis=1, inplace=True)

In [None]:
# Build, for every previously processed document, the list of page texts that
# goes from the first match of true_title_I (on its page) up to and including
# the text of true_title_II (on its page).
# NOTE(review): each true_title_* cell is read as a (title, page_number) pair,
# as implied by the [0] / [1] indexing - confirm against the cell that fills them.
df_previous['lista_paginas'] = ''

for index, row in df_previous.iterrows():
    print('processing index', str(index))
    pages = row['Document_Content']
    title_ini, page_ini = row['true_title_I'][0], row['true_title_I'][1]
    title_fin, page_fin = row['true_title_II'][0], row['true_title_II'][1]

    # The opening title is used as a regular expression; the slice starts at its first match.
    start = re.search(title_ini, pages[page_ini]).span()[0]

    # The closing title is searched literally. When both titles share a page the
    # search starts after the opening title, so the slice cannot end before it begins.
    closing_page = pages[page_fin]
    pos = closing_page.find(title_fin, start if page_ini == page_fin else 0)
    if pos == -1:
        # Previously this produced a meaningless slice of len(title) - 1 characters.
        print('* closing title not found on page', str(page_fin), '- keeping the page up to its end')
        end = len(closing_page)
    else:
        end = pos + len(title_fin)

    if page_ini == page_fin:
        # Single page: one slice between the titles instead of appending the page twice.
        lista_pages = [closing_page[start:end]]
    else:
        lista_pages = [pages[page_ini][start:]]
        lista_pages.extend(pages[page_ini + 1:page_fin])  # full pages in between (may be none)
        lista_pages.append(closing_page[:end])

    df_previous.at[index, 'lista_paginas'] = lista_pages

In [None]:
df_previous.head()

#### Version 0.5 - Merge Loans from current version with previous results

In [None]:
# Merge Loans from version 0.5 with previous results:
df_loans = pd.concat([df_previous, df_filtered_2], ignore_index=True)

df_loans.tail()

#### Version 0.5 - Text extraction and clean-up routine

In [None]:
df_loans.shape

In [None]:
# to store the extracted content:
df_loans['extracted_v2'] = ''

In [None]:
df_loans.head()

In [None]:
# New text_extraction and clean-up routine (v2.3 - 10/17/2020)
#
# For every row of df_loans the pages in 'lista_paginas' are cleaned one by one
# (page header replaced, text cut at the first footnote-looking pattern),
# concatenated, stripped of URLs and stored in 'extracted_v2'.

# Patterns are compiled once, outside the loops.
_HEADER_RE = re.compile(r'(^\s?\-\s{0,3}[1-9]\d?\s{0,3}\-|^\-\s{5,9})')
_URL_RE = re.compile(r'https?://\S+')

# (message printed on a hit, pattern). Order matters: the first pattern that
# matches a page decides where that page is cut.
_FOOTNOTE_PATTERNS = (
    ("* Footnote pattern 1: '30+ blanks + digit' at:",
     re.compile(r'\s{30,}\d{1,2}\s+([A-Z]|http)')),
    ("* Footnote 2: '2 or 3 blanks + 1 or 2 digits' at:",
     re.compile(r'\n?\n?\n\d\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|Mar|May|Jun|Jul|Ago|Sep|Set|Oct|Nov|Dic|PMRep|IDB|months|Budget|Development)([A-Z\¿\“]|http)')),
    ("* Footnote 3: 'xa0 type' at:",
     re.compile(r'\n+\xa0+\n\d')),
)

for index, row in df_loans.iterrows():
    pages = df_loans['lista_paginas'][index]
    longitud = len(pages)
    print('### Processing index: ', str(index), ' - page range:', str(longitud))

    chunks = []
    for j, page in enumerate(pages):
        # header cleanup (the pattern is anchored at the very start of the page text):
        page = _HEADER_RE.sub(' \n ', page)

        # check for footnote and remove: keep only the text before the first hit
        for message, pattern in _FOOTNOTE_PATTERNS:
            match = pattern.search(page)
            if match is not None:
                print(message, str(j))
                chunks.append(page[:match.start()] + ' \n ')
                break
        else:
            # no footnote pattern matched: keep the whole page
            chunks.append(page + ' ')

    # Additional clean-up
    # - remove urls:
    texto = _URL_RE.sub('', ''.join(chunks))

    df_loans.at[index, 'extracted_v2'] = texto.strip()

    print()
    print()
    print('#-#-#-#')
    print()

In [None]:
print(df_loans['extracted_v2'][155])

#### Version 0.5 - supra-indexes removal

In [None]:
# for cleaned content storing:
df_loans['extracted_cleaned_v2'] = ''

In [None]:
# Footnote markers glued to the end of a word: letters (or a closing
# parenthesis / quote) followed by 1-3 digits or superscript digits and an
# optional punctuation mark.
_SUPRA_INDEX_RE = re.compile(r'[A-Za-záéíóú\)\”\"]+(\d{1,3}|[\¹\²\³\⁴\⁵\⁶\⁷\⁸\⁹\⁰]+)[\.\,\;\:]?$')

for index, row in df_loans.iterrows():
    cleaned_words = []
    for word in df_loans['extracted_v2'][index].split():
        if _SUPRA_INDEX_RE.search(word):
            # drop every digit character of the word, not only the trailing marker
            word = ''.join(ch for ch in word if not ch.isdigit())
        cleaned_words.append(word)
    # words are re-joined with single spaces, so original line breaks are not kept
    df_loans.at[index, 'extracted_cleaned_v2'] = ' '.join(cleaned_words)

In [None]:
df_loans.columns

In [None]:
df_loans['doc_type'] = 'loan'

In [None]:
df_loans.rename(columns={'true_title_I':'title_inicial', 'true_title_II': 'title_final'}, inplace=True)

In [None]:
resultado = df_loans[['doc_type','language', 'FK_OPERATION_ID', 'OPERATION_NUMBER',
       'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION', 'Document_Name',
       'Document_Content', 'title_inicial', 'title_final', 'lista_paginas',
       'extracted_v2', 'extracted_cleaned_v2']].copy()

# **************************************************************************************************************** #
<br>
<br>
<br>

#### Load dataframe containing Loans from `Digital Transformation Advisory - 01 - Document Collection` notebook

In [None]:
# Load source file:
df_base_pre = joblib.load('./output/Loans-Doc_Collection_2020-07-15_v07_.joblib.bz2')

#### `Loan Document` Reading

###### Document Location:

In [None]:
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = desktop_dir + "\\Loans_Approvals"

print(file_dir)

###### Dataframe for text processing:

In [None]:
df_base = df_base_pre[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID',
       'DOCUMENT_REFERENCE', 'DESCRIPTION', 'DOCUMENT_NAME', 'Document_Name', 'Document_Status']].copy()

In [None]:
df_base['Document_Content'] = ''

In [None]:
df_base.head()

In [None]:
df_base.Document_Status.value_counts()

###### Read the documents and store the content in the dataframe:

In [None]:
%%time

# Read every available PDF with Tika and store its text, one string per page,
# in df_base['Document_Content']. Rows without a document get the placeholder
# 'not available' and their index is collected in indexes_to_remove.
doc_count = 0
indexes_to_remove = []
for index, row in df_base.iterrows():
    print("## Processing item:", str(index))
    filename = file_dir + '\\' + df_base.Document_Name[index]

    # A name ending in 'found' or 'downloaded' marks a missing document.
    # The previous test `not(a) | (b)` already evaluated as `not (a or b)`
    # because `not` binds more loosely than `|`; it is now spelled out.
    # NOTE(review): presumably the values are '... not found' / '... not downloaded' - confirm.
    if str(filename).endswith(('found', 'downloaded')):
        print("Document not available")
        df_base.at[index, 'Document_Content'] = 'not available'
        print('------')
        print()
        indexes_to_remove.append(int(index))
        continue

    # Read PDF file; xmlContent=True returns XHTML in which the pages are looked
    # up below as <div class="page"> elements.
    data = parser.from_file(filename, xmlContent=True)
    # NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks whichever is
    # installed and emits a warning - consider pinning one.
    xhtml_data = BeautifulSoup(data['content'])

    pages_txt = []
    for content in xhtml_data.find_all('div', attrs={'class': 'page'}):
        # Each page is sent back to Tika on its own to get its plain text.
        parsed_content = parser.from_buffer(str(content))
        page_text = parsed_content['content']
        # Tika returns None for a blank page; store it as an empty string.
        pages_txt.append(page_text.strip() if page_text is not None else '')

    # save results and report status:
    df_base.at[index, 'Document_Content'] = pages_txt
    doc_count += 1
    print()
    print("Completed doc index:", str(index), "Document number:", str(doc_count))
    print('------')
    print()

print()
print('-------')
print('Indexes to remove:', str(indexes_to_remove))

In [None]:
df_base.head(10)

In [None]:
df_base.Document_Content[6]

In [None]:
df_base.Document_Status.value_counts()

In [None]:
#v0.3 - Store content
f_base = 'Loans_Documents-full_content_v01_2020-07-15.joblib'
joblib.dump(df_base, './output/' + f_base + '.bz2', compress=('bz2', 3))

In [None]:
# **************************************************************************************************************** #

In [None]:
df_base['blank_pages'] = ''

In [None]:
# **************************************************************************************************************** #

In [None]:
df_base.loc[indexes_to_remove]

##### The following indexes are removed from the dataframe: 

- [119, 120, 240, 249, 299, 300, 501, 542, 650, 651]
<br>

In [None]:
# indexes_to_remove since no document was downloaded:
df_base.drop([119, 120, 240, 249, 299, 300, 501, 542, 650, 651], inplace=True)

In [None]:
# additional indexes to remove: 

- 301, 'DR-L1125_Contingent Loan for Natural Disaster... ' is just a one page resolution.
<br>

In [None]:
df_base.drop([301], inplace=True)

In [None]:
# **************************************************************************************************************** #

##### Blank Pages (%) calculation

In [None]:

# Percentage of blank pages (pages whose text is the empty string) per document,
# stored in df_base['blank_pages'].
for index, row in df_base.iterrows():
    print('## Processing index', str(index))
    lista = df_base['Document_Content'][index]
    count = sum(1 for page_text in lista if page_text == '')

    # Stored as a number so it can be binned and plotted as a percentage; it is
    # still rounded to 4 significant digits as before. A document with no pages
    # gets 0.0 instead of raising ZeroDivisionError.
    if len(lista) > 0:
        percentage = float(format(count / len(lista) * 100, '.4g'))
    else:
        percentage = 0.0

    df_base.at[index, 'blank_pages'] = percentage
    print(str(count))
    print('')

##### Page count

In [None]:
# Number of entries in each document's Document_Content (one entry per page).
df_base['page_count'] = df_base['Document_Content'].map(len)

In [None]:
df_base.head(10)

In [None]:
# **************************************************************************************************************** #

In [None]:
import matplotlib.pyplot as plt

In [None]:
import pylab as P

In [None]:
# Plotting the document length distribution:

P.figure()
# Histogram of the page count per document, drawn as bars (histtype='bar').
# NOTE(review): `bins` is read here but not assigned in this cell; confirm it is
# defined by an earlier-executed cell, otherwise this line raises NameError.
# After the call, `bins` is rebound to the bin edges returned by hist().
n, bins, patches = P.hist(df_base.page_count.to_list(), bins, histtype='bar', rwidth=0.8)

In [None]:
blank_pages_per_document = df_base.blank_pages.tolist()

In [None]:
# Plotting the document blank pages (%) distribution:

P.figure()
# Histogram of the blank-page percentage per document, drawn as bars (histtype='bar').
# Bin edges split the 0-100 % range into five 20-point intervals.
bins = [0, 20, 40, 60, 80, 100]
n, bins, patches = P.hist(df_base.blank_pages.to_list(), bins, histtype='bar', rwidth=0.8, color='g')

In [None]:
# **************************************************************************************************************** #

## Filtering 

#### Step_1

As of 07/20, <b>consider</b> files under the following conditions:
* documents with page_count > 10

In [None]:
df_filtered = df_base[(df_base.page_count > 10)].copy()
df_filtered

In [None]:
# Plotting the new results:
P.figure()
# Histogram of the page count of the filtered documents, drawn as bars (histtype='bar').
# `bins` is whatever a previously executed cell left in that name.
n, bins, patches = P.hist(df_filtered.page_count.to_list(), bins, histtype='bar', rwidth=0.8, color='g')

In [None]:
# **************************************************************************************************************** #

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [None]:
plt.figure(figsize=(12.8,6))
sns.distplot(df_filtered.page_count, color='g').set_title('Document length distribution (in pages)');

In [None]:
plt.figure(figsize=(12.8,6))
sns.distplot(df_filtered.blank_pages).set_title('Blank Pages distribution (% of blank pages)');

In [None]:
# **************************************************************************************************************** #

In [None]:
# **************************************************************************************************************** #

In [None]:
# **************************************************************************************************************** #

#### Step_2 


In [None]:
# Copy of the previous result to work with:
df_filtered_2 = df_filtered.copy()

In [None]:
# identify & get the index page()

In [None]:
# stores the index language
df_filtered_2['language'] = ''
# stores the index page
df_filtered_2['index_page'] = ''

In [None]:
# Locate the table-of-contents ("index") page of every document and record its
# language. The first page whose text matches the English or the Spanish pattern
# wins; documents without a match are collected in to_review.

# English index page. The optional "- ii -" prefix used to be written `\-s+`
# (a hyphen followed by the letter "s"), which could never match the intended
# "- ii - " header; it is now `\-\s+`.
_INDEX_PAGE_EN = re.compile(
    r'(^(\-(\s+)?ii\s+\-\s+)?CONTENTS?\s+(PROJECT|PROGRAM) SUMMARY|^(\-\s+ii\s+\-\s+)?CONTENTS?\s+I\.|^\-(\s+)?ii(\s+)?\-\s+CONTENTS\s+PROJECT\s+SUMMARY\s+)',
    re.IGNORECASE)
# Spanish index page.
_INDEX_PAGE_ES = re.compile(
    r'((Í|I)NDICE\s+RESUM\s?EN (DEL? (PROYECTO|PROGRAMA)|EJECUTIVO)(\.?\…+|\s+|\.+)|ÍNDICE\s+(\d\s+)?I\.)',
    re.IGNORECASE)

to_review = []
loan_count = 0
for index, row in df_filtered_2.iterrows():
    is_loan = False
    for page, page_text in enumerate(df_filtered_2.Document_Content[index]):
        # English is tried first, Spanish only when English does not match.
        match = _INDEX_PAGE_EN.search(page_text)
        if match is not None:
            language, label = 'en', 'English'
        else:
            match = _INDEX_PAGE_ES.search(page_text)
            language, label = 'es', 'Spanish'

        if match is None:
            continue

        print('index', str(index))
        print(label + ' - index page found at page:', str(page))
        loan_count += 1
        is_loan = True
        df_filtered_2.at[index, 'language'] = language
        match_title_type = match.group()
        df_filtered_2.at[index, 'index_page'] = page
        print(match_title_type, page)
        print('~ ~ ~')
        break  # only the first index page of a document is recorded

    if not is_loan:
        print('check regex on:', str(index))
        to_review.append(index)

print('Loans identified:', str(loan_count))

In [None]:
# indexes of documents to review:
#len(to_review)

df_filtered_2.loc[to_review]

In [None]:
df_filtered_2.head(35)

In [None]:
df_filtered_2.language.value_counts()

In [None]:
len(df_filtered_2.OPERATION_NUMBER.unique())

# **************************************************************************************************************** #
<br>
<br>
<br>

## Storing -intermediate- Results

#### Save results:

In [None]:
# Version 0.7: all loans
f_df_resultado_loans = 'df_resultado_loans_2021-01-14_v07.joblib'
joblib.dump(df_loans, './output/' + f_df_resultado_loans + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.7: all documents to Excel:
df_loans.to_excel('loans_docs_2021-01-14_v07.xlsx')

In [None]:
#### ~ ~ ~

In [None]:
# Version 0.6: all loans
f_df_resultado_loans = 'df_resultado_loans_2020-11-04_v06.joblib'
joblib.dump(df_loans, './output/' + f_df_resultado_loans + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.6: all documents to Excel:
df_loans.to_excel('loans_docs_2020-11-04_v06.xlsx')

In [None]:
#### ~ ~ ~

In [None]:
# Version 0.5: all loans
f_df_resultado_loans = 'df_resultado_loans_2020-10-19_v05.joblib'
joblib.dump(resultado, './output/' + f_df_resultado_loans + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.5: all documents to Excel:
resultado.to_excel('loans_docs_2020-10-19_v05.xlsx')

In [None]:
#### ~ ~ ~

In [None]:
#v0.4: Extracted and cleaned 1st part
f_df_filtered_2_v04 = 'df_filtered_2_loans_2020-07-21_v04_Content_extracted_cleaned.joblib'

joblib.dump(df_filtered_2, './output/' + f_df_filtered_2_v04 + '.bz2', compress=('bz2', 3))#

In [None]:
#### ~ ~ ~

In [None]:
#v0.3: completed titles recognition from the index page - (some documents had titles modification in order to have a matching condition)
f_df_filtered_2_v03 = 'df_filtered_2_loans_2020-07-21_v03.joblib'

joblib.dump(df_filtered_2, './output/' + f_df_filtered_2_v03 + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.2: loan read, index page and language detection
f_df_filtered_2_v02 = 'df_filtered_2_loans_2020-07-21_v02.joblib'

joblib.dump(df_filtered_2, './output/' + f_df_filtered_2_v02 + '.bz2', compress=('bz2', 3))#

In [None]:
#v0.1: loan read, index page and language detection
f_df_filtered_2_v01 = 'df_filtered_2_loans_2020-07-16_v01.joblib'

joblib.dump(df_filtered_2, './output/' + f_df_filtered_2_v01 + '.bz2', compress=('bz2', 3))#

### ***********************************
<br>
<br>

##### Load previous/intermediate results (v02):

In [None]:
import joblib

In [None]:
# Load source file:
df_filtered_2 = joblib.load('./output/df_filtered_2_loans_2020-07-21_v02.joblib.bz2')

In [None]:
df_filtered_2.head()

# **************************************************************************************************************** #
<br>
<br>
<br>

## Read index page and Titles Search

#### Manually adjusted: 
##### index 85

In [None]:
df_filtered_2.Document_Name[85]

In [None]:
# Manual change of the index page (page 3) of document
# `ME-L1258_México. Propuesta de préstamo para el proyecto “Fortalecimiento de la Gestión de las Políticas de Promoción al Empleo”.pdf`:
# a number is appended after three section titles (presumably the page number
# missing from the index - confirm).
lista_aux = df_filtered_2['Document_Content'][85]
for section_title, number in (
    ('DESCRIPCIÓN DEL PROYECTO Y MONITOREO DE RESULTADOS', '2'),
    ('ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS', '16'),
    ('PLAN DE IMPLEMENTACIÓN Y GESTIÓN', '18'),
):
    lista_aux[3] = lista_aux[3].replace(section_title, section_title + ' ' + number)
df_filtered_2.at[85, 'Document_Content'] = lista_aux

In [None]:
print(df_filtered_2.Document_Content[85][3])

In [None]:
# to store key index titles: 
df_filtered_2['index_titles'] = ''

In [None]:
# Key titles are extracted from the index page together with the number that
# follows them: a run of I / V / '.' characters, an upper-case title, optional
# leader dots/dashes and one or two digits.
_KEY_TITLE_RE = re.compile(r'[IV\.]{1,5}\s+[A-ZÁÉÍÓÚ\s\,\n]+[\.\s\-\…]{0,200}\d\d?')

for index, row in df_filtered_2.iterrows():
    print('*Processing index:', str(index))
    index_page_text = df_filtered_2.Document_Content[index][df_filtered_2.index_page[index]]
    key_titles = _KEY_TITLE_RE.findall(index_page_text)
    print(key_titles)
    if key_titles:
        df_filtered_2.at[index, 'index_titles'] = key_titles
    else:
        # nothing is stored: the cell keeps its previous value
        print('Found empty list on:', str(index))

    print("~~~")
    print()

<br>
<br>

In [None]:
# for storing the results:
df_filtered_2['index_title_I'] = ''
df_filtered_2['index_title_II'] = ''
df_filtered_2['index_title_III'] = ''

In [None]:
# iterate over the index titles and get main titles and pages:
for index, row in df_filtered_2.iterrows():
    print('* Processing index:', str(index))
    for i in range(0,len(df_filtered_2.index_titles[index])):
        resultado = tuple(re.findall(r'[A-ZÁÉÍÓÚ\.\s\-\…\,\n]+|\d+', df_filtered_2.index_titles[index][i]))
        #ini:
        if (resultado[0].startswith('I.') or 'DESCRIP' in resultado[0]):
            aux = (re.search(r'[A-ZÁÉÍÓÚ\s\,\n]+', resultado[0][2:]).group().strip(),resultado[1])
            print(aux)
            df_filtered_2.at[index, 'index_title_I'] = aux
            del aux
            del resultado
        #medio:
        elif (resultado[0].startswith('II.') or resultado[0].startswith('II ') or resultado[0].startswith('..... ESTRUCTURA')):
            aux = (re.search(r'[A-ZÁÉÍÓÚ\s\,\n]+', resultado[0][3:]).group().strip(),resultado[1])
            print(aux)
            df_filtered_2.at[index, 'index_title_II'] = aux
            del aux
            del resultado
        #fin:
        elif (resultado[0].startswith('III') or resultado[0].startswith('.... PLAN')):
            aux = (re.search(r'[A-ZÁÉÍÓÚ\s\,\n]+', resultado[0][3:]).group().strip(),resultado[1])
            print(aux)
            df_filtered_2.at[index, 'index_title_III'] = aux
            del aux
            del resultado
        else:
            # do nothing
            print('nothing')
        
    #del aux
    print()
    print('~~~')
    print()

In [None]:
df_filtered_2.head()

##### Test looking for the page of title_I:

In [None]:
# Check that index_title_I can be located in the pages that follow the index
# page; rows where it cannot are collected in indexes_to_check.
indexes_to_check = []

for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = int(df_filtered_2['index_page'][index]) + 1 # search starts right after the index page
    doc_pages = df_filtered_2['Document_Content'][index]
    title_entry = df_filtered_2['index_title_I'][index]

    for page_no in range(first_page, len(doc_pages)):
        # the title text minus its last character is used as a regular expression
        if re.search(title_entry[0][:-1], doc_pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # every page was scanned without a hit
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)

##### Test looking for the page of title_II:

In [None]:
# Check that index_title_II can be located in the pages that follow the index
# page; rows where it cannot are collected in indexes_to_check.
indexes_to_check = []

for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = int(df_filtered_2['index_page'][index]) + 1 # search starts right after the index page
    doc_pages = df_filtered_2['Document_Content'][index]
    title_entry = df_filtered_2['index_title_II'][index]

    for page_no in range(first_page, len(doc_pages)):
        # the title text minus its last character is used as a regular expression
        if re.search(title_entry[0][:-1], doc_pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # every page was scanned without a hit
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Indexes to check', indexes_to_check)

##### Test looking for the page of title_III:

In [None]:
# Check that index_title_III can be located in the pages that follow the index
# page; rows where it cannot are collected in indexes_to_check.
indexes_to_check = []

for index, row in df_filtered_2.iterrows():
    print()
    print('Processsing index:', str(index))

    first_page = int(df_filtered_2['index_page'][index]) + 1 # search starts right after the index page
    doc_pages = df_filtered_2['Document_Content'][index]
    title_entry = df_filtered_2['index_title_III'][index]

    for page_no in range(first_page, len(doc_pages)):
        # the title text minus its last character is used as a regular expression
        if re.search(title_entry[0][:-1], doc_pages[page_no]) is not None:
            print('* pattern found at document page:', str(page_no))
            print('-----------------    -----------------')
            break
    else:
        # every page was scanned without a hit
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Indexes to check', indexes_to_check)

In [None]:
index = 263
print(df_filtered_2['index_titles'][index])
df_filtered_2['Document_Content'][index][7]

In [None]:
print(repr(df_filtered_2['Document_Content'][index][3]))

### Document adjustments:

#### The following documents have index-page titles with no match in the document contents:

(Manually replaced content where indicated)

Due to `title_I`:

- index 99: 'GU-L1163_Guatemala. Loan proposal for the “Program to Strengthen the Institutional Healthcare Service Network (PRORISS).pdf' - <b>Solution: </b> replaced title in content at page 7
- index 336: 'BL-L1029_Belize. Propuesta de préstamo para un “Financiamiento Adicional para el Proyecto de Rehabilitación de la Carretera George Price”.pdf' - <b>Solution: </b> replaced title in content at page 7
- index 357: 'EC-L1235_Ecuador. Proposal for a loan for the project “Investment in The Quality of Child Development Services”.pdf' - <b>Solution: </b> replaced title in content at page 7
- index 368: 'BO-L1198_Bolivia. Propuesta de préstamo para el “Programa de Mejora en la Accesibilidad a los.pdf' - <b>Solution: </b> replaced title in content at page 7
- index 475: 'JA-L1085_Jamaica. Propuesta de Línea de Crédito Condicional para Proyectos de Inversión (CCLIP) para el “Programa de Impulso a la Innovación, el Crecimiento y los Ecosistemas Empresariales”~s Empresariales”.pdf' - <b>Solution: </b> replaced title in content at page 8
- index 509: 'UR-L1153_Uruguay. Propuesta de Línea de Crédito Condicional para Proyectos de Inversión (CCLIP) para el “Programa de Mejora de Corredores Viales de Uso Agroindustrial y Forestal” y primer p~rial y Forestal”.pdf' - <b>Solution: </b> replaced title in content at page 8
- index 525: 'BH-L1046_Bahamas. Propuesta de préstamo para el “Programa para Reforzar la Calidad Crediticia de Micro, Pequeñas y Medianas Empresas”.pdf' - <b>Solution: </b> replaced title in content at page 7
- index 643: 'TT-L1058_Trinidad y Tobago. Propuesta de préstamo para el “Programa de Fortalecimiento de la Política Pública y la Gestión Fiscal para la Atención de la Crisis Sanitaria y Económica Causada~inidad y Tobago”.pdf' - <b>Solution: </b> replaced title in content at page 7

<br>
Due to `title_II`:
indexes to review/modify: [15, 83, 85, 336, 473, 475, 509, 561]


<br>
Due to `title_III`:
indexes to review/modify: [53, 132, 146, 148, 158, 182, 336, 432, 475, 525]


<br>
Also, removed from the dataframe: 
- index 304: 'CR-L1135_LP - Paquete aprobado - CR-L1135.pdf'
<br>

##### Due to `title_III`:

In [None]:
# Row 525, page 25: replace the section title text on that page.
index, page = 525, 25
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
new_title = 'PLAN DE EJECUCIÓN Y ADMINISTRACIÓN DEL PROGRAMA'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 475, page 27: replace the section title text on that page.
index, page = 475, 27
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'PLAN DE EJECUCIÓN Y ADMINISTRACIÓN'
new_title = 'PLAN DE EJECUCIÓN Y ADMINISTRACIÓN DEL PROYECTO'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 432, page 24: replace the section title text on that page.
index, page = 432, 24
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
new_title = 'PLAN DE IMPLEM ENTACIÓN Y GESTIÓN'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 336, page 26: replace the section title text on that page.
index, page = 336, 26
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'PLAN DE EJECUCIÓN Y ADMINISTRACIÓN'
new_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 182, page 25: upper-case the section title text on that page.
index, page = 182, 25
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'Plan de Implementación y Gestión'
new_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 158, page 27: replace the section title text on that page.
index, page = 158, 27
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
new_title = 'PLAN DE IMPLEM ENTACIÓN Y GESTIÓN'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 148, page 20: replace the section title text on that page.
index, page = 148, 20
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
new_title = 'PLAN DE IMPLEM ENTACIÓN Y GESTIÓN'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 146, page 26: replace the section title text on that page.
index, page = 146, 26
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
new_title = 'IMPLEMENTACIÓN Y PLAN DE GESTIÓN'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 132, page 23: replace the section title text on that page.
index, page = 132, 23
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
new_title = 'PLAN DE IMPLEM ENTACIÓN Y GESTIÓN'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 53, page 21: upper-case the section title text on that page.
index, page = 53, 21
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'Plan de Implementación y Gestión'
new_title = 'PLAN DE IMPLEMENTACIÓN Y GESTIÓN'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
#######################

##### Due to `title_II`:

In [None]:
# Row 561, page 19: upper-case the section title text on that page.
index, page = 561, 19
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'Estructura de Financiamiento y Principales Riesgos'
new_title = 'ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 509, page 16: replace the section title text on that page.
index, page = 509, 16
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'ESTRUCTURA FINANCIERA Y PRINCIPALES RIESGOS'
new_title = 'ESTRUCTURA FINANCIERA Y RIESGOS PRINCIPALES'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 475, page 23: replace the section title text on that page.
index, page = 475, 23
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS'
new_title = 'ESTRUCTURA DE FINANCIAMIENTO Y RIESGOS PRINCIPALES'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 473, page 27: replace the section title text on that page.
index, page = 473, 27
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'ESTRUCTURA DE FINANCIAMIENTO Y RIESGOS'
new_title = 'ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Row 336, page 23: replace the section title text on that page.
index, page = 336, 23
lista_aux = df_filtered_2['Document_Content'][index]
old_title = 'ESTRUCTURA DE FINANCIAMIENTO Y RIESGOS PRINCIPALES'
new_title = 'ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS'
lista_aux[page] = lista_aux[page].replace(old_title, new_title)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Document 85, page 22: replace the 'DEL PRÉSTAMO' wording of the section heading.
# NOTE(review): presumably aligns the heading with the index_title_II entry - confirm.
index, page = 85, 22
lista_aux = df_filtered_2.at[index, 'Document_Content']  # per-page texts of the document
lista_aux[page] = lista_aux[page].replace(
    'ESTRUCTURA DEL PRÉSTAMO Y PRINCIPALES RIESGOS',
    'ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS',
)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Document 83, page 29: expand the section heading with 'PRINCIPALES'.
# NOTE(review): presumably aligns the heading with the index_title_II entry - confirm.
index, page = 83, 29
lista_aux = df_filtered_2.at[index, 'Document_Content']  # per-page texts of the document
lista_aux[page] = lista_aux[page].replace(
    'ESTRUCTURA DE FINANCIAMIENTO Y RIESGOS',
    'ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS',
)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
# Document 15, page 18: replace the 'DEL PRÉSTAMO' wording of the section heading.
# NOTE(review): presumably aligns the heading with the index_title_II entry - confirm.
index, page = 15, 18
lista_aux = df_filtered_2.at[index, 'Document_Content']  # per-page texts of the document
lista_aux[page] = lista_aux[page].replace(
    'ESTRUCTURA DEL PRÉSTAMO Y PRINCIPALES RIESGOS',
    'ESTRUCTURA DE FINANCIAMIENTO Y PRINCIPALES RIESGOS',
)
df_filtered_2.at[index, 'Document_Content'] = lista_aux

In [None]:
#######################

##### Due to `title_I`:

In [None]:
# Manual change of the index page in document
# 'GU-L1163_Guatemala. Loan proposal for the “Program to Strengthen the Institutional Healthcare Service Network (PRORISS).pdf':
# row 99, page 7 - 'PROJECT ...' becomes 'PROGRAM ...'.
lista_aux = df_filtered_2.at[99, 'Document_Content']  # per-page texts of the document
lista_aux[7] = lista_aux[7].replace(
    'PROJECT DESCRIPTION AND RESULTS MONITORING',
    'PROGRAM DESCRIPTION AND RESULTS MONITORING',
)
df_filtered_2.at[99, 'Document_Content'] = lista_aux

In [None]:
# Manual change of title based on the index page in document
# 'BL-L1029_Belize. Propuesta de préstamo para un “Financiamiento Adicional para el Proyecto de Rehabilitación de la Carretera George Price”.pdf':
# row 336, page 7 - 'SEGUIMIENTO' becomes 'SUPERVISIÓN'.
lista_aux = df_filtered_2.at[336, 'Document_Content']  # per-page texts of the document
lista_aux[7] = lista_aux[7].replace(
    'DESCRIPCIÓN Y SEGUIMIENTO DE RESULTADOS',
    'DESCRIPCIÓN Y SUPERVISIÓN DE RESULTADOS',
)
df_filtered_2.at[336, 'Document_Content'] = lista_aux

In [None]:
# Manual change of title based on the index page in document
# 'EC-L1235_Ecuador. Proposal for a loan for the project “Investment in The Quality of Child Development Services”.pdf':
# row 357, page 7 - the title gets the 'PROJECT ' prefix.
lista_aux = df_filtered_2.at[357, 'Document_Content']  # per-page texts of the document
# The new title contains the old one and the list is modified in place, so a
# plain str.replace would produce 'PROJECT PROJECT ...' when this cell is run
# twice. The negative lookbehind skips occurrences that already carry the prefix.
lista_aux[7] = re.sub(
    r'(?<!PROJECT )DESCRIPTION AND RESULTS MONITORING',
    'PROJECT DESCRIPTION AND RESULTS MONITORING',
    lista_aux[7],
)
df_filtered_2.at[357, 'Document_Content'] = lista_aux

In [None]:
# Manual change of title based on the index page in document
# 'BO-L1198_Bolivia. Propuesta de préstamo para el “Programa de Mejora en la Accesibilidad a los.pdf':
# row 368, page 7 - 'DEL PROGRAMA' becomes 'DEL PROYECTO'.
lista_aux = df_filtered_2.at[368, 'Document_Content']  # per-page texts of the document
lista_aux[7] = lista_aux[7].replace(
    'DESCRIPCIÓN DEL PROGRAMA Y MONITOREO DE RESULTADOS',
    'DESCRIPCIÓN DEL PROYECTO Y MONITOREO DE RESULTADOS',
)
df_filtered_2.at[368, 'Document_Content'] = lista_aux

In [None]:
# Manual change of title based on the index page in document
# 'JA-L1085_Jamaica. Propuesta de Línea de Crédito Condicional para Proyectos de Inversión (CCLIP) para el “Programa de Impulso a la Innovación, el Crecimiento y los Ecosistemas Empresariales”~s Empresariales”.pdf':
# row 475, page 8 - the title is extended to '... DE LOS RESULTADOS DEL PROGRAMA'.
lista_aux = df_filtered_2.at[475, 'Document_Content']  # per-page texts of the document
lista_aux[8] = lista_aux[8].replace(
    'DESCRIPCIÓN Y SEGUIMIENTO DE RESULTADOS',
    'DESCRIPCIÓN Y SEGUIMIENTO DE LOS RESULTADOS DEL PROGRAMA',
)
df_filtered_2.at[475, 'Document_Content'] = lista_aux

In [None]:
# Manual change of title based on the index page in document
# 'UR-L1153_Uruguay. Propuesta de Línea de Crédito Condicional para Proyectos de Inversión (CCLIP) para el “Programa de Mejora de Corredores Viales de Uso Agroindustrial y Forestal” y primer p~rial y Forestal”.pdf':
# row 509, page 8 - 'DEL PROYECTO' is dropped from the title.
lista_aux = df_filtered_2.at[509, 'Document_Content']  # per-page texts of the document
lista_aux[8] = lista_aux[8].replace(
    'DESCRIPCIÓN DEL PROYECTO Y MONITOREO DE RESULTADOS',
    'DESCRIPCIÓN Y MONITOREO DE RESULTADOS',
)
df_filtered_2.at[509, 'Document_Content'] = lista_aux

In [None]:
# Manual change of title based on the index page in document
# 'BH-L1046_Bahamas. Propuesta de préstamo para el “Programa para Reforzar la Calidad Crediticia de Micro, Pequeñas y Medianas Empresas”.pdf':
# row 525, page 7 - the title is reworded to '... DE LOS RESULTADOS DEL PROYECTO'.
lista_aux = df_filtered_2.at[525, 'Document_Content']  # per-page texts of the document
lista_aux[7] = lista_aux[7].replace(
    'DESCRIPCIÓN DEL PROYECTO Y SEGUIMIENTO DE RESULTADOS',
    'DESCRIPCIÓN Y SEGUIMIENTO DE LOS RESULTADOS DEL PROYECTO',
)
df_filtered_2.at[525, 'Document_Content'] = lista_aux

In [None]:
# Manual change of title based on the index page in document
# 'TT-L1058_Trinidad y Tobago. Propuesta de préstamo para el “Programa de Fortalecimiento de la Política Pública y la Gestión Fiscal para la Atención de la Crisis Sanitaria y Económica Causada~inidad y Tobago”.pdf':
# row 643, page 7 - the title is reworded to 'OBJETIVOS, DESCRIPCIÓN Y ...'.
lista_aux = df_filtered_2.at[643, 'Document_Content']  # per-page texts of the document
lista_aux[7] = lista_aux[7].replace(
    'DESCRIPCIÓN Y SEGUIMIENTO DE RESULTADOS DEL PROYECTO',
    'OBJETIVOS, DESCRIPCIÓN Y SEGUIMIENTO DE RESULTADOS',
)
df_filtered_2.at[643, 'Document_Content'] = lista_aux

In [None]:
#######################

In [None]:
# Row 304 ('CR-L1135_LP - Paquete aprobado - CR-L1135.pdf', referring to
# Nate Storm) is removed from the dataframe: the document has a different format.
df_filtered_2.drop(index=[304], inplace=True)

In [None]:
# **************************************************************************************************************** #

### Titles results

In [None]:
# Preview the first rows of the working dataframe.
df_filtered_2.head()

In [None]:
# Result columns for the title search; every row starts with an empty string.
for title_column in ('true_title_I', 'true_title_II', 'true_title_III'):
    df_filtered_2[title_column] = ''

###### true_title_I

In [None]:
# Identify the true_title_I location: for every document, scan the pages that
# follow the index page and store the first match of the section I title as a
# (matched text, page number) tuple in 'true_title_I'.
indexes_to_check = []  # documents where no page matched

for index, row in df_filtered_2.iterrows():
    print()
    pattern_found = False
    print('Processsing index:', str(index))

    page_base = int(df_filtered_2['index_page'][index]) + 1  # starting page
    pages = df_filtered_2['Document_Content'][index]
    # The index title is used as a regex with its last character dropped.
    title_pattern = df_filtered_2['index_title_I'][index][0][:-1]

    for i in range(page_base, len(pages)):
        # Search once and reuse the match object instead of searching twice.
        match = re.search(title_pattern, pages[i])
        if match is not None:  # pattern found
            print('* pattern found at document page:', str(i))
            ## storing:
            inicial_match_title = match.group()
            inicial_match_page = i
            df_filtered_2.at[index, 'true_title_I'] = (inicial_match_title, inicial_match_page)
            ##
            print('-----------------    -----------------')
            pattern_found = True
            break

    if not pattern_found:
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


In [None]:
# Spot check: value stored in 'true_title_I' for the row labelled 2.
df_filtered_2['true_title_I'][2]

###### true_title_II

In [None]:
# Identify the true_title_II location: for every document, scan from the page
# where title I was found and store the first match of the section II title as
# a (matched text, page number) tuple in 'true_title_II'.
indexes_to_check = []  # documents where no page matched

for index, row in df_filtered_2.iterrows():
    print()
    pattern_found = False
    print('Processsing index:', str(index))

    previous_title = df_filtered_2['true_title_I'][index]
    if not previous_title:
        # 'true_title_I' still holds its '' placeholder (title I was never
        # located); indexing it with [1] would raise IndexError, so flag the
        # document for manual review and move on.
        print('check regex on:', str(index))
        indexes_to_check.append(index)
        continue

    page_base = previous_title[1]  # starting page
    pages = df_filtered_2['Document_Content'][index]
    title_pattern = df_filtered_2['index_title_II'][index][0]

    for i in range(page_base, len(pages)):
        # Search once and reuse the match object instead of searching twice.
        match = re.search(title_pattern, pages[i])
        if match is not None:  # pattern found
            print('* pattern found at document page:', str(i))
            ## storing:
            inicial_match_title = match.group()
            inicial_match_page = i
            df_filtered_2.at[index, 'true_title_II'] = (inicial_match_title, inicial_match_page)
            ##
            print('-----------------    -----------------')
            pattern_found = True
            break

    if not pattern_found:
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


###### true_title_III

In [None]:
# Identify the true_title_III location: for every document, scan from the page
# where title II was found and store the first match of the section III title
# as a (matched text, page number) tuple in 'true_title_III'.
indexes_to_check = []  # documents where no page matched

for index, row in df_filtered_2.iterrows():
    print()
    pattern_found = False
    print('Processsing index:', str(index))

    previous_title = df_filtered_2['true_title_II'][index]
    if not previous_title:
        # 'true_title_II' still holds its '' placeholder (title II was never
        # located); indexing it with [1] would raise IndexError, so flag the
        # document for manual review and move on.
        print('check regex on:', str(index))
        indexes_to_check.append(index)
        continue

    page_base = previous_title[1]  # starting page
    pages = df_filtered_2['Document_Content'][index]
    title_pattern = df_filtered_2['index_title_III'][index][0]

    for i in range(page_base, len(pages)):
        # Search once and reuse the match object instead of searching twice.
        match = re.search(title_pattern, pages[i])
        if match is not None:  # pattern found
            print('* pattern found at document page:', str(i))
            ## storing:
            inicial_match_title = match.group()
            inicial_match_page = i
            df_filtered_2.at[index, 'true_title_III'] = (inicial_match_title, inicial_match_page)
            ##
            print('-----------------    -----------------')
            pattern_found = True
            break

    if not pattern_found:
        print('check regex on:', str(index))
        indexes_to_check.append(index)
print('Index to check', indexes_to_check)


In [None]:
# Preview the first rows, now including the true_title_* columns.
df_filtered_2.head()

#### check for crossed titles

In [None]:
# Report, per document, whether the pages of titles I, II and III are in
# strictly increasing order.
for index, row in df_filtered_2.iterrows():
    titles = (
        df_filtered_2.true_title_I[index],
        df_filtered_2.true_title_II[index],
        df_filtered_2.true_title_III[index],
    )
    if not all(titles):
        # At least one column still holds its '' placeholder; indexing it with
        # [1] would raise IndexError and stop the whole check.
        print('title not located on index:', str(index))
        continue

    first_page, second_page, third_page = (title[1] for title in titles)

    if first_page < second_page < third_page:
        print('Sequence OK for index:', str(index))

    elif third_page > first_page > second_page:
        print('middle title before the first title on index:', str(index))

    else:
        print('other case on:', str(index))

    #if (df_filtered_2.true_title_III[index][1] - df_filtered_2.true_title_I[index][1]) > 10: # alert on cases where extension between titles is greater than 10
    #    print('File to check due to extension between titles:', df_filtered_2['Document_Name'][index])
    #    print((df_filtered_2.true_title_I[index][0], df_filtered_2.true_title_I[index][1]), (df_filtered_2.true_title_II[index][0], df_filtered_2.true_title_II[index][1]), (df_filtered_2.true_title_III[index][0], df_filtered_2.true_title_III[index][1]))
    #    print()

In [None]:
# Show the row labelled 2 as a one-row dataframe.
df_filtered_2.loc[[2]]

In [None]:
### OK - save v0.3 07/21/2020

#### footer and header clean-up

In [None]:
# (rows, columns) of the working dataframe before the clean-up.
df_filtered_2.shape

In [None]:
# Column that will hold the text extracted between title I and title II;
# every row starts with an empty string.
df_filtered_2['extracted'] = ''

In [None]:
# Testing section (pre v1.0)
# Debug run of the header/footnote clean-up on a single document; every raw
# page is printed between separator lines.
separator = '+ - - - + - - - + - - - + - - - + - - - + - - - + - - - + - - - + - - - + - - - + - - - + '
# leading '- N -' style marker at the very start of a page (presumably the page number)
header_regex = re.compile(r'^\s?\-\s{0,3}\d\d?\s{0,3}\-')
# footnote detectors, tried in this order; the first hit cuts the page there
footnote_rules = (
    (re.compile(r'\s{30,}\d{1,2}\s+[A-Z]'),
     '* Footnote pattern 1: \'30+ blanks + digit\' at:'),
    # footnotes - pending
    (re.compile(r'\n?\n?\n\d\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|May|Jun|Jul|Ago|Sep|Set|Oct|Nov|Dic|PMRep)([A-Z\¿\“]|http)'),
     '* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:'),
    (re.compile(r'\n+\xa0+\n\d'),
     '* Footnote 3: \'xa0 type\' at:'),
)
url_regex = re.compile(r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;]+')

for index in [669]:
#for index, row in df_filtered_2.iterrows():
    page_ini = df_filtered_2.true_title_I[index][1]
    page_fin = df_filtered_2.true_title_II[index][1]

    print('*** Processing index: ', str(index), ' - page range:', str(page_ini),str(page_fin))
    chunks = []  # cleaned pages, joined once the range is processed
    for j in range(page_ini, page_fin + 1):

        page = df_filtered_2['Document_Content'][index][j]

        print(separator)
        print(repr(page))
        print(separator)

        # header cleanup:
        page = header_regex.sub('', page)

        # check for footnote and remove:
        for footnote_regex, message in footnote_rules:
            footnote = footnote_regex.search(page)
            if footnote is not None:
                print(message, str(j))
                # cut footnote area:
                page_clean = page[:footnote.start()]
                chunks.append(page_clean)
                break
        else:
            # no footnote pattern matched: keep the whole page
            chunks.append(page)

    # URLs are replaced by a single blank
    texto = url_regex.sub(' ', ''.join(chunks))

    # cutting sections based on titles
    ini = re.search(df_filtered_2['index_title_I'][index][0][:-1], texto).start()

    #if re.search(r'Presupuesto (I|i)ndicativo', texto) != None:  # search for 'Presupuesto Indicativo'
    #    fin = re.search(r'Presupuesto (I|i)ndicativo', texto).span()[0]
    #    
    #else:   # search for pattern_3, as border condition
    #    fin = re.search(pattern_es_3, texto, re.IGNORECASE).span()[0]

    fin = re.search(df_filtered_2['index_title_II'][index][0], texto).start()
    # NOTE(review): the last 3 characters are dropped - presumably the
    # numbering that precedes title II; confirm.
    texto = texto[ini:fin].strip()[:-3]
    print(texto)

    # store extracted content in dataframe
    df_filtered_2.at[index, 'extracted'] = texto

    del texto

    print()
    print()
    print('~~~ *** ~~~')
    print()

In [None]:
# Clean-up routine (v1.0)
# For every document: join the pages from the title I page to the title II
# page, remove the page header, the footnote area and URLs, and store the text
# found between both titles in the 'extracted' column.
# Documents whose titles cannot be used are reported and skipped instead of
# aborting the whole run; their 'extracted' value is left untouched.
separator = '+ - - - + - - - + - - - + - - - + - - - + - - - + - - - + - - - + - - - + - - - + - - - + '
# leading '- N -' style marker at the very start of a page (presumably the page number)
header_regex = re.compile(r'^\s?\-\s{0,3}\d\d?\s{0,3}\-')
# footnote detectors, tried in this order; the first hit cuts the page there
footnote_rules = (
    (re.compile(r'\s{30,}\d{1,2}\s+[A-Z]'),
     '* Footnote pattern 1: \'30+ blanks + digit\' at:'),
    # footnotes - pending
    (re.compile(r'\n\n?\n?\d\d?\s{1,2}(?!Información\s|Objetivos\s|Descripción\s|Presupuesto\s|May|Jun|Jul|Ago|Sep|Set|Oct|Nov|Dic|PMRep)([A-Z\¿\“]|http)'),
     '* Footnote 2: \'2 or 3 blanks + 1 or 2 digits\' at:'),
    (re.compile(r'\n+\xa0+\n\d'),
     '* Footnote 3: \'xa0 type\' at:'),
)
url_regex = re.compile(r'https?[\:\/a-zA-Z0-9\.\?\=\-\_\%\&\;]+')

extraction_failed = []  # documents skipped because a title could not be used

#for index in [30]:
for index, row in df_filtered_2.iterrows():
    title_I = df_filtered_2.true_title_I[index]
    title_II = df_filtered_2.true_title_II[index]
    if not title_I or not title_II:
        # A true_title column still holds its '' placeholder; indexing it
        # with [1] would raise IndexError.
        print('check titles on:', str(index))
        extraction_failed.append(index)
        continue

    page_ini = title_I[1]
    page_fin = title_II[1]

    print('*** Processing index: ', str(index), ' - page range:', str(page_ini),str(page_fin))
    chunks = []  # cleaned pages, joined once the range is processed
    for j in range(page_ini, page_fin + 1):

        page = df_filtered_2['Document_Content'][index][j]

        print(separator)
        print(repr(page))
        #print(separator)

        # header cleanup:
        page = header_regex.sub('', page)

        # check for footnote and remove:
        for footnote_regex, message in footnote_rules:
            footnote = footnote_regex.search(page)
            if footnote is not None:
                print(message, str(j))
                # cut footnote area:
                page_clean = page[:footnote.start()]
                chunks.append(page_clean)
                break
        else:
            # no footnote pattern matched: keep the whole page
            chunks.append(page)

    # URLs are replaced by a single blank
    texto = url_regex.sub(' ', ''.join(chunks))

    # cutting sections based on titles
    ini_match = re.search(df_filtered_2['index_title_I'][index][0][:-1], texto)

    #if re.search(r'Presupuesto (I|i)ndicativo', texto) != None:  # search for 'Presupuesto Indicativo'
    #    fin = re.search(r'Presupuesto (I|i)ndicativo', texto).span()[0]
    #    
    #else:   # search for pattern_3, as border condition
    #    fin = re.search(pattern_es_3, texto, re.IGNORECASE).span()[0]

    fin_match = re.search(df_filtered_2['index_title_II'][index][0], texto)

    if ini_match is None or fin_match is None:
        # A title can disappear from the joined text (e.g. when it sits in an
        # area cut as a footnote); report the document instead of raising
        # AttributeError on None.
        print('check titles on:', str(index))
        extraction_failed.append(index)
        del texto
        continue

    ini = ini_match.start()
    fin = fin_match.start()
    # NOTE(review): the last 3 characters are dropped - presumably the
    # numbering that precedes title II; confirm.
    texto = texto[ini:fin].strip()[:-3]
    print(texto)

    # store extracted content in dataframe
    df_filtered_2.at[index, 'extracted'] = texto

    del texto

    print()
    print()
    print('~~~ *** ~~~')
    print()

print('Extraction to check', extraction_failed)

In [None]:
# (store results as v0.6)

#### Superscript indexes and extra blank spaces removal

In [None]:
# Spot check: extracted text of the row labelled 669.
print(df_filtered_2['extracted'][669])

In [None]:
# Column that will hold the cleaned version of 'extracted'; every row starts
# with an empty string.
df_filtered_2['extracted_cleaned'] = ''

In [None]:
# Remove superscript indexes glued to the end of a word (presumably footnote
# reference numbers) and collapse every run of whitespace into one blank.
# A word qualifies when it ends in a letter, '-', ')' or '”' followed by 1-2
# digits and an optional final period.
# Only that trailing 1-2 digit marker is removed; stripping every digit of a
# qualifying word, as before, turned e.g. '2019-20' into '-'.
# NOTE(review): genuine trailing numbers such as 'COVID-19' are still
# truncated by this heuristic.
supra_index_regex = re.compile(r'([A-Za-záéíóú\-\)\”]+)\d{1,2}(\.?)$')

for index, row in df_filtered_2.iterrows():
    texto = df_filtered_2['extracted'][index].split()
    # non-matching words pass through sub() unchanged
    resultado = [supra_index_regex.sub(r'\1\2', word) for word in texto]
    res_clean = ' '.join(resultado)
    df_filtered_2.at[index, 'extracted_cleaned'] = res_clean

In [None]:
# Preview the first rows, now including 'extracted_cleaned'.
df_filtered_2.head()

In [None]:
# Spot check: cleaned text of the row labelled 0.
df_filtered_2.extracted_cleaned[0]

In [None]:
#(stored as v0.4)

In [None]:
# List the available columns before choosing the ones to export.
df_filtered_2.columns

In [None]:
# Export the selected columns to an Excel workbook in the working directory.
# NOTE(review): the file name still carries the 2020-07-21 / v04 tag - confirm
# it is the intended target for this run.
export_columns = [
    'FK_OPERATION_ID',
    'OPERATION_NUMBER',
    'DOCUMENT_ID',
    'DOCUMENT_REFERENCE',
    'DESCRIPTION',
    'Document_Name',
    'Document_Status',
    'blank_pages',
    'page_count',
    'language',
    'index_titles',
    'true_title_I',
    'true_title_II',
    'extracted',
    'extracted_cleaned',
]
df_filtered_2[export_columns].to_excel('Loans_Docs_Collection_Processed_2020-07-21_v04.xlsx')

In [None]:
# Final preview of the processed dataframe.
df_filtered_2.head()

In [None]:
#######################

In [None]:
## FIN

In [None]:
#'''
# **************************************************************************************************************** #
# ********************************************  Version Control  ************************************************* #
# **************************************************************************************************************** #
  
#   Version:            Date:                User:                   Change:                                       

#   - 0.7           01/14/2021        Emiliano Colina    - processed Loans from Nov & Dec 2020
#

#   - 0.6           11/04/2020        Emiliano Colina    - processed Loans approved in October 2020
#

#   - 0.5           10/19/2020        Emiliano Colina    - processed Loans approved between July and Sept 2020
# 

#   - 0.4           07/21/2020        Emiliano Colina    - content extracted and cleaned
#                                                        
                                                        
#   - 0.3           07/21/2020        Emiliano Colina    - title search on documents and content extracted

#   - 0.2           07/20/2020        Emiliano Colina    - index titles recognition
                                                                                                                  
#   - 0.1           07/16/2020        Emiliano Colina    - Initial version, Loan documents read, index page and      
#                                                        language detection


#
# **************************************************************************************************************** #
#'''
