# Digital Transformation Advisory

## 02.0 - Document Processing - Procurement_Plans

In [None]:
#'''
# **************************************************************************************************************** #
#*****************************************  IDB - AUG Data Analytics  ******************************************** #
# **************************************************************************************************************** #
#
#-- Notebook Number: 02 - Document Processing - Procurement_Plans
#-- Title: Digital Transformation Advisory
#-- Audit Segment: 
#-- Continuous Auditing: Yes
#-- System(s): xlsx, doc, pdf files
#-- Description:  
#                - Procurement Plans, multiple types of documents: xlsx, doc, pdf
#                
#                
#                
#
#-- @author:  Emiliano Colina <emilianoco@iadb.org>
#-- Version:  2.1
#-- Last Update: 11/18/2020
#-- Last Revision Date: 10/28/2020 - Emiliano Colina <emilianoco@iadb.org> 
#                                    

# **************************************************************************************************************** #
#'''

In [None]:
# Widen the notebook container to 90% of the browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

#### Environment Setup

In [None]:
import os
import pandas as pd
import re

In [None]:
# Base folder and project sub-folder; they are concatenated as-is to build
# the working directory.
main_dir = "C:\\Users\\emilianoco\\Desktop\\2020"
data_dir = "/Digital_Transformation"

# Switch to the project folder, then echo the resolved location as a check.
working_dir = main_dir + data_dir
os.chdir(working_dir)
current_dir = os.getcwd()
print('Working folder set to: ' + current_dir)

In [None]:
# **************************************************************************************************************** #

In [None]:
import joblib

In [None]:
from pprint import pprint

## v2.1: Procurement_Plans

In [None]:
# Read the 'duplicates_filtered' sheet of the source workbook and preview it.
source_workbook = './input/Data-30 Sep 2020-All documents - original.xlsx'
df_pre_2 = pd.read_excel(source_workbook, sheet_name='duplicates_filtered')
df_pre_2.head(n=30)

In [None]:
# Remove the 'Unnamed: 0' column in place, then preview the result.
df_pre_2.drop(columns=['Unnamed: 0'], inplace=True)
df_pre_2.head(n=30)

## Reading

In [None]:
from io import StringIO
from bs4 import BeautifulSoup
from tika import parser

In [None]:
# Folder the procurement plan documents are read from.
desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = f"{desktop_dir}\\procur_plans"
print(file_dir)

In [None]:
df_pre_2.columns

In [None]:
# Columns carried over from the source sheet into the working DataFrame.
selected_columns = [
    'Oper_type', 'OPERATION_ID', 'OPERATION_NUMBER', 'Country', 'Region',
    'Sector', 'Sector_Subsector', 'OPERATION_NAME_ES', 'OPERATION_YEAR',
    'APPROVAL_DATE', 'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION',
    'Document_Name', 'Document_Status',
]
# Work on an independent copy so the source DataFrame is left untouched.
data = df_pre_2[selected_columns].copy()
# Placeholder column for the text extracted from each document.
data['Document_Content'] = ''
#data.head()
print(data['Document_Status'].value_counts())

In [None]:
%%time

# Extract the text of every available document, one list entry per page.
#
# Each file is parsed by Tika with XHTML output; every <div class="page"> is
# converted back to plain text and collected in `pages_txt`, which is stored in
# 'Document_Content'. A file that yields no page div ends up with an empty list.
# Rows whose Document_Name ends with 'found' or 'downloaded' are flagged as
# 'not available' and their index is collected in `indexes_to_remove`.
doc_count = 0
indexes_to_remove = []
# Document_Name endings that mark a document as missing
unavailable_suffixes = ('found', 'downloaded')

for index, row in data.iterrows():
    print("## Processing item:", str(index))
    filename = file_dir + '\\' + row['Document_Name']

    # Same outcome as the original `not(a) | (b)` test, which only worked
    # because `|` binds tighter than `not`; written out explicitly here.
    if not filename.endswith(unavailable_suffixes):
        pages_txt = []

        # Parse the file with Tika, asking for XHTML so pages can be told apart
        data_ = parser.from_file(filename, xmlContent=True)
        # 'content' may be None (the per-page result below is checked for the
        # same reason); fall back to an empty document instead of failing
        # inside BeautifulSoup.
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
        # is installed - consider pinning it once the expected one is confirmed.
        xhtml_data = BeautifulSoup(data_['content'] or '')

        for content in xhtml_data.find_all('div', attrs={'class': 'page'}):
            # Convert the XHTML of a single page back to plain text
            parsed_content = parser.from_buffer(str(content))
            page_text = parsed_content['content']
            # A blank page has no content: keep an empty string in its place
            pages_txt.append(page_text.strip() if page_text is not None else '')

        # save results and report status:
        data.at[index, 'Document_Content'] = pages_txt
        doc_count += 1
        print()
        print("Completed doc index:", str(index), "Document number:", str(doc_count))
        print('------')
        print()

    else:
        print("Document not available")
        data.at[index, 'Document_Content'] = 'not available'
        print('------')
        print()
        indexes_to_remove.append(int(index))

print()
print('-------')
print('Indexes to remove:', str(indexes_to_remove))

In [None]:
data.iloc[9]

In [None]:
data[data.Document_Content == 'not available']

In [None]:
# documents with no extracted pages (e.g. xlsx files), still to be read with a dedicated reader:
data[data.Document_Content.str.len() == 0]

In [None]:
# available documents whose content is still empty:
data[(data.Document_Content.str.len() == 0) & ~(data.Document_Content == 'not available')]

In [None]:
# The document at index label 261 is not a Procurement Plan, so it is dropped.
# The filter on operation 'UR-L1142' is evaluated only; it does not modify `data`.
is_ur_l1142 = data.OPERATION_NUMBER == 'UR-L1142'
data[is_ur_l1142]
data.drop(index=[261], inplace=True)

In [None]:
# Processing excel files:

In [None]:
def process_excel(file_xlsx):
    '''
    Reads all sheets from a procurement plan file in excel, keeping only str type of content
    and returns a list.
    Caveats: all hiden sheets will are read as well.
    date: 11/01/2020
    author: emilianoco@iadb.org
    version: 0.1
    '''
    xls = pd.ExcelFile(file_xlsx)
    #xls.sheet_names
    lista_result = []
    for i in range(0,len(xls.sheet_names)):
        test = pd.read_excel(file_xlsx, sheet_name = xls.sheet_names[i])
        lista_test = test.values.tolist()
        flat_list = [item for sublist in lista_test for item in sublist] # flaten a list of lists
        flat_list = [item for item in flat_list if type(item) == str]
        lista_result.append(flat_list)
        
    lista_result = [item.strip() for sublist in lista_result for item in sublist] # flaten the result as well
    return lista_result

In [None]:
import docx
def process_docx(file_docx):
    '''
    Read every table of a procurement plan MS-Word file and return its cell texts.

    file_docx: document to read; it is handed to docx.Document.
    returns: list with the stripped text of each table cell, table by table and
             row by row. An empty cell contributes an empty string. Only the
             tables of the document are read.
    date: 11/01/2020
    author: emilianoco@iadb.org
    version: 0.1
    '''
    document = docx.Document(file_docx)
    lista_result = []
    for table in document.tables:
        n_columns = len(table.columns)
        for row in table.rows:
            row_texts = [cell.text.strip() for cell in row.cells]
            # Keep one entry per table column: a shorter row is padded with ''.
            # A longer row is kept whole; the earlier fixed-size grid raised
            # IndexError in that case.
            row_texts.extend([''] * (n_columns - len(row_texts)))
            lista_result.extend(row_texts)
    return lista_result


In [None]:
# Fill in the available documents whose content is still empty, using the
# reader that matches the file extension.
file_index = data[(data.Document_Content.str.len() == 0) & ~(data.Document_Content == 'not available')].index.tolist()
for indice in file_index:
    file_path = file_dir + '/' + data.Document_Name[indice]
    print('Index:', str(indice))
    print('Processing file:', file_path)

    # Extensions are compared case-insensitively so that e.g. 'PLAN.XLSX' is
    # not skipped.
    lowered_path = file_path.lower()
    if lowered_path.endswith(('xlsx', 'xls')):
        data.at[indice, 'Document_Content'] = process_excel(file_path)

    elif lowered_path.endswith('docx'):
        data.at[indice, 'Document_Content'] = process_docx(file_path)

    else:
        # No reader handles this file: report it instead of leaving the
        # content empty without any trace.
        print('No reader available for this file type, content left empty')

    print()

In [None]:
data.Document_Content[145]

In [None]:
# source: https://stackoverflow.com/questions/26521266/using-pandas-to-pd-read-excel-for-multiple-worksheets-of-the-same-workbook

In [None]:
data.columns

In [None]:
# Create the 'Document_Content_2' column, initialised with empty strings
data['Document_Content_2'] = ''

In [None]:

# Normalise the extracted content into 'Document_Content_2'.
# PDF documents: every page is lower-cased and split into blocks on triple
# newlines; the remaining newlines inside a block become spaces. Replacing them
# with '' glued together the last word of a line and the first word of the
# next one.
# Any other document: the content is copied unchanged.
for index, row in data.iterrows():
    pages = row['Document_Content']
    if row['Document_Name'].endswith('pdf'):
        print(index)
        data.at[index, 'Document_Content_2'] = [
            block.strip().replace('\n', ' ')
            for page in pages
            for block in page.lower().strip().split('\n\n\n')
        ]

    else:
        data.at[index, 'Document_Content_2'] = pages
    
        
        

In [None]:
data['Document_Content_2'][0]

In [None]:
#data['Document_Content'][data[data.OPERATION_NUMBER == 'PE-L1229'].index.astype(int)[0]][4].lower().strip().split('\n\n\n')

###### Store intermediate results as xlsx and joblib

In [None]:
# to excel:
data.to_excel('./output/procurement_plans.xlsx')

In [None]:
#v2.1: all documents, stored as a bz2-compressed joblib file
f_data = 'df_procurement_plans_2020-11-01_v21.joblib'
output_path = './output/' + f_data + '.bz2'
joblib.dump(data, output_path, compress=('bz2', 3))

### v2.2: continuation:

In [None]:
# load previous run
data = joblib.load('./output/df_procurement_plans_2020-11-01_v21.joblib.bz2')
data.head()

In [None]:
# Merge each document's list of text blocks into one string ('. ' between
# blocks) and turn newlines into spaces. The earlier code replaced the literal
# two characters '/n', which left the newlines untouched.
data['Document_Content_3'] = ''
for index, row in data.iterrows():
    content = row['Document_Content_2']
    if isinstance(content, str):
        # Placeholder such as 'not available': joining a plain string would
        # put '. ' between its characters, so it is kept as it is.
        merged = content
    else:
        merged = '. '.join(content)
    data.at[index, 'Document_Content_3'] = merged.replace('\n', ' ')

#### Language Detection

In [None]:
from langdetect import detect

In [None]:
%%time
#run language detection on 'Document_Content_3'
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is not deterministic unless a seed is set; fixing it makes
# re-runs of the notebook assign the same language to the same text.
DetectorFactory.seed = 0

data['language'] = ''

for index, row in data.iterrows():
    try:
        data.at[index, 'language'] = detect(row['Document_Content_3'])
    except LangDetectException:
        # Raised when the text offers nothing to detect (e.g. empty content);
        # flag the document instead of aborting the whole run.
        data.at[index, 'language'] = 'unknown'

data.language.value_counts()

In [None]:
data[data['language'] == 'ca']

In [None]:
### Stanza: Spanish pipeline
import stanza
from stanza import *
es_processors = 'tokenize,mwt,pos,lemma'
stNLP = stanza.Pipeline(lang='es', processors=es_processors, use_gpu=True)

In [None]:
# English: download the 'ewt' models, then build the pipeline.
en_processors = 'tokenize,mwt,pos,lemma'
stanza.download('en', package='ewt', processors=en_processors, verbose=True)
stNLP_en = stanza.Pipeline(lang='en', processors=en_processors, use_gpu=True)

In [None]:
# Portuguese: download the 'bosque' models, then build the pipeline.
pt_processors = 'tokenize,mwt,pos,lemma'
stanza.download('pt', package='bosque', processors=pt_processors, verbose=True)
stNLP_pt = stanza.Pipeline(lang='pt', processors=pt_processors, use_gpu=True)

In [None]:
###########################################################################################################

In [None]:
# Create the 'data_lemmatized' column, initialised with empty strings
data['data_lemmatized'] = ''

In [None]:
%%time

# Lemmatise every document with the Stanza pipeline of its detected language,
# keeping the lower-cased lemma of each word whose POS tag is allowed.
# Documents in any other language get the 'Not available' placeholder.
allowed_postags = ['PROPN', 'NOUN', 'ADJ']
# One pipeline per supported language, in place of one copy-pasted branch each
pipelines_by_language = {'es': stNLP, 'en': stNLP_en, 'pt': stNLP_pt}

for index, row in data.iterrows():
    print('processing index:', index)
    nlp = pipelines_by_language.get(row['language'])
    if nlp is None:
        data.at[index, 'data_lemmatized'] = 'Not available'
        continue

    parsed_doc = nlp(row['Document_Content_3'])
    data.at[index, 'data_lemmatized'] = [
        word.lemma.lower()
        for sent in parsed_doc.sentences
        for word in sent.words
        # defensive: a word without a lemma is skipped rather than failing
        # on None.lower()
        if word.pos in allowed_postags and word.lemma is not None
    ]
        
    

In [None]:
data.data_lemmatized[0]

In [None]:
### Terms to search for:

In [None]:
# Spanish key terms as originally provided (mixed case, some with a trailing space).
keyterms_orig = [
    'web', 'Sistema', 'Informático ', 'Digital', 'Analítico ', 'Dato',
    'Información', 'dato', 'Hardware', 'Software', 'Integración',
    'Implementación', 'Desarrollo', 'Control', 'Tecnología', 'Interoperable',
    'Automatización ', 'Aplicación', 'Aplicativo ', 'Protocolo', 'Plataforma',
    'Electrónico', 'Servidor',
    'Biométrico', 'Internet', 'Big', 'data', 'Inteligencia', 'artificial',
    'Red', 'Sitio', 'Licencia', 'Comunicaciones', 'Comunicación', 'Equipos',
    'Equipo',
]
# Normalised version used for the search: lower-cased, surrounding blanks removed.
keyterms_sp = [term.lower().strip() for term in keyterms_orig]
#keyterms_sp
#keyterms_sp

In [None]:
print(keyterms_sp)

In [None]:
# English key terms.
keyterms_en = [
    'web', 'website', 'system', 'information', 'digital', 'analytical',
    'data', 'information', 'data', 'hardware', 'software', 'integration',
    'implementation', 'developing', 'control', 'technology', 'interoperable',
    'automation', 'application', 'applicative', 'protocol', 'platform',
    'electronic', 'server', 'biometric', 'internet', 'big', 'data',
    'intelligence', 'artificial', 'network', 'site', 'license',
    'communication', 'equipment',
]
# Portuguese key terms.
keyterms_pt = [
    'web', 'sistema', 'informático', 'digital', 'analítico', 'dado',
    'informação', 'dados', 'hardware', 'software', 'integração',
    'implementação', 'desenvolvimento', 'controle', 'tecnologia',
    'interoperável', 'automação', 'aplicação', 'aplicativo', 'protocolo',
    'plataforma', 'eletrônico', 'electrónico', 'electronico', 'servidor',
    'server', 'biométrico', 'internet', 'grande', 'big', 'data', 'dados',
    'inteligência', 'artificial', 'internet', 'rede', 'network', 'licença',
    'license', 'comunicações', 'comunicação', 'equipes', 'equipe',
]

In [None]:
# search function: 
def words_in_string(word_list, a_string):
    """Return the set of words from word_list found among the whitespace-separated tokens of a_string."""
    wanted = set(word_list)
    tokens = a_string.split()
    return {token for token in tokens if token in wanted}

In [None]:
for word in words_in_string(keyterms_sp, ' '.join(data['data_lemmatized'][155])):
    print(word)

In [None]:
list(words_in_string(keyterms_sp, ' '.join(data['data_lemmatized'][155])))

In [None]:
### Search:

In [None]:
# Create the 'search_results' column, initialised with empty strings
data['search_results'] = ''

In [None]:
# Search each document's lemmas for the key terms of its detected language.
# Documents in any other language get the 'Not available' placeholder.
# One key-term list per supported language, in place of one copy-pasted branch each
keyterms_by_language = {'es': keyterms_sp, 'en': keyterms_en, 'pt': keyterms_pt}

for index, row in data.iterrows():
    print('processing index:', index)
    keyterms = keyterms_by_language.get(row['language'])
    if keyterms is None:
        data.at[index, 'search_results'] = 'Not available'
    else:
        lemmas_text = ' '.join(row['data_lemmatized'])
        # Sorted so the stored list has the same order on every run; the
        # iteration order of a set of strings is not stable across sessions.
        data.at[index, 'search_results'] = sorted(words_in_string(keyterms, lemmas_text))
        

In [None]:
data.head()

In [None]:
# store_results:

In [None]:
data.to_excel('./output/procurement_plans-keyterms_search_2020-11-18.xlsx')

In [None]:
####

In [None]:
data['Document_Content'][0]



# **************************************************************************************************************** #
<br>
<br>
<br>

In [None]:
## END

In [None]:
#'''
# **************************************************************************************************************** #
# ********************************************  Version Control  ************************************************* #
# **************************************************************************************************************** #
  
#   Version:            Date:                User:                   Change:                                       


#   - 2.1           11/18/2020        Emiliano Colina    - Procurement Plans in diverse formats: xlsx, doc, pdf
#                                                        - xlsx files with multiple sheets
                                                                                                                  
#   - 2.0           08/31/2020        Emiliano Colina    - Initial version, starting with already read TCs from     
#                                                        notebook "Digital Transformation Advisory - 02.0 - Document Processing TCs"


#
# **************************************************************************************************************** #
#'''
