In [None]:
#'''
# **************************************************************************************************************** #
#*****************************************  IDB - AUG Data Analytics  ******************************************** #
# **************************************************************************************************************** #
#
#-- Notebook Number: 03.0 - NLP Processing (loans and TCs)
#-- Title: Digital Transformation Advisory
#-- Audit Segment: 
#-- Continuous Auditing: Yes
#-- System(s): pdf files
#-- Description:  
#                - Merge all dataframes (Loans and TCs)
#                - Perform NLP tasks
#                - Spanish and English documents are processed separately and filtered based on external list
#                
#                
#
#-- @authors:  Emiliano Colina <emilianoco@iadb.org>
#-- Version:  1.3
#-- Last Update: 01/15/2021
#-- Last Revision Date: 10/19/2020 - Emiliano Colina <emilianoco@iadb.org> 
#                                    

# **************************************************************************************************************** #
#'''

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
import os
import re, numpy as np, pandas as pd
from pprint import pprint
import joblib

In [None]:
from collections import Counter

#### Environment Setup

In [None]:
# Set working directory
main_dir = "C:\\Users\\emilianoco\\Desktop\\2020"
data_dir = "/Digital_Transformation"


os.chdir(main_dir + data_dir) # working directory set
print('Working folder set to: ' + os.getcwd()) # working directory check

In [None]:
# download the es_core_news_lg:
#!python -m spacy download es_core_news_lg

In [None]:
# ------- START v1.3 ---------- #

#### Load TCs and Loans

In [None]:
## TCs - v1.3
df_tcs = joblib.load('./output/df_resultado_tcs_2021-01-14_v22.joblib.bz2')
df_tcs.head()

In [None]:
## Loans - v1.3
df_loans = joblib.load('./output/df_resultado_loans_2021-01-14_v07.joblib.bz2')
df_loans.head()

#### Merge

In [None]:
data_base = pd.concat([df_tcs, df_loans])
data_base.head()

In [None]:
# rename
data_base = data_base.rename(columns = {'extracted_cleaned_v2':'extracted'})
print('* Number of operations:', data_base.shape)
print('* Columns:', data_base.columns )

#### Adjustments

In [None]:
# data_base.reset_index(inplace=True)
data_base.reset_index(drop=True, inplace=True)

In [None]:
# Drop specific columns 
data_base.drop(['Document_Content', 'title_inicial', 'title_final', 'lista_paginas', 'extracted_v2'], axis=1, inplace=True)

In [None]:
data_base.head()

In [None]:
# v1.3
# remove trailing spaces:
data_base['OPERATION_NUMBER'] = data_base['OPERATION_NUMBER'].str.strip()
data_base['FK_OPERATION_ID'] = data_base['FK_OPERATION_ID'].apply(str).str.strip()
data_base['DOCUMENT_ID'] = data_base['DOCUMENT_ID'].apply(str).str.strip()
data_base['DOCUMENT_REFERENCE'] = data_base['DOCUMENT_REFERENCE'].apply(str).str.strip()
data_base['DESCRIPTION'] = data_base['DESCRIPTION'].apply(str).str.strip()

# set everything to str
data_base['extracted'] = data_base['extracted'].apply(str)

In [None]:
data_base

## Filter Operations with Document - v1.3

In [None]:
# Load operations' filters
df_filters = pd.read_excel('./input/Lista de operaciones y documentos 2017-2020.xlsx')
df_filters['FK_OPERATION_ID'] = df_filters['FK_OPERATION_ID'].apply(str).str.strip()
df_filters['DOCUMENT_ID'] = df_filters['DOCUMENT_ID'].apply(str).str.strip()
df_filters['DOCUMENT_REFERENCE'] = df_filters['DOCUMENT_REFERENCE'].apply(str).str.strip()
df_filters['DESCRIPTION'] = df_filters['DESCRIPTION'].apply(str).str.strip()
df_filters

In [None]:
# filter by selected operations:
df1 = df_filters.merge(data_base, on=['FK_OPERATION_ID', 'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION'])

In [None]:
df1

### Spanish Documents - v1.3

In [None]:
# Keep only the Spanish-language documents.
# The explicit .copy() makes df_base an independent frame, so the in-place drops and
# column assignments performed on it in later cells do not trigger pandas'
# SettingWithCopyWarning.
df_base = df1[df1['language'] == 'es'].copy()
df_base

In [None]:
df_base.doc_type.value_counts()

In [None]:
# check for duplicates:
df_base[df_base.duplicated(subset=['OPERATION_NUMBER'])]

In [None]:
print(df_base[df_base.OPERATION_NUMBER == 'UR-L1140'])
print(df_base[df_base.OPERATION_NUMBER == 'UR-L1156'])

In [None]:
# Remove the rows labelled 369 and 1273 and renumber the remaining rows from 0.
# drop() returns a new frame here, so nothing is mutated in place on a slice of df1
# (the in-place variant raises SettingWithCopyWarning).
# NOTE(review): presumably these are the duplicated UR-L1140 / UR-L1156 rows printed in
# the previous cell - confirm. The labels are hard-coded and the index is renumbered
# afterwards, so running this cell a second time would drop two different rows.
df_base = df_base.drop(index=[369, 1273]).reset_index(drop=True)

In [None]:
print(df_base.shape)
df_base.doc_type.value_counts()

## Spanish: Text Preparation

### 1. Textacy processing - Annotation

In [None]:
import textacy

# Load Spacy Spanish model in Textacy:
es = textacy.load_spacy_lang('es_core_news_lg')

In [None]:
# Textacy processing on extracted text: 
df_base['textacy_processing'] = ''

In [None]:
%%time
for row_label in df_base.index:
    # Build a spaCy Doc from the extracted text using the Spanish pipeline held in `es`
    raw_text = df_base.at[row_label, 'extracted']
    df_base.at[row_label, 'textacy_processing'] = textacy.make_spacy_doc(raw_text, lang=es)

df_base.head()

#### v1.3: save results and continue processing in notebook “Digital Transformation - 03.1 - NLP Processing Spanish (Loans and TCs - Stanza) (workpaper)”

In [None]:
%%time
# v1.3: Store df_base, in Spanish, containing terms
f_df_base_es2 = 'nlp_2021-01-15_spacy_annotated_spanish.joblib'
joblib.dump(df_base, './output/' + f_df_base_es2 + '.bz2', compress=('bz2', 3))#

In [None]:
# -------- END v1.3 Spanish ----------- #

# **************************************************************************************************************** #
<br>
<br>
<br>

## Load and Merge the Documents Collection

#### Load TCs and Loans

In [None]:
## TCs
df_tcs = joblib.load('./output/df_resultado_tcs_2020-10-17_v21.joblib.bz2')
df_tcs.head()

In [None]:
## Loans
df_loans = joblib.load('./output/df_resultado_loans_2020-11-04_v06.joblib.bz2')
df_loans.head()

#### Merge

In [None]:
data_base = pd.concat([df_tcs, df_loans])

In [None]:
data_base.head()

In [None]:
# rename
data_base = data_base.rename(columns = {'extracted_cleaned_v2':'extracted'})
print('* Number of operations:', data_base.shape)
print('* Columns:', data_base.columns )

#### Adjustments

In [None]:
# data_base.reset_index(inplace=True)
data_base.reset_index(drop=True, inplace=True)

In [None]:
# Drop specific columns 
data_base.drop(['Document_Content', 'title_inicial', 'title_final', 'lista_paginas', 'extracted_v2'], axis=1, inplace=True)

In [None]:
data_base.head()

# **************************************************************************************************************** #
<br>
<br>
<br>

#### Storing (intermediate)

In [None]:
# v1.3
# store merged TCs and Loans documents
f_df_data_base = 'merged_tcs_and_loans_2021-01-15_vfinal.joblib'
joblib.dump(data_base, './output/' + f_df_data_base + '.bz2', compress=('bz2', 3))#

In [None]:
# v1.3
data_base.to_excel('merged_tcs_and_loans_2021-01-15_vfinal.xlsx')

In [None]:
# v1.1
# store merged TCs and Loans documents
f_df_data_base = 'merged_tcs_and_loans_2020-11-09_vfinal.joblib'
joblib.dump(data_base, './output/' + f_df_data_base + '.bz2', compress=('bz2', 3))#

In [None]:
# v1.1
data_base.to_excel('merged_tcs_and_loans_2020-11-09_vfinal.xlsx')

In [None]:
# store merged TCs and Loans documents
f_df_data_base = 'merged_tcs_and_loans_2020-10-19_vfinal.joblib'
joblib.dump(data_base, './output/' + f_df_data_base + '.bz2', compress=('bz2', 3))#

In [None]:
data_base.to_excel('merged_tcs_and_loans_2020-10-19_vfinal.xlsx')

# **************************************************************************************************************** #
<br>
<br>
<br>

### Load whole document collection (Loans + TCs)

In [None]:
data_base = joblib.load('./output/merged_tcs_and_loans_2020-11-09_vfinal.joblib.bz2')

In [None]:
# remove trailing spaces:
data_base['OPERATION_NUMBER'] = data_base['OPERATION_NUMBER'].str.strip()

# set everything to str
data_base['extracted'] = data_base['extracted'].apply(str)

In [None]:
data_base

# Spanish Language Documents

In [None]:
# Load operations' filters: SPANISH documents
df_filters_1 = pd.read_excel('./input/Lista de Operaciones con Documento Encontrado-ES-EN.xlsx', sheet_name='ES')
df_filters_1['OPERATION_NUMBER'] = df_filters_1['OPERATION_NUMBER'].str.strip()

In [None]:
# Load operations' filters: latest documents
df_filters_2 = pd.read_excel('./input/Data-30 Sep 2020.xlsx', sheet_name='data_filtered')
df_filters_2['OPERATION_NUMBER'] = df_filters_2['OPERATION_NUMBER'].str.strip()

In [None]:
# Load operations' filters: latest documents
df_filters_3 = pd.read_excel('./input/Data-01 Nov 2020.xlsx', sheet_name='data_filtered')
df_filters_3['OPERATION_NUMBER'] = df_filters_3['OPERATION_NUMBER'].str.strip()

In [None]:
df_filters = pd.concat([df_filters_1[['OPERATION_NUMBER']], df_filters_2[['OPERATION_NUMBER']], \
                        df_filters_3[['OPERATION_NUMBER']]], ignore_index=True)

In [None]:
df_filters

In [None]:
# filter by selected operations:
is_selected = data_base['OPERATION_NUMBER'].isin(df_filters['OPERATION_NUMBER'])
# select the Spanish documents:
is_spanish = data_base['language'] == 'es'
# The explicit .copy() makes df_base an independent frame, so the in-place drops and
# column assignments performed on it in later cells do not trigger pandas'
# SettingWithCopyWarning.
df_base = data_base[is_selected & is_spanish].copy()

In [None]:
df_base

In [None]:
# check for duplicates
df_base[df_base.duplicated(subset=['OPERATION_NUMBER'])]

In [None]:
print(df_base[df_base.OPERATION_NUMBER == 'UR-L1140'])
print(df_base[df_base.OPERATION_NUMBER == 'UR-L1156'])

In [None]:
# Remove the rows labelled 1393 and 1828 and renumber the remaining rows from 0.
# drop() returns a new frame here, so nothing is mutated in place on a slice of data_base
# (the in-place variant raises SettingWithCopyWarning).
# NOTE(review): presumably these are the duplicated UR-L1140 / UR-L1156 rows printed in
# the previous cell - confirm. The labels are hard-coded and the index is renumbered
# afterwards, so running this cell a second time would drop two different rows.
df_base = df_base.drop(index=[1393, 1828]).reset_index(drop=True)

In [None]:
df_base

## Spanish: NLP n-Gram Analysis - using Textacy bag-of-terms

In [None]:
import spacy, es_core_news_lg
nlp_es = spacy.load('es_core_news_lg', disable=['ner'])

#### Stop Words Setup

In [None]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('spanish')
# Spacy stop_words
stop_words.extend(nlp_es.Defaults.stop_words)

In [None]:
# custom stop_words:
stop_words.extend(['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'ill', 'descripción', 'componente', 'objetivo', 'ct', 'mailto', 'país', 'millón', 'millones', \
                   'año', 'años', 'dólar', 'dolar', 'dólares', 'si', 'bid', 'us', 'oc', 'gn', 'tc', 'atn', 'opc', 'pib', 'ar', 'br', 'uy', 'cl', 'co', \
                   'cclip', 'pbl', 'uis', 'ab', 'org', 'pr', 'bo', 'bl', 'pe', 'ec', 'ja', 'mx', 'ca', 'gu', 'su', 'ho', 'hn', 'mr', 'rg', 'ee', 'uu', \
                   'cr', 'tdr', 'rn', 'nº', 'usd', 'gy', 've', 'et', 'the', 'for', 'to', 'grt', 'fm', 'pr', 'pa', 'ni', 'aa', 'es', 'sp', \
                   'inglés', 'cty', 'nv', 'profisco', 'asimismo', 'actual', 'costo', 'resultar', 'esperar', 'ejecutar', 'unidad', 'agencia', 'justificación', \
                   'véase', 'ct', 'dela', 'enel', 'sobrar', 'of'])

stop_words = list(set(stop_words))

In [None]:
#sorted(stop_words)

In [None]:
## New version 11/09
#stop_words_en = stopwords.words('english')
#stop_words_en

### 1. Textacy processing

In [None]:
import textacy

# Load Spacy Spanish model in Textacy:
es = textacy.load_spacy_lang('es_core_news_lg')

In [None]:
# Textacy processing on extracted text: 
df_base['textacy_processing'] = ''

In [None]:
%%time
for row_label in df_base.index:
    # Build a spaCy Doc from the lower-cased extracted text using the Spanish pipeline in `es`
    lowered_text = df_base.at[row_label, 'extracted'].lower()
    df_base.at[row_label, 'textacy_processing'] = textacy.make_spacy_doc(lowered_text, lang=es)

df_base.head()

### 2. List of Terms (Bag-of-Terms): n-Grams extraction

#### 2.2. List of Terms Generation

In [None]:
df_base['list_of_terms'] = ''

### Alternativa 2 (11/11):

In [None]:
# Drop 'list_of_terms_alt2' if it is present. No visible cell of this notebook creates
# that column, so without errors='ignore' this cell raises KeyError on a fresh
# top-to-bottom run.
df_base.drop(['list_of_terms_alt2'], axis=1, inplace=True, errors='ignore')
# Placeholders for the raw n-gram counts and for the lemmatised (term, count) pairs
df_base['alt2_list_terms_base'] = ''
df_base['alt2_list_terms'] = ''
df_base.head()

In [None]:
%%time
for index in df_base.index:
    print('processing:', index)
    # request 2- to 6-token n-grams as lower-cased strings from the annotated document
    ngram_strings = df_base['textacy_processing'][index]._.to_terms_list(ngrams=(2, 3, 4, 5, 6), entities=False, normalize="lower", weighting="count", as_strings=True, filter_stops=True, filter_punct=True, filter_nums=True, include_pos=['PROPN', 'NOUN', 'ADJ', 'ADP', 'DET'], min_freq=2)

    # store a {term: count} mapping of the lower-cased n-grams for this document
    df_base.at[index, 'alt2_list_terms_base'] = Counter(ngram.lower() for ngram in ngram_strings)
    print('done!')

In [None]:
%%time
# For every document, lemmatise each counted n-gram, keep only words with the selected
# POS tags, join them with '_' and add up the counts of n-grams that collapse to the
# same joined term. The result is stored as a list of (term, count) tuples.
# NOTE(review): `stNLP` is not defined in any visible cell; the attributes used below
# (doc.sentences, word.lemma, word.pos) suggest a Stanza pipeline created elsewhere -
# confirm it is loaded before running this cell.
kept_pos = ['NOUN', 'ADJ', 'PROPN', 'PUNCT']
for index in df_base.index:
    print('processing:', index)

    # Counter accumulates repeated terms in one pass (instead of rescanning the whole
    # list once per unique term) and keeps first-seen order, so the output is deterministic.
    merged_counts = Counter()
    for ngram, count in df_base['alt2_list_terms_base'][index].items():
        doc = stNLP(ngram)
        term = '_'.join(word.lemma for sent in doc.sentences for word in sent.words if word.pos in kept_pos)
        # keep only terms that still consist of at least two joined words
        if '_' in term:
            merged_counts[term] += count

    #store:
    df_base.at[index, 'alt2_list_terms'] = list(merged_counts.items())
    print('done!')
    

In [None]:
df_base.head()

In [None]:
%%time
# v1.2 - intermediate
# store intermediate processed TCs and Loans documents
f_df_data_base = 'merged_tcs_and_loans_2020-11-12_intermed.joblib'
joblib.dump(df_base, './output/' + f_df_data_base + '.bz2', compress=('bz2', 3))#

# Revision

In [None]:
df_base[df_base.OPERATION_NUMBER == 'RG-T3352']

In [None]:
df_base['alt2_list_terms_base'][743]

In [None]:
df_base['alt2_list_terms'][743]

In [None]:
# Spot check of the term counts for one hard-coded row (550).
# NOTE(review): in a top-to-bottom run the 'list_of_terms' column still holds the ''
# placeholder assigned when the column was created; it is only filled by the n-gram
# loop in a later cell, so this cell then shows an empty result. It is meaningful only
# after that later loop has been executed.
index = 550
#convert to list:
terms_list = list(df_base['list_of_terms'][index])

#create dictio {term, count}:
resultado_pre_1 = Counter([item.lower() for item in terms_list])
sorted(resultado_pre_1)

In [None]:
df_base['alt2_list_terms_base'][550]

### end alternativa 2

In [None]:
%%time
for index in df_base.index:
    # request 2- to 6-token n-grams, lemma-normalised, as strings from the annotated document
    lemma_ngrams = df_base['textacy_processing'][index]._.to_terms_list(ngrams=(2, 3, 4, 5, 6), entities=False, normalize="lemma", weighting="count", as_strings=True, filter_stops=True, filter_punct=True, filter_nums=True, include_pos=['PROPN', 'NOUN', 'ADJ', 'ADP'], min_freq=2)

    # lower-case each n-gram, join its words with '_' and count the occurrences
    ngram_counts = Counter(ngram.lower().replace(' ', '_') for ngram in lemma_ngrams)

    # keep terms counted more than once, repeating each term as many times as it was counted
    repeated_terms = []
    for term, occurrences in ngram_counts.items():
        if occurrences > 1:
            repeated_terms.extend([term] * occurrences)
    df_base.at[index, 'list_of_terms'] = repeated_terms


In [None]:
%%time
# remove stop_words from terms
# A set gives constant-time membership tests; `stop_words` itself is a list, which
# would otherwise be scanned once for every word of every term.
stop_word_lookup = set(stop_words)
for index in df_base.index:
    cleaned_terms = []
    for term in df_base['list_of_terms'][index]:
        kept_words = [word for word in term.split('_') if word not in stop_word_lookup]
        cleaned_terms.append('_'.join(kept_words))

    # keep only the terms that still contain '_' (i.e. more than one word survived)
    df_base.at[index, 'list_of_terms'] = [term for term in cleaned_terms if '_' in term]

In [None]:
##### alternative #1 (11/10)
df_base['list_of_terms_pure'] = ''

In [None]:
df_base.columns

In [None]:
#######

In [None]:
#df_base[df_base.OPERATION_NUMBER == 'CO-T1496'][['list_of_terms']]
df_base['list_of_terms'][139]

#### 2.3. List of Terms: Clean-up

##### Clean-up

In [None]:
df_base

In [None]:
%%time
#expand the list of tuples: each (term, count) pair becomes `count` repetitions of term.
# Re-running this cell on already expanded lists fails, because the items are then
# plain strings and can no longer be unpacked into (term, count).
for index in df_base.index:
    print('processing index:', index)
    expanded_terms = []
    for term, count in df_base.alt2_list_terms[index]:
        expanded_terms.extend([term] * count)
    df_base.at[index, 'alt2_list_terms'] = expanded_terms
    print('done!')

In [None]:
terms_result = df_base.alt2_list_terms.to_list()

In [None]:
# Flatten the per-document term lists into one list holding every term occurrence
terms_grams = [token for doc_terms in terms_result for token in doc_terms]

In [None]:
(len((terms_grams)),len(set(terms_grams)))

In [None]:
terms_grams = Counter(terms_grams)
sort_orders_terms = sorted(terms_grams.items(), key=lambda x: x[1], reverse=True)
for i in sort_orders_terms:
    print(i[0], i[1])

In [None]:
len(sort_orders_terms)

In [None]:
# Start the removal list with every term that ends or begins with one of the
# fragments below. str.endswith / str.startswith accept a tuple and return True
# when any of its entries matches.
unwanted_suffixes = (
    '_iv', '_ii', '_us$', '/', '.', '_i', '_iii', '_”', '_a', '_rev',
    '_enel', '_p', '_d', '_figura', '_sp', '_cis', '_csc', '_cobit',
    '_dela', '_nist', '_cert', '_t', '_m', '_is', '_be', '_and', '_are',
    '_of', '_nº', '_t(',
)
unwanted_prefixes = (
    '“_', 'f._', 'a._', 'b._', 'c._', 'd._', 'e._', 'v._', 'i._', 'g._',
    'iv._', '&_', 'actividad/_', 'ct_', 'atn_/', '/_', 'ii.', 'iii_', 'iv_',
    'a_', 'x_', 'p_', 'd_', 'enel_', 'dela_', 'm_', 'is_', 'for_', 'and_',
    'of_', 'or_', 'this_', 'does_', 'are_', 'j_', 'c_',
)

terms_to_remove = []
for term, freq in sort_orders_terms:
    if term.endswith(unwanted_suffixes) or term.startswith(unwanted_prefixes):
        print(term, freq)
        terms_to_remove.append(term)

In [None]:
len(set(terms_to_remove))

In [None]:
def num_there(s):
    """Return True if at least one character of `s` is a digit, otherwise False."""
    for char in s:
        if char.isdigit():
            return True
    return False

In [None]:
# remove grams containing digits, unless the term contains one of the fragments below:
digit_exceptions = ('covid', '2700', 'p2p', '5g')
for term, freq in sort_orders_terms:
    if num_there(term) and not any(fragment in term for fragment in digit_exceptions):
        print(term, freq)
        terms_to_remove.append(term)

In [None]:
for i in range(0,len(sort_orders_terms)):
    if ('indicado_' in sort_orders_terms[i][0]) or (sort_orders_terms[i][0].endswith('_indicado')):
        print(sort_orders_terms[i][0], sort_orders_terms[i][1])
        terms_to_remove.append(sort_orders_terms[i][0])

In [None]:
# Mark for removal every term that contains one of these word fragments
filler_fragments = ('mencionado', 'siguiente', 'párrafo', 'referido', 'lleva', 'recibi', 'esperado')
for term, freq in sort_orders_terms:
    if any(fragment in term for fragment in filler_fragments):
        print(term, freq)
        terms_to_remove.append(term)

In [None]:
for i in range(0,len(sort_orders_terms)):
    if (sort_orders_terms[i][0].endswith('_cabo') or sort_orders_terms[i][0].startswith('cabo_')) and \
            not ('haitiano' in sort_orders_terms[i][0]):
        print(sort_orders_terms[i][0], sort_orders_terms[i][1])
        terms_to_remove.append(sort_orders_terms[i][0])

In [None]:
for i in sort_orders_terms:
    if 'paralo'  in i[0]:
        print(i[0], i[1])
        #terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if '▪' in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'componente' in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'agencia'  in i[0]:
        print(i[0], i[1])
        #terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'eficiente'  in i[0]:
        print(i[0], i[1])
        #terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'figura' in i[0]:
        print(i[0], i[1])
        #terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'consultor'  in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'actual_'  in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if '/'  in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if ('_+_' in i[0] or i[0].endswith('_+') or i[0].startswith('+_')):
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'eeo'  in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'acordado'  in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if ('_x_' in i[0] or i[0].endswith('_x') or i[0].startswith('x_')):
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if ('_dicho_' in i[0] or i[0].endswith('_dicho') or i[0].startswith('dicho_')):
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if ('_d_' in i[0] or i[0].endswith('_d') or i[0].startswith('d_')):
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if ('_meta_' in i[0] or i[0].endswith('_meta') or i[0].startswith('meta_')) and not 'inflación' in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
#for i in sort_orders_terms:
#    if  'operación_cooperación' in i[0] or 'asistencia_técnico' in i[0] or 'capacidad_institucional' in i[0] or 'estrategia_institucional' in i[0] or 'área_transversal' in i[0] or 'agencia_ejecutora' in i[0]:
#        print(i[0], i[1])
#        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'ejecutor'  in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'relacionado'  in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'utilizado' in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'definido' in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'sigla' in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'útil' in i[0]:
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if i[0].startswith('único_'):
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if (i[0].startswith('._') or i[0].endswith('_.') or '.' in i[0]):
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if (i[0].startswith('-_') or i[0].endswith('_-') or i[0].startswith('−_')):
        print(i[0], i[1])
        terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    if (i[0].startswith('órgano_')):
        print(i[0], i[1])
        #terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    #if (i[0].startswith('-_') or i[0].endswith('_-') or i[0].startswith('−_')):
    if 'organización' in i[0]:
        print(i[0])#, i[1])
        #terms_to_remove.append(i[0])

In [None]:
'asistencia_técnico', 'capacidad_institucional', 'estrategia_institucional', 'área_transversal', 'estructurar_ejecución', \
 'componente_descripción', 'operación_préstamo', 'organismo_ejecutor', 'américa_latina', 'cooperación_técnico', 'cooperación_técnica', 'operación_de_cooperación', 'agencia_ejecutora', \
  'operación_de_cooperación_técnico', 'agencia_ejecutor', 'nivel_mundial', 'organismo_ejecutor', 'unidad_ejecutor', 'resultar_esperar', 'producto_esperar', 'esperar_del_componente', \
                                    'resultados_esperar', 'resultar_esperar_del_componente', 'principal_resultar_esperar', 'poner_en_funcionamiento', 'problema_específico', \
                                    'nivel_nacional', 'autoridad_nacional', 'presentar_operación', 'resultar_anual', 'estructurar_organizacional', 'gobernar_central', 'tomar_de_decisión',\
                                    'adicional_con_programar', 'documento_de_marco_sectorial', 'adquisición_de_insumo', 'proyectar_pilotar', 'alto_impactar', 'efectividad_comparar', \
                                    'modelar_de_negociar', 'término_de_referenciar', 'aumentar_sostener', 'aumentar_inicial', 'aumentar_del_nivel', 'lección_aprender', 'et_al', 'new_area', \
                                    'technical_assistance', 'optional_link', 'good_practice', 'sector_framework', 'action_plan', 'short_term', 'long_term', 'medium_term', 'year_implementation'\
                                    'tc_resource', 'year_action', 'grant_operation', 'year_of_age'

In [None]:
test = []
for i in sort_orders_terms:
    #if  i[0].startswith('banco_') or i[0].endswith('_banco'): # (i[0].startswith('nivel_') or or i[0].startswith('−_')):
    if 'acompa' in i[0]:
       # print(i[0], i[1])
        test.append((i[0], i[1]))
sorted(test)

In [None]:
sorted([item[0] for item in test])

In [None]:
test = []
for i in sort_orders_terms:
    #if (i[0].startswith('-_') or i[0].endswith('_-') or i[0].startswith('−_')):
    if 'préstamo' in i[0]:
       # print(i[0], i[1])
        test.append((i[0], i[1]))
sorted(test)

In [None]:
for i in sort_orders_terms:
    #if (i[0].startswith('-_') or i[0].endswith('_-') or i[0].startswith('−_')):
    if '+' in i[0]:
        print(i[0], i[1])
        #terms_to_remove.append(i[0])

In [None]:
for i in sort_orders_terms:
    #if (i[0].startswith('-_') or i[0].endswith('_-') or i[0].startswith('−_')):
    if 'aplicación' in i[0]:
        print(i[0], i[1])
        #terms_to_remove.append(i[0])

In [None]:
#terms_to_remove = terms_to_remove + ['componente_iia', 'asistencia_técnico', 'capacidad_institucional', 'estrategia_institucional', 'área_transversal', 'estructurar_ejecución', \
# 'componente_descripción', 'operación_préstamo', 'organismo_ejecutor', 'américa_latina', 'cooperación_técnico', 'cooperación_técnica', 'operación_de_cooperación', 'agencia_ejecutora', \
#  'operación_de_cooperación_técnico', 'agencia_ejecutor', 'nivel_mundial', 'organismo_ejecutor', 'unidad_ejecutor', 'resultar_esperar', 'producto_esperar', 'esperar_del_componente', \
#                                     'resultados_esperar', 'resultar_esperar_del_componente', 'principal_resultar_esperar', 'poner_en_funcionamiento', 'problema_específico', \
#                                     'nivel_nacional', 'autoridad_nacional', 'presentar_operación', 'resultar_anual', 'estructurar_organizacional', 'gobernar_central', 'tomar_de_decisión'\
#                                    'adicional_con_programar', 'documento_de_marco_sectorial', 'adquisición_de_insumo', 'proyectar_pilotar', 'alto_impactar', 'efectividad_comparar', \
#                                    'modelar_de_negociar', 'término_de_referenciar', 'aumentar_sostener', 'aumentar_inicial', 'aumentar_del_nivel', 'lección_aprender', 'partir_interesar', \
#                                    'resultar_desear', 'centralizar_capaz', 'área_relacionar', 'información_utilizar', 'decisión_relacionar', 'generación_transmisiónuna_combinación', \
#                                    'meta_b']

In [None]:
terms_to_remove = list(set(terms_to_remove))

In [None]:
len(terms_to_remove)

#### 2.5. Remove selected terms

In [None]:
df_base['alt2_terms'] = ''

In [None]:
%%time
# Keep, per document, only the multi-word terms (containing '_') that are not marked
# for removal. terms_to_remove is a list; a set gives constant-time membership tests
# instead of scanning that list for every term of every document.
removal_lookup = set(terms_to_remove)
for index in df_base.index:
    df_base.at[index, 'alt2_terms'] = [word for word in df_base['alt2_list_terms'][index] if '_' in word and word not in removal_lookup]

#### 2.6. Check results

In [None]:
terms_final = df_base.alt2_terms.to_list()

In [None]:
terms_final_flat = []
for i in range(len(terms_final)):
    for token in terms_final[i]:
        terms_final_flat.append(token)

In [None]:
len(set(terms_final_flat))

In [None]:
terms_final_flat = Counter(terms_final_flat)
sort_orders_terms_final = sorted(terms_final_flat.items(), key=lambda x: x[1], reverse=True)
for i in sort_orders_terms_final:
    print(i[0], i[1])

In [None]:
df_base.head()

In [None]:
## v1.2: Store terms processed by stanza (Spanish)
# `pickle` is not imported in any visible cell of this notebook; import it here so the
# cell does not fail with NameError.
import pickle

with open('./output/terms_lemmat_tcs_and_loans_2020-11-12_spanish.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(df_base.alt2_terms, filehandle)

# **************************************************************************************************************** #
<br>
<br>
<br>

## Spanish: NLP Token extraction and processing

In [None]:
import gensim

In [None]:
# Tokenize Sentences and Clean
def sent_to_words(sentences):
    """Yield one list of tokens per input text.

    Single quotes are stripped from each text, which is then tokenised with
    gensim.utils.simple_preprocess (accents are kept: deacc=False).

    Args:
        sentences: iterable of text strings.

    Yields:
        The token list produced by simple_preprocess for each text, in input order.
    """
    for text in sentences:
        unquoted = re.sub("\'", "", text)  # remove single quotes
        yield gensim.utils.simple_preprocess(str(unquoted), deacc=False)

In [None]:
# Convert to list
data = df_base['extracted'].values.tolist()
data_words = list(sent_to_words(data))

In [None]:
pprint(data_words[:1])

In [None]:
# Main functions
#def remove_stopwords(texts, stop_words):
#    return [[word for word in gensim.utils.simple_preprocess(str(doc), deacc=False) if word not in stop_words] for doc in texts]
#
##
#def lemmatization(texts, allowed_postags=['PROPN', 'NOUN', 'ADJ', 'ADP']):
#    texts_out = []
#    for sent in texts:
#        doc = nlp_es(" ".join(sent)) 
#        texts_out.append([token.lemma_ for token in doc if (len(token) > 1 and token.pos_ in allowed_postags)])
#    # remove stopwords once more after lemmatization
#    texts_out = [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]   
#    return texts_out

In [None]:
#stop_words

In [None]:
# Remove Stop Words
#data_words_nostops = remove_stopwords(data_words, stop_words)
#
## Data Lemmatized
#data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['PROPN', 'NOUN', 'ADJ', 'ADP'])

In [None]:
#
def lemmatization(texts, allowed_postags=['PROPN', 'NOUN', 'ADJ', 'ADP']):
    """Lemmatize tokenized documents and drop stop words.

    Parameters
    ----------
    texts : iterable of list of str
        One token list per document; the tokens are joined with spaces and
        passed to ``stNLP`` (presumably the stanza pipeline created earlier in
        the notebook -- TODO confirm).
    allowed_postags : list of str
        Only words whose ``pos`` is in this list are kept.

    Returns
    -------
    list of list of str
        Per document, the lower-cased lemmas of the kept words after a second
        pass through ``gensim.utils.simple_preprocess`` and removal of every
        token present in the global ``stop_words``.
    """
    # Build the lookup set once: testing membership against the stop-word
    # collection for every token is much slower when it is a list.
    stop_set = set(stop_words)

    texts_out = []
    for tokens in texts:
        doc = stNLP(" ".join(tokens))
        texts_out.append([
            word.lemma.lower()
            for sentence in doc.sentences
            for word in sentence.words
            if word.pos in allowed_postags
        ])
    # remove stopwords once more after lemmatization; each lemma list is turned
    # into its string form and re-tokenized by simple_preprocess, as before
    return [
        [word for word in gensim.utils.simple_preprocess(str(lemmas)) if word not in stop_set]
        for lemmas in texts_out
    ]

In [None]:
%%time
# Remove Stop Words
#data_words_nostops = remove_stopwords(data_words, stop_words)

# Data Lemmatized
data_lemmatized = lemmatization(data_words, allowed_postags=['PROPN', 'NOUN', 'ADJ', 'ADP'])

In [None]:
# Word evaluation:
word_stats_only_tokens = []
for i in range(len(data_lemmatized)):
    for token in data_lemmatized[i]:
        #if '_' in token:
        #    print(str(i),token)
        word_stats_only_tokens.append(token)

In [None]:
len(set(word_stats_only_tokens))

### Merge tokens and terms/n-Grams

In [None]:
df_base.head()

In [None]:
data_lemmatized[550]

In [None]:
df_base.reset_index(drop=True, inplace=True)

In [None]:
# obtained terms/n-grams are added to the dataset
# data_lemmatized was built from df_base['extracted'] in row order, so tokens and
# terms are paired by position; this does not depend on the index labels of df_base.
assert len(data_lemmatized) == len(df_base), "data_lemmatized and df_base are out of sync"
data_lemmatized_full = [
    tokens + terms
    for tokens, terms in zip(data_lemmatized, df_base['alt2_terms'])
]

#  ~~ ****** ~~ 

In [None]:
# Word evaluation:
word_stats = []
for i in range(len(data_lemmatized_full)):
    for token in data_lemmatized_full[i]:
        #if '_' in token:
        #    print(str(i),token)
        word_stats.append(token)

In [None]:
word_stats = Counter(word_stats)

sort_orders = sorted(word_stats.items(), key=lambda x: x[1], reverse=True)

for i in sort_orders:
    print(i[0], i[1])

In [None]:
df_base['data_lemmatized_full'] = data_lemmatized_full

# **************************************************************************************************************** #
<br>
<br>
<br>

## Store results

In [None]:
import pickle

In [None]:
df_base.head()

In [None]:
## v1.2: Store data_lemmatized_full ONLY TOKENS (Spanish)
with open('./output/data_lemmatized_TOKENS_es_2020-11-12.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(data_lemmatized, filehandle)

In [None]:
## v1.2: Store data_lemmatized_full (Spanish)
with open('./output/data_lemmatized_full_es_2020-11-12.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(data_lemmatized_full, filehandle)

In [None]:
# v1.2: Store df_base, in Spanish, containing terms
f_df_base_es = 'nlp_df_base_2020-11-12_spanish.joblib'
joblib.dump(df_base[['doc_type', 'language', 'FK_OPERATION_ID', 'OPERATION_NUMBER',
       'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'Document_Name',
       'extracted', 'alt2_list_terms_base', 'alt2_list_terms', 'alt2_terms', 'data_lemmatized_full']], './output/' + f_df_base_es + '.bz2', compress=('bz2', 3))#

In [None]:
%%time
# v1.2: Store the spaCy/textacy-annotated documents (Spanish)
f_df_base_es2 = 'nlp_2020-11-12_spacy_annotated_spanish.joblib'
joblib.dump(df_base[['OPERATION_NUMBER', 'DOCUMENT_REFERENCE', 'Document_Name', 'textacy_processing']], './output/' + f_df_base_es2 + '.bz2', compress=('bz2', 3))#

# **************************************************************************************************************** #
<br>
<br>
<br>

# **************************************************************************************************************** #
<br>
<br>
<br>

In [None]:
#####

In [None]:
## v1.1: Store data_lemmatized_full (Spanish)
with open('./output/data_lemmatized_full_es_2020-11-10.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(data_lemmatized_full, filehandle)

In [None]:
# v1.1: Store df_base, in Spanish, containing terms
# NOTE(review): the target file name carries the v1.1 date (2020-11-10), but this cell
# dumps whatever df_base currently holds, so executing it overwrites that file.
# It selects 'DESCRIPTION' and 'terms', which the v1.2 export does not use --
# presumably a superseded export kept for reference; confirm before running.
f_df_base_es = 'nlp_df_base_2020-11-10_spanish.joblib'
joblib.dump(df_base[['doc_type', 'language', 'FK_OPERATION_ID', 'OPERATION_NUMBER',
       'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION', 'Document_Name',
       'extracted', 'terms', 'data_lemmatized_full']], './output/' + f_df_base_es + '.bz2', compress=('bz2', 3))#

In [None]:
#####

In [None]:
## v1.0: Store data_lemmatized_full (Spanish)
with open('./output/data_lemmatized_full_es_2020-10-20.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(data_lemmatized_full, filehandle)

In [None]:
# v1.0: Store df_base, in Spanish, containing terms
f_df_base_es = 'nlp_df_base_2020-10-20_spanish.joblib'
joblib.dump(df_base[['doc_type', 'language', 'FK_OPERATION_ID', 'OPERATION_NUMBER',
       'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION', 'Document_Name',
       'extracted', 'terms']], './output/' + f_df_base_es + '.bz2', compress=('bz2', 3))#

In [None]:
############ ----------------------------- FIN Spanish ---------------------------- ############

# **************************************************************************************************************** #
<br>
<br>
<br>

<br>
<br>
<br>

# English Language Documents

### English Documents - v1.3

In [None]:
# Take an independent copy of the English rows: this frame is modified in place
# further down (row drops, new columns), which on a bare slice of df1 triggers
# pandas' SettingWithCopyWarning.
df_base_en = df1[df1['language'] == 'en'].copy()
df_base_en

In [None]:
df_base_en.doc_type.value_counts()

In [None]:
# check for duplicates:
df_base_en[df_base_en.duplicated(subset=['OPERATION_NUMBER'])]

In [None]:
print(df_base_en[df_base_en.OPERATION_NUMBER == 'UR-L1140'])
print(df_base_en[df_base_en.OPERATION_NUMBER == 'UR-L1156'])
print(df_base_en[df_base_en.OPERATION_NUMBER == 'BR-T1415'])

In [None]:
# Drop three rows by hard-coded index label (presumably the duplicated entries for the
# operations printed in the previous cell -- TODO confirm), then renumber the index.
# NOTE(review): because the index is reset at the end, the labels 367/1271/1259 point
# to different rows afterwards; re-running this cell would drop three other rows.
df_base_en.drop([367], inplace=True)
df_base_en.drop([1271], inplace=True)
df_base_en.drop([1259], inplace=True)
df_base_en.reset_index(drop=True, inplace=True)

In [None]:
print(df_base_en[df_base_en.OPERATION_NUMBER == 'UR-L1140'])
print(df_base_en[df_base_en.OPERATION_NUMBER == 'UR-L1156'])
print(df_base_en[df_base_en.OPERATION_NUMBER == 'BR-T1415'])

In [None]:
print(df_base_en.shape)
df_base_en.doc_type.value_counts()

## English: Text Preparation

### 1. Textacy processing - Annotation

In [None]:
import textacy

# Load Spacy English model in Textacy:
en = textacy.load_spacy_lang('en_core_web_lg')

In [None]:
# Textacy processing on extracted text: 
df_base_en['textacy_processing'] = ''

In [None]:
%%time
# Annotate every document: build a spaCy Doc from the extracted text with the `en`
# pipeline and store it in the 'textacy_processing' column of the same row.
for index, row in df_base_en.iterrows():
    # NOTE(review): the author's note on the next line says NOT to use .lower(),
    # yet the call below lower-cases the text before annotation -- confirm which is intended.
    #print('Processing index:', str(index)) - do not use .lower() !!!!
    df_base_en.at[index, 'textacy_processing'] = textacy.make_spacy_doc(df_base_en.extracted[index].lower(), lang=en)
    
df_base_en.head()

#### v1.3: save results and continue processing in notebook “Digital Transformation - 03.2 - NLP Processing English (Loans and TCs - Stanza) (workpaper)”

In [None]:
%%time
# v1.3: Store df_base_en (English documents) with the spaCy/textacy annotation
f_df_base_en2 = 'nlp_2021-01-15_spacy_annotated_english.joblib'
joblib.dump(df_base_en, './output/' + f_df_base_en2 + '.bz2', compress=('bz2', 3))#

In [None]:
# -------- END v1.3 English ----------- #

In [None]:
# Load operations' filters: ENGLISH documents
df_filters_1_en = pd.read_excel('./input/Lista de Operaciones con Documento Encontrado-ES-EN.xlsx', sheet_name='EN')
df_filters_1_en['OPERATION_NUMBER'] = df_filters_1_en['OPERATION_NUMBER'].str.strip()

In [None]:
df_filters_en = pd.concat([df_filters_1_en[['OPERATION_NUMBER']], df_filters_2[['OPERATION_NUMBER']], \
                        df_filters_3[['OPERATION_NUMBER']]], ignore_index=True)

In [None]:
df_filters_en

In [None]:
# filter by selected operations:
df_base_en = data_base[data_base['OPERATION_NUMBER'].isin(df_filters_en['OPERATION_NUMBER'])]
# select the English documents; copy so the later in-place drops and column
# assignments do not act on a slice of data_base:
df_base_en = df_base_en[df_base_en['language'] == 'en'].copy()

##### Check for duplicates:

In [None]:
len(df_base_en.OPERATION_NUMBER.unique())

In [None]:
df_base_en[df_base_en.duplicated(subset=['OPERATION_NUMBER'])]

In [None]:
df_base_en[df_base_en.OPERATION_NUMBER == 'BR-T1415']

In [None]:
df_base_en.drop([742], inplace=True)

In [None]:
df_base_en[df_base_en.OPERATION_NUMBER == 'UR-L1140']

In [None]:
df_base_en.drop([1392], inplace=True)

In [None]:
df_base_en[df_base_en.OPERATION_NUMBER == 'UR-L1156']

In [None]:
df_base_en.drop([1827], inplace=True)

In [None]:
df_base_en.reset_index(drop=True, inplace=True)

In [None]:
### ~~ ### ~~ ### ~~ ###

In [None]:
df_base_en.shape

In [None]:
df_base_en.doc_type.value_counts()

In [None]:
# Need to remove the following Loans associated with Haiti, since the latest searches in Convergence do not show these operations as approved:
df_base_en[df_base_en.OPERATION_NUMBER.str.startswith('HA-L')]['OPERATION_NUMBER'] 

In [None]:
# Collect the index labels of every operation whose number starts with 'HA-L'
# and drop those rows from df_base_en in a single call.
haiti_loans_mask = df_base_en.OPERATION_NUMBER.str.startswith('HA-L')
lista_index_to_remove = list(df_base_en[haiti_loans_mask]['OPERATION_NUMBER'].index)
df_base_en.drop(lista_index_to_remove, inplace=True)

In [None]:
df_base_en.doc_type.value_counts()

In [None]:
df_base_en.reset_index(drop=True, inplace=True)

In [None]:
df_base_en.head()

In [None]:
# Store df with both document types:
f_df_base_en = 'df_loans_tcs_2020-12-07_english.joblib'
joblib.dump(df_base_en, './output/' + f_df_base_en + '.bz2', compress=('bz2', 3))#

In [None]:
#!python -m spacy download en_core_web_lg

## English: NLP n-Gram Analysis - using Textacy bag-of-terms

In [None]:
import spacy, en_core_web_lg
nlp_en = spacy.load('en_core_web_lg', disable=['ner'])

#### Stop Words Setup

In [None]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Spacy stop_words
stop_words.extend(nlp_en.Defaults.stop_words)

In [None]:
# custom stop_words:
stop_words.extend(['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'ill', 'descripción', 'componente', 'objetivo', 'ct', 'mailto', 'país', 'millón', 'millones', \
                   'año', 'años', 'dólar', 'dolar', 'dólares', 'si', 'bid', 'us', 'oc', 'gn', 'tc', 'atn', 'opc', 'pib', 'ar', 'br', 'uy', 'cl', 'co', \
                   'cclip', 'pbl', 'uis', 'ab', 'org', 'pr', 'bo', 'bl', 'pe', 'ec', 'ja', 'mx', 'ca', 'gu', 'su', 'ho', 'hn', 'mr', 'rg', 'ee', 'uu', \
                   'cr', 'tdr', 'rn', 'nº', 'usd', 'gy', 've', 'et', 'the', 'for', 'to', 'grt', 'fm', 'pr', 'pa', 'ni', 'aa', 'es', 'sp', 'tor', 'tr', \
                   'inglés', 'cty', 'nv', 'profisco', 'asimismo', 'actual', 'costo', 'resultar', 'esperar', 'ejecutar', 'unidad', 'agencia', 'justificación', \
                   'véase', 'ct', 'loan', 'paragraph', 'lac', 'optional'])

stop_words = list(set(stop_words))

### 1. Textacy processing

In [None]:
import textacy

# Load spaCy English model:
en = textacy.load_spacy_lang('en_core_web_lg')

In [None]:
# Textacy processing on extracted text: 
df_base_en['textacy_processing'] = ''

In [None]:
%%time
for index, row in df_base_en.iterrows():
    #print('Processing index:', str(index))
    df_base_en.at[index, 'textacy_processing'] = textacy.make_spacy_doc(df_base_en.extracted[index].lower(), lang=en)
    
df_base_en.head()

In [None]:
type(df_base_en.textacy_processing[0])

### 2. List of Terms (Bag-of-Terms): n-Grams extraction

#### 2.2. List of Terms Generation

In [None]:
df_base_en['list_of_terms'] = ''

In [None]:
%%time
# For every document, extract n-gram terms from its annotated Doc and store them,
# with repetitions, in the 'list_of_terms' column.
for index, row in df_base_en.iterrows():
    print('Processing index:', str(index))
    
    # generate terms: 2- to 6-grams from the textacy Doc extension, restricted to the listed POS tags
    terms_list_en = df_base_en['textacy_processing'][index]._.to_terms_list(ngrams=(2, 3, 4, 5, 6), entities=False, normalize="lemma", weighting="count", as_strings=True, filter_stops=True, filter_punct=True, filter_nums=True, include_pos=['PROPN', 'NOUN', 'ADJ', 'ADP'], min_freq=2)
    
    # lower-case each term, replace blanks with '_' and count occurrences per term:
    resultado_pre_en = Counter([(item.lower()).replace(' ', '_') for item in terms_list_en])
    
    # keep only terms counted 2 or more times; each kept term is repeated once per
    # occurrence, so the stored list preserves the term frequencies:
    df_base_en.at[index, 'list_of_terms'] = [k for (k,v) in resultado_pre_en.items() for count in range(v) if v > 1 ]


#### 2.3. List of Terms: Clean-up

##### Clean-up

In [None]:
df_base_en

In [None]:
terms_result = df_base_en.list_of_terms.to_list()

In [None]:
# Flatten the per-document term lists into a single list of terms.
terms_grams = [token for doc_terms in terms_result for token in doc_terms]

In [None]:
(len((terms_grams)),len(set(terms_grams)))

In [None]:
terms_grams = Counter(terms_grams)
sort_orders_terms = sorted(terms_grams.items(), key=lambda x: x[1], reverse=True)
for i in sort_orders_terms:
    print(i[0], i[1])

In [None]:
len(sort_orders_terms)

In [None]:
# Terms ending with any of these fragments are flagged for removal.
noise_suffixes = (
    '_iv', '_ii', '_us$', '/', '.', '_i', '_iii', '_”', '_a', '_rev',
    '_enel', '_p', '_d', '_figura', '_sp', '_cis', '_csc', '_cobit',
    '_dela', '_nist', '_cert', '_t', '_m', '_is', '_be', '_and', '_are',
    '_of', '_nº', '_t(', '_c', '_rg', '_al', '_atn', '_aim', '_™',
    '_de', '_del', '_of_new',
)

# Terms starting with any of these fragments are flagged for removal.
noise_prefixes = (
    '“_', 'f._', 'a._', 'b._', 'c._', 'd._', 'e._', 'v._', 'i._', 'g._',
    'iv._', '&_', 'actividad/_', 'ct_', 'atn_/', '/_', 'ii.', 'iii_',
    'iv_', 'a_', 'x_', 'p_', 'd_', 'enel_', 'dela_', 'm_', 'is_', 'for_',
    'and_', 'of_', 'or_', 'this_', 'does_', 'are_', 'j_', 'c_', 'table_i',
    'δ', 'de_', 'del_',
)

# str.endswith / str.startswith accept a tuple and match if any entry matches.
terms_to_remove_en = []
for term, count in sort_orders_terms:
    if term.endswith(noise_suffixes) or term.startswith(noise_prefixes):
        print(term, count)
        terms_to_remove_en.append(term)

In [None]:
len(set(terms_to_remove_en))

In [None]:
def num_there(s):
    """Return True when at least one character of ``s`` is a digit, else False."""
    for character in s:
        if character.isdigit():
            return True
    return False

In [None]:
# remove n-grams containing digits, unless they include one of these fragments:
digit_terms_to_keep = ('covid', '2700', 'revolution', 'p2p', '5g')
for term, count in sort_orders_terms:
    is_protected = any(fragment in term for fragment in digit_terms_to_keep)
    if num_there(term) and not is_protected:
        print(term, count)
        terms_to_remove_en.append(term)

In [None]:
for i in range(0,len(sort_orders_terms)):
    if ('mention' in sort_orders_terms[i][0]):
        print(sort_orders_terms[i][0], sort_orders_terms[i][1])
        terms_to_remove_en.append(sort_orders_terms[i][0])

In [None]:
for i in range(0,len(sort_orders_terms)):
    if ('following' in sort_orders_terms[i][0]) or ('received' in sort_orders_terms[i][0]) or ('section' in sort_orders_terms[i][0]) \
    or ('component' in sort_orders_terms[i][0] and not 'critical_component' in sort_orders_terms[i][0]) \
    or ('option' in sort_orders_terms[i][0] and not 'adoption' in sort_orders_terms[i][0]) or ('this_component' in sort_orders_terms[i][0]):
        print(sort_orders_terms[i][0], sort_orders_terms[i][1])
        terms_to_remove_en.append(sort_orders_terms[i][0])

In [None]:
# Flag every term that contains one of these boilerplate words or stray symbols.
# NOTE(review): the original condition also tested `'' in term`. An empty string is a
# substring of every string, so that single clause flagged EVERY term for removal.
# It presumably stood for a non-printable glyph lost from the source; if so, add it
# to the tuple as an explicit escape (e.g. '\uf0b7') -- TODO confirm.
noise_substrings = ('paragraph', 'abbreviation', 'conclusion', 'revision_of',
                    '▪', '.', '+', '¶', '$', '=', '/', '€')
for i in range(0,len(sort_orders_terms)):
    if any(fragment in sort_orders_terms[i][0] for fragment in noise_substrings):
        print(sort_orders_terms[i][0], sort_orders_terms[i][1])
        terms_to_remove_en.append(sort_orders_terms[i][0])

In [None]:
for i in sort_orders_terms:
    if 'figure' in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'related'  in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in range(0,len(sort_orders_terms)):
    if sort_orders_terms[i][0].endswith('_term') or sort_orders_terms[i][0].startswith('term_'):
        print(sort_orders_terms[i][0], sort_orders_terms[i][1])
        terms_to_remove_en.append(sort_orders_terms[i][0])

In [None]:
for i in sort_orders_terms:
    if 'term'  in i[0]:
        print(i[0], i[1])
        #terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if '▪' in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'component' in i[0]:
        print(i[0], i[1])
        #terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'executing'  in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'execut'  in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if ('_x_' in i[0] or i[0].endswith('_x') or i[0].startswith('x_')) or ('_d_' in i[0] or i[0].endswith('_d') or i[0].startswith('d_')) or \
        ('_m_' in i[0] or i[0].endswith('_m') or i[0].startswith('m_')) or (i[0].endswith('_c') or i[0].startswith('c_') or '_c_' in i[0]) or \
        (i[0].endswith('_e') or i[0].startswith('e_') or '_e_' in i[0]):
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if i[0].endswith('_l') or i[0].startswith('l_') or i[0].endswith('_del') or i[0].startswith('del_') or i[0].endswith('detallado') or i[0].startswith('detallado_') \
        or i[0].endswith('_n') or i[0].startswith('n_'):
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if i[0].endswith('_ic') or i[0].startswith('ic_') or i[0].endswith('_r') or i[0].startswith('r_') or i[0].startswith('f_') or i[0].endswith('_v') or i[0].startswith('sa_') \
        or i[0].startswith('an_') or i[0].endswith('_an') or i[0].startswith('by_') or i[0].endswith('_by') or i[0].startswith('or_') or i[0].endswith('_or') \
        or i[0].startswith('can_') or i[0].endswith('_can') or i[0].startswith('to_') or i[0].endswith('_to') or i[0].startswith('cabo_') or i[0].endswith('_cabo') \
        or i[0].startswith('the_') or i[0].endswith('_the') or i[0].startswith('of_') or i[0].endswith('_of') or i[0].startswith('and_') or i[0].endswith('_and') \
        or '_and_' in i[0] or i[0].startswith('h_') or i[0].endswith('_b') or i[0].endswith('_nis') or i[0].endswith('_n') or i[0].startswith('one_') or i[0].endswith('_one') \
        or i[0].endswith('_af') or i[0].endswith('_aes') or i[0].endswith('_cc') or i[0].endswith('_sncti') or i[0].endswith('_cb') or i[0].endswith('_foppa'):
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'country_office'  in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if i[0].endswith('_inc') or i[0].startswith('inc_') or ('_inc_' in i[0]) or i[0].endswith('_int') or i[0].startswith('int_') or ('_int_' in i[0]):
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
#for i in sort_orders_terms:
#    if 'utilizado' in i[0] or 'amplio_cobertura' in i[0] or 'definido' in i[0] or 'frecuencia_reportar' in i[0] or 'red_saro' in i[0] or 'brasil' in i[0] \
#        or 'uruguay' in i[0] or 'trinidad' in i[0] or 'concerniente' in i[0] or 'corea_sur' in i[0] or 'example' in i[0] or 'electric_corp' in i[0] or 'haiti' in i[0] or 'colombia' in i[0]\
#        or 'american_' in i[0] or 'lan_recuperac' in i[0] or 'forma_claro' in i[0] or 'costa_rica' in i[0] or 'proyecto_ley' in i[0] or 'ley_específico' in i[0]  or 'venezuela' in i[0]\
#        or 'cuarto_nivel' in i[0] or 'paraguay' in i[0] or 'barbado' in i[0] or 'taller' in i[0] or 'nacional_contratación_público' in i[0] or 'nacional_inteligencia' in i[0] \
#        or 'nacional_telecomunicación' in i[0] or 'aplicación_ens' in i[0] or 'instrumento_línea' in i[0] or 'evento_capacitación' in i[0] or 'curso_capacitación' in i[0] \
#        or 'calidad_software_control' in i[0] or 'ciberseguridadriesgo' in i[0] or 'guyana' in i[0] or 'chile' in i[0] or 'argentin' in i[0] or 'peru' in i[0] \
#        or 'ecuador' in i[0] or 'bolivia' in i[0] or 'guatemala' in i[0] or 'república_dominicano' in i[0] or 'dominican' in i[0] or 'país_caribe_oriental' in i[0]:    
#        print(i[0], i[1])
#        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'previous'  in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'wait'  in i[0]:
        print(i[0], i[1])
        #terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'actual'  in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'in_fact'  in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if 'main_'  in i[0]:
        print(i[0], i[1])
        terms_to_remove_en.append(i[0])

In [None]:
for i in sort_orders_terms:
    if '_to_the_'  in i[0] or '_for_the_'  in i[0] or '_in_the_'  in i[0]:
        print(i[0], i[1])
        #terms_to_remove_en.append(i[0])

In [None]:
# Manually selected terms to remove, in addition to the pattern-based ones.
# Every entry needs its own trailing comma: adjacent string literals are silently
# concatenated, which previously merged 'year_implementation' and 'tc_resource'
# into one bogus entry so that 'tc_resource' was never removed.
terms_to_remove_en = terms_to_remove_en + [
    'componente_iia', 'asistencia_técnico', 'capacidad_institucional', 'estrategia_institucional',
    'área_transversal', 'estructurar_ejecución',
    'componente_descripción', 'operación_préstamo', 'organismo_ejecutor', 'américa_latina',
    'cooperación_técnico', 'cooperación_técnica', 'operación_de_cooperación', 'agencia_ejecutora',
    'operación_de_cooperación_técnico', 'agencia_ejecutor', 'nivel_mundial', 'organismo_ejecutor',
    'unidad_ejecutor', 'resultar_esperar', 'producto_esperar', 'esperar_del_componente',
    'resultados_esperar', 'resultar_esperar_del_componente', 'principal_resultar_esperar',
    'poner_en_funcionamiento', 'problema_específico',
    'nivel_nacional', 'autoridad_nacional', 'presentar_operación', 'resultar_anual',
    'estructurar_organizacional', 'gobernar_central', 'tomar_de_decisión',
    'adicional_con_programar', 'documento_de_marco_sectorial', 'adquisición_de_insumo',
    'proyectar_pilotar', 'alto_impactar', 'efectividad_comparar',
    'modelar_de_negociar', 'término_de_referenciar', 'aumentar_sostener', 'aumentar_inicial',
    'aumentar_del_nivel', 'lección_aprender', 'et_al', 'new_area',
    'technical_assistance', 'optional_link', 'good_practice', 'sector_framework', 'action_plan',
    'short_term', 'long_term', 'medium_term', 'year_implementation',
    'tc_resource', 'year_action', 'grant_operation', 'year_of_age', 'year_implementation',
    'property_of_all_the_document', 'support_of_the_world', 'main_lesson',
]

In [None]:
terms_to_remove_en = list(set(terms_to_remove_en))

In [None]:
len(terms_to_remove_en)

In [None]:
index=121
#df_base['list_of_terms'][1729]
[word for word in df_base_en['list_of_terms'][index] if word not in terms_to_remove_en and '_' in word]

In [None]:
### check for "datum":

for i in sort_orders_terms:
    if 'datum'  in i[0]:
        print(i[0], i[1])
        #terms_to_remove_en.append(i[0])

In [None]:
# test for replacing "datum":
test = ['datum_center', 'datum_science', 'datum_initiative', 'open_datum_initiative', 'otra_cosa', 'cosa_otra']
[word if 'datum' not in word else word.replace('datum', 'data') for word in test]

#### 2.5. Remove selected terms

In [None]:
df_base_en['terms'] = ''

In [None]:
%%time
for index, row in df_base_en.iterrows():
    #print('Processing index:', str(index))
    df_base_en.at[index, 'terms'] = [word for word in df_base_en['list_of_terms'][index] if word not in terms_to_remove_en and '_' in word]
    # replace "datum":
    df_base_en.at[index, 'terms'] = [word if 'datum' not in word else word.replace('datum', 'data') for word in df_base_en['terms'][index]]

#### 2.6. Check results

In [None]:
terms_final = df_base_en.terms.to_list()

In [None]:
terms_final_flat = []
for i in range(len(terms_final)):
    for token in terms_final[i]:
        terms_final_flat.append(token)

In [None]:
len(set(terms_final_flat))

In [None]:
terms_final_flat = Counter(terms_final_flat)
sort_orders_terms_final = sorted(terms_final_flat.items(), key=lambda x: x[1], reverse=True)
for i in sort_orders_terms_final:
    print(i[0], i[1])

In [None]:
for i in sort_orders_terms_final:
    if 'data'  in i[0]:
        print(i[0], i[1])
        #terms_to_remove_en.append(i[0])

# **************************************************************************************************************** #
<br>
<br>
<br>

## NLP Token extraction and processing

In [None]:
df_base_en.head()

In [None]:
df_base_en.textacy_processing[0]

In [None]:
# Convert to list
data_en = df_base_en['extracted'].values.tolist()
data_words_en = list(sent_to_words(data_en))

In [None]:
pprint(data_words_en[:1])

In [None]:
sorted(stop_words)

In [None]:
# Main functions
def remove_stopwords_en(texts, stop_words):
    """Re-tokenize each document and drop its stop words.

    Parameters
    ----------
    texts : iterable
        Documents (here: token lists); each one is converted with ``str()`` and
        tokenized again by ``gensim.utils.simple_preprocess`` (accents kept).
    stop_words : iterable of str
        Tokens to discard.

    Returns
    -------
    list of list of str
        One filtered token list per document.
    """
    # A set gives constant-time membership tests; the caller passes a list.
    stop_set = set(stop_words)
    return [
        [word for word in gensim.utils.simple_preprocess(str(doc), deacc=False) if word not in stop_set]
        for doc in texts
    ]

#
def lemmatization_en(texts, allowed_postags=['PROPN', 'NOUN', 'ADJ', 'ADP']):
    """Lemmatize tokenized English documents and drop stop words.

    Parameters
    ----------
    texts : iterable of list of str
        One token list per document; the tokens are joined with spaces and
        processed by the spaCy pipeline ``nlp_en``.
    allowed_postags : list of str
        Only tokens whose ``pos_`` is in this list (and that are longer than
        one character) contribute their lemma.

    Returns
    -------
    list of list of str
        Per document, the kept lemmas after a second pass through
        ``gensim.utils.simple_preprocess`` and removal of every token present
        in the global ``stop_words``.
    """
    # Build the lookup set once instead of scanning the stop-word list per token.
    stop_set = set(stop_words)

    texts_out = []
    for tokens in texts:
        doc = nlp_en(" ".join(tokens))
        texts_out.append([token.lemma_ for token in doc if (len(token) > 1 and token.pos_ in allowed_postags)])
    # remove stopwords once more after lemmatization; each lemma list is turned
    # into its string form and re-tokenized by simple_preprocess, as before
    return [
        [word for word in gensim.utils.simple_preprocess(str(lemmas)) if word not in stop_set]
        for lemmas in texts_out
    ]

In [None]:
%%time
# Remove Stop Words
data_words_nostops_en = remove_stopwords_en(data_words_en, stop_words)

# Data Lemmatized
data_lemmatized_en = lemmatization_en(data_words_nostops_en, allowed_postags=['PROPN', 'NOUN', 'ADJ', 'ADP'])

In [None]:
# replace "datum" with "data" in every token; the slice assignment updates the
# existing outer list in place, one new inner list per document:
data_lemmatized_en[:] = [
    [word.replace('datum', 'data') for word in doc_tokens]
    for doc_tokens in data_lemmatized_en
]

In [None]:
# Word evaluation:
word_stats_only_tokens_en = []
for i in range(len(data_lemmatized_en)):
    for token in data_lemmatized_en[i]:
        #if '_' in token:
        #    print(str(i),token)
        word_stats_only_tokens_en.append(token)

In [None]:
len(set(word_stats_only_tokens_en))

### Merge tokens and terms/n-Grams

In [None]:
df_base_en.reset_index(drop=True, inplace=True)

In [None]:
# obtained terms/n-grams are added to the dataset
# data_lemmatized_en was built from df_base_en['extracted'] in row order, so tokens and
# terms are paired by position; this does not depend on the index labels of df_base_en.
assert len(data_lemmatized_en) == len(df_base_en), "data_lemmatized_en and df_base_en are out of sync"
data_lemmatized_full_en = [
    tokens + terms
    for tokens, terms in zip(data_lemmatized_en, df_base_en['terms'])
]

#  ~~ ****** ~~ 

In [None]:
# Word evaluation:
word_stats = []
for i in range(len(data_lemmatized_full_en)):
    for token in data_lemmatized_full_en[i]:
        #if '_' in token:
        #    print(str(i),token)
        word_stats.append(token)

In [None]:
word_stats = Counter(word_stats)

sort_orders = sorted(word_stats.items(), key=lambda x: x[1], reverse=True)

for i in sort_orders:
    print(i[0], i[1])

# **************************************************************************************************************** #
<br>
<br>
<br>

In [None]:
# Adding data_lemmatized as a new column and store the results:
df_base_en['data_lemmatized'] = data_lemmatized_full_en

## Store results

In [None]:
import pickle

In [None]:
## v1.2: Persist data_lemmatized_full_en (English) as a pickle file.
# NOTE(review): the file name still carries the v1.2 date (2020-12-07) although the
# notebook is at v1.3 - confirm downstream notebooks expect this name.
with open('./output/data_lemmatized_full_en_2020-12-07.data', 'wb') as out_file:
    # 'wb' truncates any existing file; the list is written as a binary pickle stream
    pickle.dump(data_lemmatized_full_en, out_file)

In [None]:
# v1.2: Persist df_base_en (English documents) with all terms and tokens,
# bz2-compressed at level 3, under ./output/<file name>.bz2
f_df_base_en = 'nlp_df_base_2020-12-07_english_v1.2.joblib'
joblib.dump(
    df_base_en,
    './output/{}.bz2'.format(f_df_base_en),
    compress=('bz2', 3),
)

In [None]:
######### ------- ##########

In [None]:
## v1.0: Persist data_lemmatized_full_en (English) under the original v1.0 file name.
# NOTE(review): legacy export - on a full run this overwrites the 2020-10-20 file
# with the current data; confirm it is still needed.
with open('./output/data_lemmatized_full_en_2020-10-20.data', 'wb') as out_file:
    # 'wb' truncates any existing file; the list is written as a binary pickle stream
    pickle.dump(data_lemmatized_full_en, out_file)

In [None]:
# v1.0: Persist the English df_base restricted to the v1.0 column set
# (includes 'terms', excludes 'data_lemmatized'), bz2-compressed at level 3.
# NOTE(review): legacy export - this rebinds f_df_base_en to the v1.0 file name
# and overwrites that file on a full run; confirm it is still needed.
f_df_base_en = 'nlp_df_base_2020-10-20_english.joblib'
v10_columns = [
    'doc_type', 'language', 'FK_OPERATION_ID', 'OPERATION_NUMBER',
    'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION', 'Document_Name',
    'extracted', 'terms',
]
joblib.dump(
    df_base_en[v10_columns],
    './output/{}.bz2'.format(f_df_base_en),
    compress=('bz2', 3),
)

# **************************************************************************************************************** #
<br>
<br>
<br>

In [None]:
#'''
# **************************************************************************************************************** #
# ********************************************  Version Control  ************************************************* #
# **************************************************************************************************************** #
  
#   Version:            Date:                User:                   Change:                                       

#   - 1.3           01/15/2021        Emiliano Colina    - Filter updated to include all operations from 01/2017 to      
#                                                         12/2020

#   - 1.2           12/07/2020        Emiliano Colina    - NLP processing with spaCy for English documents only,
#                                                         since Spanish docs were processed in a separate notebook
#                                                         using Stanza


#   - 1.1           11/10/2020        Emiliano Colina    - Updated version, removing stop-words from terms,      
#                                                         tested inflector


#   - 1.0           10/19/2020        Emiliano Colina    - Initial version, to include a separate layer for NLP     
#                                                         processing on a dedicated notebook

#
# **************************************************************************************************************** #
#'''
