# Digital Transformation Advisory

## Specialized Documents in English - Stanza (workpaper)

In [None]:
#'''
# **************************************************************************************************************** #
#*****************************************  IDB - AUG Data Analytics  ******************************************** #
# **************************************************************************************************************** #
#
#-- Notebook Number: Specialized Documents in English - Stanza (workpaper)
#-- Title: Digital Transformation Advisory
#-- Audit Segment: 
#-- Continuous Auditing: Yes
#-- System(s): pdf files
#-- Description:  
#                - Specialized Documents in English, transitioned to Stanza
#                
#                
#                
#
#-- @author:  Emiliano Colina <emilianoco@iadb.org>
#-- Version:  1.2
#-- Last Update: 02/08/2021
#-- Last Revision Date: 12/08/2020 - Emiliano Colina <emilianoco@iadb.org> 
#                                    

# **************************************************************************************************************** #
#'''

In [None]:
# `display` is part of the public IPython.display API; importing it from
# IPython.core.display is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Inject a CSS rule that sets elements with the `container` class to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))

#### Environment Setup

In [None]:
import os
import pandas as pd
import re

In [None]:
from ast import literal_eval
import joblib
import pickle
from collections import Counter
from pprint import pprint

#### PDF libraries

In [None]:
from io import StringIO
from bs4 import BeautifulSoup
from tika import parser

In [None]:
# Working directory = main_dir followed by data_dir.
main_dir = "C:\\Users\\emilianoco\\Desktop\\2020"
data_dir = "/Digital_Transformation"

# Move into the project folder, then echo the directory actually in effect.
os.chdir(''.join([main_dir, data_dir]))
print('Working folder set to: ' + os.getcwd())

In [None]:
# **************************************************************************************************************** #

In [None]:
def folder_selector(knowledge_area, language):
    '''
    Returns the folder name (with a leading '/') containing the desired documents.

    Supported combinations:
        - 'cybersecurity'  + 'en'             -> "/specialized_docs_EN"
        - 'cybersecurity'  + any other value  -> "/specialized_docs_SP"
        - 'digital_transf' + 'en'             -> "/specialized_docs_Digital_EN"

    Raises:
        ValueError: for every other combination. The function used to fall
        through and return None in that case, which only surfaced later as a
        TypeError when the result was concatenated into a path.

    @ author: emilianoco
    Version:
        - v0.1 - (10/14/2020)
    '''
    if knowledge_area == 'cybersecurity':
        return "/specialized_docs_EN" if language == 'en' else "/specialized_docs_SP"

    if knowledge_area == 'digital_transf' and language == 'en':
        return "/specialized_docs_Digital_EN"

    raise ValueError(
        'no document folder defined for knowledge_area=%r, language=%r'
        % (knowledge_area, language)
    )
        
        
def filter_df(knowledge_area, language, df):
    '''
    Returns the dataframe df filtered by 'Knowledge_Area' (cybersecurity, digital_transf) and language (en,sp)

    A row is kept when its 'Applicability' value contains `language` and its
    'Knowledge_Area' value contains `knowledge_area` (both are passed to
    Series.str.contains). Rows with a missing value in either column are
    treated as non-matching (na=False); without it a single empty cell makes
    the boolean mask contain NaN and the whole filter fails.

    Best effort: if the frame cannot be filtered at all (for instance a
    column is absent), the error is printed and None is returned.

    @ author: emilianoco
    Version:
        - v0.1 - (10/14/2020)
    '''
    try:
        language_mask = df.Applicability.str.contains(language, na=False)
        area_mask = df.Knowledge_Area.str.contains(knowledge_area, na=False)
        return df[language_mask & area_mask]
    except Exception as e:
        print('error in dataframe: ', e)
        return None

def generate_base(knowledge_area, language, df):
    '''
    Returns the folder prefix and a filtered dataframe of the documents to be read.

    The result is a tuple (folder_prefix, documents):
        - folder_prefix is './' + folder_selector(knowledge_area, language) + '/'
        - documents holds the title, filename, 'Short_Name' and 'Page_Range'
          columns of the rows selected by filter_df, re-indexed from 0.
          The '_EN' title/filename columns are used when language == 'en',
          the '_SP' ones for any other language.

    @ author: emilianoco
    Version:
        - v0.1 - (10/14/2020)
    '''
    if language == 'en':
        wanted_columns = ['Title_EN', 'Filename_EN', 'Short_Name', 'Page_Range']
    else:
        wanted_columns = ['Title_SP', 'Filename_SP', 'Short_Name', 'Page_Range']

    folder_prefix = './' + folder_selector(knowledge_area, language) + '/'
    matching = filter_df(knowledge_area, language, df)
    documents = matching.iloc[0:][wanted_columns].reset_index(drop=True)
    return folder_prefix, documents

In [None]:
def read_pdf(filename):
    '''
    Reads a pdf file using Tika+Beautiful Soup and returns a list containing each page as string

    The file is parsed once into XHTML (xmlContent=True). Every
    <div class="page"> element found is then sent back through Tika to get
    the text of that page.

    Parameters:
        filename: path of the file, passed as-is to parser.from_file.

    Returns:
        A list of strings, one per page <div>, in document order. A page for
        which Tika returns no content is stored as ''. An empty list is
        returned when Tika extracts no content from the file at all.

    @ author: emilianoco
    Version:
        - v0.1 - (10/14/2020)
    '''
    # Read PDF file
    data = parser.from_file(filename, xmlContent=True)
    xhtml = data.get('content')
    if xhtml is None:
        # Nothing was extracted; handing None to BeautifulSoup would fail.
        return []

    # NOTE(review): no parser is named here, so Beautiful Soup picks whichever
    # one is installed -- consider pinning it once the expected output is confirmed.
    xhtml_data = BeautifulSoup(xhtml)

    pages_txt = []
    for content in xhtml_data.find_all('div', attrs={'class': 'page'}):
        # Parse the page markup using TIKA (xml/html)
        parsed_content = parser.from_buffer(str(content))
        page_text = parsed_content['content']

        # A blank page has no content at all
        pages_txt.append(page_text.strip() if page_text is not None else '')

    return pages_txt

In [None]:
# **************************************************************************************************************** #

### Specialized Docs database

In [None]:
doc_library = pd.read_excel('./output/LDA_analisis.xlsx', sheet_name='documentos', skiprows=2, usecols='B:P')
doc_library

In [None]:
doc_library[doc_library.Applicability.str.contains('en')]

### Cybersecurity documents (English)

In [None]:
folder_prefix, df_base = generate_base('cybersecurity', 'en', doc_library)

In [None]:
from ast import literal_eval
# Page ranges are stored as text; evaluate each one into a real tuple.
df_base['Page_Range'] = df_base['Page_Range'].map(literal_eval)

In [None]:
df_base

#### 1. Read Specialized documents and store their content

In [None]:
%%time
# Column that will hold, per document, the list of page texts from read_pdf.
df_base['content'] = ''

doc_count = 0

for index, row in df_base.iterrows():
    print("## Processing item:", index)

    # The file name is the second column of the base table.
    filename = folder_prefix + df_base.iloc[:, 1].loc[index]

    # Read the PDF, store its pages and update the running count.
    pages_txt = read_pdf(filename)
    df_base.at[index, 'content'] = pages_txt
    doc_count += 1

    print("Completed doc index:", index, "Document number:", doc_count)
    del pages_txt, filename
    print('------')
    print()

print('Documents read:', doc_count)
del doc_count

In [None]:
df_base

In [None]:
df_base.at[9, 'Page_Range'] = (9, 68)

In [None]:
df_base.content[9][68]

In [None]:
# Headers to be removed, one entry per row of df_base (matched by position).
# None means that no header text is stripped for that document.
hd_list_en = [
    # 0
    'April 16, 2018  Cybersecurity Framework Version 1.1 \n\n\nThis publication is available free of charge from: https://doi.org/10.6028/NIST.CSWP.04162018 ',
    # 1
    None,
    # 2
    'This publication is available free of charge from\n: https://doi.org/10.6028/N\n\n\nIS\nT.S\n\n\nP\n.800-53r5 \n\n\n\n\n\n',
    # 3
    'NIST IR 7298 Revision 2, Glossary of Key Information Security Terms \n\n\n',
    # 4
    'POWER SECTOR DEPENDENCY ON TIME SERVICE \nApril 2020 \n\n\n ',
    # 5
    ' GUIDE FOR AN ASSET INVENTORY MANAGEMENT IN ICS \n\n\n',
    # 6
    'PROCUREMENT GUIDELINES FOR CYBERSECURITY IN HOSPITALS \nFebruary 2020 \n\n\n',
    # 7
    'Communication network dependencies for ICS/SCADA Systems \nDecember 2016   \n\n\n\n\n\n',
    # 8
    'Prime Minister’s Office National Cybersecurity Strategy  2019',
    # 9
    None,
    # 10
    'SPECIAL PUBLICATION 800-82 REVISION 2                   GUIDE TO INDUSTRIAL CONTROL SYSTEMS (ICS) SECURITY',
    # 11
    '  THE (ISC)2 CYBERSECURITY LEXICON THE (ISC)2 CYBERSECURITY LEXICON  ',
    # 12
    None,
    # 13
    None,
]

In [None]:
# store the content cleaned:
df_base['content_cleaned'] = ''

In [None]:
# Footnote markers (same patterns as before, compiled once):
#  - type 1: 30+ whitespace characters, a 1-3 digit number, then a capital letter or 'http'
#  - type 3: newline(s), non-breaking space(s), a newline and a digit
footnote_blank_re = re.compile(r'\s{30,}\d{1,3}\s+([A-Z]|http)')
footnote_nbsp_re = re.compile(r'\n+\xa0+\n\d')

for index, row in df_base.iterrows():
    page_range = df_base['Page_Range'][index]
    print('### Processing index: ', str(index), ' - page range:', page_range)

    chunks = []
    for j in range(page_range[0], page_range[1] + 1):
        raw_page = df_base['content'][index][j]
        header = hd_list_en[index]

        # header clean-up: strip the known header, or collapse triple newlines when none is listed
        if header is not None:
            page = raw_page.replace(header, ' \n ')
        else:
            page = raw_page.replace("\n\n\n", " ")

        # check for footnote and keep only the text before it:
        hit = footnote_blank_re.search(page)
        if hit is not None:
            print('* Footnote pattern 1: \'30+ blanks + digit\' at:', str(j))
            chunks.append(page[:hit.start()] + ' \n ')
            continue

        hit = footnote_nbsp_re.search(page)
        if hit is not None:
            print('* Footnote 3: \'xa0 type\' at:', str(j))
            chunks.append(page[:hit.start()] + ' \n ')
            continue

        # no footnote on this page
        chunks.append(page + ' ')

    texto = ''.join(chunks)

    # Additional clean-up
    # - remove urls:
    texto = re.sub(r'https?://\S+', '', texto)

    # - remove chars:
    for unwanted in ('\uf0b7', '\uf06e'):
        texto = texto.replace(unwanted, '')

    # - remove newline:
    texto = texto.replace('\n\n\n', ' ').replace('\n', '')

    df_base.at[index, 'content_cleaned'] = texto.strip()

    del texto

    print()
    print()
    print('#-#-#-#')
    print()
        
        

In [None]:
df_base.head()

In [None]:
df_base['extracted_cleaned'] = ''

# A word that ends in a footnote reference: letters or closing punctuation, then a
# 1-3 digit number or superscript digits, optionally followed by . , ; or :
footnote_ref_re = re.compile(r'[A-Za-záéíóú\)\”\"]+(\d{1,3}|[\¹\²\³\⁴\⁵\⁶\⁷\⁸\⁹\⁰]+)[\.\,\;\:]?$')

for index, row in df_base.iterrows():
    texto = df_base['content_cleaned'][index].split()

    resultado = []
    for word in texto:
        if footnote_ref_re.search(word):
            # drop every digit character of the word, not only the trailing reference
            word = "".join(ch for ch in word if not ch.isdigit())
        resultado.append(word)

    res_clean = ' '.join(resultado)
    df_base.at[index, 'extracted_cleaned'] = res_clean


In [None]:
print(df_base['extracted_cleaned'][5])

In [None]:
df_base['content'][5][9]

In [None]:
df_base['content'][5][10]

In [None]:
# v1: technical documents store (English)
f_df_base = 'df_technical_docs_cyber_english_2020-12-08_v1.joblib'

# Saved under ./output/ with a '.bz2' suffix, bz2-compressed at level 3.
joblib.dump(df_base, ''.join(['./output/', f_df_base, '.bz2']), compress=('bz2', 3))

In [None]:
###

### Digital Transformation documents (English)

In [None]:
folder_prefix, df_base_digital = generate_base('digital_transf', 'en', doc_library)

In [None]:
from ast import literal_eval
# Page ranges are stored as text; evaluate each one into a real tuple.
df_base_digital['Page_Range'] = df_base_digital['Page_Range'].map(literal_eval)

In [None]:
df_base_digital.at[3, 'Short_Name'] = 'dig_workgrp_2'

In [None]:
df_base_digital

#### 1. Read Specialized documents and store their content

In [None]:
%%time
# Column that will hold, per document, the list of page texts from read_pdf.
df_base_digital['content'] = ''

doc_count = 0

for index, row in df_base_digital.iterrows():
    print("## Processing item:", index)

    # The file name is the second column of the base table.
    filename = folder_prefix + df_base_digital.iloc[:, 1].loc[index]

    # Read the PDF, store its pages and update the running count.
    pages_txt = read_pdf(filename)
    df_base_digital.at[index, 'content'] = pages_txt
    doc_count += 1

    print("Completed doc index:", index, "Document number:", doc_count)
    del pages_txt, filename
    print('------')
    print()

print('Documents read:', doc_count)
del doc_count

In [None]:
df_base_digital

In [None]:
df_base_digital.content[4][5:9]

In [None]:
# Headers to be removed, one entry per row of df_base_digital (matched by position).
# None means that no header text is stripped for that document.
hd_list_digital_en = [
    None,
    'Roundtable on Digitising European Industry: Working Group 1 - Digital Innovation Hubs \n\n\n',
    None,
    None,
    'The Future of Electricity',
]

In [None]:
# store the content cleaned:
df_base_digital['content_cleaned'] = ''

In [None]:
# Footnote markers (same patterns as before, compiled once):
#  - type 1: 30+ whitespace characters, a 1-3 digit number, then a capital letter or 'http'
#  - type 3: newline(s), non-breaking space(s), a newline and a digit
footnote_blank_re = re.compile(r'\s{30,}\d{1,3}\s+([A-Z]|http)')
footnote_nbsp_re = re.compile(r'\n+\xa0+\n\d')

for index, row in df_base_digital.iterrows():
    page_range = df_base_digital['Page_Range'][index]
    print('### Processing index: ', str(index), ' - page range:', page_range)

    chunks = []
    for j in range(page_range[0], page_range[1] + 1):
        raw_page = df_base_digital['content'][index][j]
        header = hd_list_digital_en[index]

        # header clean-up: strip the known header, or collapse triple newlines when none is listed
        if header is not None:
            page = raw_page.replace(header, ' \n ')
        else:
            page = raw_page.replace("\n\n\n", " ")

        # check for footnote and keep only the text before it:
        hit = footnote_blank_re.search(page)
        if hit is not None:
            print('* Footnote pattern 1: \'30+ blanks + digit\' at:', str(j))
            chunks.append(page[:hit.start()] + ' \n ')
            continue

        hit = footnote_nbsp_re.search(page)
        if hit is not None:
            print('* Footnote 3: \'xa0 type\' at:', str(j))
            chunks.append(page[:hit.start()] + ' \n ')
            continue

        # no footnote on this page
        chunks.append(page + ' ')

    texto = ''.join(chunks)

    # Additional clean-up
    # - remove urls:
    texto = re.sub(r'https?://\S+', '', texto)

    # - remove chars:
    for unwanted in ('\uf0b7', '\uf06e', '\uf0a4'):
        texto = texto.replace(unwanted, '')

    # - remove newline:
    texto = texto.replace('\n\n\n', ' ').replace('\n', '')

    df_base_digital.at[index, 'content_cleaned'] = texto.strip()

    del texto

    print()
    print()
    print('#-#-#-#')
    print()
        
        

In [None]:
df_base_digital.head()

In [None]:
df_base_digital['extracted_cleaned'] = ''

# A word that ends in a footnote reference: letters or closing punctuation, then a
# 1-3 digit number or superscript digits, optionally followed by . , ; or :
footnote_ref_re = re.compile(r'[A-Za-záéíóú\)\”\"]+(\d{1,3}|[\¹\²\³\⁴\⁵\⁶\⁷\⁸\⁹\⁰]+)[\.\,\;\:]?$')

for index, row in df_base_digital.iterrows():
    texto = df_base_digital['content_cleaned'][index].split()

    resultado = []
    for word in texto:
        if footnote_ref_re.search(word):
            # drop every digit character of the word, not only the trailing reference
            word = "".join(ch for ch in word if not ch.isdigit())
        resultado.append(word)

    res_clean = ' '.join(resultado)
    df_base_digital.at[index, 'extracted_cleaned'] = res_clean


In [None]:
print(df_base_digital['extracted_cleaned'][3])

In [None]:
df_base_digital['content'][3][9]

In [None]:
# v1: technical documents store (English)
f_df_base_digital = 'df_technical_docs_digital_english_2020-12-08_v1.joblib'

# Saved under ./output/ with a '.bz2' suffix, bz2-compressed at level 3.
joblib.dump(df_base_digital, ''.join(['./output/', f_df_base_digital, '.bz2']), compress=('bz2', 3))

In [None]:
###

In [None]:
df_base_digital

In [None]:
# Stack the cybersecurity and digital tables, then renumber the rows from 0.
df_test = pd.concat([df_base, df_base_digital], ignore_index=False)
df_test = df_test.reset_index(drop=True)
df_test

In [None]:
# v1: both cyber and digital_transformation technical documents store (English)
f_df_test = 'df_technical_docs_all_english_2020-12-08_v1.joblib'

# Saved under ./output/ with a '.bz2' suffix, bz2-compressed at level 3.
joblib.dump(df_test, ''.join(['./output/', f_df_test, '.bz2']), compress=('bz2', 3))

In [None]:
###

## English: NLP n-Gram Analysis - using Textacy bag-of-terms

In [None]:
import joblib

In [None]:
import spacy, en_core_web_lg
nlp_en = spacy.load('en_core_web_lg', disable=['ner'])

In [None]:
nlp_en.max_length = 1432000

In [None]:
# v1.2 update:
df_test = joblib.load('./output/nlp_spec_docs_annotated_2020-12-09_english_v1_final.joblib.bz2')

In [None]:
df_test

#### Stop Words Setup

In [None]:
# NLTK Stop words
from nltk.corpus import stopwords

stop_words_es = stopwords.words('spanish')
stop_words_en = stopwords.words('english')

# Spanish + English NLTK lists, followed by the stop words of the spaCy model.
final_stop_words = stop_words_es + stop_words_en
final_stop_words += list(nlp_en.Defaults.stop_words)

In [None]:
# custom stop_words:
custom_stop_words = [
    'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'ill', 'descripción', 'componente', 'objetivo', 'ct', 'mailto', 'país', 'millón', 'millones',
    'año', 'años', 'dólar', 'dolar', 'dólares', 'si', 'bid', 'us', 'oc', 'gn', 'tc', 'atn', 'opc', 'pib', 'ar', 'br', 'uy', 'cl', 'co',
    'cclip', 'pbl', 'uis', 'ab', 'org', 'pr', 'bo', 'bl', 'pe', 'ec', 'ja', 'mx', 'ca', 'gu', 'su', 'ho', 'hn', 'mr', 'rg', 'ee', 'uu',
    'cr', 'tdr', 'rn', 'nº', 'usd', 'gy', 've', 'et', 'the', 'for', 'to', 'grt', 'fm', 'pr', 'pa', 'ni', 'aa', 'es', 'sp',
    'inglés', 'cty', 'nv', 'profisco', 'asimismo', 'actual', 'costo', 'resultar', 'esperar', 'ejecutar', 'unidad', 'agencia', 'justificación',
    'véase', 'ct', 'dela', 'enel', 'sobrar', 'of', 'único',
]
final_stop_words.extend(custom_stop_words)

# Drop duplicates (the resulting order is whatever the set yields).
final_stop_words = list(set(final_stop_words))

In [None]:
sorted(final_stop_words)

### 1. Textacy processing

In [None]:
import textacy

# Load Spacy English model in Textacy:
en = textacy.load_spacy_lang('en_core_web_lg')

In [None]:
# Textacy processing on extracted text: 
df_test['textacy_processing'] = ''

In [None]:
# Remove the pipeline components that are not used here, then list what is left.
en.remove_pipe('ner')
en.remove_pipe('parser')
en.pipe_names

In [None]:
en.max_length = 1432000 # or even higher

In [None]:
%%time
# Build one spaCy document per row from its cleaned text.
for index in df_test.index:
    source_text = df_test.extracted_cleaned[index]
    df_test.at[index, 'textacy_processing'] = textacy.make_spacy_doc(source_text, lang=en)

df_test.head()

### 2. List of Terms (Bag-of-Terms): n-Grams extraction

#### 2.2. List of Terms Generation

In [None]:
df_test['textacy_processing'][0]

In [None]:
# Placeholder columns for the raw term counts and for the lemmatised term list.
for term_column in ('alt2_list_terms_base', 'alt2_list_terms'):
    df_test[term_column] = ''
df_test.head()

In [None]:
%%time
# Options passed to to_terms_list for every document.
terms_options = dict(
    ngrams=(2, 3, 4, 5, 6),
    entities=False,
    normalize="lower",
    weighting="count",
    as_strings=True,
    filter_stops=True,
    filter_punct=True,
    filter_nums=True,
    include_pos=['PROPN', 'NOUN', 'ADJ', 'ADP'],
    min_freq=2,
)

for index, row in df_test.iterrows():
    print('processing:', index)

    # to_terms_list is consumed into a list right away
    spacy_doc = df_test['textacy_processing'][index]
    terms_list = list(spacy_doc._.to_terms_list(**terms_options))

    # {term: count}, with every term lower-cased first
    resultado_pre = Counter(item.lower() for item in terms_list)

    # store result:
    df_test.at[index, 'alt2_list_terms_base'] = resultado_pre
    print('done!')

In [None]:
df_test

In [None]:
df_test.alt2_list_terms_base[11]

In [None]:
import stanza

stNLP = stanza.Pipeline(processors='tokenize,mwt,pos,lemma', lang='en', use_gpu=True) 

In [None]:
%%time
# Parts of speech kept when a term is rebuilt from its lemmas.
kept_pos = ('NOUN', 'ADJ', 'PROPN', 'PUNCT')

for index, row in df_test.iterrows():
    print('processing:', index)

    # Terms that share a lemmatised form are merged as they are produced. The
    # previous version collected every (term, count) pair and then rescanned the
    # whole list once per distinct term, which is quadratic in the term count.
    merged_counts = Counter()
    for k, v in df_test['alt2_list_terms_base'][index].items():
        doc = stNLP(k)
        lemmas = [word.lemma for sent in doc.sentences for word in sent.words if word.pos in kept_pos]
        term = '_'.join(lemmas)

        # only multi-word terms (at least one '_') are kept
        if '_' in term:
            merged_counts[term] += v

    # store as a list of (term, count) tuples, one per distinct term
    df_test.at[index, 'alt2_list_terms'] = list(merged_counts.items())

    del merged_counts
    print('done!')
    

In [None]:
#%%time
#for index, row in df_test.iterrows():
#    resultado = []
#    
#    print('processing:', index)
#    #compute desired pos and join terms:
#    for k,v in df_test['alt2_list_terms_base'][index].items():   # 'ADJ' needed: 'telefonía satelital'
#        doc = nlp_en(k); term = ('_'.join([word.lemma_ for word in doc]), v)
#        if '_' in term[0]:
#            resultado.append(term)
#    
#    #merge repetitive terms and counts :
#    resultado_pre = {x[0] for x in resultado}
#    resultado_post = [(i,sum(x[1] for x in resultado if x[0] == i)) for i in resultado_pre]
#    
#    #store:
#    df_test.at[index, 'alt2_list_terms'] = resultado_post
#    
#    del resultado; del resultado_pre; del resultado_post
#    print('done!')

#### 2.3. List of Terms: Clean-up

In [None]:
df_test.alt2_list_terms[0]

In [None]:
%%time
# Expand each (term, count) tuple into `count` repetitions of the term.
for index, row in df_test.iterrows():
    print('processing index:', index)
    expanded = []
    for k, v in df_test.alt2_list_terms[index]:
        expanded.extend(k for count in range(v))
    df_test.at[index, 'alt2_list_terms'] = expanded
    print('done!')

In [None]:
# previous values: (36293, 4543)

In [None]:
terms_result = df_test.alt2_list_terms.to_list()

# One flat list with the terms of every document.
terms_grams = [token for doc_terms in terms_result for token in doc_terms]

# (total number of terms, number of distinct terms)
print((len(terms_grams), len(set(terms_grams))))

In [None]:
# Frequency of every term, listed from most to least frequent.
terms_grams = Counter(terms_grams)
sort_orders_terms = sorted(terms_grams.items(), key=lambda pair: pair[1], reverse=True)
for term, count in sort_orders_terms:
    print(term, count)

In [None]:
len(sort_orders_terms)

In [None]:
terms_to_remove = []

# A term is flagged for removal when it starts with one of these prefixes ...
noise_prefixes = (
    '“_', 'f._', 'a._', 'b._', 'c._', 'd._', 'e._', 'v._', 'i._', 'g._', 'iv._', '&_',
    'actividad/_', 'ct_', 'atn_/', '/_', 'ii.', 'iii_', 'iv_', 'a_', 'x_', 'p_', 'd_',
    'enel_', 'dela_', 'm_', 'is_', 'for_', 'and_', 'of_', 'or_', 'this_', 'does_',
    'are_', 'j_', 'c_', 'sp_', '|_', 'χ_', '_', '�_',
)

# ... or ends with one of these suffixes.
# The original chain also tested endswith(''), which is true for every string
# and therefore flagged every term; that test is dropped here.
# NOTE(review): if a non-printable character was intended there, add it
# below as an explicit escape (for example '\uf0b7').
noise_suffixes = (
    '_iv', '_ii', '_us$', '/', '.', '_i', '_iii', '_”', '_a', '_rev', '_enel', '_p', '_d',
    '_figura', '_sp', '_cis', '_csc', '_cobit', '_dela', '_nist', '_cert', '_t', '_m',
    '_is', '_be', '_and', '_are', '_of', '_nº', '_t(', '_|', '_χ', '_kk', '_kl',
    '_se', '_ad', '_w', '_r', '_ar', '_�',
)

for term, count in sort_orders_terms:
    # str.startswith / str.endswith accept a tuple of alternatives
    if term.startswith(noise_prefixes) or term.endswith(noise_suffixes):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
len(set(terms_to_remove))

In [None]:
def num_there(s):
    '''Return True when at least one item of s is a digit, False otherwise.'''
    for ch in s:
        if ch.isdigit():
            return True
    return False

# remove grams containing digits, except those that mention one of these markers:
digit_exceptions = ('covid', '2700', 'p2p', '5g')

for term, count in sort_orders_terms:
    if not num_there(term):
        continue
    if any(marker in term for marker in digit_exceptions):
        continue
    print(term, count)
    terms_to_remove.append(term)

In [None]:
# Fragments that flag a term for removal wherever they appear in it.
# The original condition also tested `'' in term`, which is true for every
# string and therefore flagged every term; that test is dropped here.
# NOTE(review): if a non-printable character was intended there, add it
# below as an explicit escape (for example '\uf0b7').
noise_fragments = ('following', 'received', 'section', 'this_component', '*', '|', '�', '_x_', 'χ', 'PRON')

for term, count in sort_orders_terms:
    # 'option' counts only when it is not part of 'adoption'
    has_option = 'option' in term and 'adoption' not in term
    if has_option or any(fragment in term for fragment in noise_fragments):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Fragments that flag a term for removal wherever they appear in it.
# The original condition also tested `'' in term`, which is true for every
# string and therefore flagged every term; that test is dropped here.
# NOTE(review): if a non-printable character was intended there, add it
# below as an explicit escape (for example '\uf0b7').
noise_fragments = ('paragraph', 'abbreviation', 'conclusion', 'revision_of', '▪', '.', '+', '¶', '$', '=', '/', '*', '€')

for term, count in sort_orders_terms:
    if any(fragment in term for fragment in noise_fragments):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms in which x, d, m, c or e stands alone as one of the joined words.
for term, count in sort_orders_terms:
    lone_inside = any(piece in term for piece in ('_x_', '_d_', '_m_', '_c_', '_e_'))
    lone_at_end = term.endswith(('_x', '_d', '_m', '_c', '_e'))
    lone_at_start = term.startswith(('x_', 'd_', 'm_', 'c_', 'e_'))
    if lone_inside or lone_at_end or lone_at_start:
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms that start or end with 'l', 'del', 'detallado' or 'n'.
for term, count in sort_orders_terms:
    if term.endswith(('_l', '_del', 'detallado', '_n')) or term.startswith(('l_', 'del_', 'detallado_', 'n_')):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms that begin or end with a short function word or abbreviation,
# or that contain '_and_'.
leading_noise = ('ic_', 'r_', 'f_', 'sa_', 'an_', 'by_', 'or_', 'can_', 'to_', 'cabo_',
                 'the_', 'of_', 'and_', 'h_', 'one_')
trailing_noise = ('_ic', '_r', '_v', '_an', '_by', '_or', '_can', '_to', '_cabo', '_the',
                  '_of', '_and', '_b', '_nis', '_n', '_one', '_af', '_aes', '_cc', '_sncti',
                  '_cb', '_foppa')

for term, count in sort_orders_terms:
    if term.startswith(leading_noise) or term.endswith(trailing_noise) or '_and_' in term:
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag every term that contains 'previous'.
for term, count in sort_orders_terms:
    if 'previous' not in term:
        continue
    print(term, count)
    terms_to_remove.append(term)

In [None]:
# Inspection only: list the terms containing 'et_al' without flagging them.
for term, count in sort_orders_terms:
    if 'et_al' not in term:
        continue
    print(term, count)

In [None]:
# Flag terms in which x, d or m stands alone as one of the joined words.
for term, count in sort_orders_terms:
    lone_inside = any(piece in term for piece in ('_x_', '_d_', '_m_'))
    if lone_inside or term.endswith(('_x', '_d', '_m')) or term.startswith(('x_', 'd_', 'm_')):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms that contain one of these fragments.
broken_fragments = ('j_k', 'f_g_', 'g_h', 't_ico', 'icac_ión', 'serv_ic_ios')

for term, count in sort_orders_terms:
    if any(fragment in term for fragment in broken_fragments):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms that contain an angle bracket.
for term, count in sort_orders_terms:
    if any(symbol in term for symbol in ('>', '<')):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms that contain one of these words (plain substring match).
relative_words = ('which', 'where', 'when', 'what', 'that')

for term, count in sort_orders_terms:
    if any(fragment in term for fragment in relative_words):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms that start or end with 'l', 'del', 'detallado' or 'n'.
for term, count in sort_orders_terms:
    if term.endswith(('_l', '_del', 'detallado', '_n')) or term.startswith(('l_', 'del_', 'detallado_', 'n_')):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms in which 'c' stands alone as one of the joined words.
for term, count in sort_orders_terms:
    if term.endswith('_c') or term.startswith('c_') or '_c_' in term:
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms that contain one of these connectors, or the word 'appendix'.
connector_fragments = ('_to_the_', '_for_the_', '_in_the_', 'appendix')

for term, count in sort_orders_terms:
    if any(fragment in term for fragment in connector_fragments):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag terms containing 'actual', unless they also contain 'contractual'.
for term, count in sort_orders_terms:
    if 'actual' in term and 'contractual' not in term:
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Flag every term that contains 'relate'.
for term, count in sort_orders_terms:
    if 'relate' not in term:
        continue
    print(term, count)
    terms_to_remove.append(term)

In [None]:
# Flag every term that contains a period.
for term, count in sort_orders_terms:
    if '.' not in term:
        continue
    print(term, count)
    terms_to_remove.append(term)

In [None]:
# Flag terms that begin or end with a short function word or abbreviation,
# or that contain '_and_'.
leading_noise = ('ic_', 'r_', 'f_', 'an_', 'by_', 'or_', 'can_', 'to_', 'cabo_',
                 'the_', 'of_', 'and_', 'h_', 'one_')
trailing_noise = ('_ic', '_r', '_v', '_an', '_by', '_or', '_can', '_to', '_cabo', '_the',
                  '_of', '_and', '_b', '_nis', '_n', '_one')

for term, count in sort_orders_terms:
    if term.startswith(leading_noise) or term.endswith(trailing_noise) or '_and_' in term:
        print(term, count)
        terms_to_remove.append(term)

In [None]:
final_stop_words

In [None]:
# Add the stop words to the removal list and drop duplicates in one step.
terms_to_remove = list(set(terms_to_remove + final_stop_words))
len(terms_to_remove)

In [None]:
sorted(terms_to_remove)

In [None]:
# Flag terms that contain one of these symbols.
for term, count in sort_orders_terms:
    if any(symbol in term for symbol in ('→', 'ˇ', 'μ')):
        print(term, count)
        terms_to_remove.append(term)

In [None]:
# Hand-picked terms (repeated entries are kept exactly as they were listed).
flagged_terms = [
    '-PRON-', '_', 'aadt', 'aaf', 'aaps', 'aastaraamatu', 'ababa', 'abac', 'abbreviation', 'abc', 'abd',
    'abovementioned', 'aforementioned', 'abraham', 'abrams', 'abs', 'zzz', 'ºc', 'õppetunnid', 'ˇthi', 'μs', '_recommendation',
    # terms containing an arrow
    '→_insufficient', '→_insufficient',
    '→_weak', '→_weak',
    'resolution_→', 'resolution_→',
    '→_weak_strategic_integral_management', '→_weak_strategic_integral_management',
    '→_lack', '→_lack',
    'sector_→', 'sector_→', 'sector_→',
    'private_sector_→', 'private_sector_→', 'private_sector_→',
    '→_weak_strategic_integral', '→_weak_strategic_integral',
    '→_lack_of_specialist', '→_lack_of_specialist',
    '→_weak_strategic', '→_weak_strategic',
    # terms containing the copyright sign
    'framework_©_customer', 'framework_©_customer',
    'framework_©', 'framework_©', 'framework_©',
    'digital_transformation_framework_©_customer_insight', 'digital_transformation_framework_©_customer_insight',
    'transformation_framework_©_customer_insight', 'transformation_framework_©_customer_insight',
    'digital_transformation_framework_©', 'digital_transformation_framework_©', 'digital_transformation_framework_©',
    'framework_©_customer_insight', 'framework_©_customer_insight',
    'use_digital_transformation_framework_©', 'use_digital_transformation_framework_©',
    'transformation_framework_©', 'transformation_framework_©', 'transformation_framework_©',
    '©_customer_insight', '©_customer_insight',
    'digital_transformation_framework_©_customer', 'digital_transformation_framework_©_customer',
    'transformation_framework_©_customer_insight_customer', 'transformation_framework_©_customer_insight_customer',
    'framework_©_customer_insight_customer', 'framework_©_customer_insight_customer',
    'transformation_framework_©_customer', 'transformation_framework_©_customer',
    '©_customer', '©_customer',
    '©_customer_insight_customer', '©_customer_insight_customer',
    'use_digital_transformation_framework_©_customer', 'use_digital_transformation_framework_©_customer',
    'μs', 'offset_of_μs',
]

#### 2.5. Remove selected terms

In [None]:
df_test['alt2_terms'] = ''

In [None]:
%%time
# Membership is tested once per token; a set keeps each test constant-time,
# whereas the list used before was scanned in full for every token.
removal_set = set(terms_to_remove)

for index, row in df_test.iterrows():
    # keep multi-word terms (they contain '_') that were not flagged for removal
    kept_terms = [word for word in df_test['alt2_list_terms'][index]
                  if '_' in word and word not in removal_set]

    # replace "datum" (str.replace leaves words without it unchanged):
    df_test.at[index, 'alt2_terms'] = [word.replace('datum', 'data') for word in kept_terms]

In [None]:
df_test.head()

#### 2.6. Check results

In [None]:
terms_final = df_test.alt2_terms.to_list()

In [None]:
# One flat list with the final terms of every document.
terms_final_flat = [token for doc_terms in terms_final for token in doc_terms]

In [None]:
len(set(terms_final_flat))

In [None]:
# Frequency of every final term, listed from most to least frequent.
terms_final_flat = Counter(terms_final_flat)
sort_orders_terms_final = sorted(terms_final_flat.items(), key=lambda pair: pair[1], reverse=True)
for term, count in sort_orders_terms_final:
    print(term, count)

<br>
<br>
<br>

###  NLP Token extraction and processing

In [None]:
df_test.head()

### Processing

In [None]:
import gensim
from gensim.utils import simple_preprocess

In [None]:
# Tokenize Sentences and Clean
def sent_to_words(sentences):
    """Yield one token list per input text.

    Single quotes are stripped from each text, which is then tokenized with
    gensim's ``simple_preprocess`` (called with ``deacc=False``).

    Args:
        sentences: iterable of text strings.

    Yields:
        The list of tokens produced for each text, in input order.
    """
    for text in sentences:
        # Disabled clean-up steps, kept for reference:
        #text = re.sub('\S*@\S*\s?', '', text)  # remove emails
        #text = re.sub('\s+', ' ', text)  # remove newline chars
        unquoted = re.sub("\'", "", text)  # remove single quotes
        tokens = gensim.utils.simple_preprocess(str(unquoted), deacc=False)  # modified
        yield tokens

In [None]:
# Main functions
def remove_stopwords(texts, stop_words):
    """Tokenize every document and drop the tokens found in ``stop_words``.

    Args:
        texts: iterable of documents; each one is converted with ``str()`` and
            tokenized with gensim's ``simple_preprocess`` (``deacc=False``).
        stop_words: collection of tokens to discard.

    Returns:
        A list with one list of remaining tokens per document.
    """
    # Convert once to a set: the stop-word collection is typically a long list
    # and would otherwise be scanned in full for every token.
    stop_word_set = frozenset(stop_words)
    return [
        [
            word
            for word in gensim.utils.simple_preprocess(str(doc), deacc=False)
            if word not in stop_word_set
        ]
        for doc in texts
    ]

#
def lemmatization(texts, allowed_postags=('PROPN', 'NOUN', 'ADJ', 'ADP')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list is joined with spaces and passed to ``stNLP`` (defined
    elsewhere in this notebook; presumably the Stanza pipeline - confirm).
    For every word of every sentence in the result, the lower-cased lemma is
    kept when the word's ``pos`` is one of ``allowed_postags``.

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: POS tags to keep. The default is an immutable tuple
            so that it cannot be altered between calls; lists are still
            accepted.

    Returns:
        A list with one list of lower-cased lemmas per document.
    """
    allowed = frozenset(allowed_postags)
    texts_out = []
    for tokens in texts:
        doc = stNLP(" ".join(tokens))
        texts_out.append([
            word.lemma.lower()
            for sentence in doc.sentences
            for word in sentence.words
            if word.pos in allowed
        ])
    # remove stopwords once more after lemmatization (currently disabled)
    #texts_out = [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]
    return texts_out

In [None]:
# Convert the cleaned document texts to a plain Python list
data = df_test['extracted_cleaned'].values.tolist()
# Tokenize every document; sent_to_words is a generator, so materialize it
data_words = list(sent_to_words(data))
# Preview the tokens of the first document
pprint(data_words[:1])

In [None]:
%%time
# Remove Stop Words
# (disabled here: the call below is left commented out)
#data_words_nostops = remove_stopwords(data_words, final_stop_words)

# Data Lemmatized
# Only proper nouns, nouns and adjectives are kept: 'ADP', which is part of
# the function's default tag list, is not passed in this call.
data_lemmatized = lemmatization(data_words, allowed_postags=['PROPN', 'NOUN', 'ADJ'])

In [None]:
data_lemmatized[0]

In [None]:
# Word evaluation: gather every lemmatized token, then show the vocabulary size
word_stats_only_tokens = [
    token for doc_tokens in data_lemmatized for token in doc_tokens
]
len(set(word_stats_only_tokens))

In [None]:
final_stop_words = final_stop_words + ['iec', 'iso', 'nist', 'pr', 'sr', 'apo', 'isa', 'sp', 'cobit', 'rev', 'the', 'dss', 'and', 'of', 'csc', 'cis', 'or', 'vez', 'itu', 'usuarioen', 'sc', 'bai', 'ca', 'ref', 'año', 'co', 'sin', 'embargo', 'an', 'ac', 'respecto', 'pm', 'cabo', 'banco', 'sa', 'inglés', 'cp', 'ic', 're', 'subsector', 'subcategoría', 'rsi', 'tanto', 'ia', 'sigla', 'eldesarrollo', 'sd', 'cm', 'pi', 'pe', 'rc', 'is', 'ens', 'mc', 'gi', 'for', 'cuanto', 'capabilitie', 'so', 'día', 'by', 'that', 'ccn', 'ra', 'gr', 'ver', 'type', 'ga', 'csi', 'oea', 'es', 'ps', 'caribe', 'bid', 'are', 'au', 'ae', 'on', 'note', 'as', 'mes', 'cf', 'be', 'tal', 'decir', 'sf', 'one', 'acrónimo', 'pd', 'taller', 'españa', 'which', 'ad', 'cs', 'ma', 'agencia', 'etapa', 'bahama', 'go', 'mea', 'er', 'tp', 'ión', 'ción', 'solo', 'pl', 'mp', 'diciembre', 'guyana', 'costa', 'chile', 'último', 'panamá', 'with', 'physical', 'can', 'gh', 'falta', 'pesar', 'europa', 'not', 'octubre', 'junio', 'préstamo', 'barbado', 'brasil', 'república', 'acreditación', 'from', 'category', 'plazo', 'hoc', 'program', 'abril', 'enero', 'san', 'edm', 'bia', 'enisa', 'semana', 'ddo', 'inc', 'mano', 'pregunta', 'eps', 'gracias', 'trinidad', 'simposio', 'salvador', 'espíritu', 'santo', 'another', 'more', 'at', 'reunión', 'conferencia', 'fecha', 'idente', 'obstante', 'idad', 'marzo', 'scrm', 'dado', 'ministro', 'based', 'budapest', 'saint', 'nevis', 'provided', 'using', 'this', 'such', 'herrmann', 'uruguay', 'site', 'segur', 'it', 'ion', 'aa', 'noviembre', 'argentino', 'belice', 'haití', 'paraguay', 'rd', 'kitt', 'granadina', 'pgp', 'used', 'any', 'where', 'least', 'related', 'institute', 'uy', 'rp', 'iberseguridad', 'corea', 'ico', 'nes', 'también', 'torno', 'lidad', 'def', 'coord', 'ibersegur', 'lar', 'ds', 'mancomunidad', 'mayo', 'barbuda', 'ii', 'septiembre', 'julio', 'ja', 'perú', 'venezuela', 'par', 'pro', 'crl', 'day', 'owner', 'cnss', 'include', 'applications', 'object', 'other', 'may', 'but', 'their', 
'have', 'between', 'order', 'need', 'mua', 'ieee', 'cip', 'latinoamérica', 'ge', 'safe', 'reino', 'ue', 'cr', 'recuperac', 'iber', 'bg', 'febrero', 'policy', 'belize', 'colombia', 'iii', 'granada', 'gy', 'mx', 'santo', 'cesión', 'em', 'presa', 'mente', 'coa', 'dad', 'nado', 'assigned', 'responsibility', 'official', 'inst', 'ability', 'example', 'defined', 'some', 'set', 'party', 'relationship', 'does', 'might', 'support', 'operation', 'customers', 'drae', 'needed', 'run', 'capabilities', 'performance', 'target', 'provide', 'also', 'measure', 'organization', 'cnssi_', 'privilege', 'its', 'know', 'modification', 'default', 'most', 'eco', 'subdivisión', 'csf', 'american', 'ansi', 'abm', 'abreviatura', 'setiembre', 'art', 'north', 'corporation', 'good', 'appropriate', 'measur', 'rs', 'man', 'cci', 'ucrania', 'trend', 'root', 'corp', 'sur', 'situ', 'acls', 'tación', 'tructura', \
                                      'ís', 'responsabi', 'icac', 'ica', 'cana', 'iva', 'lataforma', 'igac', 'fac', 'iesgo', 'serv', 'ios', 'desarro', 'tecno', 'us', 'kiviat', 'fa', 'cea', 'ee', 'cps', 'usc', 'cmm', 'loan', 'pbl', 'ba', 'israel', 'indio', 'ran', 'caminoa', 'mintic', 'india', 'ecuador', 'nº', 'gob', 'guatemala', 'get', 'guyanés', 'francés', 'conatel', 'hondura', 'méxico', 'mitic', 'agosto', 'dpsm', 'suriname', 'tt', 'cita', 'ne', 'lopd', 'tre', 'trato', 'favor', 'español', 'res', 'pue', 'cabida', 'sis', 'mas', 'ans', 'coe', 'otan', 'iv', 'necessarily', 'under', 'put', 'behalf', 'out', 'actions', 'either', 'both', 'overall', 'them', 'dsaas', 'when', 'certain', 'conditions', 'tests', 'instance', 'mentira', 'generally', 'applied', 'sec', 'against', 'objetivo', 'objective', 'levels', 'ss', 'antel', 'ceibal', 'ínsita', 'emg', 'agesic', 'firewa', 'll', 'bidbanco', 'oeaorganización', 'guidela', 'gridsla', 'iisf', 'consortium', 'iic', 'iiot', 'crf', 'unece', 'united', 'nations', 'economic', 'commission', 'deming', 'act', 'erm', 'purduemodelo', 'purdue', 'dnp', 'opc', 'ua', 'gridred', 'dcssistemas', 'rtuunidad', 'ataqueconjunto', 'ataquemétodo', 'idssistema', 'ais', 'central', 'siemsistema', 'soccentro', 'cisodirector', 'middle', 'alc', 'industroyer', 'turquía', 'farewell', 'bellingham', 'australia', 'david', 'salt', 'sobig', 'csx', 'sasser', 'british', 'airway', 'delta', 'tehama', 'colusa', 'dugu', 'gauss', 'dragon', 'exxon', 'shell', 'bp', 'hemisferio', 'oriente', 'asia', 'zotob', 'daimler', 'chrisler', 'ag', 'kwc', 'onion', 'city', 'probalidad', 'rtic', 'ipa', 'nte', 'respuestatoda', 'instar', 'lación', 'proba', 'segmen', 'infra', 'infraestructu', 'op', 'cor', 'pico', 'xx', 'hart', 'eng', 'regis', 'tra', 'end', 'point', 'respuestano', 'cisne', 'cercanía', 'transmisiónuna', 'despositivo', 'adición', 'tan', 'pt', 'edifico', 'restrición', 'un sitio', 'aplicacio', 'billón', 'usd', 'proteccióna', 'naciona', 'igar', 'lega', 'lmente', 'lu', 'jo', 'lat', 'hac', 
'abr', 'preparac', 'automát', 'env', 'laborat', 'irtua', 'invest', 'izac', 'mit', 'contro', 'ive', 'orquestac', 'recuperaciónc', 'inac', 'especia', 'ías', 'inado', 'mater', 'imulac', 'inada', 'detecc', 'acc', 'restaurac', 'compet', 'habi', 'desarrol', 'istema', 'prote', 'jan', 'conf', 'idenc', 'ing', 'exce', 'lenc', 'profe', 'doe', 'dhs', 'mil', 'ef', 'ej', 'sumin', 'iclo', 'ponibilidad', 'vulnerabilida', 'dcpd', 'hr', 'dólar', 'rmp', 'fu', 'nc', 'ar', 'ross', 'et', 'quehacer', 'oxford', 'organizacionesy', 'cumbre', 'siglo', 'xxi', 'chino', 'icic', 'pwc', 's', 'puc', 'bolivia', 'cgii', 'br', 'numeral', 'ente', 'colcert', 'colombiano', 'delegatura', 'europol', 'micitt', 'desde', 'costarricense', 'dominiqués', 'dominica', 'arcotel', 'ecucert', 'deloitte', 'sv', 'encargada', 'grenada', 'luz', 'gt', 'getsafeonline', 'summer', 'incibe', 'jamaica', 'mstem', 'sarrollo', 'mexicano', \
                                       'citi', 'py', 'pcm', 'peruano', 'kittsand', 'dpm', 'ttconnect', 'gub', 'suscerte', 'sede', 'sucer', 'venezolano', 'cos', 'ceptible', 'uti', 'dida', 'grama', 'có', 'digo', 'infor', 'mación', 'cas', 'ejem', 'plo', 'jun', 'profesiona', 'poste', 'cer', 'na', 'moneda', 'lssi', 'edi', 'electronic', 'actuali', 'congéner', 'riante', 'comunicacio', 'ghz', 'trozo', 'tico', 'gan', 'cho', 'abo', 'gado', 'di', 'nero', 'kilómetro', 'sue', 'longi', 'interesar', 'conocimien', 'ci', 'frado', 'intimi', 'activi', 'traseña', 'car', 'pio', 'embar', 'ciu', 'mos', 'progra', 'cues', 'tión', 'db', 'utili', 'minio', 'ñan', 'd', 'perso', 'transferen', 'cia', 'gar', 'glés', 'miento', 'apli', 'infrastructu', 'grafía', 'transaccio', 'cla', 'za', 'sqldefinición', 'vali', 'dación', 'intru', 'tec', 'nología', 'cortafue', 'landefinición', 'elenco', 'admi', 'nistración', 'emisor', 'seguri', 'indi', 'tarje', 'ssc', 'nera', 'dose', 'apa', 'renta', 'cumento', 'dar', 'siste', 'gocio', 'tele', 'termediario', 'tario', 'mini', 'proce', 'dente', 'servi', 'cio', 'ri', 'vest', 'shamir', 'adleman', 'ase', 'mencio', 'usua', 'rio', 'ciberdelincuen', 'dress', 'fre', 'cuencia', 'ssldefinición', 'ingle', 'transport', 'fun', 'ciona', 'propagar', 'direc', 'soft', 'ware', 'men', 'ministrador', 'orde', 'tocol', 'tar', 'zero', 'ciberdelin', 'ter', 'sci', 'subsanación', 'configuracio', 'ied', 'atlántico', 'norte', 'igf', 'foc', 'ccd', 'ass', 'om', 'mcafee', 'sl', 'isdefe', 'loud', 'offered', 'invoked', 'possess', 'qualities', 'purpose', 'imply', 'reason', 'was', 'resulted', 'exercising', 'published', 'contains', 'operate', 'made', 'obtained', 'publicly', 'however', 'produced', 'likely', 'following', 'principles', 'unless', 'there', 'provisions', 'contrary', 'activitie', 'mak', 'includ', 'uses', 'entitie', 'deviz', 'csv', 'who', 'compaa', 'than', 'small', 'ón', 'own', 'sport', 'golf', 'easily', 'without', 'required', 'reenter', 'ease', 'moving', 'essence', 'here', 'achieved', 
'exactly', 'format', 'accepted', 'even', 'if', 'formats', 'simple', 'straightforward', 'achieve', 'commonly', 'rekeying', 'could', 'described', 'easy', 'three', 'representations', 'manner', 'suitable', 'guide', 'clic', 'cabalgamiento', 'two', 'different', 'instrument', 'engind', 'compound', 'therefore', 'useful', 'whose', 'existence', 'strongly', 'various', 'together', 'iaa', 'underlying', 'over', 'deployed', 'components', 'host firewalls', 'eufemismo', 'ligthweight', 'numbers', 'symbols', 'entities', 'describir', 'according', 'clear', 'rules', 'repeated', 'measuring', 'comparison', 'reference', 'measurements', 'they', 'orient', 'decisions', 'better', 'understanding', 'casual', 'relationships', 'intended', 'expectations', 'observed', 'facts', 'number', 'symbol', 'characterize', 'attribute', 'compensating', 'technical', 'safeguard', 'countermeasure', 'employed', 'lieu', 'recommended', \
                                       'low', 'moderate', 'high', 'baselín', 'provid', 'equivalent', 'decimal', 'metric', 'proposed', 'facilitate', 'decisio', 'n', 'making', 'improve', 'through', 'reporting', 'relevant', 'ribagorda', 'principle', 'should', 'each', 'granted', 'minimum', 'needs', 'multi', 'computations', 'terms', 'express', 'same', 'while', 'processe', 'feature', 'automatically', 'sesgo', 'picaresca', 'paas', 'supported', 'single', 'ce', 'potentially', 'harbor', 'serve', 'statutory', 'staff', 'directs', 'today', 'responsible', 'among', 'organizational', 'elements', 'involved', 'derived', 'after', 'agreed', 'period', 'ver_pag', 'whether', 'int', 'many', 'lead', 'what', 'actually', 'result', 'the', 'precise', 'meaning', 'remain', 'identifies', 'established', 'omb', 'appendix', 'spp', 'saa', 'fip', 'troya', 'erc', 'ksk', 'priori', 'cc', 'lucia', 'subcategorie', 'subcategory']

In [None]:
len(set(final_stop_words))

In [None]:
## Check the tokens:

In [None]:
# Flatten the per-document lemma lists into one list holding every token
tokens_final_flat = [
    token for doc_tokens in data_lemmatized for token in doc_tokens
]

In [None]:
# Count the occurrences of each token and print them, most frequent first
tokens_final_flat = Counter(tokens_final_flat)
sort_orders_tokens_final = tokens_final_flat.most_common()
for token, count in sort_orders_tokens_final:
    print(token, count)

In [None]:
## Filter out tokens: create the 'alt2_tokens' column, filled with empty
## strings, so that each row can afterwards receive its own list of tokens.
df_test['alt2_tokens'] = ''

In [None]:
%%time
# Build the per-document token lists: drop every token listed in
# `final_stop_words`, then rewrite the lemma 'datum' as 'data'.
#
# `data_lemmatized` is a plain Python list, so it is matched to the rows by
# POSITION. Indexing it with the DataFrame's index labels is only correct when
# the index happens to be exactly 0..n-1; building the column in one go and
# attaching `df_test.index` keeps every list on its own row regardless, and
# pandas raises a ValueError if the two lengths ever differ.
#
# The stop-word list is long, so it is converted to a set once to make each
# membership test O(1) instead of a full scan per token.
stop_word_set = set(final_stop_words)
df_test['alt2_tokens'] = pd.Series(
    [
        [
            token.replace('datum', 'data')
            for token in doc_tokens
            if token not in stop_word_set
        ]
        for doc_tokens in data_lemmatized
    ],
    index=df_test.index,
    dtype=object,
)

In [None]:
df_test['alt2_tokens'][0]

In [None]:
## tokens - final check

In [None]:
# One list of filtered tokens per document, in row order
tokens_final = df_test['alt2_tokens'].tolist()

In [None]:
# Flatten the filtered per-document token lists into one list
tokens_final_flat = [
    token for doc_tokens in tokens_final for token in doc_tokens
]

In [None]:
len(set(tokens_final_flat))

In [None]:
# Count the occurrences of each remaining token and print them, most frequent first
tokens_final_flat = Counter(tokens_final_flat)
sort_orders_tokens_final = tokens_final_flat.most_common()
for token, count in sort_orders_tokens_final:
    print(token, count)

##### Merge tokens and terms/n-Grams

In [None]:
# Renumber the rows in place and discard the previous index (drop=True)
df_test.reset_index(drop=True, inplace=True)

In [None]:
# For every document, append the previously obtained terms/n-grams to its tokens
data_lemmatized_full = [
    doc_tokens + doc_terms
    for doc_tokens, doc_terms in zip(df_test['alt2_tokens'], df_test['alt2_terms'])
]

In [None]:
len(data_lemmatized_full)

<br>
<br>

In [None]:
# Word evaluation: collect every token and term across all documents
word_stats = [
    token for doc_tokens in data_lemmatized_full for token in doc_tokens
]

In [None]:
# Count the occurrences of each token/term and print them, most frequent first
word_stats = Counter(word_stats)
sort_orders = word_stats.most_common()
for entry, count in sort_orders:
    print(entry, count)

# **************************************************************************************************************** 
<br>
<br>
<br>

In [None]:
# Add the merged tokens + terms (data_lemmatized_full) as a new column so the
# result is stored together with the rest of the dataset.
df_test['alt2_data_lemmatized'] = data_lemmatized_full

In [None]:
df_test

# **************************************************************************************************************** 
<br>
<br>
<br>

In [None]:
# Testing: final clean-up since some additional tokens and terms were flagged to be removed when doing the clustering analysis

In [None]:
flagged_terms = ['-PRON-', '_', 'aadt', 'aaf', 'aaps', 'aastaraamatu', 'ababa', 'abac', 'abbreviation', 'abc', 'abd', \
                 'abovementioned', 'aforementioned', 'abraham', 'abrams', 'abs', 'zzz', 'ºc', 'õppetunnid', 'ˇthi', 'μs', '_recommendation', \
                 '→_insufficient', '→_insufficient', '→_weak', '→_weak', 'resolution_→', 'resolution_→', '→_weak_strategic_integral_management', '→_weak_strategic_integral_management', '→_lack', '→_lack', 'sector_→', 'sector_→', 'sector_→', 'private_sector_→', 'private_sector_→', 'private_sector_→', '→_weak_strategic_integral', '→_weak_strategic_integral', '→_lack_of_specialist', '→_lack_of_specialist', '→_weak_strategic', '→_weak_strategic', \
                 'framework_©_customer', 'framework_©_customer', 'framework_©', 'framework_©', 'framework_©', 'digital_transformation_framework_©_customer_insight', 'digital_transformation_framework_©_customer_insight', 'transformation_framework_©_customer_insight', 'transformation_framework_©_customer_insight', 'digital_transformation_framework_©', 'digital_transformation_framework_©', 'digital_transformation_framework_©', 'framework_©_customer_insight', 'framework_©_customer_insight', 'use_digital_transformation_framework_©', 'use_digital_transformation_framework_©', 'transformation_framework_©', 'transformation_framework_©', 'transformation_framework_©', '©_customer_insight', '©_customer_insight', 'digital_transformation_framework_©_customer', 'digital_transformation_framework_©_customer', 'transformation_framework_©_customer_insight_customer', 'transformation_framework_©_customer_insight_customer', 'framework_©_customer_insight_customer', 'framework_©_customer_insight_customer', 'transformation_framework_©_customer', 'transformation_framework_©_customer', '©_customer', '©_customer', '©_customer_insight_customer', '©_customer_insight_customer', 'use_digital_transformation_framework_©_customer', 'use_digital_transformation_framework_©_customer', \
                 'μs', 'offset_of_μs']


In [None]:
%%time
# Sanity check: for every document, print the entries of `flagged_terms` that
# are still present in its merged token/term list (an empty list means none
# were found). Nothing is modified here.
flagged_set = set(flagged_terms)  # O(1) lookups; also collapses duplicate entries
for index, doc_tokens in df_test['alt2_data_lemmatized'].items():
    print(index, [word for word in doc_tokens if word in flagged_set])

# **************************************************************************************************************** 
<br>
<br>
<br>

In [None]:
# v1.2: Store df_test containing terms
# Only the document name, the cleaned text and the derived term/token columns
# are written, as a bz2-compressed joblib file (level 3) under ./output/.
f_df_test = 'nlp_spec_docs_2021-02-08_english_v12_final.joblib'
joblib.dump(df_test[['Short_Name', 'extracted_cleaned', 'alt2_terms', 'alt2_tokens', 'alt2_data_lemmatized']], './output/' + f_df_test + '.bz2', compress=('bz2', 3))#

In [None]:
# v1.2: Store df_test full
# Writes the whole DataFrame (all columns) as a bz2-compressed joblib file
# (level 3) under ./output/.
f_df_test_full = 'nlp_spec_docs_2021-02-08_english_v12_FULL.joblib'
joblib.dump(df_test, './output/' + f_df_test_full + '.bz2', compress=('bz2', 3))#

# **************************************************************************************************************** 
<br>
<br>
<br>

In [None]:
# v1.0: Store df_test containing terms
# NOTE(review): this cell reuses the name f_df_test and dumps the CURRENT
# df_test under the older 2020-12-09 / v1 file name - confirm it is kept for
# reference only and is not meant to be re-run.
f_df_test = 'nlp_spec_docs_2020-12-09_english_v1_final.joblib'
joblib.dump(df_test[['Short_Name', 'extracted_cleaned', 'alt2_terms', 'alt2_tokens', 'alt2_data_lemmatized']], './output/' + f_df_test + '.bz2', compress=('bz2', 3))#

In [None]:
## v1.0: Store data_lemmatized_full (English)
# The merged token/term lists are pickled to a binary file under ./output/;
# the `with` block closes the file once the dump is done.
with open('./output/data_lemmat_spec_docs_2020-12-09_english_v1_final.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(data_lemmatized_full, filehandle)

In [None]:
# v1.0: Store df_test full - including the NLP annotation columns
# NOTE(review): the original comment said the annotation was done by Spacy,
# while this notebook is the Stanza version - confirm which one applies.
# NOTE(review): f_df_test is reassigned here, and the whole current df_test is
# written under the 2020-12-09 / v1 file name.
f_df_test = 'nlp_spec_docs_annotated_2020-12-09_english_v1_final.joblib'
joblib.dump(df_test, './output/' + f_df_test + '.bz2', compress=('bz2', 3))#

In [None]:
### 

# **************************************************************************************************************** #
<br>
<br>
<br>

In [None]:
#'''
# **************************************************************************************************************** #
# ********************************************  Version Control  ************************************************* #
# **************************************************************************************************************** #
  
#   Version:            Date:                User:                     Change:                                       



#   - 1.0           12/08/2020        Emiliano Colina        - Forked from Spanish specialized documents processed
#                                                            with Stanza
                                                                                                                  
#   - 0.1           10/13/2020        Emiliano Colina        - Initial version
#                                                            - All types of documents included.


#
# **************************************************************************************************************** #
#'''
