### doc_entity
ESTABELECE RELAÇÕES ENTRE DOCUMENTOS E ENTIDADES NA BASE DE DADOS NO MYSQL. TRABALHAMOS COM O ACERVO **ANTONIO AZEREDO DA SILVEIRA, MINISTÉRIO DAS RELAÇÕES EXTERIORES**.  

AS ENTIDADES A SEREM TRABALHADAS SÃO:
* PAÍSES
* PESSOAS

In [5]:
import subprocess
import sys
import os
import pickle
import re
import pandas as pd
from IPython.display import clear_output
import getpass
import pymysql

In [11]:
# Encoding used when reading the corrected document text files.
encoding_type = 'ISO-8859-1'

# MySQL account used when loading results into the database.
sql_user = 'marcelobribeiro'
# sql_user = 'rsouza'

In [17]:
# Project root, relative to the notebook's working directory.
path = "../../"
# Folder holding the corrected document text files read during extraction.
path_inputs = '{}textfiles-corrected-ignorecase/'.format(path)

In [7]:
# Part-of-speech (word class) tags recognised in the parser output, mapped to
# a human-readable label.  palavras_tagger() only tests membership of the keys
# to pick a token's POS tag; the labels are descriptive.
# 'BL', 'ES' and 'NW' are the markers palavras_tagger() assigns itself to
# blank lines, sentence ends and non-word tokens.
WORD_CLASSES = {
    'N': 'Nouns',
    'PROP': 'Proper nouns',
    'SPEC': 'Specifiers',
    'DET': 'Determiners',
    'PERS': 'Personal pronouns',
    'ADJ': 'Adjectives',
    'ADV': 'Adverbs',
    'V': 'Verbs',
    'NUM': 'Numerals',
    'PRP': 'Preposition',
    'KS': 'Subordinating conjunctions',
    'KC': 'Coordinating conjunctions',  # label typo ("Coordinationg") fixed
    'IN': 'Interjections',
    'EC': 'Hyphen-separated prefix',
    'BL': 'Blank Line',
    'ES': 'End of Sentence',
    'NW': 'Non Word',
}

# Inflexion tags (gender, number, case, person, tense, mood, finiteness)
# mapped to a human-readable label.  palavras_tagger() keeps every token tag
# that is a key of this table as part of the token's inflexion information.
INF_TAGS = {
    # gender
    'M': 'Male gender',
    'F': 'Female gender',
    'M/F': 'Neutral gender',
    # number
    'S': 'Singular number',
    'P': 'Plural number',
    'S/P': 'Neutral number',
    # case
    'NOM': 'Nominative case',
    'ACC': 'Accusative case',
    'DAT': 'Dative case',
    'PIV': 'Prepositive case',
    'ACC/DAT': 'Accusative-Dative case',
    # This entry used to be keyed 'DAT' a second time, which silently replaced
    # the 'Dative case' label above and left the NOM/PIV tag unrecognised.
    'NOM/PIV': 'Nominative-Prepositive case',
    # person
    '1': 'First person',
    '2': 'Second person',
    '3': 'Third person',
    '1S': 'First person singular',
    '2S': 'Second person singular',
    '3S': 'Third person singular',
    '1P': 'First person plural',
    '2P': 'Second person plural',
    '3P': 'Third person plural',
    '1/3S': 'First or Third person singular',
    '0/1/3S': 'Impersonal or First or Third person singular',
    # tense (labels kept in Portuguese, as in the original table)
    'PR': 'presente simples',
    'IMPF': 'preterito imperfeito',
    'PS': 'preterito perfeito',
    'MQP': 'preterito mais-que-perfeito',
    'FUT': 'futuro do presente',
    'COND': 'futuro do preterito',
    # mood
    'IND': 'indicativo',
    'SUBJ': 'subjuntivo',
    'IMP': 'imperativo',
    # finiteness
    'VFIN': 'Verbo Finito',
    'INF': 'infinitivo',
    'PCP': 'participio',
    'GER': 'gerundio',
}

# Syntactic function tags (all start with '@') mapped to a short description.
# In the paired entries, '>' / '<' show whether the head the constituent
# attaches to lies to the right or to the left.
SYN_TAGS = dict((
    # --- clause arguments: attach to the nearest main verb to the left [<]
    #     or to the right [>]
    ('@SUBJ>', 'subject'),
    ('@<SUBJ', 'subject'),
    ('@ACC>', 'accusative direct object'),
    ('@<ACC', 'accusative direct object'),
    ('@DAT>', 'dative object only pronominal'),
    ('@<DAT', 'dative object only pronominal'),
    ('@PIV>', 'prepositional indirect object'),
    ('@<PIV', 'prepositional indirect object'),
    ('@ADVS> / @SA>', 'adverbial object (place, time, duration, quantity), subject-related'),
    ('@<ADVS / @<SA', 'adverbial object (place, time, duration, quantity), subject-related'),
    ('@ADVO> / @OA>', 'adverbial object object-related'),
    ('@<ADVO / @<OA', 'adverbial object object-related'),
    ('@SC>', 'subject predicative'),
    ('@<SC', 'subject predicative'),
    ('@OC>', 'object predicative'),
    ('@<OC', 'object predicative'),
    ('@ADVL>', 'adverbial'),
    ('@<ADVL', 'adverbial'),
    ('@PASS>', 'agent of passive'),
    ('@<PASS', 'agent of passive'),
    # --- free constituents of non-sentence expressions
    ('@ADVL', 'free adverbial phrase in non-sentence expression'),
    ('@NPHR', 'free noun phrase in non-sentence expression without verbs'),
    ('@VOK', 'vocative'),
    # --- adnominal adjects: attach to the nearest NP-head that is not an
    #     adnominal itself
    ('@>N', 'prenominal adject'),
    ('@N<', 'postnominal adject'),
    # ... or predicate in a small clause introduced by com/sem
    ('@N<PRED', 'postnominal in-group predicative'),
    ('@APP', 'identifying apposition'),
    # --- adverbial adjects
    # attaches to the nearest ADJ/PCP/ADV or attributively used N to the right
    ('@>A', 'prepositioned adverbial adject'),
    # ... or dependent/argument of an attributive participle (with function
    # tag attached)
    ('@A<', 'postpositioned adverbial adject'),
    # --- free predicatives
    # refers to the following @SUBJ, even when it is incorporated in the VP
    ('@PRED>', 'forward free predicative'),
    # refers to the nearest NP-head, or the nearest @SUBJ, to the left
    ('@<PRED', 'backward free predicative'),
    ('@P<', 'argument of preposition'),
    ('@S<', 'sentence anaphor'),
    # --- verb chain
    ('@FAUX', 'finite auxiliary'),
    ('@FMV', 'finite main verb'),
    ('@IAUX', 'infinite auxiliary'),
    ('@IMV', 'infinite main verb'),
    ('@PRT-AUX<', 'verb chain particle'),
    # --- conjunctions, comparatives and discourse markers
    ('@CO', 'coordinating conjunction'),
    ('@SUB', 'subordinating conjunction'),
    ('@KOMP<', 'argument of comparative'),
    ('@COM', 'direct comparator without preceding comparative'),
    ('@PRD', 'role predicator'),
    ('@FOC>', 'focus marker'),
    ('@<FOC', 'focus marker'),
    ('@TOP', 'topic constituent'),
    # --- subclauses: these combine with a clausal role and an intraclausal
    #     word tag
    # e.g. @#FS-<ACC @SUB for "não acredito que seja verdade"
    ('@#FS-', 'finite subclause'),
    # e.g. @#ICL-SUBJ> @IMV in "consertar um relógio não é fácil"
    ('@#ICL-', 'infinite subclause'),
    # refers to the preceding auxiliary: @FAUX - @#ICL-AUX< is used when both
    # verbs share the subject, @FMV - @#ICL-<ACC when the subjects differ
    ('@#ICL-AUX<', 'argument verb in verb chain'),
    # e.g. @#AS-<ADVL @ADVL> in "ajudou onde possível"
    ('@#AS-', 'averbal subclause'),
    ('@AS<', 'argument of complementiser in averbal subclause'),
))

# Lookup from a word-class tag fused with an inflexion tag (e.g. 'ADJ1S') to
# the same two tags separated by a space ('ADJ 1S').
# NOTE(review): judging by the name, this presumably repairs tags that the
# parser's '--dep' mode glues together; it is not referenced in the visible
# part of the notebook -- confirm against its callers.
# The table is the full product of the 14 word classes and 35 inflexion tags
# below, stored in key-sorted order.
DEP_MOD_BUG = dict(sorted(
    (word_class + inflexion, word_class + ' ' + inflexion)
    for word_class in ('ADJ', 'ADV', 'DET', 'EC', 'IN', 'KC', 'KS', 'N',
                       'NUM', 'PERS', 'PROP', 'PRP', 'SPEC', 'V')
    for inflexion in ('0/1/3S', '1', '1/3S', '1P', '1S', '2', '2P', '2S',
                      '3', '3P', '3S', 'ACC', 'ACC/DAT', 'COND', 'DAT', 'F',
                      'FUT', 'GER', 'IMP', 'IMPF', 'IND', 'INF', 'M', 'M/F',
                      'MQP', 'NOM', 'P', 'PCP', 'PIV', 'PR', 'PS', 'S', 'S/P',
                      'SUBJ', 'VFIN')
))

In [8]:
# Encodings of the input files, keyed by an integer code.
FILES_ENCODING = {1: 'UTF-8', 2: 'ISO-8859-15'}
# Encoding applied to the text piped into the parser by palavras_tagger().
PALAVRAS_ENCODING = sys.getfilesystemencoding()

# Location of the PALAVRAS installation and of the scripts used from it.
PALAVRAS_PATH = '/opt/palavras/'
base_parser = '{}por.pl'.format(PALAVRAS_PATH)
np_parser = '{}bin/extract_np.pl'.format(PALAVRAS_PATH)
# Command-line flag handed to base_parser by palavras_tagger().
parser_mode = '--dep'

def palavras_tagger(text):
    """Run the PALAVRAS parser on *text* and collect its annotations.

    The text is encoded with ``PALAVRAS_ENCODING`` and piped to
    ``base_parser`` (started with ``parser_mode``).  The parser's standard
    output is decoded as UTF-8 and read line by line; each line yields at most
    one record.

    Returns a 3-tuple:
      * a list of 7-item records
        ``[word, lemma, pos_tag, secondary_tags, inflexion_tags,
        syntactic_tags, chunks]`` ("palavras style");
      * a list of ``(lemma, pos_tag)`` pairs built from those records
        ("nltk style");
      * the parser's raw standard output (bytes).
    """
    parser = subprocess.Popen([base_parser, parser_mode],
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    # The parser's stderr is captured but not inspected.
    stdout, _ = parser.communicate(text.encode(PALAVRAS_ENCODING))

    records = []
    for raw_line in stdout.decode('utf-8').split('\n'):
        line = raw_line.strip()

        # Blank line.
        if not line:
            records.append(['SPACE', '[ignore]', 'BL', '', '', '', ''])
            continue

        # Lines opening with '<' are recorded as sentence ends.
        if line.startswith('<'):
            records.append(['STOP', '[ignore]', 'ES', '', '', '', ''])
            continue

        fields = line.split()
        # Every whitespace-separated field starting with '#' is glued into
        # the record's last column.
        chunks = ''.join(field for field in fields if field.startswith('#'))

        # Lines opening with '$' carry a non-word token (digits count as NUM).
        if line.startswith('$'):
            symbol = fields[0][1:]
            symbol_class = 'NUM' if symbol.isdigit() else 'NW'
            records.append([symbol, '[ignore]', symbol_class, '', '', '', chunks])
            continue

        # Regular token line: "<word>\t<lemma> <tag> <tag> ...".
        surface, tab, annotation = line.partition('\t')
        if not tab:
            continue  # malformed line without a tab separator: discard
        annotation_fields = annotation.split()
        lemma = annotation_fields[0]
        tags = annotation_fields[1:]

        # Only the first word-class tag counts, to avoid picking two classes.
        pos_tag = next((tag for tag in tags if tag in WORD_CLASSES), None)
        if pos_tag is None:
            continue  # no known word class on this line: discard

        secondary_tag = ' '.join(tag for tag in tags if tag.startswith('<'))
        inflexion_tag = ' '.join(tag for tag in tags if tag in INF_TAGS)
        syntactic_tag = ''.join(tag for tag in tags if tag.startswith('@'))
        records.append([surface.strip(), lemma, pos_tag, secondary_tag,
                        inflexion_tag, syntactic_tag, chunks])

    lemma_pos_pairs = [(record[1], record[2]) for record in records]
    return records, lemma_pos_pairs, stdout  # palavras style, nltk style

### pega listas de nomes e ids de documentos que serão usados

In [18]:
# Pickle files holding the names table and the list of document file names.
# os.path.join avoids the doubled separator that `path + '/...'` produced
# ('../..//names_dataframe.pkl'), since `path` already ends with '/'.
names_dataframe_file = os.path.join(path, 'names_dataframe.pkl')
doc_id_list_file = os.path.join(path, 'doc_id_list.pkl')

In [None]:
# Optional step: load the previously pickled names table and document list.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by this project.
# The files are opened in `with` blocks so the handles are closed after
# loading (the bare open() calls used before were never closed).
with open(names_dataframe_file, 'rb') as names_pickle:
    names_data = pickle.load(names_pickle)
with open(doc_id_list_file, 'rb') as doc_ids_pickle:
    doc_id_list = pickle.load(doc_ids_pickle)

In [5]:
# Preview the first ten rows of the names table.
names_data.head(10)

Unnamed: 0_level_0,alt_names,id,source,has_ambiguous_surname,short_fullname,short_surname,important_name
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a. a. fisher,"[a. a. fisher, fisher]",104303,frus,False,True,False,False
a. azevedo,"[a. azevedo, azevedo]",500001,cpdoc,True,True,False,False
a. bentinck,"[a. bentinck, bentinck]",101196,frus,False,True,False,False
a. f. susin,"[a. f. susin, susin]",113218,frus,False,True,False,False
a. faria,"[a. faria, faria]",500002,cpdoc,True,True,False,False
a. i. de lima,"[a. i. de lima, de lima]",500003,cpdoc,True,True,True,False
a. j. peixoto de castro,"[a. j. peixoto de castro, peixoto de castro]",500004,cpdoc,True,False,False,False
a. l. conrad,"[a. l. conrad, conrad]",102792,frus,False,True,False,False
a. m. malik,"[a. m. malik, malik]",108405,frus,False,True,False,False
abdel meguid,[abdel meguid],108986,frus,False,False,False,False


### faz extração de entidades
Conta o número de ocorrências de cada nome (da lista de nomes) em cada documento, verificando algumas condições:
* analisamos o tamanho do nome e dos sobrenomes, ambiguidades etc.;
* usamos o 'palavras' para verificar se o nome não é, na verdade, uma rua (`<top>`), uma cidade (`<civ>`), uma organização (`<org>`) ou uma instituição (`<inst>`).

In [17]:
# One [person_id, name, source] record per row of the names table.
# With itertuples(), position 0 is the row index (the name); positions 2 and 3
# are taken as the id and the source.
persons_list = [
    [entry[2], entry[0], entry[3]]
    for entry in names_data.itertuples()
]

### Extração tipo 1 (usada)
* Número maior de Falsos Positivos. Número menor de Falsos Negativos.

In [None]:
# Type-1 extraction: count how often each person is mentioned in each document.
#
# Results:
#   person_doc_list            -- [person_id, doc_id, freq_count] for every
#                                 person still counted after the entity check
#   extraction_validation_list -- [person_id, doc_id] for names found in the
#                                 text but cancelled by the entity check
#                                 (kept for later analysis)
person_doc_list = []
extraction_validation_list = []

# Secondary tags marking a token as an organisation, place, city or
# institution, i.e. something that is not a person.
NON_PERSON_MARKERS = ('<org>', '<top>', '<civ>', '<inst>')


def _flag_is_set(value):
    """Return True when a names_data flag holds True, as a bool or as text."""
    return str(value).strip().lower() == 'true'


# Decide once per person how the name is searched, instead of once per
# (document, person) pair.
# The previous condition spliced triple-quoted strings between its lines as a
# line-continuation trick; those strings were concatenated into the compared
# literals ('False' + '\n    ...'), so the test could never succeed and the
# alternative-name branch never ran.  It also relied on DataFrame.ix, which no
# longer exists in current pandas.
person_matchers = []
for row_name in names_data.itertuples():
    name = row_name[0]
    alt_names = row_name.alt_names
    search_alt_names = (
        (not _flag_is_set(row_name.has_ambiguous_surname)
         and not _flag_is_set(row_name.short_surname)
         and not _flag_is_set(row_name.short_fullname))
        or _flag_is_set(row_name.important_name)
    )
    if search_alt_names:
        # Whole-word match of every alternative name.  re.escape keeps
        # characters such as '.' in "a. a. fisher" literal; empty names are
        # skipped because r'\b\b' would match at every word boundary.
        patterns = [re.compile(r'\b' + re.escape(alt_name.lower()) + r'\b')
                    for alt_name in alt_names if alt_name]
    else:
        patterns = None  # fall back to counting the full name only
    person_matchers.append((row_name.id, name, alt_names, patterns))

total_docs = len(doc_id_list)

# Iterate over the documents.
for count_file, file in enumerate(doc_id_list):
    doc_id = re.sub(r'AAS_mre_(.*)\.txt', r'\1', file)

    # Progress report every 100 documents.  It is computed from the running
    # index, so it no longer divides by zero for fewer than 100 documents.
    if count_file % 100 == 0:
        clear_output()
        print(100 * count_file // total_docs, '% done')

    with open(os.path.join(path_inputs, file), 'r', encoding=encoding_type) as f:
        text = f.read()

    # Lemmas that PALAVRAS marks as non-person entities in this document,
    # normalised to lower case with '[', ']' removed and '=' turned into ' '.
    tag_checklist = []
    for token in palavras_tagger(text)[0]:
        secondary_tags = token[3]
        if any(marker in secondary_tags for marker in NON_PERSON_MARKERS):
            tag = re.sub(r'[\[\]]', '', token[1])
            tag_checklist.append(tag.replace('=', ' ').lower())
    text = text.lower()

    # Iterate over the persons.
    for person_id, name, alt_names, patterns in person_matchers:
        if patterns is not None:
            freq_count = sum(len(pattern.findall(text)) for pattern in patterns)
        else:
            freq_count = text.count(name)
        if freq_count <= 0:
            continue  # name absent from this document: nothing to record

        # Every non-person entity whose lemma equals one of this person's
        # alternative names cancels one occurrence.
        cancelled = sum(1 for tag_check in tag_checklist if tag_check in alt_names)
        freq_count -= cancelled

        if freq_count > 0:
            person_doc_list.append([person_id, doc_id, freq_count])
        else:
            # Found in the text but fully cancelled by the entity check.
            extraction_validation_list.append([person_id, doc_id])

### alimenta o banco de dados

In [18]:
# Load the extraction results into MySQL through an SSH tunnel.
# NOTE(review): SSHTunnelForwarder, ssh_priv_key and ssh_user are not defined
# in the visible part of this notebook -- confirm that they are imported and
# set before running this cell.
pass_sshkey = getpass.getpass('SSH key passphrase: ')
pass_mysql = getpass.getpass('MySQL password: ')
with SSHTunnelForwarder(('200.20.164.146', 22),
                        ssh_private_key=ssh_priv_key,
                        ssh_private_key_password=pass_sshkey,
                        ssh_username=ssh_user,
                        remote_bind_address=('127.0.0.1', 3306)) as server:

    conn = pymysql.connect(host='localhost',
                           port=server.local_bind_port,
                           user=sql_user,
                           passwd=pass_mysql,
                           db='CPDOC_AS',
                           use_unicode=True,
                           charset="utf8")
    try:
        cur = conn.cursor()

        # ---- (re)create the target tables --------------------------------
        cur.execute("DROP TABLE IF EXISTS persons2")
        cur.execute('''CREATE TABLE IF NOT EXISTS persons2
               (id VARCHAR(128) PRIMARY KEY, person_name VARCHAR(128), source VARCHAR(128)
               DEFAULT NULL)
               ENGINE=MyISAM DEFAULT CHARSET='utf8';''')

        cur.execute("DROP TABLE IF EXISTS person_doc2")
        cur.execute('''CREATE TABLE IF NOT EXISTS person_doc2
               (person_id VARCHAR(128), doc_id VARCHAR(31), person_count SMALLINT(5)
               DEFAULT NULL)
               ENGINE=MyISAM DEFAULT CHARSET='utf8';''')

        # NOTE(review): text_data is not used anywhere in this cell; the query
        # is kept in case a later cell reads it -- confirm, or drop it to
        # avoid fetching the whole docs table through the tunnel.
        cur.execute("SELECT * FROM CPDOC_AS.docs WHERE main_language = 'pt' AND (readability > 0.4 OR readability = -1) ")
        text_data = cur.fetchall()

        # ---- person/document counts --------------------------------------
        numrows = len(person_doc_list)
        query = "INSERT INTO person_doc2 VALUES (%s, %s, %s)"
        # enumerate() supplies the running count directly; list.index() was
        # quadratic and returned the wrong position for duplicated rows.
        for count, row in enumerate(person_doc_list, start=1):
            person_id = row[0]
            doc_id = row[1]
            person_count = row[2]
            cur.execute(query, (person_id, doc_id, person_count))

            # Progress report every 100 rows.
            if count % 100 == 0:
                clear_output()
                print(100 * count // numrows, '% done on first list')

        print('Finished person_doc_list. Wait just a few more minutes to finish dumping data on MySQL.')

        # ---- persons -----------------------------------------------------
        query = "INSERT INTO persons2 VALUES (%s, %s, %s)"
        for row in persons_list:
            person_id = row[0]
            name = row[1]
            source = row[2]
            cur.execute(query, (person_id, name, source))

        # Add the columns present in the History-Lab database (default NULL).
        # They are added after the inserts because the INSERT statements
        # above list exactly three values per row.
        cur.execute("ALTER TABLE persons2 ADD birth_year INT(4)")
        cur.execute("ALTER TABLE persons2 ADD death_year INT(4)")
        cur.execute("ALTER TABLE persons2 ADD description MEDIUMTEXT")
        cur.execute("ALTER TABLE person_doc2 ADD date DATETIME")

        # Explicit commit, in case the tables are ever moved to a
        # transactional storage engine.
        conn.commit()
    finally:
        # Always release the connection before the tunnel is torn down.
        conn.close()

    print('All Done')

100 % done on first list
Finished person_doc_list. Wait just a few more minutes to finish dumping data on MySQL.
All Done


### drafts

### Extração tipo 2 (não usada)
* Número menor de Falsos Positivos. Número maior de Falsos Negativos.