In [1]:
import nltk
# nltk.download() # para descargar la librería de stopwords en español

In [2]:
import numpy as np
import pandas as pd
import PyPDF2 
import os
from nltk.corpus import stopwords

In [3]:
# https://betterprogramming.pub/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f

# Read files from folder
def readPDF(path):
    """Extract the text of every PDF file found directly inside ``path``.

    Parameters
    ----------
    path : str
        Folder to scan, with or without a trailing separator. Only regular
        files whose name ends in ``.pdf`` (case-insensitive) are read, so
        stray files or sub-folders do not abort the run.

    Returns
    -------
    dict
        Three parallel lists with one entry per PDF processed, in sorted
        file-name order: ``'FileName'`` (name without the folder),
        ``'NumPages'`` (page count) and ``'Texto'`` (extracted text).

    Notes
    -----
    When PyPDF2 extracts no text at all (e.g. a scanned document) the file
    is handed to ``textract`` with the tesseract OCR backend. ``textract``
    is imported only in that case, so text-based PDFs do not need it.
    NOTE(review): the OCR language is 'eng' while the rest of the notebook
    uses Spanish stop words -- confirm which language pack is intended.
    """
    files = []
    numPages = []
    texts = []

    # Sorted so the returned lists have the same order on every run
    # (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(path)):
        # os.path.join is correct whether or not `path` ends with a separator.
        filepath = os.path.join(path, filename)
        if not (filename.lower().endswith('.pdf') and os.path.isfile(filepath)):
            continue

        # The context manager closes the handle even if parsing raises.
        with open(filepath, 'rb') as pdfFileObj:
            # NOTE(review): PdfFileReader / numPages / getPage / extractText
            # are the legacy PyPDF2 names; newer releases rename them --
            # confirm against the installed PyPDF2 version.
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num_pages = pdfReader.numPages
            # Pages must be read while the file is still open.
            text = "".join(
                pdfReader.getPage(page_index).extractText()
                for page_index in range(num_pages)
            )

        # PyPDF2 cannot read scanned/image-only files; fall back to OCR.
        if text == "":
            import textract  # optional dependency, needed only for OCR

            raw = textract.process(filepath, method='tesseract', language='eng')
            # Decode so every entry of 'Texto' is a str.
            text = raw.decode('utf-8') if isinstance(raw, bytes) else raw

        files.append(filename)
        numPages.append(num_pages)
        texts.append(text)

    consentimientos = {'FileName': files,
                       'NumPages': numPages,
                       'Texto': texts
                      }

    return consentimientos

In [4]:
# Read the PDFs under ./Docs/ into a dict of parallel lists
# (keys: 'FileName', 'NumPages', 'Texto').
consentimientos = readPDF(path='./Docs/')

In [56]:
#Mostrar los 50 términos más "centrales" en la colección
import re
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Strips digits so purely numeric tokens never reach the vocabulary
def numPreprocessor(tokens):
    """Lower-case a string and delete every digit from it.

    Parameters
    ----------
    tokens : str
        Text to normalise. The parameter name is kept for compatibility;
        the function works on any string.

    Returns
    -------
    str
        ``tokens`` in lower case with all runs of digits removed. The
        surrounding characters (including whitespace) are left untouched.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return re.sub(r'\d+', '', tokens.lower())

# Build the corpus: one raw text per document, in the order readPDF returned them
corpus = list(consentimientos['Texto'])

# TF-IDF vectorisation with Spanish stop words removed and digits stripped
# by numPreprocessor; min_df=2 is passed so that terms seen in a single
# document are left out of the vocabulary.
vectorizer = TfidfVectorizer(
    stop_words=stopwords.words("spanish"),
    min_df=2,
    preprocessor=numPreprocessor,
)
# Result stays sparse; the previous bare `vectors.toarray()` built a dense
# copy that was neither stored nor displayed, so it has been dropped.
vectors = vectorizer.fit_transform(corpus)

# Show the 50 terms with the highest summed TF-IDF weight (a relevance score, not a raw word count)
import pandas as pd

# Column-wise sum of the TF-IDF matrix: one accumulated weight per term
sums = vectors.sum(axis=0)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old name only on releases that predate it.
feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)

# np.asarray(...).ravel() flattens the single-row result into a plain vector
# so terms and weights can be paired without indexing column by column.
data = list(zip(feature_names(), np.asarray(sums).ravel()))

# 'Frecuencia' holds the summed TF-IDF weight of each term across documents
ranking = pd.DataFrame(data, columns=['Palabra','Frecuencia'])
rankingSort = ranking.sort_values('Frecuencia', ascending=False).head(50)

print(rankingSort.to_string(index=False))

         Palabra  Frecuencia
         estudio    1.173745
  consentimiento    0.789769
          nombre    0.611583
           firma    0.559065
     información    0.496565
       informado    0.486470
          médico    0.466141
            hoja    0.429728
        paciente    0.414937
           fecha    0.351803
           datos    0.346841
         riesgos    0.339713
    participante    0.309421
   representante    0.306144
       explicado    0.305176
         persona    0.300392
    investigador    0.273878
   participación    0.266331
           salud    0.242242
        atención    0.215507
           copia    0.193673
        presente    0.191707
       cualquier    0.188731
           mismo    0.182585
 características    0.180306
      beneficios    0.180306
        objetivo    0.176063
        posibles    0.175002
           pueda    0.167907
         acuerdo    0.166556
          médica    0.164972
            edad    0.156265
       preguntas    0.155850
            el

In [60]:
# http://www.corpus.unam.mx/servicio-freeling/
import requests

# Payload: the text of the first document, uploaded as the form field "file".
# NOTE(review): this sends document text to an external service over plain
# HTTP -- confirm the content is anonymised before running.
files = {'file': consentimientos['Texto'][0]}

# Query parameters
params = {'outf': 'tagged', 'format': 'json'}

# Send the request. The timeout is (connect, read) in seconds; without one,
# requests waits indefinitely if the service stops responding.
url = "http://www.corpus.unam.mx/servicio-freeling/analyze.php"
r = requests.post(url, files=files, params=params, timeout=(10, 120))
# Fail with a clear HTTP error instead of a JSON decoding error on 4xx/5xx.
r.raise_for_status()
# Decode the JSON body
obj = r.json()

In [61]:
# Preview: print every word entry of the first five sentences, one per line
for word in (entry for sentence in obj[:5] for entry in sentence):
    print(word)

{'token': 'FPNT', 'lemma': 'fpnt', 'tag': 'NCMS000', 'prob': '0.874985'}
{'token': '-', 'lemma': '-', 'tag': 'Fg', 'prob': '1'}
{'token': '07', 'lemma': '7', 'tag': 'Z', 'prob': '1'}
{'token': 'a', 'lemma': 'a', 'tag': 'SP', 'prob': '0.998775'}
{'token': '-', 'lemma': '-', 'tag': 'Fg', 'prob': '1'}
{'token': '31', 'lemma': '31', 'tag': 'Z', 'prob': '1'}
{'token': '(', 'lemma': '(', 'tag': 'Fpa', 'prob': '1'}
{'token': 'A', 'lemma': 'a', 'tag': 'SP', 'prob': '0.998775'}
{'token': ')', 'lemma': ')', 'tag': 'Fpt', 'prob': '1'}
{'token': 'Protocolo', 'lemma': 'protocolo', 'tag': 'NCMS000', 'prob': '0.980769'}
{'token': 'XXXXXXXX', 'lemma': 'xxxxxxxx', 'tag': 'AQ0CN00', 'prob': '0.901599'}
{'token': 'Versión', 'lemma': 'versión', 'tag': 'NCFS000', 'prob': '1'}
{'token': 'X', 'lemma': 'x', 'tag': 'NCFS000', 'prob': '1'}
{'token': 'de', 'lemma': 'de', 'tag': 'SP', 'prob': '0.999961'}
{'token': 'fecha', 'lemma': 'fecha', 'tag': 'NCFS000', 'prob': '0.990741'}
{'token': 'XXXXXXXXX', 'lemma': 'xx

In [58]:
# Flatten the nested response (sentences -> word dicts) into one list of
# records and build the frame in a single call. DataFrame.append was removed
# in pandas 2.0, and appending row by row copies the whole frame each time.
tag_columns = ['token', 'lemma', 'tag', 'prob']
records = [word for sentence in obj for word in sentence]
# `columns` fixes the column order and keeps the frame well-formed when
# there are no records; keys outside tag_columns are not included.
tags = pd.DataFrame(records, columns=tag_columns)

tags

Unnamed: 0,token,lemma,tag,prob
0,FPNT,fpnt,NCMS000,0.874985
1,-,-,Fg,1
2,07,7,Z,1
3,a,a,SP,0.998775
4,-,-,Fg,1
...,...,...,...,...
2870,documento,documento,NCMS000,0.997159
2871,de,de,SP,0.999961
2872,consentimiento,consentimiento,NCMS000,1
2873,informado,informar,VMP00SM,1


In [62]:
# Distinct values found in the 'tag' column
tags['tag'].unique()

array(['NCMS000', 'Fg', 'Z', 'SP', 'Fpa', 'Fpt', 'AQ0CN00', 'NCFS000',
       'DA0MS0', 'NCCS000', 'CC', 'VMP00SM', 'Fd', 'DA0FS0', 'AQ0CS00',
       'NCMP000', 'Fp', 'AQ0MS00', 'DI0MP0', 'DA0MP0', 'PP1CP00',
       'VMIS1P0', 'PP2CS0P', 'VMN0000', 'PP3CSD0', 'DI0MS0', 'PR0CN00',
       'P00CN00', 'VMIP3S0', 'VAIP3S0', 'VSP00SM', 'AQ0FS00', 'AQ0MP00',
       'Fc', 'VMIP3P0', 'DP1FSP', 'VSIP3S0', 'RG', 'CS', 'VMSP3S0',
       'DD0MS0', 'PD00S00', 'DD0FS0', 'PP1MP00', 'VMIF1P0', 'DA0FP0',
       'NCFP000', 'VMSP3P0', 'DP3CSN', 'RN', 'DI0CS0', 'PI0MS00',
       'VMIC3S0', 'VMP00SF', 'VSN0000', 'AQ0CP00', 'Fh', 'PT00000',
       'DT0CN0', 'AQ0FP00', 'VMIF3S0', 'VMG0000', 'PT0CP00', 'PP3CN00',
       'PP3MSA0', 'AO0MS00', 'I', 'VMP00PM', 'Fca', 'VMIS3S0', 'Fs',
       'VMP00PF', 'Fct', 'VMSI3S0', 'NCMN000', 'VMIC3P0', 'PI0MP00',
       'VMIF3P0', 'PP3MPA0', 'VMIP1S0', 'DI0FS0', 'DI0FP0', 'VSIC1S0',
       'VMSI3P0', 'NCCP000', 'VMIP1P0', 'DP3CPN', 'VMM03S0', 'VSIF3P0',
       'PT0CS00', 'PD