# Modelo Booleano de Recuperación de la Información

## Leer m documentos (corpus)

In [None]:
# import fitz # open pdf
import re #regex
import unicodedata # use unicode

class Document:
    """A PDF document reduced to normalised text, its vocabulary and term counts.

    Attributes set by ``__init__``:
        doc_name: path the document was read from.
        content: lowercase, accent-free text in which every run of non-word
            characters has been collapsed to a single space.
        freq_table: dict mapping each term to its number of occurrences,
            keyed in order of first appearance.
        terms: list of the distinct terms in order of first appearance.

    NOTE: on an initialised instance the ``terms`` attribute (a list) shadows
    the ``terms`` method. Internal code therefore tokenises through
    ``_tokenize`` so that ``_freq`` and ``terms_unique`` keep working after
    construction.
    """

    def __init__(self, path_to_doc):
        """Read, clean and index the PDF located at *path_to_doc*."""
        self.doc_name = path_to_doc
        self._content()
        self._freq()

    def __str__(self):
        """Return the path the document was loaded from."""
        return f"{self.doc_name}"

    def _content(self):
        """Load the raw PDF text and store its cleaned form in ``self.content``."""
        raw_doc = self._read_raw_doc(self.doc_name)
        self.content = self._clean_doc(raw_doc)

    def _freq(self):
        """Populate ``self.freq_table`` and ``self.terms`` from ``self.content``."""
        # Tokenise once; the keys of the frequency table are exactly the
        # distinct terms in order of first appearance.
        tokens = self._tokenize(self.content)
        self.freq_table = self.freq_term_table(tokens)
        self.terms = list(self.freq_table)

    def _tokenize(self, clean_document):
        """Return every maximal run of word characters in *clean_document*.

        Unlike splitting on separators, this never yields empty strings, so
        empty text produces an empty list.
        """
        return re.findall(r'\w+', clean_document)

    def terms(self, clean_document):
        """Return the terms of *clean_document* in order, duplicates included.

        Shadowed by the ``terms`` list on instances built through ``__init__``.
        """
        return self._tokenize(clean_document)

    def terms_unique(self, clean_document):
        """Return the distinct terms of *clean_document* in first-seen order."""
        # dict.fromkeys de-duplicates in linear time and keeps insertion order.
        return list(dict.fromkeys(self._tokenize(clean_document)))

    def freq_term_table(self, terms):
        """Return a dict mapping each item of *terms* to its occurrence count."""
        freq_table = {}
        for term in terms:
            freq_table[term] = freq_table.get(term, 0) + 1
        return freq_table

    def _read_raw_doc(self, doc_name):
        """Return the concatenated text of every page of the PDF *doc_name*."""
        # Imported lazily so the class can be defined (and its text helpers
        # used) on machines where PyMuPDF is not installed.
        import fitz  # PyMuPDF

        # The context manager closes the file handle even if extraction fails.
        with fitz.open(doc_name) as doc:
            return ''.join(page.get_text() for page in doc)

    def _clean_doc(self, raw_doc):
        """Return *raw_doc* lowercased, without accents or punctuation."""
        # Decompose each glyph into its base character plus combining marks...
        normalized_string = unicodedata.normalize('NFKD', raw_doc)
        # ...and keep only the base characters, which drops the accents.
        no_accent_string = ''.join(
            c for c in normalized_string if not unicodedata.combining(c)
        )
        # Collapse every run of non-word characters into a single space.
        no_punctuation_string = re.sub(r'[^\w]+', ' ', no_accent_string)
        # Trim the edges so tokenising never sees leading/trailing separators.
        return no_punctuation_string.strip().lower()

In [None]:
# Build a single Document from one PDF path to try the class out.
doc = Document("documentos/1984.pdf")

In [None]:
# Show the first 100 characters of the cleaned text, then the path, the
# distinct terms and the frequency table, one item per line.
print(doc.content[:100], doc.doc_name, doc.terms, doc.freq_table, sep="\n")

## Generar diccionario de términos de todo el corpus

In [116]:
import os

# Ask for the folder holding the PDFs. Surrounding whitespace is stripped so a
# blank or whitespace-only answer falls back to the default folder.
dir_path = input("Ingresa la ruta a tu carpeta contenedora de archivos pdf:\n").strip() or "documentos/"

# Keep only regular files with a .pdf extension (skips subdirectories and
# stray entries such as hidden system files) and sort them, because
# os.listdir returns entries in arbitrary order.
file_paths = sorted(
    name
    for name in os.listdir(dir_path)
    if name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(dir_path, name))
)
complete_file_paths = [os.path.join(dir_path, name) for name in file_paths]

Ingresa la ruta a tu carpeta contenedora de archivos pdf:
 


In [118]:
from concurrent.futures import ThreadPoolExecutor

# Load every document on a thread pool. Executor.map returns a one-shot
# iterator, so the results are materialised into a list: the corpus is
# iterated several times and indexed further down.
# NOTE(review): the timings recorded in this notebook show no speed-up over
# the serial load, so the pool may not be worth keeping.
with ThreadPoolExecutor() as executor:
    corpus = list(executor.map(Document, complete_file_paths))

Pool : 4.6431920528411865


In [119]:
# `time` has to be imported here: nothing else in the notebook imports it.
import time

# Load every document one after another and report the elapsed wall time.
# perf_counter is the monotonic clock intended for measuring intervals.
start = time.perf_counter()
corpus = [Document(file_path) for file_path in complete_file_paths]
print(f"Serial : {time.perf_counter()-start}")




Serial : 4.131969928741455


In [103]:
# List the loaded documents; printing a Document shows its path (__str__).
for d in corpus:
    print(d)

documentos/1984.pdf
documentos/facturar-datos.pdf


In [120]:
# For each document show a 100-character preview of the cleaned text, its
# path, its distinct terms and its frequency table, one item per line.
for d in corpus:
    print(d.content[:100], d.doc_name, d.terms, d.freq_table, sep="\n")

1984 george orwell parte primera capitulo i era un dia luminoso y frio de abril y los relojes daban 
documentos/1984.pdf
['1984', 'george', 'orwell', 'parte', 'primera', 'capitulo', 'i', 'era', 'un', 'dia', 'luminoso', 'y', 'frio', 'de', 'abril', 'los', 'relojes', 'daban', 'las', 'trece', 'winston', 'smith', 'con', 'la', 'barbilla', 'clavada', 'en', 'el', 'pecho', 'su', 'esfuerzo', 'por', 'burlar', 'molestisimo', 'viento', 'se', 'deslizo', 'rapidamente', 'entre', 'puertas', 'cristal', 'casas', 'victoria', 'aunque', 'no', 'suficiente', 'rapidez', 'para', 'evitar', 'que', 'una', 'rafaga', 'polvorienta', 'colara', 'vestibulo', 'olia', 'a', 'legumbres', 'cocidas', 'esteras', 'viejas', 'al', 'fondo', 'cartel', 'colores', 'demasiado', 'grande', 'hallarse', 'interior', 'estaba', 'pegado', 'pared', 'representaba', 'solo', 'enorme', 'rostro', 'mas', 'metro', 'anchura', 'cara', 'hombre', 'unos', 'cuarenta', 'cinco', 'anos', 'gran', 'bigote', 'negro', 'facciones', 'hermosas', 'endurecidas', 'diri

In [None]:
# Gather the terms of every document into one flat list...
corpus_terms = []
for d in corpus:
    corpus_terms.extend(d.terms)

# ...and reduce it to the set of distinct terms of the whole corpus.
corpus_set = set(corpus_terms)
print(corpus_set)

## Aplicar eliminación de palabras vacías (stopword)

In [69]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK 'stopwords' package so stopwords.words(...) can be used below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [153]:
# Cache of stop-word sets keyed by language, filled on first use.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the NLTK stop words for *language* as a cached frozenset.

    The set holds each stop word both as published and with its accents
    removed, so accent-stripped text ("mas", "tambien") still matches the
    accented entries of the list ("más", "también").
    """
    cached = _STOPWORD_SETS.get(language)
    if cached is None:
        words = stopwords.words(language)
        unaccented = (
            ''.join(
                c for c in unicodedata.normalize('NFKD', w)
                if not unicodedata.combining(c)
            )
            for w in words
        )
        cached = _STOPWORD_SETS[language] = frozenset(words).union(unaccented)
    return cached


def is_stop_word(word):
    """Return True if *word* is a Spanish stop word or has at most two characters."""
    # The cheap length test runs first; the stop-word list is loaded once and
    # looked up as a set instead of being re-read and scanned on every call.
    return len(word) <= 2 or word in _stopword_set('spanish')

In [154]:
# Vocabulary of the corpus after dropping every term is_stop_word() rejects.
corpus_no_stopwords = [ w for w in corpus_set if  not is_stop_word(w) ]

## Aplicar una técnica de stemming para reducir el "Tamaño" de las palabras

In [155]:
# Fetch the NLTK data packages, then import the Snowball stemmer class.
nltk.download('punkt') # tokenizer data
nltk.download('snowball_data') # Snowball stemmer data
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package punkt to /Users/erick-m/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package snowball_data to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!


In [156]:
# Pair every non-stop-word term with its Spanish Snowball stem.
stemmer = SnowballStemmer("spanish")
stems = list(zip(map(stemmer.stem, corpus_no_stopwords), corpus_no_stopwords))

# Inspect suspiciously short stems: those of one or two characters go on
# their own tab-indented line, three-character stems are printed back to back.
for s, t in stems:
    if len(s) <= 2:
        print(f"\t\t\t{(s,t)}")
    elif len(s) < 4:
        print(f"{(s,t)}", end="")

('loc', 'loco')
('pas', 'pasada')
('old', 'old')
('dig', 'digan')
('lav', 'lavar')
('hac', 'hacerlas')
('bes', 'besos')
('cit', 'citarse')
('vid', 'vida')
('cai', 'caia')
('fot', 'foto')
('hac', 'hacemos')
('ali', 'aliada')
('reg', 'regida')
('tic', 'tic')
('sac', 'sacan')
('par', 'parar')
('cuy', 'cuyas')
('sab', 'saberlo')
('pag', 'pagaban')
('dej', 'dejan')
('hel', 'heladas')
('sal', 'salen')
('lom', 'lomo')
('ido', 'ido')
('lat', 'latidos')
('rap', 'rapidos')
('vam', 'vamos')
('bes', 'besarla')
('cre', 'creando')
('caf', 'cafe')
('pas', 'pasados')
('fij', 'fijar')
('ftp', 'ftp')
('cun', 'cunada')
('mov', 'moviera')
('ira', 'ira')
('hac', 'hacerles')
('dud', 'dudabas')
('gir', 'giran')
('pat', 'pato')
('gem', 'gemido')
('dic', 'dices')
('mir', 'mirase')
('sub', 'subes')
('oir', 'oir')
('lev', 'leve')
('hac', 'hacerse')
('gir', 'gira')
('reg', 'regirse')
('sal', 'saliste')
('nin', 'nino')
('dol', 'doler')
('hol', 'hola')
('sup', 'supiera')
('reb', 'rebosante')
('viv', 'vivido')
('nub

## Obtener una matriz binaria de la presencia de los términos en cada documento de todo el corpus

In [127]:
# Binary term-document matrix: one row per document, one column per term of
# corpus_no_stopwords, True where the document contains the term.
matrix = []
for d in corpus:
    # Build the set once per document so each membership test is O(1)
    # instead of a scan over the document's whole term list.
    doc_terms = set(d.terms)
    doc_row = [t in doc_terms for t in corpus_no_stopwords]
    matrix.append(doc_row)

In [171]:
# Stem every term of the first document and confirm that stemming maps the
# term list one-to-one (same length before and after).
doc_stem = list(map(stemmer.stem, corpus[0].terms))
print(
    len(doc_stem) == len(corpus[0].terms),
    len(corpus[0].terms),
    len(doc_stem),
    sep="\n",
)

True
11325
11325


## Diseñar una tabla hash cuya función asocie llave -> valor, es decir, stem -> documentos donde aparece el stem

## Leer la consulta booleana Q

### Aplicar stopword y stemming a la consulta Q

### Aplicar la notación postfijo para el procesamiento de recuperación de la consulta dada

## Presentar los nombres de los documentos obtenidos por Q