# Modelo Booleano de Recuperación de la Información

## Leer m documentos (corpus)

In [None]:
# import fitz # open pdf
import re #regex
import unicodedata # use unicode

class Document:
    """A PDF document reduced to normalized text, unique terms and term counts.

    After construction the instance exposes:
      * ``doc_name``   -- the path given to the constructor.
      * ``content``    -- the cleaned text (lowercase, accents and punctuation removed).
      * ``terms``      -- list of unique terms in order of first appearance.
      * ``freq_table`` -- dict mapping each term to its number of occurrences.
    """

    def __init__(self, path_to_doc: str):
        """Read the PDF at ``path_to_doc`` and build its content and term tables."""
        self.doc_name = path_to_doc
        self._content()
        self._freq()

    def __str__(self) -> str:
        return f"{self.doc_name}"

    def _content(self):
        """Load the raw PDF text, clean it and store it in ``self.content``."""
        raw_doc = self._read_raw_doc(self.doc_name)
        self.content = self._clean_doc(raw_doc)

    def _freq(self):
        """Populate ``self.freq_table`` and ``self.terms`` from ``self.content``."""
        # Tokenize once and reuse the tokens for both tables.
        tokens = self._tokenize(self.content)
        self.freq_table = self.freq_term_table(tokens)
        # NOTE: this instance attribute shadows the ``terms`` method on the
        # instance, which is why the class never calls ``self.terms(...)``.
        self.terms = list(dict.fromkeys(tokens))

    def _tokenize(self, clean_document: str) -> list:
        """Split ``clean_document`` on runs of non-word characters.

        Empty tokens (produced by empty text or by leading/trailing
        separators) are dropped so they never become terms.
        """
        return [token for token in re.split(r'[^\w]+', clean_document) if token]

    def terms(self, clean_document: str) -> list:
        """Return every token of ``clean_document`` (duplicates included).

        Kept for backward compatibility; on constructed instances the name
        ``terms`` refers to the list of unique terms instead.
        """
        return self._tokenize(clean_document)

    def terms_unique(self, clean_document: str) -> list:
        """Return the distinct tokens of ``clean_document`` in first-seen order."""
        # dict.fromkeys deduplicates while preserving insertion order.
        return list(dict.fromkeys(self._tokenize(clean_document)))

    def freq_term_table(self, terms) -> dict:
        """Return a dict mapping each item of ``terms`` to its occurrence count."""
        freq_table = {}
        for term in terms:
            freq_table[term] = freq_table.get(term, 0) + 1
        return freq_table

    def _read_raw_doc(self, doc_name: str) -> str:
        """Return the concatenated text of every page of the PDF ``doc_name``."""
        # Imported here because the module-level import is commented out;
        # without it ``fitz`` would be an undefined name.
        import fitz  # PyMuPDF

        # The context manager closes the file handle once the text is read.
        with fitz.open(doc_name) as doc:
            return ''.join(page.get_text() for page in doc)

    def _clean_doc(self, raw_doc: str) -> str:
        """Normalize ``raw_doc``: strip accents and punctuation, trim, lowercase."""
        # Decompose each glyph into its base character plus combining marks...
        normalized_string = unicodedata.normalize('NFKD', raw_doc)
        # ...then drop the combining marks so only the base form remains.
        no_accent_string = ''.join(
            c for c in normalized_string if not unicodedata.combining(c)
        )
        # Collapse every run of non-word characters into a single space.
        no_punctuation_string = re.sub(r'[^\w]+', ' ', no_accent_string)
        return no_punctuation_string.strip().lower()

In [None]:
# Build a single Document from a sample PDF to try the class out.
doc = Document("documentos/1984.pdf")

In [None]:
# Show the first 100 characters of the cleaned text, then the path,
# the unique terms and the frequency table, one item per line.
print(doc.content[:100], doc.doc_name, doc.terms, doc.freq_table, sep="\n")

## Generar diccionario de términos de todo el corpus

In [81]:
import os
# Ask for the corpus folder; an empty answer falls back to "documentos/".
dir_path = input("Ingresa la ruta a tu carpeta contenedora de archivos pdf:\n") or "documentos/"
# os.listdir returns entries in arbitrary order and includes everything in the
# folder (hidden files, sub-folders, other formats). Keep only regular *.pdf
# files and sort them so the corpus order is reproducible between runs.
files = sorted(
    name
    for name in os.listdir(dir_path)
    if name.lower().endswith(".pdf") and os.path.isfile(os.path.join(dir_path, name))
)
corpus = []
for file in files:
    path_to_doc = os.path.join(dir_path, file)
    print(path_to_doc)
    corpus.append(Document(path_to_doc))

Ingresa la ruta a tu carpeta contenedora de archivos pdf:
 


documentos/1984.pdf
documentos/facturar-datos.pdf


In [None]:
# Dump, for every document in the corpus, a 100-character preview of its
# cleaned text followed by its path, unique terms and frequency table.
for d in corpus:
    print(d.content[:100], d.doc_name, d.terms, d.freq_table, sep="\n")

In [None]:
# Concatenate the per-document term lists into one flat list...
corpus_terms = []
for d in corpus:
    corpus_terms.extend(d.terms)

# ...and collapse it into the corpus vocabulary (a set, so unordered).
corpus_set = set(corpus_terms)
print(corpus_set)

## Aplicar eliminación de palabras vacías (stopword)

In [69]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK stop-word lists needed by stopwords.words(...) below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [83]:
import unicodedata


def _strip_accents(word):
    """Return ``word`` with combining accent marks removed (NFKD based)."""
    decomposed = unicodedata.normalize('NFKD', word)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))


# Corpus terms had their accents stripped when the documents were cleaned, so
# the stop-word list must be normalized the same way; otherwise accented stop
# words such as "más" or "también" never match their corpus form.
# The list is also loaded once into a set instead of once per term.
spanish_stopwords = {_strip_accents(w) for w in stopwords.words('spanish')}

# Drop stop words and the empty token, which is not a real term.
corpus_no_stopwords = [w for w in corpus_set if w and w not in spanish_stopwords]

## Aplicar una técnica de stemming para reducir el "Tamaño" de las palabras

In [76]:
# Download the NLTK resources for this section.
# NOTE(review): SnowballStemmer may not need either download to stem words -- confirm.
nltk.download('punkt') # tokenizer data
nltk.download('snowball_data') # Snowball stemmer data
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package punkt to /Users/erick-m/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package snowball_data to
[nltk_data]     /Users/erick-m/nltk_data...


In [84]:
# Reduce every remaining corpus term to its Spanish Snowball stem.
# The result is positionally aligned with corpus_no_stopwords.
stemmer = SnowballStemmer("spanish")
stems = list(map(stemmer.stem, corpus_no_stopwords))

## Obtener una matriz binaria de la presencia de los términos en cada documento de todo el corpus

In [86]:
# Binary term-presence matrix: one row per document (in corpus order) and one
# column per entry of corpus_no_stopwords; a cell is True when the document
# contains that term.
# NOTE(review): columns are the unstemmed terms; `stems` is not used here -- confirm intent.
matrix = []
for d in corpus:
    # A set makes each membership test O(1) instead of scanning the term list.
    doc_terms = set(d.terms)
    matrix.append([t in doc_terms for t in corpus_no_stopwords])

## Diseñar una tabla hash cuya función asocie llave -> valor, es decir, stem -> documentos donde aparece el stem

## Leer la consulta booleana Q

### Aplicar stopword y stemming a la consulta Q

### Aplicar la notación postfijo para el procesamiento de recuperación de la consulta dada

## Presentar los nombres de los documentos obtenidos por Q