# Modelo Booleanno de Recuperación de la Información

## Leer m documentos (corpus)

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
import fitz # open pdf
import re #regex
import unicodedata # use unicode
nltk.download('punkt') # tokenizer data
nltk.download('snowball_data') # Snowball stemmer data
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/erick-m/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package snowball_data to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!


In [17]:
def is_stop_word(word):
    return word in stopwords.words('spanish') or len(word) <= 3

class Document:
    def _content(self):
        raw_doc = self._read_raw_doc(self.doc_name)
        content = self._clean_doc(raw_doc)
        self.content = content

    def _freq(self):
        self.freq_table = self.freq_term_table(self.terms(self.content))
        self.terms = self.terms_unique(self.content)

    def __init__(self, path_to_doc):
        self.doc_name = path_to_doc
        self._content()
        self._freq()

    def __str__(self):
        return f"{self.doc_name}"

    def remove_stop_words(self):
        self.clean_terms =  [ w for w in self.terms if not is_stop_word(w) ]
        return self.clean_terms
        
    def stems(self):
        stemmer = SnowballStemmer("spanish")
        self.stems = [stemmer.stem(t) for t in self.clean_terms]
        return self.stems
        
    def terms(self, clean_document):
        return re.split(r'[^\w]+', clean_document)

    def terms_unique(self, clean_document):
        splited_words = self.terms(clean_document)
        terms = []
        for w in splited_words:
            if w not in terms:
                terms.append(w)
        return terms

    def freq_term_table(self, terms):
        freq_table = {}
        for term in terms:
            if term in freq_table:
                freq_table[term] += 1
            else:
                freq_table[term] = 1
        return freq_table

    def _read_raw_doc(self, doc_name):
        doc = fitz.open(doc_name)
        text = []
        for page in doc:
            text.append(page.get_text())
        return ''.join(text)

    def _clean_doc(self, raw_doc):
        # remove accents
        ## decompose unicode glyphs
        normalized_string = unicodedata.normalize('NFKD',  raw_doc)
        ## if a glyhp is compose, use its base form
        no_accent_string = ''.join([c for c in normalized_string if not unicodedata.combining(c)])
        # remove punctuation marks
        no_punctuation_string = re.sub(r'[^\w]+', ' ', no_accent_string)
        # strip text of document to only get the main content
        return no_punctuation_string.strip().lower()

In [18]:
doc = Document("documentos/1984.pdf")

In [21]:
print(doc.content[:100])
print(doc.doc_name)
print(len(doc.terms))
print(len(doc.freq_table))
doc.remove_stop_words()
print(len(doc.stems))

1984 george orwell parte primera capitulo i era un dia luminoso y frio de abril y los relojes daban 
documentos/1984.pdf
11325
11325
11007


## Generar diccionario de términos de todo el corpus

In [None]:
import os
dir_path = input("Ingresa la ruta a tu carpeta contenedora de archivos pdf:\n") or "documentos/"
file_paths = os.listdir(dir_path)
complete_file_paths = [ os.path.join(dir_path, file) for file in file_paths]

In [None]:
from concurrent.futures import ThreadPoolExecutor
#import time
# start = time.time()
with ThreadPoolExecutor() as executor:
    corpus = executor.map(lambda path: Document(path), complete_file_paths)
# print(f"Pool : {time.time()-start}")

In [None]:
lista_corpus = list(corpus)

In [None]:
print( lista_corpus )

In [None]:
for d in lista_corpus:
    print(d.doc_name)
    print(d.terms[:10])

In [None]:
corpus_terms = []
for d in lista_corpus:
    corpus_terms += d.terms

# print( corpus_terms )
# print( set(corpus_terms) )

## Aplicar eliminación de palabras vacías (stopword)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

In [None]:
def is_stop_word(word):
    # if len(word) == 2:
    #     print(f"word: {word}")
    return word in stopwords.words('spanish') or len(word) <= 3

In [None]:
corpus_no_stopwords = [ w for w in corpus_set if  not is_stop_word(w) ]

In [None]:
## Aplicar una técnica de stemming para reducir el "Tamaño" de las palabras

In [None]:
nltk.download('punkt') # tokenizer data
nltk.download('snowball_data') # Snowball stemmer data
from nltk.stem import SnowballStemmer

In [None]:
stemmer = SnowballStemmer("spanish")
stems = [ (stemmer.stem(t), t) for t in corpus_no_stopwords ]
for s, t in stems:
    if len(s) < 4:
        if len(s) <=2:
            print(f"\t\t\t{(s,t)}")
        else:
            print(f"{(s,t)}", end="")

## Obtener una matriz binaria de la presencia de los términos en cada documento de todo el corpus

In [None]:
matrix = []
for i, d in enumerate(corpus):
    doc_row = []
    for t in corpus_no_stopwords:
        if t in d.terms:
            doc_row.append(True)
        else:
            doc_row.append(False)
    matrix.append(doc_row)

In [None]:
doc_stem = [stemmer.stem(w) for w in corpus[0].terms]
print(len(doc_stem) == len(corpus[0].terms))
print(len(corpus[0].terms))
print(len(doc_stem))

## Diseñar una tabla hash que permita obtener mediante su función. llave -> valor <=> stem -> documentos donde aparece stem

## Leer la consulta booleana Q

### Aplicar stopword y stemming a la consulta Q

### Aplicar la notación postfijo para el procesamiento de recuperación de la consulta dada

## Presentar los nombre de los documentos obtenidos por Q