# Modelo Booleanno de Recuperación de la Información

## Leer m documentos (corpus)

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
import fitz # open pdf
import re #regex
import unicodedata # use unicode
nltk.download('punkt') # tokenizer data
nltk.download('snowball_data') # Snowball stemmer data
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/erick-m/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package snowball_data to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!


In [2]:

class Document:
    def _content(self):
        raw_doc = self._read_raw_doc(self.doc_name)
        content = self._clean_doc(raw_doc)
        self.content = content

    def _freq(self):
        self.freq_table = self.freq_term_table(self.terms(self.content))
        self.terms = self.terms_unique(self.content)

    def __init__(self, path_to_doc):
        self.doc_name = path_to_doc
        self._content()
        self._freq()

    def __str__(self):
        return f"{self.doc_name}"

    def remove_stop_words(self):
        self.clean_terms =  [ w for w in self.terms if not self.is_stop_word(w) ]
        return self.clean_terms
        
    def is_stop_word(self, word):
        return word in stopwords.words('spanish') or len(word) <= 3
        
    def stemming(self):
        self.remove_stop_words()
        stemmer = SnowballStemmer("spanish")
        self.stems = [stemmer.stem(t) for t in self.clean_terms]
        return self.stems
        
    def terms(self, clean_document):
        return re.split(r'[^\w]+', clean_document)

    def terms_unique(self, clean_document):
        splited_words = self.terms(clean_document)
        terms = []
        for w in splited_words:
            if w not in terms:
                terms.append(w)
        return terms

    def freq_term_table(self, terms):
        freq_table = {}
        for term in terms:
            if term in freq_table:
                freq_table[term] += 1
            else:
                freq_table[term] = 1
        return freq_table

    def _read_raw_doc(self, doc_name):
        doc = fitz.open(doc_name)
        text = []
        for page in doc:
            text.append(page.get_text())
        return ''.join(text)

    def _clean_doc(self, raw_doc):
        # remove accents
        ## decompose unicode glyphs
        normalized_string = unicodedata.normalize('NFKD',  raw_doc)
        ## if a glyhp is compose, use its base form
        no_accent_string = ''.join([c for c in normalized_string if not unicodedata.combining(c)])
        # remove punctuation marks
        no_punctuation_string = re.sub(r'[^\w]+', ' ', no_accent_string)
        # strip text of document to only get the main content
        return no_punctuation_string.strip().lower()

In [3]:
doc = Document("documentos/1984.pdf")

In [4]:
print(doc.content[:100])
print(doc.doc_name)
print(len(doc.terms))
print(len(doc.freq_table))
doc.stemming()
print(len(doc.stems))

1984 george orwell parte primera capitulo i era un dia luminoso y frio de abril y los relojes daban 
documentos/1984.pdf
11325
11325
11007


## Generar diccionario de términos de todo el corpus

In [5]:
import os
# dir_path = input("Ingresa la ruta a tu carpeta contenedora de archivos pdf:\n") or "documentos/"
dir_path = "documentos/"
file_paths = os.listdir(dir_path)
complete_file_paths = [ os.path.join(dir_path, file) for file in file_paths]

In [6]:
from concurrent.futures import ThreadPoolExecutor
#import time
# start = time.time()
with ThreadPoolExecutor() as executor:
    corpus = executor.map(lambda path: Document(path), complete_file_paths)
# print(f"Pool : {time.time()-start}")

In [7]:
lista_corpus = list(corpus)

In [8]:
print( lista_corpus )
docs_in_corpus = [ d.doc_name for d in lista_corpus]
print(docs_in_corpus)

[<__main__.Document object at 0x10a667f90>, <__main__.Document object at 0x10a6684d0>]
['documentos/1984.pdf', 'documentos/facturar-datos.pdf']


In [9]:
for d in lista_corpus:
    print(d.doc_name)
    print(d.terms[:10])

documentos/1984.pdf
['1984', 'george', 'orwell', 'parte', 'primera', 'capitulo', 'i', 'era', 'un', 'dia']
documentos/facturar-datos.pdf
['datos', 'necesarios', 'para', 'facturar', 'se', 'tienen', 'que', 'enviar', 'en', 'el']


In [10]:
corpus_terms = []
for d in lista_corpus:
    corpus_terms += d.terms

corpus_set = set(corpus_terms)

In [11]:
print(len(corpus_terms))
print(len(corpus_set))

11488
11394


## Aplicar eliminación de palabras vacías (stopword)

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
def is_stop_word(word):
    return word in stopwords.words('spanish') or len(word) <= 3

In [14]:
corpus_no_stopwords = [ w for w in corpus_set if  not is_stop_word(w) ]

In [15]:
lista_corpus_noStopWords = []
for d in lista_corpus:
    d.remove_stop_words()
    lista_corpus_noStopWords += d.clean_terms

set_corpus_noStopWords = set(lista_corpus_noStopWords)
print(len(lista_corpus_noStopWords))
print(len(set_corpus_noStopWords))

11130
11065


## Aplicar una técnica de stemming para reducir el "Tamaño" de las palabras

In [16]:
nltk.download('punkt') # tokenizer data
nltk.download('snowball_data') # Snowball stemmer data
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package punkt to /Users/erick-m/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package snowball_data to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!


In [17]:
stemmer = SnowballStemmer("spanish")
stems = [ (stemmer.stem(t), t) for t in corpus_no_stopwords ]

In [18]:
# for s, t in stems:
#     if len(s) < 4:
#         if len(s) <=2:
#             print(f"\t\t\t{(s,t)}")
#         else:
#             print(f"{(s,t)}", end="")

In [19]:
for d in lista_corpus:
    d.stemming()

In [20]:
stemmed_corpus = []
for d in lista_corpus:
    stemmed_corpus += d.stems
set_stemmed_corpus = set(stemmed_corpus)
print(len(stemmed_corpus))
print(len(set_stemmed_corpus))

11130
5534


## Obtener una matriz binaria de la presencia de los términos en cada documento de todo el corpus

In [21]:
matrix = [set_corpus_noStopWords]
for d in lista_corpus:
    doc_row = []
    for t in set_corpus_noStopWords:
        if t in d.terms:
            doc_row.append("😃")
        else:
            doc_row.append("👤")
    matrix.append(doc_row)

In [22]:
import csv
def write_csv(path, data):
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file)
        for row in data:
            writer.writerow(row)
write_csv('matrix.csv', matrix)

In [23]:
stem_matrix = [set_stemmed_corpus]
for i, d in enumerate(lista_corpus):
    doc_row = []
    for s in set_stemmed_corpus:
        if s in d.stems:
            doc_row.append(True)
        else:
            doc_row.append(False)
    stem_matrix.append(doc_row)

In [24]:
write_csv('stem_matrix.csv', stem_matrix)

## Diseñar una tabla hash que permita obtener mediante su función. llave -> valor <=> stem -> documentos donde aparece stem

In [25]:
import hashtable
tabla = hashtable.HashTable()

tabla.insert("hola", 2)
def list_docs(stem):
    docs = []
    for d in lista_corpus:
        if stem in d.stems:
            docs.append(d.doc_name)
    return docs
        

In [26]:
for s in set_stemmed_corpus:
    tabla.insert(s, list_docs(s))

In [27]:
import pickle
with open("tabla.hash", "wb") as file:
    pickle.dump(tabla, file)

In [28]:
print(tabla.find("factur"))
print(tabla.find("moj"))
print(tabla.find("archiv"))
print(list_docs)

['documentos/facturar-datos.pdf']
['documentos/1984.pdf']
['documentos/1984.pdf', 'documentos/facturar-datos.pdf']
<function list_docs at 0x10a79b420>


## Leer la consulta booleana Q

In [62]:
import query as query
import importlib
importlib.reload(query)

restart


[nltk_data] Downloading package snowball_data to
[nltk_data]     /Users/erick-m/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!


<module 'query' from '/Users/erick-m/code/repos/RecuperacionInformacion/modelo_boleano/query.py'>

In [63]:
raw_query = "(FACTURAR u MOJAR) n (!(ARCHIVAR))"
raw_query2 = "(FACTURAR u MOJAR) u (!(ARCHIVAR))"
stemmed_query = query.stemmed_query(raw_query)
query.get_query(stemmed_query, tabla, docs_in_corpus)

[]

In [64]:
stemmed_query2 = query.stemmed_query(raw_query2)
query.get_query(stemmed_query2, tabla, docs_in_corpus)

['documentos/facturar-datos.pdf', 'documentos/1984.pdf']

### Aplicar stopword y stemming a la consulta Q

### Aplicar la notación postfijo para el procesamiento de recuperación de la consulta dada

## Presentar los nombre de los documentos obtenidos por Q