# Boolean Search in Documents
## Objective
Expand the simple term search functionality to include Boolean search capabilities. This will allow users to perform more complex queries by combining multiple search terms using Boolean operators.

## Problem Description
You must enhance the existing search engine from the previous exercise to support Boolean operators: AND, OR, and NOT. This will enable the retrieval of documents based on the logical relationships between multiple terms.

## Requirements
### Step 1: Update Data Preparation
Ensure that the documents are still loaded and preprocessed from the previous task. The data should be clean and ready for advanced querying.

### Step 2: Create an Inverted Index
Create an inverted index from the documents. This index maps each word to the set of document IDs in which that word appears. This facilitates word lookup in the search process.

In [1]:
import os
import re
from collections import defaultdict

def tokenize(text):
    """Return the lowercase word tokens found in ``text``.

    The text is lowercased and every maximal run of word characters
    (letters, digits, underscore) becomes one token, so punctuation and
    other special characters are dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

def create_inverted_index(directory):
    """Build an inverted index from the ``.txt`` files in ``directory``.

    Args:
        directory: Path of the folder to scan. Only entries whose name ends
            with ``.txt`` are read; the folder is not searched recursively.

    Returns:
        A ``defaultdict(set)`` mapping each token (as produced by
        ``tokenize``) to the set of document IDs containing it. A document
        ID is the file name without its extension.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    inverted_index = defaultdict(set)  # sets avoid duplicate document IDs

    for filename in os.listdir(directory):
        if not filename.endswith('.txt'):
            continue

        document_id = os.path.splitext(filename)[0]
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            text = file.read()

        # De-duplicate first: each distinct word needs only one index update
        # per document.
        for token in set(tokenize(text)):
            inverted_index[token].add(document_id)

    return inverted_index

# Folder that holds the plain-text documents to index
directory = '/content/drive/MyDrive/Datas/Data'

# Build the inverted index for the whole folder
inverted_index = create_inverted_index(directory)

# Sample lookup showing how to query the inverted index
word = 'example'
if word not in inverted_index:
    print(f"The word '{word}' does not appear in any documents.")
else:
    print(f"The word '{word}' appears in the following documents:", inverted_index[word])


The word 'example' appears in the following documents: {'pg12582', 'pg345', 'pg26073', 'pg6593', 'pg10907', 'pg2852', 'pg27827', 'pg3207', 'pg21012', 'pg46', 'pg996', 'pg2600', 'pg8800', 'pg98', 'pg844', 'pg1259', 'pg61419', 'pg28054', 'pg59468', 'pg67979', 'pg41445', 'pg43', 'pg50038', 'pg30254', 'pg768', 'pg48191', 'pg2160', 'pg120', 'pg5200', 'pg44388', 'pg10676', 'pg62119', 'pg1998', 'pg100', 'pg16', 'pg47312', 'pg44837', 'pg394', 'pg45540', 'pg59469', 'pg76', 'pg25344', 'pg35899', 'pg84', 'pg408', 'pg7370', 'pg4085', 'pg514', 'pg42933', 'pg73448', 'pg1342', 'pg45848', 'pg2554', 'pg2641', 'pg4300', 'pg18893', 'pg29728', 'pg73447', 'pg600', 'pg45', 'pg2701', 'pg6130', 'pg205', 'pg1400', 'pg2814', 'pg1260', 'pg37106', 'pg219', 'pg41070', 'pg16389', 'pg5197', 'pg6761', 'pg145', 'pg1661', 'pg52882', 'pg21700', 'pg1232', 'pg1184'}


### Step 3: Implementing Boolean Search
* Enhance Input Query: Modify the function to accept complex queries that can include the Boolean operators AND, OR, and NOT.

* Implement Boolean Logic:

  * AND: The document must contain all the terms. For example, `python AND programming` should return documents containing both "python" and "programming".

  * OR: The document can contain any of the terms. For example, `python OR programming` should return documents containing either "python", "programming", or both.

  * NOT: The document must not contain the term following NOT. For example, `python NOT snake` should return documents that contain "python" but not "snake".

In [11]:
def boolean_search(query, inverted_index):
    """Evaluate a Boolean query against an inverted index.

    The query is lowercased, split on whitespace and evaluated strictly
    left to right (no operator precedence, no parentheses):

    * ``a AND b``  -> documents containing both terms.
    * ``a OR b``   -> documents containing either term.
    * ``a NOT b``  -> documents containing ``a`` but not ``b``
      (a bare NOT acts as AND NOT).
    * ``a OR NOT b`` / leading ``NOT b`` -> NOT is taken relative to every
      document ID present in the index.
    * Adjacent terms with no operator between them are combined with OR.
    * Operators with no following term are ignored.

    Args:
        query: Query string, e.g. ``"python AND programming NOT snake"``.
        inverted_index: Mapping from term to the set of document IDs that
            contain it. It is never modified.

    Returns:
        A new set with the IDs of the matching documents; empty when the
        query contains no terms.
    """
    result_docs = None   # stays None until the first term has been applied
    pending_op = None    # 'and' / 'or' waiting for its right-hand term
    negate = False       # True when the next term is preceded by NOT
    all_docs = None      # every document ID in the index, built on demand

    def complement(docs):
        # Documents that are in the index but not in ``docs``.
        nonlocal all_docs
        if all_docs is None:
            all_docs = set().union(*inverted_index.values())
        return all_docs - docs

    for term in query.lower().split():
        if term in ('and', 'or'):
            pending_op = term
            continue
        if term == 'not':
            # "a NOT b" means "a AND NOT b"; an explicit OR is kept as is.
            if pending_op is None:
                pending_op = 'and'
            negate = not negate
            continue

        # .get() avoids inserting missing terms into a defaultdict index.
        term_docs = inverted_index.get(term, set())

        if result_docs is None:
            # First operand: copy so the index's own set is never aliased.
            result_docs = complement(term_docs) if negate else set(term_docs)
        elif pending_op == 'and':
            if negate:
                result_docs -= term_docs
            else:
                result_docs &= term_docs
        else:
            # Explicit OR, or two adjacent terms without an operator.
            result_docs |= complement(term_docs) if negate else term_docs

        pending_op = None
        negate = False

    return result_docs if result_docs is not None else set()

# Usage example: run one Boolean query and show the matching document IDs
query = "her OR him"
matching_docs = boolean_search(query=query, inverted_index=inverted_index)
print("Documentos que coinciden con la consulta:", matching_docs)


Documentos que coinciden con la consulta: {'pg12582', 'pg345', 'pg41287', 'pg26073', 'pg6593', 'pg10907', 'pg2852', 'pg27827', 'pg3207', 'pg21012', 'pg46', 'pg996', 'pg2600', 'pg55', 'pg8800', 'pg98', 'pg844', 'pg64317', 'pg1727', 'pg1259', 'pg61419', 'pg28054', 'pg1952', 'pg59468', 'pg74', 'pg67979', 'pg174', 'pg41445', 'pg43', 'pg50038', 'pg30254', 'pg768', 'pg48191', 'pg2160', 'pg120', 'pg244', 'pg5200', 'pg44388', 'pg10676', 'pg62119', 'pg1080', 'pg1998', 'pg100', 'pg16', 'pg47312', 'pg44837', 'pg1513', 'pg394', 'pg45540', 'pg59469', 'pg76', 'pg25344', 'pg2591', 'pg35899', 'pg84', 'pg408', 'pg7370', 'pg4085', 'pg514', 'pg2542', 'pg42933', 'pg73448', 'pg1342', 'pg45848', 'pg2554', 'pg2641', 'pg4300', 'pg18893', 'pg29728', 'pg73447', 'pg600', 'pg45', 'pg2701', 'pg6130', 'pg73442', 'pg205', 'pg1400', 'pg2814', 'pg1260', 'pg37106', 'pg219', 'pg11', 'pg41070', 'pg16389', 'pg67098', 'pg5197', 'pg6761', 'pg145', 'pg1661', 'pg52281', 'pg52882', 'pg21700', 'pg1232', 'pg1184', 'pg73444'}
