<a href="https://colab.research.google.com/github/Elshamysamira/Information-Extraction-and-Retrieval/blob/nasti/Information_Extraction_and_Retrieval_with_SQLite_%26_CLASSES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Version with SQLite and Classes**

In [None]:
!pip install nltk
!pip install chardet
import nltk
import chardet
nltk.download('punkt')  # This downloads necessary datasets for tokenization

from nltk.tokenize import word_tokenize
from collections import defaultdict

from pathlib import Path
import os
import sqlite3



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from pathlib import Path
import os
from google.colab import drive
# Mount Google Drive; the document folder and the index database used by the
# cells below are addressed through paths under /content/drive/.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
class InvertedIndex:
    """SQLite-backed storage for a word -> document-IDs inverted index.

    Opening an instance connects to ``db_path`` and makes sure the
    ``inverted_index`` table exists. Call ``close_connection`` when done.

    Args:
        db_path: Path of the SQLite database file (``':memory:'`` also works).
    """

    def __init__(self, db_path):
        self.db_path = db_path
        self.conn = sqlite3.connect(self.db_path)
        # The cursor executes SQL commands against the open connection.
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        """Create the ``inverted_index`` table if it does not exist yet."""
        # Triple quotes are required here: the statement spans several lines.
        self.cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS inverted_index (
                word TEXT PRIMARY KEY,
                document_ids TEXT
            )
            """
        )
        self.conn.commit()

    def save_index(self, inverted_index):
        """Persist an inverted index to the database.

        Args:
            inverted_index: Mapping of word -> iterable of document IDs.
                The IDs of one word must be mutually comparable (they are
                sorted so that the stored text is deterministic).

        Each word's IDs are stored as a comma-separated string, which is the
        format ``SearchEngine.lookup_word`` parses. A word that is already in
        the table has its row replaced, so saving into an existing database
        (e.g. re-running the notebook) does not fail on the primary key.
        """
        rows = [
            (word, ','.join(str(doc_id) for doc_id in sorted(document_ids)))
            for word, document_ids in inverted_index.items()
        ]
        self.cursor.executemany(
            "INSERT OR REPLACE INTO inverted_index (word, document_ids) VALUES (?, ?)",
            rows,
        )
        self.conn.commit()

    def close_connection(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

In [None]:

class DocumentTokenizer:
    """Tokenize a list of text files with NLTK's ``word_tokenize``.

    Args:
        documents: Sequence of file paths; a document's ID is its position
            in this sequence.
    """

    def __init__(self, documents):
        self.documents = documents

    @staticmethod
    def _read_text(document_path):
        """Return the file's text, decoded as UTF-8 with undecodable bytes dropped."""
        with open(document_path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.read()

    def tokenize(self):
        """Return ``{document ID: token list}`` for every readable document.

        A document that cannot be read or tokenized is reported on stdout and
        left out of the result; its ID is not reused by later documents.
        """
        tokens_by_id = {}
        for documentID, document_path in enumerate(self.documents):
            try:
                tokens_by_id[documentID] = word_tokenize(self._read_text(document_path))
            except Exception as e:
                print(f"Error processing document {document_path}: {e}")
        return tokens_by_id


In [None]:
class InvertedIndexBuilder:
    """Build an in-memory inverted index (word -> set of document IDs).

    Args:
        documents: Sequence of file paths; a document's ID is its position
            in this sequence.
    """

    def __init__(self, documents):
        self.documents = documents
        self.inverted_index = defaultdict(set)

    @staticmethod
    def _read_terms(document_path):
        """Return the file's lowercased, whitespace-separated words in order."""
        with open(document_path, 'r', encoding='utf-8', errors='ignore') as handle:
            # Whitespace splitting only: punctuation stays attached to words.
            return handle.read().lower().split()

    def build_index(self):
        """Add every document's words to ``self.inverted_index``.

        A document that cannot be processed is reported on stdout and skipped;
        its ID is not reused by later documents.
        """
        for documentID, document_path in enumerate(self.documents):
            try:
                for term in self._read_terms(document_path):
                    self.inverted_index[term].add(documentID)
            except Exception as e:
                print(f"Error processing document {document_path}: {e}")

    def get_index(self):
        """Return the index built so far."""
        return self.inverted_index

In [None]:
import re

class SearchEngine:
    """Answer "all words present" queries against the SQLite inverted index.

    Args:
        index_db_path: Path of the SQLite database containing the
            ``inverted_index`` table.
        documents_mapping: Mapping of document ID -> display name, used when
            printing results.
    """

    def __init__(self, index_db_path, documents_mapping):
        self.index_db_path = index_db_path
        self.documents_mapping = documents_mapping

    def lookup_word(self, word):
        """Return the set of document IDs stored for ``word``.

        Returns an empty set when the word is not indexed or has no IDs.
        """
        conn = sqlite3.connect(self.index_db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT document_ids FROM inverted_index WHERE word=?", (word,))
            result = cursor.fetchone()
        finally:
            # Release the connection even if the query fails.
            conn.close()

        if not result or not result[0]:
            return set()
        # Accept both comma- and whitespace-separated postings:
        # InvertedIndex.save_index has written space-separated IDs, while this
        # method historically expected commas.
        return {int(doc_id) for doc_id in result[0].replace(',', ' ').split()}

    def preprocess_query(self, query):
        """Expand "n't" contractions so the query matches plain indexed words."""
        # Irregular contractions first; the generic rule below would turn them
        # into "wo not" / "sha not".
        query = re.sub(r"can't", "cannot", query, flags=re.IGNORECASE)
        query = re.sub(r"won't", "will not", query, flags=re.IGNORECASE)
        query = re.sub(r"shan't", "shall not", query, flags=re.IGNORECASE)
        # Generic rule: "n't" -> " not" (e.g., "didn't" -> "did not")
        query = re.sub(r"n't", " not", query, flags=re.IGNORECASE)
        # Add more replacements as needed for other contractions

        return query

    def search(self, query):
        """Print the documents that contain every token of ``query``.

        The query is contraction-expanded, lowercased and tokenized with
        ``word_tokenize``; the result is the intersection of each token's
        document set. Nothing is returned.
        """
        # Preprocess the query
        query = self.preprocess_query(query)

        # Tokenize the query
        query_tokens = word_tokenize(query.lower())
        print(f"Tokenized Query: {query_tokens}")  # Print tokenized query

        # Look up each token in the inverted index
        document_sets = [self.lookup_word(token) for token in query_tokens]
        common_documents = set.intersection(*document_sets) if document_sets else set()

        if common_documents:
            print(f"Congratulations! The word(s) '{query}' appear together in the following document ID(s): {common_documents}")
            # Sorted so the listing is in a stable, ascending order.
            for doc_id in sorted(common_documents):
                print(f"Document ID: {doc_id}, Document Name: {self.documents_mapping.get(doc_id, 'Unknown')}")
        else:
            print(f"I'm sorry, the word(s) '{query}' do not appear together in any document.")


In [None]:

class DocumentManager:
    """List the document files stored in one folder.

    Args:
        folder_path: Directory that holds the documents.
    """

    def __init__(self, folder_path):
        self.folder_path = folder_path

    def get_document_paths(self):
        """Return the paths of the regular files in the folder, sorted by name.

        Sorting matters because document IDs are positions in this list and
        ``os.listdir`` returns entries in arbitrary order; without it the IDs
        stored in a persisted index could point at different files on the
        next run. Sub-directories are skipped because they cannot be read as
        documents.
        """
        paths = []
        for entry in sorted(os.listdir(self.folder_path)):
            full_path = os.path.join(self.folder_path, entry)
            if os.path.isfile(full_path):
                paths.append(full_path)
        return paths

In [None]:
folder_path = '/content/drive/My Drive/Documents'
index_db_path = '/content/drive/My Drive/Documents/inverted_index.db'

# Initialize DocumentManager
doc_manager = DocumentManager(folder_path)

# Document IDs are positions in this list, so it must contain only real
# documents and be in a stable order:
#   * the index database lives inside the documents folder and would otherwise
#     be listed (and indexed) as a document once it exists;
#   * sub-directories such as the tokenized-output folder cannot be read;
#   * sorting keeps the IDs identical from one run to the next.
document_paths = sorted(
    path
    for path in doc_manager.get_document_paths()
    if os.path.isfile(path)
    and os.path.abspath(path) != os.path.abspath(index_db_path)
)

In [None]:
# Tokenize documents
# Produces a dict mapping each document's position in document_paths to its
# list of NLTK tokens; files that fail to load are reported and left out.
doc_tokenizer = DocumentTokenizer(document_paths)
tokenized_docs = doc_tokenizer.tokenize()

In [None]:
# Save tokenized documents
output_directory = '/content/drive/My Drive/Documents/tokenized'
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(output_directory, exist_ok=True)

# One output file per document, holding its tokens joined by single spaces.
for doc_id, tokens in tokenized_docs.items():
    output_file_path = os.path.join(output_directory, f"tokenized_document_{doc_id}.txt")
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(' '.join(tokens))

In [None]:
# Build inverted index
# The builder reads each file itself and records, for every lowercased
# whitespace-separated word, the set of document IDs that contain it.
index_builder = InvertedIndexBuilder(document_paths)
index_builder.build_index()
inverted_index = index_builder.get_index()

In [None]:
# Save inverted index to SQLite
index_db = InvertedIndex(index_db_path)
try:
    index_db.save_index(inverted_index)
finally:
    # Always release the database connection, even when saving fails.
    index_db.close_connection()

In [None]:
# Document mapping
# Each document's ID (its position in document_paths) -> its bare file name.
document_mapping = dict(enumerate(Path(path).name for path in document_paths))

In [None]:
# Initialize SearchEngine
# The engine queries the database at index_db_path on every lookup and uses
# document_mapping to print a file name next to each matching document ID.
search_engine = SearchEngine(index_db_path, document_mapping)

In [None]:
# Search
# Example queries covering mixed case, multiple words, contractions,
# hyphenated terms and abbreviations.
demo_queries = [
    'HateD',
    'HateD Applied',
    "Can't",
    "didn't",
    "state-of-the-art",
    "Elliott-Fisher",
    "Mr.",
    "Mr",
]

for query_position, demo_query in enumerate(demo_queries):
    # Blank lines separate consecutive results (none before the first query).
    if query_position:
        print('\n')
    search_engine.search(demo_query)


Tokenized Query: ['hated']
Congratulations! The word(s) 'HateD' appear together in the following document ID(s): {1, 4, 13}
Document ID: 1, Document Name: Dumbells of Business by Louis Custer Martin Reed.txt
Document ID: 4, Document Name: Confessions of a Tradesman by Frank Thomas Bullen.txt
Document ID: 13, Document Name: Fifty years in Wall Street by Henry Clews.txt


Tokenized Query: ['hated', 'applied']
Congratulations! The word(s) 'HateD Applied' appear together in the following document ID(s): {4, 13}
Document ID: 4, Document Name: Confessions of a Tradesman by Frank Thomas Bullen.txt
Document ID: 13, Document Name: Fifty years in Wall Street by Henry Clews.txt


Tokenized Query: ['can', 'not']
Congratulations! The word(s) 'cannot' appear together in the following document ID(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Document ID: 0, Document Name: A thousand ways to make money.txt
Document ID: 1, Document Name: Dumbells of Business by Louis Custer Mar

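By default, NLTK's word_tokenize function splits contractions like "Can't" into two separate tokens: "ca" and "n't". Neither token is likely to match an indexed word, so preprocess_query first rewrites "can't" as "cannot" (and other "n't" endings as " not"). word_tokenize then splits "cannot" into "can" and "not", which is why the tokenized query above is ['can', 'not'].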