<a href="https://colab.research.google.com/github/Elshamysamira/Information-Extraction-and-Retrieval/blob/nasti/Information_Extraction_and_Retrieval_with_SQLite_%26_CLASSES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Version with SQLite and Classes**

In [1]:
!pip install nltk
!pip install chardet
import nltk
import chardet
nltk.download('punkt')  # This downloads necessary datasets for tokenization

from nltk.tokenize import word_tokenize
from collections import defaultdict

import pandas as pd

from pathlib import Path
import os
import sqlite3



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from pathlib import Path
import os
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
class InvertedIndex:
    def __init__(self, db_path):
        self.db_path = db_path
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS inverted_index (
                                word TEXT PRIMARY KEY,
                                document_ids TEXT
                            )''')

    def save_index(self, inverted_index):
        for word, document_ids in inverted_index.items():
            doc_ids_str = ','.join(str(doc_id) for doc_id in document_ids)
            self.cursor.execute("INSERT INTO inverted_index (word, document_ids) VALUES (?, ?)", (word, doc_ids_str))
        self.conn.commit()

    def close_connection(self):
        self.conn.close()

In [4]:

class DocumentTokenizer:
    def __init__(self, documents):
        self.documents = documents

    def tokenize(self):
        tokenized_docs = {}
        for documentID, document_path in enumerate(self.documents):
            try:
                with open(document_path, 'r', encoding='utf-8', errors='ignore') as file:
                    document_content = file.read()

                tokens = word_tokenize(document_content)
                tokenized_docs[documentID] = tokens

            except Exception as e:
                print(f"Error processing document {document_path}: {e}")

        return tokenized_docs


In [5]:
class InvertedIndexBuilder:
    def __init__(self, documents):
        self.documents = documents
        self.inverted_index = defaultdict(set)

    def build_index(self):
        for documentID, document_path in enumerate(self.documents):
            try:
                with open(document_path, 'r', encoding='utf-8', errors='ignore') as file:
                    document_content = file.read()

                for word in document_content.lower().split():
                    self.inverted_index[word].add(documentID)

            except Exception as e:
                print(f"Error processing document {document_path}: {e}")

    def get_index(self):
        return self.inverted_index

In [6]:
import re

class SearchEngine:
    def __init__(self, index_db_path, documents_mapping):
        self.index_db_path = index_db_path
        self.documents_mapping = documents_mapping

    def lookup_word(self, word):
        conn = sqlite3.connect(self.index_db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT document_ids FROM inverted_index WHERE word=?", (word,))
        result = cursor.fetchone()
        conn.close()

        if result:
            document_ids = set(map(int, result[0].split(',')))
            return document_ids
        else:
            return set()

    def preprocess_query(self, query):
        # Replace common contractions with their full forms
        query = re.sub(r"can't", "cannot", query, flags=re.IGNORECASE)
        query = re.sub(r"n't", " not", query, flags=re.IGNORECASE)  # Replace "n't" with " not" (e.g., "can't" -> "cannot")
        # Add more replacements as needed for other contractions

        return query

    def search(self, query):
        # Preprocess the query
        query = self.preprocess_query(query)

        # Tokenize the query
        query_tokens = word_tokenize(query.lower())
        print(f"Tokenized Query: {query_tokens}")  # Print tokenized query

        # Look up each token in the inverted index
        document_sets = [self.lookup_word(token) for token in query_tokens]
        common_documents = set.intersection(*document_sets) if document_sets else set()

        if common_documents:
            print(f"Congratulations! The word(s) '{query}' appear together in the following document ID(s): {common_documents}")
            for doc_id in common_documents:
                print(f"Document ID: {doc_id}, Document Name: {self.documents_mapping.get(doc_id, 'Unknown')}")
        else:
            print(f"I'm sorry, the word(s) '{query}' do not appear together in any document.")


In [7]:

class DocumentManager:
    def __init__(self, books_path):
        self.books_path = books_path

    def get_doc_paths(self):
        return [os.path.join(self.books_path, file) for file in os.listdir(self.books_path)]

In [8]:
books_path = '/content/drive/My Drive/Documents'
index_db_path = '/content/drive/My Drive/Documents/inverted_index.db'

# Initialize DocumentManager
doc_manager = DocumentManager(books_path)     # returns a list of paths to all the docs within the dir
doc_paths = doc_manager.get_doc_paths()

In [9]:
# Tokenize documents
doc_tokenizer = DocumentTokenizer(doc_paths)  # initialize the DocumentTokenizer object, which is responsible for tokenizing the documents
tokenized_docs = doc_tokenizer.tokenize()     # tokenizes each document and returns a dictionary where the keys are document IDs and the values are lists of tokens

In [10]:
# Save tokenized documents
output_dir = '/content/drive/My Drive/Documents/tokenized'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


for doc_id, tokens in tokenized_docs.items():
    output_file_path = os.path.join(output_dir, f"tokenized_document_{doc_id}.txt")
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(' '.join(tokens))

In [11]:
# Build inverted index
index_builder = InvertedIndexBuilder(doc_paths)
index_builder.build_index()
inverted_index = index_builder.get_index()

In [12]:
# Save inverted index to SQLite
index_db = InvertedIndex(index_db_path)
index_db.save_index(inverted_index)
index_db.close_connection()

In [13]:
# Document mapping
document_mapping = {documentID: Path(document_path).name for documentID, document_path in enumerate(doc_paths)}

In [14]:
# Initialize SearchEngine
search_engine = SearchEngine(index_db_path, document_mapping)

In [15]:
# Search
search_engine.search('HateD')
print('\n')
search_engine.search('HateD Applied')
print('\n')
search_engine.search("Can't")
print('\n')
search_engine.search("didn't")
print('\n')
search_engine.search("state-of-the-art")
print('\n')
search_engine.search("Elliott-Fisher")
print('\n')
search_engine.search("Mr.")
print('\n')
search_engine.search("Mr")


Tokenized Query: ['hated']
Congratulations! The word(s) 'HateD' appear together in the following document ID(s): {1, 4, 13}
Document ID: 1, Document Name: Dumbells of Business by Louis Custer Martin Reed.txt
Document ID: 4, Document Name: Confessions of a Tradesman by Frank Thomas Bullen.txt
Document ID: 13, Document Name: Fifty years in Wall Street by Henry Clews.txt


Tokenized Query: ['hated', 'applied']
Congratulations! The word(s) 'HateD Applied' appear together in the following document ID(s): {4, 13}
Document ID: 4, Document Name: Confessions of a Tradesman by Frank Thomas Bullen.txt
Document ID: 13, Document Name: Fifty years in Wall Street by Henry Clews.txt


Tokenized Query: ['can', 'not']
Congratulations! The word(s) 'cannot' appear together in the following document ID(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Document ID: 0, Document Name: A thousand ways to make money.txt
Document ID: 1, Document Name: Dumbells of Business by Louis Custer Mar

Otherwise by default, NLTK's word_tokenize function splits contractions like "Can't" into two separate tokens: "ca" and "n't".

# Now let's check what is inside our db:

In [28]:
connect_db = sqlite3.connect(index_db_path)

tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", connect_db)

for table in tables['name']:
    print(f"Table Name: {table}")

    query = "SELECT * FROM inverted_index"
    df = pd.read_sql_query(query, connect_db)

    df = pd.read_sql(f"SELECT * FROM {table}", connect_db)
    display(df)
    print('\n')

connect_db.close()


Table Name: inverted_index


Unnamed: 0,word,document_ids
0,the,0123456789101112131415161718
1,project,0123456789101112131415161718
2,gutenberg,0123456789101112131415161718
3,ebook,0123456789101112131415161718
4,of,0123456789101112131415161718
...,...,...
78520,bread-stuffs,18
78521,proportiona,18
78522,at|,18
78523,comma,18




