<a href="https://colab.research.google.com/github/Elshamysamira/Information-Extraction-and-Retrieval/blob/sami/Version_with_defined_window_for_result.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary packages
!pip install datasets
!pip install faiss-cpu

import nltk
import numpy as np
import chardet
import torch
from datasets import Dataset
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd
from pathlib import Path
import os
import sqlite3
import re
from google.colab import drive

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

In [2]:
nltk.download('punkt')

drive.mount('/content/drive/')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive/


In [12]:
class InvertedIndex:
    def __init__(self, db_path):
        self.db_path = db_path
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS inverted_index (
                                word TEXT PRIMARY KEY,
                                document_ids TEXT
                            )''')

    def save_index(self, inverted_index):
        for word, document_ids in inverted_index.items():
            doc_ids_str = ','.join(str(doc_id) for doc_id in document_ids)
            self.cursor.execute("INSERT OR IGNORE INTO inverted_index (word, document_ids) VALUES (?, ?)", (word, doc_ids_str))
            self.cursor.execute("UPDATE inverted_index SET document_ids = ? WHERE word = ?", (doc_ids_str, word))
        self.conn.commit()

    def close_connection(self):
        self.conn.close()

In [13]:
class DocumentTokenizer:
    def __init__(self, documents):
        self.documents = documents

    def tokenize(self):
        tokenized_docs = {}
        for documentID, document_path in enumerate(self.documents):
            try:
                with open(document_path, 'r', encoding='utf-8', errors='ignore') as file:
                    document_content = file.read()
                tokens = word_tokenize(document_content)
                tokenized_docs[documentID] = tokens
            except Exception as e:
                print(f"Error processing document {document_path}: {e}")
        return tokenized_docs

In [14]:
class InvertedIndexBuilder:
    def __init__(self, documents):
        self.documents = documents
        self.inverted_index = defaultdict(set)

    def build_index(self):
        for documentID, document_path in enumerate(self.documents):
            try:
                with open(document_path, 'r', encoding='utf-8', errors='ignore') as file:
                    document_content = file.read()
                for word in document_content.lower().split():
                    self.inverted_index[word].add(documentID)
            except Exception as e:
                print(f"Error processing document {document_path}: {e}")

    def get_index(self):
        return self.inverted_index

In [15]:
class SearchEngine:
    def __init__(self, index_db_path, documents_mapping):
        self.index_db_path = index_db_path
        self.documents_mapping = documents_mapping

    def lookup_word(self, word):
        conn = sqlite3.connect(self.index_db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT document_ids FROM inverted_index WHERE word=?", (word,))
        result = cursor.fetchone()
        conn.close()

        if result:
            document_ids = set(map(int, result[0].split(',')))
            return document_ids
        else:
            return set()

    def search(self, query):
        query_tokens = word_tokenize(query.lower())
        print(f"Tokenized Query: {query_tokens}")
        document_sets = [self.lookup_word(token) for token in query_tokens]
        common_documents = set.intersection(*document_sets) if document_sets else set()

        if common_documents:
            print(f"Congratulations! The word(s) '{query}' appear together in the following document ID(s): {common_documents}")
            for doc_id in common_documents:
                print(f"Document ID: {doc_id}, Document Name: {self.documents_mapping.get(doc_id, 'Unknown')}")
        else:
            print(f"I'm sorry, the word(s) '{query}' do not appear together in any document.")


In [16]:
class DocumentManager:
    def __init__(self, books_path):
        self.books_path = books_path

    def get_doc_paths(self):
        return [os.path.join(self.books_path, file) for file in os.listdir(self.books_path) if os.path.isfile(os.path.join(self.books_path, file))]

In [17]:
books_path = '/content/drive/My Drive/Documents'
index_db_path = '/content/drive/My Drive/Documents/inverted_index.db'

# Initialize DocumentManager
doc_manager = DocumentManager(books_path)
doc_paths = doc_manager.get_doc_paths()

# Tokenize documents
doc_tokenizer = DocumentTokenizer(doc_paths)
tokenized_docs = doc_tokenizer.tokenize()

# Save tokenized documents
output_dir = '/content/drive/My Drive/Documents/tokenized'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for doc_id, tokens in tokenized_docs.items():
    output_file_path = os.path.join(output_dir, f"tokenized_document_{doc_id}.txt")
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(' '.join(tokens))



In [18]:
# Build inverted index
index_builder = InvertedIndexBuilder(doc_paths)
index_builder.build_index()
inverted_index = index_builder.get_index()
# Save inverted index to SQLite
index_db = InvertedIndex(index_db_path)
index_db.save_index(inverted_index)
index_db.close_connection()

# Document mapping
document_mapping = {documentID: Path(document_path).name for documentID, document_path in enumerate(doc_paths)}

# Initialize SearchEngine
search_engine = SearchEngine(index_db_path, document_mapping)


In [19]:
search_engine.search('HateD')
print('\n')
search_engine.search('HateD Applied')
print('\n')
search_engine.search("Can't")
print('\n')
search_engine.search("didn't")
print('\n')
search_engine.search("state-of-the-art")
print('\n')
search_engine.search("Elliott-Fisher")
print('\n')
search_engine.search("Mr.")
print('\n')
search_engine.search("Mr")

Tokenized Query: ['hated']
Congratulations! The word(s) 'HateD' appear together in the following document ID(s): {18, 37, 5, 24, 10, 29}
Document ID: 18, Document Name: Fifty years in Wall Street by Henry Clews.txt
Document ID: 37, Document Name: tokenized_document_18.txt
Document ID: 5, Document Name: Dumbells of Business by Louis Custer Martin Reed.txt
Document ID: 24, Document Name: tokenized_document_5.txt
Document ID: 10, Document Name: Confessions of a Tradesman by Frank Thomas Bullen.txt
Document ID: 29, Document Name: tokenized_document_10.txt


Tokenized Query: ['hated', 'applied']
Congratulations! The word(s) 'HateD Applied' appear together in the following document ID(s): {29, 10, 18, 37}
Document ID: 29, Document Name: tokenized_document_10.txt
Document ID: 10, Document Name: Confessions of a Tradesman by Frank Thomas Bullen.txt
Document ID: 18, Document Name: Fifty years in Wall Street by Henry Clews.txt
Document ID: 37, Document Name: tokenized_document_18.txt


Tokenized

In [20]:
connect_db = sqlite3.connect(index_db_path)
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", connect_db)

for table in tables['name']:
    print(f"Table Name: {table}")
    query = "SELECT * FROM inverted_index"
    df = pd.read_sql_query(query, connect_db)
    df = pd.read_sql(f"SELECT * FROM {table}", connect_db)
    display(df)
    print('\n')

connect_db.close()

Table Name: inverted_index


Unnamed: 0,word,document_ids
0,the,"0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18..."
1,project,"0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18..."
2,gutenberg,"0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18..."
3,ebook,"0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18..."
4,of,"0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18..."
...,...,...
241531,well!�,39
241532,well,39
241533,welfare.�,39
241534,"welfare,",39






In [None]:
# Semantic Similarity using DPR
torch.set_grad_enabled(False)

# Load and preprocess documents into paragraphs
def split_into_paragraphs(text, max_words=100):
    paragraphs = []
    sentences = nltk.sent_tokenize(text)
    current_paragraph = []
    current_length = 0

    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        if current_length + len(words) > max_words:
            paragraphs.append(' '.join(current_paragraph))
            current_paragraph = []
            current_length = 0
        current_paragraph.append(sentence)
        current_length += len(words)

    if current_paragraph:
        paragraphs.append(' '.join(current_paragraph))

    return paragraphs

def load_documents(doc_paths):
    all_paragraphs = []
    paragraph_to_doc_map = []

    for doc_id, doc_path in enumerate(doc_paths):
        with open(doc_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
            paragraphs = split_into_paragraphs(content)
            all_paragraphs.extend(paragraphs)
            paragraph_to_doc_map.extend([doc_id] * len(paragraphs))

    return all_paragraphs, paragraph_to_doc_map

all_paragraphs, paragraph_to_doc_map = load_documents(doc_paths)

# Embed paragraphs
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

paragraph_embeddings = []
for paragraph in all_paragraphs:
    inputs = ctx_tokenizer(paragraph, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    paragraph_embedding = ctx_encoder(**inputs).pooler_output.squeeze().numpy()
    paragraph_embeddings.append(paragraph_embedding)

# Store embeddings in FAISS
dimension = paragraph_embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)
paragraph_embeddings_np = np.array(paragraph_embeddings, dtype='float32')
index.add(paragraph_embeddings_np)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


In [None]:
# Load the DPR question encoder and tokenizer
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

def search_query(query, q_encoder, q_tokenizer, index, paragraphs, paragraph_to_doc_map, document_mapping, top_k=10):
    inputs = q_tokenizer(query, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    query_embedding = q_encoder(**inputs).pooler_output.squeeze().numpy()

    # Retrieve nearest paragraphs using FAISS
    scores, indices = index.search(np.array([query_embedding], dtype='float32'), top_k)

    results = [(paragraphs[i], scores[0][j], paragraph_to_doc_map[i]) for j, i in enumerate(indices[0])]

    # Print results with document names
    for paragraph, score, doc_id in results:
        print(f"Document: {document_mapping[doc_id]}\nParagraph: {paragraph}\nScore: {score}\n")

    return results

In [None]:
# Example query
query = "How to make money in Wall Street?"
results = search_query(query, q_encoder, q_tokenizer, index, all_paragraphs, paragraph_to_doc_map, document_mapping)