<a href="https://colab.research.google.com/github/Elshamysamira/Information-Extraction-and-Retrieval/blob/main/Paragraphs_withouth_nDCG_25_05_2022%5Bcleaned%5D_with_Metadata_withoutSAVING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages
!pip install datasets
!pip install faiss-cpu

import nltk
import numpy as np
import chardet
import torch
from datasets import Dataset
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd
from pathlib import Path
import os
import sqlite3
import re
from google.colab import drive

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [None]:
# Tokenizer data used by nltk.word_tokenize. Recent NLTK releases look up the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both to
# work with whichever NLTK version the runtime ships.
nltk.download('punkt')
nltk.download('punkt_tab')

# Mount Google Drive; the corpus and all generated artifacts live under it.
drive.mount('/content/drive/')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive/


In [None]:
class InvertedIndex:
    """SQLite-backed store for a word -> document-ID inverted index.

    Each row of the ``inverted_index`` table holds one word (primary key) and
    the comma-separated list of document IDs that contain it.
    """

    def __init__(self, db_path):
        """Open (or create) the database at ``db_path`` and ensure the table exists."""
        self.db_path = db_path
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        """Create the ``inverted_index`` table if it is not already present."""
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS inverted_index (
                                word TEXT PRIMARY KEY,
                                document_ids TEXT
                            )''')

    def save_index(self, inverted_index):
        """Persist ``inverted_index`` (mapping of word -> iterable of document IDs).

        Rows are written with ``INSERT OR REPLACE`` so that saving into a
        database that already contains some of the words (for example when the
        notebook is re-run) overwrites those rows instead of failing on the
        primary-key constraint. Words that are absent from ``inverted_index``
        are left untouched in the table.
        """
        rows = [
            (word, ','.join(str(doc_id) for doc_id in document_ids))
            for word, document_ids in inverted_index.items()
        ]
        self.cursor.executemany(
            "INSERT OR REPLACE INTO inverted_index (word, document_ids) VALUES (?, ?)",
            rows,
        )
        self.conn.commit()

    def close_connection(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

class DocumentTokenizer:
    """Turns a sequence of raw document strings into lists of word tokens."""

    def __init__(self, documents):
        # Sequence of document texts; a document's position doubles as its ID.
        self.documents = documents

    def tokenize(self):
        """Return ``{document_id: tokens}`` for every document that tokenizes.

        A document whose tokenization raises is reported on stdout and left
        out of the returned mapping; the remaining documents are still processed.
        """
        tokens_by_id = {}
        for doc_number, text in enumerate(self.documents):
            try:
                tokens_by_id[doc_number] = word_tokenize(text)
            except Exception as err:
                print(f"Error processing document {doc_number}: {err}")
        return tokens_by_id

class InvertedIndexBuilder:
    """Builds an in-memory map from lower-cased token to the IDs of documents containing it."""

    def __init__(self, documents):
        # Mapping of document ID -> iterable of tokens.
        self.documents = documents
        # token (lower-cased) -> set of document IDs; filled by build_index().
        self.inverted_index = defaultdict(set)

    def build_index(self):
        """Add every token of every document to ``self.inverted_index``."""
        postings = self.inverted_index
        for doc_key, token_list in self.documents.items():
            for token in token_list:
                postings[token.lower()].add(doc_key)

    def get_index(self):
        """Return the (possibly still empty) index built so far."""
        return self.inverted_index

class SearchEngine:
    """Boolean AND keyword search over the SQLite inverted index."""

    def __init__(self, index_db_path, documents_mapping):
        """
        Args:
            index_db_path: path of the SQLite file holding the ``inverted_index`` table.
            documents_mapping: mapping of document ID -> human-readable document name.
        """
        self.index_db_path = index_db_path
        self.documents_mapping = documents_mapping

    def lookup_word(self, word):
        """Return the set of document IDs stored for ``word`` (empty set if none).

        The word is matched exactly as given; callers are expected to pass it
        in the same normalised form that was used when the index was built.
        """
        conn = sqlite3.connect(self.index_db_path)
        try:
            row = conn.execute(
                "SELECT document_ids FROM inverted_index WHERE word=?", (word,)
            ).fetchone()
        finally:
            # Always release the connection, even when the query itself fails.
            conn.close()

        if not row or not row[0]:
            return set()
        # Ignore empty fragments so a blank or trailing-comma list cannot crash int().
        return {int(doc_id) for doc_id in row[0].split(',') if doc_id.strip()}

    def search(self, query):
        """Print and return the IDs of documents that contain every token of ``query``.

        The query is lower-cased and tokenized with ``word_tokenize``; a
        document matches only if all resulting tokens are indexed for it.

        Returns:
            The set of matching document IDs (empty when nothing matches or
            the query has no tokens).
        """
        query_tokens = word_tokenize(query.lower())
        print(f"Tokenized Query: {query_tokens}")  # Print tokenized query

        document_sets = [self.lookup_word(token) for token in query_tokens]
        common_documents = set.intersection(*document_sets) if document_sets else set()

        if common_documents:
            print(f"Congratulations! The word(s) '{query}' appear together in the following document ID(s): {common_documents}")
            # Sorted so the per-document lines come out in a stable order.
            for doc_id in sorted(common_documents):
                print(f"Document ID: {doc_id}, Document Name: {self.documents_mapping.get(doc_id, 'Unknown')}")
        else:
            print(f"I'm sorry, the word(s) '{query}' do not appear together in any document.")
        return common_documents

class DocumentManager:
    """Locates the files that make up the document corpus."""

    def __init__(self, books_path):
        # Directory whose regular files are treated as documents.
        self.books_path = books_path

    def get_doc_paths(self):
        """Return the paths of the regular files directly inside ``books_path``.

        ``os.listdir`` yields entries in arbitrary order, and document IDs are
        later assigned by position in this list, so the names are sorted to
        keep the IDs stable from one run to the next. Sub-directories are skipped.
        """
        candidates = (
            os.path.join(self.books_path, name)
            for name in sorted(os.listdir(self.books_path))
        )
        return [path for path in candidates if os.path.isfile(path)]

def clean_text(text):
    """Collapse each run of blank (or whitespace-only) lines into a single empty line.

    Any newline, optional whitespace, then one or more newlines is replaced by
    exactly two newlines, so paragraphs end up separated by one blank line.
    """
    blank_run = re.compile(r'\n\s*\n+')
    return blank_run.sub('\n\n', text)

def load_and_clean_documents(doc_paths):
    """Read every file in ``doc_paths`` and normalise its blank lines.

    Files are decoded as UTF-8 and undecodable bytes are dropped.

    Returns:
        A list of ``(doc_id, cleaned_text, title)`` tuples, where ``doc_id`` is
        the file's position in ``doc_paths`` and ``title`` is the file name
        without its extension.
    """
    records = []
    for index, path in enumerate(doc_paths):
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            raw = handle.read()
        records.append((index, clean_text(raw), Path(path).stem))
    return records

# Paths
books_path = '/content/drive/My Drive/Documents'
index_db_path = '/content/drive/My Drive/Documents/inverted_index.db'

# This notebook writes its own artifacts (SQLite index, FAISS index, pickled
# dataset) into books_path. They are not corpus documents, so they must be
# excluded or a re-run would read them as text and index their contents.
generated_suffixes = {'.db', '.faiss', '.pt'}

# Initialize DocumentManager
doc_manager = DocumentManager(books_path)
doc_paths = [
    path for path in doc_manager.get_doc_paths()
    if Path(path).suffix.lower() not in generated_suffixes
]

# Load the documents and normalise their blank lines
cleaned_documents = load_and_clean_documents(doc_paths)
cleaned_texts = [doc[1] for doc in cleaned_documents]

# Tokenize documents
doc_tokenizer = DocumentTokenizer(cleaned_texts)
tokenized_docs = doc_tokenizer.tokenize()

# Build inverted index
index_builder = InvertedIndexBuilder(tokenized_docs)
index_builder.build_index()
inverted_index = index_builder.get_index()

# Save inverted index to SQLite. The index is rebuilt from scratch on every
# run, so drop any database left over from a previous run first: its rows would
# be stale and would collide with the new ones on the `word` primary key.
if os.path.exists(index_db_path):
    os.remove(index_db_path)
index_db = InvertedIndex(index_db_path)
try:
    index_db.save_index(inverted_index)
finally:
    # Release the connection even if saving fails.
    index_db.close_connection()

# Document mapping
document_mapping = {doc_id: doc_title for doc_id, _, doc_title in cleaned_documents}

# Initialize SearchEngine
search_engine = SearchEngine(index_db_path, document_mapping)
# Demo queries: mixed case, multi-word, contractions, hyphenated terms and an
# abbreviation with and without its trailing period.
demo_queries = [
    'HateD',
    'HateD Applied',
    "Can't",
    "didn't",
    "state-of-the-art",
    "Elliott-Fisher",
    "Mr.",
    "Mr",
]
for position, demo_query in enumerate(demo_queries):
    if position:
        # Separator between consecutive results (none before the first one).
        print('\n')
    search_engine.search(demo_query)

# Dump every table of the index database for inspection.
connect_db = sqlite3.connect(index_db_path)
try:
    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", connect_db)

    for table in tables['name']:
        print(f"Table Name: {table}")
        # Select from the table being reported (names come from sqlite_master),
        # not from a hard-coded one, so the output matches the printed header.
        query = f'SELECT * FROM "{table}"'
        df = pd.read_sql_query(query, connect_db)
        display(df)
        print('\n')
finally:
    # Close the connection even if one of the reads fails.
    connect_db.close()

# Semantic similarity using DPR (inference only, so autograd is switched off globally)
torch.set_grad_enabled(False)

# Context (passage) encoder and its tokenizer, both loaded from the same checkpoint
ctx_checkpoint = "facebook/dpr-ctx_encoder-single-nq-base"
ctx_encoder = DPRContextEncoder.from_pretrained(ctx_checkpoint)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(ctx_checkpoint)

# Load and prepare dataset from cleaned documents
def load_paragraphs_for_dpr(cleaned_docs):
    """Split each cleaned document into paragraph records.

    Args:
        cleaned_docs: iterable of ``(doc_id, content, doc_title)`` tuples whose
            ``content`` separates paragraphs with one blank line.

    Returns:
        A list of dicts with keys ``doc_id``, ``doc_title``, ``para_id`` and
        ``content``. ``para_id`` is the paragraph's position within its
        document. Empty or whitespace-only paragraphs (for example from leading
        or trailing blank lines) are skipped, because they carry no text worth
        embedding; the remaining paragraphs keep their positional ``para_id``.
    """
    paragraphs = []
    for doc_id, content, doc_title in cleaned_docs:
        for para_id, paragraph in enumerate(content.split('\n\n')):
            if not paragraph.strip():
                continue
            paragraphs.append({"doc_id": doc_id, "doc_title": doc_title, "para_id": para_id, "content": paragraph})
    return paragraphs

paragraphs = load_paragraphs_for_dpr(cleaned_documents)

# Preview the first 50 paragraphs, each followed by an explicit end marker so
# paragraph boundaries are visible in the output.
for paragraph in paragraphs[:50]:
    print(paragraph['content'], '[END of paragraph]', sep='\n')

# Column-oriented view of the paragraph records, one list per field.
ds = Dataset.from_dict({
    column: [para[column] for para in paragraphs]
    for column in ("doc_id", "doc_title", "para_id", "content")
})

Tokenized Query: ['hated']
Congratulations! The word(s) 'HateD' appear together in the following document ID(s): {0}
Document ID: 0, Document Name: Dumbells of Business by Louis Custer Martin Reed


Tokenized Query: ['hated', 'applied']
I'm sorry, the word(s) 'HateD Applied' do not appear together in any document.


Tokenized Query: ['ca', "n't"]
I'm sorry, the word(s) 'Can't' do not appear together in any document.


Tokenized Query: ['did', "n't"]
I'm sorry, the word(s) 'didn't' do not appear together in any document.


Tokenized Query: ['state-of-the-art']
I'm sorry, the word(s) 'state-of-the-art' do not appear together in any document.


Tokenized Query: ['elliott-fisher']
I'm sorry, the word(s) 'Elliott-Fisher' do not appear together in any document.


Tokenized Query: ['mr', '.']
Congratulations! The word(s) 'Mr.' appear together in the following document ID(s): {0}
Document ID: 0, Document Name: Dumbells of Business by Louis Custer Martin Reed


Tokenized Query: ['mr']
Congratul

Unnamed: 0,word,document_ids
0,the,01
1,project,01
2,gutenberg,01
3,ebook,01
4,of,01
...,...,...
7654,overcome,1
7655,manly,1
7656,visualizing,1
7657,clearness,1






The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


The Project Gutenberg eBook of Dumbells of Business
[END of paragraph]
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.
[END of paragraph]
Title: Dumbells of Business
[END of paragraph]
Author: Louis Custer Martin Reed
[END of paragraph]
Release date: May 1, 2015 [eBook #48842]
[END of paragraph]
Language: English
[END of paragraph]
Credits: Produced by Juliet Sutherland and the Online Distributed
        Proofreading Team at http://www.pgdp.net
[END of paragraph]
*** START OF THE PROJECT GUTENBERG EBOOK DUMBELLS OF BUSINESS ***
[END of paragraph]
Produced by Juliet Sutherland and the Online Distrib

In [None]:
# Longest token sequence passed to the encoders; longer paragraphs are truncated.
max_length = 512

def add_embeddings(example):
    """Encode one dataset row's ``content`` with the DPR context encoder.

    Returns a dict with the single key ``"embeddings"`` holding the encoder's
    pooled output as a NumPy array, which ``Dataset.map`` adds as a new column.
    """
    encoded = ctx_tokenizer(
        example["content"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    pooled = ctx_encoder(**encoded).pooler_output
    return {"embeddings": pooled.squeeze().numpy()}

ds_with_embeddings = ds.map(add_embeddings)
# NOTE(review): no metric_type is passed, so the library's default metric is
# used; DPR is trained for dot-product similarity - confirm the default is intended.
ds_with_embeddings.add_faiss_index(column='embeddings')

Map:   0%|          | 0/1083 [00:00<?, ? examples/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['doc_id', 'doc_title', 'para_id', 'content', 'embeddings'],
    num_rows: 1083
})

In [None]:
# Question encoder and its tokenizer, both loaded from the same checkpoint
q_checkpoint = "facebook/dpr-question_encoder-single-nq-base"
q_encoder = DPRQuestionEncoder.from_pretrained(q_checkpoint)
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(q_checkpoint)

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Example query, tokenized the same way as the paragraphs
question = "How to make money on Wall Street?"
inputs = q_tokenizer(
    question,
    truncation=True,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt",
)

# Pooled question vector as a flat NumPy array
question_output = q_encoder(**inputs)
question_embedding = question_output.pooler_output.squeeze().numpy()

# Ten nearest paragraphs according to the FAISS index on the 'embeddings' column
scores, retrieved_examples = ds_with_embeddings.get_nearest_examples(
    'embeddings', question_embedding, k=10
)

In [None]:
# Show each retrieved paragraph under the title of the document it came from
for hit_title, hit_text in zip(retrieved_examples['doc_title'], retrieved_examples['content']):
    print(f"Document Title: {hit_title}")
    print(f"Paragraph: {hit_text}\n")

Document Title: Creating Capital by Frederick L. Lipman
Paragraph: The object of this paper is to discuss money-making; to examine its
prevalence as an aim among people generally and the moral standards
which obtain among those who consciously seek to make money.

Document Title: Creating Capital by Frederick L. Lipman
Paragraph: We can now summarize the attitude and policy of the typical business
man in his money-making aim as follows:

Document Title: Dumbells of Business by Louis Custer Martin Reed
Paragraph: But gradually they got used to the upset and flurry, and when the
monthly Balance Sheet began to smile and then to grin, it poked their
Ambition in the ribs and the first thing they knew they were actually
craning their shaved necks for business in the Domestic as well as in
the Export arena.

Document Title: Dumbells of Business by Louis Custer Martin Reed
Paragraph: If the Credit Man lost only 1/40th of 1 per cent on the year’s
accounts, he would dodge around in front of the 

In [None]:
# Access the underlying FAISS index object attached to the 'embeddings' column
faiss_index = ds_with_embeddings.get_index('embeddings').faiss_index

# Range search around the question embedding (reshaped to a single-row 2-D query).
# NOTE(review): limits/distances/indices are not used anywhere in the visible
# notebook - confirm whether this exploratory call is still needed.
# NOTE(review): how `thresh` is interpreted depends on the metric the index was
# built with - confirm that 0.95 is a meaningful radius for these embeddings.
limits, distances, indices = faiss_index.range_search(x=question_embedding.reshape(1, -1), thresh=0.95)

# Save FAISS index to Google Drive
# NOTE(review): this file (and the .pt file below) is written into the same
# folder that `books_path` points at, so DocumentManager.get_doc_paths() will
# list it as a document on the next run unless such files are filtered out.
faiss_index_path = '/content/drive/My Drive/Documents/my_index.faiss'
ds_with_embeddings.save_faiss_index('embeddings', faiss_index_path)

# Save dataset with embeddings
# NOTE(review): torch.save pickles the whole Dataset object while the FAISS
# index is still attached - verify that the file can be loaded back as expected.
dataset_with_embeddings_path = '/content/drive/My Drive/Documents/dataset_with_embeddings.pt'
torch.save(ds_with_embeddings, dataset_with_embeddings_path)

# Rebuild the paragraph dataset (without the embeddings column) and attach the
# FAISS index that was just saved, to check that it can be loaded from Drive.
# This rebinds `ds`; `ds_with_embeddings` is left unchanged.
ds = Dataset.from_dict({
    "doc_id": [para["doc_id"] for para in paragraphs],
    "doc_title": [para["doc_title"] for para in paragraphs],
    "para_id": [para["para_id"] for para in paragraphs],
    "content": [para["content"] for para in paragraphs]
})
ds.load_faiss_index('embeddings', faiss_index_path)


In [None]:
import numpy as np
import os

# Function to export embeddings and metadata for TensorFlow Projector
def export_to_tsv(dataset, output_dir):
    """Write ``embeddings.tsv`` and ``metadata.tsv`` into ``output_dir``.

    Args:
        dataset: iterable of mappings with the keys ``doc_id``, ``doc_title``,
            ``para_id``, ``content`` and (optionally) ``embeddings``.
        output_dir: existing directory that receives the two files.

    Rows without an ``embeddings`` value are skipped in both files, so line N
    of the embeddings file always corresponds to data row N of the metadata
    file. The metadata file starts with a header row; the embeddings file has none.
    """
    embeddings_path = os.path.join(output_dir, 'embeddings.tsv')
    metadata_path = os.path.join(output_dir, 'metadata.tsv')

    # Tabs and line breaks are TSV delimiters: one left inside a field would
    # shift the columns or split the row, so each is replaced by a space.
    delimiter_to_space = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

    with open(embeddings_path, 'w', encoding='utf-8') as embeddings_file, \
         open(metadata_path, 'w', encoding='utf-8') as metadata_file:

        # Write headers
        metadata_file.write("doc_id\tdoc_title\tpara_id\tcontent\n")

        for example in dataset:
            if 'embeddings' in example and example['embeddings'] is not None:
                # Write embeddings
                embeddings_file.write('\t'.join(map(str, example['embeddings'])) + '\n')
                # Write metadata: sanitised title plus the first 30 characters of the text
                doc_title = str(example['doc_title']).translate(delimiter_to_space)
                content_snippet = example['content'][:30].translate(delimiter_to_space) + '...'
                metadata_file.write(f"{example['doc_id']}\t{doc_title}\t{example['para_id']}\t{content_snippet}\n")

# Create the output directory; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run.
export_dir = '/content/drive/My Drive/Documents/tensorflow_projector'
os.makedirs(export_dir, exist_ok=True)

# Export embeddings and metadata to the specified directory
export_to_tsv(ds_with_embeddings, export_dir)

print(f"Embeddings and metadata exported to {export_dir} for TensorFlow Projector.")


Embeddings and metadata exported to /content/drive/My Drive/Documents/tensorflow_projector for TensorFlow Projector.
