<a href="https://colab.research.google.com/github/BobGanti/ColabNotebooks/blob/main/CRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementation of CRAG <br>(Corrective Retrieval-Augmented Generation)

### Setup

#### Installing Required Libraries

In [None]:
!pip install -q \
torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pyarrow==14.0.1 datasets==2.13.0 \
dill \
openai \
requests \
datasets \
accelerate \
beautifulsoup4 \
PyPDF2 PyMuPDF \
faiss-cpu faiss-gpu \
google-api-python-client \
transformers sentence-transformers sentencepiece rank-bm25

In [3]:
import os
import re
import torch
import pickle
import pymupdf
import uuid
import numpy as np
from openai import OpenAI
from torch.utils.data import Dataset
import transformers
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter

#### Mounting the drive and setting up the environment variables (Colab specific)

In [2]:

from google.colab import drive, userdata
# Mount Google Drive. The result is deliberately not assigned back to `drive`,
# so the name keeps referring to the imported module.
drive.mount('/content/drive', force_remount=True)

# Data root and API credentials are read from the Colab user secrets.
DATA_DIR = userdata.get('ROOT_DIR')
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
GOOGLE_SEARCH_API_KEY = userdata.get('GOOGLE_SEARCH_API_KEY')
GOOGLE_CSE_ID = userdata.get('GOOGLE_CSE_ID')

# Working directories derived from the data root.
arxiv_DIR = DATA_DIR + "/arxiv"
PDF_DIR = arxiv_DIR + "/PDFs"


Mounted at /content/drive


### Class Definitions

In [4]:
# Running count of chunks created across all documents in this session.
global_chunk_counter = 0


class Chunk:
    """One piece of document text together with its metadata and embedding."""

    def __init__(self, text, metadata, embedding):
        self.text, self.embedding, self.metadata = text, embedding, metadata

    def __repr__(self):
        # The metadata dict must provide 'chunk_id', 'title' and 'page_number'.
        meta = self.metadata
        return f"Chunk({meta['chunk_id']}, {meta['title']}, Page {meta['page_number']})"

class Document:
    """A PDF file split into embedded text chunks.

    Creating an instance opens the PDF at ``path``, splits each page's text
    into chunks and stores one ``Chunk`` (text, metadata, embedding) per piece
    in ``self.chunks``.
    """

    # A line that holds only a "References"/"Bibliography" heading, optionally
    # numbered (e.g. "7 References"). Anchoring to a whole line keeps words
    # such as "preferences" or "cross-references" from being treated as the
    # start of the reference list.
    _REFERENCES_HEADING = re.compile(
        r'^[ \t]*(?:\d+\.?[ \t]+)?(?:references|bibliography)[ \t]*$',
        re.IGNORECASE | re.MULTILINE,
    )

    def __init__(self, path):
        self.path = path
        self.title = os.path.splitext(os.path.basename(path))[0]
        # Title plus a random suffix: a new id is generated on every construction.
        self.document_id = f"{self.title.replace(' ', '_')}#{str(uuid.uuid4())[:10]}"
        self.chunks = []
        self._extract_and_chunk_text(chunk_size=2500, chunk_overlap=100)

    def _extract_and_chunk_text(self, chunk_size=2500, chunk_overlap=100):
        """Split every page into chunks, embed each one and append it to ``self.chunks``.

        Args:
            chunk_size: Maximum characters per chunk handed to the text splitter.
            chunk_overlap: Characters shared between consecutive chunks.
        """
        global global_chunk_counter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=['\n\n', '\n', ' ', ''],
        )
        print(f"\n***** Processing: {self.title} *****\n")
        num_chunks = 0
        # The context manager closes the PDF handle even if embedding fails midway.
        with pymupdf.open(self.path) as document:
            for page_number, page in enumerate(document, start=1):
                page_text = self.get_page_text(page)
                for chunk in text_splitter.split_text(page_text):
                    global_chunk_counter += 1

                    metadata = {
                        "title": self.title,
                        "document_id": self.document_id,
                        "page_number": page_number,
                        # The chunk number runs across the whole document, not per page.
                        "chunk_id": f"Pge-{page_number}-Chnk{num_chunks + 1}",
                        "global_chunk_number": global_chunk_counter
                    }
                    embed = EmbeddingModels(MODEL_NAME, chunk, flag='gpt').embeddings
                    self.chunks.append(Chunk(chunk, metadata, embed))
                    print(f"Chunk {global_chunk_counter} created: {metadata}")
                    num_chunks += 1
        print(f"Total chunks created: {num_chunks}")

    def remove_section(self, text):
        """Return ``text`` up to the first References/Bibliography heading line.

        This is a heuristic: only a line consisting of the heading itself
        (optionally numbered) counts, so ``text`` must still contain its line
        breaks. Text without such a line is returned unchanged.
        """
        return self._REFERENCES_HEADING.split(text, maxsplit=1)[0]

    def get_page_text(self, page):
        """Return the page's text with the reference list cut and whitespace collapsed."""
        text = page.get_text()
        # The heading is detected per line, so cut before newlines are collapsed.
        text = self.remove_section(text)
        return re.sub(r'\s+', ' ', text)  # Remove extra whitespace

    def get_chunks(self):
        """Return the list of ``Chunk`` objects created for this document."""
        return self.chunks

    def save_document(self, dir):
        """Pickle this document to ``<dir>/<document_id>.pkl``, creating ``dir`` if needed."""
        os.makedirs(dir, exist_ok=True)

        file_path = os.path.join(dir, f'{self.document_id}.pkl')
        with open(file_path, 'wb') as file:
            pickle.dump(self, file)

    def __repr__(self):
        return f"Document({self.title}, {len(self.chunks)} chunks)"

# EmbeddingModels class
class EmbeddingModels:
    """Compute an embedding for ``text`` and expose it as ``self.embeddings``.

    Args:
        model: For ``flag='gpt'`` the OpenAI embedding model name; for
            ``flag='sentence'`` an object with an ``encode`` method
            (presumably a SentenceTransformer instance, not a model name).
        text: The text to embed.
        flag: ``'gpt'`` or ``'sentence'``.

    Raises:
        ValueError: If ``flag`` is not one of the supported values.
    """

    def __init__(self, model, text, flag='gpt'):
        self.model = model
        self.flag = flag
        if self.flag == 'gpt':
            self.embeddings = self.__create_gpt_embedding(text)
        elif self.flag == 'sentence':
            self.embeddings = self.__create_sentence_embedding(text)
        else:
            # Fail here rather than with an AttributeError on `.embeddings` later.
            raise ValueError(f"Unsupported embedding flag: {flag!r} (expected 'gpt' or 'sentence')")

    def __create_gpt_embedding(self, text):
        """Request an embedding for ``text`` from the OpenAI embeddings endpoint."""
        client = OpenAI(api_key=OPENAI_API_KEY)
        return client.embeddings.create(input=[text], model=self.model).data[0].embedding

    def __create_sentence_embedding(self, text):
        """Encode ``text`` with ``self.model`` and return a float32 NumPy array."""
        embedding = self.model.encode(text, convert_to_tensor=True)
        # A tensor living on an accelerator cannot be converted by NumPy
        # directly; detach it and move it to the CPU first.
        if hasattr(embedding, 'detach'):
            embedding = embedding.detach().cpu().numpy()
        return np.asarray(embedding, dtype='float32')

# VectorStore class
class VectorStore:
    """A flat L2 FAISS index plus a parallel in-memory list of metadata.

    Only the index is written by ``save_index``; ``metadata_store`` lives in
    memory and is not restored by ``load_index``.
    """

    def __init__(self, dimension, index_file=arxiv_DIR + '/Vectors/faiss_index.idx'):
        self.index = faiss.IndexFlatL2(dimension)
        self.index_file = index_file
        self.metadata_store = []

    def add_embeddings(self, embeddings, metadata):
        """Add one embedding (1-D) or a batch (2-D) with the matching metadata.

        For a single 1-D embedding ``metadata`` is one record; for a batch it
        must be a list with one record per row.
        """
        embeddings = np.array(embeddings).astype('float32')
        if len(embeddings.shape) == 1:
            embeddings = np.expand_dims(embeddings, axis=0)
            metadata = [metadata]
        self.index.add(embeddings)
        self.metadata_store.extend(metadata)

    def query(self, embedding, top_k=5):
        """Return ``(distances, indices)`` of the ``top_k`` nearest stored vectors.

        Searches the in-memory index. The index file is read only when nothing
        has been added or loaded yet, so vectors added but not yet saved stay
        searchable and repeated queries do not re-read the file.
        """
        embedding_np = np.asarray(embedding, dtype='float32')
        if embedding_np.ndim == 1:
            # FAISS expects a 2-D batch of queries.
            embedding_np = embedding_np[np.newaxis, :]
        if self.index.ntotal == 0 and os.path.exists(self.index_file):
            self.load_index()
        distances, indices = self.index.search(embedding_np, top_k)
        return distances, indices

    def save_index(self):
        """Write the index to ``self.index_file`` and return a status message.

        Failures are reported through the returned string, not raised.
        """
        try:
            faiss.write_index(self.index, self.index_file)
            return "Index saved to disk."
        except Exception as e:
            return f"Error saving index: {e}"

    def load_index(self):
        """Replace the in-memory index with the one stored in ``self.index_file``."""
        self.index = faiss.read_index(self.index_file)
        return self.index

# Custom dataset class
class GPT2Dataset(Dataset):
    """Dataset view over a mapping of field name -> indexable batch of values."""

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        # The number of samples is taken from the 'input_ids' field.
        input_ids = self.inputs['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every field.
        sample = {}
        for name, values in self.inputs.items():
            sample[name] = values[idx]
        return sample

class EvaluatorEncoder:
    """Builds tokenized query/chunk pairs for the relevance evaluator."""

    def encode_data(self, tokenizer, data, max_length=512, is_test=False):
        """Tokenize "Question: ... Context: ..." prompts built from ``data``.

        ``data`` must provide 'Query' and 'Chunk' columns, plus 'Relevance'
        unless ``is_test`` is true. Labels are attached under 'labels' only
        for non-test data.
        """
        questions = data['Query'].tolist()
        contexts = data['Chunk'].tolist()
        prompts = []
        for question, context in zip(questions, contexts):
            prompts.append(f"Question: {question} Context: {context}")

        encoded = tokenizer(
            prompts,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        if is_test:
            return encoded

        encoded['labels'] = torch.tensor(data['Relevance'].tolist())
        return encoded


### PDFs Preprocessing Functions

#### Creating Documents and Saving Them Locally

In [8]:

# Function to read PDFs from a directory and create Document objects
def create_documents_from_pdf(pdf_dir, save_dir):
    """Build a Document for every '.pdf' file in ``pdf_dir`` and pickle it into ``save_dir``.

    Returns the list of created Document objects, in directory-listing order.
    """
    pdf_paths = []
    for name in os.listdir(pdf_dir):
        if name.endswith('.pdf'):
            pdf_paths.append(os.path.join(pdf_dir, name))
    print("Number of PDFs: ", len(pdf_paths))

    created = []
    for pdf_path in pdf_paths:
        doc = Document(pdf_path)
        doc.save_document(save_dir)
        created.append(doc)
    return created

# Directory that receives one pickle per processed PDF.
SAVE_DIR = arxiv_DIR + "/Documents"
os.makedirs(SAVE_DIR, exist_ok=True)

# Create a Document for every PDF in PDF_DIR and save each one locally.
# Constructing a Document embeds every chunk, so this issues one embedding
# request per chunk.
created_documents = create_documents_from_pdf(PDF_DIR, SAVE_DIR)
print("Number of Documents Created: ", len(created_documents))


In [6]:

# NOTE(review): this cell contains the same statements as the cell before it.
# Every Document gets a new random suffix in its document_id, so running the
# creation again writes additional pickle files next to the existing ones
# (and repeats all embedding requests) — run only one of the two cells.
SAVE_DIR = arxiv_DIR + "/Documents"
os.makedirs(SAVE_DIR, exist_ok=True)

# Create a Document for every PDF in PDF_DIR and save each one locally.
created_documents = create_documents_from_pdf(PDF_DIR, SAVE_DIR)
print("Number of Documents Created: ", len(created_documents))


#### Loading Saved Documents from the Local Directory and Exploring Their Features

---



In [17]:
# Function to load saved Document objects
def load_saved_documents(dir):
    """Unpickle every '.pkl' file in ``dir`` and return the objects in listing order.

    The files are unpickled as-is, so ``dir`` should only hold pickles this
    notebook wrote itself.
    """
    loaded = []
    for name in os.listdir(dir):
        if not name.endswith('.pkl'):
            continue
        with open(os.path.join(dir, name), 'rb') as handle:
            loaded.append(pickle.load(handle))
    return loaded
# Load every pickled Document found in SAVE_DIR.
documents = load_saved_documents(SAVE_DIR)
print("Number of Documents: ", len(documents))

Number of Documents:  23


In [None]:
# Directory paths
LOAD_DIR = arxiv_DIR + "/Documents"
MODEL_NAME = "text-embedding-3-small"
FLAG = 'gpt'

# Load documents from the local directory
loaded_documents = load_saved_documents(LOAD_DIR)
print("Number of Documents: ", len(loaded_documents))

# Verify embeddings by inspecting the last chunk of the last document.
# Metadata, embedding and text are all taken from that same chunk.
if loaded_documents and loaded_documents[-1].get_chunks():
    last_chunk = loaded_documents[-1].get_chunks()[-1]
    print(last_chunk.metadata)
    print(f"Embeddings: {last_chunk.embedding[:5]}")
    print("Text: ", last_chunk.text[:200])
else:
    print("No chunks available to inspect.")


### Vector Store

#### Vectorstore Credentials Setup

In [5]:
def get_vectorstore_credentials(FLAG='gpt'):
    """Return the embedding settings table, keyed by backend ('gpt', 'sentence').

    Each entry holds the embedding "MODEL_NAME" and the "FAISS_INDEX_FILE"
    path for that backend. ``FLAG`` is accepted but not used: the whole table
    is returned and the caller selects the entry.
    """
    gpt_index_file = arxiv_DIR + '/Vectors/faiss_index.idx'
    sentence_index_file = arxiv_DIR + '/Vectors/faiss_index_sentence.idx'

    credentials = dict(
        gpt=dict(MODEL_NAME="text-embedding-3-small", FAISS_INDEX_FILE=gpt_index_file),
        sentence=dict(MODEL_NAME='all-MiniLM-L6-v2', FAISS_INDEX_FILE=sentence_index_file),
    )
    return credentials

# Select the embedding backend and read its model name and index path.
FLAG='gpt'
EmbeddingCredentials = get_vectorstore_credentials(FLAG)
MODEL_NAME = EmbeddingCredentials[FLAG]['MODEL_NAME']
FAISS_INDEX_FILE = EmbeddingCredentials[FLAG]['FAISS_INDEX_FILE']
# Make sure the directory that will hold the index file exists.
os.makedirs(os.path.dirname(FAISS_INDEX_FILE), exist_ok=True)
print(MODEL_NAME)
print(FAISS_INDEX_FILE)


text-embedding-3-small
/content/drive/MyDrive/Datasets/arxiv/Vectors/faiss_index.idx


#### Initialising the Vectorstore

In [10]:
# Initialising the Vectorstore
# The index dimension is taken from the length of a probe embedding.
DIM = len(EmbeddingModels(MODEL_NAME, "hello", flag='gpt').embeddings)
vector_store = VectorStore(dimension=DIM, index_file=FAISS_INDEX_FILE)

# On a first run no index file exists yet; keep the empty index in that case
# so the record count below is 0 and the documents get added afterwards.
if os.path.exists(FAISS_INDEX_FILE):
    vector_store.load_index()
vectorstore_records = vector_store.index.ntotal
print("Num vectores after loading embeddings: ", vectorstore_records)

Num vectores after loading embeddings:  775


In [12]:
# Adding Documents
def add_documents_to_vectorstore():
    """Add every chunk embedding of ``loaded_documents`` to ``vector_store`` and save it.

    Chunks are added document by document in list order, so a FAISS row id
    equals the chunk's position in that flattened sequence.

    Returns:
        The populated ``vector_store``.
    """
    for doc in loaded_documents:
        for chunk in doc.get_chunks():
            vector_store.add_embeddings(chunk.embedding, chunk.metadata)

    # save_index reports failures through its return value; print it so a
    # failed write does not go unnoticed.
    print(vector_store.save_index())
    return vector_store

# Populate the store only when no vectors were loaded from disk.
if vectorstore_records == 0:
    vector_store = add_documents_to_vectorstore()

print("Num vectores after adding documents: ", vector_store.index.ntotal)


Num vectores after adding documents:  775


### Querying And Retrieving Similar Vectors and Their Distances from Query

In [19]:
import time

query_text = "What are the three different ways multi-head attention is utilized in the Transformer model, and how does self-attention in the decoder maintain the auto-regressive property?"
documents = loaded_documents

start_time = time.time()

query_embedding = EmbeddingModels(MODEL_NAME, query_text, flag='gpt').embeddings

distances, indices = vector_store.query(query_embedding, top_k=5)

# Flatten all chunks once, in the same document/chunk order in which they were
# added to the index, so a FAISS row id can be used directly as a list index.
chunks = [chunk for doc in documents for chunk in doc.get_chunks()]
indices = indices[0]

# FAISS fills missing neighbours with -1 when the index holds fewer than top_k
# vectors; skip those (and any id outside the list) instead of letting -1
# silently pick the last chunk.
retrieved_chunks = [chunks[idx] for idx in indices if 0 <= idx < len(chunks)]

end_time = time.time()
print(f"*** Time taken: {end_time - start_time} seconds *****")


*** Time taken: 0.5313231945037842 seconds *****


In [20]:

# Show the row ids and distances returned by the search, then each chunk.
print("\nIndices: ", indices)
print("\nDistances: ", distances[0])
for chunk in retrieved_chunks:
    print("Chunk: ", chunk)
    print("     Chars: ", len(chunk.text))
    print()



Indices:  [105 109 106 102 573]

Distances:  [0.5638442  0.6012912  0.6202766  0.80954206 0.8601509 ]
Chunk:  Chunk(Pge-2-Chnk4, Attention Is All You Need, Page 2)
     Chars:  1859

Chunk:  Chunk(Pge-5-Chnk8, Attention Is All You Need, Page 5)
     Chars:  2498

Chunk:  Chunk(Pge-3-Chnk5, Attention Is All You Need, Page 3)
     Chars:  1826

Chunk:  Chunk(Pge-1-Chnk1, Attention Is All You Need, Page 1)
     Chars:  2497

Chunk:  Chunk(Pge-4-Chnk7, Leave No Context Behind, Page 4)
     Chars:  751



#### Detecting Knowledge in the Retrieved Chunks, discarding Chunks without knowledge

In [21]:
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
import torch
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def detect_knowledge(query_text, retrieved_chunks, query_embedding=None):
    """Keep the retrieved chunks that look relevant to the query.

    Every chunk gets a confidence score: the sum of its min-max normalised
    cosine similarity to the query embedding and its min-max normalised BM25
    score against the whitespace-tokenised query. A chunk is kept when both
    normalised scores are non-zero and the confidence is at least 0.5.

    Args:
        query_text: The user query.
        retrieved_chunks: Chunk objects exposing ``text`` and ``embedding``.
        query_embedding: Embedding of ``query_text``. When omitted, the
            module-level ``query_embedding`` is used, as before.

    Returns:
        The retained chunks, ordered by descending confidence. An empty input
        yields an empty list.
    """
    if not retrieved_chunks:
        # Nothing to score; BM25 and the scalers cannot work on an empty set.
        return []
    if query_embedding is None:
        query_embedding = globals()['query_embedding']

    retrieved_chunk_texts = [chunk.text for chunk in retrieved_chunks]
    eva_document_embeddings = [chunk.embedding for chunk in retrieved_chunks]

    # Computing cosine similarities
    cosine_scores = util.cos_sim(query_embedding, eva_document_embeddings)
    cosine_scores_normalized = MinMaxScaler().fit_transform(
        cosine_scores.cpu().numpy()[0].reshape(-1, 1)
    ).flatten()

    # BM25-Based Keyword Matching
    tokenized_docs = [doc.split(" ") for doc in retrieved_chunk_texts]
    bm25 = BM25Okapi(tokenized_docs)
    bm25_scores = bm25.get_scores(query_text.split(" "))
    bm25_scores_normalized = MinMaxScaler().fit_transform(
        bm25_scores.reshape(-1, 1)
    ).flatten()

    # Combine Normalized Scores to Create Confidence Scores
    confidence_scores = bm25_scores_normalized + cosine_scores_normalized

    # Rank all chunks by their confidence scores
    ranked_indices = np.argsort(-confidence_scores)

    # NOTE(review): min-max scaling maps the lowest score of each measure to
    # 0, so the weakest chunk on either measure is always discarded below
    # (including a lone chunk) — confirm this is intended.
    retained_chunks = []
    for idx in ranked_indices:
        if cosine_scores_normalized[idx] == 0 or bm25_scores_normalized[idx] == 0:
            continue  # "Highly Irrelevant"
        # >= 1.0 is "Relevant", [0.5, 1.0) is "Ambiguous"; both are kept.
        # Anything below 0.5 is "Irrelevant".
        if confidence_scores[idx] >= 0.5:
            retained_chunks.append(retrieved_chunks[idx])
    return retained_chunks


In [22]:
# Detect Knowledge: keep only the retrieved chunks judged relevant to the query.
retained_chunks = detect_knowledge(query_text, retrieved_chunks)

# Print Results: chunk counts before and after filtering, then each kept chunk.
print("Num Chunks before knowledge detection: ", len(retrieved_chunks))
print("Num Chunks after knowledge detection: ", len(retained_chunks))
for retained_subchunk in retained_chunks:
    print("\nRetained chunk: ", retained_subchunk)
    print("     Chars: ", len(retained_subchunk.text))


Num Chunks before knowledge detection:  5
Num Chunks after knowledge detection:  3

Retained chunk:  Chunk(Pge-5-Chnk8, Attention Is All You Need, Page 5)
     Chars:  2498

Retained chunk:  Chunk(Pge-3-Chnk5, Attention Is All You Need, Page 3)
     Chars:  1826

Retained chunk:  Chunk(Pge-2-Chnk4, Attention Is All You Need, Page 2)
     Chars:  1859


#### Evaluating Retrieval Accuracy using Fine-tuned GPT-2 and Trigger for Websearch

In [38]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer
import numpy as np
import pandas as pd
from scipy.special import softmax

import os
os.environ["WANDB_MODE"] = "disabled"

def revaluate_retrieved_chunks(query, chunks, model, tokenizer, relevant_label=None):
    """Score each chunk against the query with the fine-tuned classifier.

    Args:
        query: The user query.
        chunks: Chunk objects exposing ``text``.
        model: Sequence-classification model passed to ``Trainer``.
        tokenizer: Tokenizer matching ``model``.
        relevant_label: Index of the class that means "relevant". When given,
            the score is the probability of that class. When ``None``
            (default, previous behaviour) the score is the highest class
            probability, i.e. the confidence in whatever class was predicted.
            NOTE(review): with the default, a chunk confidently classified as
            not relevant still scores high, and for a two-class head the score
            is never below 0.5 — pass the relevant class index (presumably 1;
            confirm against the training labels) to get a relevance score.

    Returns:
        A list of ``{"chunk": chunk, "accuracy": "<score with 4 decimals>"}``
        dicts sorted by descending score. An empty input yields an empty list.
    """
    if not chunks:
        # The tokenizer and Trainer cannot run on an empty batch.
        return []

    # Preparing data for inference
    new_test_data = pd.DataFrame({
        'Query': [query] * len(chunks),
        'Chunk': [chunk.text for chunk in chunks]
    })

    # Encoding
    new_test_inputs = EvaluatorEncoder().encode_data(tokenizer, new_test_data, is_test=True)
    new_test_dataset = GPT2Dataset(new_test_inputs)

    trainer = Trainer(model=model)

    # Predict relevance
    logits = trainer.predict(new_test_dataset).predictions

    # Interpreting Scores
    probabilities = softmax(logits, axis=1)
    if relevant_label is None:
        scores = probabilities.max(axis=1)
    else:
        scores = probabilities[:, relevant_label]

    rated_chunks = [
        {"chunk": chunk, "accuracy": f"{score:.4f}"}
        for chunk, score in zip(chunks, scores)
    ]
    # Sort numerically; the stored value is a formatted string.
    rated_chunks.sort(key=lambda x: float(x['accuracy']), reverse=True)

    return rated_chunks

# Run
# Load the fine-tuned tokenizer and classifier from the same saved directory.
tokenizer = GPT2Tokenizer.from_pretrained(arxiv_DIR + '/gpt-model')
model = GPT2ForSequenceClassification.from_pretrained(arxiv_DIR + '/gpt-model')
# Use the end-of-sequence token id as the padding id; presumably needed
# because the model config has no pad token of its own — TODO confirm.
model.config.pad_token_id = model.config.eos_token_id

query = query_text

# Score the chunks that survived knowledge detection.
rated_chunks = revaluate_retrieved_chunks(query, retained_chunks, model, tokenizer)



In [39]:
# Show the query followed by every rated chunk and its score.
print("Query: ", query, "\n")

for chunk in rated_chunks:
    print(chunk['chunk'])
    print(chunk['accuracy'])
    print('************************')


Query:  What are the three different ways multi-head attention is utilized in the Transformer model, and how does self-attention in the decoder maintain the auto-regressive property? 

Chunk(Pge-3-Chnk5, Attention Is All You Need, Page 3)
0.9955
************************
Chunk(Pge-2-Chnk4, Attention Is All You Need, Page 2)
0.9907
************************
Chunk(Pge-5-Chnk8, Attention Is All You Need, Page 5)
0.9893
************************


#### Knowledge Refinement



In [41]:
import nltk
import math

nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Decompose the chunk into sentences
def decompose_chunk(subchunk):
    """Split ``subchunk`` into sentences using NLTK's ``sent_tokenize``."""
    return sent_tokenize(subchunk)

# Evaluate relevance of each segment
def evaluate_segments_heuristic(segments, query):
    """Score each segment by how many of its words also occur in the query.

    Matching is case-insensitive on whitespace-separated tokens; punctuation
    stays attached to the tokens. A word repeated in a segment counts every
    time it appears.

    Returns:
        A list of integer scores, one per segment, in input order.
    """
    # A set makes each membership test constant-time; the scores are unchanged.
    query_terms = set(query.lower().split())
    return [
        sum(term in query_terms for term in segment.lower().split())
        for segment in segments
    ]

# Recompose relevant segments
def recompose_segments(segments, relevance_scores, threshold=5):
    """Join, with single spaces, the segments whose score reaches ``threshold``."""
    kept = []
    for segment, score in zip(segments, relevance_scores):
        if score >= threshold:
            kept.append(segment)
    return ' '.join(kept)

# Keep the rated records that pass the score filter. The record list and the
# text list stay in the same order, so the report below can pair each refined
# text with the chunk it came from even when some chunks were filtered out.
internal_records = [chunk for chunk in rated_chunks if float(chunk['accuracy']) > 0.5]
internal_knowledge = [record['chunk'].text for record in internal_records]

print("\ninternal knowledge: ", len(internal_knowledge))
refined_internal_knowledge = []
query = query_text
for knowledge in internal_knowledge:
    # Sentence-split, score each sentence against the query, keep the strong ones.
    segments = decompose_chunk(knowledge)
    relevance_scores = evaluate_segments_heuristic(segments, query)
    recomposed_document = recompose_segments(segments, relevance_scores)
    refined_internal_knowledge.append(recomposed_document)

print("Len Doc: ", len(refined_internal_knowledge))
for record, refined in zip(internal_records, refined_internal_knowledge):
    source_text = record['chunk'].text
    print("Before Refine: ", len(source_text))
    print("Text: ", source_text[:50])
    print("Refined chunk Chars: ", len(refined))
    print("Text: ", refined[:50])
    print("Accuracy: ", record['accuracy'])
    print()
    print("Text: \n", refined)
    print()
    print(source_text)

print()



internal knowledge:  3
Len Doc:  3
Before Refine:  1826
Text:  Figure 1: The Transformer - model architecture. Th
Refined chunk Chars:  956
Text:  The Transformer follows this overall architecture 
Accuracy:  0.9955

Text: 
 The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively. The first is a multi-head self-attention mechanism, and the second is a simple, position- wise fully connected feed-forward network. That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent po

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Web Searches and scraping for External knowledge

In [60]:
import requests
import PyPDF2
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from googleapiclient.discovery import build

# Refined web passages; stays empty when no web search is triggered.
web_refined_knowledge = []

# Setup Trigger for Websearch
# Search the web when fewer than about half of the retrieved chunks were
# retained. The +0.01 nudges exact halves upward before rounding
# (e.g. 5 retrieved -> 2.51 -> 3, so fewer than 3 retained triggers a search).
trigger_web = len(retained_chunks) < round((len(retrieved_chunks)/2)+0.01)
print("Websearch Triggered: ", trigger_web)

def google_web_search(query, api_key, cse_id, num_results=3):
    """Run a Google Custom Search and scrape the text of every hit.

    Args:
        query: Search query.
        api_key: Google API key for the Custom Search service.
        cse_id: Custom Search Engine id.
        num_results: Number of results to request.

    Returns:
        A list of dicts with 'title', 'snippet', 'link' and 'text'. 'text' is
        the scraped page text, or "" when the page could not be fetched or
        parsed. The list is empty when the search has no results.
    """
    import io

    # Fetch one URL and return its text ("" on any failure).
    def _scrape_text_from_url(url):
        try:
            # A timeout keeps one unresponsive site from stalling the pipeline.
            response = requests.get(url, timeout=15)
            response.raise_for_status()
        except RequestException:
            return ""

        content_type = response.headers.get('Content-Type', '').lower()
        if 'pdf' in content_type or url.endswith('.pdf'):
            # Handle PDF: parse straight from memory. This avoids reading a
            # temp file that is still open for writing, and leaves nothing
            # behind when parsing fails.
            try:
                pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))
                return "".join(page.extract_text() or "" for page in pdf_reader.pages)
            except Exception as e:
                print(f"Failed to extract text from PDF at {url}: {e}")
                return ""
        if 'html' in content_type:
            # Handle HTML: keep the text of the paragraph tags only.
            try:
                soup = BeautifulSoup(response.text, 'html.parser')
                return ' '.join(para.get_text() for para in soup.find_all('p'))
            except Exception as e:
                print(f"Failed to extract text from HTML at {url}: {e}")
                return ""

        print(f"Unsupported content type at {url}: {content_type}")
        return ""

    # Google Custom Search API
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=query, cx=cse_id, num=num_results).execute()
    search_results = []
    # The response may carry no 'items' key when nothing matched — treat that
    # as an empty result set.
    for item in res.get('items', []):
        result = {
            'title': item.get('title', ''),
            'snippet': item.get('snippet', ''),
            'link': item['link'],
            'text': _scrape_text_from_url(item['link'])
            }
        search_results.append(result)
        print(f"Title: {result['title']}")
        print(f"Snippet: {result['snippet']}")
        print(f"Link: {result['link']}")
        print(f"Text: {result['text'][:100]}...")
    return search_results

def chunk_text(text, chunk_size, overlap_size):
    """Cut ``text`` into windows of ``chunk_size`` characters.

    Consecutive windows share ``overlap_size`` characters; the last window may
    be shorter. Empty text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap_size`` is
            not smaller than ``chunk_size`` (the window could not advance).
    """
    step = chunk_size - overlap_size
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "chunk_size must be positive and greater than overlap_size "
            f"(got chunk_size={chunk_size}, overlap_size={overlap_size})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

web_text = []
if trigger_web:
    res = google_web_search(query_text, GOOGLE_SEARCH_API_KEY, GOOGLE_CSE_ID, num_results=3)
    # Keep only the results whose page text could actually be scraped.
    web_text = [
        {
            "title": r['title'],
            "snippet": r['snippet'],
            "link": r['link'],
            "text": r['text']
        }
        for r in res
        if r['text'] != ""
    ]

    print("Num Web Results: ", len(web_text))
    print()
    # The loop variable is not named `re`: that would replace the regex module
    # that the Document class relies on.
    for entry in web_text:
        print(entry['title'])
        print(entry['snippet'])
        print(entry['link'])
        print(entry['text'][:100])
        print()

    # Chunking Scraped Text from Websearches.
    chunk_size = 1000
    overlap_size = 100
    external_knowledge = [
        chunk_text(webtext['text'], chunk_size, overlap_size) for webtext in web_text
    ]
    wchunks = [chunk for sublist in external_knowledge for chunk in sublist]

    if wchunks:
        # Retrieve Similar Web Chunks
        # Web-specific names keep the GPT-2 `model` and the document-retrieval
        # `query_embedding`, `distances` and `indices` from earlier cells intact.
        web_model = SentenceTransformer('all-MiniLM-L6-v2')

        web_query_embedding = np.array(web_model.encode(query_text)).astype('float32').reshape(1, -1)
        web_embeddings = np.array(web_model.encode(wchunks)).astype('float32').reshape(len(wchunks), -1)

        dimension = web_embeddings.shape[1]
        web_faiss_index = faiss.IndexFlatL2(dimension)
        web_faiss_index.add(web_embeddings)

        # Refining Web Data
        # Never ask for more neighbours than there are chunks; FAISS would pad
        # the result with -1, which would index the last chunk.
        web_k = min(3, len(wchunks))
        web_distances, web_indices = web_faiss_index.search(web_query_embedding, web_k)
        print("Distances: ", web_distances[0])
        print("Indices: ", web_indices[0])

        web_retrieved_chunks = [
            {
                "distance": f"{web_distances[0][i]:.4f}",
                "text": wchunks[idx]
            }
            for i, idx in enumerate(web_indices[0])
        ]

        # Refine Web Retrieved Chunks
        web_knowledge_corpus = web_retrieved_chunks
        query = query_text
        for knowledge in web_knowledge_corpus:
            segments = decompose_chunk(knowledge['text'])
            relevance_scores = evaluate_segments_heuristic(segments, query)
            recomposed_document = recompose_segments(segments, relevance_scores)
            web_refined_knowledge.append(recomposed_document)
            print(relevance_scores)
    else:
        # Nothing could be scraped, so there is nothing to embed or index.
        print("No web text could be scraped; skipping web knowledge refinement.")


Websearch Triggered:  False


### Reconciling the Knowledge Base

In [61]:
knowledge_base = []

# Internal (document) knowledge first, then web knowledge. Every refined web
# passage is added once, in order.
knowledge_base.extend(refined_internal_knowledge)
knowledge_base.extend(web_refined_knowledge)

print("Knowledge Base: ", len(knowledge_base))
for kb in knowledge_base:
        print(len(kb))
        print(kb[:100])


Knowledge Base:  3
956
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
939
In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced
1510
These are concatenated and once again projected, resulting in the final values, as depicted in Figur


### GPT Response Generator

In [62]:
from openai import OpenAI

llm = OpenAI(api_key=OPENAI_API_KEY)

def generate_gpt_response(query, text_chunks):
    """Ask the chat model to answer ``query`` using only the supplied context.

    Args:
        query: The user question.
        text_chunks: The context, either one string or an iterable of
            passages. Passages are joined with blank lines.

    Returns:
        The text content of the model's first choice.
    """
    if isinstance(text_chunks, str):
        context = text_chunks
    else:
        # Join the passages so the prompt holds plain text, not a list repr.
        context = "\n\n".join(str(chunk) for chunk in text_chunks)

    # The instruction, query and context all go into the user turn; an
    # assistant message would be read as something the model already said.
    response = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are an AI assistant with expertise in generating meaningful responses to queries based on given context."
            },
            {
                "role": "user",
                "content": (
                    "Generate a response to the following query, limiting your knowledge on the given context only.\n\n"
                    f"Query: {query}\n\nContext:\n{context}\n\nResponse:"
                )
            }
        ]
    )
    return response.choices[0].message.content

# Generate the final answer from the reconciled knowledge base.
query = query_text
# `context` is assigned here but not passed on; the call below uses
# knowledge_base directly.
context = knowledge_base
response = generate_gpt_response(query, knowledge_base)
print("Question: ", query)
print()
print("Response: ", response)
print()


Question:  What are the three different ways multi-head attention is utilized in the Transformer model, and how does self-attention in the decoder maintain the auto-regressive property?

Response:  In the Transformer model, multi-head attention is utilized in three distinct ways:

1. **Encoder-Decoder Attention**: In this layer, the queries come from the decoder's previous layer, while the keys and values are derived from the output of the encoder. This setup enables each position in the decoder to attend to all positions in the input sequence, effectively linking the encoder's output to the decoder's input.

2. **Self-Attention in the Encoder**: Here, all the keys, values, and queries originate from the same input, which is the previous layer's output in the encoder. This allows each position in the encoder to attend to all other positions in the encoder, creating a rich internal representation of the input sequence.

3. **Self-Attention in the Decoder**: Similar to the encoder's self