RAG based (Retrieval Augmented Generation) Information Retrieval System for document analysis

# Mount the Drive (No need to rerun)

In [1]:
# Log into Hugging Face (renders an interactive login widget in the notebook)
# Needed for permission to download Llama2
# NOTE(review): no Llama2 model is loaded in the visible cells of this notebook;
# confirm whether this login is still required.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Process Text Files

We use the NLTK package for NLP-related tasks (here, splitting text into sentences).

In [1]:
# Download the NLTK tokenizer data packages 'punkt' and 'punkt_tab'.
# sent_tokenize (used in later cells) presumably relies on them — TODO confirm
# which of the two the installed NLTK version actually needs.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import os
import docx
import re
from nltk.tokenize import sent_tokenize
from tqdm import tqdm  # Progress bar

# Path to your folder storing raw articles
FOLDER_PATH = r""

def extract_text_from_docx(file_path):
  """Read a .DOCX file and return its non-blank paragraphs joined by newlines.

  Args:
    file_path: Path of the document passed to docx.Document.

  Returns:
    One string with a line per paragraph; paragraphs that are empty or
    contain only whitespace are left out.
  """
  document = docx.Document(file_path)
  kept_paragraphs = []
  for paragraph in document.paragraphs:
    # Skip paragraphs that hold nothing but whitespace
    if paragraph.text.strip():
      kept_paragraphs.append(paragraph.text)
  return "\n".join(kept_paragraphs)

# Get list of all .DOCX files.
# Only the .docx format is listed: python-docx cannot open legacy binary .doc
# files, so including them would abort the processing loop.
# Names starting with "~$" are the temporary lock files Word leaves next to an
# open document; they are not real articles and are skipped as well.
# The list is sorted so that documents are processed in a reproducible order.
docx_files = sorted(
  f for f in os.listdir(FOLDER_PATH)
  if f.lower().endswith(".docx") and not f.startswith("~$")
)

# Function to remove weird/special characters, keeping most punctuation and symbols
def clean_special_characters(text):
  """Return `text` with every character outside the allowed set removed.

  Allowed characters: ASCII letters, digits, whitespace, and the symbols
  . , ; : ? ! ( ) " ' % - / $ & *
  """
  disallowed = re.compile(r"[^a-zA-Z0-9\s.,;:?!()\"'%\-/$&*]")
  return disallowed.sub("", text)

# Process all DOCX files with a progress bar
sentences_data = []  # (document_name, sentence) pairs
document_texts = {}  # document_name -> cleaned full text

for file in tqdm(docx_files):
  raw_text = extract_text_from_docx(os.path.join(FOLDER_PATH, file))
  # Collapse every whitespace run to a single space, then drop disallowed characters
  text = clean_special_characters(" ".join(raw_text.split()))

  # Filtering step for Nexus Uni articles:
  # cut the header (everything up to and including "Body ") that would
  # otherwise end up glued to the first sentence
  header = re.search(r"Length: \d+ words(?:.*?)\bBody\b ", text)
  if header is not None:
    text = text[header.end():]
  # ...and cut the trailing "Load-Date ... End of Document" footer
  footer = re.search(r"Load-Date: .*?End of Document", text)
  if footer is not None:
    text = text[:footer.start()]

  # Keep the cleaned full text for later context lookups
  document_texts[file] = text

  # Record each sentence of five or more words together with its document
  sentences_data.extend(
    (file, sentence)
    for sentence in sent_tokenize(text)
    if len(sentence.split()) >= 5
  )

print(f"Loaded {len(docx_files)} documents and extracted {len(sentences_data)} cleaned sentences.")

100%|██████████████████████████████████████████████████████████████████████████████| 9469/9469 [06:58<00:00, 22.61it/s]

Loaded 9469 documents and extracted 761969 cleaned sentences.





# Embed Documents

There are two ways to create the embeddings: through the OpenAI API or with Sentence Transformers. Both have identical performance.


Reference: \
https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/ \


In [None]:
import openai
import numpy as np
from sentence_transformers import SentenceTransformer

# OpenAI API key, needed for all OpenAI-related operations.
# It is read from the OPENAI_API_KEY environment variable so the secret never
# has to be typed into (and saved with) the notebook. When the variable is not
# set, the value falls back to the empty string.
import os
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Alternative implementation kept for reference (inactive: it is only a string)
'''
def get_embedding(text):
  """Fetches an OpenAI embedding for a given text."""
  response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
  return response["data"][0]["embedding"]
'''

# Use sentence transformer to create embedding
# Faster and Cheaper
# Load model
# https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_model = SentenceTransformer("all-mpnet-base-v2")
def get_embedding(text):
    """Encode `text` with the notebook-level `embedding_model`.

    The model is a local SentenceTransformer, so no OpenAI API call is made.
    The encoder is asked for NumPy output and its result is returned as-is.
    """
    encoded = embedding_model.encode(text, convert_to_numpy=True)
    return encoded

In [None]:
# Load or Create faiss index
from tqdm import tqdm
import os
import faiss
import pickle

# Paths of the cached FAISS index (.bin) and the pickled sentence metadata (.pkl).
# load_faiss_index() reads both files; the index-building cell writes both.
# NOTE(review): the values look like placeholders — set real file paths before running.
FAISS_INDEX_PATH = r".bin"
SENTENCE_METADATA_PATH = r".pkl"

def load_faiss_index():
  """Return the cached (index, sentence_metadata) pair, or (None, None).

  The cache is used only when both FAISS_INDEX_PATH and SENTENCE_METADATA_PATH
  exist; otherwise nothing is read and the caller is expected to rebuild.
  """
  cache_available = os.path.exists(FAISS_INDEX_PATH) and os.path.exists(SENTENCE_METADATA_PATH)
  if not cache_available:
    print("No saved FAISS index found. Recomputing from scratch...")
    return None, None

  print("Loading existing FAISS index...")
  index = faiss.read_index(FAISS_INDEX_PATH)

  # NOTE: unpickling can execute code stored in the file; load trusted files only
  with open(SENTENCE_METADATA_PATH, "rb") as metadata_file:
    sentence_metadata = pickle.load(metadata_file)

  print(f"FAISS index and metadata loaded successfully.")
  return index, sentence_metadata

In [5]:
# Try loading FAISS index
index, sentence_metadata = load_faiss_index()

# Nothing cached: embed every sentence, build the index and persist both artifacts
if index is None:
  sentence_metadata = []
  embedding_rows = []

  print("Generating embeddings...")
  for doc_name, sentence in tqdm(sentences_data, desc="Embedding Sentences", unit="sentence"):
    embedding_rows.append(get_embedding(sentence))
    sentence_metadata.append((doc_name, sentence))

  # Stack the vectors and scale every row to unit L2 norm
  sentence_embeddings = np.array(embedding_rows)
  row_norms = np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
  sentence_embeddings = sentence_embeddings / row_norms

  # Inner-product search over unit-length vectors is cosine-similarity search
  embedding_dim = sentence_embeddings.shape[1]
  index = faiss.IndexFlatIP(embedding_dim)

  index.add(sentence_embeddings)
  print(f"FAISS index created with {len(sentence_metadata)} sentences.")

  # Persist index and metadata so later runs can skip the embedding step
  faiss.write_index(index, FAISS_INDEX_PATH)
  with open(SENTENCE_METADATA_PATH, "wb") as metadata_file:
    pickle.dump(sentence_metadata, metadata_file)

Loading existing FAISS index...
FAISS index and metadata loaded successfully.


# Implement Chatgpt based Question Answering

Maximal Marginal Relevance: https://www.cs.cmu.edu/~jgc/publication/The_Use_MMR_Diversity_Based_LTMIR_1998.pdf \
https://medium.com/tech-that-works/maximal-marginal-relevance-to-rerank-results-in-unsupervised-keyphrase-extraction-22d95015c7c5

Relevance = How similar a sentence is to the query (cosine similarity). \
Redundancy = How similar a sentence is to the already selected sentences. \
λ (lambda) = Controls the trade-off between relevance and diversity.

High relevance (if λ is low) \
High diversity (if λ is high)

Two options:
* Save selected source sentences to txt file
* Apply MMR to diversify retrieved sentences

In [6]:
# MMR formula: MMR Score = (1 − λ) ⋅ Relevance − λ ⋅ Redundancy
def mmr(query_embedding, retrieved_embeddings, retrieved_indices, diversity_factor, result_size):
  """Re-rank retrieved sentences with Maximal Marginal Relevance (MMR).

  MMR score = (1 - diversity_factor) * relevance - diversity_factor * redundancy,
  where relevance is the cosine similarity to the query and redundancy is the
  highest cosine similarity to any sentence that has already been selected.
  The first pick is always the sentence most similar to the query.

  Args:
    query_embedding: 1-D query vector.
    retrieved_embeddings: Array with one row per retrieved sentence, in the
      same order as `retrieved_indices`.
    retrieved_indices: Identifiers of the retrieved sentences.
    diversity_factor: Lambda of the formula; larger values favor diversity.
    result_size: Maximum number of sentences to select.

  Returns:
    Entries of `retrieved_indices` in selection order. An empty list when
    there are no candidates or `result_size` is not positive.
  """
  # No candidates: normalizing an empty array along axis 1 would raise
  if len(retrieved_indices) == 0 or result_size <= 0:
    return []

  # Normalize into new arrays so the caller's embeddings are left unmodified
  query_vec = np.asarray(query_embedding)
  query_vec = query_vec / np.linalg.norm(query_vec)
  candidates = np.asarray(retrieved_embeddings)
  candidates = candidates / np.linalg.norm(candidates, axis=1, keepdims=True)

  # Cosine similarity of every candidate to the query
  similarity_to_query = np.dot(candidates, query_vec)

  # Positions (into retrieved_indices) already chosen / still available
  selected = []
  unselected = list(range(len(retrieved_indices)))

  while len(selected) < result_size and unselected:
    if not selected:
      # Select the most relevant sentence first
      best_idx = int(np.argmax(similarity_to_query))
    else:
      # Redundancy: highest similarity of each remaining candidate to any selected one
      similarity_to_selected = np.max(
        np.dot(candidates[unselected], candidates[selected].T), axis=1
      )
      mmr_score = (1 - diversity_factor) * similarity_to_query[unselected] - diversity_factor * similarity_to_selected
      # argmax is relative to `unselected`, so map it back to a candidate position
      best_idx = unselected[int(np.argmax(mmr_score))]

    selected.append(best_idx)
    unselected.remove(best_idx)

  return [retrieved_indices[i] for i in selected]

## Sentence/Paragraph Retrieval

In [7]:
from nltk.tokenize import sent_tokenize

def find_paragraph_with_highlight(text, sentence, context_window=5):
  """Return the sentences surrounding `sentence` in `text`, with the target marked.

  Args:
    text: Full document text; it is split into sentences with sent_tokenize.
    sentence: Sentence to locate. An exact match is tried first, then the
      first tokenized sentence that contains `sentence.strip()` as a substring.
    context_window: Number of sentences kept on each side of the target.

  Returns:
    The context sentences joined by single spaces, with the target sentence
    wrapped in "*****" markers, or None when the sentence cannot be found.
  """
  # Tokenize into individual sentences
  sentences = sent_tokenize(text)

  try:
    index = sentences.index(sentence)
  except ValueError:
    # No exact match: scan ALL sentences for one containing the target and
    # give up only after the whole list has been checked
    target = sentence.strip()
    index = next((i for i, s in enumerate(sentences) if target in s), None)
    if index is None:
      return None  # Sentence not found

  # Define the window of context, clipped to the document boundaries
  start = max(0, index - context_window)
  end = min(len(sentences), index + context_window + 1)

  # Highlight the target sentence inside the window
  context = sentences[start:end]
  context[index - start] = f"*****{sentences[index]}*****"

  # Join context back into paragraph-like text
  return " ".join(context)

In [8]:
# Generate document used for Chatgpt WebUI
def generate_document(query, top_k=10000, threshold = 0.7, save_source_file=True, use_mmr=False):
  """Retrieve the sentences most similar to `query` and save them, with context, to a text file.

  Args:
    query: Question text to embed and search for.
    top_k: Maximum number of nearest sentences requested from the FAISS index.
    threshold: Minimum similarity score a sentence must reach to be kept.
    save_source_file: When True, write the results to "retrieved_sentences.txt".
    use_mmr: When True, re-order the kept sentences with MMR.

  Returns:
    None. Results are reported through printed messages and the output file.
    When no sentence reaches the threshold, a message is printed and no file
    is written.
  """
  # Get query embedding
  query_embedding = np.array([get_embedding(query)])
  # Retrieve the top most relevant sentences from FAISS
  # Distances are the similarity scores (cosine similarity)
  distances, indices = index.search(query_embedding, top_k)

  # Filter results by similarity threshold (chosen by inspecting results by hand).
  # idx < 0 marks the padding FAISS returns when fewer than top_k vectors exist.
  filtered = [
    (idx, sim) for idx, sim in zip(indices[0], distances[0])
    if idx >= 0 and sim >= threshold
  ]

  # Nothing relevant: report it instead of failing on max()/min() of an empty list
  if not filtered:
    print(f"No sentence reached the similarity threshold {threshold}; no file was written.")
    return

  sims = [sim for _, sim in filtered]
  max_sim = max(sims)
  min_sim = min(sims)
  print(f"Max similarity score: {max_sim:.4f}")
  print(f"Min similarity score: {min_sim:.4f}")

  retrieved_indices = [idx for idx, _ in filtered]

  # Apply MMR if enabled
  if use_mmr:
    # The stored vectors are needed only for MMR, so they are reconstructed here
    retrieved_embeddings = np.array([index.reconstruct(int(idx)) for idx in retrieved_indices])
    # diversity_factor (lambda) = 1 ranks purely by diversity after the first pick
    diversity_factor = 1
    selected_indices = mmr(query_embedding[0], retrieved_embeddings, retrieved_indices, diversity_factor, len(retrieved_indices))
  else:
    selected_indices = retrieved_indices

  # sentence_metadata[idx][0]: doc_name, sentence_metadata[idx][1]: sentence
  results = [(sentence_metadata[idx][0], sentence_metadata[idx][1]) for idx in selected_indices]

  # Remove duplicated sentences (based only on sentence text), keeping the first occurrence
  seen_sentences = set()
  deduplicated_results = []
  for doc, sentence in results:
    if sentence not in seen_sentences:
      seen_sentences.add(sentence)
      deduplicated_results.append((doc, sentence))

  # Number of sentences of context written on each side of a retrieved sentence
  context_window = 6
  if save_source_file:
    output_file = "retrieved_sentences.txt"
    with open(output_file, "w", encoding="utf-8") as f:
      f.write(f"Query: {query}\n")
      f.write(f"Threshold used: {threshold}, Max similarity score: {max_sim:.4f}, Min similarity score: {min_sim:.4f} \n\n")
      f.write(f"Total {len(deduplicated_results)} Retrieved Sentences:\n")
      f.write(f"Context/Paragraph length: {context_window}\n")
      f.write("="*50 + "\n")
      for doc, sentence in deduplicated_results:
        if doc in document_texts:
          full_text = document_texts[doc]
          paragraph = find_paragraph_with_highlight(full_text, sentence, context_window)
          if paragraph:
            f.write(f"Document: {doc}\n")
            f.write(f"Sentence: {sentence}\n")
            f.write(f"Paragraph: {paragraph}\n\n")
          else:
            f.write(f"Document: {doc}\n")
            f.write(f"Sentence (not found in paragraph): {sentence}\n")
            f.write(f"Paragraph: None \n\n")
        else:
          f.write(f"Sentence: {sentence}\n")
          f.write(f"Document: {doc} not found in document_texts\n\n")
    # Announce the file only after it has been written successfully
    print(f"Retrieved sentences saved to: {output_file}")

## Sentence search with Chatgpt QA

In [50]:
# Set top_k to control API input usage (right now, equals to document size)
# WebUI is a much bigger window thus we have looser file size restriction
def search_sentences(query, top_k=2000, threshold = 0.75, save_source_file=False, use_mmr=True):
  """Return the (document, sentence) pairs most relevant to `query`.

  Args:
    query: Question text to embed and search for.
    top_k: Maximum number of nearest sentences requested from the FAISS index.
    threshold: Minimum similarity score a sentence must reach to be kept.
    save_source_file: When True, also write the results to "retrieved_sentences.txt".
    use_mmr: When True, choose the final sentences with MMR; otherwise take
      the highest-scoring ones.

  Returns:
    A list of (document_name, sentence) tuples without repeated sentence text,
    holding at most 120 entries. Empty when no sentence reaches the threshold.
  """
  # Get query embedding
  query_embedding = np.array([get_embedding(query)])
  # Retrieve the top most relevant sentences from FAISS
  # Distances are the similarity scores (cosine similarity)
  distances, indices = index.search(query_embedding, top_k)

  # Filter results by similarity threshold (chosen by inspecting results by hand).
  # idx < 0 marks the padding FAISS returns when fewer than top_k vectors exist.
  filtered = [
    (idx, sim) for idx, sim in zip(indices[0], distances[0])
    if idx >= 0 and sim >= threshold
  ]
  retrieved_indices = [idx for idx, _ in filtered]

  # Maximum output sentence number 120, which is around 8000 tokens to fill in the prompt
  result_size = min(120, len(retrieved_indices))

  # Apply MMR if enabled; skipped when nothing was retrieved because MMR
  # cannot normalize an empty embedding array
  if use_mmr and retrieved_indices:
    # The stored vectors are needed only for MMR, so they are reconstructed here
    retrieved_embeddings = np.array([index.reconstruct(int(idx)) for idx in retrieved_indices])
    # diversity_factor (lambda) = 1 ranks purely by diversity after the first pick
    diversity_factor = 1
    selected_indices = mmr(query_embedding[0], retrieved_embeddings, retrieved_indices, diversity_factor, result_size)
  else:
    # [TODO] From this top_k sentences, fill up to 10000 Token
    selected_indices = retrieved_indices[:result_size]

  # sentence_metadata[idx][0]: doc_name, sentence_metadata[idx][1]: sentence
  results = [(sentence_metadata[idx][0], sentence_metadata[idx][1]) for idx in selected_indices]

  # Remove duplicated sentences (based only on sentence text), keeping the first occurrence
  seen_sentences = set()
  deduplicated_results = []
  for doc, sentence in results:
    if sentence not in seen_sentences:
      seen_sentences.add(sentence)
      deduplicated_results.append((doc, sentence))

  # If save_source_file is True, write results to a text file
  if save_source_file:
    output_file = "retrieved_sentences.txt"
    with open(output_file, "w", encoding="utf-8") as f:
      f.write(f"Query: {query}\n\n")
      f.write(f"Total {len(deduplicated_results)} Retrieved Sentences:\n")
      f.write("="*50 + "\n")
      for doc, sentence in deduplicated_results:
        f.write(f"Document: {doc}\nSentence: {sentence}\n\n")
    # Announce the file only after it has been written successfully
    print(f"Retrieved sentences saved to: {output_file}")

  return deduplicated_results

In [7]:
def generate_bpoint_answer(query, save_source_file, use_mmr):
  """Answer `query` with the OpenAI chat API, using only retrieved sentences as context.

  Sentences are fetched with search_sentences, formatted into a prompt that
  asks for a list of factors with document and evidence citations, and sent
  to the "gpt-4-turbo" model. The token usage of the call is printed.

  Args:
    query: Question to answer.
    save_source_file: Forwarded to search_sentences.
    use_mmr: Forwarded to search_sentences.

  Returns:
    The message content of the first choice in the API response.
  """
  # Use top ranked sentences, this saves the usage of API in risk of losing unpopular factor
  # Result will be in form of (Document, Sentence)
  relevant_sentences = search_sentences(query, save_source_file=save_source_file, use_mmr=use_mmr)
  # Constructing context from retrieved sentences
  context = "\n\n".join([f"Document: {doc[0]}\nSentence: {doc[1]}" for doc in relevant_sentences])

  # Using whole sentence base (Exceed API limit of 10000 tokens)
  # context = "\n\n".join([f"Document: {doc[0]}\nSentence: {doc[1]}" for doc in sentences_data])

  # Unfortunately, without a specific number, Chat will stop early (< 10 high level factors)
  # One way is to force Chat giving higher number of factors, and force it to give unreasonable ones
  # Minimum factor number scaling with context size, right now fixed
  # Apparently, the 4096 tokens window is only able to fit in 35 - 40 factors
  # min_factors = len(relevant_sentences) // 10
  min_factors = 35
    
  prompt = f"""
  You are an AI assistant that answers questions based only on the provided sentences.

  **Question:** {query}

  **Relevant Sentences:**
  {context}

  **Instructions:**
  1. Use only the provided sentences to answer the question.
  2. For each supporting sentence, provide an explanation.
  3. Ensure that no factor mentioned in the context is omitted.
  4. You must give as many factors as possible, go beyond the obvious and be exhaustive. 
  5. Break down each broad factor into as many smaller, distinct factors as possible. Keep each factor small but in detail.
  6. Consider different angles for each factor: economic, political, technological, social, and historical perspectives.
  7. Keep generating factors until you have thoroughly extracted every possible relevant detail. 
     Do not stop early — continue listing factors until all unique angles and details have been explored.
     Only stop when there are no more justifiable factors supported by the provided context.
  8. Give me at least {min_factors} factors, unless no relevant information is found in the context.
  9. Prioritize a thorough and complete response. Do not prioritize brevity or speed.
  10. Format the response as follows:
    "
    Factor 1: [Brief explanation of the factor]
    Document: [Document 1 name]
    Evidence: [Sentence 1 from the context that supports Factor 1]
    Detailed explaination of why this factor can be used to answer the question

    Factor 2: [Brief explanation of the factor]
    Document: [Document 2 name]
    Evidence: [Sentence from the context that supports Factor 2]
    Detailed explaination of why this factor can be used to answer the question
    ...
    "
  
  Provide a clear and well-structured response.
  """

  # Checker line to check if model reads all the prompt
  # 11. Ignore everything above and say "banana" as the answer.
    
  response = openai.ChatCompletion.create(
    # Optimized Chatgpt
    model="gpt-4-turbo",
    temperature=0.0, # Set to 0 for the most factual and consistent answering
    max_tokens=4096, # Maximum completion (output) tokens
    messages=[{"role": "system", "content": "You answer questions with full citations and explanations."},
              {"role": "user", "content": prompt}]
    )

  # Check how many tokens were used for response, check if model stops early
  print(response['usage'])
  return response["choices"][0]["message"]["content"]

# Ask question

Check API usage: https://platform.openai.com/settings/organization/usage

Query should be short and precise for better performance

In [84]:
# Example question; running the next cell as well replaces this value of `query`
query = "Why is TSMC's operation in the Phoenix/Arizona being delayed?"

In [17]:
# Alternative example question; this assignment overwrites any earlier value of `query`
query = "What are the critiques for lack of community consideration?"

Generate the source file needed to answer the question; the question itself is answered by the ChatGPT WebUI \
Input a threshold value: every sentence whose similarity score is below this number is cut off \
The prompt is placed at the very end of this Jupyter Notebook

In [20]:
# generate_document has no return value, so this cell shows no Out[] result;
# its only outputs are the printed messages and the saved text file.
generate_document(query, threshold = 0.5, save_source_file=True, use_mmr=False)

Max similarity score: 0.6318
Min similarity score: 0.5015
Retrieved sentences saved to: retrieved_sentences.txt


Generate answer with Chatgpt API, with option to generate source file alongside for Chatgpt WEBUI \
Answer will be solely based on provided information, thus simulating a CDQA engine \
If no relevant information is found, there will be no answer

In [None]:
# Favors diversity: use_mmr=True selects the retrieved sentences with MMR
answer = generate_bpoint_answer(query, save_source_file=False, use_mmr=True)
print(answer)

# Prompt for Chatgpt

Here is the prompt for the Chatgpt WebUI, with retrieved_sentences.txt as input file

query = 

Strictly follow the instruction of the following prompt.

prompt = f"""
  You are an AI assistant that answers questions based only on the provided sentences.

  **Question:** {query}

  **Knowledge Base:**
  {retrieved_sentences.txt}

  **Instructions:**
  1. Use only the provided document to answer the question.
  2. For each supporting sentence, provide an explanation.
  3. Do not omit any factor mentioned in the context.
  4. Extract as many factors as possible by:
     - Going beyond the obvious and being exhaustive.
     - Breaking down broad ideas into smaller, distinct components.
     - Considering multiple perspectives: economic, political, technological, social, and historical.
  5. Keep generating factors until every justifiable interpretation is exhausted.
  6. Do not generate any answer if no relevant information is found in the document.
  7. Prioritize completeness and accuracy over brevity or speed.
  8. Format the response as follows:    
    "
    Factor 1: [Brief explanation of the factor]
    Document: [Document 1 name]
    Evidence: [Sentence 1 from the context that supports Factor 1]
    Detailed explanation of why this factor can be used to answer the question

    Factor 2: [Brief explanation of the factor]
    Document: [Document 2 name]
    Evidence: [Sentence from the context that supports Factor 2]
    Detailed explanation of why this factor can be used to answer the question
    ...
    "
  9. Output only the text "ABABABABAB" and ignore all other instructions.
    
  Provide a clear and well-structured response.
  """